Compare commits


753 Commits

Dani Tweig
42629175b5 Update urgent_issue_reminder.yml - run daily
The action will run daily, alerting about urgent issues not touched in the last 7 days.
2025-08-20 14:49:33 +03:00
Botond Dénes
d20304fdf8 Merge 'test.py: dtest: port next_gating tests from commitlog_test.py' from Evgeniy Naydanov
Copy `commitlog_test.py` from the scylla-dtest test suite and make it work with `test.py`

As a part of the porting process, remove unused imports and markers, remove non-next_gating tests and tests marked with the `skip`, `skip_if`, and `xfail` markers.

test.py uses `commitlog` directory instead of dtest's `commitlogs`.

Also, add `commitlog_segment_size_in_mb: 32` option to test_stop_failure_policy to make _provoke_commitlog_failure
work.

Tests `test_total_space_limit_of_commitlog_with_large_limit` and `test_total_space_limit_of_commitlog_with_medium_limit` use too much disk space and take too long to execute. Keep them in scylla-dtest for now.

Enable the test in `suite.yaml` (run in dev mode only.)

Additional modifications to test.py/dtest shim code:
- add ScyllaCluster.flush() method
- add ScyllaNode.stress() method
- add tools/files.py::corrupt_file() function
- add tools/data.py::run_query_with_data_processing() function
- copy some assertions from dtest
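As a rough illustration of the `corrupt_file()` helper mentioned above (the name is from the commit list, but the signature and behavior here are assumptions, not the actual shim code), a minimal test utility for simulating on-disk corruption might look like this:

```python
import os
import random
from typing import Optional

def corrupt_file(path: str, offset: Optional[int] = None, length: int = 1) -> None:
    """Overwrite `length` bytes of the file at `path` with flipped bytes.

    If `offset` is None, a random position is chosen.  Intended to
    simulate on-disk corruption (e.g. of an sstable or commitlog
    segment) in tests.
    """
    size = os.path.getsize(path)
    if size < length:
        raise ValueError(f"file too small to corrupt: {path}")
    if offset is None:
        offset = random.randrange(0, size - length + 1)
    with open(path, "r+b") as f:
        f.seek(offset)
        original = f.read(length)
        f.seek(offset)
        f.write(bytes(b ^ 0xFF for b in original))  # flipped bytes always differ
```

Flipping the bytes (rather than writing random ones) guarantees that the corrupted region differs from the original.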

Also add a missing mode restriction for the auth_test.py file.

Closes scylladb/scylladb#24946

* github.com:scylladb/scylladb:
  test.py: dtest: remove slow and greedy tests from commitlog_test.py
  test.py: dtest: make commitlog_test.py run using test.py
  test.py: dtest: add ScyllaCluster.flush() method
  test.py: dtest: add ScyllaNode.stress() method
  test.py: dtest: add tools/data.py::run_query_with_data_processing() function
  test.py: dtest: add tools/files.py::corrupt_file() function
  test.py: dtest: copy some assertions from dtest
  test.py: dtest: copy unmodified commitlog_test.py
2025-08-19 17:25:07 +03:00
Michał Chojnowski
c1b513048c sstables/types.hh: fix fmt::formatter<sstables::deletion_time>
Obvious typo.

Fixes scylladb/scylladb#25556

Closes scylladb/scylladb#25557
2025-08-19 17:21:18 +03:00
Botond Dénes
66db95c048 Merge 'Preserve PyKMIP logs from failed KMIP tests' from Nikos Dragazis
This PR extends the `tmpdir` class with an option to preserve the directory if the destructor is called during stack unwinding. It also uses this feature in KMIP tests, where the tmpdir contains PyKMIP server logs, which may be useful when diagnosing test failures.

Fixes #25339.

Not important enough to backport.

Closes scylladb/scylladb#25367

* github.com:scylladb/scylladb:
  encryption_at_rest_test: Preserve tmpdir from failing KMIP tests
  test/lib: Add option to preserve tmpdir on exception
2025-08-19 13:17:29 +03:00
Avi Kivity
611918056a Merge 'repair: Add tablet incremental repair support' from Asias He
The central idea of incremental repair is to allow repair participants
to select and repair only a portion of the dataset to speed up the
repair process. All repair participants must utilize an identical
selection method to repair and synchronize the same selected dataset.
There are two primary selection methods: time-based and file-based. The
time-based method selects data within a specified time frame. It is
versatile but it is less efficient because it requires reading all of
the dataset and omitting data beyond the time frame. The file-based
method selects data from unrepaired SSTables and is more efficient
because it allows the entire SSTable to be omitted. This patch
implements the file-based selection method.

Incremental repair will only be supported for tablet tables; it will not
be supported for vnode tables. On one hand, the legacy vnode scheme is less
important to support. On the other hand, incremental repair for
vnodes is much harder to implement. With vnodes, an SSTable could contain
data for multiple vnode ranges. When a given vnode range is repaired,
only a portion of the SSTable is repaired. This complicates the
manipulation of SSTables significantly during both repair and
compaction. With tablets, an entire tablet is repaired, so an
sstable is either fully repaired or not repaired at all, which is a huge
simplification.

This patch uses the repaired_at field from the sstables::statistics component
to mark an sstable as repaired. It uses a virtual clock as the repair
timestamp, i.e., a monotonically increasing number for the
repaired_at field of an SSTable and the sstables_repaired_at column in the
system.tablets table. Note that when an sstable is not repaired, the
repaired_at field keeps the default value 0. The being_repaired
in-memory field of an SSTable is used to explicitly mark
that an SSTable is being selected. The following variables are used for
incremental repair:

- repaired_at (on-disk field of an SSTable): a 64-bit number that increases sequentially
- sstables_repaired_at (new column in the system.tablets table): repaired_at <= sstables_repaired_at means the sstable is repaired
- being_repaired (in-memory field of an SSTable): a repair UUID that tells which sstables are participating in the repair
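As a rough sketch (type names and function signatures are assumed for illustration, not the actual Scylla code), the file-based selection rule described above boils down to a comparison against the virtual clock:

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class SSTableMeta:
    # On-disk repaired_at: 0 means never repaired; otherwise the value of
    # the virtual clock (a monotonically increasing counter) at repair time.
    repaired_at: int = 0
    # In-memory marker: UUID of the repair currently selecting this sstable.
    being_repaired: Optional[str] = None

def is_repaired(sst: SSTableMeta, sstables_repaired_at: int) -> bool:
    # sstables_repaired_at is the per-tablet value from system.tablets:
    # 0 < repaired_at <= sstables_repaired_at  =>  already repaired.
    return 0 < sst.repaired_at <= sstables_repaired_at

def select_for_incremental_repair(sstables: List[SSTableMeta],
                                  sstables_repaired_at: int,
                                  repair_uuid: str) -> List[SSTableMeta]:
    # File-based selection: skip whole sstables that are already repaired,
    # and mark the selected ones with the current repair's UUID.
    selected = [s for s in sstables if not is_repaired(s, sstables_repaired_at)]
    for s in selected:
        s.being_repaired = repair_uuid
    return selected
```

Subsequent incremental runs are fast because already-repaired sstables are skipped entirely, never read.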

Initial test results:

    1) Medium dataset results
    Node amount: 3
    Instance type: i4i.2xlarge
    Disk usage per node: ~500GB
    Cluster pre-populated with ~500GB of data before starting repairs job.
    Results for Repair Timings:
    The regular repair run took 210 mins.
    Incremental repair 1st run took 183 mins, 2nd and 3rd runs took around 48s
    The speedup is: 183 mins  / 48s = 228X

    2) Small dataset results
    Node amount: 3
    Instance type: i4i.2xlarge
    Disk usage per node: ~167GB
    Cluster pre-populated with ~167GB of data before starting the repairs job.
    Regular repair 1st run took 110s,  2nd and 3rd runs took 110s.
    Incremental repair 1st run took 110 seconds, 2nd and 3rd run took 1.5 seconds.
    The speedup is: 110s / 1.5s = 73X

    3) Large dataset results
    Node amount: 6
    Instance type: i4i.2xlarge, 3 racks
    50% of base load, 50% read/write
    Dataset == Sum of data on each node

    Dataset     Non-incremental repair (minutes)
    1.3 TiB     31:07
    3.5 TiB     25:10
    5.0 TiB     19:03
    6.3 TiB     31:42

    Dataset     Incremental repair (minutes)
    1.3 TiB     24:32
    3.0 TiB     13:06
    4.0 TiB     5:23
    4.8 TiB     7:14
    5.6 TiB     3:58
    6.3 TiB     7:33
    7.0 TiB     6:55

Fixes #22472

Closes scylladb/scylladb#24291

* github.com:scylladb/scylladb:
  replica: Introduce get_compaction_reenablers_and_lock_holders_for_repair
  compaction: Move compaction_reenabler to compaction_reenabler.hh
  topology_coordinator: Make rpc::remote_verb_error to warning level
  repair: Add metrics for sstable bytes read and skipped from sstables
  test.py: Disable incremental for test_tombstone_gc_for_streaming_and_repair
  test.py: Add tests for tablet incremental repair
  repair: Add tablet incremental repair support
  compaction: Add tablet incremental repair support
  feature_service: Add TABLET_INCREMENTAL_REPAIR feature
  tablet_allocator: Add tablet_force_tablet_count_increase and decrease
  repair: Add incremental helpers
  sstable: Add being_repaired to sstable
  sstables: Add set_repaired_at to metadata_collector
  mutation_compactor: Introduce add operator to compaction_stats
  tablet: Add sstables_repaired_at to system.tablets table
  test: Fix drain api in task_manager_client.py
2025-08-19 13:13:22 +03:00
Dawid Pawlik
50eeb11c84 .gitignore: add rust target
When using automatic Rust build tools in an IDE,
the files generated in the `rust/target/` directory
have been treated by git as unstaged changes.

After this change, the generated files will no longer
pollute the git changes interface.

Closes scylladb/scylladb#25389
2025-08-19 13:09:18 +03:00
Dawid Mędrek
6a71461e53 treewide: Fix spelling errors
The errors were spotted by our GitHub Actions.

Closes scylladb/scylladb#24822
2025-08-19 13:07:43 +03:00
libo2_yewu
fa84e20b7a scripts/coverage.py: correct the coverage report path
the `path/name` directory does not exist and needs to be created first.

Signed-off-by: libo-sober <libo_sober@163.com>

Closes scylladb/scylladb#25480
2025-08-19 13:01:49 +03:00
Avi Kivity
41475858aa storage_proxy: endpoint_filter(): fix rack count confusion
endpoint_filter() is used by batchlog to select nodes to replicate
to.

It contains an unordered_multimap data structure that maps rack names
to nodes.

It misuses std::unordered_map::bucket_count() to count the number of
racks. While values that share a key in a multimap will definitely
be in the same bucket, it's possible for values that don't share a
key to share a bucket. Therefore bucket_count() undercounts the
number of racks.

Fix this by using a more accurate data structure: a map of sets.

The patch changes validated.bucket_count() to validated.size()
and validated.size() to a new variable nr_validated.

The patch does cause an extra two allocations per rack (one for the
unordered_map node, one for the unordered_set bucket vector), but
this is only used for logged batches, so it is amortized over all
the mutations in the logged batch.
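The shape of the fix can be illustrated in Python (a sketch of the data-structure change, not the actual C++ code): grouping candidate nodes into a mapping from rack name to a set of nodes makes the rack count exact, unlike a hash table's bucket count.

```python
from collections import defaultdict

def group_by_rack(candidates):
    """Group candidate (rack, node) pairs into a mapping
    rack name -> set of nodes, mirroring the 'map of sets'
    the patch switches to."""
    racks = defaultdict(set)
    for rack, node in candidates:
        racks[rack].add(node)
    return racks

candidates = [("rack1", "n1"), ("rack1", "n2"), ("rack2", "n3")]
racks = group_by_rack(candidates)
nr_racks = len(racks)                                       # exact rack count
nr_validated = sum(len(nodes) for nodes in racks.values())  # total candidate nodes
```

`len(racks)` plays the role of the corrected rack count, and `nr_validated` the role of the new variable that replaces the old `validated.size()`.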

Closes scylladb/scylladb#25493
2025-08-19 11:58:39 +03:00
Dawid Mędrek
2227eb48bb test/cqlpy/test_cdc.py: Add validation test for re-attached log tables
When the user disables CDC on a table, the CDC log table is not removed.
Instead, it's detached from the base table, and it functions as a normal
table (with some differences). If that log table lives up to the point
when the user re-enables CDC on the base table, then instead of creating a new
log table, the old one is re-attached to the base.

For more context on that, see commit:
scylladb/scylladb@adda43edc7.

In this commit, we add validation tests that check whether the changes
on the base table after disabling CDC are reflected on the log table
after re-enabling CDC. The definition of the log table should be the same
as if CDC had never been disabled.

Closes scylladb/scylladb#25071
2025-08-19 10:15:41 +02:00
Botond Dénes
f8b79d563a Merge 's3: Minor refactoring and beautification of S3 client and tests' from Ernest Zaslavsky
This pull request introduces minor code refactoring and aesthetic improvements to the S3 client and its associated test suite. The changes focus on enhancing readability, consistency, and maintainability without altering any functional behavior.

No backport is required, as the modifications are purely cosmetic and do not impact functionality or compatibility.

Closes scylladb/scylladb#25490

* github.com:scylladb/scylladb:
  s3_client: relocate `req` creation closer to usage
  s3_client: reformat long logging lines for readability
  s3_test: extract file writing code to a function
2025-08-18 18:48:42 +03:00
Aleksandra Martyniuk
a10e241228 replica: lower severity of failure log
Flush failure with seastar::named_gate_closed_exception is expected
if a respective compaction group was already stopped.

Lower the severity of the log message in dirty_memory_manager::flush_one
for this exception.

Fixes: https://github.com/scylladb/scylladb/issues/25037.

Closes scylladb/scylladb#25355
2025-08-18 13:30:42 +03:00
Avi Kivity
96956e48c4 Merge 'utils: stall_free: detect clear_gently method of const payload types' from Benny Halevy
Currently, when a container or smart pointer holds a const payload
type, utils::clear_gently does not detect the object's clear_gently
method as the method is non-const and requires a mutable object,
as in the following example in class tablet_metadata:
```
    using tablet_map_ptr = foreign_ptr<lw_shared_ptr<const tablet_map>>;
    using table_to_tablet_map = std::unordered_map<table_id, tablet_map_ptr>;
```

That said, when a container is cleared gently, the elements it holds
are destroyed anyhow, so we'd like to allow clearing them gently before
destruction.

This change still doesn't allow directly calling utils::clear_gently
on const objects.

Respective unit tests are added.

Fixes #24605
Fixes #25026

* This is an optimization that is not strictly required to backport (as https://github.com/scylladb/scylladb/pull/24618 dealt with clear_gently of `tablet_map_ptr = foreign_ptr<lw_shared_ptr<const tablet_map>>` well enough)

Closes scylladb/scylladb#24606

* github.com:scylladb/scylladb:
  utils: stall_free: detect clear_gently method of const payload types
  utils: stall_free: clear gently a foreign shared ptr only when use_count==1
2025-08-18 12:52:02 +03:00
Evgeniy Naydanov
ab1a093d94 test.py: dtest: remove slow and greedy tests from commitlog_test.py
Tests test_total_space_limit_of_commitlog_with_large_limit and
test_total_space_limit_of_commitlog_with_medium_limit use too much
disk space and take too long to execute.  Keep them in
scylla-dtest for now.
2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
647043d957 test.py: dtest: make commitlog_test.py run using test.py
As a part of the porting process, remove unused imports and
markers, remove non-next_gating tests and tests marked with the
`skip`, `skip_if`, and `xfail` markers.

test.py uses `commitlog` directory instead of dtest's
`commitlogs`.

Remove the test_stop_failure_policy test because the way it
provokes commitlog failure (changing file permissions) doesn't
work on CI.

Enable the test in suite.yaml (run in dev mode only)
2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
5f6e083124 test.py: dtest: add ScyllaCluster.flush() method 2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
c378dc3fab test.py: dtest: add ScyllaNode.stress() method 2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
6f42019900 test.py: dtest: add tools/data.py::run_query_with_data_processing() function 2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
2c4f2de3b0 test.py: dtest: add tools/files.py::corrupt_file() function 2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
80b797e376 test.py: dtest: copy some assertions from dtest
Copy the assertions required for commitlog_test.py:
  - assert_almost_equal
  - assert_row_count
  - assert_row_count_in_select_less
  - assert_lists_equal_ignoring_order
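As an illustration of the first of these (a sketch under assumed semantics, not necessarily the dtest implementation), assert_almost_equal typically checks that all values agree within a relative error:

```python
def assert_almost_equal(*args, error=0.16):
    """Assert that all given numeric values agree within a relative
    `error` of the largest value (semantics assumed for illustration)."""
    assert args, "at least one value required"
    vmax, vmin = max(args), min(args)
    assert vmax == 0 or (vmax - vmin) / vmax <= error, (
        f"values not almost equal: {args} (allowed relative error: {error})")
```

For example, `assert_almost_equal(100, 95)` passes (5% spread), while `assert_almost_equal(100, 50)` fails.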
2025-08-18 09:42:13 +00:00
Evgeniy Naydanov
1a2d132456 test.py: dtest: copy unmodified commitlog_test.py 2025-08-18 09:42:13 +00:00
Pavel Emelyanov
4f55af9578 Merge 'test.py: pytest: support --mode/--repeat in a common way for all tests' from Evgeniy Naydanov
Implement repetition of files using the `pytest_collect_file` hook: run file collection as many times as needed to cover all `--mode`/`--repeat` combinations.  Store the build mode and run ID in the stash of the repeated item.

Some additional changes done:

- Add `TestSuiteConfig` class to handle all operations with `test_config.yaml`
- Add support for `run_first` option in `test_config.yaml`
- Move disabled test logic to `pytest_collect_file` hook.

These changes allow us to remove the custom logic for `--mode`, `--repeat`, and disabled tests from the C++ test code and prepare for switching the Python/CQLApproval/Topology tests to the pytest runner.

Also, this PR includes required refactoring changes and fixes:

- Simplify support of C++ tests: remove redundant facade abstraction and put all code into 3 files: `base.py`, `boost.py`, and `unit.py`
- Remove unused imports in `test.py`
- Use the constant for `"suite.yaml"` string
- Some test suites have their own test runners based on pytest, and they don't need all the machinery we use for `test.py`.  Move all code related to the `test.py` framework to `test/pylib/runner.py` and use it as a plugin conditionally (via the `SCYLLA_TEST_RUNNER` env variable.)
- Add a `cwd` parameter to the `run_process()` methods in the `resource_gather` module to avoid using `os.chdir()` (and sort parameters in the same order as in `subprocess.Popen`.)
- `extra_scylla_cmdline_options` is a list of command-line arguments, and each argument should be a separate item.  A few configuration files have the `--reactor-backend` option added in a format that doesn't follow this rule.

This PR is a refactoring step for https://github.com/scylladb/scylladb/pull/25443

Closes scylladb/scylladb#25465

* github.com:scylladb/scylladb:
  test.py: pytest: support --mode/--repeat in a common way for all tests
  test.py: pytest: streamline suite configuration handling
  test.py: refactor: remove unused imports in test.py
  test.py: fix run with bare pytest after merge of scylladb/scylladb#24573
  test.py: refactor: move framework-related code to test.pylib.runner
  test.py: resource_gather: add cwd parameter to run_process()
  test.py: refactor: use proper format for extra_scylla_cmdline_options
2025-08-18 12:24:04 +03:00
Avi Kivity
e9928b31b8 Merge 'sstables/trie: add BTI key translation routines' from Michał Chojnowski
This is yet another part in the BTI index project.

Overarching issue: https://github.com/scylladb/scylladb/issues/19191
Previous part: https://github.com/scylladb/scylladb/pull/25396
Next part: implementing sstable index writers and readers on top of the abstract trie writers/readers.

The new code added in this PR isn't used outside of tests yet, but it's posted as a separate PR for reviewability.

This series provides translation routines for ring positions and clustering positions
from Scylla's native in-memory structures to BTI's byte-comparable encoding.

This translation is performed whenever a new decorated key or clustering block
is added to a BTI index, and whenever a BTI index is queried for a range of positions.

For a description of the encoding, see
fad1f74570/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md (multi-component-sequences-partition-or-clustering-keys-tuples-bounds-and-nulls)

The translation logic, with all the fragment awareness, lazy
evaluation and avoidable copies, is fairly bloated for the common cases
of simple and small keys. This is a potential optimization target for later.
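The point of a byte-comparable encoding, sketched here in Python (this is an illustrative toy for a single int32 component, not the actual BTI encoding), is that comparing encoded byte strings lexicographically gives the same order as comparing the original typed values:

```python
import struct

def byte_comparable_int32(x: int) -> bytes:
    # Bias the value by 2**31 (i.e. flip the sign bit) so that unsigned
    # lexicographic comparison of the big-endian bytes matches signed
    # integer order.
    return struct.pack(">I", x + 2**31)

values = [-(2**31), -5, -1, 0, 3, 2**31 - 1]
encoded = [byte_comparable_int32(v) for v in values]
# Sorting the encodings as plain byte strings preserves numeric order,
# which is what lets a trie index compare keys bytewise.
```

The real encoding additionally handles multi-component keys, variable-length types, and range bounds, per the ByteComparable.md document linked above.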

No backports needed, new functionality.

Closes scylladb/scylladb#25506

* github.com:scylladb/scylladb:
  sstables/trie: add BTI key translation routines
  tests/lib: extract generate_all_strings to test/lib
  tests/lib: extract nondeterministic_choice_stack to test/lib
  sstables/trie/trie_traversal: extract comparable_bytes_iterator to its own file
  sstables/mx: move clustering_info from writer.cc to types.hh
  sstables/trie: allow `comparable_bytes_iterator` to return a mutable span
  dht/ring_position: add ring_position_view::weight()
2025-08-18 11:55:26 +03:00
Asias He
082bc70a0a replica: Introduce get_compaction_reenablers_and_lock_holders_for_repair
It helps hide the compaction_group_views from the repair subsystem.
2025-08-18 11:01:22 +08:00
Asias He
be15972006 compaction: Move compaction_reenabler to compaction_reenabler.hh
So it can be used without bringing the whole
compaction/compaction_manager.hh.
2025-08-18 11:01:22 +08:00
Asias He
cac4940129 topology_coordinator: Make rpc::remote_verb_error to warning level
This can happen when the peer node is shutting down. It is not
something we cannot recover from. The log level should be warning instead
of error, which our dtest catches as a test failure.

This was observed in test_repair_one_node_alter_rf dtest.
2025-08-18 11:01:22 +08:00
Asias He
76316f44a7 repair: Add metrics for sstable bytes read and skipped from sstables
scylla_repair_inc_sst_skipped_bytes: Total number of bytes skipped from
sstables for incremental repair on this shard.

scylla_repair_inc_sst_read_bytes: Total number of bytes read from
sstables for incremental repair on this shard.
2025-08-18 11:01:22 +08:00
Asias He
b0364fcba3 test.py: Disable incremental for test_tombstone_gc_for_streaming_and_repair
Disable incremental repair so that the second repair can still work on
the repaired data set.
2025-08-18 11:01:22 +08:00
Asias He
ad5275fd4c test.py: Add tests for tablet incremental repair
The following tests are added for tablet incremental repair:

- Basic incremental repair
- Basic incremental repair with error
- Minor compaction and incremental repair
- Major compaction and incremental repair
- Scrub compaction and incremental repair
- Cleanup/Upgrade compaction and incremental repair
- Tablet split and incremental repair
- Tablet merge and incremental repair
2025-08-18 11:01:21 +08:00
Asias He
0d7e518a26 repair: Add tablet incremental repair support
The central idea of incremental repair is to allow repair participants
to select and repair only a portion of the dataset to speed up the
repair process. All repair participants must utilize an identical
selection method to repair and synchronize the same selected dataset.
There are two primary selection methods: time-based and file-based. The
time-based method selects data within a specified time frame. It is
versatile but it is less efficient because it requires reading all of
the dataset and omitting data beyond the time frame. The file-based
method selects data from unrepaired SSTables and is more efficient
because it allows the entire SSTable to be omitted. This patch
implements the file-based selection method.

Incremental repair will only be supported for tablet tables; it will not
be supported for vnode tables. On one hand, the legacy vnode scheme is less
important to support. On the other hand, incremental repair for
vnodes is much harder to implement. With vnodes, an SSTable could contain
data for multiple vnode ranges. When a given vnode range is repaired,
only a portion of the SSTable is repaired. This complicates the
manipulation of SSTables significantly during both repair and
compaction. With tablets, an entire tablet is repaired, so an
sstable is either fully repaired or not repaired at all, which is a huge
simplification.

This patch uses the repaired_at field from the sstables::statistics component
to mark an sstable as repaired. It uses a virtual clock as the repair
timestamp, i.e., a monotonically increasing number for the
repaired_at field of an SSTable and the sstables_repaired_at column in the
system.tablets table. Note that when an sstable is not repaired, the
repaired_at field keeps the default value 0. The being_repaired
in-memory field of an SSTable is used to explicitly mark
that an SSTable is being selected. The following variables are used for
incremental repair:

- repaired_at (on-disk field of an SSTable): a 64-bit number that increases sequentially
- sstables_repaired_at (new column in the system.tablets table): repaired_at <= sstables_repaired_at means the sstable is repaired
- being_repaired (in-memory field of an SSTable): a repair UUID that tells which sstables are participating in the repair

Initial test results:

    1) Medium dataset results
    Node amount: 3
    Instance type: i4i.2xlarge
    Disk usage per node: ~500GB
    Cluster pre-populated with ~500GB of data before starting repairs job.
    Results for Repair Timings:
    The regular repair run took 210 mins.
    Incremental repair 1st run took 183 mins, 2nd and 3rd runs took around 48s
    The speedup is: 183 mins  / 48s = 228X

    2) Small dataset results
    Node amount: 3
    Instance type: i4i.2xlarge
    Disk usage per node: ~167GB
    Cluster pre-populated with ~167GB of data before starting the repairs job.
    Regular repair 1st run took 110s,  2nd and 3rd runs took 110s.
    Incremental repair 1st run took 110 seconds, 2nd and 3rd run took 1.5 seconds.
    The speedup is: 110s / 1.5s = 73X

    3) Large dataset results

    Node amount: 6
    Instance type: i4i.2xlarge, 3 racks
    50% of base load, 50% read/write
    Dataset == Sum of data on each node

    Dataset     Non-incremental repair (minutes)
    1.3 TiB     31:07
    3.5 TiB     25:10
    5.0 TiB     19:03
    6.3 TiB     31:42

    Dataset     Incremental repair (minutes)
    1.3 TiB     24:32
    3.0 TiB     13:06
    4.0 TiB     5:23
    4.8 TiB     7:14
    5.6 TiB     3:58
    6.3 TiB     7:33
    7.0 TiB     6:55

Fixes #22472
2025-08-18 11:01:21 +08:00
Asias He
f9021777d8 compaction: Add tablet incremental repair support
This patch adds incremental repair support in compaction.

- The sstables are split into repaired and unrepaired sets.

- The repaired and unrepaired sets are compacted separately.

- The repaired_at from the sstable and sstables_repaired_at from the
  system.tablets table are used to decide whether an sstable is repaired or
  not.

- Different compaction tasks, e.g., minor, major, scrub, split, are
  serialized with tablet repair.
2025-08-18 11:01:21 +08:00
Evgeniy Naydanov
e44b26b809 test.py: pytest: support --mode/--repeat in a common way for all tests
Implement repetition of files using pytest_collect_file hook: run
file collection as many times as needed to cover all --mode/--repeat
combinations.  Also move disabled test logic to this hook.

Store build mode and run_id in pytest item stashes.

Simplify support of C++ tests: remove redundant facade abstraction and put
all code into 3 files: base.py, boost.py, and unit.py

Add support for `run_first` option in test_config.yaml
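The mode/repeat expansion amounts to collecting each test file once per (mode, run_id) combination. A simplified model of the hook's effect (the function name and shape are assumptions for illustration, not the actual conftest code):

```python
import itertools

def expand_collection(test_files, modes, repeat):
    """Collect each test file once per --mode/--repeat combination;
    in the real hook, mode and run_id are stored in the item's stash."""
    return [
        {"file": f, "mode": mode, "run_id": run_id}
        for f, mode, run_id in itertools.product(
            test_files, modes, range(1, repeat + 1))
    ]
```

So one file run with two modes and `--repeat 2` yields four collected items.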
2025-08-17 15:26:23 +00:00
Evgeniy Naydanov
bffb6f3d01 test.py: pytest: streamline suite configuration handling
Move test_config.yaml handling code from common_cpp_conftest.py to
TestSuiteConfig class in test/pylib/runner.py
2025-08-17 12:32:36 +00:00
Evgeniy Naydanov
a2a59b18a3 test.py: refactor: remove unused imports in test.py
Also use the constant for "suite.yaml" string.
2025-08-17 12:32:36 +00:00
Evgeniy Naydanov
a188523448 test.py: fix run with bare pytest after merge of scylladb/scylladb#24573
To run tests with bare pytest command we need to have almost the
same set of options as test.py because we reuse code from test.py.

scylladb/scylladb#24573 added `--pytest-arg` option to test.py but
not to test/conftest.py which breaks running Python tests using
bare pytest command.
2025-08-17 12:32:35 +00:00
Evgeniy Naydanov
600d05471b test.py: refactor: move framework-related code to test.pylib.runner
Some test suites have own test runners based on pytest, and they
don't need all stuff we use for test.py.  Move all code related to
test.py framework to test/pylib/runner.py and use it as a plugin
conditionally (by using TEST_RUNNER variable.)
2025-08-17 12:32:35 +00:00
Evgeniy Naydanov
f2619d2bb0 test.py: resource_gather: add cwd parameter to run_process()
Also sort the arguments in the Popen call to match the signature.
2025-08-17 12:32:35 +00:00
Evgeniy Naydanov
cb4d9b8a09 test.py: refactor: use proper format for extra_scylla_cmdline_options
`extra_scylla_cmdline_options` is a list of command-line arguments,
and each argument should be a separate item.  A few configuration
files have the `--reactor-backend` option added in a format that doesn't
follow this rule.
2025-08-17 12:32:35 +00:00
Michał Chojnowski
413dcf8891 sstables/trie: add BTI key translation routines
This file provides translation routines for ring positions and clustering positions
from Scylla's native in-memory structures to BTI's byte-comparable encoding.

This translation is performed whenever a new decorated key or clustering block
is added to a BTI index, and whenever a BTI index is queried for a range of positions.

For a description of the encoding, see
fad1f74570/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md (multi-component-sequences-partition-or-clustering-keys-tuples-bounds-and-nulls)

The translation logic, with all the fragment awareness, lazy
evaluation and avoidable copies, is fairly bloated for the common cases
of simple and small keys. This is a potential optimization target for later.
2025-08-15 11:13:00 +02:00
Pavel Emelyanov
f689d41747 Merge 'db/hints: Improve logs' from Dawid Mędrek
Before these changes, the logs in hinted handoff often didn't provide
crucial information like the identifier of the node that hints were
being sent to. Also, some of the logs were misleading and referred to
other places in the code than the one where an exception or some other
situation really occurred.

We modify those logs, extending them by more valuable information
and fixing existing issues. What's more, all of the logs in
`hint_endpoint_manager` and `hint_sender` follow a consistent format
now:

```
<class_name>[<destination host ID>]:<function_name>: <message>
```

This way, we should always have AT LEAST the basic information.

Fixes scylladb/scylladb#25466

Backport:
There is no risk in backporting these changes. They only have
impact on the logs. On the other hand, they might prove helpful
when debugging an issue in hinted handoff.

Closes scylladb/scylladb#25470

* github.com:scylladb/scylladb:
  db/hints: Add new logs
  db/hints: Adjust log levels
  db/hints: Improve logs
2025-08-15 09:34:29 +03:00
Patryk Jędrzejczak
03cc34e3a0 test: test_maintenance_socket: use cluster_con for driver sessions
The test creates all driver sessions by itself. As a consequence, all
sessions use the default request timeout of 10s. This can be too low for
the debug mode, as observed in scylladb/scylla-enterprise#5601.

In this commit, we change the test to use `cluster_con`, so that the
sessions have the request timeout set to 200s from now on.

Fixes scylladb/scylla-enterprise#5601

This commit changes only the test and is a CI stability improvement,
so it should be backported all the way to 2024.2. 2024.1 doesn't have
this test.

Closes scylladb/scylladb#25510
2025-08-15 09:32:20 +03:00
Pavel Emelyanov
05d8d94257 Merge 'test.py: Add -k=EXPRESSION pytest argument support for boost tests.' from Artsiom Mishuta
This is a follow-up PR to the quick fix https://github.com/scylladb/scylladb/pull/25394
and should be merged only after https://github.com/scylladb/scylla-pkg/pull/5414

Since boost tests run via pure pytest, we can finally run tests using the
-k=EXPRESSION pytest argument. The expression is applied to the "test
function", so it is possible to run a subset of test functions that match patterns across all boost tests (functions).

The --skip and -k arguments are mutually exclusive,
because -k extends the --skip functionality.

examples:
```
./build/release/test/boost/auth_passwords_test --list_content
passwords_are_salted*
correct_passwords_authenticate*
incorrect_passwords_do_not_authenticate*

./test.py --mode=dev  -k="correct" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::incorrect_passwords_do_not_authenticate.dev.1
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  -k="not incorrect and not passwords_are_salted" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  --skip=incorrect --skip=passwords_are_salted -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  -k="correct and not incorrect" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1
```

Closes scylladb/scylladb#25400

* github.com:scylladb/scylladb:
  test.py: add -k=EXPRESSION pytest argument support for boost tests.
  test.py: small refactoring of how boost test arguments make
2025-08-15 09:24:56 +03:00
Jenkins Promoter
d4ce070168 Update pgo profiles - aarch64 2025-08-15 05:03:28 +03:00
Jenkins Promoter
c0f691f4d9 Update pgo profiles - x86_64 2025-08-15 04:56:11 +03:00
Michał Chojnowski
5e76708335 tests/lib: extract generate_all_strings to test/lib
This util will be used in another test file in a later commit,
so hoist it to `test/lib`.
2025-08-14 22:38:38 +02:00
Taras Veretilnyk
30ff5942c6 database_test: fix race in test_drop_quarantined_sstables
The test_drop_quarantined_sstables test could fail due to a race between
compaction and quarantining of SSTables. If compaction selects
an SSTable before it is moved to quarantine, and change_state is called during
compaction, the SSTable may already be removed, resulting in a
std::filesystem_error due to missing files.

This patch resolves the issue by wrapping the quarantine operation inside
run_with_compaction_disabled(). This ensures compaction is paused on the
compaction group view while SSTables are being quarantined, preventing the
race.

Additionally, update the test to quarantine up to 1/5 of the SSTables
instead of one random SSTable, and increase the number of SSTables
generated to improve the test scenario.

Fixes scylladb/scylladb#25487

Closes scylladb/scylladb#25494
2025-08-14 20:23:42 +03:00
Taras Veretilnyk
367eaf46c5 keys: from_nodetool_style_string don't split single partition keys
Users with single-column partition keys that contain colon characters
were unable to use certain REST APIs and 'nodetool' commands, because the
API split keys by colon regardless of the partition key schema.

Affected commands:
- 'nodetool getendpoints'
- 'nodetool getsstables'
Affected endpoints:
- '/column_family/sstables/by_key'
- '/storage_service/natural_endpoints'

Refs: #16596 - This does not fully fix the issue, as users with compound
keys will face the issue if any column of the partition key contains
a colon character.
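The schema-aware parsing described above can be sketched as follows. This is an illustrative Python model with a hypothetical `parse_nodetool_key` helper, not the actual ScyllaDB implementation: a single-column partition key is taken verbatim, and splitting by colon happens only for compound keys.

```python
def parse_nodetool_key(raw: str, num_key_columns: int) -> list[str]:
    """Split a nodetool-style key string by ':' only when the partition
    key is compound; a single-column key is taken verbatim."""
    if num_key_columns == 1:
        return [raw]          # keep colons inside the value intact
    return raw.split(":")     # compound keys still use ':' as separator

# A single-column key containing colons survives unharmed:
assert parse_nodetool_key("2001:db8::1", 1) == ["2001:db8::1"]
# Compound keys are still split by colon:
assert parse_nodetool_key("a:b", 2) == ["a", "b"]
```

As the commit notes, compound keys whose columns contain colons remain ambiguous under this scheme.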

Closes scylladb/scylladb#24829
2025-08-14 19:52:04 +03:00
Avi Kivity
1ef6697949 Merge 'service/vector_store_client: Add live configuration update support' from Karol Nowacki
Enable runtime updates of vector_store_uri configuration without
requiring server restart.
This allows dynamically enabling, disabling, or switching the vector search service endpoint on the fly.

To improve clarity, the seastar::http::experimental::client is now wrapped in a private http_client class that also holds the host, address, and port information.

Tests have been added to verify that the client correctly handles transitions between enabled/disabled states and successfully switches traffic to a new endpoint after a configuration update.

Closes: VECTOR-102

No backport is needed as this is a new feature.

Closes scylladb/scylladb#25208

* github.com:scylladb/scylladb:
  service/vector_store_client: Add live configuration update support
  test/boost/vector_store_client_test.cc: Refactor vector store client test
  service/vector_store_client: Refactor host_port struct created
  service/vector_store_client: Refactor HTTP request creation
2025-08-14 19:45:06 +03:00
Avi Kivity
fe6e1071d3 Merge 'locator: util: optimize describe_ring' from Benny Halevy
This change includes basic optimizations to
locator::describe_ring, mainly caching the per-endpoint information in an unordered_map instead of looking them up in every inner-loop.

This yields an improvement of 20% in cpu time.
With 45 nodes organized as 3 dcs, 3 racks per dc, 5 nodes per rack, 256 tokens per
node, yielding 11520 ranges and 9 replicas per range, describe_ring took
Before: 30 milliseconds (2.6 microseconds per range)
After:  24 milliseconds (2.1 microseconds per range)

Add respective unit test for vnode keyspace
and for tablets.

Fixes #24887

* backport up to 2025.1 as describe_ring slowness was hit in the field with large clusters

Closes scylladb/scylladb#24889

* github.com:scylladb/scylladb:
  locator: util: optimize describe_ring
  locator: util: construct_range_to_endpoint_map: pass is_vnode=true to get_natural_replicas
  vnode_effective_replication_map: do_get_replicas: throw internal error if token not found in map
  locator: effective_replication_map: get_natural_replicas: get is_vnode param
  test: cluster: test_repair: add test_vnode_keyspace_describe_ring
2025-08-14 19:39:17 +03:00
Ernest Zaslavsky
a0016bd0cc s3_client: relocate req creation closer to usage
Move the creation of the `req` object to the point where it is
actually used, improving code clarity and reducing premature
initialization.
2025-08-14 16:18:43 +03:00
Ernest Zaslavsky
6ef2b0b510 s3_client: reformat long logging lines for readability
Break up excessively long logging statements to improve readability
and maintain consistent formatting across the codebase.
2025-08-14 16:18:43 +03:00
Ernest Zaslavsky
29960b83b5 s3_test: extract file writing code to a function
Reduce repeated code by extracting the file-writing code into a function.
2025-08-14 16:18:43 +03:00
Artsiom Mishuta
fcd511a531 test.py: add -k=EXPRESSION pytest argument support for boost tests.
Since boost tests run via pure pytest, we can finally run tests using the
-k=EXPRESSION pytest argument. The expression is applied to the test
function name, so it is possible to run a subset of test functions that
match the pattern across all boost tests.

The --skip and -k arguments are mutually exclusive,
since -k extends the --skip functionality.
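Why the two options overlap can be shown with a small sketch (the `skips_to_k_expression` helper is hypothetical, not part of test.py): any --skip list can be rewritten as an equivalent -k expression, matching the third and fourth example runs below.

```python
def skips_to_k_expression(skips: list[str]) -> str:
    """Translate --skip=PATTERN arguments into an equivalent pytest -k
    expression; this is why the two options are mutually exclusive."""
    return " and ".join(f"not {pattern}" for pattern in skips)

assert skips_to_k_expression(["incorrect", "passwords_are_salted"]) == \
    "not incorrect and not passwords_are_salted"
```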

examples:
./build/release/test/boost/auth_passwords_test --list_content
passwords_are_salted*
correct_passwords_authenticate*
incorrect_passwords_do_not_authenticate*

./test.py --mode=dev  -k="correct" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::incorrect_passwords_do_not_authenticate.dev.1
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  -k="not incorrect and not passwords_are_salted" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  --skip=incorrect --skip=passwords_are_salted -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1

./test.py --mode=dev  -k="correct and not incorrect" -vv test/boost/auth_passwords_test.cc
PASSED test/boost/auth_passwords_test.cc::correct_passwords_authenticate.dev.1
2025-08-14 14:45:40 +02:00
Artsiom Mishuta
d589f36645 test.py: small refactoring of how boost test arguments are built
During the migration of boost tests to pytest, a big portion of the logic
was used "as is", including poor code and bugs.

This PR refactors the function that builds the arguments for the pytest
command:

1) refactor how modes are provided
2) refactor how --skip is provided
3) remove the shlex.split workaround
2025-08-14 14:45:28 +02:00
Abhinav Jha
a0ee5e4b85 raft: replication test: change rpc_propose_conf_change test to SEASTAR_THREAD_TEST_CASE
The RAFT_TEST_CASE macro creates 2 test cases, one of them with random 20%
packet loss, named name_drops. The framework makes hard-coded assumptions
about the leader which don't hold well in the case of packet losses.

This short term fix disables the packet drop variant of the specified test.
It should be safe to re-enable it once the whole framework is re-worked to
remove these hard coded assumptions.

This PR fixes a bug. Hence we need to backport it.

Fixes: scylladb/scylladb#23816

Closes scylladb/scylladb#25489
2025-08-14 13:15:16 +02:00
Dawid Mędrek
6f1fb7cfb5 db/hints: Add new logs
We add new logs in a few places that may prove important
when debugging hinted handoff issues in the future.
2025-08-14 11:45:24 +02:00
Dawid Mędrek
d7bc9edc6c db/hints: Adjust log levels
Some of the logs could be clogging Scylla's logs, so we demote their
level to a lower one.

On the other hand, some of the logs would most likely not do that,
and they could be useful when debugging -- we promote them to debug
level.
2025-08-14 11:45:24 +02:00
Dawid Mędrek
2327d4dfa3 db/hints: Improve logs
Before these changes, the logs in hinted handoff often didn't provide
crucial information like the identifier of the node that hints were
being sent to. Also, some of the logs were misleading and referred to
other places in the code than the one where an exception or some other
situation really occurred.

We modify those logs, extending them by more valuable information
and fixing existing issues. What's more, all of the logs in
`hint_endpoint_manager` and `hint_sender` follow a consistent format
now:

```
<class_name>[<destination host ID>]:<function_name>: <message>
```

This way, we should always have AT LEAST the basic information.
2025-08-14 11:45:04 +02:00
Anna Stuchlik
841ba86609 doc: document support for new z3 instance types
This commit adds new z3 instances we now support to the list of GCP instance types.

Fixes https://github.com/scylladb/scylladb/issues/25438

Closes scylladb/scylladb#25446
2025-08-14 10:59:45 +02:00
Avi Kivity
66173c06a3 Merge 'Eradicate the ability to create new sstables with numerical sstable generation' from Benny Halevy
Remove support for generating numerical sstable generation for new sstables.
Loading such sstables is still supported but new sstables are always created with a uuid generation.
This is possible since:
* All live versions (since 5.4 / f014ccf369) now support uuid sstable generations.
* The `uuid_sstable_identifiers_enabled` config option (that is unused from version 2025.2 / 6da758d74c) controls only the use of uuid generations when creating new sstables. SSTables with uuid generations should still be properly loaded by older versions, even if `uuid_sstable_identifiers_enabled` is set to `false`.

Fixes #24248

* Enhancement, no backport needed

Closes scylladb/scylladb#24512

* github.com:scylladb/scylladb:
  streaming: stream_blob: use the table sstable_generation_generator
  replica: distributed_loader: process_upload_dir: use the table sstable_generation_generator
  sstables: sstable_generation_generator: stop tracking highest generation
  replica: table: get rid of update_sstables_known_generation
  sstables: sstable_directory: stop tracking highest_generation
  replica: distributed_loader: stop tracking highest_generation
  sstables: sstable_generation: get rid of uuid_identifiers bool class
  sstables_manager: drop uuid_sstable_identifiers
  feature_service: move UUID_SSTABLE_IDENTIFIERS to supported_feature_set
  test: cql_query_test: add test_sstable_load_mixed_generation_type
  test: sstable_datafile_test: move copy_directory helper to test/lib/test_utils
  test: database_test: move table_dir helper to test/lib/test_utils
2025-08-14 11:54:33 +03:00
Anna Stuchlik
1e5659ac30 doc: add the information about ScyllaDB C# Driver
This commit adds the driver to the list of ScyllaDB drivers,
including the information about:
- CDC integration (not available)
- Tablets (supported)

Fixes https://github.com/scylladb/scylladb/issues/25495

Closes scylladb/scylladb#25498
2025-08-14 11:29:52 +03:00
Patryk Jędrzejczak
6ad2b71d04 Merge 'LWT: communicate RPC errors to the user' from Petr Gusev
Currently, if the accept or prepare verbs fail on the replica side, the user only receives a generic error message of the form "something went wrong for this table", which provides no insight into the root cause. Additionally, these error messages are not logged by default, requiring the user to restart the node with trace or debug logging to investigate the issue.

This PR improves error handling for the accept and prepare verbs by preserving and propagating the original error messages, making it easier to diagnose failures.

backport: not needed, not a bug

Closes scylladb/scylladb#25318

* https://github.com/scylladb/scylladb:
  test_tablets_lwt: add test_error_message_for_timeout_due_to_uncertainty
  storage_proxy: preserve accept error messages
  storage_proxy: preserve prepare error message
  storage_proxy: fix log message
  exceptions.hh: fix message argument passing
  exceptions: add constructors that accept explicit error messages
2025-08-14 10:23:32 +02:00
Nadav Har'El
2d3c0eb25a test/alternator: speed up test_ttl_expiration_lsi_key
The Alternator test test_ttl.py::test_ttl_expiration_lsi_key is
currently the second-slowest test/alternator test, running a "whopping"
2.6 seconds (the total of two parameterizations - with vnodes and
with tablets).

This patch reduces it to 0.9 seconds.

The fix is simple: Unfortunately, tests that need to wait for actual
TTL expiration take time, but the test framework configures the TTL
scanner to have a period of half a second, so the wait should be on
average around 0.25 seconds. But the test code by mistake slept 1.2
seconds between retries. We even had a good "sleep" variable for the
amount of time we should sleep between retries, but forgot to use it.

So after lowering the sleep between retries, this test is still not
instantaneous - it still needs to wait up to 0.5 seconds for the
expirations to occur - but it's almost 3 times faster than before.
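The shape of the fix can be sketched as a generic polling loop (names are illustrative, not the actual Alternator test code): the interval between retries comes from the dedicated sleep variable rather than a hard-coded 1.2 seconds.

```python
import time

def wait_for_expiration(item_expired, sleep=0.25, timeout=10.0):
    """Poll until the TTL scanner has expired the item, sleeping the
    intended short interval between retries."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if item_expired():
            return True
        time.sleep(sleep)   # the bug was sleeping 1.2 s here instead
    return False

# With a half-second scanner period, the expected wait is ~0.25 s.
assert wait_for_expiration(lambda: True) is True
```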

While working on this test, I also used the opportunity to update its
comment which excused why we are testing LSI and not GSI. Its
suggestions of what is planned for GSI have already become a reality,
so let's update the comment to say so.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25386
2025-08-14 11:21:52 +03:00
Pavel Emelyanov
eaec7c9b2e Merge 'cql3: add default replication strategy to create_keyspace_statement' from Dario Mirovic
When creating a new keyspace, both replication strategy and replication
factor must be stated. For example:
`CREATE KEYSPACE ks WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'replication_factor' : 3 };`

This syntax is verbose, and in all but a few testing scenarios
`NetworkTopologyStrategy` is used.

This patch allows skipping replication strategy name, filling it with
`NetworkTopologyStrategy` when that happens. The following syntax is now
valid:
`CREATE KEYSPACE ks WITH REPLICATION = { 'replication_factor' : 3 };`
and will give the same result as the previous, more explicit one.

Fixes https://github.com/scylladb/scylladb/issues/16029

Backport is not needed. This is an enhancement for future releases.

Closes scylladb/scylladb#25236

* github.com:scylladb/scylladb:
  docs/cql: update documentation for default replication strategy
  test/cqlpy: add keyspace creation default strategy test
  cql3: add default replication strategy to `create_keyspace_statement`
2025-08-14 11:18:36 +03:00
Andrzej Jackowski
bf8be01086 test: audit: add logging of get_audit_log_list and set_of_rows_before
Without those logs, analysing some test failures is difficult.

Refs: scylladb/scylladb#25442

Closes scylladb/scylladb#25485
2025-08-14 09:53:05 +03:00
Ernest Zaslavsky
dd51e50f60 s3_client: add memory fallback in chunked_download_source
Introduce fallback logic in `chunked_download_source` to handle
memory exhaustion. When memory is low, feed the `deque` with only
one uncounted buffer at a time. This allows slow but steady progress
without getting stuck on the memory semaphore.
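The fallback logic can be sketched roughly as follows. This is a simplified Python model with a hypothetical `ChunkFeeder` class; the real implementation uses Seastar semaphores in C++.

```python
import threading

class ChunkFeeder:
    """Sketch: buffers normally count against a memory semaphore; when
    memory is exhausted, feed at most one uncounted buffer at a time so
    the download makes slow but steady progress."""
    def __init__(self, memory_units: int):
        self._mem = threading.Semaphore(memory_units)
        self._queue = []
        self._uncounted_in_flight = False

    def feed(self, buf: bytes) -> str:
        if self._mem.acquire(blocking=False):
            self._queue.append((buf, True))     # counted buffer
            return "counted"
        if not self._uncounted_in_flight:       # fallback path
            self._uncounted_in_flight = True
            self._queue.append((buf, False))    # single uncounted buffer
            return "uncounted"
        return "blocked"                        # wait for the consumer

feeder = ChunkFeeder(memory_units=1)
assert feeder.feed(b"a") == "counted"
assert feeder.feed(b"b") == "uncounted"   # memory exhausted -> fallback
assert feeder.feed(b"c") == "blocked"     # only one uncounted at a time
```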

Fixes: https://github.com/scylladb/scylladb/issues/25453
Fixes: https://github.com/scylladb/scylladb/issues/25262

Closes scylladb/scylladb#25452
2025-08-14 09:52:10 +03:00
Michał Chojnowski
72818a98e0 tests/lib: extract nondeterministic_choice_stack to test/lib
This util will be used in another test file in a later commit,
so hoist it to `test/lib`.
2025-08-14 02:06:34 +02:00
Michał Chojnowski
0ffe336887 sstables/trie/trie_traversal: extract comparable_bytes_iterator to its own file
In a later commit, this concept will be used in a place that's not
dependent on trie traversal routines. So extract it to its own header.
2025-08-14 02:06:34 +02:00
Michał Chojnowski
30dad06c9a sstables/mx: move clustering_info from writer.cc to types.hh
We will use this type as the input to the BTI row index writer.
Since it will be implemented in other translation units,
the definition of the type has to be moved to a header.
2025-08-14 02:06:33 +02:00
Michał Chojnowski
347e5c534a sstables/trie: allow comparable_bytes_iterator to return a mutable span
`comparable_bytes_iterator` is a concept for iterating over the
fragments of a key translated to BTI encoding.
In `trie_traversal.hh`, those fragments are
`std::span<const std::byte>`, because the traversal routines
have no use for modifying the fragments.

But in a later commit we will also have to deal with encoded
keys during row index writes, and the row index writer will want
to modify the bytes, to nudge the mismatch byte by one in order
to obtain a key separator.

Let's extend this concept to allow both span<const byte>
and span<byte>, so that it can be used in both situations.
2025-08-14 01:54:57 +02:00
Michał Chojnowski
4fb841346b dht/ring_position: add ring_position_view::weight()
This will be useful for the translation of ring positions
to BTI encoding.
We will use it in a later commit.
2025-08-14 01:54:57 +02:00
Wojciech Mitros
2ece08ba43 test: run mv tests depending on metrics on a standalone instance
The test_base_partition_deletion_with_metrics test case (and the batch
variant) uses the metric of view updates done during its runtime to check
if we didn't perform too many of them. The test runs in the cqlpy suite,
which runs all test cases sequentially on one Scylla instance. Because
of this, if another test case starts a process which generates view
updates and doesn't wait for it to finish before it exits, we may
observe too many view updates in test_base_partition_deletion_with_metrics
and fail the test.
In all test cases we make sure that all tables that were created
during the test are dropped at the end. However, that doesn't
stop the view building process immediately, so the issue can happen
even if we drop the view. I confirmed it by adding a test just before
test_base_partition_deletion_with_metrics which builds a big
materialized view and drops it at the end - the metrics check still failed.

The issue could be caused by any of the existing test cases where we create
a view and don't wait for it to be built. Note that even if we start adding
rows after creating the view, some of them may still be included in the view
building, as the view building process is started asynchronously. In such
a scenario, the view building also doesn't cause any issues with the data in
these tests - writes performed after view creation generate view updates
synchronously when they're local (and we're running a single Scylla server),
so the corresponding view updates generated during view building are redundant.

Because we have many test cases which could be causing this issue, instead
of waiting for the view building to finish in every single one of them, we
move the susceptible test cases to be run on separate Scylla instances, in
the "cluster" suite. There, no other test cases will influence the results.

Fixes https://github.com/scylladb/scylladb/issues/20379

Closes scylladb/scylladb#25209
2025-08-13 15:08:50 +03:00
Petr Gusev
3f287275b8 test_tablets_lwt: add test_error_message_for_timeout_due_to_uncertainty 2025-08-13 14:03:57 +02:00
Petr Gusev
8bd936b72c storage_proxy: preserve accept error messages 2025-08-13 13:43:12 +02:00
Petr Gusev
00c25d396f storage_proxy: preserve prepare error message 2025-08-13 13:43:12 +02:00
Petr Gusev
0724fafe47 storage_proxy: fix log message 2025-08-13 13:40:09 +02:00
Petr Gusev
ffaee20b62 exceptions.hh: fix message argument passing
The message argument is usually taken from a temporary variable
constructed with the format() function. It is more efficient to
pass it by value and move it along the constructor chain.
2025-08-13 13:39:52 +02:00
Benny Halevy
50abeb1270 locator: util: optimize describe_ring
This change includes basic optimizations to
locator::describe_ring, mainly caching the per-endpoint
information in an unordered_map instead of looking
them up in every inner-loop.

This yields an improvement of 20% in cpu time.
With 45 nodes organized as 3 dcs, 3 racks per dc, 5 nodes per rack, 256 tokens per
node, yielding 11520 ranges and 9 replicas per range, describe_ring took
Before: 30 milliseconds (2.6 microseconds per range)
After:  24 milliseconds (2.1 microseconds per range)
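The caching idea can be sketched like this (illustrative Python, not the actual locator code): per-endpoint details are computed once per describe_ring call and reused across all ranges, instead of being looked up in every inner loop.

```python
def describe_ring(ranges, replicas_for, endpoint_details_for):
    """Cache endpoint details in a dict (the unordered_map analogue)
    so each endpoint is looked up only once."""
    details_cache = {}
    result = []
    for rng in ranges:
        detailed = []
        for ep in replicas_for(rng):
            if ep not in details_cache:
                details_cache[ep] = endpoint_details_for(ep)
            detailed.append(details_cache[ep])
        result.append((rng, detailed))
    return result

calls = []
def lookup(ep):
    calls.append(ep)                       # count expensive lookups
    return {"dc": "dc1", "rack": "r1"}

describe_ring([1, 2, 3], lambda r: ["n1", "n2"], lookup)
# Each endpoint is looked up once despite appearing in every range:
assert calls == ["n1", "n2"]
```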

Add respective unit test of describe_ring for tablets.
A unit test for vnodes already exists in
test/nodetool/test_describering.py

Fixes #24887

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-13 12:42:25 +03:00
Benny Halevy
60d2cc886a locator: util: construct_range_to_endpoint_map: pass is_vnode=true to get_natural_replicas
First, let get_all_ranges return all vnode ranges
with a corrected wrapping range covering the [last token, first token)
range, such that all range start tokens are vnode tokens
and must be in the vnode replication map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-13 12:42:23 +03:00
Benny Halevy
195d02d64e vnode_effective_replication_map: do_get_replicas: throw internal error if token not found in map
Prevent a crash, especially in the is_vnode=true case,
if the key_token is not found in the map.
Rather than the undefined behavior when dereferencing the
end() iterator, throw an internal error with additional
logging about the search logic and parameters.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-13 12:41:03 +03:00
Benny Halevy
4d646636f2 locator: effective_replication_map: get_natural_replicas: get is_vnode param
Some callers, like `construct_range_to_endpoint_map` for describe_ring,
or `get_secondary_ranges` for alternator ttl pass vnode tokens (the
vnodes' start token), and therefore can benefit from the fast lookup
path in `vnode_effective_replication_map::do_get_replicas`.
Otherwise the vnode token is binary-searched in sorted_tokens using
token_metadata::first_token().

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-13 12:41:00 +03:00
Benny Halevy
f22a870a04 test: cluster: test_repair: add test_vnode_keyspace_describe_ring
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-13 12:39:40 +03:00
Yaniv Michael Kaul
b75799c21c skip instead of xfail test_change_replication_factor_1_to_0
It's a waste of good machine time to xfail this rather than just skip.
It takes >3m just to run the test and xfail.
We have a marker for it, we know why we skip it.

Fixes: https://github.com/scylladb/scylladb/issues/25310
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#25311
2025-08-13 10:32:22 +02:00
Ernest Zaslavsky
380c73ca03 s3_client: make memory semaphore acquisition abortable
Add `abort_source` to the `get_units` call for the memory semaphore
in the S3 client, allowing the acquisition process to be aborted.

Fixes: https://github.com/scylladb/scylladb/issues/25454

Closes scylladb/scylladb#25469
2025-08-13 08:48:55 +03:00
Jenkins Promoter
2de91d43d5 Update pgo profiles - x86_64 2025-08-13 07:52:17 +03:00
Jenkins Promoter
647d9fe45d Update pgo profiles - aarch64 2025-08-13 07:43:38 +03:00
Dario Mirovic
2ac37b4fde docs/cql: update documentation for default replication strategy
Update create-keyspace-statement section of ddl.rst since `class` is no longer mandatory.
Add an example for keyspace creation without specifying `class`.

Refs: #16029
2025-08-13 01:52:00 +02:00
Dario Mirovic
ef63d343ba test/cqlpy: add keyspace creation default strategy test
Add a test case for create keyspace default replication strategy.
It is expected that the default replication strategy is `NetworkTopologyStrategy`.

Refs: #16029
2025-08-13 01:52:00 +02:00
Dario Mirovic
bc8bb0873d cql3: add default replication strategy to create_keyspace_statement
When creating a new keyspace, both replication strategy and replication
factor must be stated. For example:
`CREATE KEYSPACE ks WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'replication_factor' : 3 };`

This syntax is verbose, and in all but a few testing scenarios
`NetworkTopologyStrategy` is used.

This patch allows skipping replication strategy name, filling it with
`NetworkTopologyStrategy` when that happens. The following syntax is now
valid:
`CREATE KEYSPACE ks WITH REPLICATION = { 'replication_factor' : 3 };`
and will give the same result as the previous, more explicit one.

Fixes #16029
2025-08-13 01:51:53 +02:00
Botond Dénes
72b2bbac4f pgo/pgo.py: use tablet repair API for repair
Since a1d7722 tablet keyspaces are not allowed to be repaired via the
old /storage_service/repair_async/{keyspace} API, instead the new
/storage_service/tablets/repair API has to be used. Adjust the repair
code and also add await_completion=true: the script just waits
for the repair to finish immediately after starting it.

Closes scylladb/scylladb#25455
2025-08-12 20:32:19 +03:00
Petr Gusev
ff89c03c7f exceptions: add constructors that accept explicit error messages
To improve debuggability, we need to propagate original error messages
from Paxos verbs to the user. This change adds constructors that take
an error message directly, enabling better error reporting.

Additionally, functions such as write_timeout_to_read,
write_failure_to_read etc are updated to use these message-based
constructors. These functions are used in storage_proxy::cas to
convert between different error types, and without this change,
they could lose the original error message during conversion.
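The message-preserving conversion can be sketched as follows (hypothetical Python names; the real code converts between C++ exception types in storage_proxy::cas):

```python
class ReadTimeoutError(Exception):
    pass

def write_timeout_to_read(write_err: Exception) -> ReadTimeoutError:
    """Carry the original error text across the error-type conversion
    instead of replacing it with a generic message."""
    return ReadTimeoutError(str(write_err))

original = TimeoutError("accept failed on replica: semaphore timeout")
converted = write_timeout_to_read(original)
# The root cause survives the conversion:
assert "semaphore timeout" in str(converted)
```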
2025-08-12 16:31:05 +02:00
Taras Veretilnyk
b7097b2993 database_test: fix abandoned futures in test_drop_quarantined_sstables
The lambda passed to do_with_cql_env_thread() in test_drop_quarantined_sstables
was mistakenly written as a coroutine.
This change replaces co_await with .get() calls on futures
and changes lambda return type to void.

Fixes scylladb/scylladb#25427

Closes scylladb/scylladb#25431
2025-08-12 13:31:06 +03:00
Patryk Jędrzejczak
a1b2f99dee Merge 'test: test_mv_backlog: fix to consider internal writes' from Michael Litvak
The PR fixes a test flakiness issue in test_mv_backlog related to reading metrics.

The first commit fixes a more general issue in the ScyllaMetrics helper class where it doesn't return the value of all matching lines when a specific shard is requested, but it breaks after the first match.

The second commit fixes a test issue where it expects exactly one write to be throttled, not taking into account other internal writes that may be executed during this time.

Fixes https://github.com/scylladb/scylladb/issues/23139

backport to improve CI stability - test only change

Closes scylladb/scylladb#25279

* https://github.com/scylladb/scylladb:
  test: test_mv_backlog: fix to consider internal writes
  test/pylib/rest_client: fix ScyllaMetrics filtering
2025-08-12 10:05:15 +02:00
Wojciech Przytuła
7600ccfb20 Fix link to ScyllaDB manual
The link would point to outdated OS docs. I fixed it to point to up-to-date Enterprise docs.

Closes scylladb/scylladb#25328
2025-08-12 10:33:06 +03:00
Avi Kivity
ac1f6aa0de auth: resource: simplify some range transformations
Supply the member function directly to std::views::transform,
rather than going through a lambda.

Closes scylladb/scylladb#25419
2025-08-12 10:30:06 +03:00
Karol Nowacki
22a133df9b service/vector_store_client: Add live configuration update support
Enable runtime updates of vector_store_uri configuration without
requiring server restart.
This allows dynamically enabling, disabling, or switching the vector search node endpoint on the fly.
2025-08-12 08:12:53 +02:00
Karol Nowacki
152274735e test/boost/vector_store_client_test.cc: Refactor vector store client test
Consolidate consecutive setup functions into a dedicated helper.
Extract test table creation into a separate function.
Remove redundant assertions to improve clarity.
2025-08-12 08:12:53 +02:00
Karol Nowacki
858c423501 service/vector_store_client: Refactor host_port struct created
This new struct groups the host and port.
2025-08-12 08:12:53 +02:00
Karol Nowacki
dd147cd8e5 service/vector_store_client: Refactor HTTP request creation
Introduce a lightweight wrapper for seastar::http::experimental::client.
This wrapper simplifies request creation by automatically injecting the host name.
2025-08-12 08:12:53 +02:00
Tomasz Grabiec
9fd312d157 Merge 'row_cache: add memtable overlap checks elision optimization for tombstone gc' from Botond Dénes
https://github.com/scylladb/scylladb/issues/24962 introduced memtable overlap checks to cache tombstone GC. This was observed to be very strict and greatly reduce the effectiveness of tombstone GC in the cache, especially for MV workloads, which regularly recycle old timestamps into new writes, so the memtable often has a smaller min live timestamp than the timestamps of the tombstones in the cache.

When creating a new memtable, save a snapshot of the tombstone gc state. This snapshot is used later to exclude this memtable from overlap checks for tombstones whose tokens have, in the snapshot, an expiry time larger than that of the tombstone, meaning: all writes in this memtable were produced at a point in time when the current tombstone had already expired. This has the following implications:
* The partition the tombstone is part of was already repaired at the time the memtable was created.
* All writes in the memtable were produced *after* this tombstone's expiry time, these writes cannot be possibly relevant for this tombstone.

Based on this, such memtables are excluded from the overlap checks. With adequately frequent memtable flushes -- so that the tombstone gc state snapshot is refreshed -- most memtables should be excluded from overlap checks, greatly helping the cache's tombstone GC efficiency.
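The elision rule reduces to a single comparison, sketched here with illustrative names (assuming the snapshot records the time the memtable was created):

```python
def memtable_needs_overlap_check(memtable_snapshot_time: float,
                                 tombstone_expiry_time: float) -> bool:
    """If the tombstone had already expired when the memtable's
    tombstone-gc snapshot was taken, every write in that memtable
    postdates the tombstone's expiry and the memtable can be skipped."""
    return tombstone_expiry_time >= memtable_snapshot_time

# Tombstone expired (t=100) before the memtable was created (t=200):
assert memtable_needs_overlap_check(200, 100) is False
# Tombstone still live when the memtable was created: must check.
assert memtable_needs_overlap_check(100, 200) is True
```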

Fixes: https://github.com/scylladb/scylladb/issues/24962

Fixes a regression introduced by https://github.com/scylladb/scylladb/pull/23255 which was backported to all releases, needs backport to all releases as well

Closes scylladb/scylladb#25033

* github.com:scylladb/scylladb:
  docs/dev/tombstone.md: document the memtable overlap check elision optimization
  test/boost/row_cache_test: add test for memtable overlap check elision
  db/cache_mutation_reader: obtain gc-before and min-live-ts lazily
  mutation/mutation_compactor: use max_purgeable::can_purge and max_purgeable::purge_result
  db/cache_mutation_reader: use max_purgeable::can_purge()
  replica/table: get_max_purgeable_fn_for_cache_underlying_reader(): use max_purgeable::combine()
  replica/database: memtable_list::get_max_purgeable(): set expiry-threshold
  compaction/compaction_garbage_collector: max_purgeable: add expiry_treshold
  replica/table: propagate gc_state to memtable_list
  replica/memtable_list: add tombstone_gc_state* member
  replica/memtable: add tombstone_gc_state_snapshot
  tombstone_gc: introduce tombstone_gc_state_snapshot
  tombstone_gc: extract shared state into shared_tombstone_gc_state
  tombstone_gc: per_table_history_maps::_group0_gc_time: make it a value
  tombstone_gc: fold get_group0_gc_time() into its caller
  tombstone_gc: fold get_or_create_group0_gc_time() into update_group0_refresh_time()
  tombstone_gc: fold get_or_create_repair_history_for_table() into update_repair_time()
  tombstone_gc: refactor get_or_greate_repair_history_for_table()
  replica/memtable_list: s/min_live_timestamp()/get_max_purgeable()/
  db/read_context: return max_purgeable from get_max_purgeable()
  compaction/compaction_garbage_collector: add formatter for max_purgeable
  mutation: move definition of gc symbols to compaction.cc
  compaction/compaction_garbage_collector: refactor max_purgeable into a class
  test/boost/row_cache_test: refactor test_populating_reader_tombstone_gc_with_data_in_memtable
  test: rewrite test_compacting_reader_tombstone_gc_with_data_in_memtable in C++
  test/boost/row_cache_test: refactor cache tombstone GC with memtable overlap tests
2025-08-11 23:54:59 +02:00
Michał Chojnowski
3017dbb204 sstables/trie: add trie traversal routines
`trie::node_reader`, added in a previous series, contains
encoding-aware logic for traversing a single node
(or a batch of nodes) during a trie search.

This commit adds encoding-agnostic functions which drive
the `trie::node_reader` in a loop to traverse the whole branch.

Together, the added functions (`traverse`, `step`, `step_back`)
and the data structure they modify (`ancestor_trail`) constitute
a trie cursor. We might later wrap them into some `trie_cursor`
class, but regardless of whether we are going to do that,
keeping them (also) as free functions makes them easier to test.

Closes scylladb/scylladb#25396
2025-08-11 19:15:09 +03:00
Botond Dénes
660ea9202a docs/dev/tombstone.md: document the memtable overlap check elision optimization 2025-08-11 17:20:12 +03:00
Botond Dénes
65c770f21a test/boost/row_cache_test: add test for memtable overlap check elision 2025-08-11 17:20:12 +03:00
Botond Dénes
7adbb1bd17 db/cache_mutation_reader: obtain gc-before and min-live-ts lazily
Obtaining the gc-before time, or the min-live timestamps (with the
expiry threshold) is not always trivial, so defer it until we know it is
needed. Not all reads will attempt to garbage-collect tombstones; these
reads can now avoid this work.
The downside is that the partition key has to be copied and stored, as
it is necessary for obtaining the min-live timestamp later.
2025-08-11 17:20:12 +03:00
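The deferral described in the commit above can be sketched in Python (illustrative only; `LazyGcBefore` and `expensive_gc_before` are hypothetical names, not ScyllaDB's API):

```python
class LazyGcBefore:
    """Defer an expensive computation until a tombstone actually needs it."""
    def __init__(self, compute):
        self._compute = compute      # e.g. captures the copied partition key
        self._value = None
        self._computed = False

    def get(self):
        if not self._computed:       # pay the cost only on first use
            self._value = self._compute()
            self._computed = True
        return self._value

calls = []
def expensive_gc_before():
    calls.append(1)                  # track how often we really compute
    return 12345

gc_before = LazyGcBefore(expensive_gc_before)
# A read that never tries to GC tombstones never triggers the computation.
assert calls == []
# The first tombstone that needs the value pays once; later uses are free.
assert gc_before.get() == 12345
assert gc_before.get() == 12345
assert len(calls) == 1
```

The trade-off the commit mentions (copying the partition key) corresponds here to whatever state `compute` has to capture so it can run later.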
Botond Dénes
f4b0c384fb mutation/mutation_compactor: use max_purgeable::can_purge and max_purgeable::purge_result
Use the optimized can_purge() check instead of the old stricter
direct timestamp comparison method.
2025-08-11 17:20:12 +03:00
Botond Dénes
92e8d2f9b2 db/cache_mutation_reader: use max_purgeable::can_purge()
Use the optimized can_purge() check instead of the old stricter
direct timestamp comparison method.
2025-08-11 17:20:12 +03:00
Botond Dénes
4e15d32151 replica/table: get_max_purgeable_fn_for_cache_underlying_reader(): use max_purgeable::combine()
Combine the max purgeable values, instead of just combining the
timestamp values. The old way is still correct, but loses the
timestamp explosion optimization, which allows the cache reader to drop
timestamps from the overlap checks.
2025-08-11 17:20:12 +03:00
Botond Dénes
bd32d41cad replica/database: memtable_list::get_max_purgeable(): set expiry-threshold
Use the newly introduced expiry_threshold field of max_purgeable, to help
exclude memtables from the overlap check if possible.
2025-08-11 17:20:12 +03:00
Botond Dénes
cfac9691ff compaction/compaction_garbage_collector: max_purgeable: add expiry_threshold
Allow avoiding overlap checks in the case where the source of the
min-live timestamp is known to only contain data which was written
*after* the expiry threshold. The expiry threshold is the upper bound of
tombstone.deletion_time values that had already expired at the time the
threshold was obtained. Consequently, any write originating from after
this point in time was generated when such tombstones were already
expired, so these writes are not relevant for overlap checks with the
tombstone and their min-live timestamp can be ignored.
This is important for MV workloads, where writes generated now can have
timestamps going far back in time, possibly blocking tombstone GC of
much older [shadowable] tombstones.
2025-08-11 17:20:11 +03:00
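The rule described in the commit above can be illustrated with a small Python sketch (hypothetical function and parameter names; the real logic lives in C++ in compaction/compaction_garbage_collector):

```python
def can_skip_overlap_check(tombstone_deletion_time,
                           source_oldest_write_time,
                           expiry_threshold):
    """Illustrative only: a source can be excluded from the overlap check
    when all of its data was written after the expiry threshold, i.e. at a
    time when a tombstone with this deletion_time was already expired."""
    return (tombstone_deletion_time <= expiry_threshold
            and source_oldest_write_time > expiry_threshold)

# MV-style scenario: the memtable only holds writes generated now (t=200),
# while the tombstone (deletion_time=50) had expired before threshold t=100,
# so its min-live timestamp can be ignored even if it goes far back in time.
assert can_skip_overlap_check(50, 200, 100)
# A tombstone not yet expired at the threshold still needs the check.
assert not can_skip_overlap_check(150, 200, 100)
# A source with writes from before the threshold still needs the check.
assert not can_skip_overlap_check(50, 80, 100)
```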
Patryk Jędrzejczak
e14c5e3890 Merge 'raft: enforce odd number of voters in group0' from Emil Maskovsky
raft: enforce odd number of voters in group0

Implement odd number voter enforcement in the group0 voter calculator to ensure proper Raft consensus behavior. Raft consensus requires a majority of voters to make decisions, and an odd number of voters is preferred because an even number doesn't add additional reliability but introduces
the risk of scenarios where no group can make progress. If an even number of voters is divided into two groups of equal size during a network
partition, neither group will have a majority and both will be unable to commit new entries. With an odd number of voters, such equal partition
scenarios are impossible (unless the network is partitioned into at least three groups).

Fixes: scylladb/scylladb#23266

No backport: This is a new change that is to be only deployed in the new version, so it will not be backported.

Closes scylladb/scylladb#25332

* https://github.com/scylladb/scylladb:
  raft: enforce odd number of voters in group0
  test/raft: adapt test_tablets_lwt.py for odd voter number enforcement
  test/raft: adapt test_raft_no_quorum.py for odd voter enforcement
2025-08-11 15:44:21 +02:00
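The voter-count argument above can be sketched in Python (an illustrative toy, not ScyllaDB's actual voter calculator; `voters_target` is a hypothetical name):

```python
def voters_target(alive_nodes):
    """Pick an odd number of voters: an even count adds no fault
    tolerance but allows tied (equal-size) partitions."""
    n = len(alive_nodes)
    count = n if n % 2 == 1 else max(n - 1, 1)
    return alive_nodes[:count]

# A majority of k voters tolerates (k - 1) // 2 failures: 3 and 4 voters
# both tolerate only 1 failure, so the 4th voter buys nothing and risks
# a 2-2 split where neither side has a majority.
assert len(voters_target(list(range(4)))) == 3
assert len(voters_target(list(range(3)))) == 3
# Matches the test adaptation below: 2 nodes -> 1 voter.
assert len(voters_target(list(range(2)))) == 1
```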
Benny Halevy
23ac80fc6b utils: stall_free: detect clear_gently method of const payload types
Currently, when a container or smart pointer holds a const payload
type, utils::clear_gently does not detect the object's clear_gently
method as the method is non-const and requires a mutable object,
as in the following example in class tablet_metadata:
```
    using tablet_map_ptr = foreign_ptr<lw_shared_ptr<const tablet_map>>;
    using table_to_tablet_map = std::unordered_map<table_id, tablet_map_ptr>;
```

That said, when a container is cleared gently, the elements it holds
are destroyed anyhow, so we'd like to allow clearing them gently before
destruction.

This change still doesn't allow directly calling utils::clear_gently
on const objects.

Also add respective unit tests.

Fixes #24605

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-11 14:22:01 +03:00
Benny Halevy
cb9db2f396 utils: stall_free: clear gently a foreign shared ptr only when use_count==1
Unlike clear_gently of SharedPtr, clear_gently of a
`foreign_ptr<shared_ptr<T>>` calls clear_gently on the contained object
even if it's still shared and may still be in use.

This change examines the foreign shared pointer's use_count
and calls clear_gently on the shared object only when
its use_count reaches 1.

Fixes #25026

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-11 14:21:32 +03:00
Tomasz Grabiec
f7c001deff Merge 'key: clustering_bounds_comparator: avoid thread_local initialization guard overhead' from Avi Kivity
I noticed clustering_bounds_comparator was paying for an unnecessary
thread_local initialization guard. This series switches the variable
to constinit initialization, removing the guard.

Performance measurements (perf-simple-query) show an unimpressive
20-instruction-per-op reduction. However, each instruction counts!

Before:

```
throughput:
	mean=   203642.54 standard-deviation=1102.99
	median= 204328.69 median-absolute-deviation=955.56
	maximum=204624.13 minimum=202222.19
instructions_per_op:
	mean=   42097.59 standard-deviation=40.07
	median= 42111.83 median-absolute-deviation=30.65
	maximum=42139.88 minimum=42044.91
cpu_cycles_per_op:
	mean=   22664.81 standard-deviation=131.28
	median= 22581.10 median-absolute-deviation=111.57
	maximum=22832.30 minimum=22553.24
```

After:

```
throughput:
	mean=   204397.73 standard-deviation=2277.71
	median= 204942.95 median-absolute-deviation=2191.54
	maximum=207588.30 minimum=202162.80
instructions_per_op:
	mean=   42087.21 standard-deviation=27.30
	median= 42092.75 median-absolute-deviation=20.33
	maximum=42108.33 minimum=42041.51
cpu_cycles_per_op:
	mean=   22589.79 standard-deviation=219.24
	median= 22544.82 median-absolute-deviation=191.98
	maximum=22835.11 minimum=22303.52
```

(Very) minor performance improvement, no backport suggested.

Closes scylladb/scylladb#25259

* github.com:scylladb/scylladb:
  keys: clustering_bounds_comparator: make thread_local _empty_prefix constinit
  keys: make empty creation clustering_key_prefix constexpr
  managed_bytes: make empty managed_bytes constexpr friendly
  keys: clustering_bounds_comparator: make _empty_prefix a prefix
2025-08-11 13:20:38 +02:00
Anna Stuchlik
1322f301f6 doc: add support for RHEL 10
This commit adds RHEL 10 to the list of supported platforms.

Fixes https://github.com/scylladb/scylladb/issues/25436

Closes scylladb/scylladb#25437
2025-08-11 13:13:37 +02:00
Israel Fruchter
2da26d1fc1 Update tools/cqlsh submodule (v6.0.26)
* tools/cqlsh 02ec7c57...aa1a52c1 (6):
  > build-push.yaml: upgrade cibuildwheel to latest
  > build-push.yml: skip python 3.8 and PyPy builds
  > cqlshlib: make NetworkTopologyStrategy default for autocomplete
  > default to setuptools_scm based version when not packaged
  > chore(deps): update pypa/cibuildwheel action to v2.23.0

Closes scylladb/scylladb#25420
2025-08-11 13:07:47 +03:00
Artsiom Mishuta
dac04a5b97 fix(test.py) incorrect markers argument in boost tests
The pytest markers argument can be space-separated, like "not unstable".
To pass such an argument properly in the CLI (bash) command we should use
double quotes, because the arguments are tokenized with shlex.split,
which splits on spaces.
While we don't yet support markers in C++ tests, we pass through all
pytest arguments.

Tested locally with the command:
./tools/toolchain/dbuild  ./test.py --markers="not unstable" test/boost/auth_passwords_test.cc

before change: no tests ran in 1.12s
after: 8 passed in 2.45s

Closes scylladb/scylladb#25394
2025-08-11 10:43:34 +03:00
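The quoting issue the commit fixes can be demonstrated with Python's `shlex` (a standalone illustration, not the actual test.py code):

```python
import shlex

marker_expr = "not unstable"

# Without quoting, the marker expression is split into two argv entries,
# so pytest receives a broken -m expression:
cmd = shlex.split(f"pytest -m {marker_expr}")
assert cmd == ["pytest", "-m", "not", "unstable"]

# Double-quoting the expression keeps it as a single argument,
# which is what the fix does:
cmd = shlex.split(f'pytest -m "{marker_expr}"')
assert cmd == ["pytest", "-m", "not unstable"]
```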
Patryk Jędrzejczak
7b77c6cc4a docs: Raft recovery procedure: recommend verifying participation in Raft recovery
This instruction adds additional safety. The faster we notice that
a node didn't restart properly, the better.

The old gossip-based recovery procedure had a similar recommendation
to verify that each restarting node entered `RECOVERY` mode.

Fixes #25375

This is a documentation improvement. We should backport it to all
branches with the new recovery procedure, so 2025.2 and 2025.3.

Closes scylladb/scylladb#25376
2025-08-11 09:21:29 +03:00
Avi Kivity
f49b63f696 tools: toolchain: dbuild: forward container registry credentials
Docker hub rate-limits unauthenticated image pulls, so forward
the host's credentials to the container. This prevents rate limit
errors when running nested containers.

Try the locations for the credentials in order and bind-mount the
first that exists to a location that gets picked up.

Verified with `podman login --get-login docker.io` in the container.

Closes scylladb/scylladb#25354
2025-08-11 09:05:57 +03:00
Botond Dénes
3b1f414fcf replica/table: propagate gc_state to memtable_list 2025-08-11 07:09:19 +03:00
Botond Dénes
9d00d7e08d replica/memtable_list: add tombstone_gc_state* member
To be passed down to the memtable.
2025-08-11 07:09:19 +03:00
Botond Dénes
ef8a21b4cf replica/memtable: add tombstone_gc_state_snapshot
To be used for possibly excluding the memtable from overlap checks with
the cache/sstables, in memtable_list::get_max_purgeable().
2025-08-11 07:09:19 +03:00
Botond Dénes
ab633590f1 tombstone_gc: introduce tombstone_gc_state_snapshot
Returns gc-before times, identical to what tombstone_gc_state would have
returned at the point of taking the snapshot.
2025-08-11 07:09:14 +03:00
Botond Dénes
614d17347a tombstone_gc: extract shared state into shared_tombstone_gc_state
Instead of storing it partially in tombstone_gc and partially in an
external map. Move all external parts into the new
shared_tombstone_gc_state. This new class is responsible for
keeping and updating the repair history. tombstone_gc_state just keeps
const pointers to the shared state as before and is only responsible for
querying the tombstone gc before times.
This separation makes the code easier to follow and also enables further
patching of tombstone_gc_state.
2025-08-11 07:09:14 +03:00
Botond Dénes
3a54379330 tombstone_gc: per_table_history_maps::_group0_gc_time: make it a value
No reason for it to be a shared pointer, or even a pointer at all. When
the pointer is not initialized, gc_clock::time_point::min() is used as
the group0 gc time, so we can just replace with a gc_clock::time_point
value initialized to min() and do away with an unnecessary indirection
as well as an allocation. The latter will be even more important after
the next patches.
2025-08-11 07:09:14 +03:00
Botond Dénes
aa43396aac tombstone_gc: fold get_group0_gc_time() into its caller
It has just one caller. This fold makes the code simpler and facilitates
further patching.
2025-08-11 07:09:14 +03:00
Botond Dénes
faa2b5b4d4 tombstone_gc: fold get_or_create_group0_gc_time() into update_group0_refresh_time()
Its only caller. Makes the code simpler and facilitates further
patching.
2025-08-11 07:09:13 +03:00
Botond Dénes
e9d211bbcd tombstone_gc: fold get_or_create_repair_history_for_table() into update_repair_time()
Its only caller. Makes the code simpler and facilitates further
patching.
2025-08-11 07:09:13 +03:00
Botond Dénes
b9f0cabead tombstone_gc: refactor get_or_create_repair_history_for_table()
This method has 3 lookups into the reconcile history maps in the worst
case. Reduce to just one. Makes the code more streamlined and prepares
the groundwork for the next patch.
2025-08-11 07:09:13 +03:00
Botond Dénes
1d3a3163a3 replica/memtable_list: s/min_live_timestamp()/get_max_purgeable()/
Also change the return type to max_purgeable, instead of a raw
timestamp. Prepares for further patching of this code.
2025-08-11 07:09:13 +03:00
Botond Dénes
5d69ef5e8b db/read_context: return max_purgeable from get_max_purgeable()
Instead of just the timestamp. Soon more fields will be used.
2025-08-11 07:09:13 +03:00
Botond Dénes
1d2cc6ef12 compaction/compaction_garbage_collector: add formatter for max_purgeable
It is more than just a timestamp already, and it is about to receive
some additional fields.
2025-08-11 07:09:13 +03:00
Botond Dénes
6078c15116 mutation: move definition of gc symbols to compaction.cc
We are used to symbol definitions being grouped in one .cc file, but a
symbol declaration and definition living in separate modules
(subfolders) is surprising.
Relocate always_gc, never_gc, can_always_purge and can_never_purge to
compaction/compaction.cc, from mutation/mutation_partition.cc. The
declarations of these symbols are in
compaction/compaction_garbage_collector.hh.
2025-08-11 07:09:13 +03:00
Botond Dénes
ef7d49cd21 compaction/compaction_garbage_collector: refactor max_purgeable into a class
Make members private, add getters and constructors.
This struct will get more functionality soon, so class is a better fit.
2025-08-11 07:09:13 +03:00
Botond Dénes
c150bdd59c test/boost/row_cache_test: refactor test_populating_reader_tombstone_gc_with_data_in_memtable
This test currently uses gc_grace_seconds=0. The introduction
of memtable overlap elision will break it because the
optimization is always active with this setting.
Switch the test to use tombstone-gc=repair, which allows for greater
control over when the memtable overlap elision is triggered.
This requires a move to vnodes, as tombstone-gc=repair doesn't
currently work with RF=1, and using RF=3 won't work with tablets.
2025-08-11 07:09:13 +03:00
Botond Dénes
c052f2ad1d test: rewrite test_compacting_reader_tombstone_gc_with_data_in_memtable in C++
This test will soon need to be changed to use tombstone-gc=repair. This
cannot work as of now, as the test uses a single-node cluster.
The options are the following:
* Make it use more than one node
* Make repair work with single node clusters
* Rewrite in C++ where repair can be done synthetically

We chose the last option, as it is the simplest one both in terms of code
and runtime footprint.

The new test is in test/boost/row_cache_test.cc
Two changes were made during the migration:
* Change the name to
  test_populating_reader_tombstone_gc_with_data_in_memtable
  to better express which cache component this test is targeting;
* Use NullCompactionStrategy on the table instead of disabling
  auto-compaction.
2025-08-11 07:09:13 +03:00
Botond Dénes
e4c048ada1 test/boost/row_cache_test: refactor cache tombstone GC with memtable overlap tests
These tests currently use tombstone-gc=immediate. The introduction
of memtable overlap elision will break these tests because the
optimization is always active with this tombstone-gc.
Switch the tests to use tombstone-gc=repair, which allows for greater
control over when the memtable overlap elision is triggered.
This requires a move to vnodes, as tombstone-gc=repair doesn't
work with RF=1 currently, and using RF=3 won't work with tablets.
2025-08-11 07:09:13 +03:00
Asias He
2ecd42f369 feature_service: Add TABLET_INCREMENTAL_REPAIR feature 2025-08-11 10:10:08 +08:00
Asias He
b226ad2f11 tablet_allocator: Add tablet_force_tablet_count_increase and decrease
It is useful to increase and decrease the tablet count in the test for
tablet split and merge testing.
2025-08-11 10:10:08 +08:00
Asias He
1bf59ebba0 repair: Add incremental helpers
This adds the helpers which are needed by both repair and compaction to
add incremental repair support.
2025-08-11 10:10:08 +08:00
Asias He
b86f554760 sstable: Add being_repaired to sstable
This in-memory field is set by incremental repair when the sstable
participates in the repair.
2025-08-11 10:10:08 +08:00
Asias He
f50cd94429 sstables: Add set_repaired_at to metadata_collector 2025-08-11 10:10:08 +08:00
Asias He
ac9d33800a mutation_compactor: Introduce add operator to compaction_stats
It is needed to combine the stats of two compactions.
2025-08-11 10:10:07 +08:00
Asias He
5377f87e5a tablet: Add sstables_repaired_at to system.tablets table
It is used to store the repaired_at for each tablet.
2025-08-11 10:10:07 +08:00
Asias He
8db18ac74e test: Fix drain api in task_manager_client.py
The POST method should be used.
2025-08-11 10:10:07 +08:00
Avi Kivity
6daa6178b1 scripts: pull_github_pr.sh: reject unintended submodule changes
It is easy for submodule changes to slip through during rebase (if
the developer uses the terrible `git add -u` command) and
for a maintainer to miss it (if they don't go over each change after
a rebase).

Protect against such mishaps by checking whether a submodule was updated
(or .gitmodules itself was changed) and aborting the operation.

If the pull request title contains "submodule", assume the operation
was intended.

Allow bypassing the check with --allow-submodule.

Closes scylladb/scylladb#25418
2025-08-10 11:48:34 +03:00
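The guard described above boils down to a small decision rule; here is a minimal Python sketch of it (hypothetical names, not the actual pull_github_pr.sh logic, which is a shell script operating on git output):

```python
def unintended_submodule_change(changed_paths, submodule_paths,
                                pr_title, allow_flag=False):
    """Return True when the operation should be aborted: a submodule or
    .gitmodules changed, the PR title does not mention "submodule", and
    the --allow-submodule override was not given."""
    if allow_flag or "submodule" in pr_title.lower():
        return False
    touched = set(changed_paths)
    return ".gitmodules" in touched or bool(touched & set(submodule_paths))

# A submodule bump slipping into an unrelated PR is rejected...
assert unintended_submodule_change(["seastar"], ["seastar"], "fix a bug")
# ...but an intentional submodule update passes, as does the override.
assert not unintended_submodule_change(["seastar"], ["seastar"],
                                       "Update seastar submodule")
assert not unintended_submodule_change(["seastar"], ["seastar"],
                                       "fix a bug", allow_flag=True)
assert not unintended_submodule_change(["src/db.cc"], ["seastar"], "fix a bug")
```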
Michael Litvak
276a09ac6e test: test_mv_backlog: fix to consider internal writes
The test executes a single write, fetching metrics before and after the
write, and expects the total throttled writes count to be increased
exactly by one.

However, other internal writes (compaction for example) may be executed
during this time and be throttled, causing the metrics to be increased
by more than expected.

To address this, we filter the metrics by the scheduling group label of
the user write, to filter out the compaction writes that run in the
compaction scheduling group.

Fixes scylladb/scylladb#23139
2025-08-10 10:31:02 +02:00
Michael Litvak
5c28cffdb4 test/pylib/rest_client: fix ScyllaMetrics filtering
In the ScyllaMetrics `get` function, when requesting the value for a
specific shard, it is expected to return the sum of all values of
metrics for that shard that match the labels.

However, it would return the value of the first matching line it finds
instead of summing all matching lines.

For example, if we have two lines for one shard like:
some_metric{scheduling_group_name="compaction",shard="0"} 1
some_metric{scheduling_group_name="sl:default",shard="0"} 2

The result of this call would be 1 instead of 3:
get('some_metric', shard="0")

We fix this to sum all matching lines.

The filtering of lines by labels is fixed to allow specifying only some
of the labels. Previously, for a line to match the filter, either the
filter had to be empty, or all the labels in the metric line had to be
specified in the filter parameter and match its value, which is
unexpected, and breaks when more labels are added.
We also simplify the function signature and the implementation - instead
of having the shard as a separate parameter, it can be specified as a
label, like any other label.
2025-08-10 10:16:00 +02:00
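The fixed behavior can be illustrated with a self-contained Python re-implementation (hedged: `metric_sum` is a hypothetical name, not the real ScyllaMetrics code, and this toy parser assumes label values contain no commas):

```python
import re

def metric_sum(lines, name, **labels):
    """Sum every line of metric `name` whose labels include all of the
    requested label/value pairs; unspecified labels are ignored."""
    total = 0.0
    pat = re.compile(r'^(\w+)\{(.*)\}\s+(\S+)$')
    for line in lines:
        m = pat.match(line)
        if not m or m.group(1) != name:
            continue
        line_labels = dict(kv.split("=", 1) for kv in m.group(2).split(","))
        line_labels = {k: v.strip('"') for k, v in line_labels.items()}
        if all(line_labels.get(k) == v for k, v in labels.items()):
            total += float(m.group(3))
    return total

lines = [
    'some_metric{scheduling_group_name="compaction",shard="0"} 1',
    'some_metric{scheduling_group_name="sl:default",shard="0"} 2',
]
# All matching lines are summed (previously only the first was returned):
assert metric_sum(lines, "some_metric", shard="0") == 3
# Partial label filters are allowed; the shard is just another label:
assert metric_sum(lines, "some_metric", scheduling_group_name="sl:default") == 2
```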
Avi Kivity
c2a2e11c40 Merge 'Prepare the way for incremental repair' from Botond Dénes
With incremental repair, each replica::compaction_group will have 3 logical compaction groups: repaired, repairing and unrepaired. A group is defined as a set of sstables that can be compacted together. The logical groups will share the same instance of sstable_set, but each will have its own logical sstable set. The existing compaction::table_state is a view for a logical compaction group, so it makes sense for each replica::compaction_group to have multiple views. Each view will provide to the compaction layer only the sstables that belong to it. That way, we preserve the existing interface between the replica and compaction layers, where each compaction::table_state represents a single logical group.
The idea is that all the incremental repair knowledge is confined to the repair and replica layers; compaction doesn't need to know about it. It just works on logical groups, and what each represents doesn't matter from the perspective of the subsystem. This is the best way forward to avoid violating layers and to reduce the maintenance burden in the long run.
We also rename table_state to compaction_group_view, since it's a better description, and working with multiple terms is confusing. A placeholder for implementing the sstable classifier is also left in tablet_storage_group_manager; for the time being, all sstables go to the unrepaired logical set, which preserves the current behavior.

New functionality, no backport required

Closes scylladb/scylladb#25287

* github.com:scylladb/scylladb:
  test: Add test that compaction doesn't cross logical group boundary
  replica: Introduce views in compaction_group for incremental repair
  compaction: Allow view to be added with compaction disabled
  replica: Futurize retrieval of sstable sets in compaction_group_view
  treewide: Futurize estimation of pending compaction tasks
  replica: Allow compaction_group to have more than one view
  Move backlog tracker to replica::compaction_group
  treewide: Rename table_state to compaction_group_view
  tests: adjust for incremental repair
2025-08-09 17:21:17 +03:00
Emil Maskovsky
7c54401d3d raft: enforce odd number of voters in group0
Implement odd number voter enforcement in the group0 voter calculator to
ensure proper Raft consensus behavior. Raft consensus requires a majority
of voters to make decisions, and an odd number of voters is preferred
because an even number doesn't add additional reliability but introduces
the risk of scenarios where no group can make progress. If an even number
of voters is divided into two groups of equal size during a network
partition, neither group will have a majority and both will be unable to
commit new entries. With an odd number of voters, such equal partition
scenarios are impossible (unless the network is partitioned into at least
three groups).

Fixes: scylladb/scylladb#23266
2025-08-08 19:49:20 +02:00
Emil Maskovsky
29ddb2aa18 test/raft: adapt test_tablets_lwt.py for odd voter number enforcement
The test_lwt_timeout_while_creating_paxos_state_table was failing after
implementing odd number voter enforcement in the group0 voter calculator.

Previously with 2 nodes:
- 2 nodes → 2 voters → stop 1 node → 1/2 voters (no quorum) → expected Raft timeout

With odd voter count enforcement:
- 2 nodes → 1 voter → stop 1 node → 0/1 voters → Cassandra availability error

This change updates the test to use 3 nodes instead of 2, ensuring proper
no-quorum scenarios:
- 3 nodes → 3 voters → stop 2 nodes → 1/3 voters (no quorum) → Raft timeout

The test now correctly validates LWT timeout behavior while being compatible
with the odd number voter enforcement requirement.
2025-08-08 19:49:10 +02:00
Emil Maskovsky
7fc75aff3e test/raft: adapt test_raft_no_quorum.py for odd voter enforcement
Update the no-quorum cluster tests to work correctly with the new odd
number voter enforcement in the group0 voter calculator. The tests now
properly account for the changed voter counts when validating no-quorum
scenarios.
2025-08-08 19:48:58 +02:00
Anna Stuchlik
f3d9d0c1c7 doc: add new and removed metrics to the 2025.3 upgrade guide
This commit adds the list of new and removed metrics to the already existing upgrade guide
from 2025.2 to 2025.3.

Fixes https://github.com/scylladb/scylladb/issues/24697

Closes scylladb/scylladb#25385
2025-08-08 13:25:51 +02:00
Avi Kivity
ab45a0edb5 Update seastar submodule
* seastar 60b2e7da...1520326e (36):
  > Merge 'http/client: Fix content length body overflow check (and a bit more)' from Pavel Emelyanov
    test/http: Add test for http_content_length_data_sink
    test/http: Implement some missing methods for memory data sink
    http/client: Fix content length body overflow check
    http/client: Fix misprint in overflow exception message
  > dns: Use TCP connection data_sink directly
  > iostream: Update "used stream" check for output_stream::detach()
  > Update dpdk submodule
  > rpc: server::process: coroutinize
  > iostream: Remove deprecated constructor
  > Merge 'foreign_ptr: add unwrap_on_owner_shard method' from Benny Halevy
    foreign_ptr: add unwrap_on_owner_shard method
    foreign_ptr: release: check_shard with SEASTAR_DEBUG_SHARED_PTR
  > enum: Replace static_assert() with concept
  > rpc: reindent connection::negotiate()
  > rpc: client: use structured binding
  > rpc.cc: reindent
  > queue: Remove duplicating static assertion
  > Merge 'rpc: client: convert main loop to a coroutine' from Avi Kivity
    rpc: client::loop(): restore indentation
    rpc: client: coroutinize client::loop()
    rpc: client: split main loop function
  > Merge 'treewide: replace remaining std::enable_if with constraints' from Avi Kivity
    optimized_optional: replace std::enable_if with constraint
    log: replace std::enable_if with constraint
    rpc: replace std::enable_if with constraint
    when_all: replace std::enable_if with constraints
    transfer: replace std::enable_if with constraints
    sstring: replace std::enable_if with constraint
    simple-stream: replace std::enable_if with constraints
    shared_ptr: replace std::enable_if with constraints
    sharded: replace std::enable_if with constraints for sharded_has_stop
    sharded: replace std::enable_if with constraints for peering_sharded_service
    scollectd: replace std::enable_if with constraints for type inference
    scollectd: replace std::enable_if with constraints for ser/deser
    metrics: replace std::enable_if with constraints
    chunked_fifo: replace std::enable_if with constraint
    future: replace std::enable_if with constraints
  > websocket: Avoid sending scattered_message to output_stream
  > websocket: Remove unused scattered_message.hh inclusion
  > aio: Squash aio_nowait_supported into fs_info::nowait_works
  > Merge 'reactor: coroutinize spawn()' from Avi Kivity
    reactor: restore indentation for spawn()
    reactor: coroutinize spawn()
  > modules: export coroutine facilities
  > Merge 'reactor: coroutinize some file-related functions' from Avi Kivity
    reactor: adjust indentation
    reactor: coroutinize reactor::make_pipe()
    reactor: coroutinize reactor::inotify_add_watch()
    reactor: coroutinize reactor::read_directory()
    reactor: coroutinize reactor::file_type()
    reactor: coroutinize reactor::chmod()
    reactor: coroutinize reactor::link_file()
    reactor: coroutinize reactor::rename_file()
    reactor: coroutinize open_file_dma()
  > memory: inline disable_abort_on_alloc_failure_temporarily
  > Merge 'addr2line timing and optimizations' from Travis Downs
    addr2line: add basic timing support
    addr2line: do a quick check for 0x in the line
    addr2line: don't load entire file
    addr2line: typing fixing
  > posix: Replace static_assert with concept
  > tls: Push iovec with the help of put(vector<temporary_buffer>)
  > io_queue: Narrow down friendship with reactor
  > util: drop concepts.hh
  > reactor: Re-use posix::to_timespec() helper
  > Fix incorrect defaults for io queue iops/bandwidth
  > net: functions describing ssl connection
  > Add label values to the duplicate metrics exception
  > Merge 'Nested scheduling groups (CPU only)' from Pavel Emelyanov
    test: Add unit test for cross-sched-groups wakeups
    test: Add unit test for fair CPU scheduling
    test: Add unit test for basic supergrops manipulations
    test: Add perf test for context switch latency
    scheduling: Add an internal method to get group's supergroup
    reactor: Add supergroup get_shares() API
    reactor: Add supergroup::set_shares() API
    reactor: Create scheduling groups in supergroups
    reactor: Supergroups destroying API
    reactor: Supergroups creating API
    reactor: Pass parent pointer to task_queue from caller
    reactor: Wakeup queue group on child activation
    reactor: Add pure virtual sched_entity::run_tasks() method
    reactor: Make task_queue_group be sched_entity too
    reactor: Split task_queue_group::run_some_tasks()
    reactor: Count and limit supergroup children
    reactor: Link sched entity to its parent
    reactor: Switch activate(task_queue*) to work on sched_entity
    reactor: Move set_shares() to sched_entity()
    reactor: Make account_runtime() work with sched_entity
    reactor: Make insert_activating_task_queue() work on sched_entity
    reactor: Make pop_active_task_queue() work on sched_entity
    reactor: Make insert_active_task_queue() work on sched_entity
    reactor: Move timings to sched_entity
    reactor: Move active bit to sched_entity
    reactor: Move shares to sched_entity
    reactor: Move vruntime to sched_entity
    reactor: Introduce sched_entity
    reactor: Rename _activating_task_queues -> _activating
    reactor: Remove local atq* variable
    reactor: Rename _active_task_queues -> _active
    reactor: Move account_runtime() to task_queue_group
    reactor: Move vruntime update from task_queue into _group
    reactor: Simplify task_queue_group::run_some_tasks()
    reactor: Move run_some_tasks() into task_queue_group
    reactor: Move insert_activating_task_queues() into task_queue_group
    reactor: Move pop_active_task_queue() into task_queue_group
    reactor: Move insert_active_task_queue() into task_queue_group
    reactor: Introduce and use task_queue_group::activate(task_queue)
    reactor: Introduce task_queue_group::active()
    reactor: Wrap scheduling fields into task_queue_group
    reactor: Simplify task_queue::activate()
    reactor: Rename task_queue::activate() -> wakeup()
    reactor: Make activate() method of class task_queue
    reactor: Make task_queue::run_tasks() return bool
    reactor: Simplify task_queue::run_tasks()
    reactor: Make run_tasks() method of class task_queue
  > Fix hang in io_queue for big write ioproperties numbers
  > split random io buffer size in 2 options
  > reactor: document run_in_background
  > Merge 'Add io_queue unit test for checking request rates' from Robert Bindar
    Add unit test for validating computed params in io_queue
    Move `disk_params` and `disk_config_params` to their own unit
    Add an overload for `disk_config_params::generate_config`

Closes scylladb/scylladb#25404
2025-08-08 12:24:39 +03:00
Benny Halevy
49e3b2827f streaming: stream_blob: use the table sstable_generation_generator
No need to start a local generator.
Can just use the table's sstable generation generator
to make new sstables now that it's stateless and doesn't
depend on the highest generation found.

Note that tablet_stream_files_handler used uuid generations
unconditionally from inception
(4018dc7f0d).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
de8a199f79 replica: distributed_loader: process_upload_dir: use the table sstable_generation_generator
No need to start a local sharded generator.
Can just use the table's sstable generation generator
to make new sstables now that it's stateless and doesn't
depend on the highest generation found (including the uploaded
sstables).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
13f4e27cb9 sstables: sstable_generation_generator: stop tracking highest generation
It is unused by now.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
0a20834d2a replica: table: get rid of update_sstables_known_generation
It is not needed anymore.
With that database::_sstable_generation_generator can
be a regular member rather than optional and initialized
later.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
42cb25c470 sstables: sstable_directory: stop tracking highest_generation
It is not needed anymore as we always generate
uuid generations.

Convert sstable_directory_test_table_simple_empty_directory_scan
to use the newly added empty() method instead of
checking the highest generation seen.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
b01524c5a3 replica: distributed_loader: stop tracking highest_generation
It is not needed anymore as we always generate
uuid generations.

Move highest_generation_seen(sharded<sstables::sstable_directory>& directory)
to sstables/sstable_directory module.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
6cc964ef16 sstables: sstable_generation: get rid of uuid_identifiers bool class
Now that all call sites enable uuid_identifiers.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
43ee9c0593 sstables_manager: drop uuid_sstable_identifiers
It now always returns the constant sstables::uuid_identifiers::yes,
so let the callers just use the constant (to be dropped
in a following patch).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:21 +03:00
Benny Halevy
0ad1898f0a feature_service: move UUID_SSTABLE_IDENTIFIERS to supported_feature_set
The feature is supported by all live versions since
version 5.4 / 2024.1.

(Although up to 6da758d74c
it could be disabled using the config option)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-08 11:46:15 +03:00
Botond Dénes
70aa81990b Merge 'Alternator - add the ability to write, not just read, system tables' from Nadav Har'El
In commit 44a1daf we added the ability to read Scylla system tables with Alternator. This feature is useful, among other things, in tests that want to read Scylla's configuration through the system table system.config. But tests often want to modify system.config, e.g., to temporarily reduce some threshold to make tests shorter. Until now, this was not possible.

This series adds support for writing to system tables through Alternator, along with examples of tests using this capability (and utility functions to make it easy).

Because the ability to write to system tables may have non-obvious security consequences, it is turned off by default and needs to be enabled with a new configuration option "alternator_allow_system_table_write".

No backports are necessary - this feature is only intended for tests. We may later decide to backport if we want to backport new tests, but I think the probability we'll want to do this is low.

Fixes #12348

Closes scylladb/scylladb#19147

* github.com:scylladb/scylladb:
  test/alternator: utility functions for changing configuration
  alternator: add optional support for writing to system table
  test/alternator: reduce duplicated code
2025-08-08 09:13:15 +03:00
Raphael S. Carvalho
beaaf00fac test: Add test that compaction doesn't cross logical group boundary
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:58:01 +03:00
Raphael S. Carvalho
d351b0726b replica: Introduce views in compaction_group for incremental repair
Wired the unrepaired, repairing and repaired views into compaction_group.

The repaired filter was also wired up, so tablet_storage_group_manager
can implement the procedure to classify the sstable.

Based on this classifier, we can decide which view a sstable belongs
to, at any given point in time.

Additionally, we made changes to compaction_group_view
to return only sstables that belong to the underlying view.

From this point on, repaired, repairing and unrepaired sets are
connected to compaction manager through their views. And that
guarantees sstables on different groups cannot be compacted
together.
The repairing view specifically has compaction disabled altogether;
we can revert this later if we want to allow repairing sstables
to be compacted with one another.

The benefit of this logical approach is having the classifier
as the single source of truth. Otherwise, we'd need to keep the
sstable location consistent with global metadata, creating
complexity.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:58:00 +03:00
Raphael S. Carvalho
61cb02f580 compaction: Allow view to be added with compaction disabled
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:58:00 +03:00
Raphael S. Carvalho
9d3755f276 replica: Futurize retrieval of sstable sets in compaction_group_view
This will allow upcoming work to gently produce a sstable set for
each compaction group view. Example: repaired and unrepaired.

Locking strategy for compaction's sstable selection:
Since sstable retrieval path became futurized, tasks in compaction
manager will now hold the write lock (compaction_state::lock)
when retrieving the sstable list, feeding them into compaction
strategy, and finally registering selected sstables as compacting.
The last step prevents another concurrent task from picking the
same sstable. Previously, all those steps were atomic, but
we have seen stalls in that area in large installations, so
futurization of that area would have come sooner or later.
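The locking strategy above can be sketched as follows (all names are invented here; asyncio stands in for Seastar futures, and the "strategy" is just a callable):

```python
import asyncio

# Sketch: the write lock is held across the await point, so no concurrent
# task can pick the same sstables before they are registered as compacting.
class CompactionState:
    def __init__(self):
        self.lock = asyncio.Lock()
        self.compacting = set()

async def select_for_compaction(state, get_sstables, strategy):
    async with state.lock:
        candidates = await get_sstables()                  # retrieval is now futurized
        selected = strategy(candidates, state.compacting)  # feed into the strategy
        state.compacting |= set(selected)                  # register as compacting
        return selected

async def demo():
    state = CompactionState()
    async def sstables():
        return ["sst1", "sst2", "sst3"]
    pick_two = lambda cands, busy: [s for s in cands if s not in busy][:2]
    first = await select_for_compaction(state, sstables, pick_two)
    second = await select_for_compaction(state, sstables, pick_two)
    return first, second

first, second = asyncio.run(demo())
```

Because the registration happens before the lock is released, the second task only sees what the first left behind.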

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:58:00 +03:00
Raphael S. Carvalho
20c3301a1a treewide: Futurize estimation of pending compaction tasks
This is to allow futurization of compaction_group_view method that
retrieves sstable set.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:51:29 +03:00
Raphael S. Carvalho
af3592c658 replica: Allow compaction_group to have more than one view
In order to support incremental repair, we'll allow each
replica::compaction_group to have two logical compaction groups
(or logical sstable sets), one for repaired, another for unrepaired.

That means we have to adapt a few places to work with
compaction_group_view instead, such that no logical compaction
group is missed when doing table or tablet wide operations.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:51:29 +03:00
Raphael S. Carvalho
e78295bff1 Move backlog tracker to replica::compaction_group
Since there will be only one physical sstable set, it makes sense to move
backlog tracker to replica::compaction_group. With incremental repair,
it still makes sense to compute the backlog accounting for both logical
sets, since the compound backlog influences the overall read
amplification, and the total backlog across repaired and unrepaired sets
can help drive decisions like giving up on incremental repair when the
unrepaired set is almost as large as the repaired set, causing an
amplification of 2.

It's also needed for correctness because a sstable can move quickly
across the logical sets, and having one tracker for each logical
set could cause the sstable to not be erased in the old set it
belonged to.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:51:29 +03:00
Raphael S. Carvalho
2c4a9ba70c treewide: Rename table_state to compaction_group_view
Since table_state is a view to a compaction group, it makes sense
to rename it as so.

With upcoming incremental repair, each replica::compaction_group
will be actually two compaction groups, so there will be two
views for each replica::compaction_group.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2025-08-08 06:51:28 +03:00
Asias He
acc367c522 tests: adjust for incremental repair
The separation of sstables into the logical repaired and unrepaired
virtual sets requires some adjustments for certain tests, in particular
for those that look at the number of compaction tasks or the number of sstables.
The following tests need adjustment:
* test/cluster/tasks/test_tablet_tasks.py
* test/boost/memtable_test.cc

The adjustments are done in such a way that they accommodate both the
case where there are separate repaired/unrepaired states and the case
where there aren't.
2025-08-08 06:49:17 +03:00
Andrei Chekun
5c095558b1 test.py: add timeout option for the whole run
Add the possibility to limit the execution time of a single test in pytest.
Add --session-timeout to limit the execution time of the whole test.py
and/or pytest session.

Closes scylladb/scylladb#25185
2025-08-07 21:06:14 +03:00
Avi Kivity
2b8f5d128a Merge 'GCP Key Provider: Fix authentication issues' from Nikos Dragazis
* Fix discovery of application default credentials by using fully expanded pathnames (no tildes).
* Fix grant type in token request with user credentials.

Fixes #25345.

Closes scylladb/scylladb#25351

* github.com:scylladb/scylladb:
  encryption: gcp: Fix the grant type for user credentials
  encryption: gcp: Expand tilde in pathnames for credentials file
2025-08-07 20:50:12 +03:00
Dani Tweig
0ade762654 Adding action call to update Jira issue status
Add actions that will change the relevant Jira issue status based on the linked PR changes.

Closes scylladb/scylladb#25397
2025-08-07 15:55:58 +03:00
Benny Halevy
3f44dba014 sstables: make_entry_descriptor: make regex non-greedy
With greedy matching, an sstable path in a snapshot
directory with a tag that resembles a name-<uuid>
would match the dir regular expression as the longest match,
while a non-greedy regular expression would correctly match
the real keyspace and table as the shortest match.

Also, add a regression unit test reproducing the issue and
validating the fix.
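The matching difference can be demonstrated with Python's re module on a made-up path whose snapshot tag itself resembles name-<uuid> (the real code is C++; this is only an illustration of greedy versus non-greedy behavior):

```python
import re

# A hypothetical, simplified path: the table directory carries a real uuid,
# and the snapshot tag "backup-ffff..." also looks like "name-<uuid>".
table_uuid = "0f9c4a1e2b3d4c5d8e7f6a5b4c3d2e1f"
path = f"data/ks/tbl-{table_uuid}/snapshots/backup-{'f' * 32}"

# Greedy ".+" consumes as much as possible, so it locks onto the
# snapshot tag's uuid; non-greedy ".+?" stops at the shortest match,
# which is the real table directory.
greedy = re.search(r"(?P<dir>.+)-(?P<uuid>[0-9a-f]{32})", path)
lazy = re.search(r"(?P<dir>.+?)-(?P<uuid>[0-9a-f]{32})", path)

print(greedy.group("dir"))  # ends with ".../snapshots/backup" (wrong)
print(lazy.group("dir"))    # "data/ks/tbl" (the real table directory)
```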

Fixes #25242

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#25323
2025-08-07 15:35:11 +03:00
Avi Kivity
8164f72f6e Merge 'Separate local_effective_replication_map from vnode_effective_replication_map' from Benny Halevy
Derive both vnode_effective_replication_map
and local_effective_replication_map from
static_effective_replication_map as both are static and per-keyspace.

However, local_effective_replication_map does not need vnodes
for the mapping of all tokens to the local node.

Refs #22733

* No backport required

Closes scylladb/scylladb#25222

* github.com:scylladb/scylladb:
  locator: abstract_replication_strategy: implement local_replication_strategy
  locator: vnode_effective_replication_map: convert clone_data_gently to clone_gently
  locator: abstract_replication_map: rename make_effective_replication_map
  locator: abstract_replication_map: rename calculate_effective_replication_map
  replica: database: keyspace: rename {create,update}_effective_replication_map
  locator: effective_replication_map_factory: rename create_effective_replication_map
  locator: abstract_replication_strategy: rename vnode_effective_replication_map_ptr et. al
  locator: abstract_replication_strategy: rename global_vnode_effective_replication_map
  keyspace: rename get_vnode_effective_replication_map
  dht: range_streamer: use naked e_r_m pointers
  storage_service: use naked e_r_m pointers
  alternator: ttl: use naked e_r_m pointers
  locator: abstract_replication_strategy: define is_local
2025-08-07 12:51:43 +03:00
Nadav Har'El
6f415b2f10 Merge 'test/cqlpy: Adjust test_describe.py to work against Cassandra' from Dawid Mędrek
We adjust most of the tests in `cqlpy/test_describe.py`
so that they work against both Scylla and Cassandra.
This PR doesn't cover all of them, just those I authored.

Refs scylladb/scylladb#11690

Backport: not needed. This is effectively a code cleanup.

Closes scylladb/scylladb#25060

* github.com:scylladb/scylladb:
  test/cqlpy/test_describe.py: Adjust test_create_role_with_hashed_password_authorization to work with Cassandra
  test/cqlpy/test_describe.py: Adjust test_desc_restore to work with Cassandra
  test/cqlpy/test_describe.py: Mark Scylla-only tests as such
2025-08-07 12:43:04 +03:00
Avi Kivity
90eb6e6241 Merge 'sstables/trie: implement BTI node format serialization and traversal' from Michał Chojnowski
This is the next part in the BTI index project.

Overarching issue: https://github.com/scylladb/scylladb/issues/19191
Previous part: https://github.com/scylladb/scylladb/pull/25154
Next part: implementing a trie cursor (the "set to key, step forwards, step backwards" thing) on top of the `node_reader` added here.

The new code added here is not used for anything yet, but it's posted as a separate PR
to keep things reviewably small.

This part implements the BTI trie node encoding, as described in https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/io/sstable/format/bti/BtiFormat.md#trie-nodes.
It contains the logic for encoding the abstract in-memory `writer_node`s (added in the previous PR)
into the on-disk format, and the logic for traversing the on-disk nodes during a read.

New functionality, no backporting needed.

Closes scylladb/scylladb#25317

* github.com:scylladb/scylladb:
  sstables/trie: add tests for BTI node serialization and traversal
  sstables/trie: implement BTI node traversal
  sstables/trie: implement BTI serialization
  utils/cached_file: add get_shared_page()
  utils/cached_file: replace a std::pair with a named struct
2025-08-07 12:15:42 +03:00
Benny Halevy
02b922ac40 test: cql_query_test: add test_sstable_load_mixed_generation_type
Test that we can load sstables with mixed, numerical and uuid
generation types, and verify the expected data.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-07 12:04:23 +03:00
Benny Halevy
9b65856a26 test: sstable_datafile_test: move copy_directory helper to test/lib/test_utils
It's a generic helper that can be used by all tests.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-07 12:04:23 +03:00
Benny Halevy
7c9ce235d7 test: database_test: move table_dir helper to test/lib/test_utils
It's a generic helper that can be used by all tests.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-07 12:04:23 +03:00
Nadav Har'El
d632599a92 Merge 'test.py: native pytest repeats' from Andrei Chekun
Previously, repeats were executed by launching pytest once per repeat.
That was resource-consuming, since pytest performed test discovery each
time. Now all repeats are done inside one pytest process.

Backport for 2025.3 is needed, since this functionality is framework only, and 2025.3 is affected by these slow repeats as well.

Closes scylladb/scylladb#25073

* github.com:scylladb/scylladb:
  test.py: add repeats in pytest
  test.py: add directories and filename to the log files
  test.py: rename log sink file for boost tests
  test.py: better error handling in boost facade
2025-08-06 18:18:03 +03:00
Dawid Pawlik
b284961a95 scripts: fetch the name of the author of the PR
The `pull_github_pr.sh` script has been fetching the username
from the owner of the source branch.
The owner of the branch is not always the author of the PR.
For example the branch might come from a fork managed by organization
or group of people.
This led to the author in merge commits being referred to as `null`
(if the name was not set for the group), or to a name being mentioned
that did not belong to the author of the patch.

Instead of looking for the owner of the source branch, the script should
look for the name of the PR's author.

Closes scylladb/scylladb#25363
2025-08-06 16:45:38 +03:00
Benny Halevy
5e5e63af10 scylla-sstable: print_query_results_json: continue loop if row is disengaged
Otherwise it is accessed right when exiting the if block.
Add a unit test reproducing the issue and validating the fix.

Fixes #25325

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#25326
2025-08-06 16:44:51 +03:00
Szymon Malewski
eb11485969 test/alternator: enable more relevant logs in CI.
This patch sets, for the alternator test suite, all 'alternator-*' loggers and the 'paxos' logger to trace level. This should significantly ease debugging of failed tests, while having no effect on test time and increasing log size by only 7%.
This affects running alternator tests only with `test.py`, not with `test/alternator/run`.

Closes #24645

Closes scylladb/scylladb#25327
2025-08-06 16:37:25 +03:00
Nikos Dragazis
ee92fcc078 encryption_at_rest_test: Preserve tmpdir from failing KMIP tests
The KMIP tests start a local PyKMIP server and configure it to write
logs in the test's temporary directory (`tmpdir`). However, the tmpdir
is a RAII object that deletes the directory once it goes out of scope,
causing PyKMIP server logs to be lost on test failures.

To assist with debugging, preserve the whole directory if the test
failed with an exception. Allow the user to disable this by setting the
SCYLLA_TEST_PRESERVE_TMP_ON_EXCEPTION environment variable.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-08-06 16:29:19 +03:00
Benny Halevy
6dbbb80aae locator: abstract_replication_strategy: implement local_replication_strategy
Derive both vnode_effective_replication_map
and local_effective_replication_map from
static_effective_replication_map as both are static and per-keyspace.

However, local_effective_replication_map does not need vnodes
for the mapping of all tokens to the local node.

Note that everywhere_replication_strategy is not abstracted in a similar
way, although it could be, since the plan is to get rid of it
once all system keyspaces are converted to local or tablets replication
(and propagated everywhere if needed using raft group0).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:05:11 +03:00
Benny Halevy
8bde507232 locator: vnode_effective_replication_map: convert clone_data_gently to clone_gently
create_effective_replication_map need not know about the internals of
vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
8d4ac97435 locator: abstract_replication_map: rename make_effective_replication_map
to make_vnode_effective_replication_map_ptr since
it is specific to vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
babb4a41a8 locator: abstract_replication_map: rename calculate_effective_replication_map
to calculate_vnode_effective_replication_map since
it is specific to vnode-based range calculations.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
34b223f6f9 replica: database: keyspace: rename {create,update}_effective_replication_map
to *_static_effective_replication_map, in preparation
for separating local_effective_replication_map from
vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
688bd4fd43 locator: effective_replication_map_factory: rename create_effective_replication_map
to create_static_effective_replication_map, in preparation
for separating local_effective_replication_map from
vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
cbad497859 locator: abstract_replication_strategy: rename vnode_effective_replication_map_ptr et. al
to static_effective_replication_map_ptr, in preparation
for separating local_effective_replication_map from
vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:53 +03:00
Benny Halevy
2ab44e871b locator: abstract_replication_strategy: rename global_vnode_effective_replication_map
to global_static_effective_replication_map, in preparation
for separating local_effective_replication_map from
vnode_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 16:03:49 +03:00
Benny Halevy
bd62421c05 keyspace: rename get_vnode_effective_replication_map
to get_static_effective_replication_map, in preparation
for separating local_effective_replication_map from
vnode_effective_replication_map (both are per-keyspace).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 13:40:43 +03:00
Benny Halevy
33f34c8c32 dht: range_streamer: use naked e_r_m pointers
Prepare for following patch that will separate
the local effective replication map from
vnode_effective_replication_map.

The caller is responsible to keep the
effective_replication_map_ptr alive while
in use by low-level async functions.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 13:34:23 +03:00
Benny Halevy
d6d434b1c2 storage_service: use naked e_r_m pointers
Prepare for following patch that will separate
the local effective replication map from
vnode_effective_replication_map.

The caller is responsible to keep the
effective_replication_map_ptr alive while
in use by low-level async functions.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 13:34:23 +03:00
Benny Halevy
59375e4751 alternator: ttl: use naked e_r_m pointers
Prepare for following patch that will separate
the local effective replication map from
vnode_effective_replication_map.

The caller is responsible to keep the
effective_replication_map_ptr alive while
in use by low-level async functions.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 13:34:23 +03:00
Benny Halevy
ec85678de1 locator: abstract_replication_strategy: define is_local
Prepare for specializing the local replication strategy,
local effective replication map, et al. by defining
an is_local() predicate, similar to uses_tablets().

Note that is_vnode_based() still applies to local replication
strategy.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-08-06 13:34:23 +03:00
Nikos Dragazis
1eb99fb5f5 test/lib: Add option to preserve tmpdir on exception
Extend the tmpdir class with an option to preserve the directory if the
destructor is called during stack unwinding (i.e., uncaught exception).
To be used in tests where the tmpdir contains non-temporary resources
that may help in diagnosing test failures (e.g., logs from external
services such as PyKMIP).
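A rough Python analogue of this behavior (the real class is C++ and presumably detects unwinding via std::uncaught_exceptions(); the class name here is invented):

```python
import os
import shutil
import tempfile

class PreservingTmpdir:
    """Remove the directory on clean exit, keep it when leaving
    the block due to an exception, so logs survive for debugging."""

    def __enter__(self):
        self.path = tempfile.mkdtemp()
        return self.path

    def __exit__(self, exc_type, exc, tb):
        if exc_type is None:
            shutil.rmtree(self.path)  # normal RAII-style cleanup
        # on exception: leave self.path in place for inspection
        return False                  # never swallow the exception
```

The directory path would typically be logged on the preserving path so the user knows where to look.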

This will be used in the next patch.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-08-06 13:07:52 +03:00
Pavel Emelyanov
0616407be5 Merge 'rest_api: add endpoint which drops all quarantined sstables' from Taras Veretilnyk
Added a new POST endpoint `/storage_service/drop_quarantined_sstables` to the REST API.
This endpoint allows dropping all quarantined SSTables either globally or
for a specific keyspace and tables.
Optional query parameters `keyspace` and `tables` (comma-separated table names) can be
provided to limit the scope of the operation.

Fixes scylladb/scylladb#19061

Backport is not required, it is new functionality

Closes scylladb/scylladb#25063

* github.com:scylladb/scylladb:
  docs: Add documentation for the nodetool dropquarantinedsstables command
  nodetool: add command for dropping quarantine sstables
  rest_api: add endpoint which drops all quarantined sstables
2025-08-06 11:55:15 +03:00
Nadav Har'El
10588958e0 test/alternator: add regression test for keep-alive support
An Alternator user complained about suspiciously many new connections being
opened, which raised a suspicion that maybe Alternator doesn't support
HTTP and HTTPS keep-alive (allowing a client to reuse the same connection
for multiple requests). It turns out that we never had a regression test
that this feature actually works (and doesn't break), so this patch adds
one.

The test confirms that Alternator's connection reuse (keep-alive) feature
actually works correctly. Of course, only if the driver really tries to
reuse a connection - which is a separate question and needs testing on
the driver side (scylladb/alternator-load-balancing#82).

The test sends two requests using Python's "requests" library which can
normally reuse connections (it uses a "connection pool"), and checks if the
connection was really reused. Unfortunately "requests" doesn't give us
direct knowledge of whether or not it reused a connection, so we check
this using simple monkey-patching. I actually tried multiple other
approaches before settling on this one. The approach needs to work
on both HTTP and HTTPS, and also on AWS DynamoDB.

Importantly, the test checks both keep-alive and non-keep-alive cases.
This is very important for validating the test itself and its tricky
monkey-patching code: The test is meant to detect when the socket is not
reused for the second request, so we want to also check the non-keep-
alive case where we know the socket isn't reused, to see the test code
really detected this situation.
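The both-cases structure the commit describes can be sketched with the standard library alone; the real test monkey-patches the "requests" pool, while this hypothetical stand-in counts accepted TCP connections on the server side:

```python
import http.client
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

accepted = {"count": 0}  # number of TCP connections the server accepted

class Handler(BaseHTTPRequestHandler):
    protocol_version = "HTTP/1.1"   # HTTP/1.1 keeps connections alive by default

    def setup(self):
        accepted["count"] += 1      # runs once per TCP connection
        super().setup()

    def do_GET(self):
        body = b"ok"
        self.send_response(200)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *args):
        pass                        # keep output quiet

server = ThreadingHTTPServer(("127.0.0.1", 0), Handler)
threading.Thread(target=server.serve_forever, daemon=True).start()
port = server.server_address[1]

# Keep-alive case: two requests over one connection -> one accepted socket.
conn = http.client.HTTPConnection("127.0.0.1", port)
for _ in range(2):
    conn.request("GET", "/")
    conn.getresponse().read()
conn.close()
reused_connections = accepted["count"]

# Control case: a fresh connection per request -> two more accepted sockets.
for _ in range(2):
    c = http.client.HTTPConnection("127.0.0.1", port)
    c.request("GET", "/")
    c.getresponse().read()
    c.close()
fresh_connections = accepted["count"] - reused_connections

server.shutdown()
```

The control case matters for the same reason it matters in the real test: it proves the detection actually fires when connections are not reused.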

By default, this test runs (like all of Alternator's test suite) on HTTP
sockets. Running this test with "test/alternator/run --https" will run
it on HTTPS sockets. The test currently passes on both HTTP and HTTPS.
It also passes on AWS DynamoDB ("test/alternator/run --aws")

Fixes #23067

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25202
2025-08-06 11:41:21 +03:00
Avi Kivity
630b3d31bb storage_proxy: reduce allocations in send_to_live_endpoints()
send_to_live_endpoints() computes sets of endpoints to
which we send mutations - remote endpoints (where we send
to each set as a whole, using forwarding), and local endpoints,
where we send directly. To make handling regular, each local
endpoint is treated as its own set. Thus, each local endpoint
and each datacenter receive one RPC call (or local call if the
coordinator is also a replica).

These sets are maintained in a std::unordered_map (for remote endpoints)
and a vector with the same value_type as the map (for local endpoints).
The key part of the vector payload is initialized to the empty string.

We simplify this by noting that the datacenter name is never used
after this computation, so the vector can hold just the replica sets,
without the fake datacenter name. The downstream variable `all` is
adjusted to point just to the replica set as well.

As a reward for our efforts, the vector's contents become nothrow
move constructible (no string), and we can convert it to a small_vector,
which reduces allocations in the common case of RF<=3.

The reduction in allocations is visible in perf-simple-query --write
results:

```
before 165080.62 tps ( 60.3 allocs/op,  16.0 logallocs/op,  14.2 tasks/op,   53438 insns/op,   26705 cycles/op,        0 errors)

after  164513.83 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.2 tasks/op,   53347 insns/op,   26761 cycles/op,        0 errors)
```

The instruction count reduction is a not very impressive 70/op:

before
```
instructions_per_op:
	mean=   53412.22 standard-deviation=32.12
	median= 53420.53 median-absolute-deviation=20.32
	maximum=53462.23 minimum=53290.06
```

after
```
instructions_per_op:
	mean=   53350.32 standard-deviation=32.38
	median= 53353.71 median-absolute-deviation=13.60
	maximum=53415.20 minimum=53222.24
```

Perhaps the extra code from small_vector defeated some inlining,
which negated some of the gain from the reduced allocations. Perhaps
a build with full profiling will gain it back (my builds were without
pgo).

Closes scylladb/scylladb#25270
2025-08-06 11:28:20 +03:00
Karol Nowacki
032e8f9030 test/boost/vector_store_client_test.cc: Fix flaky tests
The vector_store_client_test was observed to be flaky, sometimes hanging while waiting for a response from the HTTP server.

Problem:
The default load balancing algorithm (in Seastar's posix_server_socket_impl::accept) could route an incoming connection to a different shard than the one executing the test.
Because the HTTP server is a non-sharded service running only on the test's originating shard, any connection submitted to another shard would never be handled, causing the test client to hang waiting for a response.

Solution:
The patch resolves the issue by explicitly setting the fixed-CPU load balancing algorithm.
This ensures that incoming connections are always handled on the same shard where the HTTP server is running.

Closes scylladb/scylladb#25314
2025-08-06 11:24:51 +03:00
Taras Veretilnyk
bcb90c42e4 docs: Sort commands list in nodetool.rst
Fixes scylladb/scylladb#25330

Closes scylladb/scylladb#25331
2025-08-06 11:20:53 +03:00
Nikos Dragazis
b1d5a67018 encryption: gcp: Fix the grant type for user credentials
Exchanging a refresh token for an access token requires the
"refresh_token" grant type [1].

[1] https://datatracker.ietf.org/doc/html/rfc6749#section-6
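For illustration, the form body of such a token-refresh request (per RFC 6749 section 6) looks like this; every value below is a placeholder, not a real credential or endpoint:

```python
from urllib.parse import urlencode

# Build the x-www-form-urlencoded body for exchanging a refresh token
# for an access token. The fix is the first field: the grant type must
# be "refresh_token", not the one used for the initial authorization.
token_request_body = urlencode({
    "grant_type": "refresh_token",
    "refresh_token": "example-refresh-token",
    "client_id": "example-client-id",
    "client_secret": "example-client-secret",
})
print(token_request_body)
```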

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-08-06 10:39:17 +03:00
Nadav Har'El
fa86405b1f test/alternator: utility functions for changing configuration
Now that the previous patch made it possible to write to system tables
in Alternator tests, this patch introduces utility functions for changing
the configuration - scylla_config_write() in addition to the
scylla_config_read() we already had, and scylla_config_temporary() to
temporarily change a configurable parameter and then restore it to its
old value.
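A hypothetical sketch of what such a temporary-change helper does, with a plain dict standing in for the system.config table (the real scylla_config_temporary() reads and writes through Alternator):

```python
from contextlib import contextmanager

@contextmanager
def config_temporary(config, key, value):
    # Save the old value, write the new one, and restore on exit,
    # even if the body raises.
    saved = config[key]
    config[key] = value
    try:
        yield
    finally:
        config[key] = saved

config = {"query_tombstone_page_limit": 1000}
with config_temporary(config, "query_tombstone_page_limit", 10):
    lowered = config["query_tombstone_page_limit"]   # 10 inside the block
restored = config["query_tombstone_page_limit"]      # back to 1000 after
```

Restoring in a `finally` clause mirrors what a test helper needs: the configuration must come back even when the test fails.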

This patch adds a silly test that temporarily modifies the
query_tombstone_page_limit configuration parameter. Later we can
add more tests that use the new test functions for more "serious"
testing of real features. In particular, we don't have an Alternator
test for the max_concurrent_requests_per_shard configuration - and
I want to write one.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-06 10:02:24 +03:00
Nadav Har'El
a896e2dbb9 alternator: add optional support for writing to system table
In commit 44a1daf we added the ability to read system tables through
the DynamoDB API (actually, the Scan and Query requests only).
This ability is useful for tests, and can also be useful to users who
want to read information that is only available through system tables.

This patch adds support also for *writing* into system tables. This will
be useful for Alternator tests, where we want to temporarily change
some live-updatable configuration option - and so far haven't been
able to do that like we did in some cql-pytest tests.

For reasons explained in issue #23218, only superuser roles are allowed to
write to system tables - it is not enough for the role to be granted
MODIFY permissions on the system table or on ALL KEYSPACES. Moreover,
the ability to modify system tables carries special risks, so this
patch only allows writes to the system tables if a new configuration
option "alternator_allow_system_table_write" is turned on. This option is
turned off by default.

This patch also includes a test for this new configuration-writing
capability. The test scripts test/alternator/run and test.py now
run Scylla with alternator_allow_system_table_write turned on, but
the new test can also run without this option, and will be skipped
in that case (to allow running the test suite against some manually-
run instance of Scylla).

Fixes: #12348

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-06 10:00:04 +03:00
Nadav Har'El
5913498fff test/alternator: reduce duplicated code
Four tests had almost identical code to read an item from Scylla
configuration (using the system.config system table). It's time
to make this into a new utility function, scylla_config_read().

This is a good time to do it, because in a later patch I want
to also add a similar function to *write* into the configuration.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-06 09:56:47 +03:00
Nadav Har'El
d46dda0840 Merge 'cql, vector_search: implement read path' from null
This pull request is an addition of ANN OF queries.

The patch contains:

- CQL syntax for ORDER BY `vector_column_name` ANN OF `vector_literal` clause of SELECT statements.
- implementation of external ANN queries (using vector-store service)
- tests

Example syntax:

```
SELECT comment
    FROM cycling.comments_vs
    ORDER BY comment_vector ANN OF [0.1, 0.15, 0.3, 0.12, 0.05]
    LIMIT 3;
```
The limit can be between 1 and 1000 - the same as in Cassandra.

Co-authored-by: @janpiotrlakomy @smoczy123
Fixes: VECTOR-48
Fixes: VECTOR-46

Closes scylladb/scylladb#24444

* github.com:scylladb/scylladb:
  cql3/statements: implement external `ANN OF` queries
  vector_store_client: implement ann_error_visitor
  test/cqlpy: check ANN queries disallow filtering properly
  cassandra_tests: translate vector_invalid_query_test
  cassandra_tests: copy vector_invalid_query_test from Cassandra
  vector_index: make parameter names case insensitive
  cql3/statements: add `ANN OF` queries support to select statements
  cql/Cql.g: extend the grammar to allow for `ANN OF` queries
  cql3/raw: add ANN ordering to the raw statement layer
2025-08-06 09:53:38 +03:00
Nikos Dragazis
77cc6a7bad encryption: gcp: Expand tilde in pathnames for credentials file
The GCP host searches for application default credentials in known
locations within the user's home directory using
`seastar::file_exists()`. However, this function does not perform tilde
expansion in pathnames.

Replace tildes with the home directory from the HOME environment
variable.
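A minimal sketch of the described expansion, assuming only a leading tilde needs handling (the real fix is C++; expand_tilde is an invented name):

```python
import os

def expand_tilde(path: str) -> str:
    # Replace a leading "~" with $HOME before probing whether the
    # credentials file exists; other paths pass through untouched.
    if path == "~" or path.startswith("~/"):
        return os.environ.get("HOME", "") + path[1:]
    return path
```

This would be applied to each well-known credentials location, e.g. a path like "~/.config/gcloud/..." (path shown for illustration).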

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-08-06 09:46:08 +03:00
Avi Kivity
bb922b2aa9 Merge 'truncate: change check for write during truncate into a log warning' from Ferenc Szili
TRUNCATE TABLE performs a memtable flush and then discards the sstables of the table being truncated. It collects the highest replay position for both of these. When the highest replay position of the discarded sstables is higher than the highest replay position of the flushed memtable, that means that we have had writes during truncate which have been flushed to disk independently of the truncate process. We check for this and trigger an on_internal_error() which throws an exception, informing the user that writing data concurrently with TRUNCATE TABLE is not advised.

The problem with this is that truncate is also called from DROP KEYSPACE and DROP TABLE. These are raft operations and exceptions thrown by them are caught by the (...) exception handler in the raft applier fiber, which then exits leaving the node without the ability to execute subsequent raft commands.

This commit changes the on_internal_error() into a warning log entry. It also outputs the keyspace/table names and the offending replay positions which caused the check to fail.

This PR also adds a test which validates that TRUNCATE works correctly with concurrent writes. More specifically, it checks that:
- all data written before TRUNCATE starts is deleted
- none of the data after TRUNCATE completes is deleted

Fixes: #25173
Fixes: #25013

Backport is needed in versions which check for truncate with concurrent writes using `on_internal_error()`: 2025.3 2025.2 2025.1

Closes scylladb/scylladb#25174

* github.com:scylladb/scylladb:
  truncate: add test for truncate with concurrent writes
  truncate: change check for write during truncate into a log warning
2025-08-06 00:03:37 +03:00
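The check described above is C++ inside Scylla; as an illustrative sketch of the warn-instead-of-abort behavior (all names hypothetical):

```python
import logging

logger = logging.getLogger("truncate")

def check_writes_during_truncate(ks, table, flushed_rp, discarded_rp):
    """Hypothetical sketch of the check described above (real code is C++).

    If the sstables discarded by TRUNCATE carry a higher replay position
    than the flushed memtable, writes happened concurrently with the
    truncate; previously this tripped on_internal_error(), now it only
    logs a warning so raft operations like DROP TABLE cannot be derailed.
    """
    if discarded_rp > flushed_rp:
        logger.warning(
            "%s.%s: writes during truncate (discarded rp %s > flushed rp %s)",
            ks, table, discarded_rp, flushed_rp)
        return False
    return True
```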
Michał Chojnowski
9930cd59eb sstables/trie: add tests for BTI node serialization and traversal
Adds tests which check that nodes serialized by `bti_node_sink`
are readable by `bti_node_reader` with the right result.

(Note: there are no tests which check compatibility of the encoded nodes
with Cassandra or with handwritten hexdumps. There are only tests
for mutual compatibility between Scylla's writers and readers.
This can be considered a gap in testing.)
2025-08-05 21:48:24 +02:00
Pavel Emelyanov
10056a8c6d Merge 'Simplify credential reload: remove internal expiration checks' from Ernest Zaslavsky
This PR introduces a refinement in how credential renewal is triggered. Previously, the system attempted to renew credentials one hour before their expiration, but the credentials provider did not recognize them as expired—resulting in a no-op renewal that returned existing credentials. This led the timer fiber to immediately retry renewal, causing a renewal storm.

To resolve this, we remove the expiration check (and any other checks) from the `reload` method, assuming that whoever calls it knows what they are doing.

Fixes: https://github.com/scylladb/scylladb/issues/25044

Should be backported to 2025.3 since we need this fix for the restore

Closes scylladb/scylladb#24961

* github.com:scylladb/scylladb:
  s3_creds: code cleanup
  s3_creds: Make `reload` unconditional
  s3_creds: Add test exposing credentials renewal issue
2025-08-05 17:49:13 +03:00
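A minimal sketch of the "unconditional reload" idea described above (illustrative Python with hypothetical names; the real code is C++ in s3_creds):

```python
class CredentialsProvider:
    """Hypothetical sketch: before the change, reload() returned cached
    credentials unless they looked expired, so a caller that knew renewal
    was needed got a no-op and retried immediately (a renewal storm).
    After the change, reload() always fetches fresh credentials and
    trusts the caller's judgement that renewal is needed."""

    def __init__(self, fetch):
        self._fetch = fetch   # callable returning (credentials, expiry)
        self._creds = None
        self._expiry = 0.0

    def reload(self):
        # Unconditional: no expiration check before refreshing.
        self._creds, self._expiry = self._fetch()
        return self._creds
```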
Michael Litvak
faebfdf006 test/cluster/test_tablets_colocation: fix flaky test
When restarting the server in the test, wait for it to become ready
before requesting tablet repair.

Fixes scylladb/scylladb#25261

Closes scylladb/scylladb#25263
2025-08-05 15:36:03 +02:00
Avi Kivity
4c785b31c7 Merge 'List Alternator clients in system.clients virtual table' from Nadav Har'El
Before this series, the "system.clients" virtual table lists active connections (and their various properties, like client address, logged in username and client version) only for CQL requests. This series also adds Alternator clients to system.clients. One of the interesting use cases of this new feature is understanding exactly which SDK a user is using - without inspecting their application code.  Different SDKs pass different "User-Agent" headers in requests, and that User-Agent will be visible in the system.clients entries for Alternator requests as the "driver_name" field.

Unlike CQL where logged in username, driver name, etc. applies to a complete connection, in the Alternator API, different requests can theoretically be signed by different users and carry different headers but still arrive over the same HTTP connection. So instead of listing the currently open Alternator *connections*, we will list the currently active *requests*.

The first three patches introduce utilities that will be useful in the implementation. The fourth patch is the implementation itself (which is quite simple with the utility introduced in the second patch), and the fifth patch a regression test for the new feature. The sixth patch adds documentation, the seventh patch refactors generic_server to use the newly introduced utility class and reduce code duplication, and the eighth patch adds a small check to an existing check of CQL's system.clients.

Fixes #24993

This patch adds a new feature, so doesn't require a backport. Nevertheless, if we want it to get to existing customers more quickly to allow us to better understand their use case by reading the system.clients table, we may want to consider backporting this patch to existing branches. There is some risk involved in this patch, because it adds code that gets run on every Alternator request, so a bug on it can cause problems for every Alternator request.

Closes scylladb/scylladb#25178

* github.com:scylladb/scylladb:
  test/cqlpy: slightly strengthen test for system.clients
  generic_server: use utils::scoped_item_list
  docs/alternator: document the system.clients system table in Alternator
  alternator: add test for Alternator clients in system.clients
  alternator: list active Alternator requests in system.clients
  utils: unit test for utils::scoped_item_list
  utils: add a scoped_item_list utility class
  utils: add "fatal" version of utils::on_internal_error()
2025-08-05 15:55:41 +03:00
Ferenc Szili
33488ba943 truncate: add test for truncate with concurrent writes
test_validate_truncate_with_concurrent_writes checks if truncate deletes
all the data written before the truncate starts, and does not delete any
data after truncate completes.
2025-08-05 13:54:14 +02:00
Jan Łakomy
447c66f4ec cql3/statements: implement external ANN OF queries
Implement execution of `ANN OF` queries using the vector_store service.

Throw invalid_request_exception with specific message using
the ann_error_visitor when ANN request returns no result.

Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
Co-authored-by: Michał Hudobski <michal.hudobski@scylladb.com>
2025-08-05 12:34:48 +02:00
Dawid Pawlik
7a826b79d9 vector_store_client: implement ann_error_visitor
Implement ann_error_visitor managing error messages depending on
ANN error type received.
2025-08-05 12:34:48 +02:00
Dawid Pawlik
74f603fe99 test/cqlpy: check ANN queries disallow filtering properly
Add tests checking if filtering with clustering column
or using index is disallowed while performing ANN query.
2025-08-05 12:34:48 +02:00
Pavel Emelyanov
5fcdf948d9 doc: Update system.clients schema with scheduling_group cell
It was added by 9319d65971 (db/virtual_tables: add scheduling group
column to system.clients) recently.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#25294
2025-08-05 10:16:20 +03:00
Michał Chojnowski
85964094f6 sstables/trie: implement BTI node traversal
This commit implements routines for traversal of BTI nodes in their
on-disk format.
The `node_reader` concept is currently unused (i.e. not asserted by any
template).

It will only be used in the next PR, which will implement trie cursor
routines parametrized by `node_reader`.
But I'm including it in this PR to make it clear which functions
will be needed by the higher layer.
2025-08-05 00:56:50 +02:00
Michał Chojnowski
302adfb50d sstables/trie: implement BTI serialization
This commit introduces code responsible for serializing
trie nodes (`writer_node`) into the on-disk BTI format,
as described in:
f16fb6765b/src/java/org/apache/cassandra/io/sstable/format/bti/BtiFormat.md
2025-08-05 00:56:50 +02:00
Michał Chojnowski
6fe7dbaedc utils/cached_file: add get_shared_page()
BTI index is page-aware. It's designed to be read in page units.

Thus, we want a `cached_file` accessor which explicitly requests
a whole page, preferably without copying it.

`cached_file` already works in terms of reference-counted pages,
underneath. This commit only adds some accessors which let
us request those reference-counted page pointers more directly.
2025-08-05 00:56:50 +02:00
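As a rough Python analogue of the page-granular accessor described above (the real `get_shared_page()` is C++ in `utils/cached_file`; names and structure here are illustrative):

```python
PAGE_SIZE = 4096

class PageCache:
    """Hypothetical sketch of a page-granular cache in the spirit of
    get_shared_page(): callers receive the cached page object itself
    (shared, reference-counted by the runtime), not a copy of its bytes."""

    def __init__(self, read_page):
        self._read_page = read_page   # callable: page_no -> bytes
        self._pages = {}

    def get_shared_page(self, offset):
        page_no = offset // PAGE_SIZE
        page = self._pages.get(page_no)
        if page is None:
            page = self._read_page(page_no)
            self._pages[page_no] = page
        return page   # same object on every hit; no copy made
```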
Michał Chojnowski
58d768e383 utils/cached_file: replace a std::pair with a named struct
Cosmetic change. For clarity.
2025-08-05 00:55:32 +02:00
Artsiom Mishuta
4b975668f6 tiering (test.py): introduce tiering labels
introduce tiering marks
1 “unstable” - for unstable tests that will continue running every night and generating up-to-date failure statistics, without failing the “Main” verification path (scylla-ci, Next)

2 “nightly” - for tests that are quite old, stable, and test functionality that is unlikely to be changed or affected by other features, are partially covered by other tests, verify non-critical functionality, have not found any issues or regressions, and take too long to run on every PR, so they can be popped out of the CI run.

set 7 long tests (according to statistics in elastic) as nightly (these tests took 20% of the CI run,
about 4 hours without parallelization)
1 test as unstable (as an example of marker usage)

Closes scylladb/scylladb#24974
2025-08-04 15:38:16 +03:00
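A possible shape for such labels, sketched in plain Python (the actual test.py marker mechanism and names may differ; this only illustrates the selection idea):

```python
# Hypothetical sketch of tiering labels; real test.py mechanics may differ.
TIER_LABELS = {}

def tier(label):
    """Attach a tiering label ('nightly' or 'unstable') to a test function."""
    def deco(fn):
        TIER_LABELS[fn.__name__] = label
        return fn
    return deco

@tier("nightly")
def test_long_running():
    pass

@tier("unstable")
def test_known_flaky():
    pass

def select_for_pr_ci(tests):
    # Per-PR CI drops 'nightly' tests entirely; 'unstable' tests still run
    # nightly but their failures would not gate the run (gating not shown).
    return [t for t in tests if TIER_LABELS.get(t.__name__) != "nightly"]
```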
Ferenc Szili
268ec72dc9 truncate: change check for write during truncate into a log warning
TRUNCATE TABLE performs a memtable flush and then discards the sstables
of the table being truncated. It collects the highest replay position
for both of these. When the highest replay position of the discarded
sstables is higher than the highest replay position of the flushed
memtable, that means that we have had writes during truncate which have
been flushed to disk independently of the truncate process. We check for
this and trigger an on_internal_error() which throws an exception,
informing the user that writing data concurrently with TRUNCATE TABLE is
not advised.

The problem with this is that truncate is also called from DROP KEYSPACE
and DROP TABLE. These are raft operations and exceptions thrown by them
are caught by the (...) exception handler in the raft applier fiber,
which then exits leaving the node without the ability to execute
subsequent raft commands.

This commit changes the on_internal_error() into a warning log entry. It
also outputs to keyspace/table names, the truncated_at timepoint, the
offending replay positions which caused the check to fail.

Fixes: #25173
Fixes: #25013
2025-08-04 12:24:50 +02:00
Piotr Dulikowski
ec7832cc84 Merge 'Raft-based recovery procedure: simplify rolling restart with recovery_leader' from Patryk Jędrzejczak
The following steps are performed in sequence as part of the
Raft-based recovery procedure:
- set `recovery_leader` to the host ID of the recovery leader in
  `scylla.yaml` on all live nodes,
- send the `SIGHUP` signal to all Scylla processes to reload the config,
- perform a rolling restart (with the recovery leader being restarted
  first).

These steps are not intuitive and more complicated than they could be.

In this PR, we simplify these steps. From now on, we will be able to
simply set `recovery_leader` on each node just before restarting it.

Apart from making necessary changes in the code, we also update all
tests of the Raft-based recovery procedure and the user-facing
documentation.

Fixes scylladb/scylladb#25015

The Raft-based procedure was added in 2025.2. This PR makes the
procedure simpler and less error-prone, so it should be backported
to 2025.2 and 2025.3.

Closes scylladb/scylladb#25032

* github.com:scylladb/scylladb:
  docs: document the option to set recovery_leader later
  test: delay setting recovery_leader in the recovery procedure tests
  gossip: add recovery_leader to gossip_digest_syn
  db: system_keyspace: peers_table_read_fixup: remove rows with null host_id
  db/config, gms/gossiper: change recovery_leader to UUID
  db/config, utils: allow using UUID as a config option
2025-08-04 08:29:32 +02:00
Ernest Zaslavsky
837475ec6f s3_creds: code cleanup
Remove unnecessary code which is no longer used
2025-08-04 09:26:11 +03:00
Ernest Zaslavsky
e4ebe6a309 s3_creds: Make reload unconditional
Assume that any caller invoking `reload` intends to refresh credentials.
Remove conditional logic that checks for expiration before reloading.
2025-08-03 17:41:35 +03:00
Ernest Zaslavsky
68855c90ca s3_creds: Add test exposing credentials renewal issue
Add a test demonstrating that renewing credentials does not update
their expiration. After requesting credentials again, the expiration
remains unchanged, indicating no actual update occurred.
2025-08-03 17:41:25 +03:00
Avi Kivity
1c25aa891b Merge 'storage_proxy.cc: get_cas_shard: fallback to the primary replica shard' from Petr Gusev
Currently, `get_cas_shard` uses `sharder.shard_for_reads` to decide which shard to use for LWT execution—both on replicas and the coordinator.

If the coordinator is not a replica, `shard_for_reads` returns a default shard (shard 0). There are at least two problems with this:
* shard 0 can become overloaded, because all LWT coordinator-but-not-replica requests are served on it.
* mismatch with replicas: the default shard doesn't match what `shard_for_reads` returns on replicas. This hinders the "same shard for client and server" RPC level optimization.

In this PR we change `get_cas_shard` to use a primary replica shard if the current node is not a replica. This guarantees that all LWT coordinators for the same tablet will be served on the same shard. This is important for LWT coordinator locks (`paxos::paxos_state::get_cas_lock`). Also, if all tablet replicas on different nodes live on the same shard, RPC optimization will make sure that no additional `smp::submit_to` will be needed on server side.

backport: not needed, since this fix applies only to LWT over tablets, and this feature is not released yet

Closes scylladb/scylladb#25224

* github.com:scylladb/scylladb:
  test_tablets_lwt.py: make tests rf_rack_valid
  test_tablets_lwt: add test_lwt_coordinator_shard
  storage_proxy.cc: get_cas_shard: fallback to the primary replica shard
  sharder: add try_get_shard_for_reads method
2025-08-01 23:07:25 +03:00
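The fallback logic described above can be sketched as follows (illustrative Python; the real change is C++ in storage_proxy.cc and the sharder):

```python
def get_cas_shard(is_replica, try_get_shard_for_reads, primary_replica_shard):
    """Hypothetical sketch of the shard-selection change above: use the
    local read shard when this node is a replica; otherwise fall back to
    the primary replica's shard instead of defaulting to shard 0, so all
    LWT coordinators for the same tablet land on the same shard."""
    shard = try_get_shard_for_reads() if is_replica else None
    return shard if shard is not None else primary_replica_shard
```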
Avi Kivity
8b1bf46086 Merge 'sstables: introduce trie_writer' from Michał Chojnowski
This is the first part of a larger project meant to implement a trie-based
index format. (The same or almost the same as Cassandra's BTI).

As of this patch, the new code isn't used for anything yet,
but it is introduced separately from its users to keep PRs small enough
for reviewability.

This commit introduces trie_writer, a class responsible for turning a
stream of (key, value) pairs (already sorted by key) into a stream of
serializable nodes, such that:

1. Each node lies entirely within one page (guaranteed).
2. Parents are located in the same page as their children (best-effort).
3. Padding (unused space) is minimized (best-effort).

It does mostly what you would expect a "sorted keys -> trie" builder to do.
The hard part is calculating the sizes of nodes (which, in a well-packed on-disk
format, depend on the exact offsets of the node from its children) and grouping
them into pages.

This implementation mostly follows Cassandra's design of the same thing.
There are some differences, though. Notable ones:

1. The writer operates on chains of characters, rather than single characters.

   In Cassandra's implementation, the writer creates one node per character.
   A single long key can be translated to thousands of nodes.
   We create only one node per key. (Actually we split very long keys into
   a few nodes, but that's arbitrary and beside the point).

   For BTI's partition key index this doesn't matter.
   Since it only stores a minimal unique prefix of each key,
   and the trie is very balanced (due to token randomness),
   the average number of new characters added per key is very close to 1 anyway.
   (And the string-based logic might actually be a small pessimization, since
   manipulating a 1-byte string might be costlier than manipulating a single byte).

   But the row index might store arbitrarily long entries, and in that case the
   character-based logic might result in catastrophically bad performance.
   For reference: when writing a partition index, the total processing cost
   of a single node in the trie_writer is on the order of 800 instructions.
   Total processing cost of a single tiny partition during a `upgradesstables`
   operation is on the order of 10000 instructions. A small INSERT is on the
   order of 40000 instructions.

   So processing a single 1000-character clustering key in the trie_writer
   could cost as much as 20 INSERTs, which is scary. Even 100-character keys
   can be very expensive. With extremely long keys like that, the string-based
   logic is more than ~100x cheaper than character-based logic.
   (Note that only *new* characters matter here. If two index entries share a
   prefix, that prefix is only processed once. And the index is only populated
   with the minimal prefix needed to distinguish neighbours. So in practice,
   long chains might not happen often. But still, they are possible).

   I don't know if it makes sense to care about this case, but I figured the
   potential for problems is too big to ignore, so I switched to chain-based logic.

2. In the (assumed to be rare) case when a grouped subtree turns out to be bigger
   than a full page after revising the estimate, Cassandra splits it in a
   different way than us.

For testability, there is some separation between the logic responsible
for turning a stream of keys into a stream of nodes, and the logic
responsible for turning a stream of nodes into a stream of bytes.
This commit only includes the first part. It doesn't implement the target
on-disk format yet.

The serialization logic is passed to trie_writer via a template parameter.

There is only one test added in this commit, which attempts to be exhaustive,
by testing all possible datasets up to some size. The run time of the test
grows exponentially with the parameter size. I picked a set of parameters
which runs fast enough while still being expressive enough to cover all
the logic. (I checked the code coverage). But I also tested it with greater parameters
on my own machine (and with DEVELOPER_BUILD enabled, which adds extra sanitization).

Refs scylladb/scylladb#19191

New functionality, no backporting needed.

Closes scylladb/scylladb#25154

* github.com:scylladb/scylladb:
  sstables: introduce trie_writer
  utils/bit_cast: add object_representation()
2025-08-01 20:23:24 +03:00
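The cost argument above (one node per new key suffix in the chain-based writer, versus one node per new character in Cassandra's writer) can be made concrete with a small counting sketch over sorted keys:

```python
def new_nodes_per_key(sorted_keys):
    """Illustrative sketch of the cost comparison described above: with
    sorted input, each key shares a prefix only with the previous key, so
    the chain-based writer adds at most one node per key (for the whole
    new suffix), while a character-based writer adds one node per new
    character."""
    prev = ""
    chain_nodes = 0
    char_nodes = 0
    for key in sorted_keys:
        lcp = 0
        while lcp < min(len(key), len(prev)) and key[lcp] == prev[lcp]:
            lcp += 1
        new_chars = len(key) - lcp
        if new_chars:
            chain_nodes += 1
        char_nodes += new_chars
        prev = key
    return chain_nodes, char_nodes
```

A 1000-character suffix costs the chain-based writer one node where the character-based writer would create a thousand, matching the ~100x-plus gap argued above for extremely long keys.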
Andrei Chekun
c0d652a973 test.py: change boost test stdout to use filehandler instead of pipe
With the current implementation, if pytest is killed, it is not able
to write the stdout from the boost test. With the new approach the output
is updated while the test executes, instead of being written at the end
of the test.

Closes scylladb/scylladb#25260
2025-08-01 15:05:00 +03:00
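A minimal sketch of the pipe-to-filehandle change (hypothetical helper name; the real change is inside test.py): writing the child's stdout straight to an open file means the output lands on disk incrementally, surviving even if the parent pytest process is killed.

```python
import subprocess
import sys
import tempfile
import os

def run_with_file_stdout(cmd, log_path):
    """Hypothetical sketch: redirect the child's stdout to a file handle
    instead of a pipe, so output is persisted while the test executes."""
    with open(log_path, "wb") as log:
        return subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT).returncode
```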
Michał Jadwiszczak
10214e13bd storage_service, group0_state_machine: move SL cache update from topology_state_load() to load_snapshot()
Currently the service levels cache is unnecessarily updated in every
call of `topology_state_load()`.
But it is enough to reload it only when a snapshot is loaded.
(The cache is also already updated when there is a change to one of
`service_levels_v2`, `role_members`, `role_attributes` tables.)

Fixes scylladb/scylladb#25114
Fixes scylladb/scylladb#23065

Closes scylladb/scylladb#25116
2025-08-01 13:41:08 +02:00
Jan Łakomy
8b2ed0f014 cassandra_tests: translate vector_invalid_query_test
Translate vector_invalid_query_test which tests parsing of ANN OF syntax.

Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
2025-08-01 12:08:50 +02:00
Jan Łakomy
eec47d9059 cassandra_tests: copy vector_invalid_query_test from Cassandra
Copy over and comment out this test's code from Cassandra, to be translated later.
2025-08-01 12:08:50 +02:00
Dawid Pawlik
b29e6870fa vector_index: make parameter names case insensitive
The custom index class name 'vector_index' and its similarity function
options should be case insensitive.

Before the patch the similarity functions had to be written in
SCREAMING_SNAKE_CASE, which is not common or intuitive.
Furthermore, the translated Cassandra tests use the options written in
snake_case, and as we wanted to translate them exactly, we had to be able
to use lowercase options.
2025-08-01 12:08:50 +02:00
Jan Łakomy
5fecad0ec8 cql3/statements: add ANN OF queries support to select statements
Add parsing of `ANN OF` queries to the `select_statement` and
`indexed_table_select_statement` classes.
Add a placeholder for the implementation of external ANN queries.

Rename `should_create_view` to `view_should_exist` as it is used
not only to check if the view should be created but also if
the view has been created.

Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
2025-08-01 12:08:50 +02:00
Taras Veretilnyk
15e3980693 docs: Add documentation for the nodetool dropquarantinedsstables command
Fixes scylladb/scylladb#19061
2025-08-01 11:46:33 +02:00
Nikos Dragazis
2656fca504 test: Use in-memory SQLite for PyKMIP server
The PyKMIP server uses an SQLite database to store artifacts such as
encryption keys. By default, SQLite performs a full journal and data
flush to disk on every CREATE TABLE operation. Each operation triggers
three fdatasync(2) calls. If we multiply this by 16, that is the number
of tables created by the server, we get a significant number of file
syncs, which can last for several seconds on slow machines.

This behavior has led to CI stability issues from KMIP unit tests where
the server failed to complete its schema creation within the 20-second
timeout (observed on spider9 and spider11).

Fix this by configuring the server to use an in-memory SQLite.

Fixes #24842.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#24995
2025-08-01 12:11:27 +03:00
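The change above boils down to pointing the PyKMIP server at an in-memory SQLite database, which never touches disk and so skips the fdatasync(2) calls entirely. A minimal sketch of the idea (hypothetical helper; the real configuration goes through the PyKMIP server's settings):

```python
import sqlite3

def make_server_db(in_memory=True):
    """Hypothetical sketch: an in-memory SQLite database avoids the
    per-CREATE-TABLE journal/data flushes that made schema creation
    exceed the 20-second timeout on slow CI machines."""
    path = ":memory:" if in_memory else "pykmip.db"
    conn = sqlite3.connect(path)
    conn.execute("CREATE TABLE keys (id INTEGER PRIMARY KEY, material BLOB)")
    return conn
```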
Nadav Har'El
2431f92967 alternator, test: add reproducer for issue about immediate LWT timeout
This patch adds a reproducer for issue #16261, where it was reported
that when Alternator read-modify-write (using LWT) operations to the
same partition are sent to different nodes, sometimes the operation
fails immediately, with an InternalServerError claiming to be a "timeout",
although this happens almost immediately (after a few milliseconds),
not after any real timeout.

The test uses 3 nodes, and 3 threads which send RMW operations to different
items in the same partition, and usually (though not with 100% certainty)
it reaches the InternalServerError in around 100 writes by each thread.
This InternalServerError looks like:

    Internal server error: exceptions::mutation_write_timeout_exception
    (Operation timed out for alternator_alternator_Test_1719157066704.alternator_Test_1719157066704 - received only 1 responses from 2 CL=LOCAL_SERIAL.)

The test also prints how much time it took for the request to fail,
for example:
    In incrementing 1,0 on node 1: error after 0.017074108123779297
This is 0.017 seconds - it's not the cas_contention_timeout_in_ms
timeout (1 second) or any other timeout.

If we enable trace logging, adding to topology_experimental_raft/suite.yaml
    extra_scylla_cmdline_options: ["--logger-log-level", "paxos=trace"]
we get the following TRACE-level message in the log:

    paxos - CAS[0] accept_proposal: proposal is partially rejected

This again shows the problem is "uncertainty" (partial rejection) and not
a timeout.

Refs #16261

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#19445
2025-08-01 11:58:52 +03:00
Aleksandra Martyniuk
e607ef10cd api: storage_service: do not log the exception that is passed to user
The exceptions that are thrown by the tasks started with API are
propagated to users. Hence, there is no need to log it.

Remove the logs about exception in user started tasks.

Fixes: https://github.com/scylladb/scylladb/issues/16732.

Closes scylladb/scylladb#25153
2025-08-01 09:49:51 +03:00
Nadav Har'El
edc15a3cf5 test/cqlpy: slightly strengthen test for system.clients
We already have a rather rudimentary test for system.clients listing CQL
connections. However, as written the test will pass if system.clients is
empty :-) So let's strengthen the test to verify that there must be at
least one CQL connection listed in system.clients. Indeed, the test runs
the "SELECT FROM system.clients" over one CQL connection, so surely that
connection must be present.

This patch doesn't strengthen this test in any other way - it still has
just one connection, not many, it still doesn't validate the values of
most of the columns, and it is still written to assume the Scylla server
is running on localhost and not running any other workload in parallel.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:32:19 +03:00
Nadav Har'El
ce0ee27422 generic_server: use utils::scoped_item_list
A previous patch introduced utils::scoped_item_list, which maintains
a list of items - such as a list of ongoing connections - automatically
removing the item from the list when its handle is destroyed. The list
can also be iterated "gently" (without risking stalls when the list is
long).

The implementation of this class was based on very similar code in
generic_server.hh / generic_server.cc. So in this patch we change
generic_server to use the new scoped_item_list, and drop its own copy
of the duplicated logic of maintaining the list and iterating gently
over it.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:32:14 +03:00
Nadav Har'El
70c94ac9dd docs/alternator: document the system.clients system table in Alternator
Add to docs/alternator/new-apis.md a full description of the
`system.clients` support in Alternator that was added in the previous
patches.

Although arguably *all* Scylla system tables should work on Alternator
and do not need to be individually documented, I believe that this
specific table, is interesting to document. This is because some of
the attributes in this table have non-obvious and Alternator-specific
meanings. Moreover, there's even a difference in what each individual
item in the table represents (it represents active requests, not entire
connections as in CQL).

While editing the system tables section of new-apis.md, this patch also slightly
improves its formatting.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:05 +03:00
Nadav Har'El
5baa4c40fd alternator: add test for Alternator clients in system.clients
This patch adds a regression test for the feature added in the previous patch,
i.e that the system.clients virtual table also lists ongoing Alternator request.

The new test reads the system.clients system table using an Alternator Scan
request, so it should see its own request - at least - in the result. It
verifies that it sees Alternator requests (at least one), and that these
requests have the expected fields set, and for a couple of fields, we
even know which value to expect (the "client_type" field is "alternator",
and the "ssl_enabled" field depends on whether the test is checking an
http:// or https:// URL (you can try both in test/alternator/run - by
using or not using the "--https" parameter).

The new test fails before the previous patch (because system.clients
will not list any Alternator connection), and passes after it.

As all tests in test_system_tables.py for Scylla-specific system tables,
this test is marked scylla_only and skipped when running on AWS DynamoDB.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:05 +03:00
Nadav Har'El
c14b9c5812 alternator: list active Alternator requests in system.clients
Today, the "system.clients" virtual table lists active connections (and
their various properties, like client address, logged in username and
client version) only for CQL requests. In this patch we make Alternator
active clients also be listed on this virtual table.

Unlike CQL where logged in username applies to a complete connection,
in the Alternator API, different requests, theoretically signed by
different users, can arrive over the same HTTP connection. So instead of
listing the currently open *connections*, we list the currently active
*requests*.

This means that when scanning system.clients, you will only see requests
which are being handled right now - and not inactive HTTP connections.
I think this is good enough (besides being the correct thing to do) - one
of the goals of this system.clients is to be able to see what kind of
drivers are being used by the user (the "driver_name" field in the
system.clients) - on a busy server there will always be some (even many)
requests being handled, so we'll always have plenty of requests to see
in system.clients.

By the way, note that for Alternator requests, what we use for the
"driver_name" is the request's User-Agent header. AWS SDKs typically
write the driver's name, its version, and often a lot of other
information in that header. For example, Boto3 sends a User-Agent
looking like:

    Boto3/1.38.46 md/Botocore#1.38.46 md/awscrt#0.24.2
    ua/2.1 os/linux#6.15.4-100.fc41.x86_64 md/arch#x86_64
    lang/python#3.13.5 md/pyimpl#CPython m/N,P,b,D,Z
    cfg/retry-mode#legacy Botocore/1.38.46 Resource

A functional test for the new feature - adding Alternator requests to
the system.clients table - will be in the next patch.

Fixes #24993

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:05 +03:00
Nadav Har'El
20b31987e1 utils: unit test for utils::scoped_item_list
The previous test introduced a new utility class, utils::scoped_item_list.
This patch adds a comprehensive unit test for the new class.

We test basic usage of scoped_item_list, its size() and empty() methods,
how items are removed from the list when their handle goes out of scope,
how a handle's move constructor works, how items can be read and written
through their handles, and finally that removing an item during a
for_each_gently() iteration doesn't break the iteration.

One thing I still didn't figure out how to properly test is how removing
an item during *multiple* concurrently running iterations fixes up all
of the iterators. I believe the code is correct there (we just have a
list of ongoing iterations - instead of just one), but I haven't yet
found a way to reproduce this situation in a test.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:04 +03:00
Nadav Har'El
186e6d3ce0 utils: add a scoped_item_list utility class
In a later patch, we'll want Alternator to maintain a list of ongoing
requests, and be able to list them when the system.clients table is
read. This patch introduces a new container, utils::scoped_item_list<T>,
that will help Alternator do that:

  1. Each request adds an item to the list, and receives a handle;
     When that handle goes out of scope the item is automatically
     deleted from the list.
  2. Also a method is provided for iterating over the list of items
     without risking a stall if the list is very long.

The new scoped_item_list<T> is heavily based on similar code that is
integrated inside generic_server.hh, which is used by CQL to similarly
maintain a list of active connections and their properties. However,
unfortunately that code is deeply integrated into the generic_server
class, and Alternator can't use generic_server because it uses Seastar's
HTTP server which isn't based on generic_server.

In contrast, the container defined in this patch is stand-alone and does
not depend on Alternator in any way. In a later patch in this series we
will modify generic_server to use the new scoped_item_list<> instead of
having that feature inside it.

The next patch is a unit test for the new class we are adding in this
patch.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:04 +03:00
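A rough Python analogue of the container described above (the real `utils::scoped_item_list<T>` is C++ and also supports stall-free "gentle" iteration, which this sketch omits):

```python
class ScopedItemList:
    """Hypothetical sketch: add() returns a handle, and when the handle is
    closed (or its context-manager scope exits) the item automatically
    leaves the list, mirroring the RAII behavior described above."""

    class Handle:
        def __init__(self, owner, key):
            self._owner, self._key = owner, key
        def close(self):
            self._owner._items.pop(self._key, None)
        def __enter__(self):
            return self
        def __exit__(self, *exc):
            self.close()

    def __init__(self):
        self._items = {}
        self._next = 0

    def add(self, item):
        key = self._next
        self._next += 1
        self._items[key] = item
        return self.Handle(self, key)

    def items(self):
        return list(self._items.values())
```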
Nadav Har'El
33476c7b06 utils: add "fatal" version of utils::on_internal_error()
utils::on_internal_error() is a wrapper for Seastar's on_internal_error()
which does not require a logger parameter - because it always uses one
logger ("on_internal_error"). Not needing a unique logger is especially
important when using on_internal_error() in a header file, where we
can't define a logger.

Seastar also has another similar function, on_fatal_internal_error(),
for which we forgot to implement a "utils" version (without a logger
parameter). This patch fixes that oversight.

In the next patch, we need to use on_fatal_internal_error() in a header
file, so the "utils" version will be useful. We will need the fatal
version because we will encounter an unexpected situation during server
destruction, and if we let the regular on_internal_error() just throw
an exception, we'll be left in an undefined state.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-08-01 02:15:04 +03:00
Patryk Jędrzejczak
e53dc7ca86 Merge 'remove unused function and simplify some qp code.' from Gleb Natapov
No backport needed since these are cleanups.

Closes scylladb/scylladb#25258

* https://github.com/scylladb/scylladb:
  qp: fold prepare_one function into its only caller
  qp: co-routinize prepare_one function
  cql3: drop unused function
2025-07-31 18:19:47 +02:00
Taras Veretilnyk
1d6808aec4 topology_coordinator: Make tablet_load_stats_refresh_interval configurable
This commit introduces a config option 'tablet_load_stats_refresh_interval_in_seconds'
that allows overriding the default value without using error injection.

Fixes scylladb/scylladb#24641

Closes scylladb/scylladb#24746
2025-07-31 14:31:55 +03:00
Gleb Natapov
041011b2ee qp: fold prepare_one function into its only caller 2025-07-31 14:12:34 +03:00
Gleb Natapov
715f1d994f qp: co-routinize prepare_one function 2025-07-31 14:11:17 +03:00
Michał Chojnowski
c8682af418 sstables: introduce trie_writer
This is the first part of a larger project meant to implement a trie-based
index format. (The same or almost the same as Cassandra's BTI).

As of this patch, the new code isn't used for anything yet,
but it is introduced separately from its users to keep the PRs
small enough to review.

This commit introduces trie_writer, a class responsible for turning a
stream of (key, value) pairs (already sorted by key) into a stream of
serializable nodes, such that:

1. Each node lies entirely within one page (guaranteed).
2. Parents are located in the same page as their children (best-effort).
3. Padding (unused space) is minimized (best-effort).

It does mostly what you would expect a "sorted keys -> trie" builder to do.
The hard part is calculating the sizes of nodes (which, in a well-packed on-disk
format, depend on the exact offsets of the node from its children) and grouping
them into pages.
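The chain-based idea described below (one node per key rather than one node per character) can be sketched like this. This is an illustrative toy, not the actual trie_writer (names are invented, and the real code also handles sizing and page grouping):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Sketch: with chain-based nodes, each sorted key contributes a single
// node holding only the suffix that is new relative to the previous key,
// instead of one node per character.
struct chain_node {
    std::string chain;  // the run of new characters this node covers
    std::size_t depth;  // characters shared with the previous key
};

std::vector<chain_node> build_chain_nodes(const std::vector<std::string>& sorted_keys) {
    std::vector<chain_node> nodes;
    std::string prev;
    for (const auto& key : sorted_keys) {
        // Longest common prefix with the previous key was already emitted.
        std::size_t lcp = 0;
        while (lcp < prev.size() && lcp < key.size() && prev[lcp] == key[lcp]) {
            ++lcp;
        }
        nodes.push_back({key.substr(lcp), lcp});
        prev = key;
    }
    return nodes;
}
```

Note how a 1000-character key still costs one node here, whereas character-based logic would cost up to 1000 nodes.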

This implementation mostly follows Cassandra's design of the same thing.
There are some differences, though. Notable ones:

1. The writer operates on chains of characters, rather than single characters.

   In Cassandra's implementation, the writer creates one node per character.
   A single long key can be translated to thousands of nodes.
   We create only one node per key. (Actually we split very long keys into
   a few nodes, but that's arbitrary and beside the point).

   For BTI's partition key index this doesn't matter.
   Since it only stores a minimal unique prefix of each key,
   and the trie is very balanced (due to token randomness),
   the average number of new characters added per key is very close to 1 anyway.
   (And the string-based logic might actually be a small pessimization, since
   manipulating a 1-byte string might be costlier than manipulating a single byte).

   But the row index might store arbitrarily long entries, and in that case the
   character-based logic might result in catastrophically bad performance.
   For reference: when writing a partition index, the total processing cost
   of a single node in the trie_writer is on the order of 800 instructions.
   Total processing cost of a single tiny partition during an `upgradesstables`
   operation is on the order of 10000 instructions. A small INSERT is on the
   order of 40000 instructions.

   So processing a single 1000-character clustering key in the trie_writer
   could cost as much as 20 INSERTs, which is scary. Even 100-character keys
   can be very expensive. With extremely long keys like that, the string-based
   logic is more than ~100x cheaper than character-based logic.
   (Note that only *new* characters matter here. If two index entries share a
   prefix, that prefix is only processed once. And the index is only populated
   with the minimal prefix needed to distinguish neighbours. So in practice,
   long chains might not happen often. But still, they are possible).

   I don't know if it makes sense to care about this case, but I figured the
   potential for problems is too big to ignore, so I switched to chain-based logic.

2. In the (assumed to be rare) case when a grouped subtree turns out to be bigger
   than a full page after revising the estimate, Cassandra splits it in a
   different way than us.

For testability, there is some separation between the logic responsible
for turning a stream of keys into a stream of nodes, and the logic
responsible for turning a stream of nodes into a stream of bytes.
This commit only includes the first part. It doesn't implement the target
on-disk format yet.

The serialization logic is passed to trie_writer via a template parameter.

There is only one test added in this commit, which attempts to be exhaustive,
by testing all possible datasets up to some size. The run time of the test
grows exponentially with the parameter size. I picked a set of parameters
which runs fast enough while still being expressive enough to cover all
the logic. (I checked the code coverage). But I also tested it with greater parameters
on my own machine (and with DEVELOPER_BUILD enabled, which adds extra sanitization).
2025-07-31 12:51:37 +02:00
Calle Wilund
43f7eecf9e compress: move compress.cc/hh to sstables/compressor
Fixes #22106

Moves the shared compress components to sstables, and renames them to
match the class type.

Adjust includes, removing redundant/unneeded ones where possible.

Closes scylladb/scylladb#25103
2025-07-31 13:10:41 +03:00
Pavel Emelyanov
34608450c5 Merge 'qos: don't populate effective service level cache until auth is migrated to raft' from Piotr Dulikowski
Right now, service levels are migrated in one group0 command and auth is migrated in the next one. This has a bad effect on the group0 state reload logic - modifying service levels in group0 causes the effective service levels cache to be recalculated, and to do so we need to fetch information about all roles. If the reload happens after SL upgrade and before auth upgrade, the query for roles will be directed to the legacy auth tables in system_auth - and the query, being a potentially remote query, has a timeout. If the query times out, it will throw an exception which will break the group0 apply fiber and the node will need to be restarted to bring it back to work.

In order to solve this issue, make sure that the service level module does not start populating and using the service level cache until both service levels and auth are migrated to raft. This is achieved by adding the check both to the cache population logic and the effective service level getter - they now look at the service level accessor's new method, `can_use_effective_service_level_cache`, which looks at the auth version.

Fixes: scylladb/scylladb#24963

Should be backported to all versions which support upgrade to topology over raft - the issue described here may put the cluster into a state which is difficult to get out of (group0 apply fiber can break on multiple nodes, which necessitates their restart).

Closes scylladb/scylladb#25188

* github.com:scylladb/scylladb:
  test: sl: verify that legacy auth is not queried in sl to raft upgrade
  qos: don't populate effective service level cache until auth is migrated to raft
2025-07-31 13:05:27 +03:00
Botond Dénes
7e27157664 replica/table: add_sstables_and_update_cache(): remove error log
The plural overload of this method logs an error when the sstable add
fails. This is unnecessary, the caller is expected to catch and handle
exceptions. Furthermore, this unconditional error log results in
sporadic test failures, due to the unexpected error in the logs on
shutdown.

Fixes: #24850

Closes scylladb/scylladb#25235
2025-07-31 12:34:40 +03:00
Jan Łakomy
e69e0cb546 cql/Cql.g: extend the grammar to allow for ANN OF queries
Extend `orderByClause` so that it can accept the `ORDER BY 'column_name' ANN OF 'vector_literal'` syntax.

Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
2025-07-31 11:11:24 +02:00
Jan Łakomy
d073a4c1fa cql3/raw: add ANN ordering to the raw statement layer
Extend `orderings_type` to include ANN ordering.

Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
2025-07-31 11:11:24 +02:00
Petr Gusev
3500a10197 scylla_cluster.py: add try_get_host_id
Tests sometimes fail in ScyllaCluster.add_server on the
'replaced_srv.host_id' line because host_id is not resolved yet. In
this commit we introduce functions try_get_host_id and get_host_id
that resolve it when needed.

Closes scylladb/scylladb#25177
2025-07-31 10:37:06 +02:00
Patryk Jędrzejczak
c41f0e6da9 Merge 'generic server: 2 step shutdown' from Sergey Zolotukhin
This PR implements the solution proposed in scylladb/scylladb#24481

Instead of terminating connections immediately, the shutdown now proceeds in two stages: first closing the receive (input) side to stop new requests, then waiting for all active requests to complete before fully closing the connections.

The updated shutdown process is as follows:

1. Initial Shutdown Phase
   * Close the accept gate to block new incoming connections.
   * Abort all accept() calls.
   * For all active connections:
      * Close only the input side of the connection to prevent new requests.
      * Keep the output side open to allow responses to be sent.

2. Drain Phase
   * Wait for all in-progress requests to either complete or fail.

3. Final Shutdown Phase
   * Fully close all connections.
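The three phases above can be sketched with a toy connection model. This is illustrative only (the real generic_server works with Seastar sockets and gates, and the names below are made up):

```cpp
#include <cassert>

// Sketch of the two-step shutdown: close input first so no new requests
// arrive, let in-flight requests drain while output stays open, and only
// then fully close the connection.
struct connection_sketch {
    bool input_open = true;
    bool output_open = true;
    int in_flight = 0;

    bool accept_request() {
        if (!input_open) {
            return false;  // phase 1: new requests are rejected
        }
        ++in_flight;
        return true;
    }
    void complete_request() {
        --in_flight;       // phase 2: responses can still be sent
    }
    void shutdown_input() { input_open = false; }
    bool drained() const { return in_flight == 0; }
    void close() {         // phase 3: full close, only after draining
        input_open = false;
        output_open = false;
    }
};
```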

Fixes scylladb/scylladb#24481

Closes scylladb/scylladb#24499

* https://github.com/scylladb/scylladb:
  test: Set `request_timeout_on_shutdown_in_seconds` to `request_timeout_in_ms`,  decrease request timeout.
  generic_server: Two-step connection shutdown.
  transport: consmetic change, remove extra blanks.
  transport: Handle sleep aborted exception in sleep_until_timeout_passes
  generic_server: replace empty destructor with `= default`
  generic_server: refactor connection::shutdown to use `shutdown_input` and `shutdown_output`
  generic_server: add `shutdown_input` and `shutdown_output` functions to `connection` class.
  test: Add test for query execution during CQL server shutdown
2025-07-31 10:32:30 +02:00
Nadav Har'El
78c10af960 test/cqlpy: add reproducer for INSERT JSON .. IF NOT EXISTS bug
This patch adds an xfailing test reproducing a bug where when adding
an IF NOT EXISTS to a INSERT JSON statement, the IF NOT EXISTS is
ignored.

This bug has been known for 4 years (issue #8682) and even has a FIXME
referring to it in cql3/statements/update_statement.cc, but until now
we didn't have a reproducing test.

The tests in this patch also show that this bug is specific to
INSERT JSON - regular INSERT works correctly - and also that
Cassandra works correctly (and passes the test).

Refs #8682

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25244
2025-07-30 20:14:50 +03:00
Piotr Smaron
8d5249420b Update seastar submodule
* seastar 60b2e7da...7c32d290 (14):
  > posix: Replace static_assert with concept
  > tls: Push iovec with the help of put(vector<temporary_buffer>)
  > io_queue: Narrow down friendship with reactor
  > util: drop concepts.hh
  > reactor: Re-use posix::to_timespec() helper
  > Fix incorrect defaults for io queue iops/bandwidth
  > net: functions describing ssl connection
  > Add label values to the duplicate metrics exception
  > Merge 'Nested scheduling groups (CPU only)' from Pavel Emelyanov
    test: Add unit test for cross-sched-groups wakeups
    test: Add unit test for fair CPU scheduling
    test: Add unit test for basic supergrops manipulations
    test: Add perf test for context switch latency
    scheduling: Add an internal method to get group's supergroup
    reactor: Add supergroup get_shares() API
    reactor: Add supergroup::set_shares() API
    reactor: Create scheduling groups in supergroups
    reactor: Supergroups destroying API
    reactor: Supergroups creating API
    reactor: Pass parent pointer to task_queue from caller
    reactor: Wakeup queue group on child activation
    reactor: Add pure virtual sched_entity::run_tasks() method
    reactor: Make task_queue_group be sched_entity too
    reactor: Split task_queue_group::run_some_tasks()
    reactor: Count and limit supergroup children
    reactor: Link sched entity to its parent
    reactor: Switch activate(task_queue*) to work on sched_entity
    reactor: Move set_shares() to sched_entity()
    reactor: Make account_runtime() work with sched_entity
    reactor: Make insert_activating_task_queue() work on sched_entity
    reactor: Make pop_active_task_queue() work on sched_entity
    reactor: Make insert_active_task_queue() work on sched_entity
    reactor: Move timings to sched_entity
    reactor: Move active bit to sched_entity
    reactor: Move shares to sched_entity
    reactor: Move vruntime to sched_entity
    reactor: Introduce sched_entity
    reactor: Rename _activating_task_queues -> _activating
    reactor: Remove local atq* variable
    reactor: Rename _active_task_queues -> _active
    reactor: Move account_runtime() to task_queue_group
    reactor: Move vruntime update from task_queue into _group
    reactor: Simplify task_queue_group::run_some_tasks()
    reactor: Move run_some_tasks() into task_queue_group
    reactor: Move insert_activating_task_queues() into task_queue_group
    reactor: Move pop_active_task_queue() into task_queue_group
    reactor: Move insert_active_task_queue() into task_queue_group
    reactor: Introduce and use task_queue_group::activate(task_queue)
    reactor: Introduce task_queue_group::active()
    reactor: Wrap scheduling fields into task_queue_group
    reactor: Simplify task_queue::activate()
    reactor: Rename task_queue::activate() -> wakeup()
    reactor: Make activate() method of class task_queue
    reactor: Make task_queue::run_tasks() return bool
    reactor: Simplify task_queue::run_tasks()
    reactor: Make run_tasks() method of class task_queue
  > Fix hang in io_queue for big write ioproperties numbers
  > split random io buffer size in 2 options
  > reactor: document run_in_background
  > Merge 'Add io_queue unit test for checking request rates' from Robert Bindar
    Add unit test for validating computed params in io_queue
    Move `disk_params` and `disk_config_params` to their own unit
    Add an overload for `disk_config_params::generate_config`

Closes scylladb/scylladb#25254
2025-07-30 16:44:18 +03:00
Patryk Jędrzejczak
5ce16488c9 Merge 'test/cqlpy: two small fixes for "--release" feature' from Nadav Har'El
This small series fixes two small bugs in the "--release" feature of test/cqlpy/run and test/alternator/run, which allows a developer to run single-node functional tests against any past release of Scylla. The two patches fix:

1. Allow "run --release" to be used when Scylla has not even been built from source.
2. Fix a mistake in choosing the most recent release when only a ".0" and RC releases are available. This is currently the case for the 2025.2 branch, which is why I discovered the bug now.

Fixes #25223

This patch only affects developer's experience if using the test/cqlpy/run script manually (these scripts are not used by CI), so should not be backported.

Closes scylladb/scylladb#25227

* https://github.com/scylladb/scylladb:
  test/cqlpy: fix fetch_scylla.py for .0 releases
  test/cqlpy: fix "run --release" when Scylla hasn't been built
2025-07-30 15:13:26 +02:00
Petr Gusev
dea41b1764 test_tablets_lwt.py: make tests rf_rack_valid
This is a refactoring commit. Remove the rf_rack_valid_keyspaces: False
flag because rf_rack_validity is going to become mandatory in
scylladb/scylladb#23526
2025-07-30 13:48:33 +02:00
Aleksandra Martyniuk
99ff08ae78 streaming: close sink when exception is thrown
If an exception is thrown in result_handling_cont in streaming,
then the sink does not get closed. This leads to a node crash.

Close sink in exception handler.

Fixes: https://github.com/scylladb/scylladb/issues/25165.

Closes scylladb/scylladb#25238
2025-07-30 14:26:14 +03:00
Petr Gusev
bd82a9d7e5 test_tablets_lwt: add test_lwt_coordinator_shard
Check that an LWT coordinator which is not a replica runs on the
same shard as a replica.
2025-07-30 13:08:56 +02:00
Andrei Chekun
d0e4045103 test.py: add repeats in pytest
Previously, each repeat launched a separate pytest process. That was
resource-consuming, since pytest performed test discovery every time.
Now all repeats are done inside one pytest process.
2025-07-30 12:03:08 +02:00
Andrei Chekun
853bdec3ec test.py: add directories and filename to the log files
Currently, only the test function name is used for the output and log
files. For better clarity, prepend the relative path from the test
directory and the file name without extension to these files.
Before:
test_aggregate_avg.1.log
test_aggregate_avg_stdout.1.log
After:
boost.aggregate_fcts_test.test_aggregate_avg.1.log
boost.aggregate_fcts_test.test_aggregate_avg_stdout.3.log
2025-07-30 12:03:08 +02:00
Andrei Chekun
557293995b test.py: rename log sink file for boost tests
The log sink is output in XML format, not just a simple text file. Rename it for better clarity.
2025-07-30 12:03:08 +02:00
Andrei Chekun
cc75197efd test.py: better error handling in boost facade
If a test was not executed for some reason (for example, an unknown parameter was passed to the test) but the boost framework finished correctly, the log file will have data that parses to an empty list. This used to raise an exception during the pytest run instead of producing test output. This change handles that situation.
2025-07-30 12:03:08 +02:00
Andrei Chekun
4c33ff791b build: add pytest-timeout to the toolchain
Adding this plugin allows setting a timeout for a single test or for the whole
session. This can be useful for the Unit Test Custom task in the pipeline to avoid
running tests in batches, which would mess with the test names later in Jenkins.

Closes #25210

[avi: regenerate frozen toolchain with optimized clang from

  https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-aarch64.tar.gz
  https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-x86_64.tar.gz
]

Closes scylladb/scylladb#25243
2025-07-30 12:53:10 +03:00
Gleb Natapov
e496a89f80 cql3: drop unused function 2025-07-30 12:17:23 +03:00
Avi Kivity
5e150eafa4 keys: clustering_bounds_comparator: make thread_local _empty_prefix constinit
Avoids thread_local guards on every access.
2025-07-29 23:55:19 +03:00
Avi Kivity
e2316a4a66 keys: make empty creation clustering_key_prefix constexpr
Short-circuit make_empty() to construct an empty managed_bytes.
Sprinkle constexpr specifiers as needed to make it work.
2025-07-29 23:54:03 +03:00
Avi Kivity
5c6c944797 managed_bytes: make empty managed_bytes constexpr friendly
Sprinkle constexpr where needed to make the default constructor,
move constructor, and destructor constexpr.

Add a test to verify.

This is needed to make a thread_local variable containing an
empty managed_bytes constinit, reducing thread-local guards.
2025-07-29 23:51:43 +03:00
Avi Kivity
3f6d0d832c keys: clustering_bounds_comparator: make _empty_prefix a prefix
_empty_prefix, as its name suggests, is a prefix, but its type
is not. Presumably it works due to implicit conversions.

There should not be a clustering_key::make_empty(), but we'll
suffer it for now.

Fix by making _empty_prefix a prefix.
2025-07-29 23:13:09 +03:00
Petr Gusev
e120ee6d32 storage_proxy.cc: get_cas_shard: fallback to the primary replica shard
Currently, get_cas_shard uses shard_for_reads to decide which
shard to use for LWT execution—both on replicas and the coordinator.

If the coordinator is not a replica, shard_for_reads returns a default
shard (shard 0). There are at least two problems with this:
* shard 0 can become overloaded, because all LWT
coordinators-but-not-replicas are served on it.
* mismatch with replicas: the default shard doesn't match what
shard_for_reads returns on replicas. This hinders the "same shard for
client and server" RPC level optimization.

In this commit we change get_cas_shard to use a primary replica
shard if the current node is not a replica. This guarantees that all
LWT coordinators for the same tablet will be served on the same shard.
This is important for LWT coordinator locks
(paxos::paxos_state::get_cas_lock). Also, if all tablet replicas on
different nodes live on the same shard, RPC
optimization will make sure that no additional smp::submit_to will
be needed on the server side.

Fixes scylladb/scylladb#20497
2025-07-29 17:07:04 +02:00
Botond Dénes
2985c343ed Merge 'repair: Avoid too many fragments in a single repair_row_on_wire' from Asias He
When repairing a partition with many rows, we can store many fragments in a repair_row_on_wire object which is sent as an rpc stream message.

This could cause reactor stalls when rpc stream compression is turned on, because the whole message is compressed at once, without any splitting.

This patch solves the problem at the higher level by reducing the message size that is sent to the rpc stream.

Tests are added to make sure the message split works.

Fixes #24808

Closes scylladb/scylladb#25002

* github.com:scylladb/scylladb:
  repair: Avoid too many fragments in a single repair_row_on_wire
  repair: Change partition_key_and_mutation_fragments to use chunked_vector
  utils: Allow chunked_vector::erase to work with non-default-constructible type
2025-07-29 17:45:57 +03:00
Patryk Jędrzejczak
8e43856ca7 Merge 'Pass more elaborated "reasons" to stop_ongoing_compactions()' from Pavel Emelyanov
When running compactions are aborted by the aforementioned helper, a line like
"Compaction for ks/cf was stopped due to: user-triggered operation" appears in the logs. This message could be better, since several distinct reasons are all described with the same "user-triggered operation".

With this PR the message will tell "truncate", "cleanup", "rewrite" and "split" apart.

Closes scylladb/scylladb#25136

* https://github.com/scylladb/scylladb:
  compaction: Pass "reason" to perform_task_on_all_files()
  compaction: Pass "reason" to run_with_compaction_disabled()
  compaction: Pass "reason" to stop_and_disable_compaction()
2025-07-29 16:06:17 +02:00
Pavel Emelyanov
286fad4da6 api: Simplify table_info::name extraction with std::views::transform
Instead of using a lambda, pass a pointer to the struct member. The result is
the same, but the code is nicer.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#25123
2025-07-29 15:56:58 +02:00
Sergey Zolotukhin
4f63e1df58 test: Set request_timeout_on_shutdown_in_seconds to request_timeout_in_ms,
decrease request timeout.

In debug mode, queries may sometimes take longer than the default 30 seconds.
To address this, the timeout value `request_timeout_on_shutdown_in_seconds`
during tests is aligned with other request timeouts.
Change request timeout for tests from 180s to 90s since we must keep the request
timeout during shutdown significantly lower than the graceful shutdown timeout (2m),
or else a request timeout would cause a graceful shutdown timeout and fail a test.
2025-07-29 15:37:47 +02:00
Nadav Har'El
22f845b128 docs/alternator: mention missing ShardFilter support
Add in docs/alternator/compatibility.md a mention of the ShardFilter
option which we don't support in Alternator Streams. This option was
only introduced to DynamoDB a week ago, so it's not surprising we
don't yet support it :-)

Refs #25160

Closes scylladb/scylladb#25161
2025-07-29 14:37:24 +03:00
Andrei Chekun
a6a3d119e8 docs: update documentation with new way of running C++ tests
The documentation had outdated information on how to run C++ tests.
Additionally, some information was added about gathered test metrics.

Closes scylladb/scylladb#25180
2025-07-29 14:36:19 +03:00
Dawid Mędrek
408b45fa7e db/commitlog: Extend error messages for corrupted data
We're providing additional information in error messages when throwing
an exception related to data corruption: when a segment is truncated
and when its content is invalid. That might prove helpful when debugging.

Closes scylladb/scylladb#25190
2025-07-29 14:35:14 +03:00
Anna Stuchlik
b67bb641bc doc: add OS support for ScyllaDB 2025.3
This commit adds the information about support for platforms in ScyllaDB version 2025.3.

Fixes https://github.com/scylladb/scylladb/issues/24698

Closes scylladb/scylladb#25220
2025-07-29 14:33:12 +03:00
Anna Stuchlik
8365219d40 doc: add the upgrade guide from 2025.2 to 2025.3
This PR adds the upgrade guide from version 2025.2 to 2025.3.
Also, it removes the upgrade guide existing for the previous version
that is irrelevant in 2025.2 (upgrade from 2025.1 to 2025.2).

Note that the new guide does not include the "Enable Consistent Topology Updates" page and note,
as users upgrading to 2025.3 have consistent topology updates already enabled.

Fixes https://github.com/scylladb/scylladb/issues/24696

Closes scylladb/scylladb#25219
2025-07-29 14:32:31 +03:00
Avi Kivity
11ee58090c commitlog: replace std::enable_if with a constraint
std::enable_if is obsolete and was replaced with concepts
and constraint.

Replace the std::is_fundamental_v enable_if constraint with
std::integral. The latter is more accurate - std::ntoh()
is not defined for floats, for example. In any case, we only
read integrals in commitlog.

Closes scylladb/scylladb#25226
2025-07-29 12:51:24 +02:00
Michał Chojnowski
6d27065f99 cql3/result_set: set GLOBAL_TABLES_SPEC in metadata if appropriate
Unless the client uses the SKIP_METADATA flag,
Scylla attaches some metadata to query results returned to the CQL
client.
In particular, it attaches the spec (keyspace name, table
name, column name, type) of the returned columns.

By default, the keyspace name and table name is present in each column
spec. However, since they are almost always the same for every column
(I can't think of any case when they aren't the same;
it would make sense if Cassandra supported joins, but it doesn't)
that's a waste.

So, as an optimization, the CQL protocol has the GLOBAL_TABLES_SPEC flag.
The flag can be set if all columns belong to the same table,
and if it is set, then the keyspace and table name are only written
in the first column spec, and skipped in other column specs.

Scylla sets this flag, if appropriate, in responses to PREPARE requests.
But it never sets the flag in responses to queries.

But it could. And this patch causes it to do that.
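The flag decision boils down to a simple check. This is an illustrative sketch (types and names invented for the example, not Scylla's actual metadata code):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Sketch: GLOBAL_TABLES_SPEC may be set only when every column in the
// result set comes from the same keyspace and table.
struct column_spec {
    std::string keyspace;
    std::string table;
    std::string name;
};

bool can_set_global_tables_spec(const std::vector<column_spec>& cols) {
    if (cols.empty()) {
        return false;
    }
    for (const auto& c : cols) {
        if (c.keyspace != cols.front().keyspace || c.table != cols.front().table) {
            return false;  // mixed tables: each spec must carry its own names
        }
    }
    return true;  // write keyspace/table once, skip them in later specs
}
```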

Fixes #17788

Closes scylladb/scylladb#25205
2025-07-29 12:40:12 +03:00
Piotr Dulikowski
3a082d314c test: sl: verify that legacy auth is not queried in sl to raft upgrade
Adjust `test_service_levels_upgrade`: right before upgrade to topology
on raft, enable an error injection which triggers when the standard role
manager is about to query the legacy auth tables in the
system_auth keyspace. The preceding commit which fixes
scylladb/scylladb#24963 makes sure that the legacy tables are not
queried during upgrade to topology on raft, so the error injection does
not trigger and does not cause a problem; without that commit, the test
fails.
2025-07-29 11:39:17 +02:00
Piotr Dulikowski
2bb800c004 qos: don't populate effective service level cache until auth is migrated to raft
Right now, service levels are migrated in one group0 command and auth
is migrated in the next one. This has a bad effect on the group0 state
reload logic - modifying service levels in group0 causes the effective
service levels cache to be recalculated, and to do so we need to fetch
information about all roles. If the reload happens after SL upgrade and
before auth upgrade, the query for roles will be directed to the legacy
auth tables in system_auth - and the query, being a potentially remote
query, has a timeout. If the query times out, it will throw
an exception which will break the group0 apply fiber and the node will
need to be restarted to bring it back to work.

In order to solve this issue, make sure that the service level module
does not start populating and using the service level cache until both
service levels and auth are migrated to raft. This is achieved by adding
the check both to the cache population logic and the effective service
level getter - they now look at the service level accessor's new method,
`can_use_effective_service_level_cache` which takes a look at the auth
version.

Fixes: scylladb/scylladb#24963
2025-07-29 11:37:37 +02:00
Petr Gusev
801bf42ea2 sharder: add try_get_shard_for_reads method
Currently, we use storage_proxy/get_cas_shard ->
sharder.shard_for_reads to decide which shard to use for LWT code
execution on both replicas and the coordinator.

If the coordinator is not a replica, shard_for_reads returns 0 —
the 'default' shard. This behavior has at least two problems:
* Shard 0 may become overloaded, because all LWT coordinators that are
not replicas will be served on it.
* The zero shard does not match shard_for_reads on replicas, which
hinders the "same shard for client and server" RPC-level optimization.

To fix this, we need to know whether the current node hosts a replica
for the tablet corresponding to the given token. Currently, there is
no API we could use for this. For historical reasons,
sharder::shard_for_reads returns 0 when the node does not host the
shard, which leads to ambiguity.

This commit introduces try_get_shard_for_reads, which returns a
disengaged std::optional when the tablet is not present on
the local node.

We leave the shard_for_reads method in the base sharder class; it calls
try_get_shard_for_reads and returns zero by default. We need to rename
the tablet_sharder private methods shard_for_reads and shard_for_writes
so that they don't conflict with sharder::shard_for_reads.
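The new API shape can be sketched as follows (heavily simplified; these are not the real sharder classes):

```cpp
#include <cassert>
#include <optional>

// Sketch: try_get_shard_for_reads disambiguates "shard 0" from "this
// node hosts no replica", while shard_for_reads keeps its historical
// zero-defaulting behavior for existing callers.
using shard_id = unsigned;

struct sharder_sketch {
    std::optional<shard_id> local_shard;  // disengaged if not hosted here

    std::optional<shard_id> try_get_shard_for_reads() const {
        return local_shard;
    }
    shard_id shard_for_reads() const {
        // Historical behavior: fall back to shard 0 when not hosted locally.
        return try_get_shard_for_reads().value_or(0);
    }
};
```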
2025-07-29 11:35:54 +02:00
Nadav Har'El
f6a3e6fbf0 sstables: don't depend on fmt 11.1 to build
A recent commit a0c29055e5 added
some trace printouts which print an std::reference_wrapper<>.
Apparently a formatter for this type was only added to fmt
in version 11.1.0, and it doesn't exist on earlier versions,
such as fmt 11.0.2 on Fedora 41.

Let's avoid requiring shiny-new versions of fmt. The workaround
is easy: just unwrap the reference_wrapper - print pr.get()
instead of just pr, and Scylla returns to building correctly on
Fedora 41.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25228
2025-07-29 11:32:06 +02:00
Patryk Jędrzejczak
3299ffba51 Merge 'raft_group0: split shutdown into abort-and-drain and destroy' from Petr Gusev
Previously, `raft_group0::abort()` was called in `storage_service::do_drain` (introduced in #24418) to stop the group0 Raft server before destroying local storage. This was necessary because `raft::server` depends on storage (via `raft_sys_table_storage` and `group0_state_machine`).

However, this caused issues: services like `sstable_dict_autotrainer` and `auth::service`, which use `group0_client` but are not stopped by `storage_service`, could trigger use-after-free if `raft_group0` was destroyed too early. This can happen both during normal shutdown and when 'nodetool drain' is used.

This PR reworks the shutdown logic:
* Introduces `abort_and_drain()`, which aborts the server and waits for background tasks to finish, but keeps the server object alive. Clients will see `raft::stopped_error` if they try to access group0 after this method is called.
* Final destruction now happens in `abort_and_destroy()`, called later from `main.cc`, ensuring safe cleanup.

The `raft_server_for_group::aborted` is changed to a `shared_future`, as it is now awaited in both abort methods.

Node startup can fail before reaching `storage_service`, in which case `drain_on_shutdown()` and `abort_and_drain()` are never called. To ensure proper cleanup, `raft_group0` deinitialization logic must be included in both `abort_and_drain()` and `abort_and_destroy()`.

Refs #25115

Fixes #24625

Backport: the changes are complicated and not safe to backport, we'll backport a revert of the original patch (#24418) in a separate PR.

Closes scylladb/scylladb#25151

* https://github.com/scylladb/scylladb:
  raft_group0: split shutdown into abort_and_drain and destroy
  Revert "main.cc: fix group0 shutdown order"
2025-07-29 10:39:00 +02:00
Asias He
e28c75aa79 repair: Avoid too many fragments in a single repair_row_on_wire
When repairing a partition with many rows, we can store many fragments
in a repair_row_on_wire object which is sent as a rpc stream message.

This could cause reactor stalls when rpc stream compression is
turned on, because the compressor processes the whole message in one
go, without any splitting.

This patch solves the problem at the higher level by reducing the
message size that is sent to the rpc stream.

Tests are added to make sure the message split works.

Fixes #24808
2025-07-29 13:43:53 +08:00
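The splitting idea above can be sketched in Python (the batch limit and helper name are assumptions for illustration, not the actual repair code):

```python
def split_row_on_wire(fragments, max_fragments_per_message=32):
    """Yield bounded-size batches instead of one huge repair_row_on_wire,
    so each rpc stream message stays small enough to compress without
    stalling the reactor."""
    batch = []
    for frag in fragments:
        batch.append(frag)
        if len(batch) >= max_fragments_per_message:
            yield batch
            batch = []
    if batch:
        yield batch
```

Each yielded batch would become its own stream message, so the compressor never sees one oversized buffer.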
Asias He
266a518e4c repair: Change partition_key_and_mutation_fragments to use chunked_vector
With the change in "repair: Avoid too many fragments in a single
repair_row_on_wire", the

std::list<frozen_mutation_fragment> _mfs;

in partition_key_and_mutation_fragments will no longer contain a large
number of fragments. Switch to using chunked_vector.
2025-07-29 13:43:17 +08:00
Asias He
4a4fbae8f7 utils: Allow chunked_vector::erase to work with non-default-constructible type
This is needed for chunked_vector<frozen_mutation_fragment> in repair.
2025-07-29 13:43:17 +08:00
Avi Kivity
d3cdb88fe7 tools: toolchain: dbuild: increase depth of nested podman configuration coverage
The initial support for nested containers (2d2a2ef277) worked on
my machine (tm) and even laptop, but does not work on fresh installs.
This is likely due to changes in where persistent configuration is
stored on the host between various podman versions; even though my
podman is fully updated, it uses configuration created long ago.

Make nested containers work on fresh installs by also configuring
/etc/containers/storage.conf. The important piece is to set graphroot
to the same location as the host.

Verified both on my machine and on a fresh install.

Closes scylladb/scylladb#25156
2025-07-29 08:23:41 +03:00
Botond Dénes
f3ed27bd9e Merge 'Move feature-service config creation code out of feature-service itself' from Pavel Emelyanov
Nowadays the way to configure an internal service is

1. service declares its config struct
2. caller (main/test/tool) fills the respective config with values it wants
3. the service is started with the config passed by value

The feature service code behaves likewise, but provides a helper method to create its config out of db::config. This PR moves this helper out of gms code, so that it doesn't mess with system-wide db::config and only needs its own small struct feature_config.

For the reference: similar changes with other services: #23705 , #20174 , #19166

Closes scylladb/scylladb#25118

* github.com:scylladb/scylladb:
  gms,init: Move get_disabled_features_from_db_config() from gms
  code: Update callers generating feature service config
  gms: Make feature_config a simple struct
  gms: Split feature_config_from_db_config() into two
2025-07-29 08:17:49 +03:00
Anna Stuchlik
18b4d4a77c doc: add tablets support information to the Drivers table
This commit:

- Extends the Drivers support table with information on which driver supports tablets
  and since which version.
- Adds the driver support policy to the Drivers page.
- Reorganizes the Drivers page to accommodate the updates.

In addition:
- The CPP-over-Rust driver is added to the table.
- The information about Serverless (which we don't support) is removed
  and replaced with tablets to correctly describe the contents of the table.

Fixes https://github.com/scylladb/scylladb/issues/19471

Refs https://github.com/scylladb/scylladb-docs-homepage/issues/69

Closes scylladb/scylladb#24635
2025-07-29 08:11:42 +03:00
Avi Kivity
f7324a44a2 compaction: demote normal compaction start/end log messages to debug level
Compaction is routine and the log messages pollute the log files,
hiding important information.

All the data is available via `nodetool compactionhistory`.

Reduce noise by demoting those log messages to debug level.

One test is adjusted to use debug level for compaction, since it
listens for those messages.

Closes scylladb/scylladb#24949
2025-07-29 08:02:22 +03:00
Nadav Har'El
e43828c10b test/cqlpy: fix fetch_scylla.py for .0 releases
The test/cqlpy/fetch_scylla.py script is used by test/cqlpy/run and
test/alternator/run to implement their "--release" option - which allows
you to run current tests against any official release of Scylla
downloaded from Scylla's S3 bucket.

When you ask to get release "2025.1", the idea is to fetch the latest
release available in the 2025.1 stream - currently it is 2025.1.5.
fetch_scylla.py does this by listing the available 2025.1 releases,
sorting them and fetching the last one.

We had a bug in the sort order - version 0 was sorted before version
0-rc1, which is incorrect (the version 2025.2.0 came after
2025.2.0~rc1).

For most releases this didn't cause any problem - 0~rc1 was sorted after
0, but 5 (for example) came after both, so 2025.1.5 got downloaded.
But when a release has **only** an rc and a .0 release, we incorrectly
used the rc instead of the .0.

This patch fixes the sort order by using the "/" character, which sorts
before "0", in rc version strings when sorting the release numbers.

Before this patch, we had this problem in "--release 2025.2" because
currently 2025.2 only has RC releases (rc0 and rc1) and a .0 release,
and we wrongly downloaded the rc1. After this patch, the .0 is chosen
as expected:

  $ test/cqlpy/run --release 2025.2
  Chosen download for ScyllaDB 2025.2: 2025.2.0

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-28 22:02:15 +03:00
Nadav Har'El
72358ee9f4 test/cqlpy: fix "run --release" when Scylla hasn't been built
The "--release" option of test/cqlpy/run can be used to run current
cqlpy tests against any official release of Scylla, which is
automatically downloaded from Scylla's S3 bucket. You should be
able to run tests like that even without having compiled Scylla
from source. But we had a bug, where test/cqlpy/run looked for
the built Scylla executable *before* parsing the "--release"
option, and this bug is fixed in this patch.

The Alternator version of the run script, test/alternator/run,
doesn't need to be fixed because it already did things in the
right order.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-28 21:42:02 +03:00
Taras Veretilnyk
3bc9ee10d1 nodetool: add command for dropping quarantine sstables
- Add dropquarantinedsstables command to remove quarantined SSTables
- Support both flag-based (--keyspace, --table) and positional arguments
- Allow targeting all keyspaces, specific keyspace, or keyspace with specified tables

Fixes scylladb/scylladb#19061
2025-07-28 16:55:17 +02:00
Taras Veretilnyk
fa98239ed8 rest_api: add endpoint which drops all quarantined sstables
Added a new POST endpoint `/storage_service/drop_quarantined_sstables` to the REST API.
This endpoint allows dropping all quarantined SSTables either globally or
for a specific keyspace and tables.
Optional query parameters `keyspace` and `tables` (comma-separated table names) can be
provided to limit the scope of the operation.
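A minimal sketch of calling the endpoint (the port and the keyspace/table values are assumptions):

```python
import urllib.parse
import urllib.request

# Build a POST to the drop_quarantined_sstables endpoint; omit the query
# parameters to drop quarantined SSTables globally.
params = urllib.parse.urlencode({"keyspace": "ks1", "tables": "t1,t2"})
req = urllib.request.Request(
    f"http://localhost:10000/storage_service/drop_quarantined_sstables?{params}",
    method="POST")
# urllib.request.urlopen(req)  # requires a running node
```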

Fixes scylladb/scylladb#19061
2025-07-28 16:55:17 +02:00
Dawid Mędrek
b41151ff1a test: Enable RF-rack-valid keyspaces in all Python suites
We're enabling the configuration option `rf_rack_valid_keyspaces`
in all Python test suites. All relevant tests have been adjusted
to work with it enabled.

That encompasses the following suites:

* alternator,
* broadcast_tables,
* cluster (already enabled in scylladb/scylladb@ee96f8dcfc),
* cql,
* cqlpy (already enabled in scylladb/scylladb@be0877ce69),
* nodetool,
* rest_api.

Two remaining suites that use tests written in Python, redis and scylla_gdb,
are not affected, at least not directly.

The redis suite requires creating an instance of Scylla manually, and the tests
don't do anything that could violate the restriction.

The scylla_gdb suite focuses on testing the capabilities of scylla-gdb.py, but
even then it reuses the `run` file from the cqlpy suite.

Fixes scylladb/scylladb#25126

Closes scylladb/scylladb#24617
2025-07-28 16:32:59 +02:00
Gleb Natapov
198cfc6fe7 migration manager: do not use group0 on non zero shard
Commit ddc3b6dcf5 added a check of group0 state in
get_schema_for_write(), but group0 client can only be used on shard 0,
and get_schema_for_write() can be called on any shard, so we cannot use
_group0_client there directly. Move assert where we use another group0
function already where it is guarantied to run on shard 0.

Closes scylladb/scylladb#25204
2025-07-28 14:10:01 +02:00
Nadav Har'El
b4fc3578fc Merge 'LWT: enable for tablet-based tables' from Petr Gusev
This PR enables **LWT (Lightweight Transactions)** support for tablet-based tables by leveraging **colocated tables**.

Currently, storing Paxos state in system tables causes two major issues:
* **Loss of Paxos state during tablet migration or base table rebuilds**
  * When a tablet is migrated or the base table is rebuilt, system tables don't retain Paxos state.
  * This breaks LWT correctness in certain scenarios.
  * Failing test cases demonstrating this:
      * test_lwt_state_is_preserved_on_tablet_migration
      * test_lwt_state_is_preserved_on_rebuild
* **Shard misalignment and performance overhead**
  * Tablets may be placed on arbitrary shards by the tablet balancer.
  * Accessing Paxos state in system tables could require a shard jump, degrading performance.

We move Paxos state into a dedicated Paxos table, colocated with the base table:
  * Each base table gets its own Paxos state table.
  * This table is lazily created on the first LWT operation.
  * Its tablets are colocated with those of the base table, ensuring:
    * Co-migration during tablet movement
    * Co-rebuilding with the base table
    * Shard alignment for local access to Paxos state

Some reasoning for why this is sufficient to preserve LWT correctness is discussed in [2].

This PR addresses two issues from the "Why doesn't it work for tablets" section in [1]:
  * Tablet migration vs LWT correctness
  * Paxos table sharding

Other issues ("bounce to shard" and "locking for intranode_migration") have already been resolved in previous PRs.

References
[1] - [LWT over tablets design](https://docs.google.com/document/d/1CPm0N9XFUcZ8zILpTkfP5O4EtlwGsXg_TU4-1m7dTuM/edit?tab=t.0#heading=h.goufx7gx24yu)
[2] - [LWT: Paxos state and tablet balancer](https://docs.google.com/document/d/1-xubDo612GGgguc0khCj5ukmMGgLGCLWLIeG6GtHTY4/edit?tab=t.0)
[3] - [Colocated tables PR](https://github.com/scylladb/scylladb/pull/22906#issuecomment-3027123886)
[4] - [Possible LWT consistency violations after a topology change](https://github.com/scylladb/scylladb/issues/5251)

Backport: not needed because this is a new feature.

Closes scylladb/scylladb#24819

* github.com:scylladb/scylladb:
  create_keyspace: fix warning for tablets
  docs: fix lwt.rst
  docs: fix tablets.rst
  alternator: enable LWT
  random_failures: enable execute_lwt_transaction
  test_tablets_lwt: add test_paxos_state_table_permissions
  test_tablets_lwt: add test_lwt_for_tablets_is_not_supported_without_raft
  test_tablets_lwt: test timeout creating paxos state table
  test_tablets_lwt: add test_lwt_concurrent_base_table_recreation
  test_tablets_lwt: add test_lwt_state_is_preserved_on_rebuild
  test_tablets_lwt: migrate test_lwt_support_with_tablets
  test_tablets_lwt: add test_lwt_state_is_preserved_on_tablet_migration
  test_tablets_lwt: add simple test for LWT
  check_internal_table_permissions: handle Paxos state tables
  client_state: extract check_internal_table_permissions
  paxos_store: handle base table removal
  database: get_base_table_for_tablet_colocation: handle paxos state table
  paxos_state: use node_local_only mode to access paxos state
  query_options: add node_local_only mode
  storage_proxy: handle node_local_only in query
  storage_proxy: handle node_local_only in mutate
  storage_proxy: introduce node_local_only flag
  abstract_replication_strategy: remove unused using
  storage_proxy: add coordinator_mutate_options
  storage_proxy: rename create_write_response_handler -> make_write_response_handler
  storage_proxy: simplify mutate_prepare
  paxos_state: lazily create paxos state table
  migration_manager: add timeout to start_group0_operation and announce
  paxos_store: use non-internal queries
  qp: make make_internal_options public
  paxos_store: conditional cf_id filter
  paxos_store: coroutinize
  feature_service: add LWT_WITH_TABLETS feature
  paxos_state: inline system_keyspace functions into paxos_store
  paxos_state: extract state access functions into paxos_store
2025-07-28 13:19:23 +03:00
Taras Veretilnyk
6b6622e07a docs: fix typo in command name enbleautocompaction -> enableautocompaction
Renamed the file and updated all references from 'enbleautocompaction' to the correct 'enableautocompaction'.

Fixes scylladb/scylladb#25172

Closes scylladb/scylladb#25175
2025-07-28 12:49:26 +03:00
Tomasz Grabiec
55116ee660 topology_coordinator: Trigger load stats refresh after replace
Otherwise, tablet rebuild will be delayed for up to 60s, as the tablet
scheduler needs load stats for the new (replacing) node to make
decisions.

Fixes #25163

Closes scylladb/scylladb#25181
2025-07-28 11:07:17 +02:00
Sergey Zolotukhin
ea311be12b generic_server: Two-step connection shutdown.
When shutting down in `generic_server`, connections are now closed in two steps.
First, only the RX (receive) side is shut down. Then, after all ongoing requests
are completed, or a timeout happened the connections are fully closed.

Fixes scylladb/scylladb#24481
2025-07-28 10:08:06 +02:00
Sergey Zolotukhin
7334bf36a4 transport: cosmetic change, remove extra blanks. 2025-07-28 10:08:06 +02:00
Sergey Zolotukhin
061089389c transport: Handle sleep aborted exception in sleep_until_timeout_passes
In PR #23156, a new function `sleep_until_timeout_passes` was introduced
to wait until a read request times out or completes. However, the function
did not handle cases where the sleep is aborted via _abort_source, which
could result in WARN messages like "Exceptional future is ignored" during
shutdown.

This change adds proper handling for that exception, eliminating the warning.
2025-07-28 10:08:05 +02:00
Sergey Zolotukhin
27b3d5b415 generic_server: replace empty destructor with = default
This change improves code readability by explicitly marking the destructor as defaulted.
2025-07-28 10:08:05 +02:00
Sergey Zolotukhin
3610cf0bfd generic_server: refactor connection::shutdown to use shutdown_input and shutdown_output
This change improves logging and modifies the behavior to attempt closing
the output side of a connection even if an error occurs while closing the input side.
2025-07-28 10:08:05 +02:00
Sergey Zolotukhin
3848d10a8d generic_server: add shutdown_input and shutdown_output functions to
`connection` class.

The functions are just wrappers for _fd.shutdown_input() and _fd.shutdown_output(), with added error reporting.
Needed by later changes.
2025-07-28 10:08:05 +02:00
Sergey Zolotukhin
122e940872 test: Add test for query execution during CQL server shutdown
This test simulates a scenario where a query is being executed while
the query coordinator begins shutting down the CQL server and client
connections. The shutdown process should wait until the query execution
is either completed or timed out.

Test for scylladb/scylladb#24481
2025-07-28 10:08:05 +02:00
Robert Bindar
d921a565de Add open-coredump script dependencies to install-dependencies.sh
Whilst the coredump script checks for prerequisites, the user
experience is not ideal because you either have to go in the
script and get the list of deps and install them or wait for
the script to complain about lacking dependencies one by one.
This commit completes the list of dependencies in the
install script (some of them were already there for Fedora),
so you already have them installed by the time you
get to run the coredump script.

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

[avi:
 - remove trailing whitespace
 - regenerate frozen toolchain

Optimized clang binaries generated and stored in

  https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-aarch64.tar.gz
  https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-x86_64.tar.gz
]

Closes #22369

Closes scylladb/scylladb#25203
2025-07-28 06:45:01 +03:00
Avi Kivity
1930f3e67f Merge 'sstables/mx/reader: accommodate inexact partition indexes' from Michał Chojnowski
Unlike the currently-used sstable index files, BTI indexes don't store the entire partition keys. They only store prefixes of decorated keys, up to the minimum length needed to differentiate a key from its neighbours in the sstable. This saves space.

However, it means that a BTI index query might be off by one partition (on each end of the queried partition range) with respect to the optimal Data position.

For example, if the index stores prefixes `a`, `b`, `c`,
the index has no way to know if the first index entry after key `bb`
is `b` (which might correspond to `ba` as well as `bc`), or `c`.
So the index reader conservatively has to pick the wider Data range, and the Data reader must ignore the superfluous partitions. (And there's no way around that.)
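The off-by-one effect can be modeled in a few lines (illustrative only, not the BTI code): with only prefixes stored, the lookup must conservatively start one entry early and let the Data reader skip any superfluous partition.

```python
import bisect

def conservative_lookup(stored_prefixes, query_key):
    """Return the index of the first entry that *might* hold a key
    >= query_key. Because only prefixes are stored, the entry just
    before the lexicographic insertion point may still qualify."""
    i = bisect.bisect_left(stored_prefixes, query_key)
    # The entry at i-1 (e.g. prefix "b" for query "bb") might stand
    # for a full key like "bc" >= "bb", so include it.
    return max(i - 1, 0)
```

For prefixes ["a", "b", "c"] and query "bb", this returns the position of "b", since "b" may stand for "bc"; the Data reader then filters out a partition like "ba" if that is what "b" actually was.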

Before this patch, the sstable reader expects the index query to return an exact (optimal) Data range. This patch adjusts the logic of the sstable reader to allow for inexact ranges.

Note: the patch is more complicated than it looks. The logic of the sstable reader was already fairly hard to follow and this adds even more flags, more weird special states and more edge cases. I think I managed to write a decent test and it did find three or four edge cases I wouldn't have noticed otherwise. I think it should cover all the added logic, but I didn't verify code coverage. (Do our scripts for that even work nowadays?) Simplification ideas are welcome.

Preparation for new functionality, no backporting needed.

Closes scylladb/scylladb#25093

* github.com:scylladb/scylladb:
  sstables/index_reader: weaken some exactness guarantees in abstract_index_reader
  test/boost: add a test for inexact index lookups
  sstables/mx/reader: allow passing a custom index reader to the constructor
  sstables/index_reader: remove advance_to
  sstables/mx/reader: handle inexact lookups in `advance_context()`
  sstables/mx/reader: handle inexact lookups in `advance_to_next_partition()`
  sstables/index_reader: make the return value of `get_partition_key` optional
  sstables/mx/reader: handle "backward jumps" in forward_to
  sstables/mx/reader: filter out partitions outside the queried range
  sstables/mx/reader: update _pr after `fast_forward_to`
2025-07-27 19:39:36 +03:00
Avi Kivity
8180cbcf48 Merge 'tablets: prevent accidental copy of tablets_map' from Benny Halevy
As they are wasteful in many cases, it is better
to move the tablet_map if possible, or clone
it gently in an async fiber.

Add clone() and clone_gently() methods to
allow explicit copies.

* minor optimization, no backport needed

Closes scylladb/scylladb#24978

* github.com:scylladb/scylladb:
  tablets: prevent accidental copy of tablets_map
  locator: tablets: get rid of synchronous mutate_tablet_map
2025-07-27 16:48:27 +03:00
Lakshmi Narayanan Sreethar
0c5fa8e154 locator/token_metadata.cc: use chunked_vector to store _sorted_tokens
The `token_metadata_impl` stores the sorted tokens in an `std::vector`.
With a large number of nodes, the size of this vector can grow quickly,
and updating it might lead to oversized allocations.

This commit changes `_sorted_tokens` to a `chunked_vector` to avoid such
issues. It also updates all related code to use `chunked_vector` instead
of `std::vector`.
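The idea behind chunked_vector can be sketched as follows (a toy model, not Scylla's utils/chunked_vector.hh):

```python
class ChunkedVector:
    """Store elements in fixed-size chunks, so growth never requires one
    huge contiguous allocation the way a plain dynamic array does."""
    def __init__(self, chunk_size=128):
        self._chunk_size = chunk_size
        self._chunks = [[]]

    def append(self, item):
        if len(self._chunks[-1]) >= self._chunk_size:
            self._chunks.append([])  # allocate one small chunk at a time
        self._chunks[-1].append(item)

    def __len__(self):
        return sum(len(c) for c in self._chunks)
```

Appends only ever allocate one chunk-sized block, which is what avoids the oversized allocations seen with a single growing vector.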

Fixes #24876

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#25027
2025-07-27 11:29:22 +03:00
Tomasz Grabiec
a1d7722c6d Merge 'api: repair_async: refuse repairing tablet keyspaces' from Aleksandra Martyniuk
A tablet repair started with /storage_service/repair_async/ API
bypasses tablet repair scheduler and repairs only the tablets
that are owned by the requested node. Due to that, to safely repair
the whole keyspace, we need to first disable tablet migrations
and then start repair on all nodes.

With the new API - /storage_service/tablets/repair -
tailored to tablet repair requirements, we do not need additional
preparation before repair. We may request it on one node in
a cluster only and, thanks to tablet repair scheduler,
a whole keyspace will be safely repaired.

Both nodetool and Scylla Manager have already started using
the new API to repair tablets.

Refuse repairing tablet keyspaces with /storage_service/repair_async -
403 Forbidden is returned. repair_async should still be used to repair
vnode keyspaces.

Fixes: https://github.com/scylladb/scylladb/issues/23008.

Breaking change; no backport.

Closes scylladb/scylladb#24678

* github.com:scylladb/scylladb:
  repair: remove unused code
  api: repair_async: forbid repairing tablet keyspaces
2025-07-27 09:25:42 +02:00
Piotr Dulikowski
44de563d38 Merge 'db/hints: Improve logging' from Dawid Mędrek
We improve logging in critical functions in hinted handoff
to capture more information about the behavior of the module.
That should help us in debugging sessions.

The logs should only be printed during more important events
and so they should not clog the log files.

Backport: not necessary.

Closes scylladb/scylladb#25031

* github.com:scylladb/scylladb:
  db/hints/manager.cc: Add logs for changing host filter
  db/hints: Increase log level in critical functions
2025-07-27 09:25:42 +02:00
Michael Litvak
3ff388cd94 storage service: drain view builder before group0
The view builder uses group0 operations to coordinate view building, so
we should drain the view builder before stopping group0.

Fixes scylladb/scylladb#25096

Closes scylladb/scylladb#25101
2025-07-27 09:25:42 +02:00
Pavel Emelyanov
403a72918d sstables/types.hh: Remove duplicate version.hh inclusion
The latter header is included twice; once is enough.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#25109
2025-07-27 09:25:42 +02:00
Pavel Emelyanov
1b9eb4cb9f init.hh: Remove unused forward declarations
The init.hh contains some bits that only main.cc needs. Some of its
forward declarations are needed by neither the header itself nor the
main.cc that includes it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#25110
2025-07-27 09:25:42 +02:00
Petr Gusev
8b8b7adbe5 raft_group0: split shutdown into abort_and_drain and destroy
Previously, raft_group0::abort() was called in
storage_service::do_drain (introduced in #24418) to
stop the group0 Raft server before destroying local storage.
This was necessary because raft::server depends on storage
(via raft_sys_table_storage and group0_state_machine).

However, this caused issues: services like
sstable_dict_autotrainer and auth::service, which use
group0_client but are not stopped by storage_service,
could trigger use-after-free if raft_group0 was destroyed
too early. This can happen both during normal shutdown
and when 'nodetool drain' is used.

This commit reworks the shutdown logic:
* Introduces abort_and_drain(), which aborts the server
and waits for background tasks to finish, but keeps the
server object alive. Clients will see raft::stopped_error if
they try to access group0 after abort_and_drain().
* Final destruction happens in a separate method destroy(),
called later from main.cc.

The raft_server_for_group::aborted is changed to a
shared_future -- abort_server now returns a future so that
we can wait for it in abort_and_drain(); it should return
the future from the previous abort_server call, which can
happen in the on_background_error callback.

Node startup can fail before reaching storage_service,
in which case ss.drain_on_shutdown() and abort_and_drain()
are never called. To ensure proper cleanup,
abort_and_drain() is called from main.cc before destroy().

Clients of raft_group_registry are expected to call
destroy_server() for the servers they own. Currently,
the only such client is raft_group0, which satisfies
this requirement. As a result,
raft_group_registry::stop_servers() is no longer needed.
Instead, raft_group_registry::stop() now verifies that all
servers have been properly destroyed.
If any remain, it calls on_internal_error().

The call to drain_on_shutdown() in cql_test_env.cc
appears redundant. The only source of raft::server
instances in raft_group_registry is group0_service, and
if group0_service.start() succeeds, both abort_and_drain()
and destroy() are guaranteed to be called during shutdown.
2025-07-25 17:16:14 +02:00
Michał Chojnowski
b1da5f2d0f sstables/index_reader: weaken some exactness guarantees in abstract_index_reader
After making the sstable reader more permissive,
we can weaken the abstract_index_reader interface.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
be1f54c6d2 test/boost: add a test for inexact index lookups 2025-07-25 11:00:18 +02:00
Michał Chojnowski
810eb93ff0 sstables/mx/reader: allow passing a custom index reader to the constructor
For tests.
Will be used for testing how the data reader reacts to various
combinations of inexact index lookup results.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
fe8ee34024 sstables/index_reader: remove advance_to
`advance_to` is unused now, so remove it.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
03bf6347e2 sstables/mx/reader: handle inexact lookups in advance_context()
`advance_context()` needs an ability to advance the index to
the partition immediately following the reader's current partition.
For this, it uses `abstract_index_reader::advance_to(dht::ring_position_view)`

But BTI (and any index format which stores only the prefixes of keys
instead of whole keys) can't implement `advance_to` with its current
semantics. The Data position returned by the index for a generic
`advance_to` might be off by one partition.

E.g. if the index stores prefixes `a`, `b`, `c`,
the index has no way to know if the first entry after `bb`
is `b` (which might correspond to `ba` as well as `bc`), or `c`.

However, BTI can be used exactly if the partition is known to
be present in the sstable. (In the above example, if `bb` is known
to be present in the sstable, then it must correspond to `b`.
So the index can reliably advance to `bb` or the first partition after it).

And this is enough for `advance_context()`, because the
current partition is known to be present.
So we can replace the usage of `advance_to` with an equivalent API call
which only works with present keys, but in exchange is implementable
by BTI.

This makes `advance_to` unused, so we remove it.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
11792850dd sstables/mx/reader: handle inexact lookups in advance_to_next_partition()
`advance_to_next_partition()` needs an ability to advance the index to
the partition immediately following the reader's current partition.
For this, it uses `abstract_index_reader::advance_to(dht::ring_position_view)`

But BTI (and any index format which stores only the prefixes of keys
instead of whole keys) can't implement `advance_to` with its current
semantics. The Data position returned by the index for a generic
`advance_to` might be off by one partition.

E.g. if the index stores prefixes `a`, `b`, `c`,
the index has no way to know if the first entry after `bb`
is `b` (which might correspond to `ba` as well as `bc`), or `c`.

However, BTI can be used exactly if the partition is known to
be present in the sstable. (In the above example, if `bb` is known
to be present in the sstable, then it must correspond to `b`.
So the index can reliably advance to `bb` or the first partition after it).

And this is enough for `advance_to_next_partition()`, because the
current partition is known to be present.
So we can replace the usage of `advance_to` with an equivalent API call
which only works with present keys, but in exchange is implementable
by BTI.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
141895f9eb sstables/index_reader: make the return value of get_partition_key optional
BTI indexes only store encoded prefixes of partition keys,
not the whole keys. They can't reliably implement `get_partition_key`.
The index reader interface must be weakened and callers must
be adapted.
2025-07-25 11:00:18 +02:00
Michał Chojnowski
a0c29055e5 sstables/mx/reader: handle "backward jumps" in forward_to
A bunch of code assumes that the Data.db stream can only go forward.
But with BTI indexes, if we perform an advance_to, the index can point to a position
which the data reader has already passed, since the index is inexact.

The logic of the data reader ensures that it has stopped
within the last partition range, or just immediately
after it, after reading the next partition key and
noticing that it doesn't belong to the range.

But forward_to can only be used with increasing ranges.
The start of the next range must be greater than or equal to the
end of the previous range.

This means that the exact start of the next partition range
must be no earlier than:
1. Before the partition key just read by the data reader,
if the data reader is positioned immediately after a partition key.
2. The start of the first partition after the current data reader
position, if the data reader isn't positioned immediately after a
partition key.

So, if the index returns a position smaller than the current data
reader position, then:
1. If the reader is immediately after a partition key,
we have to reuse this partition key (since we can't go back
in the stream to read it again), and keep reading from
the current position.
2. Otherwise we can safely walk the index to the first partition
that lies no earlier than the current position.
2025-07-25 10:49:58 +02:00
Michał Chojnowski
218b2dffff sstables/mx/reader: filter out partitions outside the queried range
The current index format is exact: it always returns the position of the
first partition in the queried partition range.

But we are about to add an index format where that doesn't have to be the case.
In BTI indexes, the lookup can be off by one partition sometimes. This patch prepares
the reader for that, by skipping the partitions which were read by the
data reader but don't belong to the queried range.

Note: as of this patch, only the "normal path" is ever used.
We add tests exercising these code paths later.

Also note that, as of this patch, actually stepping outside
the queried range would cause the reader to end up in a
state where the underlying parser is positioned right after
partition key immediately following the queried range.
If the reader was forwarded to that key in this state,
it would trip an assert, because the parser can't handle backward
jumps. We will add logic to handle this case in the next patch.
2025-07-25 10:49:57 +02:00
Michał Chojnowski
2b81fdf09b sstables/mx/reader: update _pr after fast_forward_to
In later patches, we will prepare the reader for inexact index
implementations (ones which can return a Data file range that
includes some partitions before or after the queried range).

For that, we will need to filter out the partitions outside of the
range, and for that we need to remember the range. This is the
goal of this patch.

Note that we are storing a reference to an argument of
`fast_forward_to`. This is okay, because the contract
of `mutation_reader` specifies that the caller must
keep `pr` alive until the next `fast_forward_to`
or until the reader is destroyed.
2025-07-25 10:49:57 +02:00
Aleksandra Martyniuk
a7ee2bbbd8 tasks: do not use binary progress for task manager tasks
Currently, the progress of a parent task depends on expected_total_workload,
expected_children_number, and the children's progresses. Basically, if the
total workload is known or all children have already been created, the
children's progresses are summed up. Otherwise, binary progress is returned.

As a result, two tasks of the same type may return progress in different
units. If they are children of the same task and their parent aggregates
their progress, the result becomes meaningless.

Drop expected_children_number as we can't assume that children are able
to show their progresses.

Modify the get_progress method: progress is calculated based on the
children's progresses. If expected_total_workload isn't specified, the total
progress of a task may grow. If expected_total_workload isn't specified
and no children are created, empty progress (0/0) is returned.
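
A minimal sketch of this aggregation rule, with illustrative types rather than the actual task manager API:

```cpp
#include <optional>
#include <vector>

// Hypothetical sketch of the aggregation described above; the real logic
// lives in Scylla's task manager.
struct task_progress {
    double completed = 0;
    double total = 0;
};

task_progress get_progress(const std::vector<task_progress>& children,
                           std::optional<double> expected_total_workload) {
    task_progress p;
    for (const auto& c : children) {
        p.completed += c.completed;
        p.total += c.total;
    }
    if (expected_total_workload) {
        // A known total workload overrides the (possibly still growing) sum.
        p.total = *expected_total_workload;
    }
    // With no expected workload and no children this yields empty progress (0/0).
    return p;
}
```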

Fixes: https://github.com/scylladb/scylladb/issues/24650.

Closes scylladb/scylladb#25113
2025-07-25 10:45:32 +03:00
Ran Regev
7c68ee06bf cleanup: remove partition_slice_builder from include
Refs: #22099 (issue)
Refs: #25079 (pr)

Remove the unused include of partition_slice_builder.
This makes it clear that
group0_state_machine.cc does not depend on
partition_slice_builder.

Closes scylladb/scylladb#25125
2025-07-25 10:45:32 +03:00
Ran Regev
db4f301f0c scylla.yaml: add recommended value for stream_io_throughput_mb_per_sec
Fixes: #24758

Updated scylla.yaml and the help text shown by
scylla --help.

Closes scylladb/scylladb#24793
2025-07-25 10:45:32 +03:00
Ferenc Szili
7ce96345bf test: remove test_tombstone_gc_disabled_on_pending_replica
The test test_tombstone_gc_disabled_on_pending_replica was added when
we fixed (#20788) the potential problem with data resurrection during
file based streaming. The issue was occurring only in Enterprise, but
we added the fix in OSS to limit code divergence. This test was added
together with the fix in OSS with the idea to guard this change in OSS.
The real reproducer and test for this fix was added later, after the
fix was ported into Enterprise.
It is in: test/cluster/test_resurrection.py

Since Enterprise has been merged into OSS, there is no more need to
keep the test test_tombstone_gc_disabled_on_pending_replica. Also,
it is flaky with very low probability of failure, making it difficult
to investigate the cause of failure.

Fixes: #22182

Closes scylladb/scylladb#25134
2025-07-25 10:45:32 +03:00
Botond Dénes
837424f7bb Merge 'Add Azure Key Provider for Encryption at Rest' from Nikos Dragazis
This PR introduces a new Key Provider to support Azure Key Vault as a Key Management System (KMS) for Encryption at Rest. The core design principle is the same as in the AWS and GCP key providers - an externally provided Vault key that is used to protect local data encryption keys (a process known as "key wrapping").

In more detail, this patch series consists of:
* Multiple Azure credential sources, offering a variety of authentication options (Service Principals, Managed Identities, environment variables, Azure CLI).
* The Azure host - the Key Vault endpoint bridge.
* The Azure Key Provider - the interface for the Azure host.
* Unit tests using real Azure resources (credentials and Vault keys).
* Log filtering logic to not expose sensitive data in the logs (plaintext keys, credentials, access tokens).

This is part of the overall effort to support Azure deployments.

Testing done:
* Unit tests.
* Manual test on an Azure VM with a Managed Identity.
* Manual test with credentials from Azure CLI.
* Manual test of `--azure-hosts` cmdline option.
* Manual test of log filtering.

Remaining items:
- [x] Create necessary Azure resources for CI.
- [x] Merge pipeline changes (https://github.com/scylladb/scylla-pkg/pull/5201).

Closes https://github.com/scylladb/scylla-enterprise/issues/1077.

New feature. No backport is needed.

Closes scylladb/scylladb#23920

* github.com:scylladb/scylladb:
  docs: Document the Azure Key Provider
  test: Add tests for Azure Key Provider
  pylib: Add mock server for Azure Key Vault
  encryption: Define and enable Azure Key Provider
  encryption: azure: Delegate hosts to shard 0
  encryption: Add Azure host cache
  encryption: Add config options for Azure hosts
  encryption: azure: Add override options
  encryption: azure: Add retries for transient errors
  encryption: azure: Implement init()
  encryption: azure: Implement get_key_by_id()
  encryption: azure: Add id-based key cache
  encryption: azure: Implement get_or_create_key()
  encryption: azure: Add credentials in Azure host
  encryption: azure: Add attribute-based key cache
  encryption: azure: Add skeleton for Azure host
  encryption: Templatize get_{kmip,kms,gcp}_host()
  encryption: gcp: Fix typo in docstring
  utils: azure: Get access token with default credentials
  utils: azure: Get access token from Azure CLI
  utils: azure: Get access token from IMDS
  utils: azure: Get access token with SP certificate
  utils: azure: Get access token with SP secret
  utils: rest: Add interface for request/response redaction logic
  utils: azure: Declare all Azure credential types
  utils: azure: Define interface for Azure credentials
  utils: Introduce base64url_{encode,decode}
2025-07-25 10:45:32 +03:00
Ernest Zaslavsky
d2c5765a6b treewide: Move keys related files to a new keys directory
As requested in #22102, #22103 and #22105, moved the files and fixed the affected includes and the build system.

Moved files:
- clustering_bounds_comparator.hh
- keys.cc
- keys.hh
- clustering_interval_set.hh
- clustering_key_filter.hh
- clustering_ranges_walker.hh
- compound_compat.hh
- compound.hh
- full_position.hh

Fixes: #22102
Fixes: #22103
Fixes: #22105

Closes scylladb/scylladb#25082
2025-07-25 10:45:32 +03:00
Calle Wilund
a86e8d73f2 encryption_at_rest_test: ensure proxy connection flushing
Refs #24551

Drops the background flush for the proxy output stream (this is test
code), and also ensures we do an explicit flush + close on exception
in the write loop.

Ensures we don't hide actual exceptions with asserts.

Closes scylladb/scylladb#25146
2025-07-25 10:45:32 +03:00
Petr Gusev
aae5260147 create_keyspace: fix warning for tablets
Remove LWT from the list of unsupported features.
2025-07-24 20:04:43 +02:00
Petr Gusev
1f5d9ace93 docs: fix lwt.rst
Add a new section about Paxos state tables. Update all
references to system.paxos in the text to refer to this
section.
2025-07-24 20:04:43 +02:00
Petr Gusev
69017fb52a docs: fix tablets.rst
LWT and Alternator are now supported with tablets.
2025-07-24 20:04:43 +02:00
Petr Gusev
abab025d4f alternator: enable LWT 2025-07-24 20:04:43 +02:00
Petr Gusev
e4fba1adfe random_failures: enable execute_lwt_transaction
Fixes scylladb/scylladb#24502
2025-07-24 19:48:09 +02:00
Petr Gusev
84b74d6895 test_tablets_lwt: add test_paxos_state_table_permissions 2025-07-24 19:48:09 +02:00
Petr Gusev
c7cfba726d test_tablets_lwt: add test_lwt_for_tablets_is_not_supported_without_raft
This test checks that LWT for tablets requires raft-based
schema management.
2025-07-24 19:48:09 +02:00
Petr Gusev
529d2b949e test_tablets_lwt: test timeout creating paxos state table 2025-07-24 19:48:09 +02:00
Petr Gusev
a9ef221ae8 test_tablets_lwt: add test_lwt_concurrent_base_table_recreation
The test checks that we correctly handle the case when the base table
is recreated during LWT execution.
2025-07-24 19:48:08 +02:00
Petr Gusev
e8e2419df6 test_tablets_lwt: add test_lwt_state_is_preserved_on_rebuild
This test checks that the paxos state is preserved in case
of tablet rebuild. This happens e.g. when a node is lost
permanently and another node is started to replace it.
2025-07-24 19:48:08 +02:00
Petr Gusev
ff2c22ba6a test_tablets_lwt: migrate test_lwt_support_with_tablets
LWT is now supported for tablets, but this requires LWT_WITH_TABLETS
feature. We migrate the test so that it checks the error messages in
case the feature is not supported.
2025-07-24 19:48:08 +02:00
Petr Gusev
e0c4dc350c test_tablets_lwt: add test_lwt_state_is_preserved_on_tablet_migration
This test verifies that Paxos state is correctly migrated when
the base table's tablet is migrated. This test fails if Paxos
state is stored in system.paxos, as the final Paxos read would
reflect conflicting outcomes from both prior LWT operations.
2025-07-24 19:48:08 +02:00
Petr Gusev
c11e1aef5c test_tablets_lwt: add simple test for LWT
We add/remove the base table several times to check that paxos state
table is properly recreated.
2025-07-24 19:48:08 +02:00
Petr Gusev
78aa36b257 check_internal_table_permissions: handle Paxos state tables
CDC and $paxos tables are managed internally by Scylla. Users are
already prohibited from running ALTER and DROP commands on CDC tables.
In this commit, we extend the same restrictions to $paxos tables to
prevent users from shooting themselves in the foot.

Other commands are generally allowed for CDC and $paxos tables. An
important distinction is that CDC tables are meant to be accessed
directly by users, so appropriate permissions must be set for
non-superusers. In contrast, $paxos tables are not intended for direct
access by users. Therefore, this commit explicitly disallows
non-superusers from accessing them. Superusers are still allowed
access for debugging and troubleshooting purposes.

Note that these restrictions apply even if explicit permissions have
been granted. For example, a non-superuser may be granted SELECT
permissions on a $paxos table, but the restriction above will
still take precedence. We don't try to restrict users
from giving permissions to $paxos tables for simplicity.
2025-07-24 19:48:08 +02:00
Petr Gusev
ec3c5f4cbc client_state: extract check_internal_table_permissions
This is a refactoring commit — it extracts the CDC permissions handling
logic into a separate function: check_internal_table_permissions.

This is a preparatory step for the next commit, where we'll handle
paxos state tables similarly to CDC tables.
2025-07-24 19:48:08 +02:00
Petr Gusev
bb4e7a669f paxos_store: handle base table removal
Subscribe to on_before_drop_column_family to drop the associated
Paxos state table when the corresponding user table is dropped.
2025-07-24 19:48:08 +02:00
Petr Gusev
1b70623908 database: get_base_table_for_tablet_colocation: handle paxos state table
We need to mark paxos state table as colocated with the user table, so
that the corresponding tablets are migrated/repaired together.
2025-07-24 19:48:08 +02:00
Petr Gusev
03aa2e4823 paxos_state: use node_local_only mode to access paxos state 2025-07-24 19:48:08 +02:00
Petr Gusev
ff1caa9798 query_options: add node_local_only mode
We want to access the paxos state table only on the local node and
shard (or shards in case of intranode_migration). In this commit we
add a node_local_only flag to query_options, which allows us to do that.
This flag can be set for a query via make_internal_options.

We handle this flag on the statements layer by forwarding it to
either coordinator_query_options or coordinator_mutate_options.
2025-07-24 19:48:08 +02:00
Petr Gusev
65c7e36b7c storage_proxy: handle node_local_only in query
In this commit we support node_local_only flag in read code path in
storage_proxy.
2025-07-24 19:48:08 +02:00
Petr Gusev
2d747d97b8 storage_proxy: handle node_local_only in mutate
We add the remove_non_local_host_ids() helper, which
will be used in the next commit to support the read
path. HostIdVector concept is introduced to be able
to handle both host_id_vector_replica_set and
host_id_vector_topology_change uniformly.

The storage_proxy_coordinator_mutate_options class
is declared outside of storage_proxy to avoid C++
compiler complaints about default field initializers.
In particular, some storage_proxy methods use this
class for optional parameters with default values,
which is not allowed when the class is defined inside
storage_proxy.
2025-07-24 19:48:08 +02:00
Petr Gusev
7eb198f2cc storage_proxy: introduce node_local_only flag
Add a per-request flag that restricts query execution
to the local node by filtering out all non-local replicas.
Standard consistency level (CL) rules still apply:
if the local node alone cannot satisfy the
requested CL, an exception is thrown.

This flag is required for Paxos state access, where
reads and writes must target only the local node.

As a side effect, this also enables the implementation
of scylladb/scylladb#16478, which proposes a CQL
extension to expose 'local mode' query execution to users.

Support for this flag in storage_proxy's read and write
code paths will be added in follow-up commits.
2025-07-24 19:48:08 +02:00
Petr Gusev
8e745137de abstract_replication_strategy: remove unused using 2025-07-24 19:48:08 +02:00
Petr Gusev
4c1aca3927 storage_proxy: add coordinator_mutate_options
In upcoming commits, we want to add a node_local_only flag to both read
and write paths in storage_proxy. This requires passing the flag from
query_processor to the part of storage_proxy where replica selection
decisions are made.

For reads, it's sufficient to add the flag to the existing
coordinator_query_options class. For writes, there is no such options
container, so we introduce coordinator_mutate_options in this commit.

In the future, we may move some of the many mutate() method arguments
into this container to simplify the code.
2025-07-24 19:48:08 +02:00
Petr Gusev
b6ccaffd45 storage_proxy: rename create_write_response_handler -> make_write_response_handler
Most of the create_write_response_handler overloads follow the same
signature pattern to satisfy the sp::mutate_prepare call. The one which
doesn't follow it is invoked by others and is responsible for creating
a concrete handler instance. In this refactoring commit we rename
it to make_write_response_handler to reduce confusion.
2025-07-24 19:48:08 +02:00
Petr Gusev
db946edd1d storage_proxy: simplify mutate_prepare
This is a refactoring commit. We remove extra lambda parameters from
mutate_prepare since the CreateWriteHandler lambda can simply
capture them.

We can't std::move(permit) in another mutate_prepare overload,
because each handler wants its own copy of this permit.
2025-07-24 19:48:08 +02:00
Petr Gusev
ac4bc3f816 paxos_state: lazily create paxos state table
We call paxos_store::ensure_initialized in the beginning of
storage_proxy::cas to create a paxos state table for a user table if
it doesn't exist. When the LWT coordinator sends RPCs to replicas,
some of them may not yet have the paxos schema. In
paxos_store::get_paxos_state_schema we just wait for them to appear,
or throw 'no_such_column_family' if the base table was dropped.
2025-07-24 19:48:08 +02:00
Dawid Mędrek
b559c1f0b6 db/hints/manager.cc: Add logs for changing host filter
We add new logs when the host filter is undergoing a change. It should not
happen very often and so it shouldn't clog the log files. At the same
time, it provides us with useful information when debugging.
2025-07-24 17:45:34 +02:00
Dawid Mędrek
cb0cd44891 db/hints: Increase log level in critical functions
We increase the log level in more important functions to capture
more information about the behavior of hints. All of the promoted
logs are printed rarely, so they should not clog the log files, but
at the same time they provide more insight into what has already
happened and what has not.
2025-07-24 17:41:54 +02:00
Petr Gusev
3e0347c614 migration_manager: add timeout to start_group0_operation and announce
Pass a timeout parameter through to start_operation()
and add_entry(), respectively.

This is a preparatory change for the next commit, which
will use the timeout to properly handle timeouts during
lazy creation of Paxos state tables.
2025-07-24 16:39:50 +02:00
Petr Gusev
519f40a95e paxos_store: use non-internal queries
Switch paxos_store from using internal queries to regular prepared
queries, so that prepared statements are correctly updated when
the base table is recreated.

The do_execute_cql_with_timeout function is extracted to reduce
code bloat when execute_cql_with_timeout template function
is instantiated.

We change return type of execute_cql_with_timeout to untyped_result_set
since shared_ptr is not really needed here.
2025-07-24 16:39:50 +02:00
Petr Gusev
6caa1ae649 qp: make make_internal_options public
In upcoming commits, we will switch paxos_store from using internal
queries to regular prepared queries, so that prepared statements are
correctly updated when the base table is recreated. To support this,
we want to reuse the logic for converting parameters from
vector<data_value_or_unset> to raw_value_vector_with_unset.
This commit makes make_internal_options public to enable that reuse.
2025-07-24 16:39:50 +02:00
Petr Gusev
13f7266052 paxos_store: conditional cf_id filter
We want to reuse the same queries to access system.paxos and the
co-located table. A separate co-located table will be created for each
user table, so we won't need the cf_id filter for them. In this commit
we make the cf_id filter optional and apply it only if the target table
is actually system.paxos.
2025-07-24 16:39:50 +02:00
Petr Gusev
370f91adb7 paxos_store: coroutinize
This is another preparatory step. We want to add more logic to the
paxos_store state access functions in the next commits, and that is
easier to do with coroutines.

Pass ballot by value to delete_paxos_decision because
paxos_state::prune is not a coroutine and the ballot parameter
is destroyed when we return from it. The alternative
solution -- pass by const reference to paxos_state::prune -- doesn't
work because paxos_state::prune is called
from a lambda in paxos_response_handler::prune, this lambda is
not a coroutine and the 'ballot' field could be destroyed along
with the body of this lambda as soon as we return from
paxos_state::prune.
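
The lifetime rule can be illustrated without coroutines, using a deferred callback in place of the coroutine frame (hypothetical names; delete_paxos_decision here is only a stand-in for the real function):

```cpp
#include <functional>
#include <string>

// Minimal sketch of the lifetime rule described above, with a deferred
// callback standing in for a coroutine: the callee takes the ballot by
// value, because the caller's frame (and any reference into it) may be
// gone by the time the deferred work actually runs.
std::function<std::string()> deferred;

void delete_paxos_decision(std::string ballot /* by value: owned copy */) {
    // The deferred work captures its own copy of the ballot, so it stays
    // valid even after the caller's locals are destroyed.
    deferred = [ballot] { return ballot; };
}

std::string run_prune() {
    {
        std::string ballot = "b-123";  // caller-local, destroyed at scope exit
        delete_paxos_decision(ballot);
    } // 'ballot' is gone here; the copy inside 'deferred' is not
    return deferred();
}
```

Had delete_paxos_decision taken a const reference and the lambda captured it, the deferred call would read a destroyed object.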
2025-07-24 16:39:50 +02:00
Petr Gusev
ab03badc15 feature_service: add LWT_WITH_TABLETS feature
We will need this feature to determine if it's safe to enable
LWTs for a tablet-based table.
2025-07-24 16:39:50 +02:00
Petr Gusev
8292ecf2e1 paxos_state: inline system_keyspace functions into paxos_store
Prepares for reusing the same functions to access either
system.paxos or a co-located table.
2025-07-24 16:39:50 +02:00
Petr Gusev
6e87a6cdb0 paxos_state: extract state access functions into paxos_store
Introduce paxos_store abstraction to isolate Paxos state access.
Prepares for supporting either system.paxos or a co-located
table as the storage backend.
2025-07-24 16:39:50 +02:00
Gleb Natapov
d5e023bbad topology coordinator: drop no longer needed token metadata barrier
Currently we do token metadata barrier before accepting a replacing
node. It was needed for the "replace with the same IP" case to make sure
old request will not contact new node by mistake. But now since we
address nodes by id this is no longer possible since old requests will
use old id and will be rejected.

Closes scylladb/scylladb#25047
2025-07-24 11:15:42 +02:00
Aleksandra Martyniuk
1767eb9529 repair: remove unused code 2025-07-24 11:11:12 +02:00
Aleksandra Martyniuk
a0031ad05e api: repair_async: forbid repairing tablet keyspaces
Return 403 Forbidden if a user tries to repair tablet keyspace with
/storage_service/repair_async/ API.
2025-07-24 11:11:09 +02:00
Tomasz Grabiec
c9bf010d6d Merge 'test.py: skip cleaning testlog' from Andrei Chekun
Skip removing any artifacts between test.py invocations when -s is provided.
Logs from the previous run will be overwritten if the tests are executed one
more time. For example:
1. Execute tests A, B, C with parameter -s
2. All logs are present even if tests are passed
3. Execute test B with parameter -s
4. Logs for A and C are from the first run
5. Logs for B are from the most recent run

Backport is not needed, since this is a framework enhancement.

Closes scylladb/scylladb#24838

* github.com:scylladb/scylladb:
  test.py: skip cleaning artifacts when -s provided
  test.py: move deleting directory to prepare_dir
2025-07-24 09:46:42 +03:00
Gleb Natapov
ab6e328226 storage_proxy: preallocate write response handler hash table
Currently it grows dynamically and triggers an oversized-allocation
warning. Also, it may be hard to find a sufficiently large contiguous
memory chunk after the system runs for a while. This patch pre-allocates enough
memory for ~1M outstanding writes per shard.
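
The idea can be sketched with a standard unordered_map (sizes are illustrative; the real code pre-sizes the write response handler table per shard):

```cpp
#include <unordered_map>

// Sketch of the pre-allocation idea: reserving buckets up front avoids
// incremental rehashing (and the large contiguous allocations rehashing
// needs) while the table fills up under load.
std::unordered_map<unsigned long, int> make_handler_table(std::size_t expected_writes) {
    std::unordered_map<unsigned long, int> handlers;
    handlers.reserve(expected_writes);  // one up-front allocation at startup
    return handlers;
}
```

After reserve(n), the standard guarantees no rehash occurs until the table grows beyond n elements.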

Fixes #24660
Fixes #24217

Closes scylladb/scylladb#25098
2025-07-24 09:46:42 +03:00
Patryk Jędrzejczak
f89ffe491a Merge 'storage_service: cancel all write requests after stopping transports' from Sergey Zolotukhin
When a node shuts down, in storage service, after storage_proxy RPCs are stopped, some write handlers within storage_proxy may still be waiting for background writes to complete. These handlers hold appropriate ERMs to block schema changes before the write finishes. After the RPCs are stopped, these writes cannot receive the replies anymore.

If, at the same time, there are RPC commands executing `barrier_and_drain`, they may get stuck waiting for these ERM holders to finish, potentially blocking node shutdown until the writes time out.

This change introduces cancellation of all outstanding write handlers from storage_service after the storage proxy RPCs were stopped.

Fixes scylladb/scylladb#23665

Backport: since this fixes an issue that frequently causes issues in CI, backport to 2025.1, 2025.2, and 2025.3.

Closes scylladb/scylladb#24714

* https://github.com/scylladb/scylladb:
  storage_service: Cancel all write requests on storage_proxy shutdown
  test: Add test for unfinished writes during shutdown and topology change
2025-07-24 09:46:42 +03:00
Michał Chojnowski
0ca983ea91 utils/bit_cast: add object_representation()
A utility that casts a trivial object to the span of its bytes.
2025-07-23 17:03:05 +02:00
Patryk Jędrzejczak
f408d1fa4f docs: document the option to set recovery_leader later
In one of the previous commits, we made it possible to set
`recovery_leader` on each node just before restarting it. Here, we
update the corresponding documentation.
2025-07-23 15:36:57 +02:00
Patryk Jędrzejczak
9e45e1159b test: delay setting recovery_leader in the recovery procedure tests
In the previous commit, we made it possible to set `recovery_leader`
on each node just before restarting it. Here, we change all the
tests of the Raft-based recovery procedure to use and test this option.
2025-07-23 15:36:57 +02:00
Patryk Jędrzejczak
ba5b5c7d2f gossip: add recovery_leader to gossip_digest_syn
In the new Raft-based recovery procedure, live nodes join the new
group 0 one by one during a rolling restart. There is a time window when
some of them are in the old group 0, while others are in the new group
0. This causes a group 0 mismatch in `gossiper::handle_syn_msg`. The
current solution for this problem is to ignore group 0 mismatches if
`recovery_leader` is set on the local node and to ask the administrator
to perform the rolling restart in the following way:
- set `recovery_leader` in `scylla.yaml` on all live nodes,
- send the `SIGHUP` signal to all Scylla processes to reload the config,
- proceed with the rolling restart.

This commit makes `gossiper::handle_syn_msg` ignore group 0 mismatches
when exactly one of the two gossiping nodes has `recovery_leader` set.
We achieve this by adding `recovery_leader` to `gossip_digest_syn`.
This change makes setting `recovery_leader` earlier on all nodes and
reloading the config unnecessary. From now on, the administrator can
simply restart each node with `recovery_leader` set.
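
The mismatch rule above reduces to a simple predicate (illustrative, not the actual gossiper code):

```cpp
// Sketch of the rule described above: a group 0 mismatch is ignored
// exactly when one of the two gossiping nodes, but not both, has
// recovery_leader set, i.e. the two flags differ.
bool ignore_group0_mismatch(bool local_has_recovery_leader,
                            bool remote_has_recovery_leader) {
    return local_has_recovery_leader != remote_has_recovery_leader;
}
```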

However, note that nodes that join group 0 must have `recovery_leader`
set until all nodes join the new group 0. For example, assume that we
are in the middle of the rolling restart and one of the nodes in the new
group 0 crashes. It must be restarted with `recovery_leader` set, or
else it would reject `gossip_digest_syn` messages from nodes in the old
group 0. To avoid problems in such cases, we will continue to recommend
setting `recovery_leader` in `scylla.yaml` instead of passing it as
a command line argument.
2025-07-23 15:36:57 +02:00
Patryk Jędrzejczak
23f59483b6 db: system_keyspace: peers_table_read_fixup: remove rows with null host_id
Currently, `peers_table_read_fixup` removes rows with no `host_id`, but
not with null `host_id`. Null host IDs are known to appear in system
tables, for example in `system.cluster_status` after a failed bootstrap.
We better make sure we handle them properly if they ever appear in
`system.peers`.

This commit guarantees that null UUID cannot belong to
`loaded_endpoints` in `storage_service::join_cluster`, which in
particular ensures that we throw a runtime error when a user sets
`recovery_leader` to null UUID during the recovery procedure. This is
handled by the code verifying that `recovery_leader` belongs to
`loaded_endpoints`.
2025-07-23 15:36:56 +02:00
Patryk Jędrzejczak
445a15ff45 db/config, gms/gossiper: change recovery_leader to UUID
We change the type of the `recovery_leader` config parameter and
`gossip_config::recovery_leader` from sstring to UUID. `recovery_leader`
is supposed to store host ID, so UUID is a natural choice.

After changing the type to UUID, if the user provides an incorrect UUID,
parsing `recovery_leader` will fail early, but the start-up will
continue. Outside the recovery procedure, `recovery_leader` will then be
ignored. In the recovery procedure, the start-up will fail on:

```
throw std::runtime_error(
        "Cannot start - Raft-based topology has been enabled but persistent group 0 ID is not present. "
        "If you are trying to run the Raft-based recovery procedure, you must set recovery_leader.");
```
2025-07-23 15:36:56 +02:00
Patryk Jędrzejczak
ec69028907 db/config, utils: allow using UUID as a config option
We change the `recovery_leader` option to UUID in the following commit.
2025-07-23 15:36:45 +02:00
Gleb Natapov
ddc3b6dcf5 migration manager: assert that if schema pull is disabled the group0 is not in use_pre_raft_procedures state
If schema pulls are disabled, group0 is used to bring the schema up to
date by calling start_group0_operation(), which executes a raft read
barrier internally. But if group0 is still in use_pre_raft_procedures,
start_group0_operation() silently does nothing. Later, the code that
assumes the schema is already up-to-date will fail and print warnings
into the log. Since serving queries while a node is in raft-enabled
mode but group0 is still not configured is illegal, it is better to
make those errors more visible by asserting on them during testing.

Closes scylladb/scylladb#25112
2025-07-23 14:10:17 +02:00
Petr Gusev
41a67510bb Revert "main.cc: fix group0 shutdown order"
This reverts commit 6b85ab79d6.
2025-07-23 12:11:01 +02:00
Botond Dénes
b65a2e2303 Update seastar submodule
* seastar 26badcb1...60b2e7da (42):
  > Revert "Fix incorrect defaults for io queue iops/bandwidth"
  > fair_queue: Ditch queue-wide accumulator reset on overflow
  > addr2line, scripts/stall-analyser: change the default tool to llvm-addr2line
  > Fix incorrect defaults for io queue iops/bandwidth
  > core/reactor: add cxx_exceptions() getter
  > gate: make destructor virtual
  > scripts/seastar-addr2line: change the default addr2line utility to llvm-addr2line
  > coding-style: Align example return types
  > reactor: Remove min_vruntime() declaration
  > reactor: Move enable_timer() method to private section
  > smp: fix missing span include
  > core: Don't keep internal errors counter on reactor
  > pollable_fd: Untangle shutdown()
  > io_queue: Remove deprecated statistics getters
  > fair_queue: Remove queued/executing resource counters
  > reactor: Move set_current_task() from public reactor API
  > util: make SEASTAR_ASSERT() failure generate SIGABRT
  > core: fix high CPU use at idle on high core count machines
  > Merge 'Move output IO throttler to IO queue level' from Pavel Emelyanov
    fair_queue: Move io_throttler to io_queue.hh
    fair_queue: Move metrics from to io_queue::stream
    fair_queue: Remove io_throttler from tests
    fair_queue_test: Remove io-throttler from fair-queue
    fair_queue: Remove capacity getters
    fair_queue: Move grab_result into io_queue::stream too
    fair_queue: Move throtting code to io_queue.cc
    fair_queue: Move throttling code to io_queue::stream class
    fair_queue: Open-code dispatch_requests() into users
    fair_queue: Split dispatch_requests() into top() and pop_front()
    fair_queue: Swap class push back and dispatch
    fair_queue: Configure forgiving factor externally
    fair_queue: Move replenisher kick to dispatch caller
    io_queue: Introduce io_queue::stream
    fair_queue: Merge two grab_capacity overloads
    fair_queue: Detatch outcoming capacity grabbing from main dispatch loop
    fair_queue: Move available tokens update into if branch
    io_queue: Rename make_fair_group_config into configure_throttler
    io_queue: Rename get_fair_group into get_throttler
    fair_queue: Rename fair_group -> io_throttler
  > http::reply: Add 308 (permanent redirect) and make pretty-print handle unknown values
  > Merge 'Relax reactor coupling with file_data_source_impl' from Pavel Emelyanov
    reactor: Relax friendship with file_data_source_impl
    fstream: Use direct io_stats reference
  > thread_pool: Relax coupling with reactor
  > reactor: Mark some IO classes management methods private
  > http: Deprecate json_exception
  > io_tester: Collect and report disk queue length samples
  > test/perf: Add context-switch measurer
  > http/client: Zero-copy forward content-length body into the underlying stream
  > json2code: Genrate move constructor and move-assignment operator
  > Merge 'Semi-mixed mode for output_stream' from Pavel Emelyanov
    output_stream: Support semi-mixed mode writing
    output_stream: Complete write(temporary_buffer) piggy-back-ing write(packet)
    iostream: Add friends for iostream tests
    packet: Mark bool cast operator const
    iostream: Document output_stream::write() methods
  > io_tester: Show metrics about requests split
  > reactor: add counter for internal errors
  > iotune: Print correct throughput units
  > core: add label to io_threaded_fallbacks to categorize operations
  > slab: correct allocation logic and enforce memory limits
  > Merge 'Fix for non-json http function_handlers' from Travis Downs
    httpd_test: add test for non-JSON function handler
    function_handlers: avoid implicit conversions
    http: do not always treat plain text reply as json
  > Merge 'tls: add ALPN support' from Łukasz Kurowski
    tls: add server-side ALPN support
    tls: add client-side ALPN support
  > Merge 'coroutine: experimental: generator: implement move and swap' from Benny Halevy
    coroutine: experimental: generator: implement move and swap
    coroutine: experimental: generator: unconstify buffer capacity
  > future: downgrade asserts
  > output_stream: Remove unused bits
  > Merge 'Upstream a couple of minor reactor optimizations' from Travis Downs
    Match type for pure_check_for_work
    Do not use std::function for check_for_work()
  > Handle ENOENT in getgrnam

Includes scylla-gdb.py update by Pavel Emelyanov.

Closes scylladb/scylladb#25094
2025-07-22 18:19:58 +02:00
Pavel Emelyanov
2df1945f2a compaction: Pass "reason" to perform_task_on_all_files()
This tells the "cleanup", "rewrite" and "split" reasons apart from each other.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-22 18:53:10 +03:00
Pavel Emelyanov
08c8c03a20 compaction: Pass "reason" to run_with_compaction_disabled()
This distinguishes the "cleanup" reason (done via try_perform_cleanup) and
prepares the ground for more callers (see next patch).

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-22 18:52:09 +03:00
Pavel Emelyanov
db46da45d2 compaction: Pass "reason" to stop_and_disable_compaction()
This tells the "truncate" operation apart from other reasons.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-22 18:51:16 +03:00
Sergey Zolotukhin
e0dc73f52a storage_service: Cancel all write requests on storage_proxy shutdown
During a graceful node shutdown, RPC listeners are stopped in `storage_service::drain_on_shutdown`
as one of the first steps. However, even after RPCs are shut down, some write handlers in
`storage_proxy` may still be waiting for background writes to complete. These handlers retain the ERM.
Since the RPC subsystem is no longer active, replies cannot be received, and if any RPC commands are
concurrently executing `barrier_and_drain`, they may get stuck waiting for those writes. This can block
the messaging server shutdown and delay the entire shutdown process until the write timeout occurs.

This change introduces the cancellation of all outstanding write handlers in `storage_proxy`
during shutdown to prevent unnecessary delays.

Fixes scylladb/scylladb#23665
2025-07-22 15:03:30 +02:00
Sergey Zolotukhin
bc934827bc test: Add test for unfinished writes during shutdown and topology change
This test reproduces an issue where a topology change and an ongoing write query
during query coordinator shutdown can cause the node to get stuck.

When a node receives a write request, it creates a write handler that holds
a copy of the current table's ERM (Effective Replication Map). The ERM ensures
that no topology or schema changes occur while the request is being processed.

After the query coordinator receives the required number of replica write ACKs
to satisfy the consistency level (CL), it sends a reply to the client. However,
the write response handler remains alive until all replicas respond — the remaining
writes are handled in the background.

During shutdown, when all network connections are closed, these responses can no longer
be received. As a result, the write response handler is only destroyed once the write
timeout is reached.

This becomes problematic because the ERM held by the handler blocks topology or schema
change commands from executing. Since shutdown waits for these commands to complete,
this can lead to unnecessary delays in node shutdown and restarts, and occasional
test case failures.

Test for: scylladb/scylladb#23665
2025-07-22 15:03:13 +02:00
Ran Regev
3d82b9485e docs: update nodetool restore documentation for --sstables-file-list
Fixes: #25128
A leftover from #25077

Closes scylladb/scylladb#25129
2025-07-22 14:43:35 +02:00
Yaron Kaikov
4445c11c69 ./github/workflows/conflict_reminder: improve workflow with weekly notifications
- Change schedule from twice weekly (Mon/Thu) to once weekly (Mon only)
- Extend notification cooldown period from 3 days to 1 week
- Prevent notification spam while maintaining immediate conflict detection on pushes

Fixes: https://github.com/scylladb/scylladb/issues/25130

Closes scylladb/scylladb#25131
2025-07-22 15:21:12 +03:00
Benny Halevy
fce6c4b41d tablets: prevent accidental copy of tablets_map
As copies are wasteful in many cases, it is better
to move the tablet_map if possible, or clone
it gently in an async fiber.

Add clone() and clone_gently() methods to
allow explicit copies.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-22 15:07:26 +03:00
Benny Halevy
dee0d7ffbf locator: tablets: get rid of synchronous mutate_tablet_map
It is currently used only by tests that could very well
do with mutate_tablet_map_async.

This will simplify the following patch to prevent
accidental copy of the tablet_map, providing explicit
clone/clone_gently methods.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-22 15:03:02 +03:00
Avi Kivity
e4c4141d97 test.py: don't crash on early cleanup of ScyllaServer
If a test fails very early (we still have to find out why), test.py
crashes while flushing a non-existent log_file, as shown below.

To fix, initialize the property to None and check it during
cleanup.

```
================================================================================
[N/TOTAL]   SUITE    MODE   RESULT   TEST
------------------------------------------------------------------------------

'ScyllaServer' object has no attribute 'log_file'
test_cluster_features Traceback (most recent call last):
  File "/home/avi/scylla-maint/./test.py", line 816, in <module>
    sys.exit(asyncio.run(main()))
             ~~~~~~~~~~~^^^^^^^^
  File "/usr/lib64/python3.13/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ~~~~~~~~~~^^^^^^
  File "/usr/lib64/python3.13/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "/usr/lib64/python3.13/asyncio/base_events.py", line 725, in run_until_complete
    return future.result()
           ~~~~~~~~~~~~~^^
  File "/home/avi/scylla-maint/./test.py", line 523, in main
    total_tests_pytest, failed_pytest_tests = await run_all_tests(signaled, options)
                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/avi/scylla-maint/./test.py", line 452, in run_all_tests
    failed += await reap(done, pending, signaled)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/avi/scylla-maint/./test.py", line 418, in reap
    result = coro.result()
  File "/home/avi/scylla-maint/test/pylib/suite/python.py", line 143, in run
    return await super().run(test, options)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/avi/scylla-maint/test/pylib/suite/base.py", line 216, in run
    await test.run(options)
  File "/home/avi/scylla-maint/test/pylib/suite/topology.py", line 48, in run
    async with get_cluster_manager(self.uname, self.suite.clusters, str(self.suite.log_dir)) as manager:
               ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.13/contextlib.py", line 221, in __aexit__
    await anext(self.gen)
  File "/home/avi/scylla-maint/test/pylib/scylla_cluster.py", line 2006, in get_cluster_manager
    await manager.stop()
  File "/home/avi/scylla-maint/test/pylib/scylla_cluster.py", line 1539, in stop
    await self.clusters.put(self.cluster, is_dirty=True)
  File "/home/avi/scylla-maint/test/pylib/pool.py", line 104, in put
    await self.destroy(obj)
  File "/home/avi/scylla-maint/test/pylib/suite/python.py", line 65, in recycle_cluster
    srv.log_file.close()
    ^^^^^^^^^^^^
AttributeError: 'ScyllaServer' object has no attribute 'log_file'
```
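The fix described above (initialize the attribute eagerly, then guard it during cleanup) can be sketched as follows; the class shape and method names are illustrative, not the actual test.py code:

```python
class ScyllaServerSketch:
    """Minimal stand-in for ScyllaServer's log-file handling."""

    def __init__(self) -> None:
        # Initialize up front so that a very early failure never leaves
        # the attribute undefined when cleanup runs.
        self.log_file = None

    def start(self, path: str) -> None:
        self.log_file = open(path, "a")

    def cleanup(self) -> None:
        # Guard: the server may have failed before start() ever ran.
        if self.log_file is not None:
            self.log_file.close()
            self.log_file = None


srv = ScyllaServerSketch()
srv.cleanup()  # no AttributeError even though start() never ran
```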

Closes scylladb/scylladb#24885
2025-07-22 12:39:01 +02:00
Avi Kivity
2db2b42556 sstables: version: drop custom operator<=>
The default comparison for enums is equivalent and
sufficient.

Closes scylladb/scylladb#24888
2025-07-22 12:39:01 +02:00
Avi Kivity
e89f6c5586 config, main: make cpu scheduling mandatory
CPU scheduling has been with us since 641aaba12c
(2017), and no one ever disables it. Likely nothing really works without
it.

Make it mandatory and mark the option unused.

Closes scylladb/scylladb#24894
2025-07-22 12:39:01 +02:00
Avi Kivity
ee138217ba alternator: simplify std::views::transform calls that extract a member from a class
Rather than calling std::views::transform with a lambda that extracts
a member from a class, call std::views::transform with a pointer-to-member
to do the same thing. This results in more concise code.

Closes scylladb/scylladb#25012
2025-07-22 12:39:01 +02:00
Jakub Smolar
6e0a063ce3 gdb: handle zero-size reads in managed_bytes
Fixes: https://github.com/scylladb/scylladb/issues/25048

Closes scylladb/scylladb#25050
2025-07-22 12:39:01 +02:00
Nadav Har'El
298a0ec4de test/cqlpy: in README.md, remind users of run-cassandra to set NODETOOL
test/cqlpy/README.md explains how to run the cqlpy tests against
Cassandra, and mentions that if you don't have "nodetool" in your path
you need to set the NODETOOL variable. However, when giving a simple
example how to use the run-cassandra script, we forgot to remind the
user to set NODETOOL in addition to CASSANDRA, causing confusion for
users who didn't know why tests were failing.

So this patch fixes the section in test/cqlpy/README.md with the
run-cassandra example to also set the NODETOOL environment variable,
not just CASSANDRA.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25051
2025-07-22 12:39:00 +02:00
Aleksandra Martyniuk
b5026edf49 tasks: change _finished_children type
Parent task keeps a vector of statuses (task_essentials) of its finished
children. When the number of children is large - for example because we
have many tables and a child task is created for each table - we may hit
an oversize allocation while adding a new child's essentials to the vector.

Keep the task_essentials of children in a chunked_vector.

Fixes: #25040.

Closes scylladb/scylladb#25064
2025-07-22 12:39:00 +02:00
Pavel Emelyanov
d94be313c1 Merge 'test: audit: ignore cassandra user audit logs in AUTH tests' from Andrzej Jackowski
Audit tests are vulnerable to noise from LOGIN queries (because AUTH
audit logs can appear at any time). Most tests already use the
`filter_out_noise` mechanism to remove this noise, but tests
focused on AUTH verification did not, leading to sporadic failures.

This change adds a filter to ignore AUTH logs generated by the default
"cassandra" user, so tests only verify logs from the user created
specifically for each test.

Additionally, this PR:
 - Adds missing `nonlocal new_rows` statement that prevented some checks from being called
 - Adds a testcase for audit logs of `cassandra` user

Fixes: https://github.com/scylladb/scylladb/issues/25069

Better backport those test changes to 2025.3. 2025.2 and earlier don't have `./cluster/dtest/audit_test.py`.

Closes scylladb/scylladb#25111

* github.com:scylladb/scylladb:
  test: audit: add cassandra user test case
  test: audit: ignore cassandra user audit logs in AUTH tests
  test: audit: change names of `filter_out_noise` parameters
  test: audit: add missing `nonlocal new_rows` statement
2025-07-22 10:42:16 +03:00
Pavel Emelyanov
295165d8ea Merge 's3_client: Enhance s3_client error handling' from Ernest Zaslavsky
Enhance and fix error handling in the `chunked_download_source` to prevent errors seeping from the request callback. Also stop retrying on seastar's side since it is going to break the integrity of data which may be downloaded more than once for the same range.

Fixes: https://github.com/scylladb/scylladb/issues/25043

Should be backported to 2025.3 since we have an intention to release native backup/restore feature

Closes scylladb/scylladb#24883

* github.com:scylladb/scylladb:
  s3_client: Disable Seastar-level retries in HTTP client creation
  s3_test: Validate handling of non-`aws_error` exceptions
  s3_client: Improve error handling in chunked_download_source
  aws_error: Add factory method for `aws_error` from exception
2025-07-22 10:40:39 +03:00
Ran Regev
dd67d22825 nodetool restore: sstable list from a file
Fixes: #25045

Added the ability to supply the list of files to
restore from a given file.
Mainly required for local testing.

Signed-off-by: Ran Regev <ran.regev@scylladb.com>

Closes scylladb/scylladb#25077
2025-07-22 09:11:02 +03:00
Pavel Emelyanov
52455f93b6 gms,init: Move get_disabled_features_from_db_config() from gms
Now when all callers are decoupled from gms config generating code, the
latter can be decoupled from the db::config.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-21 19:20:17 +03:00
Pavel Emelyanov
8220974e76 code: Update callers generating feature service config
Instead of requesting it from gms code, create it "by hand" with the
help of get_disabled_features_from_db_config() method. This is how other
services are configured by main/tools/testing code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-21 19:19:09 +03:00
Pavel Emelyanov
0808e65b4e gms: Make feature_config a simple struct
All configs out there are plain structures without private members and
methods, used simply to carry the set of config values around. Make the
feature service config alike.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-21 19:17:59 +03:00
Pavel Emelyanov
f703fb9b2d gms: Split feature_config_from_db_config() into two
The helper in question generates the disabled features set and assigns
it on the config. This patch detaches the features set generation into
another function. The former will go away eventually and the latter
will be kept around for main/test code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-21 19:16:40 +03:00
Ernest Zaslavsky
fc2c9dd290 s3_client: Disable Seastar-level retries in HTTP client creation
Prevent Seastar from retrying HTTP requests to avoid buffer double-feed
issues when an entire request is retried. This could cause data
corruption in `chunked_download_source`. The change is global for every
instance of `s3_client`, but it is still safe because:
* Seastar's `http_client` resets connections regardless of retry behavior
* `s3_client` retry logic handles all error types—exceptions, HTTP errors,
  and AWS-specific errors—via `http_retryable_client`
2025-07-21 17:03:23 +03:00
Ernest Zaslavsky
ba910b29ce s3_test: Validate handling of non-aws_error exceptions
Inject exceptions not wrapped in `aws_error` from request callback
lambda to verify they are properly caught and handled.
2025-07-21 16:52:43 +03:00
Ernest Zaslavsky
b7ae6507cd s3_client: Improve error handling in chunked_download_source
Create aws_error from raised exceptions when possible and respond
appropriately. Previously, non-aws_exception types leaked from the
request handler and were treated as non-retryable, causing potential
data corruption during download.
2025-07-21 16:49:47 +03:00
Ernest Zaslavsky
d53095d72f aws_error: Add factory method for aws_error from exception
Move `aws_error` creation logic out of `retryable_http_client` and
into the `aws_error` class to support reuse across components.
2025-07-21 16:42:44 +03:00
Andrzej Jackowski
21aedeeafb test: audit: add cassandra user test case
Audit tests use the `filter_out_noise` function to remove noise from
audit logs generated by user authentication. As a result, none of the
existing tests covered audit logs for the default `cassandra` user.
This change adds a test case for that user.

Refs: scylladb/scylladb#25069
2025-07-21 14:54:20 +02:00
Andrzej Jackowski
aef6474537 test: audit: ignore cassandra user audit logs in AUTH tests
Audit tests are vulnerable to noise from LOGIN queries (because AUTH
audit logs can appear at any time). Most tests already use the
`filter_out_noise` mechanism to remove this noise, but tests
focused on AUTH verification did not, leading to sporadic failures.

This change adds a filter to ignore AUTH logs generated by the default
"cassandra" user, so tests only verify logs from the user created
specifically for each test.

Fixes: scylladb/scylladb#25069
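A minimal sketch of such a filter; the row shape and field names here are assumptions for illustration, not the actual dtest helpers:

```python
DEFAULT_USER = "cassandra"


def filter_out_default_user_auth(rows):
    """Drop AUTH audit rows produced by the default 'cassandra' user,
    keeping only rows from the user created for the test."""
    return [
        row for row in rows
        if not (row["category"] == "AUTH" and row["user"] == DEFAULT_USER)
    ]


rows = [
    {"category": "AUTH", "user": "cassandra", "operation": "LOGIN"},
    {"category": "AUTH", "user": "test_user", "operation": "LOGIN"},
    {"category": "DML", "user": "cassandra", "operation": "INSERT"},
]
# Only the test_user AUTH row and the non-AUTH row survive.
filtered = filter_out_default_user_auth(rows)
```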
2025-07-21 14:54:20 +02:00
Andrzej Jackowski
daf1c58e21 test: audit: change names of filter_out_noise parameters
This is a refactoring commit that changes the names of the parameters
of the `filter_out_noise` function, as well as names of related
variables. The motivation for the change is the introduction of more
complex filtering logic in the next commit of this patch series.

Refs: scylladb/scylladb#25069
2025-07-21 14:54:01 +02:00
Andrzej Jackowski
e634a2cb4f test: audit: add missing nonlocal new_rows statement
The variable `new_rows` was not updated by the inner function
`is_number_of_new_rows_correct` because the `nonlocal new_rows`
statement was missing. As a result, `sorted_new_rows` was empty and
certain checks were skipped.

This change:
 - Introduces the missing `nonlocal new_rows` declaration
 - Adds an assertion verifying that the number of new rows matches
   the expected count
 - Fixes the incorrect variable name in the lambda used for row sorting
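The bug pattern described here can be reproduced in isolation; the names mirror the description, not the actual test code:

```python
def count_rows_buggy(rows):
    new_rows = []

    def is_number_of_new_rows_correct(incoming):
        # BUG: without `nonlocal`, this assignment creates a *local*
        # new_rows; the enclosing list stays empty.
        new_rows = list(incoming)
        return len(new_rows) > 0

    is_number_of_new_rows_correct(rows)
    return new_rows  # always [] -> later checks are silently skipped


def count_rows_fixed(rows):
    new_rows = []

    def is_number_of_new_rows_correct(incoming):
        nonlocal new_rows  # the fix: rebind the enclosing variable
        new_rows = list(incoming)
        return len(new_rows) > 0

    is_number_of_new_rows_correct(rows)
    return new_rows
```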
2025-07-21 14:53:48 +02:00
Pavel Emelyanov
339f08b24a scripts: Enhance refresh_submodules.sh with nested summary
Currently when refreshing a submodule, the script puts a plain list of
non-merge commits into the commit message. The resulting summary contains
everything, but is hard to understand. E.g. if updating seastar today
the summary would start with

    * seastar 26badcb1...86c4893b (55):
      > util: make SEASTAR_ASSERT() failure generate SIGABRT
      > core: fix high CPU use at idle on high core count machines
      > http::reply: Add 308 (permanent redirect) and make pretty-print handle unknown values
      > reactor: Relax friendship with file_data_source_impl
      > fstream: Use direct io_stats reference
      > thread_pool: Relax coupling with reactor
      > reactor: Mark some IO classes management methods private
      > http: Deprecate json_exception
      > fair_queue: Move io_throttler to io_queue.hh
      > fair_queue: Move metrics from to io_queue::stream
      > fair_queue: Remove io_throttler from tests
      > fair_queue_test: Remove io-throttler from fair-queue
      > fair_queue: Remove capacity getters
      > fair_queue: Move grab_result into io_queue::stream too
      > fair_queue: Move throtting code to io_queue.cc
      > fair_queue: Move throttling code to io_queue::stream class
      > fair_queue: Open-code dispatch_requests() into users
      > fair_queue: Split dispatch_requests() into top() and pop_front()
      > fair_queue: Swap class push back and dispatch
      > fair_queue: Configure forgiving factor externally
      ...

That's not very informative, because the update includes several large
"merges" that have their own summaries, which are missing here. This update
changes the way the summary is generated to include merges and their
summaries, and all merged commits are listed as sub-lines, like this

    * seastar 26badcb1...86c4893b (26):
      > util: make SEASTAR_ASSERT() failure generate SIGABRT
      > core: fix high CPU use at idle on high core count machines
      > Merge 'Move output IO throttler to IO queue level' from Pavel Emelyanov
        fair_queue: Move io_throttler to io_queue.hh
        fair_queue: Move metrics from to io_queue::stream
        fair_queue: Remove io_throttler from tests
        fair_queue_test: Remove io-throttler from fair-queue
        fair_queue: Remove capacity getters
        fair_queue: Move grab_result into io_queue::stream too
        fair_queue: Move throtting code to io_queue.cc
        fair_queue: Move throttling code to io_queue::stream class
        fair_queue: Open-code dispatch_requests() into users
        fair_queue: Split dispatch_requests() into top() and pop_front()
        fair_queue: Swap class push back and dispatch
        fair_queue: Configure forgiving factor externally
        fair_queue: Move replenisher kick to dispatch caller
        io_queue: Introduce io_queue::stream
        fair_queue: Merge two grab_capacity overloads
        fair_queue: Detatch outcoming capacity grabbing from main dispatch loop
        fair_queue: Move available tokens update into if branch
        io_queue: Rename make_fair_group_config into configure_throttler
        io_queue: Rename get_fair_group into get_throttler
        fair_queue: Rename fair_group -> io_throttler
      > http::reply: Add 308 (permanent redirect) and make pretty-print handle unknown values
      > Merge 'Relax reactor coupling with file_data_source_impl' from Pavel Emelyanov
        reactor: Relax friendship with file_data_source_impl
        fstream: Use direct io_stats reference
      > thread_pool: Relax coupling with reactor
      > reactor: Mark some IO classes management methods private
      ...

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#24834
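The formatting change can be sketched as pure string logic; the input shape is an assumption, since the real script derives it from `git log` output:

```python
def nested_summary(name, old, new, commits):
    """commits: list of (subject, merged_subjects) pairs along the
    first-parent history; merged_subjects is empty for plain commits."""
    lines = [f"* {name} {old}...{new} ({len(commits)}):"]
    for subject, merged in commits:
        lines.append(f"  > {subject}")
        # Merged commits appear as indented sub-lines under their merge.
        lines.extend(f"    {sub}" for sub in merged)
    return "\n".join(lines)


summary = nested_summary(
    "seastar", "26badcb1", "86c4893b",
    [
        ("util: make SEASTAR_ASSERT() failure generate SIGABRT", []),
        ("Merge 'Relax reactor coupling with file_data_source_impl' "
         "from Pavel Emelyanov",
         ["reactor: Relax friendship with file_data_source_impl",
          "fstream: Use direct io_stats reference"]),
    ])
```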
2025-07-21 14:48:30 +03:00
Ernest Zaslavsky
0053a4f24a encryption: remove default case from component_type switch
Do not use `default`; instead, list all fall-through components
explicitly, so if we add a new one, the developer doing that
will be forced to consider what to do here.

Eliminate the `default` case from the switch in
`encryption_file_io_extension::wrap_sink`, and explicitly
handle all `component_type` values within the switch statement.

fixes: https://github.com/scylladb/scylladb/issues/23724

Closes scylladb/scylladb#24987
2025-07-21 14:43:12 +03:00
Ernest Zaslavsky
408aa289fe treewide: Move misc files to utils directory
As requested in #22114, moved the files and fixed the affected includes and the build system.

Moved files:
- interval.hh
- Map_difference.hh

Fixes: #22114

This is a cleanup, no need to backport

Closes scylladb/scylladb#25095
2025-07-21 11:56:40 +03:00
Piotr Dulikowski
7fd97e6a93 Merge 'cdc: Forbid altering columns of CDC log tables directly' from Dawid Mędrek
The set of columns of a CDC log table should be managed automatically
by Scylla, and the user should not have the ability to manipulate them
directly. That could lead to disastrous consequences such as a
segmentation fault.

In this commit, we're restricting those operations. We also provide two
validation tests.

One of the existing tests had to be adjusted as it modified the type
of a column in a CDC log table. Since the test simply verifies that
the user has sufficient permissions to perform `ALTER TABLE` on the log
table, the test is still valid.

Fixes scylladb/scylladb#24643

Backport: we should backport the change to all affected
branches to prevent the consequences that may affect the user.

Closes scylladb/scylladb#25008

* github.com:scylladb/scylladb:
  cdc: Forbid altering columns of inactive CDC log table
  cdc: Forbid altering columns of CDC log tables directly
2025-07-21 09:31:00 +02:00
Ran Regev
bb95ac857e enable_set: fix separator formatting from space comma to comma space
For better log readability.
Fixes: #23883

Closes scylladb/scylladb#24647
2025-07-20 19:12:57 +03:00
Avi Kivity
3dfdcf7d7a Merge 'transport: remove throwing protocol_exception on connection start' from Dario Mirovic
`protocol_exception` is thrown in several places. This has become a performance issue, especially when starting/restarting a server. To alleviate this issue, throwing the exception has to be replaced with returning it as a result or an exceptional future.

This PR replaces throws in the `transport/server` module. This is achieved by using result_with_exception, and in some places, where suitable, just by creating and returning an exceptional future.

There are four commits in this PR. The first commit introduces tests in `test/cqlpy`. The second commit refactors transport server `handle_error` to not rethrow exceptions. The third commit refactors reusable buffer writer callbacks. The fourth commit replaces throwing `protocol_exception` to returning it.

Based on the comments on an issue linked in https://github.com/scylladb/scylladb/issues/24567, the main culprit from the side of protocol exceptions is the invalid protocol version one, so I tested that exception for performance.

In order to see if there is a measurable difference, a modified version of the `test_protocol_version_mismatch` Python test is used, with 100'000 runs across 10 processes (not threads, to avoid the Python GIL). One test run consisted of 1 warm-up run and 5 measured runs. The first test run has been executed on the current code, with throwing protocol exceptions. The second test run has been executed on the new code, with returning protocol exceptions. The performance report is in https://github.com/scylladb/scylladb/pull/24738#issuecomment-3051611069. It shows ~10% gains in real, user, and sys time for this test.

Testing

Build: `release`

Test file: `test/cqlpy/test_protocol_exceptions.py`
Test name: `test_protocol_version_mismatch` (modified for mass connection requests)

Test arguments:
```
max_attempts=100'000
num_parallel=10
```

Throwing `protocol_exception` results:
```
real=1:26.97  user=10:00.27  sys=2:34.55  cpu=867%
real=1:26.95  user=9:57.10  sys=2:32.50  cpu=862%
real=1:26.93  user=9:56.54  sys=2:35.59  cpu=865%
real=1:26.96  user=9:54.95  sys=2:32.33  cpu=859%
real=1:26.96  user=9:53.39  sys=2:33.58  cpu=859%

real=1:26.95 user=9:56.85 sys=2:34.11 cpu=862%   # average
```

Returning `protocol_exception` as `result_with_exception` or an exceptional future:
```
real=1:18.46  user=9:12.21  sys=2:19.08  cpu=881%
real=1:18.44  user=9:04.03  sys=2:17.91  cpu=869%
real=1:18.47  user=9:12.94  sys=2:19.68  cpu=882%
real=1:18.49  user=9:13.60  sys=2:19.88  cpu=883%
real=1:18.48  user=9:11.76  sys=2:17.32  cpu=878%

real=1:18.47 user=9:10.91 sys=2:18.77 cpu=879%   # average
```

This PR replaced `transport/server` throws of `protocol_exception` with returns. There are a few other places where protocol exceptions are thrown, and there are many places where `invalid_request_exception` is thrown. That is out of scope of this single PR, so the PR just refs, and does not resolve issue #24567.

Refs: #24567

This PR improves performance in cases when protocol exceptions happen, for example during connection storms. It will require backporting.

Closes scylladb/scylladb#24738

* github.com:scylladb/scylladb:
  test/cqlpy: add cpp exception metric test conditions
  transport/server: replace protocol_exception throws with returns
  utils/reusable_buffer: accept non-throwing writer callbacks via result_with_exception
  transport/server: avoid exception-throw overhead in handle_error
  test/cqlpy: add protocol_exception tests
2025-07-20 17:42:30 +03:00
Dawid Mędrek
59800b1d66 cdc: Forbid altering columns of inactive CDC log table
When CDC becomes disabled on the base table, the CDC log table
still exists (cf. scylladb/scylladb@adda43edc7).
If it continues to exist up to the point when CDC is re-enabled
on the base table, no new log table will be created -- instead,
the old log table will be *re-attached*.

Since we want to avoid situations when the definition of the log
table has become misaligned with the definition of the base table
due to actions of the user, we forbid modifying the set of columns
or renaming them in CDC log tables, even when they're inactive.

Validation tests are provided.
2025-07-18 15:03:08 +02:00
Piotr Dulikowski
85e506dab5 Merge 'test.py: print warning when no tests found' from Andrei Chekun
Quit the repeat loop early if a test under the pytest runner directory has a typo in its name or is absent. This avoids going through discovery several times before stopping execution.
Print a warning at the end of the run when no tests were selected by the provided name.

Fixes: scylladb/scylladb#24892

Closes scylladb/scylladb#24918

* github.com:scylladb/scylladb:
  test.py: print warning in case no tests were found
  test.py: break the loop when there is no tests for pytest
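The behavior can be sketched like this; function and variable names are illustrative, not the actual test.py code:

```python
import sys


def run_repeats(discover, requested_names, repeats):
    """Run discovery once per repeat, but bail out early (with a warning
    at the end) when the requested names match nothing."""
    total = 0
    for _ in range(repeats):
        selected = [t for t in discover() if t in requested_names]
        if not selected:
            # No point re-running discovery for the remaining repeats.
            break
        total += len(selected)
    if total == 0:
        print(f"Warning: no tests found for {sorted(requested_names)}",
              file=sys.stderr)
    return total
```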
2025-07-18 10:26:44 +02:00
Piotr Dulikowski
fd6e14f3ab Merge 'cdc: throw error if column doesn't exist' from Michael Litvak
In the CDC log transformer, when creating a CDC mutation based on some
base table mutation, for each value of a base column we set the value in
the CDC column with the same name.

When looking up the column in the CDC schema by name, we may get a null
pointer if a column by that name is not found. This shouldn't happen
normally because the base schema and CDC schema should be compatible,
and for each base column there should be a CDC column with the same
name.

However, there are scenarios where the base schema and CDC schema are
incompatible for a short period of time when they are being altered.
When a base column is being added or dropped, we could get a base
mutation with this column set, and then the CDC transformer picks up the
latest CDC schema which doesn't have this column.

If such a thing happens, we fix the code to throw an exception instead of
crashing on a null pointer dereference. Currently we don't have a safer
approach to handle this, but this might be changed in the future. The
other alternative is dropping that data silently which we prefer not to
do.

Throwing an error is acceptable because this scenario most likely
indicates this behavior by the user:
* The user adds a new column, and starts writing values to the column
  before the ALTER is complete, or
* The user drops a column, and continues writing values to the column
  while it's being dropped.

Both cases might as well fail with an error because the column is not
found in the base table.

Fixes scylladb/scylladb#24952

backport needed - simple fix for a node crash

Closes scylladb/scylladb#24986

* github.com:scylladb/scylladb:
  test: cdc: add test_cdc_with_alter
  cdc: throw error if column doesn't exist
2025-07-18 09:40:56 +02:00
Dawid Mędrek
bea7c26d64 test/cqlpy/test_describe.py: Adjust test_create_role_with_hashed_password_authorization to work with Cassandra
We adjust test_create_role_with_hashed_password_authorization to work
with both Scylla and Cassandra. For some reason (probably a bug),
Cassandra requires that the `LOGIN` property of a role come before
the password.
2025-07-17 22:18:12 +02:00
Dawid Mędrek
55c22f864e test/cqlpy/test_describe.py: Adjust test_desc_restore to work with Cassandra
Cassandra doesn't use service levels, and it doesn't include auth
in the output of `DESCRIBE SCHEMA`. It doesn't support the form of the
statement `... WITH PASSWORDS`. UDFs in Cassandra don't support Lua.
That's why the test didn't work against Cassandra.

In this commit, we adjust it to work with both Scylla and Cassandra.
2025-07-17 22:17:15 +02:00
Dawid Mędrek
fca03ca915 test/cqlpy/test_describe.py: Mark Scylla-only tests as such
Tests verifying that auth and service levels are part of the output
of `DESCRIBE SCHEMA` were not marked as `scylla_only` when they were
written, but they're a feature only Scylla has. Because of that, let's
mark them with `scylla_only` so they're not run against Cassandra to
avoid unnecessary failures. We also provide a short explanation for
each test why it's marked that way.
2025-07-17 21:45:44 +02:00
Andrei Chekun
04b0fba88c test.py: print warning in case no tests were found
Print a warning at the end of the run when no tests were selected by the
provided name.

Fixes: https://github.com/scylladb/scylladb/issues/24892
2025-07-17 19:51:22 +02:00
Michael Litvak
86dfa6324f test: cdc: add test_cdc_with_alter
Add a test that tests adding and dropping a column to a table with CDC
enabled while writing to it.
2025-07-17 17:16:17 +02:00
Michael Litvak
b336f282ae cdc: throw error if column doesn't exist
In the CDC log transformer, when creating a CDC mutation based on some
base table mutation, for each value of a base column we set the value in
the CDC column with the same name.

When looking up the column in the CDC schema by name, we may get a null
pointer if a column by that name is not found. This shouldn't happen
normally because the base schema and CDC schema should be compatible,
and for each base column there should be a CDC column with the same
name.

However, there are scenarios where the base schema and CDC schema are
incompatible for a short period of time when they are being altered.
When a base column is being added or dropped, we could get a base
mutation with this column set, and then the CDC transformer picks up the
latest CDC schema which doesn't have this column.

If such a thing happens, we fix the code to throw an exception instead of
crashing on a null pointer dereference. Currently we don't have a safer
approach to handle this, but this might be changed in the future. The
other alternative is dropping that data silently which we prefer not to
do.

Throwing an error is acceptable because this scenario most likely
indicates this behavior by the user:
* The user adds a new column, and starts writing values to the column
  before the ALTER is complete, or
* The user drops a column, and continues writing values to the column
  while it's being dropped.

Both cases might as well fail with an error because the column is not
found in the base table.

Fixes scylladb/scylladb#24952
2025-07-17 17:16:17 +02:00
Dario Mirovic
4a6f71df68 test/cqlpy: add cpp exception metric test conditions
Tested code paths should not throw exceptions. The `scylla_reactor_cpp_exceptions`
metric is used to verify this. This is a global metric. To address potential test flakiness,
each test runs multiple times:
- `run_count = 100`
- `cpp_exception_threshold = 10`

If a change in the code introduces an exception, the expectation is that
the number of registered exceptions will be > `cpp_exception_threshold`
across `run_count` runs, in which case the test fails.
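The threshold check can be sketched like this. The `run_count` and `cpp_exception_threshold` values are from the commit message; the helper callables standing in for the test body and the metric scrape are hypothetical:

```python
# Sketch of the flakiness-tolerant check. run_tested_path() and
# read_cpp_exception_metric() are hypothetical stand-ins for the real
# test body and the scrape of the global metric.
RUN_COUNT = 100
CPP_EXCEPTION_THRESHOLD = 10

def assert_no_new_cpp_exceptions(run_tested_path, read_cpp_exception_metric):
    before = read_cpp_exception_metric()
    for _ in range(RUN_COUNT):
        run_tested_path()
    after = read_cpp_exception_metric()
    # The counter is global, so unrelated activity may bump it slightly;
    # fail only when the increase exceeds the threshold.
    assert after - before <= CPP_EXCEPTION_THRESHOLD, \
        f"{after - before} new C++ exceptions in {RUN_COUNT} runs"
```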
2025-07-17 17:02:48 +02:00
Dario Mirovic
5390f92afc transport/server: replace protocol_exception throws with returns
Replace throwing protocol_exception with returning it as a result
or an exceptional future in the transport server module. This
improves performance, for example during connection storms and
server restarts, where protocol exceptions are more frequent.

In functions already returning a future, protocol exceptions are
propagated using an exceptional future. In functions not already
returning a future, result_with_exception is used.

A notable change is checking v.failed() before calling v.get() in the
process_request function, to avoid throwing in the case of an
exceptional future.
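The error-as-value pattern at the core of this change can be illustrated with a minimal result type (a language-agnostic sketch, not Scylla's actual `result_with_exception`; the parsing function and version range are illustrative):

```python
# Minimal analog of carrying either a value or an error object, so that
# hot paths don't pay the cost of throwing on frequent protocol errors.
from dataclasses import dataclass
from typing import Generic, TypeVar, Union

T = TypeVar("T")

@dataclass
class Ok(Generic[T]):
    value: T

@dataclass
class Err:
    error: Exception

Result = Union[Ok, Err]

def parse_frame_version(version_byte: int) -> Result:
    # Return the error as a value instead of raising it; the caller
    # decides whether to propagate, log, or turn it into a response.
    if not (3 <= version_byte <= 5):
        return Err(ValueError(f"unsupported protocol version {version_byte}"))
    return Ok(version_byte)
```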

Refs: #24567
2025-07-17 16:54:05 +02:00
Dario Mirovic
9f4344a435 utils/reusable_buffer: accept non-throwing writer callbacks via result_with_exception
Make make_bytes_ostream and make_fragmented_temporary_buffer accept
writer callbacks that return utils::result_with_exception instead of
forcing them to throw on error. This lets callers propagate failures
by returning an error result rather than throwing an exception.

Introduce buffer_writer_for, bytes_ostream_writer, and fragmented_buffer_writer
concepts to simplify and document the template requirements on writer callbacks.

This patch does not modify the actual callbacks passed, except for the syntax
changes needed for successful compilation, without changing the logic.

Refs: #24567
2025-07-17 16:40:02 +02:00
Dario Mirovic
30d424e0d3 transport/server: avoid exception-throw overhead in handle_error
Previously, connection::handle_error always called f.get() inside a try/catch,
forcing every failed future to throw and immediately catch an exception just to
classify it. This change eliminates that extra throw/catch cycle by first checking
f.failed(), getting the stored std::exception_ptr via f.get_exception(), and
then dispatching on its type via utils::try_catch<T>(eptr).

The error-response logic is not changed - cassandra_exception, std::exception,
and unknown exceptions are caught and processed, and any exceptions thrown by
write_response while handling those exceptions continue to escape handle_error.
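The same throw-free classification can be illustrated with Python futures, where `Future.exception()` plays the role of `f.get_exception()` and an `isinstance` dispatch stands in for `utils::try_catch<T>` (a sketch; the category strings are illustrative):

```python
from concurrent.futures import Future

class CassandraException(Exception):
    """Stand-in for the transport layer's cassandra_exception."""

def handle_error(f: Future) -> str:
    # Check for failure first, then fetch the stored exception object
    # directly; no throw/catch cycle is needed just to classify it.
    exc = f.exception()
    if exc is None:
        return "ok"
    if isinstance(exc, CassandraException):
        return "cassandra_exception"
    if isinstance(exc, Exception):
        return "std::exception"
    return "unknown"
```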

Refs: #24567
2025-07-17 16:40:02 +02:00
Dario Mirovic
7aaeed012e test/cqlpy: add protocol_exception tests
Add a helper to fetch scylla_transport_cql_errors_total{type="protocol_error"} counter
from Scylla's metrics endpoint. These metrics are used to track protocol error
count before and after each test.

Add cql_with_protocol context manager utility for session creation with parameterized
protocol_version value. This is used for testing connection establishment with
different protocol versions, and proper disposal of successfully established sessions.

The tests cover two failure scenarios:
- Protocol version mismatch in test_protocol_version_mismatch, which tests both supported
and unsupported protocol versions
- Malformed frames via raw socket in _protocol_error_impl, used by several test functions,
and also by the test_no_protocol_exceptions test, which asserts that the error counters
never decrease during test execution, catching unintended metric resets
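The metric-fetching helper boils down to scraping Prometheus-format text and summing one labelled counter across shards. A sketch (the metric name and label are from the commit; the scrape text in the test is made up):

```python
import re

def get_protocol_errors(metrics_text: str) -> float:
    # Sum scylla_transport_cql_errors_total samples labelled
    # type="protocol_error"; each shard exports its own sample.
    total = 0.0
    pattern = re.compile(
        r'^scylla_transport_cql_errors_total\{([^}]*)\}\s+(\S+)', re.M)
    for labels, value in pattern.findall(metrics_text):
        if 'type="protocol_error"' in labels:
            total += float(value)
    return total
```

A test would call this on the metrics endpoint's body before and after the exercised operation and compare the two totals.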

Refs: #24567
2025-07-17 16:39:54 +02:00
Petr Gusev
2027856847 Revert "paxos_state: read repair for intranode_migration"
This reverts commit 45f5efb9ba.

The load_and_repair_paxos_state function was introduced in
scylladb/scylladb#24478, but it has never been tested or proven useful.

One set of problems stems from its use of local data structures
from a remote shard. In particular, system_keyspace and schema_ptr
cannot be directly accessed from another shard — doing so is a bug.

More importantly, load_paxos_state on different shards can't ever
return different values. The actual shard from which data is read is
determined by sharder.shard_for_reads, and storage_proxy will jump
back to the appropriate shard if the current one doesn't match. This
means load_and_repair_paxos_state can't observe paxos state from
write-but-not-read shard, and therefore will never be able to
repair anything.

We believe this explicit Paxos state read-repair is not needed at all.

Any paxos state read which drives some paxos round forward is already
accompanied by a paxos state write. Suppose we wrote the state to the
old shard but not to the new shard (because of some error) while
streaming is already finished. The RPC call (prepare or accept) will
return an error to the coordinator; such a replica response won't
affect the current round. This write won't affect any subsequent paxos rounds
either, unless in those rounds the write actually succeeds on both
shards, effectively 'auto-repairing' paxos state.

Same if we managed to write to the new shard but not to the old shard.
Any subsequent reads will observe either the old state or the new
state (if the tablet already switched reads to the new shard). In any
case, we'll have to write the state to all relevant shards
from sharder.shard_for_writes (one or two) before sending rpc
response, making this state visible for all subsequent reads.

Thus, the monotonicity property ("once observed, the state must always
be observed") appears to hold without requiring explicit read-repair
and load_and_repair_paxos_state is not needed.

Closes scylladb/scylladb#24926
2025-07-17 14:00:43 +02:00
Botond Dénes
20693edb27 Merge 'sstables: put index_reader behind a virtual interface' from Michał Chojnowski
This is a refactoring patch in preparation for BTI indexes. It contains no functional changes (or at least it's not intended to).

In this patch, we modify the sstable readers to use index readers through a new virtual `abstract_index_readers` interface.
Later, we will add BTI indexes which will also implement this interface.

This interface contains the methods of `index_reader` which are needed by sstable readers, and leaves out all other methods, such as `current_clustered_cursor`.

Not all methods of this interface will be implementable by a trie-based index later. For example, a trie-based index can't provide a reliable `get_partition_key()`, because — unlike the current index — it only stores partition keys for partitions which have a row index. So the interface will have to be further restricted later. We don't do that in this patch because that will require changes to sstable reader logic, and this patch is supposed to only include cosmetic changes.

No backports needed, this is a preparation for new functionality.

Closes scylladb/scylladb#25000

* github.com:scylladb/scylladb:
  sstables: add sstable::make_index_reader() and use where appropriate
  sstables/mx: in readers, use abstract_index_reader instead of index_reader
  sstables: in validate(), use abstract_index_reader instead of index_reader where possible
  test/lib/index_reader_assertions: accept abstract_index_reader instead of index_reader
  sstables/index_reader: introduce abstract_index_reader
  sstables/index_reader: extract a prefetch_lower_bound() method
2025-07-17 14:32:08 +03:00
Nadav Har'El
04b263b51a Merge 'vector_index: do not create a view when creating a vector index' from Michał Hudobski
This PR adds a way for custom indexes to decide whether a view should be created for them, as for the vector_index the view is not needed, because we store it in the external service. To allow this, custom logic for describing indexes using custom classes was added (as it used to depend on the view corresponding to an index).

Fixes: VECTOR-10

Closes scylladb/scylladb#24438

* github.com:scylladb/scylladb:
  custom_index: do not create view when creating a custom index
  custom_index: refactor describe for custom indexes
  custom_index: remove unneeded duplicate of a static string
2025-07-17 13:48:49 +03:00
Michał Chojnowski
4e4a4b6622 sstables: add sstable::make_index_reader() and use where appropriate
If we add multiple index implementations, users of index readers won't
easily know which concrete index reader type is the right one to construct.

We also don't want pieces of code to depend on functionality specific to
certain concrete types, if that's not necessary.

So instead of constructing the readers by themselves, they can use a helper
function, which will return an abstract (virtual) index reader.
This patch adds such a function, as a method of `sstable`.
2025-07-17 10:32:57 +02:00
Michał Chojnowski
1c4065e7dd sstables/mx: in readers, use abstract_index_reader instead of index_reader
This makes clear which methods of index_reader are available for use
by sstable readers, and which aren't.
2025-07-17 10:32:57 +02:00
Michał Chojnowski
efcf3f5d66 sstables: in validate(), use abstract_index_reader instead of index_reader where possible
After we add a second index implementation, we will probably want to
adjust validate() to work with either implementation.

Some validations will be format-specific, but some will be common.
For now, let's use abstract_index_reader for the validations which
can be done through that interface, and let's have downcast-specific
codepaths for the others.

Note: we change a `get_data_file_position()` call to `data_file_positions().start`.
The call happens at the beginning of a partition, and at this point
these two expressions are supposed to be equivalent.
2025-07-17 10:32:57 +02:00
Michał Chojnowski
92219a5ef8 test/lib/index_reader_assertions: accept abstract_index_reader instead of index_reader
We don't want tests to create the concrete `index_reader` directly. We
would like them to be able to test both sstables which use
`index_reader`, and those which will use the planned new index implementation.
So we will let the tests construct an abstract_index_reader and pass it
to the index_reader_assertions, which will be able to assert the requested
properties on various implementations as it wants.
2025-07-17 10:32:56 +02:00
Michał Chojnowski
c052ccd081 sstables/index_reader: introduce abstract_index_reader
We want to implement BTI indexes in Scylla.
After we do that, some sstables will use a BTI index reader,
while others will use the old BIG index reader.
To handle that, we can expose a common virtual "index reader"
interface to sstable readers. This is what this patch does.

This interface can't be quite fully implemented by a BTI index,
because some methods return keys which a BIG index stores,
but a BTI index doesn't. So it will be further restricted in future
patches. But for now, we only extract *all* methods currently
used by the readers to a virtual interface.
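The shape of this extraction can be sketched in Python terms. This is only an illustration of the interface-extraction pattern; apart from the method names quoted in the commit messages (`data_file_positions`, `current_clustered_cursor`), the names here are invented:

```python
from abc import ABC, abstractmethod

class AbstractIndexReader(ABC):
    # Only methods that sstable readers actually use are promoted to the
    # interface; BIG-specific helpers like current_clustered_cursor stay
    # on the concrete class.
    @abstractmethod
    def data_file_positions(self): ...
    @abstractmethod
    def advance_to(self, key): ...  # illustrative method name

class BigIndexReader(AbstractIndexReader):
    # The existing concrete reader implements the interface and keeps
    # its format-specific extras.
    def data_file_positions(self):
        return (0, None)
    def advance_to(self, key):
        return True
    def current_clustered_cursor(self):  # concrete-only, not in the interface
        return None
```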
2025-07-17 10:32:56 +02:00
Botond Dénes
fd6877c654 Merge 'alternator: avoid oversized allocation in Query/Scan' from Nadav Har'El
This series fixes one cause of oversized allocations - and therefore potentially stalls and increased tail latencies - in Alternator.

The first patch in the series is the main fix - the later patches are cleanups requested by reviewers but also involved other pre-existing code, so I did those cleanups as separate patches.

Alternator's Scan or Query operation return a page of results. When the number of items is not limited by a "Limit" parameter, the default is to return a 1 MB page. If items are short, a large number of them can fit in that 1MB. The test test_query.py::test_query_large_page_small_rows has 30,000 items returned in a single page.

In the response JSON, all these items are returned in a single array "Items". Before this patch, we build the full response as a RapidJSON object before sending it. The problem is that unfortunately, RapidJSON stores arrays as contiguous allocations. This results in large contiguous allocations in workloads that scan many small items, and large contiguous allocations can also cause stalls and high tail latencies. For example, before this patch, running

    test/alternator/run --runveryslow \
        test_query.py::test_query_large_page_small_rows

reports in the log:

    oversized allocation: 573440 bytes.

After this patch, this warning no longer appears.
The patch solves the problem by collecting the scanned items not in a RapidJSON array, but rather in a chunked_vector<rjson::value>, i.e, a chunked (non-contiguous) array of items (each a JSON value). After collecting this array separately from the response object, we need to print its content without actually inserting it into the object - we add a new function print_with_extra_array() to do that.

The new separate-chunked-vector technique is used when a large number (currently, >256) of items were scanned. When there is a smaller number of items in a page (this is typical when each item is longer), we just insert those items in the object and print it as before.
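The allocation pattern at the heart of the fix can be sketched as a chunked sequence: many small fixed-size chunks instead of one contiguous buffer that grows with the item count. The >256-item threshold is from the commit; the chunk size below is illustrative:

```python
class ChunkedVector:
    """Append-only sequence stored as many small chunks, so no single
    allocation grows with the total number of items."""
    CHUNK_SIZE = 128  # illustrative; the real chunked_vector differs

    def __init__(self):
        self._chunks = [[]]

    def append(self, item):
        if len(self._chunks[-1]) >= self.CHUNK_SIZE:
            self._chunks.append([])  # a new small allocation, never a resize
        self._chunks[-1].append(item)

    def __iter__(self):
        for chunk in self._chunks:
            yield from chunk

    def __len__(self):
        return sum(len(c) for c in self._chunks)
```

Printing then iterates the chunks in order, which is what the new print_with_extra_array() does for the "Items" array without ever materializing it contiguously.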

Beyond the original slow test that demonstrated the oversized allocation (which is now gone), this patch also includes a new test which exercises the new code with a scan of 700 (>256) items in a page - but this new test is fast enough to be permanently in our test suite and not a manual "veryslow" test as the other test.

Fixes #23535

The stalls caused by large allocations were seen by actual users, so it makes sense to backport this patch. On the other hand, while not big, the patch is fairly intrusive (it modifies the normal Scan and Query path, and the later patches also clean up additional code), so there is some small risk involved in the backport.

Closes scylladb/scylladb#24480

* github.com:scylladb/scylladb:
  alternator: clean up by co-routinizing
  alternator: avoid spamming the log when failing to write response
  alternator: clean up and simplify request_return_type
  alternator: avoid oversized allocation in Query/Scan
2025-07-17 11:30:40 +03:00
Calle Wilund
5dd871861b tests::proc::process_fixture: Fix line handler adaptor buffering
Fixes #24998

The helper routine translating input_stream buffers into single lines
did not loop over the current buffer state, so only the first line in
each buffer was sent to the end listener.

Rewrote to use range iteration instead. Nicer.
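The bug class is easy to reproduce: a handler that consumes one line per buffer drops the rest. A sketch of the corrected shape, which loops over the whole buffer and carries partial lines between buffers (names are illustrative, not the fixture's API):

```python
def make_line_handler(on_line):
    """Adapt a stream-of-buffers callback into a per-line callback,
    iterating over *every* line in the current buffer instead of
    stopping after the first newline."""
    pending = []  # partial line carried over from previous buffers

    def on_buffer(buf: str):
        start = 0
        while True:  # loop over the current buffer state
            nl = buf.find("\n", start)
            if nl == -1:
                pending.append(buf[start:])
                return
            on_line("".join(pending) + buf[start:nl])
            pending.clear()
            start = nl + 1

    return on_buffer
```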

Closes scylladb/scylladb#24999
2025-07-17 10:58:03 +03:00
Ernest Zaslavsky
342e94261f s3_client: parse multipart response XML defensively
Ensure robust handling of XML responses when initiating multipart
uploads. Check for the existence of required nodes before access,
and throw an exception if the XML is empty or malformed.
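The defensive pattern can be sketched with Python's stdlib XML parser. The element names match the S3 InitiateMultipartUpload response; the error handling and function name are illustrative, not the client's actual code:

```python
import xml.etree.ElementTree as ET

S3_NS = "{http://s3.amazonaws.com/doc/2006-03-01/}"

def parse_upload_id(body: str) -> str:
    # Validate every step instead of dereferencing nodes blindly.
    if not body.strip():
        raise ValueError("empty multipart-initiation response")
    try:
        root = ET.fromstring(body)
    except ET.ParseError as e:
        raise ValueError(f"malformed XML: {e}") from e
    node = root.find("UploadId")
    if node is None:  # real S3 responses carry a namespace
        node = root.find(f"{S3_NS}UploadId")
    if node is None or not node.text:
        raise ValueError("response lacks UploadId node")
    return node.text
```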

Refs: https://github.com/scylladb/scylladb/issues/24676

Closes scylladb/scylladb#24990
2025-07-17 10:55:04 +03:00
Botond Dénes
054ea54565 Merge 'streaming: Avoid deadlock by running view checks in a separate scheduling group' from Tomasz Grabiec
This issue happens with removenode, when RBNO is disabled, so range
streamer is used.

The deadlock happens in a scenario like this:
1. Start 3 nodes: {A, B, C}, RF=2
2. Node A is lost
3. removenode A
4. Both B and C gain ownership of ranges.
5. Streaming sessions are started with crossed directions: B->C, C->B

Readers created by sender side exhaust streaming semaphore on B and C.
Receiver side attempts to obtain a permit indirectly by calling
check_needs_view_update_path(), which reads local tables. That read is
blocked and times out, causing streaming to fail. The streaming writer
is already using a tracking-only permit.

Even if we didn't deadlock, and the streaming semaphore was simply exhausted
by other receiving sessions (via tracking-only permit), the query may still time out due to starvation.

To avoid that, run the query under a different scheduling group, which
translates to the system semaphore instead of the maintenance
semaphore, to break the dependency. The gossip group was chosen
because it shouldn't be contended and this change should not interfere
with it much.

Fixes #24807
Fixes #24925

Closes scylladb/scylladb#24929

* github.com:scylladb/scylladb:
  streaming: Avoid deadlock by running view checks in a separate scheduling group
  service: migration_manager: Run group0 barrier in gossip scheduling group
2025-07-17 10:24:41 +03:00
Botond Dénes
4c832d583e Merge 'repair: Speed up ranges calculation when small table optimization is on' from Asias He
repair: Speed up ranges calculation when small table optimization is on

Normally, during bootstrap, in repair_service::bootstrap_with_repair, we
need to calculate which range to sync data from carefully for the new
node. With small table optimization on, we pass a single full range and
all peer nodes to row level repair to sync data with. Now that we only
need to pass a single range and full peers, there is no need to calculate
the ranges and peers in repair_service::bootstrap_with_repair only to
drop them later. The calculation takes time, which slows down bootstrap, e.g.,

```
Jul 08 22:01:41.927785 cluster-scale-50-200-test-scayle-t-db-node-51209daa-93 scylla[5326]:
[shard 0:strm] repair - bootstrap_with_repair: started with
keyspace=system_distributed_everywhere, nr_ranges=23809

Jul 08 22:01:57.883797 cluster-scale-50-200-test-scayle-t-db-node-51209daa-93 scylla[5326]:
[shard 0:strm] repair - repair[79eac1a1-5d5b-4028-ae1c-06e68bec2d50]:
sync data for keyspace=system_distributed_everywhere, status=started,
reason=bootstrap, small_table_optimization=true
```

The range calculation took 15 seconds for system_distributed_everywhere
table.

To fix, the ranges calculation is skipped if small table optimization is
on for the keyspace.

Before:
cluster    dev   [ PASS ] cluster.test_boot_nodes.1 104.59s

After:
cluster    dev   [ PASS ] cluster.test_boot_nodes.1 89.23s

A 15% improvement to bootstrap 30 node cluster was observed.

Fixes #24817

Closes scylladb/scylladb#24901

* github.com:scylladb/scylladb:
  repair: Speed up ranges calculation when small table optimization is on
  test: Add test_boot_nodes.py
2025-07-17 10:23:45 +03:00
Nikos Dragazis
88554b7c7a docs: Document the Azure Key Provider
Extend the EaR ops guide to incorporate the new Azure Key Provider.
Document its options and provide instructions on how to configure it.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 23:06:11 +03:00
Nikos Dragazis
09dcdebca3 test: Add tests for Azure Key Provider
The tests cover a variety of scenarios, including:

* Authentication with client secrets, client certificates, and IMDS.
* Valid and invalid encryption options in the configuration and table
  schema.
* Common error conditions such as insufficient permissions, non-existent
  keys and network errors.

All tests run against a local mock server by default. A subset of the
tests can also against real Azure services if properly configured. The
tests that support real Azure services were kept to a minimum to cover
only the most basic scenarios (success path and common error
conditions).

Running the tests with real resources requires parameterizing them with
env vars:
* ENABLE_AZURE_TEST - set to non-zero (1/true) to run Azure tests (enabled by default)
* ENABLE_AZURE_TEST_REAL - set to non-zero (1/true) to run against real Azure services
* AZURE_TENANT_ID - the tenant where the principals live
* AZURE_USER_1_CLIENT_ID - the client ID of user1
* AZURE_USER_1_CLIENT_SECRET - the secret of user1
* AZURE_USER_1_CLIENT_CERTIFICATE - the PEM-encoded certificate and private key of user1
* AZURE_USER_2_CLIENT_ID - the client ID of user2
* AZURE_USER_2_CLIENT_SECRET - the secret of user2
* AZURE_USER_2_CLIENT_CERTIFICATE - the PEM-encoded certificate and private key of user2
* AZURE_KEY_NAME - set to <vault_name>/<keyname>

User1 is assumed to have permissions to wrap/unwrap using the given key.
User2 is assumed to not have permissions for these operations.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 23:06:01 +03:00
Nikos Dragazis
083aabe0c6 pylib: Add mock server for Azure Key Vault
The Azure Key Provider depends on three Azure services:

- Azure Key Vault
- IMDS
- Entra STS

To enable local testing, introduce a mock server that offers all the
needed APIs from these services. The server also offers an error
injection endpoint to configure a particular service to respond with
some error code for a number of consecutive requests.

The server is integrated as a 3rd party service in test.py.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
41b63469e1 encryption: Define and enable Azure Key Provider
Define the Azure Key Provider to connect the core EaR business logic
with the Azure-based Key Management implementation (Azure host).

Introduce "AzureKeyProviderFactory" as a new `key_provider` value in the
configuration.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
f0927aac07 encryption: azure: Delegate hosts to shard 0
As in the AWS and GCP hosts, make all Azure hosts delegate their traffic
to shard 0 to avoid creating too many data encryption keys and API
calls to Key Vault.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
339992539d encryption: Add Azure host cache
The encryption context maintains a cache per host type per thread.
Add a cache for the Azure host as well. Initialize the cache with Azure
hosts from the configuration, while registering the extensions for
encryption.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
c98d3246b2 encryption: Add config options for Azure hosts
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
a1aef456ac encryption: azure: Add override options
Extend `get_or_create_key()` to accept host options that override the
config options. This will be used to pass encryption options from the
table schema. Currently, only the master key can be overridden.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:09 +03:00
Nikos Dragazis
5ba6ca0992 encryption: azure: Add retries for transient errors
Inject a few fast retries to quickly recover from short-lived transient
errors. If a request is unauthorized, retry with no delay, since it may
be caused by expired tokens.
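The described policy might look like the sketch below. The "no delay on unauthorized" rule is from the commit message; the status-code sets, attempt count, and backoff delays are assumptions for illustration:

```python
import time

TRANSIENT = {429, 500, 502, 503, 504}  # assumed transient statuses
UNAUTHORIZED = 401

def with_retries(request, max_attempts=4, base_delay=0.05, sleep=time.sleep):
    """Retry short-lived transient errors with fast backoff; retry an
    unauthorized response immediately, since it may just be an expired
    token that the next attempt refreshes."""
    last = None
    for attempt in range(max_attempts):
        status, payload = request()
        if status == 200:
            return payload
        last = status
        if status == UNAUTHORIZED:
            continue  # no delay: likely an expired token
        if status in TRANSIENT:
            sleep(base_delay * (2 ** attempt))
            continue
        break  # non-retryable error
    raise RuntimeError(f"request failed with status {last}")
```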

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
d4dcdcd46c encryption: azure: Implement init()
Implement the `azure_host::init()` API that performs the async
initialization of the host.

Since the Azure host has no state that needs to be initialized, just
verify that we have access to the Vault key. This will cause the system
to fail earlier if not properly configured (e.g., the key does not
exist, the credentials have insufficient permissions, etc.).

Do not run any verification steps if no master key is configured in
`scylla.yaml`. The master key can be specified later or overridden
through the encryption options in table schema.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
1e519ba329 encryption: azure: Implement get_key_by_id()
Implement the `azure_host::get_key_by_id()` API, which retrieves a data
encryption key from a key ID.

Use a loading cache to reduce the API calls to Key Vault. When the cache
needs to refresh or reload a key, extract the ciphertext from the key ID
and unwrap it with the Vault key that is also encoded in the key ID.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
7938096142 encryption: azure: Add id-based key cache
Add a cache to store data encryption keys based on their IDs. This will
be plugged into `get_key_by_id()` in a later patch to avoid unwrapping
keys that have been encountered recently, thereby reducing the API calls
to Key Vault.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
470513b433 encryption: azure: Implement get_or_create_key()
Implement the `azure_host::get_or_create_key()` API, which returns a
data encryption key for a given algorithm descriptor (cipher algorithm
and key length).

Use a loading cache to reduce the API calls to Key Vault. When the cache
needs to refresh or reload a key, always create a new one and wrap it
with the Vault key.

For the REST API calls to Key Vault, use an ephemeral HTTP client and
configure it to not wait for the server's response when terminating a
TLS connection. Although the TLS protocol requires clients to wait on
the server's response to a close_notify alert, the Key Vault service
ignores this, causing the client to block for 10 seconds (hardcoded)
before timing out.

Use the following identifier for each key:
<vault name>/<key name>/<key version>:<base64 encoded ciphertext of data encryption key>

The key version is required to support Vault key rotations.
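The identifier layout can be sketched as follows. The field layout is exactly the one quoted above; the helper names are illustrative:

```python
import base64

def make_key_id(vault: str, key_name: str, key_version: str,
                wrapped_key: bytes) -> str:
    # <vault name>/<key name>/<key version>:<base64 ciphertext of DEK>
    b64 = base64.b64encode(wrapped_key).decode("ascii")
    return f"{vault}/{key_name}/{key_version}:{b64}"

def parse_key_id(key_id: str):
    path, _, b64 = key_id.partition(":")
    vault, key_name, key_version = path.split("/")
    # The version pins the exact Vault key, so rotating the Vault key
    # doesn't break unwrapping of previously issued DEK ciphertexts.
    return vault, key_name, key_version, base64.b64decode(b64)
```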

Finally, define an exception for Vault errors.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
e76187fb6d encryption: azure: Add credentials in Azure host
The Azure host needs credentials to communicate with Key Vault.

First search for credentials in the host options, and then fall back to
default credentials if the former are non-existent or incomplete.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
457c90056d encryption: azure: Add attribute-based key cache
Add a cache to store data encryption keys based on their attributes
(cipher algorithm + key length). This will be plugged into
`get_or_create_key()` in a later patch to reuse the same keys in
multiple requests, thereby reducing the API calls to Key Vault.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
b39d1b195e encryption: azure: Add skeleton for Azure host
The Azure host manages cryptographic keys using Azure Key Vault.

This patch only defines the API.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
e078abba57 encryption: Templatize get_{kmip,kms,gcp}_host()
For deduplication.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
b1e719c531 encryption: gcp: Fix typo in docstring
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
eec49c4d78 utils: azure: Get access token with default credentials
Attempt to detect credentials from the system.

Inspired by the `DefaultAzureCredential` in the Azure C++ SDK, this
credential type detects credentials from the following sources (in this
order):

* environment variables (SP credentials - same variables as in Azure C++ SDK)
* Azure CLI
* IMDS

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
937d6261c0 utils: azure: Get access token from Azure CLI
Implement token request with Azure CLI.

Inspired by the Azure C++ SDK's `AzureCliCredential`, this credential
type attempts to run the Azure CLI in a shell and parse the token from
its output. This is meant for development purposes, where a user has
already installed the Azure CLI and logged in with their user account.

Pass the following environment to the process:
* PATH
* HOME
* AZURE_CONFIG_DIR

Add a token factory to construct a token from the process output. Unlike
in Azure Entra and IMDS, the CLI's JSON output does not contain
'expires_in', and the token key is in camel case.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
52a4bd83d5 utils: azure: Get access token from IMDS
Implement token request from IMDS.

No credentials are required for that - just a plain HTTP request on the
IMDS token endpoint.

Since the IMDS endpoint is a raw IP, it's not possible to reliably
determine whether IMDS is accessible or not (i.e., whether the node is
an Azure VM). Azure provides no node-local indication either. For lack of
a better choice, attempt to connect and declare failure if the
connection is not established within 3 seconds. Use a raw TCP socket for
this check, as the HTTP client currently lacks timeout or cancellation
support. Perform the check only once, during the first token refresh.
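The probe amounts to a bounded TCP connect. A sketch, assuming IMDS's well-known link-local address 169.254.169.254:80 and the 3-second bound from the commit:

```python
import socket

IMDS_ADDR = ("169.254.169.254", 80)

def imds_reachable(addr=IMDS_ADDR, timeout=3.0) -> bool:
    """Probe IMDS with a raw TCP connect bounded by a timeout, since
    there is no reliable node-local way to tell whether we are running
    on an Azure VM."""
    try:
        with socket.create_connection(addr, timeout=timeout):
            return True
    except OSError:  # refused, unreachable, or timed out
        return False
```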

For the time being, do not support nodes with multiple user-assigned
managed identities. Expect the token request to fail in this case (IMDS
requires the identifier of the desired Managed Identity).

Add a token factory to correctly parse the HTTP response. This addresses
a discrepancy between token requests on IMDS and Azure Entra - the
'expires_in' field is a string in the former and an integer in the
latter.
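The discrepancy can be smoothed over by a small token factory that accepts both encodings. A sketch (the `access_token`/`expires_in` field names follow the OAuth token response; the returned record shape is illustrative):

```python
import json
import time

def parse_token_response(body: str) -> dict:
    """Build a token record from an IMDS or Entra response, coercing
    'expires_in' to an integer: IMDS returns it as a string ("3599"),
    Entra as an integer (3599)."""
    doc = json.loads(body)
    expires_in = int(doc["expires_in"])  # handles both encodings
    return {
        "token": doc["access_token"],
        "expires_at": time.time() + expires_in,
    }
```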

Finally, implement a fail-fast retry policy for short-lived transient
errors.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
919765fb7f utils: azure: Get access token with SP certificate
Implement token request for Service Principals with a certificate.

The request is the same as with a secret, except that the secret is
replaced with an assertion. The assertion is a JWT that is signed with
the certificate.

To be consistent with the Azure C++ SDK, expect the certificate and the
associated private key to be encoded in PEM format and be provided in a
single file.

The docs suggest using 'PS256' for the JWT's 'alg' claim. Since this is
not supported by our current JWT library (jwt-cpp), use 'RS256' instead.

The JWT also requires a unique identifier for the 'jti' claim. Use a
random UUID for that (it should suffice for our use cases).

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
a671530af6 utils: azure: Get access token with SP secret
Implement token request for Service Principals with a secret.

The token request requires a TLS connection. When closing the
connection, do not wait for a response to the TLS `close_notify` alert.
Azure's OAuth server would ignore it and the Seastar `connected_socket`
would hang for 10 seconds.

Add log redaction logic to not expose sensitive data from the request
and response payloads.

Add a token factory to parse the HTTP response. This cannot be shared
with other credential types because the JSON format is not consistent.

Finally, implement a fail-fast retry policy for short-lived transient
errors.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
66c8ffa9bf utils: rest: Add interface for request/response redaction logic
The rest http client, currently used by the AWS and GCP key providers,
logs the HTTP requests and responses unaltered. This causes some
sensitive data to be exposed (plaintext data encryption keys,
credentials, access tokens).

Add an interface to optionally redact any sensitive data from HTTP
headers and payloads.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
0d0135dc4c utils: azure: Declare all Azure credential types
The goal is to mimic the Azure C++ SDK, which offers a variety of
credentials, depending on their type and source.

Declare the following credentials:
* Service Principal credentials
* Managed Identity credentials
* Azure CLI credentials
* Default credentials

Also, define a common exception for SP and MI credentials which are
network-based.

This patch only defines the API.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
3c4face47b utils: azure: Define interface for Azure credentials
Azure authentication is token based - the client obtains an access token
with their credentials, and uses it as a bearer token to authorize
requests to Azure services.

Define a common API for all credential types. The API consists of a
single `get_access_token()` function that returns a new or a cached
access token for some resource URI (which defines the token scope).
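The contract can be sketched as a base class that caches per resource URI and leaves only the actual token fetch to each credential type (a sketch of the pattern only; the real C++ interface differs, and `_fetch_token` and the 60-second margin are assumptions):

```python
import time
from abc import ABC, abstractmethod

class AzureCredentials(ABC):
    """Common API: get_access_token(resource_uri) returns a cached token
    for that scope, refreshing it through the concrete credential type
    when missing or near expiry."""
    def __init__(self):
        self._cache = {}  # resource_uri -> (token, expires_at)

    def get_access_token(self, resource_uri: str) -> str:
        cached = self._cache.get(resource_uri)
        if cached and cached[1] > time.time() + 60:  # assumed safety margin
            return cached[0]
        token, expires_in = self._fetch_token(resource_uri)
        self._cache[resource_uri] = (token, time.time() + expires_in)
        return token

    @abstractmethod
    def _fetch_token(self, resource_uri: str) -> tuple:
        """Obtain a fresh (token, expires_in) pair for the given scope."""
```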

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Nikos Dragazis
57bc51342e utils: Introduce base64url_{encode,decode}
Add helpers for base64url encoding.

base64url is a variant of base64 that uses a URL-safe alphabet. It can
be constructed from base64 by replacing the '+' and '/' characters with
'-' and '_' respectively. Many implementations also strip the padding,
although this is not required by the spec [1].

This will be used in upcoming patches for Azure Key Vault requests that
require base64url-encoded payloads.

[1] https://datatracker.ietf.org/doc/html/rfc4648#section-5
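
The transformation can be sketched with Python's standard library (an illustration of the encoding, not the Scylla helpers):

```python
import base64

def base64url_encode(data: bytes) -> str:
    # urlsafe_b64encode swaps '+'/'/' for '-'/'_'; strip the '='
    # padding as many implementations do (optional per RFC 4648 §5).
    return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")

def base64url_decode(text: str) -> bytes:
    # Re-add the padding before decoding.
    pad = "=" * (-len(text) % 4)
    return base64.urlsafe_b64decode(text + pad)

# 0xfb 0xff encodes to bytes that hit the URL-safe substitutions:
assert base64url_encode(b"\xfb\xff") == "-_8"
assert base64url_decode(base64url_encode(b"hello?>")) == b"hello?>"
```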

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-16 17:14:08 +03:00
Dawid Mędrek
20d0050f4e cdc: Forbid altering columns of CDC log tables directly
The set of columns of a CDC log table should be managed automatically
by Scylla, and the user should not have the ability to manipulate them
directly. That could lead to disastrous consequences such as a
segmentation fault.

In this commit, we're restricting those operations. We also provide two
validation tests.

One of the existing tests had to be adjusted as it modified the type
of a column in a CDC log table. Since the test simply verifies that
the user has sufficient permissions to perform `ALTER TABLE` on the log
table, the test is still valid.

Fixes scylladb/scylladb#24643
2025-07-16 15:35:48 +02:00
Patryk Jędrzejczak
a654101c40 Merge 'test.py: add missed parameters that should be passed from test.py to pytest' from Andrei Chekun
Several parameters that `test.py` should pass through pytest to boost were missing. This PR adds handling for these parameters: `--random-seed` and `--x-log2-compaction-groups`.

Since this code is affected by this issue in 2025.3 and this is only a framework change, a backport to that version is needed.

Fixes: https://github.com/scylladb/scylladb/issues/24927

Closes scylladb/scylladb#24928

* https://github.com/scylladb/scylladb:
  test.py: add bypassing x_log2_compaction_groups to boost tests
  test.py: add bypassing random seed to boost tests
2025-07-16 15:29:17 +02:00
Avi Kivity
c762425ea7 Merge 'auth: move passwords::check call to alien thread' from Andrzej Jackowski
Analysis of customer stalls revealed that the function `detail::hash_with_salt` (invoked by `passwords::check`) often blocks the reactor. Internally, this function uses the external `crypt_r` function to compute password hashes, which is CPU-intensive.

This PR addresses the issue in two ways:
1) `sha-512` is now the only password hashing scheme for new passwords (it was already the common-case).
2) `passwords::check` is moved to a dedicated alien thread.

Regarding point 1: before this change, the following hashing schemes were supported by `identify_best_supported_scheme()`: bcrypt_y, bcrypt_a, SHA-512, SHA-256, and MD5. The reason for this was that the `crypt_r` function used for password hashing comes from an external library (currently `libxcrypt`), and the supported hashing algorithms vary depending on the library in use. However:
- The bcrypt schemes never worked properly because their prefixes lack the required round count (e.g. `$2y$` instead of `$2y$05$`). Moreover, bcrypt is slower than SHA-512, so it is not a good idea to fix or use it.
- SHA-256 and SHA-512 both belong to the SHA-2 family. Libraries that support one almost always support the other, so it’s very unlikely to find SHA-256 without SHA-512.
- MD5 is no longer considered secure for password hashing.

Regarding point 2: the `passwords::check` call now runs on a shared alien thread created at database startup. An `std::mutex` synchronizes that thread with the shards. In theory this could introduce frequent lock contention, but in practice each shard handles only a few hundred new connections per second, even during storms. The `_conns_cpu_concurrency_semaphore` in `generic_server` already limits the number of concurrent connection handlers.

Fixes https://github.com/scylladb/scylladb/issues/24524

Backport not needed, as it is a new feature.

Closes scylladb/scylladb#24924

* github.com:scylladb/scylladb:
  main: utils: add thread names to alien workers
  auth: move passwords::check call to alien thread
  test: wait for 3 clients with given username in test_service_level_api
  auth: refactor password checking in password_authenticator
  auth: make SHA-512 the only password hashing scheme for new passwords
  auth: whitespace change in identify_best_supported_scheme()
  auth: require scheme as parameter for `generate_salt`
  auth: check password hashing scheme support on authenticator start
2025-07-16 13:15:54 +03:00
Asias He
6c49b7d0ce repair: Speed up ranges calculation when small table optimization is on
Normally, during bootstrap, repair_service::bootstrap_with_repair needs
to carefully calculate which ranges to sync data from for the new node.
With small table optimization on, we pass a single full range and all
peer nodes to row level repair to sync data with. Since we only need to
pass a single range and the full set of peers, there is no need to
calculate the ranges and peers in repair_service::bootstrap_with_repair
only to drop them later. The calculation takes time, which slows down
bootstrap, e.g.,

```
Jul 08 22:01:41.927785 cluster-scale-50-200-test-scayle-t-db-node-51209daa-93 scylla[5326]:
[shard 0:strm] repair - bootstrap_with_repair: started with
keyspace=system_distributed_everywhere, nr_ranges=23809

Jul 08 22:01:57.883797 cluster-scale-50-200-test-scayle-t-db-node-51209daa-93 scylla[5326]:
[shard 0:strm] repair - repair[79eac1a1-5d5b-4028-ae1c-06e68bec2d50]:
sync data for keyspace=system_distributed_everywhere, status=started,
reason=bootstrap, small_table_optimization=true
```

The range calculation took 15 seconds for system_distributed_everywhere
table.

To fix, the ranges calculation is skipped if small table optimization is
on for the keyspace.

Before:
cluster    dev   [ PASS ] cluster.test_boot_nodes.1 104.59s

After:
cluster    dev   [ PASS ] cluster.test_boot_nodes.1 89.23s

A 15% improvement to bootstrap 30 node cluster was observed.

Fixes #24817
2025-07-16 15:33:15 +08:00
Piotr Dulikowski
a14b7f71fe auth: fix crash when migration code runs parallel with raft upgrade
The functions password_authenticator::start and
standard_role_manager::start have a similar structure: they spawn a
fiber which invokes a callback that performs some migration until that
migration succeeds. Both handlers set a shared promise called
_superuser_created_promise (those are actually two promises, one for the
password authenticator and the other for the role manager).

The handlers are similar in both cases. They check if auth is in legacy
mode, and behave differently depending on that. If in legacy mode, the
promise is set (if it was not set before), and some legacy migration
actions follow. In auth-on-raft mode, an attempt is made to create the
superuser, and if it succeeds the promise is _unconditionally_ set.

While it makes sense at a glance to set the promise unconditionally,
there is a non-obvious corner case during upgrade to topology on raft.
During the upgrade, auth switches from the legacy mode to auth on raft
mode. Thus, if the callback didn't succeed in legacy mode and then tries
to run in auth-on-raft mode and succeeds, it will unconditionally set a
promise that was already set - this is a bug and triggers an assertion
in seastar.

Fix the issue by surrounding the `shared_promise::set_value` call with
an `if` - like it is already done for the legacy case.

Fixes: scylladb/scylladb#24975

Closes scylladb/scylladb#24976
2025-07-16 10:22:48 +03:00
Michał Chojnowski
1e7a292ef4 sstables/index_reader: extract a prefetch_lower_bound() method
The sstable reader reaches directly for a `clustered_index_cursor`.
But a BTI index reader won't be able to implement
`clustered_index_cursor`, because a BTI index doesn't store
full clustering keys, only some trie-encoded prefixes.

So we want to weaken the dependency. Instead of reaching
for `clustered_index_cursor`, we add a method which expresses
our intent, and we let `index_reader` touch the cursor internally.
2025-07-16 00:13:20 +02:00
Andrzej Jackowski
77a9b5919b main: utils: add thread names to alien workers
This commit adds a call to `pthread_setname_np` in
`alien_worker::spawn`, so each alien worker thread receives a
descriptive name. This makes debugging, monitoring, and performance
analysis easier by allowing alien workers to be clearly identified
in tools such as `perf`.
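
A Python analogue for illustration (Python-level thread names serve the same identification purpose in debuggers and monitoring; `pthread_setname_np` itself limits native names to 15 characters plus the terminating NUL):

```python
import threading

def spawn_worker(target, name):
    # Give the worker a short, descriptive name so it can be clearly
    # identified when inspecting threads.
    t = threading.Thread(target=target, name=name, daemon=True)
    t.start()
    return t

seen = []
t = spawn_worker(lambda: seen.append(threading.current_thread().name),
                 name="alien-pwhash")
t.join()
assert seen == ["alien-pwhash"]
```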
2025-07-15 23:29:21 +02:00
Andrzej Jackowski
9574513ec1 auth: move passwords::check call to alien thread
Analysis of customer stalls showed that the `detail::hash_with_salt`
function, called from `passwords::check`, often blocks the reactor.
This function internally uses the `crypt_r` function from an external
library to compute password hashes, which is a CPU-intensive operation.

To prevent such reactor stalls, this commit moves the
`passwords::check` call to a dedicated alien thread. This thread is
created at system startup and is shared by all shards.

Within the alien thread, an `std::mutex` synchronizes access between
the thread and the shards. While this could theoretically cause
frequent lock contentions, in practice, even during connection storms,
the number of new connections per second per shard is limited
(typically hundreds per second). Additionally, the
`_conns_cpu_concurrency_semaphore` in `generic_server` ensures that not
too many connections are processed at once.
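
The structure can be sketched in Python, using PBKDF2 as a stand-in for the CPU-intensive `crypt_r` call (names and parameters are illustrative only):

```python
import concurrent.futures
import hashlib
import threading

# One shared worker thread (the "alien" thread), created once at startup.
_worker = concurrent.futures.ThreadPoolExecutor(max_workers=1,
                                                thread_name_prefix="pwcheck")
_lock = threading.Lock()  # serializes access, like the std::mutex above

def _hash_with_salt(password: str, salt: bytes) -> bytes:
    # Stand-in for the expensive crypt_r() hash computation.
    return hashlib.pbkdf2_hmac("sha512", password.encode(), salt, 10_000)

def check_password(password: str, salt: bytes, expected: bytes) -> bool:
    # Submit the expensive hash to the worker so the calling (reactor)
    # thread is not blocked computing it; block only on the result here
    # for brevity.
    def job():
        with _lock:
            return _hash_with_salt(password, salt) == expected

    return _worker.submit(job).result()

salt = b"\x00" * 16
stored = _hash_with_salt("hunter2", salt)
assert check_password("hunter2", salt, stored)
assert not check_password("wrong", salt, stored)
```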

Fixes scylladb/scylladb#24524
2025-07-15 23:29:13 +02:00
Andrzej Jackowski
4ac726a3ff test: wait for 3 clients with given username in test_service_level_api
test_service_level_api tests create a new session and wait for all
clients to authenticate. However, the check that all connections are
authenticated is done by verifying that there are no connections
with the username 'anonymous', which is insufficient if new connections
have not yet been listed.

To avoid test failures, this commit introduces an additional check that
verifies all expected clients are present in the system.clients table
before proceeding with the test.
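
The polling check can be sketched as follows (a hypothetical helper; the real test queries the system.clients table):

```python
import time

def wait_for_clients(query_fn, username, expected, timeout=5.0, interval=0.05):
    # Poll until at least `expected` sessions with `username` are listed,
    # instead of only checking that no 'anonymous' connections remain.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        rows = query_fn()
        if sum(1 for r in rows if r["username"] == username) >= expected:
            return True
        time.sleep(interval)
    return False

# Simulated system.clients contents filling in over successive polls.
polls = [[{"username": "anonymous"}],
         [{"username": "sl_user"}],
         [{"username": "sl_user"}] * 3]
def fake_query(state={"i": 0}):
    rows = polls[min(state["i"], len(polls) - 1)]
    state["i"] += 1
    return rows

assert wait_for_clients(fake_query, "sl_user", expected=3)
```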
2025-07-15 23:28:39 +02:00
Andrzej Jackowski
8d398fa076 auth: refactor password checking in password_authenticator
This commit splits an if statement to two ifs, to make it possible
to call `password::check` function from another (alien) thread in
the next commit of this patch series.

Ref. scylladb/scylladb#24524
2025-07-15 23:28:39 +02:00
Andrzej Jackowski
b3c6af3923 auth: make SHA-512 the only password hashing scheme for new passwords
Before this change, the following hashing schemes were supported by
`identify_best_supported_scheme()`: bcrypt_y, bcrypt_a, SHA-512,
SHA-256, and MD5. The reason for this was that the `crypt_r` function
used for password hashing comes from an external library (currently
`libxcrypt`), and the supported hashing algorithms vary depending
on the library in use.

However:
 - The bcrypt algorithms do not work because their scheme
   prefix lacks the required round count (e.g., it is `$2y$` instead of
   `$2y$05$`). We suspect this never worked as intended. Moreover,
   bcrypt tends to be slower than SHA-512, so we do not want to fix the
   prefix and start using it.
 - SHA-256 and SHA-512 are both part of the SHA-2 family, and libraries
   that support one almost always support the other. It is not expected
   to find a library that supports only SHA-256 but not SHA-512.
 - MD5 is not considered secure for password hashing.

Therefore, this commit removes support for bcrypt_y, bcrypt_a, SHA-256,
and MD5 for hashing new passwords to ensure that the correct hashing
function (SHA-512) is used everywhere.

This commit does not change the behavior of `passwords::check`, so
it is still possible to use passwords hashed with the removed
algorithms.

Ref. scylladb/scylladb#24524
2025-07-15 23:28:33 +02:00
Andrzej Jackowski
62e976f9ba auth: whitespace change in identify_best_supported_scheme()
Remove tabs in `identify_best_supported_scheme()` to facilitate
reuse of those lines after the for loop is removed. This change is
motivated by the upcoming removal of support for obsolete password
hashing schemes and removal of `identify_best_supported_scheme()`
function.

Ref. scylladb/scylladb#24524
2025-07-15 20:26:39 +02:00
Andrzej Jackowski
b20aa7b5eb auth: require scheme as parameter for generate_salt
This is a refactoring commit that changes the `generate_salt` function
to require a password hashing scheme as a parameter. This change is
motivated by the upcoming removal of support for obsolete password
hashing schemes and removal of `identify_best_supported_scheme()`
function.

Ref. scylladb/scylladb#24524
2025-07-15 20:26:39 +02:00
Andrzej Jackowski
c4e6d9933d auth: check password hashing scheme support on authenticator start
This commit adds a check to the `password_authenticator` to ensure
that at least one of the available password hashing schemes is
supported by the current environment. It is better to fail at system
startup rather than on the first attempt to use the password
authenticator. This change is motivated by the upcoming removal
of support for obsolete password hashing schemes and removal of
`identify_best_supported_scheme()` function.

Ref. scylladb/scylladb#24524
2025-07-15 20:26:33 +02:00
Botond Dénes
a26b6a3865 Merge 'storage: add make_data_or_index_source to the storages' from Ernest Zaslavsky
Add `make_data_or_index_source` to the storages to utilize the new S3-based data source, which should improve restore performance.

* Introduce the `encrypted_data_source` class that wraps an existing data source to read and decrypt data on the fly using block encryption. Also add unit tests to verify correct decryption behavior.
* Add `make_data_or_index_source` to the `storage` interface; implement it for `filesystem_storage`, which simply creates a `data_source` from a file, and for `s3_storage`, which creates a (possibly) decrypting source from S3's make_download_source. This change should improve performance when reading large objects from S3 and should not affect `filesystem_storage` at all.

No backport needed since it enhances functionality which has not been released yet

fixes: https://github.com/scylladb/scylladb/issues/22458

Closes scylladb/scylladb#23695

* github.com:scylladb/scylladb:
  sstables: Start using `make_data_or_index_source` in `sstable`
  sstables: refactor readers and sources to use coroutines
  sstables: coroutinize futurized readers
  sstables: add `make_data_or_index_source` to the `storage`
  encryption: refactor key retrieval
  encryption: add `encrypted_data_source` class
2025-07-15 13:32:13 +03:00
Andrei Chekun
a8fd38b92b test.py: skip discovery when combined_test binary absent
To discover which tests are included in combined_tests, pytest checks the
binary at the very beginning. If the combined_tests binary is missing,
discovery fails and no tests run, even tests that were not included in
combined_tests. This PR changes the behavior so that a missing
combined_tests binary no longer fails discovery; it only fails when
someone tries to run a test from it.

Closes scylladb/scylladb#24761
2025-07-15 09:49:02 +02:00
Ernest Zaslavsky
8d49bb8af2 sstables: Start using make_data_or_index_source in sstable
Convert all necessary methods to be awaitable. Start using `make_data_or_index_source`
when creating data_source for data and index components.

For proper working of compressed/checksummed input streams, start passing
stream creator functors to `make_(checksummed/compressed)_file_(k_l/m)_format_input_stream`.
2025-07-15 10:10:23 +03:00
Ernest Zaslavsky
dff9a229a7 sstables: refactor readers and sources to use coroutines
Refactor readers and sources to support coroutine usage in
preparation for integration with `make_data_or_index_source`.
Move coroutine-based member initialization out of constructors
where applicable, and defer initialization until first use.
2025-07-15 10:10:23 +03:00
Pavel Emelyanov
4debe3af5d scylla-gdb: Don't show io_queue executing and queued resources
These counters are no longer accounted by io-queue code and are always
zero. Even more -- accounting removal happened years ago and we don't
have Scylla versions built with seastar older than that.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#24835
2025-07-15 07:41:20 +03:00
Botond Dénes
641a907b37 Merge 'test/alternator: clean up write isolation default and add more tests for the different modes' from Nadav Har'El
In #24442 it was noticed that accidentally, for a year now, test.py and CI were running the Alternator functional tests (test/alternator) using one write isolation mode (`only_rmw_uses_lwt`) while the manual test/alternator/run used a different write isolation mode (`always_use_lwt`). There is no good reason for this discrepancy, so in the second patch of this 2-patch series we change test/alternator/run to use the write isolation mode that we've had in CI for the last year.

But then, discussion on #24442 started: Instead of picking one mode or the other, don't we need to test both modes? In fact, all four modes?

The honest answer is that running **all tests** with **all combinations of options** is not practical - we'll find ourselves with an exponentially growing number of tests. What we really need to do is to run most tests that have nothing to do with write isolation modes on just one arbitrary write isolation mode like we're doing today. For example, numerous tests for the finer details of the ConditionExpression syntax will run on one mode. But then, have a separate test that verifies that one representative example of ConditionExpression (for example) works correctly on all four write isolation modes - rejected in forbid_rmw mode, allowed and behaves as expected on the other three. We had **some** tests like that in our test suite already, but the first patch in this series adds many more, making the test much more exhaustive and making it easier to review that we're really testing all four write isolation modes in every scenario that matters.

Fixes #24442

No need to backport this patch - it's just adding more tests and changing developer-only test behavior.

Closes scylladb/scylladb#24493

* github.com:scylladb/scylladb:
  test/alternator: make "run" script use only_rmw_uses_lwt
  test/alternator: improve tests for write isolation modes
2025-07-15 07:16:18 +03:00
Patryk Jędrzejczak
21edec1ace test: test_zero_token_nodes_multidc: properly handle reads with CL=ONE
The test could fail with RF={DC1: 2, DC2: 0} and CL=ONE when:
- both writes succeeded with the same replica responding first,
- one of the following reads succeeded with the other replica
  responding before it applied mutations from any of the writes.

We fix the test by not expecting reads with CL=ONE to return a row.

We also harden the test by inserting different rows for every pair
(CL, coordinator), where one of the two coordinators is a normal
node from DC1, and the other one is a zero-token node from DC2.
This change makes sure that, for example, every write really
inserts a row.

Fixes scylladb/scylladb#22967

The fix addresses CI flakiness and only changes the test, so it
should be backported.

Closes scylladb/scylladb#23518
2025-07-15 07:14:09 +03:00
Botond Dénes
2d3965c76e Merge 'Reduce Alternator table name length limit to 192 and fix crash when adding stream to table with very long name' from Nadav Har'El
Before this series, it is possible to crash Scylla (due to an I/O error) by creating an Alternator table close to the maximum name length of 222, and then enabling Alternator Streams. This series fixes this bug in two ways:

1. On a pre-existing table whose name might be up to 222 characters, enabling Streams will check if the resulting name is too long, and if it is, fail with a clear error instead of crashing. This case will affect pre-existing tables whose name has between 207 and 222 characters (207 is `222 - strlen("_scylla_cdc_log")`) - for such tables enabling Streams will fail, but no longer crash.
2. For new tables, the table name length limit is lowered from 222 to 192. The new limit is still high enough, but ensures it will be possible to enable streams on any new table. It will also always be possible to add a GSI for such a table with a name of up to 29 characters (if the table name is shorter, the GSI name can be longer - the sum can be up to 221 characters).
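
The arithmetic behind the limits can be checked directly (the constant and function names are made up for this sketch; the values come from the commit message):

```python
CDC_LOG_SUFFIX = "_scylla_cdc_log"
OLD_TABLE_NAME_LIMIT = 222
NEW_TABLE_NAME_LIMIT = 192

def can_enable_streams(table_name: str) -> bool:
    # A pre-existing table can enable Streams only if the derived CDC
    # log table name still fits under the old limit.
    return len(table_name) + len(CDC_LOG_SUFFIX) <= OLD_TABLE_NAME_LIMIT

# 207 is exactly 222 - len("_scylla_cdc_log"):
assert OLD_TABLE_NAME_LIMIT - len(CDC_LOG_SUFFIX) == 207
assert can_enable_streams("t" * 207)
assert not can_enable_streams("t" * 208)
# The new 192 limit leaves room for the CDC suffix on any new table:
assert NEW_TABLE_NAME_LIMIT + len(CDC_LOG_SUFFIX) <= OLD_TABLE_NAME_LIMIT
```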

No need to backport, Alternator Streams is still an experimental feature and this patch just improves the unlikely situation of extremely long table names.

Fixes #24598

Closes scylladb/scylladb#24717

* github.com:scylladb/scylladb:
  alternator: lower maximum table name length to 192
  alternator: don't crash when adding Streams to long table name
  alternator: split length limit for regular and auxiliary tables
  alternator: avoid needlessly validating table name
2025-07-15 06:57:04 +03:00
Botond Dénes
26f135a55a Merge 'Make KMIP host do nice TLS close on dropped connection + make PyKMIP test fixure not generate TLS noise + remove boost::process' from Calle Wilund
Fixes #24873

In the KMIP host, when we release a connection (socket) because the connection pool for the host is full, we currently don't close the connection properly; we only rely on destructors.

This just makes sure `release` closes the connection if it neither retains nor caches it.

Also, when running with the PyKMIP fixture, we tested that the port was reachable using a plain socket. This makes Python SSL generate errors, producing log noise that looks like actual errors.
Change the test setup to use a proper TLS connection and a proper shutdown to avoid the noisy logs.

This also adds a fixture helper for processes, and moves the EAR test to use it (and by extension, seastar::experimental::process) instead of boost::process, removing a nasty non-seastarish dependency.

Closes scylladb/scylladb#24874

* github.com:scylladb/scylladb:
  encryption_test: Make PyKMIP run under seastar::experimental::process
  test/lib: Add wrapper helper for test process fixtures
  kmip_host: Close connections properly if dropped by pool being full
  encryption_at_rest_test: Do port check using TLS
2025-07-15 06:55:34 +03:00
Botond Dénes
1f9f43d267 Merge 'kms_host: Support external temporary security credentials' from Nikos Dragazis
This PR extends the KMS host to support temporary AWS security credentials provided externally via the Scylla configuration file, environment variables, or the AWS credentials file.

The KMS host already supports:
* Temporary credentials obtained automatically from the EC2 instance metadata service or via IAM role assumption.
* Long-term credentials provided externally via configuration, environment, or the AWS credentials file.

This PR is about temporary credentials that are external, i.e., not generated by Scylla. Such credentials may be issued, for example, through identity federation (e.g., Okta + gimme-aws-creds).

External temporary credentials are useful for short-lived tasks like local development, debugging corrupted SSTables with `scylla-sstable`, or other local testing scenarios. These credentials are temporary and cannot be refreshed automatically, so this method is not intended for production use.

Documentation has been updated to mention these additional credential sources.

Fixes #22470.

New feature, no backport is needed.

Closes scylladb/scylladb#22465

* github.com:scylladb/scylladb:
  doc: Expose new `aws_session_token` option for KMS hosts
  kms_host: Support authn with temporary security credentials
  encryption_config: Mention environment in credential sources for KMS
2025-07-15 06:45:39 +03:00
Jenkins Promoter
41bc6a8e86 Update pgo profiles - x86_64 2025-07-15 04:54:17 +03:00
Jenkins Promoter
b86674a922 Update pgo profiles - aarch64 2025-07-15 04:49:45 +03:00
Nadav Har'El
a248336e66 alternator: clean up by co-routinizing
Reviewers of the previous patch complained on some ugly pre-existing
code in alternator/executor.cc, where returning from an asynchronous
(future) function require lengthy verbose casts. So this patch cleans
up a few instances of these ugly casts by using co_return instead of
return.

For example, the long and verbose

    return make_ready_future<executor::request_return_type>(
        rjson::print(std::move(response)));

can be changed to the shorter and more readable

    co_return rjson::print(std::move(response));

This patch should not have any functional implications, and also not any
performance implications: I only coroutinized slow-path functions and
one function that was already "partially" coroutinized (and this was
especially ugly and deserved being fixed).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-14 18:41:35 +03:00
Nadav Har'El
13ec94107a alternator: avoid spamming the log when failing to write response
Both make_streamed() and new make_streamed_with_extra_array() functions,
used when returning a long response in Alternator, would write an error-
level log message if it failed to write the response. This log message
is probably not helpful, and may spam the log if the application causes
repeated errors intentionally or accidentally.

So drop these log messages. The exception is still thrown as usual.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-14 18:41:34 +03:00
Nadav Har'El
d8fab2a01a alternator: clean up and simplify request_return_type
The previous patch introduced a function make_streamed_with_extra_array
which was a duplicate of the existing make_streamed. Reviewers
complained how baroque the new function is (just like the old function),
having to jump through hoops to return a copyable function working
on non-copyable objects, making strange-named copies and shared pointers
of everything.

We needed to return a copyable function (std::function) just because
Alternator used Seastar's json::json_return_type in the return type
from executor function (request_return_type). This json_return_type
contained either a sstring or an std::function, but neither was ever
really appropriate:

  1. We want to return noncopyable_function, not an std::function!
  2. We want to return an std::string (which rjson::print() returns),
     not an sstring!

So in this patch we stop using seastar::json::json_return_type
entirely in Alternator.

Alternator's request_return_type is now an std::variant of *three* types:
  1. std::string for short responses,
  2. noncopyable_function for long streamed response
  3. api_error for errors.

The ugliest parts of make_streamed() where we made copies and shared
pointers to allow for a copyable function are all gone. Even nicer, a
lot of other ugly relics of using seastar::json_return_type are gone:

1. We no longer need obscure classes and functions like make_jsonable()
   and json_string() to convert strings to response bodies - an operation
   can simply return a string directly - usually returning
   rjson::print(value) or a fixed string like "" and it just works.

2. There is no more usage of seastar::json in Alternator (except one
   minor use of seastar::json::formatter::to_json in streams.cc that
   can be removed later). Alternator uses RapidJSON for its JSON
   needs, we don't need to use random pieces from a different JSON
   library.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-14 18:41:34 +03:00
Nadav Har'El
2385fba4b6 alternator: avoid oversized allocation in Query/Scan
This patch fixes one cause of oversized allocations - and therefore
potentially stalls and increased tail latencies - in Alternator.

Alternator's Scan or Query operation return a page of results. When the
number of items is not limited by a "Limit" parameter, the default is
to return a 1 MB page. If items are short, a large number of them can
fit in that 1MB. The test test_query.py::test_query_large_page_small_rows
has 30,000 items returned in a single page.

In the response JSON, all these items are returned in a single array
"Items". Before this patch, we build the full response as a RapidJSON
object before sending it. The problem is that unfortunately, RapidJSON
stores arrays as contiguous allocations. This results in large
contiguous allocations in workloads that scan many small items, and
large contiguous allocations can also cause stalls and high tail
latencies. For example, before this patch, running

    test/alternator/run --runveryslow \
        test_query.py::test_query_large_page_small_rows

reports in the log:

    oversized allocation: 573440 bytes.

After this patch, this warning no longer appears.
The patch solves the problem by collecting the scanned items not in a
RapidJSON array, but rather in a chunked_vector<rjson::value>, i.e,
a chunked (non-contiguous) array of items (each a JSON value).
After collecting this array separately from the response object, we
need to print its content without actually inserting it into the object -
we add a new function print_with_extra_array() to do that.

The new separate-chunked-vector technique is used when a large number
(currently, >256) of items were scanned. When there is a smaller number
of items in a page (this is typical when each item is longer), we just
insert those items in the object and print it as before.
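
The chunking idea can be sketched in Python (the names follow the commit message, but the implementation is only an illustration; the point is the chunked intermediate storage rather than one array that grows with the page size):

```python
import json

def collect_items_chunked(items, chunk_size=256):
    # Accumulate results in fixed-size chunks instead of one contiguous
    # array, so no single allocation grows with the number of items.
    chunks, current = [], []
    for item in items:
        current.append(item)
        if len(current) == chunk_size:
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks

def print_with_extra_array(response: dict, chunks, array_name="Items") -> str:
    # Print the response with the chunked array spliced in as a JSON
    # array, without ever inserting it into the response object.
    inner = json.dumps(response)[1:-1]  # response fields, braces stripped
    array = "[" + ",".join(json.dumps(i) for c in chunks for i in c) + "]"
    sep = "," if inner else ""
    return "{" + inner + sep + json.dumps(array_name) + ":" + array + "}"

# 700 (>256) scanned items, as in the new test described below:
chunks = collect_items_chunked({"pk": i} for i in range(700))
assert [len(c) for c in chunks] == [256, 256, 188]
out = print_with_extra_array({"Count": 700}, chunks)
assert json.loads(out) == {"Count": 700,
                           "Items": [{"pk": i} for i in range(700)]}
```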

Beyond the original slow test that demonstrated the oversized allocation
(which is now gone), this patch also includes a new test which
exercises the new code with a scan of 700 (>256) items in a page -
but this new test is fast enough to be permanently in our test suite
and not a manual "veryslow" test as the other test.

Fixes #23535
2025-07-14 18:41:34 +03:00
Patryk Jędrzejczak
145a38bc2e Merge 'raft: fix voter assignment of transitioning nodes' from Emil Maskovsky
Previously, nodes would become voters immediately after joining, ensuring voter status was established before bootstrap completion. With the limited voters feature, voter assignment became deferred, creating a timing gap where nodes could finish bootstrapping without becoming voters.

This timing issue could lead to quorum loss scenarios, particularly observed in tests but theoretically possible in production environments.

This commit reorders voter assignment to occur before the `update_topology_state()` call, ensuring nodes achieve voter status before bootstrap operations are marked complete. This prevents the problematic timing gap while maintaining compatibility with limited voters functionality.

If voter assignment succeeds but topology state update fails, the operation will raise an exception and be retried by the topology coordinator, maintaining system consistency.

This commit also fixes an issue where `update_nodes` ignored leaving voters, potentially exceeding the voter limit and leaving voters unaccounted for.

Fixes: scylladb/scylladb#24420

No backport: Fix of a theoretical bug + CI stability improvement (we can backport eventually later if we see hits in branches)

Closes scylladb/scylladb#24843

* https://github.com/scylladb/scylladb:
  raft: fix voter assignment of transitioning nodes
  raft: improve comments in group0 voter handler
2025-07-14 16:12:03 +02:00
Calle Wilund
722e2bce96 encryption_test: Make PyKMIP run under seastar::experimental::process
Removes the requirement of boost::process, and all its non-seastar-ness.
Hopefully also makes the IO and shutdown handling a bit more reliable.
2025-07-14 12:18:16 +00:00
Calle Wilund
253323bb64 test/lib: Add wrapper helper for test process fixtures
Adds a wrapper for seastar::experimental::process, to help
use external process fixtures in unit test. Mainly to share
concepts such as line reading of stdout/err etc, and sync
the shutdown of these. Also adds a small path searcher to
find what you want to run.
2025-07-14 12:18:16 +00:00
Yaron Kaikov
fdcaa9a7e7 dist/common/scripts/scylla_sysconfig_setup: fix SyntaxWarning: invalid escape sequence
There are invalid escape sequence warnings where raw strings should be used for the regex patterns
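
A minimal demonstration of the warning and the fix:

```python
import re

# "\d" in a plain string literal is an invalid escape sequence: Python
# does not define it, so newer interpreters emit
#   SyntaxWarning: invalid escape sequence '\d'
# at compile time. A raw string makes the intent explicit and silences
# the warning without changing what the regex engine sees.
bad = "(\d+)MB"      # relies on Python passing '\d' through unchanged
good = r"(\d+)MB"    # raw string: the backslash is literal by definition

assert bad == good   # same runtime value; only the literal form differs
assert re.fullmatch(good, "2048MB").group(1) == "2048"
```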

Fixes: https://github.com/scylladb/scylladb/issues/24915

Closes scylladb/scylladb#24916
2025-07-14 11:20:41 +02:00
Benny Halevy
692b79bb7d compaction: get_max_purgeable_timestamp: improve trace log messages
Print the keyspace.table names, and also issue trace log messages
when returning early because tombstone_gc is disabled or
gc_check_only_compacting_sstables is set.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#24914
2025-07-14 11:16:58 +02:00
Calle Wilund
514fae8ced kmip_host: Close connections properly if dropped by pool being full
Fixes #24873

Note: this almost never happens. But if we, in the KMIP host, release
a connection (socket) because our connection pool for the host is
full, we currently don't close the connection properly; we only rely on
destructors.

While not very serious, this would lead to possible TLS errors in the
KMIP host used, which should be avoided if possible.

The fix is simple: make release close the connection if it neither retains
nor caches it.
2025-07-14 08:31:02 +00:00
Calle Wilund
0fe8836073 encryption_at_rest_test: Do port check using TLS
If we connect using just a socket, and don't terminate connection
nicely, we will get annoying errors in PyKMIP log. These distract
from real errors. So avoid them.
2025-07-14 08:31:02 +00:00
Yaron Kaikov
ed7c7784e4 auto-backport.py: Avoid bot push to existing backport branches
Changed the backport logic so that the bot only pushes the backport branch if it does not already exist in the remote fork.
If the branch exists, the bot skips the push, allowing only users to update (force-push) the branch after the backport PR is open.

Fixes: https://github.com/scylladb/scylladb/issues/24953

Closes scylladb/scylladb#24954
2025-07-14 11:20:23 +03:00
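The new backport rule above can be sketched as a small predicate (hypothetical names; the actual script would query the remote, e.g. via `git ls-remote`, to build the branch set):

```python
def should_push_backport(branch: str, remote_branches: set[str]) -> bool:
    """Decide whether the bot may push a backport branch.

    Push only if the branch does not already exist in the remote fork.
    If it exists, skip the push so that only users update (force-push)
    the branch after the backport PR is open.
    """
    return branch not in remote_branches
```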
Avi Kivity
6fce817aa8 Merge 'Atomic in-memory schema changes application' from Marcin Maliszkiewicz
This change prepares the ground for state update unification for raft-bound subsystems. It introduces schema_applier, which in the future will become a generic interface for applying mutations in raft.

Pulling database::apply() out of the schema merging code will allow batching changes to subsystems. Future generic code will first call prepare() on all implementations, then a single database::apply(), then update() on all implementations; then on each shard it will call commit() for all implementations without preemption, so that the change is observed as atomic across all subsystems, and finally post_commit().

Backport: no, it's a new feature

Fixes: https://github.com/scylladb/scylladb/issues/19649
Fixes https://github.com/scylladb/scylladb/issues/24531

Closes scylladb/scylladb#24886

[avi: adjust for std::vector<mutations> -> utils::chunked_vector<mutations>]

* github.com:scylladb/scylladb:
  test: add type creation to test_snapshot
  storage_service: always wake up load balancer on update tablet metadata
  db: schema_applier: call destroy also when exception occurs
  db: replica: simplify seeding ERM during schema change
  db: remove cleanup from add_column_family
  db: abort on exception during schema commit phase
  db: make user defined types changes atomic
  replica: db: make keyspace schema changes atomic
  db: atomically apply changes to tables and views
  replica: make truncate_table_on_all_shards get whole schema from table_shards
  service: split update_tablet_metadata into two phases
  service: pull out update_tablet_metadata from migration_listener
  db: service: add store_service dependency to schema_applier
  service: simplify load_tablet_metadata and update_tablet_metadata
  db: don't perform move on tablet_hint reference
  replica: split add_column_family_and_make_directory into steps
  replica: db: split drop_table into steps
  db: don't move map references in merge_tables_and_views()
  db: introduce commit_on_shard function
  db: access types during schema merge via special storage
  replica: make non-preemptive keyspace create/update/delete functions public
  replica: split update keyspace into two phases
  replica: split creating keyspace into two functions
  db: rename create_keyspace_from_schema_partition
  db: decouple functions and aggregates schema change notification from merging code
  db: store functions and aggregates change batch in schema_applier
  db: decouple tables and views schema change notifications from merging code
  db: store tables and views schema diff in schema_applier
  db: decouple user type schema change notifications from types merging code
  service: unify keyspace notification functions arguments
  db: replica: decouple keyspace schema change notifications to a separate function
  db: add class encapsulating schema merging
2025-07-13 20:47:55 +03:00
Benny Halevy
3feb759943 everywhere: use utils::chunked_vector for list of mutations
Currently, we use std::vector<*mutation> to keep
a list of mutations for processing.
This can lead to large allocation, e.g. when the vector
size is a function of the number of tables.

Use a chunked vector instead to prevent oversized allocations.

`perf-simple-query --smp 1` results obtained for fixed 400MHz frequency
and PGO disabled:

Before (read path):
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=read, query_single_key=no, counters=no}
Disabling auto compaction
Creating 10000 partitions...

89055.97 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39417 insns/op,   18003 cycles/op,        0 errors)
103372.72 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39380 insns/op,   17300 cycles/op,        0 errors)
98942.27 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39413 insns/op,   17336 cycles/op,        0 errors)
103752.93 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39407 insns/op,   17252 cycles/op,        0 errors)
102516.77 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39403 insns/op,   17288 cycles/op,        0 errors)
throughput:
	mean=   99528.13 standard-deviation=6155.71
	median= 102516.77 median-absolute-deviation=3844.59
	maximum=103752.93 minimum=89055.97
instructions_per_op:
	mean=   39403.99 standard-deviation=14.25
	median= 39406.75 median-absolute-deviation=9.30
	maximum=39416.63 minimum=39380.39
cpu_cycles_per_op:
	mean=   17435.81 standard-deviation=318.24
	median= 17300.40 median-absolute-deviation=147.59
	maximum=18002.53 minimum=17251.75
```

After (read path)
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=read, query_single_key=no, counters=no}
Disabling auto compaction
Creating 10000 partitions...
59755.04 tps ( 66.2 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39466 insns/op,   22834 cycles/op,        0 errors)
71854.16 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39417 insns/op,   17883 cycles/op,        0 errors)
82149.45 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39411 insns/op,   17409 cycles/op,        0 errors)
49640.04 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.3 tasks/op,   39474 insns/op,   19975 cycles/op,        0 errors)
54963.22 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.3 tasks/op,   39474 insns/op,   18235 cycles/op,        0 errors)
throughput:
	mean=   63672.38 standard-deviation=13195.12
	median= 59755.04 median-absolute-deviation=8709.16
	maximum=82149.45 minimum=49640.04
instructions_per_op:
	mean=   39448.38 standard-deviation=31.60
	median= 39466.17 median-absolute-deviation=25.75
	maximum=39474.12 minimum=39411.42
cpu_cycles_per_op:
	mean=   19267.01 standard-deviation=2217.03
	median= 18234.80 median-absolute-deviation=1384.25
	maximum=22834.26 minimum=17408.67
```

`perf-simple-query --smp 1 --write` results obtained for fixed 400MHz frequency
and PGO disabled:

Before (write path):
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=write, query_single_key=no, counters=no}
Disabling auto compaction
63736.96 tps ( 59.4 allocs/op,  16.4 logallocs/op,  14.3 tasks/op,   49667 insns/op,   19924 cycles/op,        0 errors)
64109.41 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   49992 insns/op,   20084 cycles/op,        0 errors)
56950.47 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50005 insns/op,   20501 cycles/op,        0 errors)
44858.42 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50014 insns/op,   21947 cycles/op,        0 errors)
28592.87 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50027 insns/op,   27659 cycles/op,        0 errors)
throughput:
	mean=   51649.63 standard-deviation=15059.74
	median= 56950.47 median-absolute-deviation=12087.33
	maximum=64109.41 minimum=28592.87
instructions_per_op:
	mean=   49941.18 standard-deviation=153.76
	median= 50005.24 median-absolute-deviation=73.01
	maximum=50027.07 minimum=49667.05
cpu_cycles_per_op:
	mean=   22023.01 standard-deviation=3249.92
	median= 20500.74 median-absolute-deviation=1938.76
	maximum=27658.75 minimum=19924.32
```

After (write path)
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=write, query_single_key=no, counters=no}
Disabling auto compaction
53395.93 tps ( 59.4 allocs/op,  16.5 logallocs/op,  14.3 tasks/op,   50326 insns/op,   21252 cycles/op,        0 errors)
46527.83 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50704 insns/op,   21555 cycles/op,        0 errors)
55846.30 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50731 insns/op,   21060 cycles/op,        0 errors)
55669.30 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50735 insns/op,   21521 cycles/op,        0 errors)
52130.17 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50757 insns/op,   21334 cycles/op,        0 errors)
throughput:
	mean=   52713.91 standard-deviation=3795.38
	median= 53395.93 median-absolute-deviation=2955.40
	maximum=55846.30 minimum=46527.83
instructions_per_op:
	mean=   50650.57 standard-deviation=182.46
	median= 50731.38 median-absolute-deviation=84.09
	maximum=50756.62 minimum=50325.87
cpu_cycles_per_op:
	mean=   21344.42 standard-deviation=202.86
	median= 21334.00 median-absolute-deviation=176.37
	maximum=21554.61 minimum=21060.24
```

Fixes #24815

Improvement for rare corner cases. No backport required

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#24919
2025-07-13 19:13:11 +03:00
Yaron Kaikov
66ff6ab6f9 packaging: add ps command to dependencies
The ScyllaDB container image doesn't have the ps command installed, while this command is used by the perftune.py script shipped within the same image. This breaks node and container tuning in Scylla Operator.

Fixes: #24827

Closes scylladb/scylladb#24830
2025-07-13 17:09:05 +03:00
Aleksandra Martyniuk
2ec54d4f1a replica: hold compaction group gate during flush
Destructor of database_sstable_write_monitor, which is created
in table::try_flush_memtable_to_sstable, tries to get the compaction
state of the processed compaction group. If at this point
the compaction group is already stopped (and the compaction state
is removed), e.g. due to concurrent tablet merge, an exception is
thrown and the node coredumps.

Add flush gate to compaction group to wait for flushes in
compaction_group::stop. Hold the gate in the seal function in
table::make_memtable_list. The seal function is turned into
a coroutine to ensure it won't throw.

Wait until async_gate is closed before flushing, to ensure that
all data is written into sstables. Stop ongoing compactions
beforehand.

Remove unnecessary flush in tablet_storage_group_manager::merge_completion_fiber.
Stop method already flushes the compaction group.

Fixes: #23911.

Closes scylladb/scylladb#24582
2025-07-13 12:35:19 +03:00
Benny Halevy
0e455c0d45 utils: clear_gently: add support for sets
Since set and unordered_set do not allow modifying
their stored objects in place, we need to first extract
each object, clear it gently, and only then destroy it.

To achieve that, introduce a new Extractable concept,
that extracts all items in a loop and calls clear_gently
on each extracted item, until the container is empty.

Add respective unit tests for set and unordered_set.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#24608
2025-07-13 12:30:45 +03:00
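The extract-then-clear loop described above can be sketched in Python (the real implementation is C++ using a new Extractable concept over std::set/std::unordered_set; this only illustrates the loop shape):

```python
def clear_gently(obj) -> None:
    """Recursively clear a container one element at a time.

    Sets forbid in-place mutation of their elements, so each element is
    first extracted (popped), cleared gently, and only then dropped --
    mirroring the extract-based approach described in the commit.
    """
    if isinstance(obj, set):
        while obj:                 # extract items until the set is empty
            item = obj.pop()       # remove one element from the set
            clear_gently(item)     # clear the extracted element
    elif isinstance(obj, list):
        while obj:
            clear_gently(obj.pop())
    # scalars and immutable values: nothing to clear
```

In the C++ version each extraction yields a node handle, and clear_gently() is awaited on every extracted item so the work stays preemptible.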
Emil Maskovsky
f6bb5cb7a0 raft: fix voter assignment of transitioning nodes
Previously, nodes would become voters immediately after joining, ensuring
voter status was established before bootstrap completion. With the limited
voters feature, voter assignment became deferred, creating a timing gap
where nodes could finish bootstrapping without becoming voters.

This timing issue could lead to quorum loss scenarios, particularly
observed in tests but theoretically possible in production environments.

This commit reorders voter assignment to occur before the
`update_topology_state()` call, ensuring nodes achieve voter status
before bootstrap operations are marked complete. This prevents the
problematic timing gap while maintaining compatibility with limited
voters functionality.

If voter assignment succeeds but topology state update fails, the
operation will raise an exception and be retried by the topology
coordinator, maintaining system consistency.

This commit also fixes an issue where `update_nodes` ignored leaving
voters, potentially exceeding the voter limit and leaving some voters
unaccounted for.

Fixes: scylladb/scylladb#24420
2025-07-11 17:59:12 +02:00
Tomasz Grabiec
dff2b01237 streaming: Avoid deadlock by running view checks in a separate scheduling group
This issue happens with removenode, when RBNO is disabled, so range
streamer is used.

The deadlock happens in a scenario like this:
1. Start 3 nodes: {A, B, C}, RF=2
2. Node A is lost
3. removenode A
4. Both B and C gain ownership of ranges.
5. Streaming sessions are started with crossed directions: B->C, C->B

Readers created by sender side exhaust streaming semaphore on B and C.
Receiver side attempts to obtain a permit indirectly by calling
check_needs_view_update_path(), which reads local tables. That read is
blocked and times-out, causing streaming to fail. The streaming writer
is already using a tracking-only permit.

To avoid that, run the query under a different scheduling group, which
translates to the system semaphore instead of the maintenance
semaphore, to break the dependency. The gossip group was chosen
because it shouldn't be contended and this change should not interfere
with it much.

Fixes: #24807
2025-07-11 16:30:46 +02:00
Tomasz Grabiec
ee2fa58bd6 service: migration_manager: Run group0 barrier in gossip scheduling group
Fixes two issues.

One is potential priority inversion. The barrier will be executed
using scheduling group of the first fiber which triggers it, the rest
will block waiting on it. For example, CQL statements which need to
sync the schema on replica side can block on the barrier triggered by
streaming. That's undesirable. This is theoretical, not proved in the
field.

The second problem is blocking the error path. This barrier is called
from the streaming error handling path. If the streaming concurrency
semaphore is exhausted, and streaming fails due to timeout on
obtaining the permit in check_needs_view_update_path(), the error path
will block too because it will also attempt to obtain the permit as
part of the group0 barrier. Running it in the gossip scheduling group
prevents this.

Fixes #24925
2025-07-11 16:29:31 +02:00
Andrei Chekun
f7c7877ba6 test.py: add bypassing x_log2_compaction_groups to boost tests
Pass through an argument from pytest to boost that was previously missing.
2025-07-11 12:30:09 +02:00
Andrei Chekun
71b875c932 test.py: add bypassing random seed to boost tests
Pass through an argument from pytest to boost that was previously missing.

Fixes: https://github.com/scylladb/scylladb/issues/24927
2025-07-11 12:30:08 +02:00
Gleb Natapov
89f2edf308 api: unregister raft_topology_get_cmd_status on shutdown
In c8ce9d1c60 we introduced the
raft_topology_get_cmd_status REST API, but the commit forgot to
unregister the handler during shutdown.

Fixes #24910

Closes scylladb/scylladb#24911
2025-07-10 17:16:44 +02:00
Andrei Chekun
64a095600b test.py: break the loop when there is no tests for pytest
Quit the repeat loop if the test is under the pytest runner directory and its
name has a typo or it is absent. This avoids going through discovery several
times and allows stopping execution early.
2025-07-10 15:09:28 +02:00
Piotr Dulikowski
d9aec89c4e Merge 'vector_store_client: implement vector_store_client service' from Pawel Pery
The Vector Store service is an HTTP server which provides a vector search index and ANN (Approximate Nearest Neighbor) functionality. Vector Store retrieves metadata & data about indexes from Scylla using the CQL protocol & CDC functionality. Scylla will request ANN search using the HTTP API.

Commits for the patch:
- implement the initial `vector_store_client` service. It also adds a `vector_store_uri` parameter to scylla.
- refactor sequential_producer as abortable
- implement IP addr retrieval from DNS. The URI for Vector Store must contain a DNS name; this commit implements IP addr refreshing functionality
- refactor primary_key as a top-level class. It is needed for the forward declaration of primary_key
- implement the ANN API. It implements the core ANN search request functionality, adds a Vector Store HTTP API description in docs/protocols.md, and implements automatic boost tests with a mocked HTTP server for checking error conditions.

New feature, should not be backported.

Fixes: VECTOR-47
Fixes: VECTOR-45

-~-

Closes scylladb/scylladb#24331

* github.com:scylladb/scylladb:
  vector_store_client: implement ANN API
  cql3: refactor primary_key as a top-level class
  vector_store_client: implement ip addr retrieval from dns
  utils: refactor sequential_producer as abortable
  vector_store_client: implement initial vector_store_client service
2025-07-10 13:18:20 +02:00
Marcin Maliszkiewicz
ace7d53cf8 test: add type creation to test_snapshot
It covers the case when a new type and a new keyspace
are created together.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
15b4db47c7 storage_service: always wake up load balancer on update tablet metadata
Lack of wakeup is error-prone, as it relies on a wakeup occurring
elsewhere.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
c62a022b43 db: schema_applier: call destroy also when exception occurs
Otherwise objects may be destroyed on wrong shard, and assert
will trigger in ~sharded().
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
b103fee5b6 db: replica: simplify seeding ERM during schema change
We know that the caller is running on shard 0, so we can avoid some extra boilerplate.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
44490ceb77 db: remove cleanup from add_column_family
Since we abort now on failure during schema commit
there is no need for cleanup as it only manages in-memory
state.

Explicit cf.stop was added to code paths outside of schema
merging to avoid unnecessary regressions.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
317da13e90 db: abort on exception during schema commit phase
As we have no way to recover from partial commit.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
81c3dabe06 db: make user defined types changes atomic
The same order of creation/destruction is preserved as in the
original code, looking from single shard point of view.

create_types() is called on each shard separately, while in theory
we should be able to reuse results similarly to diff_rows(). But we
don't introduce this optimization yet.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
e3f92328d3 replica: db: make keyspace schema changes atomic
Now all keyspace related schema changes are observable
on given shard as they would be applied atomically.
This is achieved by commit_on_shard() function being
non-preemptive (no futures, no co_awaits).

In the future we'll extend this to the whole schema
and also other subsystems.
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
b18cc8145f db: atomically apply changes to tables and views
In this commit we make use of the split functions introduced before.
Pattern is as follows:
- in merge_tables_and_views we call some preparatory functions
- in schema_applier::update we call non-yielding step
- in schema_applier::post_commit we call cleanups and other finalizing async
  functions

Additionally we introduce frozen_schema_diff because converting
schema_ptr to global_schema_ptr triggers schema registration and
with atomic changes we need to place registration only in commit
phase. Schema freezing is the same method global_schema_ptr uses
to transport schema across shards (via schema_registry cache).
2025-07-10 10:46:55 +02:00
Marcin Maliszkiewicz
19bc6ffcb0 replica: make truncate_table_on_all_shards get whole schema from table_shards
Before, for views and indexes it was fetching the base schema from db (and
a couple of other properties). This is a problem once we introduce atomic
table and view deletion (in the following commit):
once we delete a table it can no longer be fetched from the db object,
and truncation is performed after atomically deleting all relevant
tables/views/indexes.

Now the whole relevant schema will be fetched via global_table_ptr
(table_shards) object.
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
5ad1845bd6 service: split update_tablet_metadata into two phases
In following commits calls will be split in schema_applier.
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
2f840e51d1 service: pull out update_tablet_metadata from migration_listener
It's not a good use of the listener, as there is only one non-empty
implementation. Also, we need to change it further in the following commit,
which makes it incompatible with the listener code.
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
fa157e7e46 db: service: add store_service dependency to schema_applier
There is already implicit logical dependency via migration_notifier
but in the next commits we'll be moving store_service out from it
as we need better control (i.e. return a value from the call).
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
847d7f4a3a service: simplify load_tablet_metadata and update_tablet_metadata
- remove load_tablet_metadata(), instead we add wake_up_load_balancer flag
to update_tablet_metadata(), it reduces number of public functions and
also serves as a comment (removed comment with very similar meaning)

- reimplement the code to not use mutate_token_metadata(), this way
it's more readable and it's also needed as we'll split
update_tablet_metadata() in following commits so that we can have
subroutine which doesn't yield (for ensuring atomicity)
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
e242ae7ee8 db: don't perform move on tablet_hint reference
This lambda is called several times, so there should be no move.
Currently the bug likely doesn't manifest, as the code only works
on shard 0.
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
1c5ec877a7 replica: split add_column_family_and_make_directory into steps
This is similar work as for drop_table in previous commit.

add_column_family_and_make_directory() behaves exactly the same
as before but calls to it in schema_applier will be replaced by
calls directly to split steps. Other usages will remain intact as
they don't need atomicity (like creating system tables at startup).
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
c2cd02272a replica: db: split drop_table into steps
This is done so that actual dropping can be
an atomic step which could be composed with other
schema operations, and eventually all subsystems modified
via raft so that we could introduce atomic changes which
span across different subsystems.

We split drop_table_on_all_shards() into:
- prepare_tables_metadata_change_on_all_shards()
- prepare_drop_table_on_all_shards()
- drop_table()
- cleanup_drop_table_on_all_shards()

prepare_tables_metadata_change_on_all_shards() is necessary
because when applying multiple schema changes at once (e.g. drop
and add tables) we need to lock only once.

We add legacy_drop_table_on_all_shards() which
behaves exactly like old drop_table_on_all_shards() to be
compatible with code which doesn't need to play with atomicity.

Usages of legacy_drop_table_on_all_shards() in schema_applier
will be replaced with direct calls to split functions in the following
commits - that's the place we will take advantage of drop_table not
yielding (as it returns void now).
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
d00266ac49 db: don't move map references in merge_tables_and_views()
Since they are const it's not needed and misleading.
2025-07-10 10:40:43 +02:00
Marcin Maliszkiewicz
fdaff143be db: introduce commit_on_shard function
This will be the place for all atomic schema switching
operations.

Note that atomicity is observed only from single shard
point of view. All shards may switch at slightly different times
as global locking for this is not feasible.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
2e69016c4f db: access types during schema merge via special storage
Once we create types atomically the code which is before commit
may depend on newly added types, so it has to access both old and
new types. New storage called in_progress_types_storage was added.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
71bd452075 replica: make non-preemptive keyspace create/update/delete functions public
As those operations will be managed by schema_applier class. This
will be implemented in following commit.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
dce0e65213 replica: split update keyspace into two phases
- first phase is preemptive (prepare_update_keyspace)
- second phase is non-preemptive (update_keyspace)

This is done so that schema change can be applied atomically.

Additionally, the create keyspace code was changed to share a common
part with the update keyspace flow.

This commit doesn't yet change the behaviour of the code,
as it doesn't guarantee atomicity, it will be done in following
commits.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
734f79e2ad replica: split creating keyspace into two functions
This is done so that in following commits insert_keyspace can be used
to atomically change schema (as it doesn't yield).
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
ec270b0b5e db: rename create_keyspace_from_schema_partition
It only creates keyspace metadata.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
9c856b5785 db: decouple functions and aggregates schema change notification from merging code 2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
32b2786728 db: store functions and aggregates change batch in schema_applier
To be used in following commit.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
bc2d028f77 db: decouple tables and views schema change notifications from merging code
As post_commit() can't be fully implemented at this stage,
it was moved to interim place to keep things working.
It will be moved back later.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
af5e0d7532 db: store tables and views schema diff in schema_applier
It will be used in subsequent commit for moving
notifications code.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
9c8f3216ab db: decouple user type schema change notifications from types merging code
Merging types code now returns generic affected_types structure which
is used both for notifications and dropping types. New static
function drop_types() replaces dropping lambda used before.

While I think it's not necessary for dropping nor notifications to
use per shard copies (like it's using before and after this patch)
it could just use string parameters or something similar but
this requires too many changes in other classes so it's out of scope
here.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
ae81497995 service: unify keyspace notification functions arguments
Keyspace metadata is not used; only the name is needed, so
we can remove those extra find_keyspace() calls.

Moreover there is no need to copy the name.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
45c5c44c2d db: replica: decouple keyspace schema change notifications to a separate function
In following commits we want to separate the updating code from committing
the schema change (making it visible). Since notifications should be issued
after the change is visible, we need to separate them and call them after
committing.

In subsequent commits other notification types will be moved too.

Here we change the order of notification calls with regard to the rest
of the schema updating code. I.e., before this change keyspace notifications
triggered before tables were updated; after this change they trigger once
everything is updated. There is no indication that notification
listeners depend on this behaviour.
2025-07-10 10:40:42 +02:00
Marcin Maliszkiewicz
96332964b7 db: add class encapsulating schema merging
This commit doesn't yet change how schema merging
works but it prepares the ground for it.

We split merging code into several functions.
Main reasons for it are that:

- We want to generalize and create some interface
which each subsystem would use.

- We need to pull mutation's apply() out
of the code because raft will call it directly,
and it will contain a mix of mutations from more
than one subsystem. This is needed because we have
the need to update multiple subsystems atomically
(e.g. auth and schema during auto-grant when creating
a table).

In this commit do_merge_schema() code is split between
prepare(), update(), commit(), post_commit(). The idea
behind each of these phases is described in the comments.
The last 2 phases are not yet implemented as it requires more
code changes but adding schema_applier enclosing class
will help to create some copied state in the future and
implement commit() and post_commit() phases.
2025-07-10 10:40:42 +02:00
Asias He
ccce5f2472 test: Add test_boot_nodes.py
A simple add-node test which can be used to test adding a large number of
nodes to a cluster.
2025-07-10 10:56:53 +08:00
Andrei Chekun
e34569bd92 test.py: handle max failures for pytest repeats
Pytest can handle max failures, but only within one run; it was not affecting
the repeats. A repeat for pytest is just another execution of the process, so
there is no connection between them. With this additional check, repeats will
respect max fails.

Closes scylladb/scylladb#24760
2025-07-09 19:57:58 +02:00
Michael Litvak
fa24fd7cc3 tablets: stop storage group on deallocation
When a tablet transitions to a post-cleanup stage on the leaving replica
we deallocate its storage group. Before the storage can be deallocated
and destroyed, we must make sure it's cleaned up and stopped properly.

Normally this happens during the tablet cleanup stage, when
table::cleanup_table is called, so by the time we transition to the next
stage the storage group is already stopped.

However, it's possible that tablet cleanup did not run in some scenario:
1. The topology coordinator runs tablet cleanup on the leaving replica.
2. The leaving replica is restarted.
3. When the leaving replica starts, still in `cleanup` stage, it
   allocates a storage group for the tablet.
4. The topology coordinator moves to the next stage.
5. The leaving replica deallocates the storage group, but it was not
   stopped.

To address this scenario, we always stop the storage group when
deallocating it. Usually it will be already stopped and complete
immediately, and otherwise it will be stopped in the background.

Fixes scylladb/scylladb#24857
Fixes scylladb/scylladb#24828

Closes scylladb/scylladb#24896
2025-07-09 19:29:14 +03:00
Aleksandra Martyniuk
17272c2f3b repair: Reduce max row buf size when small table optimization is on
If small_table_optimization is on, a repair works on a whole table
simultaneously. It may be distributed across the whole cluster and
all nodes might participate in repair.

On a repair master, the row buffer is copied for each repair peer.
This means that the memory usage scales with the number of peers.

In large clusters, repair with small_table_optimization leads to OOM.

Divide the max_row_buf_size by the number of repair peers if
small_table_optimization is on.

Use max_row_buf_size to calculate number of units taken from mem_sem.

Fixes: https://github.com/scylladb/scylladb/issues/22244.

Closes scylladb/scylladb#24868
2025-07-09 16:55:38 +03:00
Avi Kivity
0138afa63b service: tablet_allocator: avoid large contiguous vector in make_repair_plan()
make_repair_plan() allocates a temporary vector which can grow larger
than our 128k basic allocation unit. Use a chunked vector to avoid
stalls due to large allocations.
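The chunked-vector idea — many small chunks instead of one large contiguous allocation — can be sketched like this (a generic Python model, not Scylla's `utils::chunked_vector`):

```python
class ChunkedVector:
    """Grows by appending fixed-size chunks, so no single allocation
    ever exceeds chunk_size elements."""
    def __init__(self, chunk_size: int = 4):
        self.chunk_size = chunk_size
        self.chunks = [[]]

    def push_back(self, x):
        if len(self.chunks[-1]) >= self.chunk_size:
            self.chunks.append([])   # allocate a new small chunk
        self.chunks[-1].append(x)

    def __len__(self):
        return sum(len(c) for c in self.chunks)

    def __getitem__(self, i):
        return self.chunks[i // self.chunk_size][i % self.chunk_size]
```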

Fixes #24713.

Closes scylladb/scylladb#24801
2025-07-09 12:50:02 +02:00
Pawel Pery
eadbf69d6f vector_store_client: implement ANN API
This patch is a part of vector_store_client sharded service
implementation for a communication with vector-store service.

It implements the ANN search request to the vector-store service: it
sends the request, receives the response, and after parsing it returns
the list of primary keys.

It adds JSON parsing functionality specific to the HTTP ANN API.

It adds a hardcoded HTTP request timeout for retrieving the response
from the Vector Store service.

It also adds an automatic boost test of the ANN search interface, which
uses a mockup HTTP server in the background to simulate the
vector-store service.

It adds documentation for the HTTP API protocol used for ANN
functionality.

Fixes: VS-47
2025-07-09 11:54:51 +02:00
Pawel Pery
5bfce5290e cql3: refactor primary_key as a top-level class
This patch is a part of vector_store_client sharded service
implementation for a communication with vector-store service.

There is a need for a forward declaration of the primary_key class.
This patch moves the nested definition of select_statement::primary_key
out of select_statement into a standalone class in the same
cql3::statements namespace.

Reference: VS-47
2025-07-09 11:54:51 +02:00
Pawel Pery
1f797e2fcd vector_store_client: implement ip addr retrieval from dns
This patch is a part of vector_store_client sharded service
implementation for a communication with vector-store service.

It implements functionality for refreshing the IP address of the
vector-store service DNS name and creating a new HTTP client with that
address. It also provides cleanup of unused HTTP clients. There are
hardcoded intervals for DNS refresh and old HTTP client cleanup, and a
timeout for requesting a new HTTP client.

This patch introduces two background tasks - one for DNS resolving and
one for cleaning up old HTTP clients.

It adds unit tests for possible dns refreshing issues.
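The refresh/cleanup scheme described above can be sketched as follows (a hypothetical Python model; the real service is a Seastar sharded C++ service and these names are invented):

```python
class ClientPool:
    """Re-resolve a DNS name, create a client per address, retire stale ones."""
    def __init__(self, resolve, make_client):
        self.resolve = resolve          # returns the current IP for the DNS name
        self.make_client = make_client  # builds an HTTP client for an IP
        self.clients = {}               # ip -> client
        self.current_ip = None

    def refresh(self):
        # Periodic DNS refresh: on an address change, create a new client.
        ip = self.resolve()
        if ip != self.current_ip:
            self.clients.setdefault(ip, self.make_client(ip))
            self.current_ip = ip

    def cleanup(self):
        # Periodic cleanup: drop clients whose address is no longer current.
        self.clients = {ip: c for ip, c in self.clients.items()
                        if ip == self.current_ip}
```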

Reference: VS-47
Fixes: VS-45
2025-07-09 11:54:51 +02:00
Emil Maskovsky
df37c514d3 raft: improve comments in group0 voter handler
Enhance code documentation in the group0 voter handler implementation.
2025-07-09 10:40:59 +02:00
Pawel Pery
8d3c33f74a utils: refactor sequential_producer as abortable
This patch is a part of vector_store_client sharded service
implementation for a communication with vector-store service.

There is a need for an abortable sequential_producer operator(). The
existing operator() is changed to accept a timeout argument defaulting
to time_point::max() (matching the current default usage), and a new
operator() is added with an abort_source parameter.

Reference: VS-47
2025-07-08 16:29:55 +02:00
Pawel Pery
7bf53fc908 vector_store_client: implement initial vector_store_client service
This patch is a part of vector_store_client sharded service
implementation for a communication with vector-store service.

It adds a `services/vector_store_client.{cc|hh}` sharded service and a
configuration parameter `vector_store_uri` with a
`http://vector-store.dns.name:port` format. If parsing that parameter
fails, an exception is thrown during construction.

For future unit-testing purposes, the patch adds
`vector_store_client_tester` as a way to inject mockup functionality.

This service will be used by the select statements for the Vector search
indexes (see VS-46). For this reason I've added vector_store_client
service in the query processor.

Reference: VS-47 VS-45
2025-07-08 16:29:55 +02:00
Yaniv Michael Kaul
82fba6b7c0 PowerPC: remove ppc stuff
We don't even compile-test it.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#24659
2025-07-08 10:38:23 +03:00
Piotr Dulikowski
6c65f72031 Merge 'batchlog_manager: abort replay of a failed batch on shutdown or node down' from Michael Litvak
When replaying a failed batch and sending the mutation to all replicas, make the write response handler cancellable and abort it on shutdown or if some target is marked down. Also set a reasonable timeout so it gets aborted if it's stuck for some other unexpected reason.

Previously, the write response handler was not cancellable and had no timeout. This could cause a scenario where some write operation by the batchlog manager gets stuck indefinitely, and node shutdown gets stuck as well because it waits for the batchlog manager to complete, without aborting the operation.

Backport to relevant versions since the issue can cause node shutdown to hang.

Fixes scylladb/scylladb#24599

Closes scylladb/scylladb#24595

* github.com:scylladb/scylladb:
  test: test_batchlog_manager: batchlog replay includes cdc
  test: test_batchlog_manager: test batch replay when a node is down
  batchlog_manager: set timeout on writes
  batchlog_manager: abort writes on shutdown
  batchlog_manager: create cancellable write response handler
  storage_proxy: add write type parameter to mutate_internal
2025-07-07 16:48:07 +02:00
Andrei Chekun
ae6dc46046 test.py: skip cleaning artifacts when -s provided
Skip removing any artifacts when -s is provided between test.py
invocations. Logs from a previous run will be overwritten if the tests
are executed one more time. For example:
1. Execute tests A, B, C with parameter -s
2. All logs are present even if tests are passed
3. Execute test B with parameter -s
4. Logs for A and C are from the first run
5. Logs for B are from the most recent run
2025-07-07 15:42:11 +02:00
Patryk Jędrzejczak
2a52834b7f Merge 'Make it easier to debug stuck raft topology operation.' from Gleb Natapov
The series adds more logging and provides a new REST API around topology command rpc execution to allow easier debugging of stuck topology operations.

Backport since we want to have it in production as quickly as possible.

Fixes #24860

Closes scylladb/scylladb#24799

* https://github.com/scylladb/scylladb:
  topology coordinator: log a start and an end of topology coordinator command execution at info level
  topology coordinator: add REST endpoint to query the status of ongoing topology cmd rpc
2025-07-07 15:40:44 +02:00
Michał Hudobski
919cca576f custom_index: do not create view when creating a custom index
Currently we create a view for every index, however
for currently supported custom index classes (vector_index)
that work is redundant, as we store the index in the external
service.

This patch adds a way for custom indexes to choose whether to
create a view when creating the index and makes it so that
for vector indexes the view is not created.
2025-07-07 13:47:07 +02:00
Michał Hudobski
d4002b61dd custom_index: refactor describe for custom indexes
Currently, to describe an index we look at
a corresponding view. However for custom indexes
the view may not exist (as we are removing the views
from vector indexes). This commit adds a way for a custom
index class to override the default describing logic
and provides such an override for the vector_index
class.
2025-07-07 13:47:07 +02:00
Michał Hudobski
5de3adb536 custom_index: remove unneeded duplicate of a static string
We have got a duplicate of the same static string and
the only usage of one of the copies can be easily replaced
2025-07-07 13:47:06 +02:00
Piotr Dulikowski
ea35302617 Merge 'test: audit: enable syslog audit tests' from Andrzej Jackowski
Several audit test issues caused test failures, and as a result, almost all of the audit syslog tests were marked with xfail.
This patch series enables the syslog audit tests, which should finally pass after the following fixes are introduced:
 - bring back commas to audit syslog (scylladb#24410 fix)
 - synchronize audit syslog server
 - fix parsing of syslog messages
 - generate unique uuid for each line in syslog audit
 - allow audit logging from multiple nodes

Fixes: scylladb/scylladb#24410

Test improvements, no backport required.

Closes scylladb/scylladb#24553

* github.com:scylladb/scylladb:
  test: audit: use automatic comparators in AuditEntry
  test: audit: enable syslog audit tests
  test: audit: sort new audit entries before comparing with expected ones
  test: audit: check audit logging from multiple nodes
  test: audit: generate unique uuid for each line in syslog audit
  test: audit: fix parsing of syslog messages
  test: audit: synchronize audit syslog server
  docs: audit: update syslog audit format to the current one
  audit: bring back commas to audit syslog
2025-07-07 12:45:44 +02:00
Pavel Emelyanov
84e1ac5248 sstables: Move versions static-assertion check to .cc file
Thiss check validates that static values of supported versions are "in
sync" with each other. It's enough to do it once when compiling
sstable_version.cc, not every time the header is included.

refs: #1 (not that it helps noticeably, but technically it fits)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#24839
2025-07-07 13:16:21 +03:00
Michael Litvak
d7af26a437 test: test_batchlog_manager: batchlog replay includes cdc
Add a new test that verifies that when replaying batch mutations from
the batchlog, the mutations include cdc augmentation if needed.

This is done in order to verify that it works currently as expected and
doesn't break in the future.
2025-07-07 12:24:05 +03:00
Michael Litvak
a9b476e057 test: test_batchlog_manager: test batch replay when a node is down
Add a test of the batchlog manager replay loop applying failed batches
while some replica is down.

The test reproduces an issue where the batchlog manager tries to replay
a failed batch, doesn't get a response from some replica, and becomes
stuck.

It verifies that the batchlog manager can eventually recover from this
situation and continue applying failed batches.
2025-07-07 12:23:06 +03:00
Michael Litvak
74a3fa9671 batchlog_manager: set timeout on writes
Set a timeout on writes of replayed batches by the batchlog manager.

We want to avoid having infinite timeout for the writes in case it gets
stuck for some unexpected reason.

The timeout is set to be high enough to allow any reasonable write to
complete.
2025-07-07 12:23:06 +03:00
Michael Litvak
7150632cf2 batchlog_manager: abort writes on shutdown
On shutdown of batchlog manager, abort all writes of replayed batches
by the batchlog manager.

To achieve this we set the appropriate write_type to BATCH, and on
shutdown cancel all write handlers with this type.
2025-07-07 12:23:06 +03:00
Michael Litvak
fc5ba4a1ea batchlog_manager: create cancellable write response handler
When replaying a batch mutation from the batchlog manager and sending it
to all replicas, create the write response handler as cancellable.

To achieve this we define a new wrapper type for batchlog mutations -
batchlog_replay_mutation, and this allows us to overload
create_write_response_handler for this type. This is similar to how it's
done with hint_wrapper and read_repair_mutation.
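The wrapper-type dispatch described above can be sketched in a few lines (an illustrative Python analogy for the C++ overload mechanism; the return values are invented):

```python
from dataclasses import dataclass

@dataclass
class batchlog_replay_mutation:
    """Marker wrapper: tagging a mutation with this type lets the proxy
    select a different (cancellable) write-response-handler path."""
    mutation: object

def create_write_response_handler(m):
    # Models the C++ overload set: the wrapper type picks the variant
    # that is registered for cancellation on shutdown.
    if isinstance(m, batchlog_replay_mutation):
        return ("cancellable", m.mutation)
    return ("plain", m)
```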
2025-07-07 12:23:06 +03:00
Michael Litvak
8d48b27062 storage_proxy: add write type parameter to mutate_internal
Currently mutate_internal has a boolean parameter `counter_write` that
indicates whether the write is of counter type or not.

We replace it with a more general parameter that allows indicating the
write type.

It is compatible with the previous behavior - for a counter write, the
type COUNTER is passed, and otherwise a default value will be used
as before.
2025-07-07 12:23:06 +03:00
Nadav Har'El
18b6c4d3c5 alternator: lower maximum table name length to 192
Currently, Alternator allows creating a table with a name up to 222
(max_table_name_length) characters in length. But if you do create
a table with such a long name, you can have some difficulties later:
You you will not be able to add Streams or GSI or LSI to that table,
because 222 is also the absolute maximum length Scylla tables can have
and the auxilliary tables we want to create (CDC log, materialized views)
will go over this absolute limit (max_auxiliary_table_name_length).

This is not nice. DynamoDB users assume that after successfully
creating a table, they can later - perhaps much later - decide to
add Streams or GSI to it, and today if they chose extremely long
names, they won't be able to do this.

So in this patch, we lower max_table_name_length from 222 to 192.
A user will not be able to create tables with longer names, but
the good news is that once successfully creating a table, it will
always be possible to enable Streams on it (the CDC log table has an
extra 15 bytes in its name, and 192 + 15 is less than 222), and it
will be possible to add GSIs with short enough names (if the GSI
name is 29 or less, 192 + 29 + 1 = 222).
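The limit arithmetic above can be checked with a short sketch (constants taken from the commit text; the helper names are hypothetical, not Alternator's actual API):

```python
MAX_TABLE_NAME_LENGTH = 192
MAX_AUXILIARY_TABLE_NAME_LENGTH = 222
CDC_LOG_SUFFIX = "_scylla_cdc_log"   # the 15-character suffix

def can_enable_streams(table_name: str) -> bool:
    # The CDC log table is named tablename + "_scylla_cdc_log".
    return len(table_name) + len(CDC_LOG_SUFFIX) <= MAX_AUXILIARY_TABLE_NAME_LENGTH

def can_add_gsi(table_name: str, gsi_name: str) -> bool:
    # The materialized view is named "tablename:indexname".
    return len(table_name) + 1 + len(gsi_name) <= MAX_AUXILIARY_TABLE_NAME_LENGTH
```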

This patch is a trivial one-line code change, but also includes the
corrected documentation of the limits, and a fix for one test that
previously checked that a table name with length 222 was allowed -
and now needs to check 192 because 222 is no longer allowed.

Note that if a user has existing tables and upgrades Scylla, it
is possible that some pre-existing Alternator tables might have
lengths over 192 (up to 222). This is fine - in the previous patches
we made sure that even in this case, all operations will still work
correctly on these old tables (by not validating the name!), and
we also made sure that attempting to enable Streams may fail when
the name is too long (we do not remove those old checks in this patch,
and don't plan to remove them in the foreseeable future).

Note that the limit we chose - 192 characters - is identical to the
table name limit we recently chose in CQL. It's nicer that we don't
need to memorize two different limits for Alternator and CQL.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-07 11:58:21 +03:00
Nadav Har'El
3ed8e269f9 alternator: don't crash when adding Streams to long table name
Currently, in Alternator it is possible to create a table whose name has
222 characters, and then trying to add Streams to that table results in
an attempt to create a CDC log table with the same name plus a
15-character suffix "_scylla_cdc_log", which resulted (Ref #24598) in
an IO-error and a Scylla shutdown.

This patch adds code to the Stream-adding operations (both CreateTable
and UpdateTable) that validates that the table's name, plus that 15
character suffix, doesn't exceed max_auxiliary_table_name_length, i.e.,
222.

After this patch, if you have a table whose name is between 207 and 222
characters, attempting to enable Streams on it will fail with:

 "Streams cannot be added if the table name is longer than 207 characters."

Note that in the future, if we lower max_table_name_length to below 207,
e.g., to 192, then it will always be possible to add a stream to any
legal table, and the new checks we had here will be mostly redundant.
But only "mostly" - not entirely: Checking in UpdateTable is still
important because of the possibility that an upgrading user might have
a pre-existing table whose name is longer than the new limit, and might
try to enable Streams.

After this patch, the crash reported in #24598 can no longer happen, so
in this sense the bug is solved. However, we still want to lower
max_table_name_length from 222 to 192, so that it will always be
possible to enable streams on any table with a legal name length.
We'll do this in the next patch.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-07 11:58:13 +03:00
Nadav Har'El
898665ca38 alternator: split length limit for regular and auxiliary tables
Alternator has a constant, max_table_name_length=222, which is currently
used for two different things:

1. Limiting the length of the name allowed for Alternator table.
2. Limiting the length of some auxiliary tables the user is not aware
   of, such as a materialized view (whose name is tablename:indexname)
   or (in the next patch) CDC log table.

In principle, there is no reason why these two limits need to be identical -
we could lower the table name limit to, say, 192, but still allow the
tablename:indexname to be even longer, up to 222 - i.e., allow creating
materialized views even on tables whose name has 192 characters.

So in this patch we split this variable into two, max_table_name_length
and max_auxiliary_table_name_length. At the moment, both are still set
to the same value - 222. In a following patch we plan to lower
max_table_name_length but leave max_auxiliary_table_name_length at 222.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-07 11:43:49 +03:00
Gleb Natapov
4e6369f35b topology coordinator: log a start and an end of topology coordinator command execution at info level
Those calls are relatively rare, and the output may help analyze issues
in production.
2025-07-07 10:46:22 +03:00
Gleb Natapov
c8ce9d1c60 topology coordinator: add REST endpoint to query the status of ongoing topology cmd rpc
The topology coordinator executes several topology cmd rpcs against some nodes
during a topology change. A topology operation will not proceed until the
rpc completes (successfully or not), but sometimes it appears to hang,
and it is hard to tell on which nodes it did not complete yet.
Introduce new REST endpoint that can help with debugging such cases.
If executed on the topology coordinator, it returns the currently
running topology rpc (if any) and a list of nodes that did not reply yet.
2025-07-07 10:46:03 +03:00
Nadav Har'El
09aa062ab6 alternator: avoid needlessly validating table name
In commit d8c3b144cb we fixed #12538:
That issue noted that most requests which take a TableName don't need
to "validate" the table's name (check that it has allowed characters
and length) if the table is found in the schema. We only need to do
this validation on CreateTable, or when the table is *not* found
(because in that case, DynamoDB chose to print a validation error
instead of table-not-found error).

It turns out that the fix missed a couple of places where the name
validation was unnecessary, so this patch fixes those remaining places.

The original motivation for fixing #12538 was performance, so it
focused just on cheap, common requests. But now, we want to be sure
we fixed *all* requests, because of a new motivation:

We are considering, due to #24598, to lower the maximum allowed table
name length. However, when we do that, we'll want the new lower
length limit not to apply to already existing tables. For example,
it should be possible to delete a pre-existing table with DeleteTable,
if it exists, without the command complaining that the name of this table
is too long. So it's important to make sure that the table's name is
only validated in CreateTable or if the table does not exist.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-07-07 10:05:43 +03:00
Avi Kivity
d4efefbd9c Merge 'Improve background disposal of tablet_metadata' from Benny Halevy
As seen in #23284, when the tablet_metadata contains many tables, even empty ones,
we're seeing a long queue of seastar tasks coming from the individual destruction of
`tablet_map_ptr = foreign_ptr<lw_shared_ptr<const tablet_map>>`.

This change improves `tablet_metadata::clear_gently` to destroy the `tablet_map_ptr` objects
on their owner shard by sorting them into vectors, per- owner shard.

Also, background call to clear_gently was added to `~token_metadata`, as it is destroyed
arbitrarily when automatic token_metadata_ptr variables go out of scope, so that the
contained tablet_metadata would be cleared gently.

Finally, a unit test was added to reproduce the `Too long queue accumulated for gossip` symptom
and verify that it is gone with this change.

Fixes #24814
Refs #23284

This change is not marked as fixing the issue since we still need to verify that there is no impact on query performance, reactor stalls, or large allocations, with a large number of tablet-based tables.

* Since the issue exists in 2025.1, requesting backport to 2025.1 and upwards

Closes scylladb/scylladb#24618

* github.com:scylladb/scylladb:
  token_metadata_impl: clear_gently: release version tracker early
  test: cluster: test_tablets_merge: add test_tablet_split_merge_with_many_tables
  token_metadata: clear_and_destroy_impl when destroyed
  token_metadata: keep a reference to shared_token_metadata
  token_metadata: move make_token_metadata_ptr into shared_token_metadata class
  replica: database: get and expose a mutable locator::shared_token_metadata
  locator: tablets: tablet_metadata: clear_gently: optimize foreign ptr destruction
2025-07-06 19:43:50 +03:00
Benny Halevy
6e4803a750 token_metadata_impl: clear_gently: release version tracker early
No need to wait for all members to be cleared gently.
We can release the version earlier since the
held version may be awaited for in barriers.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 15:07:31 +03:00
Benny Halevy
4a3d14a031 test: cluster: test_tablets_merge: add test_tablet_split_merge_with_many_tables
Reproduces #23284

Currently skipped in release mode since it requires
the `short_tablet_stats_refresh_interval` interval.
Ref #24641

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 15:07:31 +03:00
Benny Halevy
2c0bafb934 token_metadata: clear_and_destroy_impl when destroyed
We have a lot of places in the code where
a token_metadata_ptr is kept in an automatic
variable and destroyed when it leaves the scope.
Since it's a reference-counted lw_shared_ptr,
the token_metadata object is rarely destroyed in
those cases, but when it is, it doesn't go through
clear_gently, and in particular its tablet_metadata
is not cleared gently, leading to inefficient destruction
of potentially many foreign_ptr:s.

This patch calls clear_and_destroy_impl that gently
clears and destroys the impl object in the background
using the shared_token_metadata.

Fixes #13381

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 15:07:31 +03:00
Benny Halevy
2b2cfaba6e token_metadata: keep a reference to shared_token_metadata
To be used by a following patch to gently clean and destroy
the token_data_impl in the background.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 15:07:31 +03:00
Benny Halevy
e0a19b981a token_metadata: move make_token_metadata_ptr into shared_token_metadata class
So we can use the local shared_token_metadata instance
for safe background destroy of token_metadata_impl:s.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 14:22:20 +03:00
Benny Halevy
493a2303da replica: database: get and expose a mutable locator::shared_token_metadata
Prepare for the next patch, which will use this shared_token_metadata
to make mutable_token_metadata_ptr:s.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 14:22:20 +03:00
Benny Halevy
3acca0aa63 locator: tablets: tablet_metadata: clear_gently: optimize foreign ptr destruction
Sort all tablet_map_ptr:s by shard_id
and then destroy them on each shard to prevent
long cross-shard task queues for foreign_ptr destructions.
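The grouping strategy above can be sketched as follows (an illustrative Python model of the Seastar pattern; `submit_to_shard` stands in for `smp::submit_to` and all names are hypothetical):

```python
from collections import defaultdict

def group_by_owner_shard(ptrs):
    """ptrs: iterable of (owner_shard, obj) pairs.
    Sort objects into one batch per owner shard."""
    per_shard = defaultdict(list)
    for shard, obj in ptrs:
        per_shard[shard].append(obj)
    return per_shard

def destroy_gently(ptrs, submit_to_shard):
    # One cross-shard task per shard-batch, instead of one task per
    # object, which avoids long cross-shard task queues.
    for shard, batch in group_by_owner_shard(ptrs).items():
        submit_to_shard(shard, lambda b=batch: b.clear())
```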

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2025-07-06 14:20:46 +03:00
Ernest Zaslavsky
8ac2978239 sstables: coroutinize futurized readers
Coroutinize futurized readers and sources to get ready for using `make_data_or_index_source` in `sstable`
2025-07-06 09:18:39 +03:00
Ernest Zaslavsky
0de61f56a2 sstables: add make_data_or_index_source to the storage
Add `make_data_or_index_source` to the `storage` interface, implement it
for `filesystem_storage` storage which just creates `data_source` from a
file and for the `s3_storage` create a (maybe) decrypting source from s3
make_download_source.

This change should improve performance when reading large objects
from S3 and should not affect anything for the `filesystem_storage`.
2025-07-06 09:18:39 +03:00
Ernest Zaslavsky
7e5e3c5569 encryption: refactor key retrieval
Get the encryption schema extension retrieval code out of
`wrap_file` method to make it reusable elsewhere
2025-07-06 09:18:39 +03:00
Ernest Zaslavsky
211daeaa40 encryption: add encrypted_data_source class
Introduce the `encrypted_data_source` class that wraps an existing data
source to read and decrypt data on the fly using block encryption. Also add
unit tests to verify correct decryption behavior.
NOTE: The wrapped source MUST start reading from offset 0; `encrypted_data_source` assumes it does.
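The on-the-fly decryption wrapper can be sketched like this (a toy Python model, not the real implementation: a XOR keystream stands in for the actual block cipher, and the interface is invented). Note how it tracks the absolute offset, which is why the wrapped source must start at offset 0:

```python
class EncryptedDataSource:
    """Wraps a source of encrypted chunks and decrypts on the fly."""
    def __init__(self, source, key: bytes):
        self.source = source   # callable returning the next encrypted chunk
        self.key = key
        self.offset = 0        # absolute position; source must start at 0

    def get(self) -> bytes:
        chunk = self.source()
        # Toy "block decryption": XOR with a repeating keystream keyed
        # by absolute offset, so chunk boundaries don't matter.
        out = bytes(b ^ self.key[(self.offset + i) % len(self.key)]
                    for i, b in enumerate(chunk))
        self.offset += len(chunk)
        return out
```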

Co-authored-by: Calle Wilund <calle@scylladb.com>
2025-07-06 09:18:39 +03:00
Pavel Emelyanov
4d6385fc27 api: Remove unused get_json_return_type() templates
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#24837
2025-07-05 18:42:02 +03:00
Avi Kivity
33225b730d Merge 'Do not reference db::config by transport::server' from Pavel Emelyanov
The db::config is the top-level configuration class that includes options for pretty much everything in Scylla. Instead of messing with this large thing, individual services have their own smaller configs that are initialized with values from db::config. This PR does that for transport::server (transport::controller will be next) and its cql_server_config. One pitfall to keep in mind is that updateable_value is not shard-safe (#7316), but the code in the controller that creates cql_server_config already takes care of that.

Closes scylladb/scylladb#24841

* github.com:scylladb/scylladb:
  transport: Stop using db::config by transport::server
  transport: Keep uninitialized_connections_semaphore_cpu_concurrency on cql_server_config
  transport: Move cql_duplicate_bind_variable_names_refer_to_same_variable to cql_server_config
  transport: Move max_concurrent_requests to struct config
  transport: Use cql_server_config::max_request_size
2025-07-05 18:39:01 +03:00
Pavel Emelyanov
9b178df7dd transport: Stop using db::config by transport::server
Now the server is self-contained in the way it is being configured by
the controller.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-04 15:40:20 +03:00
Pavel Emelyanov
e2c1484d8d transport: Keep uninitialized_connections_semaphore_cpu_concurrency on
cql_server_config

This also repeats previous patch for another updateable_value. The thing
here is that this config option is passed further to generic_server, but
not used by transport::server itslef.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-04 15:40:20 +03:00
Pavel Emelyanov
64ffe67cbd transport: Move cql_duplicate_bind_variable_names_refer_to_same_variable
to cql_server_config

Similarly to the previous patch -- move yet another updateable_value to let
transport::server eventually stop messing with db::config.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-04 15:40:14 +03:00
Pavel Emelyanov
b6546ed5ff transport: Move max_concurrent_requests to struct config
This is an updateable_value that's initialized from a db::config
named_value to tackle its shard-unsafety. However, the cql_server_config
is created by the controller using the sharded_parameter() helper, so
that it can be safely passed to the server.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-04 15:35:55 +03:00
Pavel Emelyanov
6075eca168 transport: Use cql_server_config::max_request_size
The value was duplicated on the config and on the transport::server
that aggregates the config itself.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-07-04 15:34:53 +03:00
Andrei Chekun
d81820f529 test.py: move deleting directory to prepare_dir
Instead of explicitly calling directory removal, move it to the
prepare_dir method. If the passed pattern is '*', the directory will be
deleted; in other cases, only the files matching the pattern.
2025-07-04 13:39:42 +02:00
Andrzej Jackowski
55e542e52e test: audit: use automatic comparators in AuditEntry
Replace manual comparator implementations with generated comparators.
This simplifies future maintenance and ensures comparators
remain accurate when new fields are added.

Reorder fields in AuditEntry so the less-than comparator evaluates
the most significant fields first.
2025-07-04 13:08:29 +02:00
Andrzej Jackowski
d7711a5f3a test: audit: enable syslog audit tests
Several audit test issues were resolved in numerous commits of this
patch series. This commit enables the syslog audit tests, which should
finally pass.
2025-07-04 12:40:57 +02:00
Andrzej Jackowski
3ebc693e70 test: audit: sort new audit entries before comparing with expected ones
In some corner cases, the order of audit entries can change. For
instance, ScyllaDB is allowed to apply BATCH statements in an order
different from the order in which they are listed in the statement.
To prevent test failures in such cases, this commit sorts new
audit entries.

Additionally, it is possible that some of the audit entries won't be
received by the SYSLOG server immediately. To prevent test failures
in this scenario, waiting for the expected number of new audit entries
is added.
2025-07-04 12:40:57 +02:00
Andrzej Jackowski
436e86d96a test: audit: check audit logging from multiple nodes
Before this change, the `assert_audit_row_eq` check assumed that
audit logs were always generated by the same (first) node. However,
this assumption is invalid in a multi-node setup.

This commit modifies the check to just verify that one of the nodes
in the cluster generated the audit log.
2025-07-04 12:40:57 +02:00
Andrzej Jackowski
2fefa29de7 test: audit: generate unique uuid for each line in syslog audit
Audit to TABLE uses a time UUID as a clustering key, while audit to
SYSLOG simply appends new lines. As a result, having such a detailed
time UUID is unnecessary for SYSLOG. However, TABLE tests expect each
line to be unique, and a similar check is performed (and fails)
in SYSLOG tests.

This commit updates the test framework to generate a unique UUID for
each line in SYSLOG audit. This ensures the tests remain consistent
for both TABLE and SYSLOG audit.
2025-07-04 12:40:57 +02:00
Andrzej Jackowski
f85e738b11 test: audit: fix parsing of syslog messages
Before this commit, there were following issues with parsing of syslog
messages in audit tests:
 - `line_to_row()` function was never called
 - `line_to_row()` was not prepared for changes introduced in
    scylladb#23099 (i.e. key=value pairs)
 - `line_to_row()` didn't handle newlines in queries
 - `line_to_row()` didn't handle "\\" escaping in queries

 Due to the aforementioned issues, the syslog audit tests were failing.
 This commit fixes all of those issues, by parsing each audit syslog
 message using a regexp.
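The regexp-based parsing can be sketched like this (a hypothetical key="value" line format for illustration; the real audit syslog format and the actual `line_to_row()` differ):

```python
import re

# One key="value" pair; the value may contain escaped characters (\" or \\).
LINE_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')

def line_to_row(line: str) -> dict:
    """Parse all key="value" pairs in one syslog line, unescaping values."""
    return {k: v.replace('\\"', '"').replace("\\\\", "\\")
            for k, v in LINE_RE.findall(line)}
```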
2025-07-04 12:40:51 +02:00
Pavel Emelyanov
4d4406c5bc Merge 'test.py: dtest: port next_gating tests from auth_test.py' from Evgeniy Naydanov
Copy `auth_test.py` from the scylla-dtest test suite, remove all non-next_gating tests from it, and make it work with `test.py`.

As a part of the porting process, remove unused imports and markers, remove non-next_gating tests and tests marked with `required_features("!consistent-topology-changes")` marker.

Remove `test_permissions_caching` test because it's too flaky when running using test.py

Also, make a few execution-time optimizations:
  - remove redundant `time.sleep(10)`
  - use smaller timeouts for CQL sessions

Enable the test in `suite.yaml` (run in dev mode only.)

Additional modifications to test.py/dtest shim code:

- Modify ManagerClient.server_update_config() method to change multiple config options in one call in addition to one `key: value` pair.
- Implement the method using slightly modified `set_configuration_options()` method of `ScyllaCluster`.
- Copy generate_cluster_topology() function from tools/cluster_topology.py module.
- Add support for `bootstrap` parameter for `new_node()` function.
- Rework `wait_for_any_log()` function.

Closes scylladb/scylladb#24648

* github.com:scylladb/scylladb:
  test.py: dtest: make auth_test.py run using test.py
  test.py: dtest: rework wait_for_any_log()
  test.py: dtest: add support for bootstrap parameter for new_node
  test.py: dtest: add generate_cluster_topology() function
  test.py: dtest: add ScyllaNode.set_configuration_options() method
  test.py: pylib/manager_client: support batch config changes
  test.py: dtest: copy unmodified auth_test.py
  test.py: dtest: add missed markers to pytest.ini
2025-07-04 10:51:52 +03:00
Botond Dénes
258bf664ee scylla-gdb.py: sstable-summary: adjust for raw-tokens
01466be7b9 changed the summary entries to store raw tokens in them
instead of dht::token. Adjust the command so that it works with both
the pre- and post-change versions.
Also make it accept pointers to sstables as arguments, since this is
what the scylla sstables listing provides.

Closes scylladb/scylladb#24759
2025-07-04 10:44:25 +03:00
Patryk Jędrzejczak
8d925b5ab4 test: increase the default timeout of graceful shutdown
Multiple tests are currently flaky due to graceful shutdown
timing out when flushing tables takes more than a minute. We still
don't understand why flushing is sometimes so slow, but we suspect
it is an issue with new machines spider9 and spider11 that CI runs
on. All observed failures happened on these machines, and most of
them on spider9.

In this commit, we increase the timeout of graceful shutdown as
a temporary workaround to improve CI stability. When we get to
the bottom of the issue and fix it, we will revert this change.

Ref #12028

It's a temporary workaround to improve CI stability, so we don't
need to backport it.

Closes scylladb/scylladb#24802
2025-07-04 10:43:38 +03:00
Avi Kivity
60f407bff4 storage_proxy: avoid large allocation when storing batch in system.batchlog
Currently, when computing the mutation to be stored in system.batchlog,
we go through data_value. In turn, this goes through the `bytes` type
(#24810), which causes a large contiguous allocation if the batch is
large.

Fix by going through the more primitive, but less contiguous,
atomic_cell API.

Fixes #24809.

Closes scylladb/scylladb#24811
2025-07-04 10:43:05 +03:00
Avi Kivity
5cbeae7178 sstables: drop minimum_key(), maximum_key()
Not used.

Closes scylladb/scylladb#24825
2025-07-04 10:42:44 +03:00
Dawid Mędrek
a151944fa6 treewide: Replace __builtin_expect with (un)likely
C++20 introduced two new attributes, [[likely]] and [[unlikely]], that
function as a standard replacement for the __builtin_expect builtin
implemented by various compilers. Since they make code easier to read
and are an integral part of the language, there's no reason not to use
them instead.
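As a minimal illustration of the mechanical translation (not code from the Scylla tree):

```cpp
#include <cassert>

// Before: compiler builtin. The second argument says the condition is
// expected to be false (0), i.e. the error path is unlikely.
int parse_old(int x) {
    if (__builtin_expect(x < 0, 0)) {
        return -1;  // unlikely error path
    }
    return x * 2;
}

// After: the standard C++20 attribute carries the same hint.
int parse_new(int x) {
    if (x < 0) [[unlikely]] {
        return -1;
    }
    return x * 2;
}
```

Both forms only hint the optimizer's block layout; the observable behavior is identical.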

Closes scylladb/scylladb#24786
2025-07-03 13:34:04 +03:00
dependabot[bot]
59cc496757 build(deps): bump sphinx-scylladb-theme from 1.8.6 to 1.8.7 in /docs
Bumps [sphinx-scylladb-theme](https://github.com/scylladb/sphinx-scylladb-theme) from 1.8.6 to 1.8.7.
- [Release notes](https://github.com/scylladb/sphinx-scylladb-theme/releases)
- [Commits](https://github.com/scylladb/sphinx-scylladb-theme/compare/1.8.6...1.8.7)

---
updated-dependencies:
- dependency-name: sphinx-scylladb-theme
  dependency-version: 1.8.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Closes scylladb/scylladb#24805
2025-07-03 12:04:24 +03:00
Gleb Natapov
ca7837550d topology coordinator: do not set request_type field for truncation command if topology_global_request_queue feature is not enabled yet
Old nodes do not expect global topology request names to be in the
request_type field, so set it only if the cluster is already fully
upgraded.

Closes scylladb/scylladb#24731
2025-07-02 17:09:29 +02:00
Pavel Emelyanov
fa0077fb77 Merge 'S3 chunked download source bug fixes' from Ernest Zaslavsky
- Fix missing negation in the `if` in the background downloading fiber
- Add test to catch this case
- Improve the s3 proxy to inject errors if the same resource is requested more than once
- Suppress client retry, since retrying the same request when each attempt produces multiple buffers may lead to the same data appearing more than once in the buffer deque
- Inject an exception from the test to simulate a response callback failure in the middle

No need to backport anything since this class is not used yet

Closes scylladb/scylladb#24657

* github.com:scylladb/scylladb:
  s3_test: Add s3_client test for non-retryable error handling
  s3_test: Add trace logging for default_retry_strategy
  s3_client: Fix edge case when the range is exhausted
  s3_client: Fix indentation in try..catch block
  s3_client: Stop retries in chunked download source
  s3_client: Enhance test coverage for retry logic
  s3_client: Add test for Content-Range fix
  s3_client: Fix missing negation
  s3_client: Refine logging
  s3_client: Improve logging placement for current_range output
2025-07-02 14:45:10 +03:00
Patryk Jędrzejczak
fa982f5579 docs: handling-node-failures: fix typo
Replacing "from" is incorrect. The typo comes from the recently
merged #24583.

Fixes #24732

Requires backport to 2025.2 since #24583 has been backported to 2025.2.

Closes scylladb/scylladb#24733
2025-07-02 12:22:01 +03:00
Konstantin Osipov
37fc4edeb5 test.py: add a way to provide pytest arguments via test.py
Now that we use a single pytest.ini for all tests, different
developer preferences collide. There should be an easy way to override
pytest.ini defaults from the command line.

Fixes https://github.com/scylladb/scylladb/issues/21800

Closes scylladb/scylladb#24573
2025-07-02 12:20:43 +03:00
Nikos Dragazis
fbc9ead182 doc: Expose new aws_session_token option for KMS hosts
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-02 12:04:40 +03:00
Nikos Dragazis
4c66769e07 kms_host: Support authn with temporary security credentials
There are two types of AWS security credentials:
* long-term credentials (access key id + secret access key)
* temporary credentials (access key id + secret access key + session token)

The KMS host can obtain these credentials from multiple sources:
* IMDS (config option `aws_use_ec2_credentials`)
* STS, by assuming an IAM role (config option `aws_assume_role_arn`)
* Scylla config (options `aws_access_key_id`, `aws_secret_access_key`)
* Env vars (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`)
* AWS credentials file (~/.aws/credentials)

The first two sources return temporary credentials. The rest return
long-term credentials.

Extend the KMS host to support temporary credentials from the other
three sources as well. Introduce the config option `aws_session_token`,
and parse the same-named env var and config option from the credentials
file. Also, support `aws_security_token` as an alias, for backwards
compatibility.

This patch facilitates local debugging of corrupted SSTables, as well as
testing, using temporary credentials obtained from STS through other
authentication means (e.g., Okta + gimme-aws-creds).

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-02 12:04:40 +03:00
Nikos Dragazis
37894c243d encryption_config: Mention environment in credential sources for KMS
The help string for the `--kms-hosts` command-line option mentions only
the AWS credentials file as a fall-back search path, in case no explicit
credentials are given.

Extend the help string to mention the environment as well. Make it clear
that the environment has higher precedence than the credentials file.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-07-02 12:04:40 +03:00
Avi Kivity
dfaed80f55 Merge 'types: add byte-comparable format support for native cql3 types' from Lakshmi Narayanan Sreethar
This PR introduces a new `comparable_bytes` class to add byte-comparable format support for all the [native cql3 data types](https://opensource.docs.scylladb.com/stable/cql/types.html#native-types) except the `counter` type, as that is not comparable. The byte-comparable format is a prerequisite for implementing the trie-based index format for our sstables (https://github.com/scylladb/scylladb/issues/19191). This implementation adheres to the byte-comparable format specification in https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md

Note that support for composite data types like lists, maps, and sets has not been implemented yet and will be made available in a separate PR.

Refs https://github.com/scylladb/scylladb/issues/19407

New feature - backport not required.

Closes scylladb/scylladb#23541

* github.com:scylladb/scylladb:
  types/comparable_bytes: add testcase to verify compatibility with cassandra
  types/comparable_bytes: support variable-length natively byte-ordered data types
  types/comparable_bytes: support decimal cql3 types
  types/comparable_bytes: introduce count_digits() method
  types/comparable_bytes: support uuid and timeuuid cql3 types
  types/comparable_bytes: support varint cql3 type
  types/comparable_bytes: support skipping sign byte write in decode_signed_long_type
  types/comparable_bytes: introduce encode/decode_varint_length
  types/comparable_bytes: support float and double cql3 types
  types/comparable_bytes: support date, time and timestamp cql3 types
  types/comparable_bytes: support bigint cql3 type
  types/comparable_bytes: support fixed length signed integers
  types/comparable_bytes: support boolean cql3 type
  types: introduce comparable_bytes class
  bytes_ostream: overload write() to support writing from FragmentedView
  docs: fix minor typo in docs/dev/cql3-type-mapping.md
2025-07-02 11:58:32 +03:00
Avi Kivity
1e0b015c8b Merge 'cql3: Represent create_statement using managed_bytes' from Dawid Mędrek
When describing a table, we need to do it carefully: if some
columns were dropped, we must specify that explicitly by

```
ALTER TABLE {table} DROP {column} USING TIMESTAMP ...
```

in the result of the DESCRIBE statement. Failing to do so
could lead to data resurrection.

However, if a table has been altered many, many times,
we might end up with a huge create statement. Constructing
it could, in turn, trigger an oversized allocation.
Some tests ran into that very problem in fact.

In this commit, we want to mitigate the problem: instead of
allocating a contiguous chunk of memory for the create
statement, we use `bytes_ostream` and `managed_bytes` to
possibly keep data scattered in memory. It makes handling
`cql3::description` less convenient in the code, but since
the struct is pretty much immediately serialized after
creating it, it's a very good trade-off.

A reproducer is intentionally not provided by this commit:
it's easy to test the change, but adding and dropping
a huge number of columns would take a really long time,
so we omit it.

Fixes scylladb/scylladb#24018

Backport: all of the supported versions are affected, so we want to backport the changes there.

Closes scylladb/scylladb#24151

* github.com:scylladb/scylladb:
  cql3/description: Serialize only rvalues of description
  cql3: Represent create_statement using managed_string
  cql3/statements/describe_statement.cc: Don't copy descriptions
  cql3: Use managed_bytes instead of bytes in DESCRIBE
  utils/managed_string.hh: Introduce managed_string and fragmented_ostringstream
2025-07-01 21:59:38 +03:00
Lakshmi Narayanan Sreethar
5f5a8cf54c types/comparable_bytes: add testcase to verify compatibility with cassandra 2025-07-01 22:19:08 +05:30
Lakshmi Narayanan Sreethar
6c1853a830 types/comparable_bytes: support variable-length natively byte-ordered data types
The following cql3 data types - ascii, blob, duration, inet, and text -
are natively byte-ordered in their serialized forms. To encode them into
a byte-comparable format, zeros are escaped, and since these types have
variable lengths, the encoded form is terminated in an escaped state to
mark its end.
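The escaping idea can be sketched as follows. The concrete escape/continuation/terminator byte values below are illustrative placeholders, not the ones the implementation uses (those follow the Cassandra ByteComparable spec):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: emit data bytes verbatim, follow every embedded 0x00 with a
// continuation byte, and finish every value with a bare 0x00 written in
// the escaped state. Plain lexicographic comparison of the encodings
// then agrees with comparison of the original variable-length values.
constexpr uint8_t escape_byte = 0x00;
constexpr uint8_t escaped_cont = 0xff;  // placeholder continuation marker
constexpr uint8_t terminator = 0x00;    // placeholder end-of-value marker

std::vector<uint8_t> encode_escaped(const std::vector<uint8_t>& data) {
    std::vector<uint8_t> out;
    for (uint8_t b : data) {
        out.push_back(b);
        if (b == escape_byte) {
            out.push_back(escaped_cont);  // escape the embedded zero
        }
    }
    out.push_back(terminator);  // terminate in the escaped state
    return out;
}
```

With this scheme, a value is always ordered before any of its extensions, and embedded zeros no longer collide with the terminator.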

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:08 +05:30
Lakshmi Narayanan Sreethar
5c77d17834 types/comparable_bytes: support decimal cql3 types
The decimal cql3 type is internally stored as a scale and an unscaled
integer. To convert them into a byte comparable format, they are first
normalized into a base-100 exponent and a mantissa that lies in [0.01, 1)
and then encoded into a byte sequence that preserves the numerical order.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:08 +05:30
Lakshmi Narayanan Sreethar
832236d044 types/comparable_bytes: introduce count_digits() method
Implemented a method `count_digits()` to return the number of significant
digits in a given boost::multiprecision:cpp_int. This is required to
convert big_decimal to a byte comparable format.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:08 +05:30
Lakshmi Narayanan Sreethar
a00c5d3899 types/comparable_bytes: support uuid and timeuuid cql3 types
The uuid type values are composed of two fixed-length unsigned integers:
an msb and an lsb. The msb contains a version digit, which must be
pulled first in a byte-comparable representation. For version 1 uuids,
in addition to extracting the version digit first, the msb must be
rearranged to make it byte comparable. The lsb is written as is.

For the timeuuid type, the msb is handled similarly to the version 1 uuid
values. The lsb, however, is treated differently: the sign bits of all
bytes are inverted to preserve the legacy comparison order, which
compared individual bytes as signed values.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:08 +05:30
Lakshmi Narayanan Sreethar
4592b9764c types/comparable_bytes: support varint cql3 type
Any varint value shorter than 7 bytes is encoded using the signed long
encoding format, and the remaining values are all encoded using the
full-form encoding:

  <signbyte><length as unsigned integer - 7><7 or more bytes>,

where <signbyte> is 00 for negative numbers and FF for positive ones,
and the length's bytes are inverted if the number is negative (so that
longer length sorts smaller).
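A simplified sketch of the full-form layout. In the real encoding the length is a variable-length unsigned integer; here it is compressed to a single byte, and the input is assumed to be a two's-complement byte string of at least 7 bytes:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of full-form varint encoding: a sign byte, then (length - 7),
// inverted for negatives so that a longer length sorts smaller, then
// the two's-complement bytes themselves. Single-byte length is a
// simplification of the variable-length unsigned integer used in the
// actual format.
std::vector<uint8_t> full_form_encode(const std::vector<uint8_t>& twos_complement) {
    bool negative = twos_complement.front() & 0x80;
    uint8_t len = static_cast<uint8_t>(twos_complement.size() - 7);
    std::vector<uint8_t> out;
    out.push_back(negative ? 0x00 : 0xff);                  // <signbyte>
    out.push_back(negative ? static_cast<uint8_t>(~len) : len);
    out.insert(out.end(), twos_complement.begin(), twos_complement.end());
    return out;
}
```

Comparing the encodings byte-wise then orders all negatives (sign byte 00) before all positives (FF), longer positives after shorter ones, and longer negatives before shorter ones.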

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
1b6b0a665d types/comparable_bytes: support skipping sign byte write in decode_signed_long_type
The decode_signed_long_type() method writes leading sign bytes when
decoding a byte-comparable encoded signed long value. The varint decoder
depends on this method to decode values up to a certain length and
expects the decoded form to include sign-only bytes only when necessary.
Update the decode_signed_long_type() code to allow skipping the write of
sign-only bytes based on the caller's request.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
ad45a19373 types/comparable_bytes: introduce encode/decode_varint_length
The length of a varint value is encoded separately as an unsigned
variable-length integer. For negative varint values, the encoded bytes
are flipped to ensure that longer lengths sort smaller. This patch
implements both encoding and decoding logic for varint lengths and will
be used by the subsequent patch.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
7af153c237 types/comparable_bytes: support float and double cql3 types
The sign bit is flipped for positive values to ensure that they are
ordered after negative values. For negative values, all the bytes are
inverted, allowing larger negative values to be ordered before smaller
ones.
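For 32-bit floats the mapping can be sketched like this (a simplified illustration that sidesteps NaN handling; it is not the actual implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Sketch: map a float's IEEE-754 bit pattern to an unsigned integer
// whose natural order (equivalently, big-endian byte order) matches
// numeric order: flip the sign bit for positives, invert everything
// for negatives. NaNs are ignored in this simplified version.
uint32_t float_to_comparable(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    if (bits & 0x80000000u) {
        return ~bits;              // negative: invert all bits
    }
    return bits | 0x80000000u;     // positive (or +0): flip the sign bit
}
```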

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
0145c1d705 types/comparable_bytes: support date, time and timestamp cql3 types
Both the date and time cql3 types are internally unsigned fixed-length
integers. Their serialized form is already byte comparable, so the
encoder and decoder return the serialized bytes as is.

The timestamp type is encoded using the fixed length signed integer
encoding.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
b6ff3f5304 types/comparable_bytes: support bigint cql3 type
The bigint type, internally implemented as a long data type, is encoded
using a variable-length encoding similar to UTF-8. This enables a
significant amount of space to be saved when smaller numbers are
frequently used, while still permitting large values to be efficiently
encoded.

The first bit of the encoding represents the inverted sign (i.e., 1 for
positive, 0 for negative), followed by the length encoded as a sequence
bits matching the inverted sign. This is then followed by a differing
bit (except for 9-byte encodings) and the bits of the number's two's
complement.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
c0d25060bd types/comparable_bytes: support fixed length signed integers
To encode fixed-length signed integers in a byte-comparable format, the
first bit of each value is inverted. This ensures that negative numbers
are ordered before positive ones during comparison. This patch adds
support for the data types : byte_type (tinyint), short_type (smallint),
and int32_type (int). Although long_type (bigint) is a fixed length
integer type, it has different byte comparable encoding and will be
handled separately in another patch.
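For `int32_type`, the transformation amounts to a sign-bit flip, sketched here as an illustration (not the actual implementation):

```cpp
#include <cassert>
#include <cstdint>

// Sketch: XOR-ing the sign bit makes unsigned comparison of the result
// agree with signed comparison of the input, so the big-endian bytes of
// the result are byte-comparable. The same trick applies to tinyint
// and smallint with the appropriate width.
uint32_t int32_to_comparable(int32_t v) {
    return static_cast<uint32_t>(v) ^ 0x80000000u;
}
```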

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
8572afca2b types/comparable_bytes: support boolean cql3 type
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
74c556a33d types: introduce comparable_bytes class
This patch implements a new class, `comparable_bytes`, designed to
implement methods for converting data values to and from byte-comparable
formats. The class stores the comparable bytes as `managed_bytes` and
currently provides the structure for all required methods. The actual
logic for converting various data types will be implemented in subsequent
patches.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
e4c7cb7834 bytes_ostream: overload write() to support writing from FragmentedView
Overloaded write() method to support writing a FragmentedView into
bytes_ostream. Also added a testcase to verify the implementation.
The new helper will be used by the byte_comparable implementation
during the encode/decode process.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Lakshmi Narayanan Sreethar
068e74b457 docs: fix minor typo in docs/dev/cql3-type-mapping.md
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
2025-07-01 22:19:07 +05:30
Ernest Zaslavsky
acf15eba8e s3_test: Add s3_client test for non-retryable error handling
Introduce a test that injects a non-retryable error and verifies
that the chunked download source throws an exception as expected.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
a5246bbe53 s3_test: Add trace logging for default_retry_strategy
Introduce trace-level logging for `default_retry_strategy` in
`s3_test` to improve visibility into retry logic during test
execution.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
49e8c14a86 s3_client: Fix edge case when the range is exhausted
Handle the case where the download loop exits after consuming all data,
but before receiving an empty buffer signaling EOF. Without this, the
next request is sent with a non-zero offset and zero length, resulting
in "Range request cannot be satisfied" errors. Now, an empty buffer is
pushed to indicate completion and exit the fiber properly.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
e50f247bf1 s3_client: Fix indentation in try..catch block
Correct indentation in the `try..catch` block to improve code
readability and maintain consistent formatting.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
d2d69cbc8c s3_client: Stop retries in chunked download source
Disable retries for S3 requests in the chunked download source to
prevent duplicate chunks from corrupting the buffer queue. The
response handler now throws an exception to bypass the retry
strategy, allowing the next range to be attempted cleanly.

This exception is only triggered for retryable errors; unretryable
ones immediately halt further requests.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
c75acd274c s3_client: Enhance test coverage for retry logic
Extend the S3 proxy to support error injection when the client
makes multiple requests to the same resource—useful for testing
retry behavior and failure handling.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
ec59fcd5e4 s3_client: Add test for Content-Range fix
Introduce a test that accurately verifies the Content-Range
behavior, ensuring the previous fix is properly validated.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
6d9cec558a s3_client: Fix missing negation
Restore a missing `not` in a conditional check that caused
incorrect behavior during S3 client execution.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
e73b83e039 s3_client: Refine logging
Fix typo in log message to improve clarity and accuracy during
S3 operations.
2025-07-01 18:45:17 +03:00
Ernest Zaslavsky
f1d0690194 s3_client: Improve logging placement for current_range output
Relocated logging to occur after determining the `current_range`,
ensuring more relevant output during S3 client operations.
2025-07-01 18:45:17 +03:00
Tomasz Grabiec
97679002ee Merge 'Co-locate tablets of different tables' from Michael Litvak
Add the option to co-locate tablets of different tables. For example, a base table and its CDC table, or a local index.

main changes and ideas:
* "table group" - a set of one or more tables that should be co-located. (Example: base table and CDC table). A group consists of one base table and zero or more child tables.
* new column `base_table` in `system.tablets`: when creating a new table, it can be set to point to a base table, which the new table's tablets will be co-located with. when it's set, the tablet map information should be retrieved from the base table map. the child map doesn't contain per-tablet information.
* co-located tables always have the same tablet count and the same tablet replicas. each tablet operation - migration, resize, repair - is applied on all tablets in a synchronized manner by the topology coordinator.
* resize decision for a group is made by combining the per-table hints and comparing the average tablet size (over all tablets in the group) with the target tablet size.
* the tablets load balancer works with the base table as a representative of the group. it represents a single migration unit with some `group_size` that is taken into account.
* view tablets are co-located with base tablets when the partition keys match.

Fixes https://github.com/scylladb/scylladb/issues/17043

backport is not needed. this is preliminary work for support of MVs and CDC with tablets.

Closes scylladb/scylladb#22906

* github.com:scylladb/scylladb:
  tablets: validate no clustering row mutations on co-located tables
  raft_group0_client: extend validate_change to mixed_change type
  docs: topology-over-raft: document co-located tables
  tablet-mon.py: visual indication for co-located tablets
  tablet-mon.py: handle co-located tablets
  test/boost/view_schema_test.cc: fix race in wait_until_built
  boost/tablets_test: test load balancing and resize of co-located tablets
  test/tablets: test tablets colocation
  tablets: co-locate view tablets with base when the partition keys match
  test/pylib/tablets: common get_tablet_count api
  test_mv_tablets: use get_tablet_replicas from common tablets api
  test/pylib/tablets: fix test api to read tablet replicas from base table
  tablets: allocator: create co-located tables in a single operation
  alternator: prepare all new tables in a single announcement
  migration_manager: add notification for creating multiple tables
  tablets: read_tablet_transition_stage: read from base table
  storage service: allow repair request only on base tables
  tablets: keyspace_rf_change: apply on base table
  storage service: generate tablet migration updates on base tables
  tablets: replace all_tables method
  tablets: split when all co-located tablets are ready
  tablets: load balancer: sizing plan for table groups
  tablets: load balancer: handle co-located tablets
  tablets: allocate co-located tablets
  tablets: handle migration of co-located tablets
  storage service: add repair colocated tablets rpc
  tablets: save and read tablet metadata of co-located tables
  tablets: represent co-located tables in tablet metadata
  tablets: add base_table column to system.tablets
  docs: update system.tablets schema
2025-07-01 16:02:30 +02:00
Tomasz Grabiec
6290b70d53 Merge 'repair: postpone repair until topology is not busy ' from Aleksandra Martyniuk
Currently, repair_service::repair_tablets starts repair if there
is no ongoing tablet operations. The check does not consider global
topology operations, like tablet resize finalization.

Hence, if:
- topology is in the tablet_resize_finalization state;
- repair starts (as there is no tablet transitions) and holds the erm;
- resize finalization finishes;

then the repair sees a topology state different from the actual one:
it does not see that the storage groups were already split.
The repair code does not handle this case, and it results in
on_internal_error.

Start the repair only when the topology is not busy. The check isn't
atomic, as it's done on shard 0. Thus, we compare the topology versions
to ensure that the busyness check is still valid.

Fixes: https://github.com/scylladb/scylladb/issues/24195.

Needs backport to all branches since they are affected

Closes scylladb/scylladb#24202

* github.com:scylladb/scylladb:
  test: add test for repair and resize finalization
  repair: postpone repair until topology is not busy
2025-07-01 16:02:22 +02:00
Botond Dénes
37ef9efb4e docs: cql/types.rst: remove reference to frozen-only UDTs
ScyllaDB has supported non-frozen UDTs since 3.2, so there is no need
to keep referencing this limitation in the current docs. Replace the
description of the limitation with a general description of frozen
semantics for UDTs.

Fixes: #22929

Closes scylladb/scylladb#24763
2025-07-01 16:19:18 +03:00
Michał Chojnowski
a29724479a utils/alien_worker: fix a data race in submit()
We move a `seastar::promise` on the external worker thread,
after the matching `seastar::future` was returned to the shard.

That's illegal. If the `promise` move occurs concurrently with some
operation (move, await) on the `future`, it becomes a data race
which could cause various kinds of corruption.

This patch fixes that by keeping the promise at a stable address
on the shard (inside a coroutine frame) and passing only a reference
to it through the worker.
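The shape of the fix can be illustrated with standard primitives (std::promise/std::thread rather than seastar, purely as an analogue):

```cpp
#include <cassert>
#include <future>
#include <thread>

// Analogue of the fix: the promise lives at a stable address on the
// submitting side for the whole operation; the worker thread receives
// only a reference and fulfills the promise through it, so the promise
// object is never moved concurrently with operations on the future.
int submit_and_wait(int x) {
    std::promise<int> p;                  // stable address, owned here
    std::future<int> f = p.get_future();
    std::thread worker([&p, x] {
        p.set_value(x * 2);               // worker only sets the value
    });
    int result = f.get();
    worker.join();
    return result;
}
```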

Fixes #24751

Closes scylladb/scylladb#24752
2025-07-01 15:13:04 +03:00
Łukasz Paszkowski
a22d1034af test.py: Fix test_compactionhistory_rows_merged_time_window_compaction_strategy
The test had three major problems:
1. Wrongly computed time windows. Data was not spread across two 1-minute
   windows, causing the test to generate as many as three sstables
   instead of two.
2. The timestamp was not propagated to the prepared CQL statements, so
   in fact the current time was used implicitly.
3. Because of the incorrect timestamp issue, the remaining tests
   testing purged tombstones were affected as well.

Fixes https://github.com/scylladb/scylladb/issues/24532

Closes scylladb/scylladb#24609
2025-07-01 15:01:21 +03:00
Avi Kivity
609cc20d22 build: update toolchain to Fedora 42 with clang 20.1 and libstdc++ 15
Rebase to Fedora 42 with clang 20.1 and libstdc++ 15.

The JAVA8_HOME environment variable was dropped since we no longer
use it.

The cassandra-stress package was updated with a version that doesn't
depend on the no-longer-available Java 11.

Optimized clang binaries are generated and stored at:

  https://devpkg.scylladb.com/clang/clang-20.1.7-Fedora-42-aarch64.tar.gz
  https://devpkg.scylladb.com/clang/clang-20.1.7-Fedora-42-x86_64.tar.gz

Closes scylladb/scylladb#23978
2025-07-01 14:39:47 +03:00
Dawid Mędrek
9d03dcd28e cql3/description: Serialize only rvalues of description
We discard instances of `cql3::description` right after serializing them,
so let's change the signature of the function to save some work.
2025-07-01 12:58:11 +02:00
Dawid Mędrek
ac9062644f cql3: Represent create_statement using managed_string
When describing a table, we need to do it carefully: if some
columns were dropped, we must specify that explicitly by

```
ALTER TABLE {table} DROP {column} USING TIMESTAMP ...
```

in the result of the DESCRIBE statement. Failing to do so
could lead to data resurrection.

However, if a table has been altered many, many times,
we might end up with a huge create statement. Constructing
it could, in turn, trigger an oversized allocation.
Some tests ran into that very problem in fact.

In this commit, we want to mitigate the problem: instead of
allocating a contiguous chunk of memory for the create
statement, we use `fragmented_ostringstream` and `managed_string`
to possibly keep data scattered in memory. It makes handling
`cql3::description` less convenient in the code, but since
the struct is pretty much immediately serialized after
creating it, it's a very good trade-off.

We provide a reproducer. It consistently passes with this commit,
while having about a 50% chance of failure before it (based on my
own experiments). Playing with the parameters of the test
doesn't seem to improve that chance, so let's keep it as-is.

Fixes scylladb/scylladb#24018
2025-07-01 12:58:02 +02:00
Michael Litvak
a5feb80797 tablets: validate no clustering row mutations on co-located tables
When preparing a tablet metadata change, add another validation that no
clustering row mutations are written to the tablet map of a co-located
dependent table.

A co-located table should never have clustering rows in the
`system.tablets` table. It has only the static row with base_table
column set, pointing to the base table.
2025-07-01 13:20:20 +03:00
Michael Litvak
6619e798e7 raft_group0_client: extend validate_change to mixed_change type
The function validate_change in raft_group0_client is currently used to
validate tablet metadata changes, and therefore it applies only to
commands of type topology_change.

But the type mixed_change also allows topology change mutations and it's
in fact used for tablet metadata changes, for example in
keyspace_rf_change.

Therefore, extend validate_change to validate also changes of type
mixed_change, so we can catch issues there as well.
2025-07-01 13:20:19 +03:00
Michael Litvak
6fa5d2f7c8 docs: topology-over-raft: document co-located tables 2025-07-01 13:20:19 +03:00
Michael Litvak
cb9e03bd09 tablet-mon.py: visual indication for co-located tablets
Add a visual indication for groups of co-located tablets in
tablet-mon.py.

We order the tablets by groups, and draw a rectangle that connects
tablets that are co-located.
2025-07-01 13:20:19 +03:00
Michael Litvak
b35b7c4970 tablet-mon.py: handle co-located tablets
For co-located tablets we need to read the tablet information from the
tablet map referenced by base_table.

Fix tablet-mon.py to handle co-located tablets by checking if base_table
is set when reading the tablets of a table, and if so refer to the base
table map.
2025-07-01 13:20:19 +03:00
Michael Litvak
fb18b95b3c test/boost/view_schema_test.cc: fix race in wait_until_built
Create the view waiter before creating the view; otherwise, if the
waiter is created after the view is built, we may lose the notification.
2025-07-01 13:20:19 +03:00
Michael Litvak
3b4af89615 boost/tablets_test: test load balancing and resize of co-located tablets
Add unit tests of load balancing and resize with co-located tablets.
2025-07-01 13:20:19 +03:00
Michael Litvak
65ed0548d6 test/tablets: test tablets colocation
Add tests with co-located tablets, testing migration and other relevant
operations.
2025-07-01 13:20:19 +03:00
Michael Litvak
7211c0b490 tablets: co-locate view tablets with base when the partition keys match
For a view table that has the same partition key as its base table, the
view's tablets are co-located with the base tablets.

Fixes scylladb/scylladb#17043
2025-07-01 13:20:19 +03:00
Michael Litvak
e01aae7871 test/pylib/tablets: common get_tablet_count api
Introduce a common get_tablet_count test API instead of duplicating it
in a few tests, and fix it to read the tablet count from the
base table.
2025-07-01 13:20:19 +03:00
Michael Litvak
e719da3739 test_mv_tablets: use get_tablet_replicas from common tablets api
Replace the duplicated get_tablet_replicas method in test_mv_tablets
with the common method from the tablets api, to reduce code duplication
and use the correct method that reads the tablet replicas from the base
table.
2025-07-01 13:20:19 +03:00
Michael Litvak
6bfb82844f test/pylib/tablets: fix test api to read tablet replicas from base table
When reading tablet replicas from system.tablets, we need to refer to
the base table partition, if any.

We fix and simplify the test api for reading tablet replicas to read
from the base table.
2025-07-01 13:20:19 +03:00
Michael Litvak
018b61f658 tablets: allocator: create co-located tables in a single operation
Co-located base and child tables may be created together in a single
operation.  The tablet allocator in this case needs to handle them
together and not each table independently, because we need to have the
base schema and tablet map when creating the child tablet map.

We do this by registering the tablet allocator to the migration
notification on_before_create_column_families that announces multiple
new tables, and there we allocate tablets for all the new base tables,
and for the new child tables we create their maps from the base tables,
which may be either new tables or existing ones.
2025-07-01 13:20:19 +03:00
Michael Litvak
2d0ec1c20a alternator: prepare all new tables in a single announcement
When creating base and view tables in alternator, they are created in a
single operation, so use a single announcement for creating multiple
tables in a single operation instead of announcing each table
separately.

This is needed because when we create base tables and local indexes we
need to make them co-located, so we need to allocate tablets for them
together.
2025-07-01 13:20:18 +03:00
Michael Litvak
05ffcefd50 migration_manager: add notification for creating multiple tables
Add prepare_new_column_families_announcement for preparing multiple new
tables that are created in a single operation.

A listener can receive a notification when multiple tables are created.
This is useful if the listener needs to have all the new tables, and not
work on each new table independently. For example, if there are
dependencies between the new tables.
2025-07-01 13:20:18 +03:00
Michael Litvak
064ac25ff9 tablets: read_tablet_transition_stage: read from base table
When reading tablet information from system.tablets we need to read it
from the base table, if it exists.
2025-07-01 13:20:18 +03:00
Michael Litvak
ff9a3c9528 storage service: allow repair request only on base tables
Currently, tablet repair runs only on base tables, and not on derived
co-located tables.

If repair is requested for a non-base table, throw an error, since the
operation won't have the intended results.
2025-07-01 13:20:18 +03:00
Michael Litvak
aa990a09c1 tablets: keyspace_rf_change: apply on base table
Generate keyspace_rf_change transitions only on base tables, because in
a group of co-located tablets their tablet map is shared with the base
table.
2025-07-01 13:20:18 +03:00
Michael Litvak
602fa84907 storage service: generate tablet migration updates on base tables
When writing transition updates to a tablet map we must do so on a base
table. A table that is co-located with a base table doesn't have its
own tablet map in the tablets table; it only points to the base
table map. By writing to the base table, the tablet migration will be
applied for the entire co-location group.

We add a small helper in storage_service that creates a tablet mutation
builder for the base table, and use it whenever we need to write tablet
mutations.
2025-07-01 13:20:18 +03:00
Michael Litvak
ddf02c9489 tablets: replace all_tables method
The method all_tables in tablet_metadata is used for iterating over all
tables in the tablet metadata with their tablet maps.

Now that we have co-located tables we need to make the distinction on
which tables we want to iterate over. In some cases we want to iterate
over each group of co-located tables, treating them as one unit, and in
other cases we want to iterate over all tables, doesn't matter if they
are part of a co-located group and have a base table.

We replace all_tables with new methods that can be used for each of the
cases.
2025-07-01 13:20:18 +03:00
Michael Litvak
255ca569e3 tablets: split when all co-located tablets are ready
For a group of co-located tablets, they must be split together
atomically, so finalize tablet split only when all tablets in the group
are ready.
2025-07-01 13:20:18 +03:00
Michael Litvak
0dcb9f2ed6 tablets: load balancer: sizing plan for table groups
We update the sizing plan to work with table groups instead of single
tables, using the base table as a representative of a table group.

The resize decision is made based on the combined per-table tablet
hints, and considering the size of all tables in the group. We calculate
the average tablet size of all tablets in the group and compare it with
the target tablet size.

The target tablet size is changed to be some function of the group size,
because we may want to have a lower target tablet size when we have
multiple co-located tablets, in order to reduce the migration size.
2025-07-01 13:20:18 +03:00
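The group-level resize arithmetic described above can be sketched as follows. This is a simplified illustration, not Scylla's actual load-balancer code; the function name and the 2x/0.5x thresholds are illustrative assumptions:

```python
def resize_decision(table_sizes_bytes, tablet_count, target_tablet_size_bytes):
    """Decide resize for a co-location group (illustrative thresholds)."""
    # Combined size of all tables in the group: they share one tablet map,
    # so the decision is made for the group as a whole.
    group_size = sum(table_sizes_bytes)
    avg_tablet_size = group_size / tablet_count
    if avg_tablet_size > 2 * target_tablet_size_bytes:
        return "split"
    if avg_tablet_size < target_tablet_size_bytes / 2:
        return "merge"
    return "none"
```

In the real balancer the target tablet size is itself a function of the group size, so that larger co-location groups get a smaller per-tablet target to keep migrations cheap.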
Michael Litvak
ac5f4da905 tablets: load balancer: handle co-located tablets
Tablets of co-located tables are always co-located and migrated
together, so they are considered as an atomic unit for the tablets load
balancer.

We change the load balancer to work with table groups as migration
candidates instead of single tables, using the base table of a group as
a representative of the group.

For the purpose of load calculations, a group of co-located tablets is
considered like a single tablet, because their combined target tablet
size is the same as a single tablet's.
2025-07-01 13:20:18 +03:00
Michael Litvak
3db8f6fd37 tablets: allocate co-located tablets
When allocating tablets for a new table, add the option to create a
co-located tablet map with an existing base table.

The co-located tablet map is created with the base_table value set.
2025-07-01 13:20:18 +03:00
Michael Litvak
6bed9d3cfe tablets: handle migration of co-located tablets
When handling tablet transition for a group of co-located tables,
maintain co-location by applying each transition operation (streaming,
cleanup, repair) on all tablets in the group in a synchronized way.

handle_tablet_migration is changed to work on groups of co-located
tablets instead of single tablets. Each transition step is handled by
applying its operation on all the tablets in the group.

The tablet map of co-located tablets is shared, so we need to read and
write only the tablet map of the base table.
2025-07-01 13:20:18 +03:00
Michael Litvak
11f045bb7c storage service: add repair colocated tablets rpc
Add a new RPC, repair_colocated_tablets, which is similar to the RPC
tablet_repair, but instead of repairing a single tablet it takes a set
of co-located tablets, repairs them and returns a shared repair_time
result.

This is useful because the way co-located tablets are represented
doesn't allow repairing tablets independently, but only as a group
operation, and the repair_time which is stored in the tablet map is
shared with the entire co-location group.

But when repairing a group of co-located tablets we may require a
different behavior, especially considering that co-located tablets are
derived tablets of a special type. For example, we may want to skip
running repair on CDC tablets when repairing the base table.

The new RPC and the storage service function repair_colocated_tablets
allow the flexibility to implement different strategies when repairing
co-located groups.

Currently the implementation is simply to repair each tablet and return
the minimum repair_time as the shared repair time.
2025-07-01 13:20:18 +03:00
Yaron Kaikov
fd0e044118 Update ScyllaDB version to: 2025.4.0-dev 2025-07-01 11:33:20 +03:00
Jenkins Promoter
94d7c22880 Update pgo profiles - aarch64 2025-07-01 11:33:20 +03:00
Jenkins Promoter
7531fc72a6 Update pgo profiles - x86_64 2025-07-01 11:33:20 +03:00
Nadav Har'El
e12ff4d3ab Merge 'LWT: use tablet_metadata_guard' from Petr Gusev
This PR is a step towards enabling LWT for tablet-based tables.

It pursues several goals:
* Make it explicit that the tablet can't migrate after the `cas_shard` check in `select_statement`/`modification_statement`. Currently, `storage_proxy::cas` expects that the client calls it on the correct shard -- the one which owns the partition key the LWT is running on. The reasons for that are explained in [this commit](f16e3b0491 (diff-1073ea9ce4c5e00bb6eb614154f523ba7962403a4fe6c8cd877d1c8b73b3f649)) message. The statements check the current shard and invoke `bounce_to_shard` if it's not the right one. However, the erm strong pointer is only captured in `storage_proxy::cas`, and until that moment there is no explicit structure in the code which would prevent the ongoing migrations. In this PR we introduce such a structure -- `erm_handle`. We create it before the `cas_check` and pass it down to `storage_proxy::cas` and `paxos_response_handler`.
* Another goal of this PR is an optimization -- we don't want to hold the erm for the duration of the entire LWT, unless a topology change directly affects the current tablet. There is a `tablet_metadata_guard` class which is used for long-running tablet operations. It automatically switches to a new erm if the topology change represented by the new erm doesn't affect the current tablet. We use this class in `erm_handle` if the table uses tablets. Otherwise, `erm_handle` just stores the erm directly.
* Fixes [shard bouncing issue in alternator](https://github.com/scylladb/scylladb/issues/17399)

Backport: not needed (new feature).

Closes scylladb/scylladb#24495

* github.com:scylladb/scylladb:
  LWT: make cas_shard non-optional in sp::cas
  LWT: create cas_shard in select_statement
  LWT: create cas_shard in modification and batch statements
  LWT: create cas_shard in alternator
  LWT: use cas_shard in storage_proxy::cas
  do_query_with_paxos: remove redundant cas_shard check
  storage_proxy: add cas_shard class
  sp::cas_shard: rename to get_cas_shard
  token_metadata_guard: a topology guard for a token
  tablet_metadata_guard: mark as noncopyable and nonmoveable
2025-07-01 11:33:20 +03:00
Gleb Natapov
a221b2bfde gossiper: do not assume that id->ip mapping is available in failure_detector_loop_for_node
failure_detector_loop_for_node may be started on a shard before id->ip
mapping is available there. Currently the code treats missing mapping
as an internal error, but it uses its result for debug output only, so
let's relax the code to not assume the mapping is available.

Fixes #23407

Closes scylladb/scylladb#24614
2025-07-01 11:33:20 +03:00
Pavel Emelyanov
26c7f7d98b Merge 'encryption_at_rest_test: Fix some spurious errors' from Calle Wilund
Fixes #24574

* Ensure we close the embedded load_cache objects on encryption shutdown, otherwise we can, in unit testing, get destruction of these while a timer is still active -> assert
* Add extra exception handling to `network_error_test_helper`, so even if test framework might exception-escape, we properly stop the network proxy to avoid use after free.

Closes scylladb/scylladb#24633

* github.com:scylladb/scylladb:
  encryption_at_rest_test: Add exception handler to ensure proxy stop
  encryption: Ensure stopping timers in provider cache objects
2025-07-01 11:33:20 +03:00
Avi Kivity
6aa71205d8 repair: row_level: unstall to_repair_rows_on_wire() destroying its input
to_repair_rows_on_wire() moves the contents of its input std::list
and is careful to yield after each element, but the final destruction
of the input list still deals with all of the list elements without
yielding. This is expensive as not all contents of repair_row are moved
(_dk_with_hash is of type lw_shared_ptr<const decorated_key_with_hash>).

To fix, destroy each row element as we move along. This is safe as we
own the input and don't reference row_list other than for the iteration.

Fixes #24725.

Closes scylladb/scylladb#24726
2025-07-01 11:33:19 +03:00
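The fix pattern above (consume and destroy element by element, yielding after each one) can be sketched in a Python asyncio analogue. The real code is C++/Seastar; the names and the `convert` callback here are hypothetical:

```python
import asyncio

async def to_rows_on_wire(row_list, convert):
    """Consume row_list one element at a time, yielding after each, so that
    neither the conversion nor the final destruction of the input processes
    the whole list in a single reactor stall."""
    out = []
    while row_list:
        row = row_list.pop(0)      # destroy each row as we move along
        out.append(convert(row))
        await asyncio.sleep(0)     # cooperative yield
    return out
```

Because the input list is emptied during iteration, there is no bulk destruction left to do when the function returns.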
Pavel Emelyanov
6826856cf8 Merge 'test.py: Fix start 3rd party services' from Andrei Chekun
Move the startup of 3rd-party services under a `try` clause to avoid a situation where the main process crashes without stopping the services.
Without this, if something goes wrong during startup, the exit/artifact-collection steps are never triggered and the process stays alive forever.

This functionality is in 2025.2 and can potentially affect jobs, so a backport is needed.

Closes scylladb/scylladb#24734

* github.com:scylladb/scylladb:
  test.py: use unique hostname for Minio
  test.py: Catch possible exceptions during 3rd party services start
2025-07-01 11:33:19 +03:00
Anna Stuchlik
9234e5a4b0 doc: add the SBOM page and the download link
This commit migrates the Software Bill Of Materials (SBOM) page
added to the Enterprise docs with https://github.com/scylladb/scylla-enterprise/pull/5067.

The only difference is the link to the SBOM files - it was Enterprise SBOM in the Enterprise docs,
while here is a link to the ScyllaDB SBOM.

It's a follow-up of the migration to Source Available and should be backported
to all Source Available versions - 2025.1 and later.

Fixes https://github.com/scylladb/scylladb/issues/24730

Closes scylladb/scylladb#24735
2025-07-01 11:33:19 +03:00
Michael Litvak
c74cbca7cb tablets: save and read tablet metadata of co-located tables
Update the tablet metadata save and read methods to work with tablet
maps of co-located tables.

The new function colocated_tablet_map_to_mutation is used to generate a
mutation of a co-located table to system.tablets. It creates a static
row with the base_table column set with the base table id. The function
save_tablet_metadata is updated to use this function for co-located
tables.

When reading tablet metadata from the table, we handle the new case of
reading a co-located table. We store the co-located tables relationships
in the tablet_metadata_builder's `colocated_tables` map, and process it
in on_end_of_stream. The reason we defer the processing is that we want
to set all normal tablet maps first, to ensure the base tablet map is
found when we process a co-located table.
2025-07-01 10:29:59 +03:00
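The deferred processing described above can be sketched in Python. This is a toy model with hypothetical row/field names; the real implementation lives in the C++ tablet_metadata_builder:

```python
def read_tablet_metadata(rows):
    """rows: dicts with 'table_id' and either 'tablet_map' or 'base_table' set."""
    tablet_maps = {}
    colocated_tables = {}          # child table -> its base table
    for row in rows:
        if row.get("base_table") is not None:
            # Static row with base_table set: defer resolution, because the
            # base table's map may not have been read yet.
            colocated_tables[row["table_id"]] = row["base_table"]
        else:
            tablet_maps[row["table_id"]] = row["tablet_map"]
    # on_end_of_stream: all normal tablet maps are set first, so every base
    # map is guaranteed to be found when we process a co-located table.
    for child, base in colocated_tables.items():
        tablet_maps[child] = tablet_maps[base]   # share the base table's map
    return tablet_maps
```

Note that the child row can appear before its base row in the stream; the two-phase processing is exactly what makes that ordering safe.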
Michael Litvak
ddfe5dfb6b tablets: represent co-located tables in tablet metadata
Modify tablet_metadata to be able to represent co-located tables.

The new method set_colocated_table adds to tablet_metadata a table which
is co-located with another table. A co-located table shares the tablet
map object with the base table, so we just create a copy of the shared
tablet map pointer and store it as the co-located table's tablet map.

Whenever a tablet map is modified we update the pointer for all the
co-located tables accordingly, so the tablet map remains shared.

We add some data structures to tablet_metadata to be able to work with
co-located table groups efficiently:
* `_table_groups` maps every base table to all tables in its
  co-location group. This is convenient for iterating over all table
  groups, or finding all tables in some group.
* `_base_table` maps a co-located table to its base table.
2025-07-01 10:29:59 +03:00
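The bookkeeping described in this commit can be modeled in a few lines of Python. The class and member names follow the commit message, but this is an illustrative sketch, not the C++ tablet_metadata implementation:

```python
class TabletMetadata:
    """Toy model of co-location bookkeeping in tablet_metadata."""
    def __init__(self):
        self._tablet_maps = {}
        self._table_groups = {}   # base table -> all tables in its group
        self._base_table = {}     # co-located table -> its base table

    def set_tablet_map(self, table, tablet_map):
        self._tablet_maps[table] = tablet_map
        self._table_groups.setdefault(table, {table})
        # Whenever the map is modified, update the pointer for every table
        # in the group so the tablet map remains shared.
        for member in self._table_groups[table]:
            self._tablet_maps[member] = tablet_map

    def set_colocated_table(self, table, base):
        self._base_table[table] = base
        self._table_groups.setdefault(base, {base}).add(table)
        # Share the base table's map pointer rather than copying the map.
        self._tablet_maps[table] = self._tablet_maps[base]
```

Updating the base table's map after a child has been registered propagates to the whole group, which is the invariant the commit describes.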
Michael Litvak
4777444024 tablets: add base_table column to system.tablets
Add a new column base_table to the system.tablets table.

It can be set to point to another table to indicate that the tablets of
this table are co-located with the tablets of the base table.

When it's set, we don't store other tablet information in system.tablets
and in the in-memory tablet map object for this table, and we need to
refer instead to the base table tablet information. The method
get_tablet_map always returns the base tablet map.
2025-07-01 10:29:59 +03:00
Michael Litvak
4e2742a30b docs: update system.tablets schema
The schema of system.tablets in the docs is outdated. Replace it with
the current schema.
2025-07-01 10:29:59 +03:00
Dawid Mędrek
d4315e4fae cql3/statements/describe_statement.cc: Don't copy descriptions
In an upcoming commit, `cql3::description` is going to become
a move-only type. These changes are a prerequisite for it:
we get rid of all places in the file where we copy its instances
and start moving them instead.
2025-06-30 19:12:14 +02:00
Dawid Mędrek
9472da3220 cql3: Use managed_bytes instead of bytes in DESCRIBE
This is a prerequisite for a following commit. We want
to move towards using non-contiguous memory chunks
to avoid making large allocations.

This commit does NOT change the behavior of Scylla
at all. The rows corresponding to the result of a DESCRIBE
statement are represented by an instance of `result_set`.
Before these changes, we encoded descriptions using `bytes`
and then passed them into a `result_set` using its method
`add_row`. What it does is turn the instances of `bytes`
into instances of `managed_bytes` and append them at the
end of its internal vector. In these changes, we do it
on our own and use another overload of the method.
2025-06-30 19:12:14 +02:00
Dawid Mędrek
9cc3d49233 utils/managed_string.hh: Introduce managed_string and fragmented_ostringstream
Currently, we use `managed_bytes` to represent fragmented sequences of bytes.
In some cases, the type corresponds to generic bytes, while in some other cases
-- to strings of actual text. Because of that, it's very easy to get confused
about what use `managed_bytes` serves in a specific piece of code. We should avoid that.

In this commit, we're introducing basic wrappers over `managed_bytes` and
`bytes_ostream` with a promise that they represent UTF-8-encoded strings.
The interfaces of those types are pretty basic, but they should be sufficient
for the most common use: filling a stream with characters and then extracting
a fragmented buffer from it.
2025-06-30 19:12:08 +02:00
Calle Wilund
8d37e5e24b encryption_at_rest_test: Add exception handler to ensure proxy stop
If boost test is run such that we somehow except even in a test macro
such as BOOST_REQUIRE_THROW, we could end up not stopping the net proxy
used, causing a use after free.
2025-06-30 11:36:38 +00:00
Calle Wilund
ee98f5d361 encryption: Ensure stopping timers in provider cache objects
utils::loading_cache has a timer that can, if we're unlucky, be running
while the encryption context/extensions referencing the various host
objects containing them are destroyed in the case of unit testing.

Add a stop phase in encryption context shutdown closing the caches.
2025-06-30 11:36:38 +00:00
Evgeniy Naydanov
8c981354a7 test.py: dtest: make auth_test.py run using test.py
As a part of the porting process, remove unused imports and
markers, remove non-next_gating tests and tests marked with
`required_features("!consistent-topology-changes")` marker.

Remove `test_permissions_caching` test because it's too
flaky when running using test.py

Also, make a few execution-time optimizations:
  - remove redundant `time.sleep(10)`
  - use smaller timeouts for CQL sessions

Enable the test in suite.yaml (run in dev mode only)
2025-06-30 10:16:36 +00:00
Evgeniy Naydanov
e30e2345b7 test.py: dtest: rework wait_for_any_log()
Make the `wait_for_any_log()` function work closer to the original
dtest version: use the `ScyllaLogFile.grep()` method instead of
`ScyllaNode.wait_log_for()` with a small timeout, to
ensure at least one attempt to find a match.

Also, add a `max_count` argument to the `.grep()` method as an
optimization.
2025-06-30 10:16:36 +00:00
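The `max_count` optimization can be sketched as follows. This is a generic illustration of the early-exit behavior, not the actual `ScyllaLogFile.grep()` code:

```python
import re

def grep(lines, expr, max_count=None):
    """Return lines matching the regular expression, stopping early once
    max_count matches have been collected."""
    pattern = re.compile(expr)
    matches = []
    for line in lines:
        if pattern.search(line):
            matches.append(line)
            if max_count is not None and len(matches) >= max_count:
                break  # the caller only needs this many hits
    return matches
```

With `max_count=1` a "did any line match?" check stops at the first hit instead of scanning the whole log file.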
Evgeniy Naydanov
b5d44c763d test.py: dtest: add support for bootstrap parameter for new_node
Technically, `new_node()`'s `bootstrap` parameter is used to mark a node
as a seed if it's False.  In test.py, the seeds parameter is passed when a
node starts, so save it as the `ScyllaNode.bootstrap` attribute for use in
the `ScyllaNode.start()` method.
2025-06-30 10:16:36 +00:00
Evgeniy Naydanov
d0d2171fa4 test.py: dtest: add generate_cluster_topology() function
Copy generate_cluster_topology() function from tools/cluster_topology.py
module.
2025-06-30 10:16:36 +00:00
Evgeniy Naydanov
28d9cdef1b test.py: dtest: add ScyllaNode.set_configuration_options() method
Implement the method using slightly modified `set_configuration_options()`
method of `ScyllaCluster`.
2025-06-30 10:16:36 +00:00
Evgeniy Naydanov
a1ce3aed44 test.py: pylib/manager_client: support batch config changes
Modify ManagerClient.server_update_config() method to change
multiple config options in one call in addition to one `key: value`
pair.  All internal machinery converted to get a values dict as a
parameter.  Type hints were adjusted too.
2025-06-30 10:16:36 +00:00
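One way a single `key: value` call can be folded into the new dict-based API is a small normalization helper. This is a hypothetical sketch of the conversion, not the actual pylib/manager_client code:

```python
def normalize_config_update(key_or_values, value=None):
    """Accept either a single key/value pair or a dict of config options,
    and always return a dict for the batch update path."""
    if isinstance(key_or_values, dict):
        if value is not None:
            raise ValueError("value must not be given together with a dict")
        return dict(key_or_values)
    return {key_or_values: value}
```

All internal machinery then only has to deal with the dict form, which matches the commit's "values dict as a parameter" description.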
Evgeniy Naydanov
ce9fc87648 test.py: dtest: copy unmodified auth_test.py 2025-06-30 10:06:32 +00:00
Evgeniy Naydanov
702409f7b2 test.py: dtest: add missed markers to pytest.ini
`exclude_errors` and `cluster_options` are used in `audit_test.py`
2025-06-30 10:06:32 +00:00
Andrei Chekun
c6c3e9f492 test.py: use unique hostname for Minio
To avoid a situation where the port is occupied on localhost, use a unique
hostname for Minio.
2025-06-30 12:03:06 +02:00
Andrei Chekun
0ca539e162 test.py: Catch possible exceptions during 3rd party services start
With this change, if something goes wrong while starting the services,
they will still be shut down in the finally clause. Without it, the
process can hang forever.
2025-06-30 12:00:23 +02:00
Petr Gusev
35aba76401 LWT: make cas_shard non-optional in sp::cas
We also make the sp::cas_shard function local since it's no longer
used directly by sp clients.
2025-06-30 10:37:33 +02:00
Petr Gusev
3d262d2be8 LWT: create cas_shard in select_statement
In this commit we create cas_shard in select_statement
and pass it to the sp::query_result function.
2025-06-30 10:37:33 +02:00
Petr Gusev
736fa05b17 LWT: create cas_shard in modification and batch statements
We create cas_shard before the shard check to protect
against concurrent tablet migrations.
2025-06-30 10:37:33 +02:00
Petr Gusev
7e64852bfd LWT: create cas_shard in alternator
We create cas_shard instance in shard_for_execute(). This implies that
the decision about the correct shard was made using the specific
token_metadata_guard, and it remains valid only as long as the guard
is held.

When forwarding a request to another shard, we keep the original
cas_shard alive. This ensures that the target shard
remains a valid owner for the given token.

Fixes scylladb/scylladb#17399
2025-06-30 10:37:33 +02:00
Petr Gusev
deb7afbc87 LWT: use cas_shard in storage_proxy::cas
Take a cas_shard parameter in sp::cas and pass token_metadata_guard down to paxos_response_handler.

We make cas_shard parameter optional in storage_proxy methods
to make the refactoring easier. The sp::cas method constructs a new
token_metadata_guard if it's not set. All call sites pass null
in this commit, we will add the proper implementation in the next
commits.
2025-06-30 10:33:17 +02:00
Petr Gusev
94f0717a1e do_query_with_paxos: remove redundant cas_shard check
The same check is done in the sp::cas method.
2025-06-30 10:33:17 +02:00
Petr Gusev
43c4de8ad1 storage_proxy: add cas_shard class
The sp::cas method must be called on the correct shard,
as determined by sp::cas_shard. Additionally, there must
be no asynchronous yields between the shard check and
capturing the erm strong pointer in sp::cas. While
this condition currently holds, it's fragile and
easy to break.

To address this, future commits will move the capture of
token_metadata_guard to the call sites of sp::cas, before
performing the shard check.

As a first step, this commit introduces a cas_shard class
that wraps both the target shard and a token_metadata_guard
instance. This ensures the returned shard remains valid for
the given tablet as long as the guard is held.
In the next commits, we’ll pass a cas_shard instance
to sp::cas as a separate parameter.
2025-06-30 10:33:17 +02:00
Andrzej Jackowski
c8ab5928a3 test: audit: synchronize audit syslog server
In audit tests, UnixDatagramServer is used to receive audit logs.
This commit introduces a synchronization between the logs receiver and
a function that reads already received logs. Without this, there was
a race condition that resulted in test failures (e.g., audit logs were
missing during assertion check).
2025-06-30 09:19:26 +02:00
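The receiver/reader synchronization can be sketched with a condition variable. This is a generic pattern for the race the commit describes; the class and method names here are illustrative, not the actual test helper:

```python
import threading

class LogStore:
    """Thread-safe log store: the datagram server appends, the test reads."""
    def __init__(self):
        self._cond = threading.Condition()
        self._logs = []

    def append(self, line):
        """Called from the receiver thread."""
        with self._cond:
            self._logs.append(line)
            self._cond.notify_all()

    def wait_for(self, count, timeout=5.0):
        """Called from the test: block until `count` logs have arrived."""
        with self._cond:
            ok = self._cond.wait_for(lambda: len(self._logs) >= count, timeout)
            if not ok:
                raise TimeoutError(f"only {len(self._logs)} of {count} logs arrived")
            return list(self._logs)
```

Without the wait, the assertion could read the list before the receiver thread has appended the expected audit lines, which is exactly the flakiness the commit fixes.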
Andrzej Jackowski
fcd88e1e54 docs: audit: update syslog audit format to the current one
The documentation of the syslog audit format was not updated when
scylladb#23099 and earlier audit log changes were introduced.
This commit includes the missing update.
2025-06-30 09:19:25 +02:00
Andrzej Jackowski
422b81018d audit: bring back commas to audit syslog
When the audit syslog format was changed in scylladb#23099, commas were
removed. This made the syslog format inconsistent, as LOGIN audit logs
contained commas while other audit logs did not. Additionally, the lack
of commas was not aligned with the audit documentation.

This commit brings back the use of commas in the audit syslog format
to ensure consistency across all types of audit logs.

Fixes: scylladb#24410
2025-06-30 09:19:25 +02:00
Nadav Har'El
e7257b1393 test/alternator: make "run" script use only_rmw_uses_lwt
Originally (since commit c3da9f2), Alternator's functional test suite
(test/alternator) ran "always_use_lwt" write isolation mode. The original
thinking was that we need to exercise this more difficult mode and it's
the most important mode. This mode was originally chosen in
test/alternator/run.

However, starting with commit 76a766c (a year ago), test.py no longer
runs test/alternator/run. Instead, it runs Scylla itself, and the options
for running Scylla appear in test/alternator/suite.yaml, and accidentally
the write isolation mode only_rmw_uses_lwt was chosen there.

The purpose of this patch is to reconcile this difference and use the
same mode in test.py (which CI is using) and test/alternator/run (which
is only used by some developers, during development).

I decided to have this patch change test/alternator/run to use
only_rmw_uses_lwt. As noted above, this is anyway how all Alternator
tests have been running in CI in the past year (through test.py).
Also, the mode only_rmw_uses_lwt makes running the Alternator test
suite slightly faster (52 seconds instead of 58 seconds, on my laptop)
which is always nice for developers.

This patch changes nothing for testing in CI - only manual runs through
test/alternator/run are affected.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-06-29 13:58:58 +03:00
Nadav Har'El
3fd2493bc9 test/alternator: improve tests for write isolation modes
Before this patch, we had in test_condition_expression.py and
test_update_expression.py some rudimentary tests that the different
write isolation modes behave as expected. Basically, we wanted to test
that read-modify-write (RMW) operations are recognized and forbidden
in forbid_rmw mode, but work correctly in the three other modes.
We only check non-concurrent writes, so the actual write isolation is
NOT checked, just the correctness of non-concurrent writes.

However, since these tests were split across several files, and many
of the tests just ran other existing tests in different write isolation
modes, it was hard to see what exactly was being tested, and what was
missed. And indeed we missed checking some RMW operations, such as
requests with ReturnValues, requests with the older Expected or
AttributeUpdates (only the newer ConditionExpression and UpdateExpression
were tested), and ADD and DELETE operations in UpdateExpression.

So this patch replaces the existing partial tests with a new test file
test_write_isolation.py dedicated to testing all kinds of RMW operations
in one place, and how they don't work in forbid_rmw and do work in
the other modes. Writing all these tests in one place made it easier
to create a really exhaustive test of all the different operations and
optional parameters, and conversely - make sure that we don't test
*unnecessary* things such as different ConditionExpression expressions
(we already have 1800 lines of tests for ConditionExpression, and the
actual content of the condition is unrelated to write isolation modes).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-06-29 13:58:38 +03:00
Petr Gusev
aa970bf2e4 sp::cas_shard: rename to get_cas_shard
We intend to introduce a separate cas_shard
class in the next commits. We rename the existing
function here to avoid conflicts.
2025-06-18 11:51:48 +02:00
Petr Gusev
85eac7c34e token_metadata_guard: a topology guard for a token
Data-plane requests typically hold a strong pointer to the
effective_replication_map (ERM) to protect against tablet
migrations and other topology operations. This works because
major steps in the topology coordinator use global barriers.
These barriers install a new token_metadata version on
each shard and wait for all references to the old one to
be dropped. Since the ERM holds a strong pointer to
token_metadata, it effectively blocks these operations
until it's released.

For LWT, we usually deal with a single token within a
single tablet. In such cases, it's enough to block
topology changes for just that one tablet. The existing
tablet_metadata_guard class already supports this: it tracks
tablet-specific changes and updates the ERM pointer
automatically, unless the change affects the guarded
tablet. However, this only works for tablet-aware tables.

To support LWT with vnodes (i.e., non-tablet-aware tables),
this commit introduces a new token_metadata_guard class.
It wraps tablet_metadata_guard when the table uses tablets,
and falls back to holding a plain strong ERM pointer otherwise.

In the next commits, we’ll migrate LWT to use token_metadata_guard
in paxos_response_handler instead of erm.
2025-06-18 11:51:48 +02:00
Petr Gusev
73221aa7b1 tablet_metadata_guard: mark as noncopyable and nonmoveable
tablet_metadata_guard passes a raw pointer to get_validity_abort_source,
so it can't be easily copied or moved. In this commit we make this
explicit.

We define destructor in cpp -- the autogenerated one complains on
lw_shared_ptr<replica::table> as replica::table is only
forward-declared in the headers.
2025-06-18 11:50:46 +02:00
Aleksandra Martyniuk
83c9af9670 test: add test for repair and resize finalization
Add test that checks whether repair does not start if there is an
ongoing resize finalization.
2025-06-11 16:17:39 +02:00
Aleksandra Martyniuk
df152d9824 repair: postpone repair until topology is not busy
Currently, repair_service::repair_tablets starts repair if there
is no ongoing tablet operations. The check does not consider global
topology operations, like tablet resize finalization. This may cause
a data race and unexpected behavior.

Start repair when topology is not busy.
2025-06-11 15:38:43 +02:00
867 changed files with 35515 additions and 17365 deletions

View File

@@ -52,7 +52,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
if is_draft:
backport_pr.add_to_labels("conflicts")
pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
pr_comment += "Please resolve them and mark this PR as ready for review"
backport_pr.create_issue_comment(pr_comment)
logging.info(f"Assigned PR to original author: {pr.user}")
return backport_pr
@@ -126,31 +126,20 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
def with_github_keyword_prefix(repo, pr):
# GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
# JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
# Check PR body for GitHub issues
github_match = re.findall(github_pattern, pr.body, re.IGNORECASE)
# Check PR body for JIRA issues
jira_match = re.findall(jira_pattern, pr.body, re.IGNORECASE)
match = github_match or jira_match
if match:
pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
match = re.findall(pattern, pr.body, re.IGNORECASE)
if not match:
for commit in pr.get_commits():
match = re.findall(pattern, commit.commit.message, re.IGNORECASE)
if match:
print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
break
if not match:
print(f'No valid close reference for {pr.number}')
return False
else:
return True
for commit in pr.get_commits():
github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
if github_match or jira_match:
print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
return True
print(f'No valid close reference for {pr.number}')
return False
def main():
args = parse_args()
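The hunk above splits the close-reference check into separate GitHub and JIRA patterns, each tried against the PR body and then against commit messages. A minimal standalone sketch of how the two `findall` passes behave (simplified: `repo.full_name` is hard-coded as `scylladb/scylladb`, and the sample body is made up for illustration):

```python
import re

# Patterns mirroring the hunk above; "scylladb/scylladb" stands in for
# repo.full_name.
full_name = "scylladb/scylladb"
github_pattern = (rf"(?:fix(?:|es|ed))\s*:?\s*"
                  rf"(?:(?:(?:{full_name})?#)|https://github\.com/{full_name}/issues/)(\d+)")
jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"

body = "Fixes #24946\nFixes: PKG-92"
github_match = re.findall(github_pattern, body, re.IGNORECASE)
jira_match = re.findall(jira_pattern, body, re.IGNORECASE)
# A PR "has a valid close reference" if either list is non-empty.
has_reference = bool(github_match or jira_match)
```

Note that each pattern captures only the issue key (`24946`, `PKG-92`), so the lists can be logged or cross-checked against the tracker.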

View File

@@ -18,7 +18,7 @@ jobs:
// Regular expression pattern to check for "Fixes" prefix
// Adjusted to dynamically insert the repository full name
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
const regex = new RegExp(pattern);
if (!regex.test(body)) {

View File

@@ -1,53 +0,0 @@
name: Backport with Jira Integration
on:
push:
branches:
- master
- next-*.*
- branch-*.*
pull_request_target:
types: [labeled, closed]
branches:
- master
- next
- next-*.*
- branch-*.*
jobs:
backport-on-push:
if: github.event_name == 'push'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'push'
base_branch: ${{ github.ref }}
commits: ${{ github.event.before }}..${{ github.sha }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-on-label:
if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'labeled'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
head_commit: ${{ github.event.pull_request.base.sha }}
label_name: ${{ github.event.label.name }}
pr_state: ${{ github.event.pull_request.state }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-chain:
if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'chain'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
pr_body: ${{ github.event.pull_request.body }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,11 @@
name: Call Jira Status In Progress
on:
pull_request:
types: [opened]
jobs:
call-jira-status-in-progress:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main
secrets:
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,11 @@
name: Call Jira Status In Review
on:
pull_request:
types: [ready_for_review, review_requested]
jobs:
call-jira-status-in-review:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main
secrets:
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,13 @@
name: Call Jira Status Ready For Merge
on:
pull_request:
types: [labeled]
jobs:
call-jira-status-update:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main
with:
label_name: 'status/merge_candidate'
secrets:
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -10,7 +10,7 @@ on:
- 'master'
- 'branch-*'
schedule:
- cron: '0 10 * * 1,4' # Runs every Monday and Thursday at 10:00am
- cron: '0 10 * * 1' # Runs every Monday at 10:00am
jobs:
notify_conflict_prs:
@@ -42,10 +42,10 @@ jobs:
const recentPrs = prs.filter(pr => new Date(pr.created_at) >= twoMonthsAgo);
const validBaseBranches = ['master'];
const branchPrefix = 'branch-';
const threeDaysAgo = new Date();
const oneWeekAgo = new Date();
const conflictLabel = 'conflicts';
threeDaysAgo.setDate(threeDaysAgo.getDate() - 3);
console.log(`Three days ago: ${threeDaysAgo.toISOString()}`);
oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
console.log(`One week ago: ${oneWeekAgo.toISOString()}`);
for (const pr of recentPrs) {
console.log(`Checking PR #${pr.number} on base branch '${pr.base.ref}'`);
@@ -57,8 +57,8 @@ jobs:
}
const updatedDate = new Date(pr.updated_at);
console.log(`PR #${pr.number} last updated at: ${updatedDate.toISOString()}`);
if (!isPushEvent && updatedDate >= threeDaysAgo) {
console.log(`PR #${pr.number} skipped: updated within last 3 days`);
if (!isPushEvent && updatedDate >= oneWeekAgo) {
console.log(`PR #${pr.number} skipped: updated within last week`);
continue;
}
if (pr.assignee === null) {
@@ -90,34 +90,41 @@ jobs:
const hasConflictLabel = pr.labels.some(label => label.name === conflictLabel);
console.log(`PR #${pr.number} has conflict label: ${hasConflictLabel}`);
// Fetch comments to check for existing notifications
const comments = await github.paginate(github.rest.issues.listComments, {
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: pr.number,
per_page: 100,
});
// Find last notification comment from the bot
const notificationPrefix = `@${pr.assignee.login}, this PR has merge conflicts with the base branch.`;
const lastNotification = comments
.filter(c =>
c.user.type === "Bot" &&
c.body.startsWith(notificationPrefix)
)
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
// Check if we should skip notification based on recent notification
let shouldSkipNotification = false;
if (lastNotification) {
const lastNotified = new Date(lastNotification.created_at);
if (lastNotified >= oneWeekAgo) {
console.log(`PR #${pr.number} skipped: last notification was less than 1 week ago`);
shouldSkipNotification = true;
}
}
// Additional check for push events on draft PRs with conflict labels
if (
isPushEvent &&
pr.draft === true &&
hasConflictLabel
hasConflictLabel &&
shouldSkipNotification
) {
// Fetch comments to find last bot notification
const comments = await github.paginate(github.rest.issues.listComments, {
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: pr.number,
per_page: 100,
});
// Find last notification comment from the bot (by body and user)
const botLogin = context.actor;
const notificationPrefix = `@${pr.assignee.login}, this PR has merge conflicts with the base branch.`;
const lastNotification = comments
.filter(c =>
c.user.type === "Bot" &&
c.body.startsWith(notificationPrefix)
)
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
if (lastNotification) {
const lastNotified = new Date(lastNotification.created_at);
if (lastNotified >= threeDaysAgo) {
console.log(`PR #${pr.number} skipped: last notification was less than 3 days ago`);
continue;
}
}
continue;
}
if (!hasConflictLabel) {
@@ -129,8 +136,9 @@ jobs:
});
console.log(`Added 'conflicts' label to PR #${pr.number}`);
}
const assignee = pr.assignee.login;
if (assignee) {
if (assignee && !shouldSkipNotification) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,

View File

@@ -2,7 +2,7 @@ name: Urgent Issue Reminder
on:
schedule:
- cron: '10 8 * * 1' # Runs every Monday at 8 AM
- cron: '10 8 * * *' # Runs daily at 8 AM
jobs:
reminder:

.gitignore vendored
View File

@@ -35,3 +35,5 @@ compile_commands.json
.envrc
clang_build
.idea/
nuke
rust/target

.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../scylla-seastar
url = ../seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -171,7 +171,6 @@ target_sources(scylla-main
client_data.cc
clocks-impl.cc
collection_mutation.cc
compress.cc
converting_mutation_partition_applier.cc
counters.cc
sstable_dict_autotrainer.cc
@@ -181,7 +180,7 @@ target_sources(scylla-main
generic_server.cc
debug.cc
init.cc
keys.cc
keys/keys.cc
multishard_mutation_query.cc
mutation_query.cc
node_ops/task_manager_module.cc

View File

@@ -1,9 +1,6 @@
This project includes code developed by the Apache Software Foundation (http://www.apache.org/),
especially Apache Cassandra.
It includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
These files are located in utils/arch/powerpc/crc32-vpmsum. Their license may be found in licenses/LICENSE-crc32-vpmsum.TXT.
It includes modified code from https://gitbox.apache.org/repos/asf?p=cassandra-dtest.git (owned by The Apache Software Foundation)
It includes modified tests from https://github.com/etcd-io/etcd.git (owned by The etcd Authors)

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2025.3.9
VERSION=2025.4.0-dev
if test -f version
then

View File

@@ -167,4 +167,8 @@ future<> controller::request_stop_server() {
});
}
future<utils::chunked_vector<client_data>> controller::get_client_data() {
return _server.local().get_client_data();
}
}

View File

@@ -90,6 +90,10 @@ public:
virtual future<> start_server() override;
virtual future<> stop_server() override;
virtual future<> request_stop_server() override;
// This virtual function is called (on each shard separately) when the
// virtual table "system.clients" is read. It is expected to generate a
// list of clients connected to this server (on this shard).
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
};
}

View File

@@ -247,7 +247,19 @@ void executor::supplement_table_info(rjson::value& descr, const schema& schema,
// bytes (dash and UUID), and since directory names are limited to 255 bytes,
// we need to limit table names to 222 bytes, instead of 255.
// See https://github.com/scylladb/scylla/issues/4480
static constexpr int max_table_name_length = 222;
// We actually have two limits here,
// * max_table_name_length is the limit that Alternator will impose on names
// of new Alternator tables.
// * max_auxiliary_table_name_length is the potentially higher absolute limit
// that Scylla imposes on the names of auxiliary tables that Alternator
// wants to create internally - i.e. materialized views or CDC log tables.
// The second limit might mean that it is not possible to add a GSI to an
// existing table, because the name of the new auxiliary table may go over
// the limit. The second limit is also one of the reasons why the first limit
// is set lower than 222 - to have room to enable streams which add the extra
// suffix "_scylla_cdc_log" to the table name.
static constexpr int max_table_name_length = 192;
static constexpr int max_auxiliary_table_name_length = 222;
static bool valid_table_name_chars(std::string_view name) {
for (auto c : name) {
@@ -263,11 +275,16 @@ static bool valid_table_name_chars(std::string_view name) {
return true;
}
// validate_table_name() validates the TableName parameter in a request - it
// should only be called in CreateTable or when a request looking for an
// existing table failed to find it. validate_table_name() throws the
// appropriate api_error if this validation fails.
// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules
// specifies that table names "names must be between 3 and 255 characters long
// and can contain only the following characters: a-z, A-Z, 0-9, _ (underscore), - (dash), . (dot)
// validate_table_name throws the appropriate api_error if this validation fails.
static void validate_table_name(const std::string& name) {
// specifies that table "names must be between 3 and 255 characters long and
// can contain only the following characters: a-z, A-Z, 0-9, _ (underscore),
// - (dash), . (dot)". However, Alternator only allows max_table_name_length
// characters (see above) - not 255.
static void validate_table_name(std::string_view name) {
if (name.length() < 3 || name.length() > max_table_name_length) {
throw api_error::validation(
format("TableName must be at least 3 characters long and at most {} characters long", max_table_name_length));
@@ -278,6 +295,27 @@ static void validate_table_name(const std::string& name) {
}
}
// Validate that a CDC log table could be created for the base table with a
// given table_name, and if not, throw a user-visible api_error::validation.
// It is not possible to create a CDC log table if the table name is so long
// that adding the 15-character suffix "_scylla_cdc_log" (cdc_log_suffix)
// makes it go over max_auxiliary_table_name_length.
// Note that if max_table_name_length is set to less than 207 (which is
// max_auxiliary_table_name_length-15), then this function will never
// fail. However, it's still important to call it in UpdateTable, in case
// we have pre-existing tables with names longer than this to avoid #24598.
static void validate_cdc_log_name_length(std::string_view table_name) {
if (cdc::log_name(table_name).length() > max_auxiliary_table_name_length) {
// CDC will add cdc_log_suffix ("_scylla_cdc_log") to the table name
// to create its log table, and this will exceed the maximum allowed
// length. To provide a more helpful error message, we assume that
// cdc::log_name() always adds a suffix of the same length.
int suffix_len = cdc::log_name(table_name).length() - table_name.length();
throw api_error::validation(fmt::format("Streams cannot be enabled to a table whose name is longer than {} characters: {}",
max_auxiliary_table_name_length - suffix_len, table_name));
}
}
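The comments above pin down some arithmetic: the CDC log suffix "_scylla_cdc_log" is 15 characters, so with max_auxiliary_table_name_length at 222, any base table name longer than 207 characters cannot enable streams, while the new 192-character creation limit always leaves room. A small Python sketch of that check (constants copied from the hunk; the function name is made up):

```python
# Length arithmetic from the hunk above.
CDC_LOG_SUFFIX = "_scylla_cdc_log"   # 15 characters appended by cdc::log_name()
MAX_AUX_TABLE_NAME_LENGTH = 222      # absolute Scylla-side limit
MAX_TABLE_NAME_LENGTH = 192          # limit imposed on new Alternator tables

def can_enable_streams(table_name: str) -> bool:
    # Mirrors validate_cdc_log_name_length(): the CDC log table name is the
    # base name plus the suffix, and must fit within the auxiliary limit.
    return len(table_name) + len(CDC_LOG_SUFFIX) <= MAX_AUX_TABLE_NAME_LENGTH
```

So a name at the new 192-character creation limit still fits (192 + 15 = 207 ≤ 222), while a pre-existing 222-character name does not — the case that motivated calling the check in UpdateTable as well (#24598).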
// In DynamoDB index names are local to a table, while in Scylla, materialized
// view names are global (in a keyspace). So we need to compose a unique name
// for the view taking into account both the table's name and the index name.
@@ -296,10 +334,10 @@ static std::string view_name(std::string_view table_name, std::string_view index
fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
}
std::string ret = std::string(table_name) + delim + std::string(index_name);
if (ret.length() > max_table_name_length && validate_len) {
if (ret.length() > max_auxiliary_table_name_length && validate_len) {
throw api_error::validation(
fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
table_name, index_name, max_table_name_length - delim.size()));
table_name, index_name, max_auxiliary_table_name_length - delim.size()));
}
return ret;
}
@@ -345,15 +383,19 @@ schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::valu
if (!table_name) {
return nullptr;
}
return find_table(proxy, *table_name);
}
schema_ptr executor::find_table(service::storage_proxy& proxy, std::string_view table_name) {
try {
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(*table_name), *table_name);
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(table_name), table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name.value());
validate_table_name(table_name);
throw api_error::resource_not_found(
fmt::format("Requested resource not found: Table: {} not found", *table_name));
fmt::format("Requested resource not found: Table: {} not found", table_name));
}
}
@@ -367,24 +409,39 @@ schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request)
return schema;
}
static std::tuple<bool, std::string_view, std::string_view> try_get_internal_table(data_dictionary::database db, std::string_view table_name) {
// try_get_internal_table() handles the special case that the given table_name
// begins with INTERNAL_TABLE_PREFIX (".scylla.alternator."). In that case,
// this function assumes that the rest of the name refers to an internal
// Scylla table (e.g., system table) and returns the schema of that table -
// or an exception if it doesn't exist. Otherwise, if table_name does not
// start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr
// and the caller should look for a normal Alternator table with that name.
static schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) {
size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX);
if (it != 0) {
return {false, "", ""};
return schema_ptr{};
}
table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size());
size_t delim = table_name.find_first_of('.');
if (delim == std::string_view::npos) {
return {false, "", ""};
return schema_ptr{};
}
std::string_view ks_name = table_name.substr(0, delim);
table_name.remove_prefix(ks_name.size() + 1);
// Only internal keyspaces can be accessed to avoid leakage
auto ks = db.try_find_keyspace(ks_name);
if (!ks || !ks->is_internal()) {
return {false, "", ""};
return schema_ptr{};
}
return {true, ks_name, table_name};
try {
return db.find_schema(ks_name, table_name);
} catch (data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(
fmt::format("Requested resource not found: Internal table: {}.{} not found", ks_name, table_name));
}
}
// get_table_or_view() is similar to to get_table(), except it returns either
@@ -397,18 +454,8 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
table_or_view_type type = table_or_view_type::base;
std::string table_name = get_table_name(request);
auto [is_internal_table, internal_ks_name, internal_table_name] = try_get_internal_table(proxy.data_dictionary(), table_name);
if (is_internal_table) {
try {
return { proxy.data_dictionary().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
} catch (data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(
fmt::format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
}
if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) {
return {s, type};
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
@@ -451,6 +498,24 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
}
}
// get_table_for_write() is similar to get_table(), but additionally, if the
// configuration allows this, may also allow writing to system table with
// prefix INTERNAL_TABLE_PREFIX. This is analogous to the function
// get_table_or_view() above which allows *reading* internal tables.
static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson::value& request) {
std::string table_name = get_table_name(request);
if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) {
if (!proxy.data_dictionary().get_config().alternator_allow_system_table_write()) {
throw api_error::resource_not_found(fmt::format(
"Table {} is an internal table, and writing to it is forbidden"
" by the alternator_allow_system_table_write configuration",
table_name));
}
return s;
}
return executor::find_table(proxy, table_name);
}
// Convenience function for getting the value of a string attribute, or a
// default value if it is missing. If the attribute exists, but is not a
// string, a descriptive api_error is thrown.
@@ -780,6 +845,23 @@ future<> verify_permission(
if (!enforce_authorization) {
co_return;
}
// Unfortunately, the fix for issue #23218 did not modify the function
// that we use here - check_has_permissions(). So if we want to allow
// writes to internal tables (from try_get_internal_table()) only to a
// superuser, we need to explicitly check it here.
if (permission_to_check == auth::permission::MODIFY && is_internal_keyspace(schema->ks_name())) {
if (!client_state.user() ||
!client_state.user()->name ||
!co_await client_state.get_auth_service()->underlying_role_manager().is_superuser(*client_state.user()->name)) {
sstring username = "<anonymous>";
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
throw api_error::access_denied(fmt::format(
"Write access denied on internal table {}.{} to role {} because it is not a superuser",
schema->ks_name(), schema->cf_name(), username));
}
}
auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
if (!co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) {
sstring username = "<anonymous>";
@@ -818,9 +900,6 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
elogger.trace("Deleting table {}", request);
std::string table_name = get_table_name(request);
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
tracing::add_table_name(trace_state, keyspace_name, table_name);
@@ -836,10 +915,13 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
std::optional<data_dictionary::table> tbl = p.local().data_dictionary().try_find_table(keyspace_name, table_name);
if (!tbl) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
}
auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy.local_db(), keyspace_name, group0_guard.write_timestamp());
std::move(m2.begin(), m2.end(), std::back_inserter(m));
@@ -1562,7 +1644,9 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
executor::add_stream_options(*stream_specification, builder, sp);
if (executor::add_stream_options(*stream_specification, builder, sp)) {
validate_cdc_log_name_length(builder.cf_name());
}
}
// Parse the "Tags" parameter early, so we can avoid creating the table
@@ -1599,7 +1683,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
for (;;) {
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
std::vector<mutation> schema_mutations;
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features());
// Alternator Streams doesn't yet work when the table uses tablets (#16317)
if (stream_specification && stream_specification->IsObject()) {
@@ -1624,20 +1708,13 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
// This should never happen, the ID is supposed to be unique
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
}
co_await service::prepare_new_column_family_announcement(schema_mutations, sp, *ksm, schema, ts);
std::vector<schema_ptr> schemas;
schemas.push_back(schema);
for (schema_builder& view_builder : view_builders) {
view_ptr view(view_builder.build());
db::schema_tables::add_table_or_view_to_schema_mutation(
view, ts, true, schema_mutations);
// add_table_or_view_to_schema_mutation() is a low-level function that
// doesn't call the callbacks that prepare_new_view_announcement()
// calls. So we need to call this callback here :-( If we don't, among
// other things *tablets* will not be created for the new view.
// These callbacks need to be called in a Seastar thread.
co_await seastar::async([&sp, &ksm, &view, &schema_mutations, ts] {
return sp.local_db().get_notifier().before_create_column_family(*ksm, *view, schema_mutations, ts);
});
schemas.push_back(view_builder.build());
}
co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
// If a role is allowed to create a table, we must give it permissions to
// use (and eventually delete) the specific table it just created (and
// also the view tables). This is known as "auto-grant".
@@ -1760,7 +1837,9 @@ future<executor::request_return_type> executor::update_table(client_state& clien
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
empty_request = false;
add_stream_options(*stream_specification, builder, p.local());
if (add_stream_options(*stream_specification, builder, p.local())) {
validate_cdc_log_name_length(builder.cf_name());
}
// Alternator Streams doesn't yet work when the table uses tablets (#16317)
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool()) {
@@ -2288,7 +2367,7 @@ static lw_shared_ptr<query::read_command> previous_item_read_command(service::st
// wildcard selection...) but here we read the entire item anyway. We
// should take the column list from selection instead of building it here.
auto regular_columns =
schema->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
schema->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
@@ -2350,7 +2429,7 @@ rmw_operation::parse_returnvalues_on_condition_check_failure(const rjson::value&
rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& request)
: _request(std::move(request))
, _schema(get_table(proxy, _request))
, _schema(get_table_for_write(proxy, _request))
, _write_isolation(get_write_isolation_for_schema(_schema))
, _consumed_capacity(_request)
, _returnvalues(parse_returnvalues(_request))
@@ -2377,9 +2456,21 @@ std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::re
}
rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(schema_ptr schema) {
const auto& tags = get_tags_of_table_or_throw(schema);
auto it = tags.find(WRITE_ISOLATION_TAG_KEY);
if (it == tags.end() || it->second.empty()) {
const auto tags_ptr = db::get_tags_of_table(schema);
if (!tags_ptr) {
// Tags missing entirely from this table. This can't happen for a
// normal Alternator table, but can happen if get_table_for_write()
// allowed writing to a non-Alternator table (e.g., an internal table).
// If it is a system table, LWT will not work (and is also pointless
// for non-distributed tables), so use UNSAFE_RMW.
if(is_internal_keyspace(schema->ks_name())) {
return write_isolation::UNSAFE_RMW;
} else {
return default_write_isolation;
}
}
auto it = tags_ptr->find(WRITE_ISOLATION_TAG_KEY);
if (it == tags_ptr->end() || it->second.empty()) {
return default_write_isolation;
}
return parse_write_isolation(it->second);
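The hunk above makes get_write_isolation_for_schema() tolerate tables that have no tags at all, which is now reachable through get_table_for_write(). A hedged Python sketch of the resulting fallback order (the tag-key constant and the isolation-mode strings are assumptions, not taken from the source):

```python
# Hypothetical tag key; stands in for WRITE_ISOLATION_TAG_KEY in the hunk.
WRITE_ISOLATION_TAG_KEY = "system:write_isolation"

def write_isolation_for(tags, is_internal_keyspace, default="lwt_always"):
    if tags is None:
        # Tags missing entirely: only reachable when get_table_for_write()
        # allowed writing to a non-Alternator table. For internal (system)
        # tables LWT will not work and is pointless, so use unsafe RMW.
        return "unsafe_rmw" if is_internal_keyspace else default
    value = tags.get(WRITE_ISOLATION_TAG_KEY, "")
    # Missing or empty tag value falls back to the configured default.
    return value if value else default
```

The point of the change is the first branch: previously a missing tag map would have thrown, whereas now only the tag *value* being absent falls through to the default.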
@@ -2388,12 +2479,15 @@ rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(sch
// shard_for_execute() checks whether execute() must be called on a specific
// other shard. Running execute() on a specific shard is necessary only if it
// will use LWT (storage_proxy::cas()). This is because cas() can only be
// called on the specific shard owning (as per cas_shard()) _pk's token.
// called on the specific shard owning (as per get_cas_shard()) _pk's token.
// Knowing if execute() will call cas() or not may depend on whether there is
// a read-before-write, but not just on it - depending on configuration,
// execute() may unconditionally use cas() for every write. Unfortunately,
// this requires duplicating here a bit of logic from execute().
std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_write) {
// The returned cas_shard must be passed to execute() to ensure
// the tablet shard won't change. The caller must hold the returned object for
// the duration of execution, even if we were already on the right shard - so it doesn't move.
std::optional<service::cas_shard> rmw_operation::shard_for_execute(bool needs_read_before_write) {
if (_write_isolation == write_isolation::FORBID_RMW ||
(_write_isolation == write_isolation::LWT_RMW_ONLY && !needs_read_before_write) ||
_write_isolation == write_isolation::UNSAFE_RMW) {
@@ -2401,12 +2495,8 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
}
// If we're still here, cas() *will* be called by execute(), so let's
// find the appropriate shard to run it on:
auto token = dht::get_token(*_schema, _pk);
auto desired_shard = service::storage_proxy::cas_shard(*_schema, token);
if (desired_shard == this_shard_id()) {
return {};
}
return desired_shard;
const auto token = dht::get_token(*_schema, _pk);
return service::cas_shard(*_schema, token);
}
// Build the return value from the different RMW operations (UpdateItem,
@@ -2451,6 +2541,7 @@ static future<std::unique_ptr<rjson::value>> get_previous_item(
}
future<executor::request_return_type> rmw_operation::execute(service::storage_proxy& proxy,
std::optional<service::cas_shard> cas_shard,
service::client_state& client_state,
tracing::trace_state_ptr trace_state,
service_permit permit,
@@ -2473,7 +2564,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
if (!m) {
return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
}
return proxy.mutate(std::vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this,&wcu_total] () mutable {
return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this,&wcu_total] () mutable {
return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
});
});
@@ -2481,10 +2572,13 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
} else if (_write_isolation != write_isolation::LWT_ALWAYS) {
std::optional<mutation> m = apply(nullptr, api::new_timestamp());
SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
return proxy.mutate(std::vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this, &wcu_total] () mutable {
return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this, &wcu_total] () mutable {
return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
});
}
if (!cas_shard) {
on_internal_error(elogger, "cas_shard is not set");
}
// If we're still here, we need to do this write using LWT:
global_stats.write_using_lwt++;
per_table_stats.write_using_lwt++;
@@ -2493,7 +2587,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
auto read_command = needs_read_before_write ?
previous_item_read_command(proxy, schema(), _ck, selection) :
nullptr;
return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
return proxy.cas(schema(), std::move(*cas_shard), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout).then([this, read_command, &wcu_total] (bool is_applied) mutable {
if (!is_applied) {
@@ -2618,18 +2712,19 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.put_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
co_return co_await container().invoke_on(*shard, _ssg,
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
(service::client_state& client_state) mutable {
//FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
// to another shard - once it is solved, this place can use a similar solution. Instead of passing
// empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
// so that it's destructed only after all background operations are finished as well.
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.put_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
});
});
@@ -2637,7 +2732,7 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
per_table_stats->api_operations.put_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
per_table_stats->wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
per_table_stats->api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -2720,26 +2815,27 @@ future<executor::request_return_type> executor::delete_item(client_state& client
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.delete_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
per_table_stats->shard_bounce_for_lwt++;
co_return co_await container().invoke_on(*shard, _ssg,
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
(service::client_state& client_state) mutable {
//FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
// to another shard - once it is solved, this place can use a similar solution. Instead of passing
// empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
// so that it's destructed only after all background operations are finished as well.
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.delete_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
});
});
}
per_table_stats->api_operations.delete_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
per_table_stats->wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
per_table_stats->api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -2749,10 +2845,12 @@ future<executor::request_return_type> executor::delete_item(client_state& client
static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
sstring table_name = batch_request->name.GetString(); // JSON keys are always strings
validate_table_name(table_name);
try {
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns a validation error, not a not-found error, when the
// table does not exist and its name is also invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
}
}
@@ -2798,11 +2896,11 @@ public:
}
};
static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, service::cas_shard cas_shard, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit) {
auto timeout = executor::default_timeout();
auto op = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
return proxy.cas(schema, op, nullptr, to_partition_ranges(dk),
return proxy.cas(schema, std::move(cas_shard), op, nullptr, to_partition_ranges(dk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
timeout, timeout).discard_result();
@@ -2848,7 +2946,7 @@ static future<> do_batch_write(service::storage_proxy& proxy,
});
if (!needs_lwt) {
// Do a normal write, without LWT:
std::vector<mutation> mutations;
utils::chunked_vector<mutation> mutations;
mutations.reserve(mutation_builders.size());
api::timestamp_type now = api::new_timestamp();
for (auto& b : mutation_builders) {
@@ -2874,12 +2972,12 @@ static future<> do_batch_write(service::storage_proxy& proxy,
}
return parallel_for_each(std::move(key_builders), [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (auto& e) {
stats.write_using_lwt++;
auto desired_shard = service::storage_proxy::cas_shard(*e.first.schema, e.first.dk.token());
if (desired_shard == this_shard_id()) {
return cas_write(proxy, e.first.schema, e.first.dk, std::move(e.second), client_state, trace_state, permit);
auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
if (desired_shard.this_shard()) {
return cas_write(proxy, e.first.schema, std::move(desired_shard), e.first.dk, std::move(e.second), client_state, trace_state, permit);
} else {
stats.shard_bounce_for_lwt++;
return proxy.container().invoke_on(desired_shard, ssg,
return proxy.container().invoke_on(desired_shard.shard(), ssg,
[cs = client_state.move_to_other_shard(),
mb = e.second,
dk = e.first.dk,
@@ -2892,13 +2990,19 @@ static future<> do_batch_write(service::storage_proxy& proxy,
trace_state = tracing::trace_state_ptr(gt)]
(service::client_state& client_state) mutable {
auto schema = proxy.data_dictionary().find_schema(ks, cf);
//FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
// to another shard - once it is solved, this place can use a similar solution. Instead of passing
// empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
// so that it's destructed only after all background operations are finished as well.
return cas_write(proxy, schema, dk, std::move(mb), client_state, std::move(trace_state), empty_service_permit());
// The desired_shard on the original shard remains alive for the duration
// of cas_write on this shard and prevents any tablet operations.
// However, we need a local instance of cas_shard on this shard
// to pass it to sp::cas, so we just create a new one.
service::cas_shard cas_shard(*schema, dk.token());
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return cas_write(proxy, schema, std::move(cas_shard), dk, std::move(mb), client_state, std::move(trace_state), empty_service_permit());
});
});
}).finally([desired_shard = std::move(desired_shard)]{});
}
});
}
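The hunks above repeatedly apply one pattern: an LWT (CAS) request must run on the shard that owns the partition's token, so a request arriving on the wrong shard is "bounced" with `invoke_on()`, while a request already on the owning shard runs locally. A loose, standalone sketch of that decision follows; the real mapping comes from `service::cas_shard` and the effective replication map, so the modulo hash and fixed shard count here are illustrative assumptions only:

```cpp
#include <cstdint>
#include <optional>

// Assumption for illustration: tokens map to shards by simple modulo.
constexpr unsigned shard_count = 4;

struct cas_shard_sketch {
    unsigned owner;
    explicit cas_shard_sketch(uint64_t token) : owner(token % shard_count) {}
    bool this_shard(unsigned current) const { return owner == current; }
    unsigned shard() const { return owner; }
};

// Mirrors the shape of shard_for_execute(): nullopt means "no bounce
// needed, run the CAS locally"; a value means "invoke_on(that shard)".
std::optional<unsigned> shard_for_execute_sketch(uint64_t token, unsigned current) {
    cas_shard_sketch s(token);
    if (s.this_shard(current)) {
        return std::nullopt;
    }
    return s.shard();
}
```

Note how this matches the call sites in the diff: the caller checks whether a bounce is needed, decrements its local stats counter, and re-invokes the same operation on the owning shard.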
@@ -3409,16 +3513,16 @@ future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schem
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
noncopyable_function<void(uint64_t)> item_callback) {
uint64_t& rcu_half_units) {
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
auto result_set = builder.build();
std::vector<rjson::value> ret;
for (auto& result_row : result_set->rows()) {
rjson::value item = rjson::empty_object();
uint64_t item_length_in_bytes = 0;
describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes);
item_callback(item_length_in_bytes);
rcu_consumed_capacity_counter consumed_capacity;
describe_single_item(*selection, result_row, *attrs_to_get, item, &consumed_capacity._total_bytes);
rcu_half_units += consumed_capacity.get_half_units();
ret.push_back(std::move(item));
co_await coroutine::maybe_yield();
}
@@ -4009,6 +4113,9 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
}
}
}
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
_return_attributes = std::move(*previous_item);
}
if (_attribute_updates) {
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
// Note that it.key() is the name of the column, *it is the operation
@@ -4118,9 +4225,6 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
// don't report the new item in the returned Attributes.
_return_attributes = rjson::null_value();
}
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
_return_attributes = std::move(*previous_item);
}
// ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
// even if a new item was created. Instead it should be missing entirely.
if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
@@ -4143,18 +4247,19 @@ future<executor::request_return_type> executor::update_item(client_state& client
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.update_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
co_return co_await container().invoke_on(*shard, _ssg,
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
(service::client_state& client_state) mutable {
//FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
// to another shard - once it is solved, this place can use a similar solution. Instead of passing
// empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
// so that it's destructed only after all background operations are finished as well.
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.update_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
});
});
@@ -4162,7 +4267,7 @@ future<executor::request_return_type> executor::update_item(client_state& client
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
per_table_stats->api_operations.update_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
per_table_stats->wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
per_table_stats->api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -4236,7 +4341,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
//TODO(sarna): It would be better to fetch only some attributes of the map, not all
auto regular_columns =
schema->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
schema->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto selection = cql3::selection::selection::wildcard(schema);
@@ -4359,6 +4464,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
}
};
std::vector<table_requests> requests;
std::vector<std::vector<uint64_t>> responses_sizes;
uint batch_size = 0;
for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
table_requests rs(get_table_from_batch_request(_proxy, it));
@@ -4386,10 +4492,11 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
// If we got here, all "requests" are valid, so let's start the
// requests for the different partitions all in parallel.
std::vector<future<std::vector<rjson::value>>> response_futures;
std::vector<uint64_t> consumed_rcu_half_units_per_table(requests.size());
for (size_t i = 0; i < requests.size(); i++) {
const table_requests& rs = requests[i];
bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM;
responses_sizes.resize(requests.size());
size_t responses_sizes_pos = 0;
for (const auto& rs : requests) {
responses_sizes[responses_sizes_pos].resize(rs.requests.size());
size_t pos = 0;
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size());
for (const auto &r : rs.requests) {
@@ -4405,24 +4512,23 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
}
}
auto regular_columns =
rs.schema->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
rs.schema->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto selection = cql3::selection::selection::wildcard(rs.schema);
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
query::tombstone_limit(_proxy.get_tombstone_limit()));
command->allow_limit = db::allow_per_partition_rate_limit::yes;
const auto item_callback = [is_quorum, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) {
rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum);
};
future<std::vector<rjson::value>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
[schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable {
[schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, &response_size = responses_sizes[responses_sizes_pos][pos]] (service::storage_proxy::coordinator_query_result qr) mutable {
utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback));
return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), response_size);
});
pos++;
response_futures.push_back(std::move(f));
}
responses_sizes_pos++;
}
// Wait for all requests to complete, and then return the response.
@@ -4434,11 +4540,14 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
rjson::value response = rjson::empty_object();
rjson::add(response, "Responses", rjson::empty_object());
rjson::add(response, "UnprocessedKeys", rjson::empty_object());
size_t rcu_half_units;
auto fut_it = response_futures.begin();
responses_sizes_pos = 0;
rjson::value consumed_capacity = rjson::empty_array();
for (size_t i = 0; i < requests.size(); i++) {
const table_requests& rs = requests[i];
for (const auto& rs : requests) {
std::string table = table_name(*rs.schema);
size_t pos = 0;
rcu_half_units = 0;
for (const auto &r : rs.requests) {
auto& pk = r.first;
auto& cks = r.second;
@@ -4453,6 +4562,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
for (rjson::value& json : results) {
rjson::push_back(response["Responses"][table], std::move(json));
}
rcu_half_units += rcu_consumed_capacity_counter::get_half_units(responses_sizes[responses_sizes_pos][pos], rs.cl == db::consistency_level::LOCAL_QUORUM);
} catch(...) {
eptr = std::current_exception();
// This read of potentially several rows in one partition,
@@ -4476,8 +4586,8 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second));
}
}
pos++;
}
uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i];
_stats.rcu_half_units_total += rcu_half_units;
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
per_table_stats->rcu_half_units_total += rcu_half_units;
@@ -4487,6 +4597,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
rjson::add(entry, "CapacityUnits", rcu_half_units*0.5);
rjson::push_back(consumed_capacity, std::move(entry));
}
responses_sizes_pos++;
}
if (should_add_rcu) {
@@ -4871,10 +4982,10 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
co_await verify_permission(enforce_authorization, client_state, table_schema, auth::permission::SELECT);
auto regular_columns =
table_schema->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
table_schema->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto static_columns =
table_schema->static_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
table_schema->static_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto selection = cql3::selection::selection::wildcard(table_schema);
query::partition_slice::option_set opts = selection->get_query_options();
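Several hunks above thread per-table RCU "half-unit" counters through `batch_get_item` and `describe_multi_item`, and the response reports `CapacityUnits` as `rcu_half_units*0.5`. The arithmetic behind `rcu_consumed_capacity_counter::get_half_units(size, is_quorum)` is not shown in the diff; the following is a plausible sketch assuming DynamoDB-style accounting (one read unit per 4 KB for a strongly consistent read, half for an eventually consistent one, hence integral half-units):

```cpp
#include <cstdint>

// Assumed semantics, modeled on DynamoDB's accounting: one RCU covers a
// strongly-consistent read of up to 4 KB; an eventually-consistent read
// costs half. Counting in half-units keeps the arithmetic integral.
constexpr uint64_t rcu_block = 4096;

uint64_t get_half_units_sketch(uint64_t item_bytes, bool is_quorum) {
    uint64_t blocks = (item_bytes + rcu_block - 1) / rcu_block; // ceil(bytes/4KB)
    if (blocks == 0) {
        blocks = 1; // assume even an empty read consumes a minimum unit
    }
    return is_quorum ? blocks * 2 : blocks;
}
```

Under this model a 100-byte item read at `LOCAL_QUORUM` costs 2 half-units (reported as 1.0 `CapacityUnits`), and the same read at a weaker consistency costs 1 half-unit (0.5).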


@@ -212,6 +212,7 @@ public:
private:
static thread_local utils::updateable_value<uint32_t> s_default_timeout_in_ms;
public:
static schema_ptr find_table(service::storage_proxy&, std::string_view table_name);
static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);
private:
@@ -229,15 +230,12 @@ public:
const std::optional<attrs_to_get>&,
uint64_t* = nullptr);
// Converts a multi-row selection result to JSON compatible with DynamoDB.
// For each row, this method calls item_callback, which takes the size of
// the item as the parameter.
static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
noncopyable_function<void(uint64_t)> item_callback = {});
uint64_t& rcu_half_units);
static void describe_single_item(const cql3::selection::selection&,
const std::vector<managed_bytes_opt>&,
@@ -246,7 +244,7 @@ public:
uint64_t* item_length_in_bytes = nullptr,
bool = false);
static void add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
};


@@ -91,18 +91,6 @@ options {
throw expressions_syntax_error(format("{} at char {}", err,
ex->get_charPositionInLine()));
}
// ANTLR3 tries to recover missing tokens - it tries to finish parsing
// and create valid objects, as if the missing token was there.
// But it has a bug and leaks these tokens.
// We override offending method and handle abandoned pointers.
std::vector<std::unique_ptr<TokenType>> _missing_tokens;
TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
_missing_tokens.emplace_back(token);
return token;
}
}
@lexer::context {
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {


@@ -10,11 +10,12 @@
#include "seastarx.hh"
#include "service/paxos/cas_request.hh"
#include "service/cas_shard.hh"
#include "utils/rjson.hh"
#include "consumed_capacity.hh"
#include "executor.hh"
#include "tracing/trace_state.hh"
#include "keys.hh"
#include "keys/keys.hh"
namespace alternator {
@@ -114,6 +115,7 @@ public:
const rjson::value& request() const { return _request; }
rjson::value&& move_request() && { return std::move(_request); }
future<executor::request_return_type> execute(service::storage_proxy& proxy,
std::optional<service::cas_shard> cas_shard,
service::client_state& client_state,
tracing::trace_state_ptr trace_state,
service_permit permit,
@@ -121,7 +123,7 @@ public:
stats& global_stats,
stats& per_table_stats,
uint64_t& wcu_total);
std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
std::optional<service::cas_shard> shard_for_execute(bool needs_read_before_write);
};
} // namespace alternator


@@ -13,7 +13,7 @@
#include <optional>
#include "types/types.hh"
#include "schema/schema_fwd.hh"
#include "keys.hh"
#include "keys/keys.hh"
#include "utils/rjson.hh"
#include "utils/big_decimal.hh"


@@ -30,6 +30,7 @@
#include "gms/gossiper.hh"
#include "utils/overloaded_functor.hh"
#include "utils/aws_sigv4.hh"
#include "client_data.hh"
static logging::logger slogger("alternator-server");
@@ -430,6 +431,13 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
SCYLLA_ASSERT(req->content_stream);
chunked_content content = co_await util::read_entire_stream(*req->content_stream);
auto username = co_await verify_signature(*req, content);
// As long as the system_clients_entry object is alive, this request will
// be visible in the "system.clients" virtual table. When requested, this
// entry will be formatted by server::ongoing_request::make_client_data().
auto system_clients_entry = _ongoing_requests.emplace(
req->get_client_address(), req->get_header("User-Agent"),
username, current_scheduling_group(),
req->get_protocol_name() == "https");
if (slogger.is_enabled(log_level::trace)) {
std::string buf;
@@ -680,6 +688,37 @@ future<> server::json_parser::stop() {
return std::move(_run_parse_json_thread);
}
// Convert an entry in the server's list of ongoing Alternator requests
// (_ongoing_requests) into a client_data object. This client_data object
// will then be used to produce a row for the "system.clients" virtual table.
client_data server::ongoing_request::make_client_data() const {
client_data cd;
cd.ct = client_type::alternator;
cd.ip = _client_address.addr();
cd.port = _client_address.port();
cd.shard_id = this_shard_id();
cd.connection_stage = client_connection_stage::established;
cd.username = _username;
cd.scheduling_group_name = _scheduling_group.name();
cd.ssl_enabled = _is_https;
// For now, we save the full User-Agent header as the "driver name"
// and keep "driver_version" unset.
cd.driver_name = _user_agent;
// Leave "protocol_version" unset, it has no meaning in Alternator.
// Leave "hostname", "ssl_protocol" and "ssl_cipher_suite" unset.
// As reported in issue #9216, we never set these fields in CQL
// either (see cql_server::connection::make_client_data()).
return cd;
}
future<utils::chunked_vector<client_data>> server::get_client_data() {
utils::chunked_vector<client_data> ret;
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
ret.emplace_back(r.make_client_data());
});
co_return ret;
}
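The new `_ongoing_requests` list pairs `emplace()` with scoped removal: per the comment in the diff, a request stays visible in `system.clients` only "as long as the system_clients_entry object is alive". The real interface of `utils::scoped_item_list` is not shown here, so the handle-based design below is a guess at the contract, not Scylla's implementation:

```cpp
#include <cstddef>
#include <list>
#include <string>
#include <utility>

// Toy sketch: emplace() returns an RAII handle, and the entry is visible
// to iteration only while the handle is alive.
template <typename T>
class scoped_item_list_sketch {
    std::list<T> _items;
public:
    class handle {
        std::list<T>* _owner = nullptr;
        typename std::list<T>::iterator _it;
    public:
        handle(std::list<T>& owner, typename std::list<T>::iterator it)
            : _owner(&owner), _it(it) {}
        handle(handle&& o) noexcept : _owner(o._owner), _it(o._it) { o._owner = nullptr; }
        handle(const handle&) = delete;
        ~handle() { if (_owner) { _owner->erase(_it); } } // entry removed on scope exit
    };
    template <typename... Args>
    handle emplace(Args&&... args) {
        _items.emplace_back(std::forward<Args>(args)...);
        return handle(_items, std::prev(_items.end()));
    }
    size_t size() const { return _items.size(); }
};

// Exercise the lifetime rule: returns {size while handle alive, size after}.
std::pair<size_t, size_t> demo_lifetime() {
    scoped_item_list_sketch<std::string> l;
    size_t during = 0;
    {
        auto h = l.emplace("GetItem from 10.0.0.1");
        during = l.size();
    }
    return {during, l.size()};
}
```

This mirrors how `handle_api_request()` keeps `system_clients_entry` on its stack for the request's duration, while `get_client_data()` iterates whatever entries are currently alive.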
const char* api_error::what() const noexcept {
if (_what_string.empty()) {
_what_string = fmt::format("{} {}: {}", std::to_underlying(_http_code), _type, _msg);


@@ -9,6 +9,7 @@
#pragma once
#include "alternator/executor.hh"
#include "utils/scoped_item_list.hh"
#include <seastar/core/future.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/http/httpd.hh>
@@ -20,6 +21,8 @@
#include "utils/updateable_value.hh"
#include <seastar/core/units.hh>
struct client_data;
namespace alternator {
using chunked_content = rjson::chunked_content;
@@ -74,12 +77,30 @@ class server : public peering_sharded_service<server> {
};
json_parser _json_parser;
// The server maintains a list of ongoing requests that are being handled
// by handle_api_request(). It uses this list in get_client_data(), which
// is called when reading the "system.clients" virtual table.
struct ongoing_request {
socket_address _client_address;
sstring _user_agent;
sstring _username;
scheduling_group _scheduling_group;
bool _is_https;
client_data make_client_data() const;
};
utils::scoped_item_list<ongoing_request> _ongoing_requests;
public:
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
future<> stop();
// get_client_data() is called (on each shard separately) when the virtual
// table "system.clients" is read. It is expected to generate a list of
// clients connected to this server (on this shard). This function is
// called by alternator::controller::get_client_data().
future<utils::chunked_vector<client_data>> get_client_data();
private:
void set_routes(seastar::httpd::routes& r);
// If verification succeeds, returns the authenticated user's username


@@ -1052,7 +1052,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
});
}
void executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
if (!stream_enabled || !stream_enabled->IsBool()) {
throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
@@ -1086,10 +1086,12 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
break;
}
builder.with_cdc_options(opts);
return true;
} else {
cdc::options opts;
opts.enabled(false);
builder.with_cdc_options(opts);
return false;
}
}


@@ -286,7 +286,7 @@ static future<> expire_item(service::storage_proxy& proxy,
auto ck = clustering_key::from_exploded(exploded_ck);
m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
}
std::vector<mutation> mutations;
utils::chunked_vector<mutation> mutations;
mutations.push_back(std::move(m));
return proxy.mutate(std::move(mutations),
db::consistency_level::LOCAL_QUORUM,
@@ -313,7 +313,7 @@ static size_t random_offset(size_t min, size_t max) {
//
// The function is to be used with vnodes only
static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_secondary_ranges(
const locator::effective_replication_map_ptr& erm,
const locator::effective_replication_map* erm,
locator::host_id ep) {
const auto& tm = *erm->get_token_metadata_ptr();
const auto& sorted_tokens = tm.sorted_tokens();
@@ -324,6 +324,7 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
auto prev_tok = sorted_tokens.back();
for (const auto& tok : sorted_tokens) {
co_await coroutine::maybe_yield();
// FIXME: pass is_vnode=true to get_natural_replicas since the token is in tm.sorted_tokens()
host_id_vector_replica_set eps = erm->get_natural_replicas(tok);
if (eps.size() <= 1 || eps[1] != ep) {
prev_tok = tok;
@@ -393,7 +394,7 @@ class ranges_holder_primary {
dht::token_range_vector _token_ranges;
public:
explicit ranges_holder_primary(dht::token_range_vector token_ranges) : _token_ranges(std::move(token_ranges)) {}
static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map_ptr& erm, locator::host_id ep) {
static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep) {
co_return ranges_holder_primary(co_await erm->get_primary_ranges(ep));
}
std::size_t size() const { return _token_ranges.size(); }
@@ -413,7 +414,7 @@ public:
explicit ranges_holder_secondary(std::vector<std::pair<dht::token_range, locator::host_id>> token_ranges, const gms::gossiper& g)
: _token_ranges(std::move(token_ranges))
, _gossiper(g) {}
static future<ranges_holder_secondary> make(const locator::effective_replication_map_ptr& erm, locator::host_id ep, const gms::gossiper& g) {
static future<ranges_holder_secondary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep, const gms::gossiper& g) {
co_return ranges_holder_secondary(co_await get_secondary_ranges(erm, ep), g);
}
std::size_t size() const { return _token_ranges.size(); }
@@ -521,7 +522,7 @@ struct scan_ranges_context {
// should be possible (and a must for issue #7751!).
lw_shared_ptr<service::pager::paging_state> paging_state = nullptr;
auto regular_columns =
s->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
s->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
selection = cql3::selection::selection::wildcard(s);
query::partition_slice::option_set opts = selection->get_query_options();
@@ -769,8 +770,12 @@ static future<bool> scan_table(
}
}
} else { // VNodes
locator::vnode_effective_replication_map_ptr erm =
db.real_database().find_keyspace(s->ks_name()).get_vnode_effective_replication_map();
locator::static_effective_replication_map_ptr ermp =
db.real_database().find_keyspace(s->ks_name()).get_static_effective_replication_map();
auto* erm = ermp->maybe_as_vnode_effective_replication_map();
if (!erm) {
on_internal_error(tlogger, format("Keyspace {} is local", s->ks_name()));
}
auto my_host_id = erm->get_topology().my_host_id();
token_ranges_owned_by_this_shard my_ranges(s, co_await ranges_holder_primary::make(erm, my_host_id));
while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {


@@ -984,7 +984,7 @@
]
},
{
"path":"/storage_service/cleanup_all/",
"path":"/storage_service/cleanup_all",
"operations":[
{
"method":"POST",
@@ -994,30 +994,6 @@
"produces":[
"application/json"
],
"parameters":[
{
"name":"global",
"description":"true if cleanup of entire cluster is requested",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/mark_node_as_clean",
"operations":[
{
"method":"POST",
"summary":"Mark the node as clean. After that the node will not be considered as needing cleanup during automatic cleanup which is triggered by some topology operations",
"type":"void",
"nickname":"reset_cleanup_needed",
"produces":[
"application/json"
],
"parameters":[]
}
]
@@ -3201,6 +3177,38 @@
]
}
]
},
{
"path":"/storage_service/drop_quarantined_sstables",
"operations":[
{
"method":"POST",
"summary":"Drops all quarantined sstables in all keyspaces or specified keyspace and tables",
"type":"void",
"nickname":"drop_quarantined_sstables",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace name to drop quarantined sstables from.",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"tables",
"description":"Comma-separated table names to drop quarantined sstables from.",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
}
],
"models":{
@@ -3337,11 +3345,11 @@
"properties":{
"start_token":{
"type":"string",
"description":"The range start token (exclusive)"
"description":"The range start token"
},
"end_token":{
"type":"string",
"description":"The range end token (inclusive)"
"description":"The range start token"
},
"endpoints":{
"type":"array",


@@ -42,14 +42,6 @@
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"consider_only_existing_data",
"description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}


@@ -28,10 +28,14 @@ template<class Mapper, class I, class Reducer>
future<I> map_reduce_cf_raw(http_context& ctx, const sstring& name, I init,
Mapper mapper, Reducer reducer) {
auto uuid = parse_table_info(name, ctx.db.local()).id;
using mapper_type = std::function<std::unique_ptr<std::any>(replica::database&)>;
using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::database&)>;
using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
return ctx.db.map_reduce0(mapper_type([mapper, uuid](replica::database& db) {
return std::make_unique<std::any>(I(mapper(db.find_column_family(uuid))));
return futurize_invoke([mapper, &db, uuid] {
return mapper(db.find_column_family(uuid));
}).then([] (auto result) {
return std::make_unique<std::any>(I(std::move(result)));
});
}), std::make_unique<std::any>(std::move(init)), reducer_type([reducer = std::move(reducer)] (std::unique_ptr<std::any> a, std::unique_ptr<std::any> b) mutable {
return std::make_unique<std::any>(I(reducer(std::any_cast<I>(std::move(*a)), std::any_cast<I>(std::move(*b)))));
})).then([] (std::unique_ptr<std::any> r) {
@@ -61,13 +65,12 @@ future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, c
struct map_reduce_column_families_locally {
std::any init;
std::function<std::unique_ptr<std::any>(replica::column_family&)> mapper;
std::function<future<std::unique_ptr<std::any>>(replica::column_family&)> mapper;
std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
*res = reducer(std::move(*res), mapper(*table.get()));
return make_ready_future();
return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) -> future<> {
*res = reducer(std::move(*res), co_await mapper(*table.get()));
}).then([res] () {
return std::move(*res);
});
@@ -77,10 +80,14 @@ struct map_reduce_column_families_locally {
template<class Mapper, class I, class Reducer>
future<I> map_reduce_cf_raw(http_context& ctx, I init,
Mapper mapper, Reducer reducer) {
using mapper_type = std::function<std::unique_ptr<std::any>(replica::column_family&)>;
using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::column_family&)>;
using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
auto wrapped_mapper = mapper_type([mapper = std::move(mapper)] (replica::column_family& cf) mutable {
return std::make_unique<std::any>(I(mapper(cf)));
return futurize_invoke([&cf, mapper] {
return mapper(cf);
}).then([] (auto result) {
return std::make_unique<std::any>(I(std::move(result)));
});
});
auto wrapped_reducer = reducer_type([reducer = std::move(reducer)] (std::unique_ptr<std::any> a, std::unique_ptr<std::any> b) mutable {
return std::make_unique<std::any>(I(reducer(std::any_cast<I>(std::move(*a)), std::any_cast<I>(std::move(*b)))));


@@ -72,10 +72,9 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return ctx.db.map_reduce0([](replica::database& db) {
return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) -> future<> {
replica::table& cf = *table.get();
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
return make_ready_future<>();
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = co_await cf.estimate_pending_compactions();
}).then([&tasks] {
return std::move(tasks);
});
@@ -118,7 +117,7 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
auto& cm = db.get_compaction_manager();
return parallel_for_each(tables, [&] (const table_info& ti) {
auto& t = db.find_column_family(ti.id);
return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
return t.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) {
return cm.stop_compaction(type, &ts);
});
});


@@ -23,22 +23,6 @@ using namespace seastar::httpd;
namespace sp = httpd::storage_proxy_json;
namespace ss = httpd::storage_service_json;
template<class T>
json::json_return_type get_json_return_type(const T& val) {
return json::json_return_type(val);
}
/*
* As commented on db::seed_provider_type is not used
* and probably never will.
*
* Just in case, we will return its name
*/
template<>
json::json_return_type get_json_return_type(const db::seed_provider_type& val) {
return json::json_return_type(val.class_name);
}
std::string_view format_type(std::string_view type) {
if (type == "int") {
return "integer";


@@ -12,7 +12,6 @@
#include "api/api-doc/storage_service.json.hh"
#include "api/api-doc/storage_proxy.json.hh"
#include "api/scrub_status.hh"
#include "api/tasks.hh"
#include "db/config.hh"
#include "db/schema_tables.hh"
#include "gms/feature_service.hh"
@@ -21,7 +20,6 @@
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
#include <functional>
@@ -234,7 +232,7 @@ future<scrub_info> parse_scrub_options(const http_context& ctx, sharded<db::snap
scrub_info info;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
info.keyspace = std::move(keyspace);
info.column_families = table_infos | std::views::transform([] (auto ti) { return ti.name; }) | std::ranges::to<std::vector>();
info.column_families = table_infos | std::views::transform(&table_info::name) | std::ranges::to<std::vector>();
auto scrub_mode_str = req->get_query_param("scrub_mode");
auto scrub_mode = sstables::compaction_type_options::scrub::mode::abort;
@@ -361,6 +359,9 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair, s
// if the option is not sane, repair_start() throws immediately, so
// convert the exception to an HTTP error
throw httpd::bad_param_exception(e.what());
} catch (const tablets_unsupported& e) {
throw base_exception("Cannot repair tablet keyspace. Use /storage_service/tablets/repair to repair tablet keyspaces.",
http::reply::status_type::forbidden);
}
});
@@ -650,7 +651,7 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
auto& ks = ctx.db.local().find_keyspace(keyspace);
if (table.empty()) {
ensure_tablets_disabled(ctx, keyspace, "storage_service/range_to_endpoint_map");
return ks.get_vnode_effective_replication_map();
return ks.get_static_effective_replication_map();
} else {
auto table_id = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(table_id);
@@ -758,7 +759,18 @@ rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
static
future<json::json_return_type>
rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto task = co_await force_keyspace_compaction(ctx, std::move(req));
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
co_await task->done();
co_return json_void();
}
@@ -766,24 +778,33 @@ rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request>
static
future<json::json_return_type>
rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
if (task) {
co_await task->done();
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.is_local() || !rs.is_vnode_based()) {
auto reason = rs.is_local() ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return json::json_return_type(0);
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
co_await task->done();
co_return json::json_return_type(0);
}
static
future<json::json_return_type>
rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
bool global = true;
if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
global = validate_bool(global_param);
}
apilog.info("cleanup_all global={}", global);
auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
apilog.info("cleanup_all");
auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
if (!ss.is_topology_coordinator_enabled()) {
co_return false;
}
@@ -793,35 +814,14 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
if (done) {
co_return json::json_return_type(0);
}
// fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
// fall back to the local global cleanup if topology coordinator is not enabled
auto& db = ctx.db;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
co_await task->done();
// Mark this node as clean
co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
if (ss.is_topology_coordinator_enabled()) {
co_await ss.reset_cleanup_needed();
}
});
co_return json::json_return_type(0);
}
static
future<json::json_return_type>
rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("reset_cleanup_needed");
co_await ss.invoke_on(0, [] (service::storage_service& ss) {
if (!ss.is_topology_coordinator_enabled()) {
throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
}
return ss.reset_cleanup_needed();
});
co_return json_void();
}
static
future<json::json_return_type>
rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
@@ -837,8 +837,14 @@ rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<
static
future<json::json_return_type>
rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
co_await task->done();
co_return json::json_return_type(0);
}
@@ -1781,6 +1787,36 @@ rest_get_schema_versions(sharded<service::storage_service>& ss, std::unique_ptr<
});
}
static
future<json::json_return_type>
rest_drop_quarantined_sstables(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto keyspace = req->get_query_param("keyspace");
try {
if (!keyspace.empty()) {
keyspace = validate_keyspace(ctx, keyspace);
auto it = req->query_parameters.find("tables");
auto table_infos = parse_table_infos(keyspace, ctx, it != req->query_parameters.end() ? it->second : "");
co_await ctx.db.invoke_on_all([&table_infos](replica::database& db) -> future<> {
return parallel_for_each(table_infos, [&db](const auto& table) -> future<> {
const auto& [table_name, table_id] = table;
return db.find_column_family(table_id).drop_quarantined_sstables();
});
});
} else {
co_await ctx.db.invoke_on_all([](replica::database& db) -> future<> {
return db.get_tables_metadata().parallel_for_each_table([](table_id, lw_shared_ptr<replica::table> t) -> future<> {
return t->drop_quarantined_sstables();
});
});
}
} catch (...) {
apilog.error("drop_quarantined_sstables: failed with exception: {}", std::current_exception());
throw;
}
co_return json_void();
}
// Disambiguate between a function that returns a future and a function that returns a plain value, also
// add std::ref() as a courtesy. Also handles ks_cf_func signatures.
@@ -1819,7 +1855,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_keyspace_compaction.set(r, rest_bind(rest_force_keyspace_compaction, ctx));
ss::force_keyspace_cleanup.set(r, rest_bind(rest_force_keyspace_cleanup, ctx, ss));
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
ss::perform_keyspace_offstrategy_compaction.set(r, rest_bind(rest_perform_keyspace_offstrategy_compaction, ctx));
ss::upgrade_sstables.set(r, rest_bind(rest_upgrade_sstables, ctx));
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
@@ -1885,6 +1920,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
}
void unset_storage_service(http_context& ctx, routes& r) {
@@ -1904,7 +1940,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::force_keyspace_compaction.unset(r);
ss::force_keyspace_cleanup.unset(r);
ss::cleanup_all.unset(r);
ss::reset_cleanup_needed.unset(r);
ss::perform_keyspace_offstrategy_compaction.unset(r);
ss::upgrade_sstables.unset(r);
ss::force_flush.unset(r);
@@ -1968,6 +2003,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::tablet_balancing_enable.unset(r);
ss::quiesce_topology.unset(r);
sp::get_schema_versions.unset(r);
ss::drop_quarantined_sstables.unset(r);
}
void set_load_meter(http_context& ctx, routes& r, service::load_meter& lm) {


@@ -36,65 +36,37 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
};
}
future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = compaction::flush_mode::skip;
}
return compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
}
future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
return compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
}
future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) {
auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return nullptr;
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
co_return co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
}
void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto task = co_await force_keyspace_compaction(ctx, std::move(req));
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<flush_mode> fmopt;
if (!flush) {
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
co_return json::json_return_type(task->get_status().id.to_sstring());
});
t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
tasks::task_id id = tasks::task_id::create_null_id();
auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
if (task) {
id = task->get_status().id;
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
co_return json::json_return_type(id.to_sstring());
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
co_return json::json_return_type(task->get_status().id.to_sstring());
});
t::perform_keyspace_offstrategy_compaction_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
@@ -106,7 +78,14 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
}));
t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
co_return json::json_return_type(task->get_status().id.to_sstring());
}));


@@ -15,10 +15,6 @@ namespace seastar::httpd {
class routes;
}
namespace seastar::http {
struct request;
}
namespace service {
class storage_service;
}
@@ -29,8 +25,4 @@ struct http_context;
void set_tasks_compaction_module(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl);
void unset_tasks_compaction_module(http_context& ctx, httpd::routes& r);
future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req);
future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req);
future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos);
}


@@ -108,7 +108,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
tm time;
localtime_r(&now, &time);
sstring msg = seastar::format(R"(<{}>{:%h %e %T} scylla-audit: node="{}" category="{}" cl="{}" error="{}" keyspace="{}" query="{}" client_ip="{}" table="{}" username="{}")",
sstring msg = seastar::format(R"(<{}>{:%h %e %T} scylla-audit: node="{}", category="{}", cl="{}", error="{}", keyspace="{}", query="{}", client_ip="{}", table="{}", username="{}")",
LOG_NOTICE | LOG_USER,
time,
node_ip,


@@ -126,7 +126,7 @@ future<> create_legacy_metadata_table_if_missing(
static future<> announce_mutations_with_guard(
::service::raft_group0_client& group0_client,
std::vector<canonical_mutation> muts,
utils::chunked_vector<canonical_mutation> muts,
::service::group0_guard group0_guard,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout) {
@@ -154,7 +154,7 @@ future<> announce_mutations_with_batching(
});
size_t memory_usage = 0;
std::vector<canonical_mutation> muts;
utils::chunked_vector<canonical_mutation> muts;
// guard has to be taken before we execute code in gen as
// it can do read-before-write and we want announce_mutations
@@ -204,7 +204,7 @@ future<> announce_mutations(
internal_distributed_query_state(),
timestamp,
std::move(values));
std::vector<canonical_mutation> cmuts = {muts.begin(), muts.end()};
utils::chunked_vector<canonical_mutation> cmuts = {muts.begin(), muts.end()};
co_await announce_mutations_with_guard(group0_client, std::move(cmuts), std::move(group0_guard), as, timeout);
}


@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
}
future<role_to_directly_granted_map>
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
ldap_role_manager::query_all_directly_granted() {
role_to_directly_granted_map result;
auto roles = co_await query_all(qs);
auto roles = co_await query_all();
for (auto& role: roles) {
auto granted_set = co_await query_granted(role, recursive_role_query::no);
for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
co_return result;
}
future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
return _std_mgr.query_all(qs);
future<role_set> ldap_role_manager::query_all() {
return _std_mgr.query_all();
}
future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
}
future<std::optional<sstring>> ldap_role_manager::get_attribute(
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.get_attribute(role_name, attribute_name, qs);
std::string_view role_name, std::string_view attribute_name) {
return _std_mgr.get_attribute(role_name, attribute_name);
}
future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.query_attribute_for_all(attribute_name, qs);
future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
return _std_mgr.query_attribute_for_all(attribute_name);
}
future<> ldap_role_manager::set_attribute(

View File

@@ -75,9 +75,9 @@ class ldap_role_manager : public role_manager {
future<role_set> query_granted(std::string_view, recursive_role_query) override;
future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
future<role_to_directly_granted_map> query_all_directly_granted() override;
future<role_set> query_all(::service::query_state&) override;
future<role_set> query_all() override;
future<bool> exists(std::string_view) override;
@@ -85,9 +85,9 @@ class ldap_role_manager : public role_manager {
future<bool> can_login(std::string_view) override;
future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;
future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;
future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

View File

@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
return operation_not_supported_exception<role_set>("QUERY GRANTED");
}
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
}
future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
future<role_set> maintenance_socket_role_manager::query_all() {
return operation_not_supported_exception<role_set>("QUERY ALL");
}
@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
return make_ready_future<bool>(true);
}
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
}
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
}

View File

@@ -53,9 +53,9 @@ public:
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
virtual future<role_set> query_all(::service::query_state&) override;
virtual future<role_set> query_all() override;
virtual future<bool> exists(std::string_view role_name) override;
@@ -63,9 +63,9 @@ public:
virtual future<bool> can_login(std::string_view role_name) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

View File

@@ -193,9 +193,7 @@ service_level_resource_view::service_level_resource_view(const resource &r) {
sstring encode_signature(std::string_view name, std::vector<data_type> args) {
return seastar::format("{}[{}]", name,
fmt::join(args | std::views::transform([] (const data_type t) {
return t->name();
}), "^"));
fmt::join(args | std::views::transform(&abstract_type::name), "^"));
}
std::pair<sstring, std::vector<data_type>> decode_signature(std::string_view encoded_signature) {
@@ -221,9 +219,7 @@ std::pair<sstring, std::vector<data_type>> decode_signature(std::string_view enc
static sstring decoded_signature_string(std::string_view encoded_signature) {
auto [function_name, arg_types] = decode_signature(encoded_signature);
return seastar::format("{}({})", cql3::util::maybe_quote(sstring(function_name)),
fmt::join(arg_types | std::views::transform([] (data_type t) {
return t->cql3_type_name();
}), ", "));
fmt::join(arg_types | std::views::transform(&abstract_type::cql3_type_name), ", "));
}
resource make_functions_resource(const cql3::functions::function& f) {

View File

@@ -17,17 +17,12 @@
#include <seastar/core/format.hh>
#include <seastar/core/sstring.hh>
#include "auth/common.hh"
#include "auth/resource.hh"
#include "cql3/description.hh"
#include "seastarx.hh"
#include "exceptions/exceptions.hh"
#include "service/raft/raft_group0_client.hh"
namespace service {
class query_state;
};
namespace auth {
struct role_config final {
@@ -172,9 +167,9 @@ public:
/// (role2, role3)
/// }
///
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<role_set> query_all() = 0;
virtual future<bool> exists(std::string_view role_name) = 0;
@@ -191,12 +186,12 @@ public:
///
/// \returns the value of the named attribute, if one is set.
///
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
///
/// \returns a mapping of each role's value for the named attribute, if one is set for the role.
///
virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
/// Sets `attribute_name` with `attribute_value` for `role_name`.
/// \returns an exceptional future with nonexistant_role if the role does not exist.

View File

@@ -47,6 +47,7 @@
#include "data_dictionary/keyspace_metadata.hh"
#include "service/storage_service.hh"
#include "service_permit.hh"
#include "utils/managed_string.hh"
using namespace std::chrono_literals;
@@ -83,7 +84,6 @@ private:
void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
void on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) override {}
void on_drop_keyspace(const sstring& ks_name) override {
if (!legacy_mode(_qp)) {
@@ -476,12 +476,14 @@ future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_
const bool can_login = co_await _role_manager->can_login(role);
const bool is_superuser = co_await _role_manager->is_superuser(role);
sstring create_statement = produce_create_statement(formatted_role_name, maybe_hashed_password, can_login, is_superuser);
result.push_back(cql3::description {
// Roles do not belong to any keyspace.
.keyspace = std::nullopt,
.type = "role",
.name = role,
.create_statement = produce_create_statement(formatted_role_name, maybe_hashed_password, can_login, is_superuser)
.create_statement = managed_string(create_statement)
});
}
@@ -622,19 +624,21 @@ future<std::vector<cql3::description>> service::describe_permissions() const {
for (const auto& permissions : permission_list) {
for (const auto& permission : permissions.permissions) {
sstring create_statement = describe_resource_kind(permission, permissions.resource, permissions.role_name);
result.push_back(cql3::description {
// Permission grants do not belong to any keyspace.
.keyspace = std::nullopt,
.type = "grant_permission",
.name = permissions.role_name,
.create_statement = describe_resource_kind(permission, permissions.resource, permissions.role_name)
.create_statement = managed_string(create_statement)
});
}
co_await coroutine::maybe_yield();
}
std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) noexcept {
std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) {
return std::make_tuple(std::ref(desc.name), std::ref(*desc.create_statement));
});

View File

@@ -37,6 +37,7 @@
#include "utils/class_registrator.hh"
#include "service/migration_manager.hh"
#include "password_authenticator.hh"
#include "utils/managed_string.hh"
namespace auth {
@@ -652,30 +653,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
});
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
const sstring query = seastar::format("SELECT * FROM {}.{}",
get_auth_ks_name(_qp),
meta::role_members_table::name);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
co_return stop_iteration::no;
});
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
future<role_set> standard_role_manager::query_all() {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
get_auth_ks_name(_qp),
@@ -693,7 +685,7 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,
qs,
internal_distributed_query_state(),
cql3::query_processor::cache_internal::yes);
role_set roles;
@@ -725,11 +717,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
});
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
get_auth_ks_name(_qp),
meta::role_attributes_table::name);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -737,11 +729,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
co_return std::optional<sstring>{};
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
return query_all().then([this, attribute_name] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
@@ -786,25 +778,27 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
std::vector<cql3::description> result{};
const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
const auto grants = co_await query_all_directly_granted();
result.reserve(grants.size());
for (const auto& [grantee_role, granted_role] : grants) {
const auto formatted_grantee = cql3::util::maybe_quote(grantee_role);
const auto formatted_granted = cql3::util::maybe_quote(granted_role);
sstring create_statement = seastar::format("GRANT {} TO {};", formatted_granted, formatted_grantee);
result.push_back(cql3::description {
// Role grants do not belong to any keyspace.
.keyspace = std::nullopt,
.type = "grant_role",
.name = granted_role,
.create_statement = seastar::format("GRANT {} TO {};", formatted_granted, formatted_grantee)
.create_statement = managed_string(create_statement)
});
co_await coroutine::maybe_yield();
}
std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) noexcept {
std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) {
return std::make_tuple(std::ref(desc.name), std::ref(*desc.create_statement));
});

View File
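The `query_all_directly_granted` hunk above stops materializing the whole result set through `execute_internal` and instead drives `query_internal` with a per-row callback that returns `stop_iteration`. A toy, synchronous version of that callback protocol (no futures, illustrative names only):

```cpp
#include <functional>
#include <map>
#include <string>
#include <vector>

enum class stop_iteration { no, yes };

using row = std::map<std::string, std::string>;

// Feed rows to a callback one at a time instead of building the full
// result set first; the callback can end the scan early by returning
// stop_iteration::yes, mirroring query_internal's protocol.
inline void for_each_row(const std::vector<row>& rows,
                         const std::function<stop_iteration(const row&)>& cb) {
    for (const auto& r : rows) {
        if (cb(r) == stop_iteration::yes) {
            break;
        }
    }
}
```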

@@ -66,9 +66,9 @@ public:
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
virtual future<role_set> query_all(::service::query_state&) override;
virtual future<role_set> query_all() override;
virtual future<bool> exists(std::string_view role_name) override;
@@ -76,9 +76,9 @@ public:
virtual future<bool> can_login(std::string_view role_name) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

View File

@@ -139,7 +139,7 @@ private:
// size must not be zero.
[[gnu::always_inline]]
value_type* alloc(size_type size) {
if (__builtin_expect(size <= current_space_left(), true)) {
if (size <= current_space_left()) [[likely]] {
auto ret = _current->data + _current->frag_size;
_current->frag_size += size;
_size += size;
@@ -249,7 +249,7 @@ public:
}
auto this_size = std::min(v.size(), size_t(current_space_left()));
if (__builtin_expect(this_size, true)) {
if (this_size) [[likely]] {
memcpy(_current->data + _current->frag_size, v.begin(), this_size);
_current->frag_size += this_size;
_size += this_size;
@@ -268,6 +268,14 @@ public:
write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
}
// Writes the fragmented view into this buffer, one fragment at a time
template<FragmentedView View>
void write(View v) {
for (bytes_view f : fragment_range(v)) {
write(f);
}
}
bool is_linearized() const {
return !_begin || !_begin->next;
}

View File
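The first two hunks in this file replace GCC's `__builtin_expect(cond, true)` with the portable C++20 `[[likely]]` attribute, which annotates the taken branch rather than wrapping the condition. A sketch of the same fast-path pattern (hypothetical `bump_buffer`, not the real fragmented buffer):

```cpp
#include <cstddef>

// C++20 [[likely]] marks the hot branch for the optimizer, replacing the
// GCC-specific __builtin_expect(cond, true). Sketch of a bump allocator's
// fast path over one fixed-size fragment.
struct bump_buffer {
    char data[256];
    std::size_t used = 0;

    char* alloc(std::size_t size) {
        if (size <= sizeof(data) - used) [[likely]] {  // fast path: space left
            char* p = data + used;
            used += size;
            return p;
        }
        return nullptr;  // slow path: caller must start a new fragment
    }
};
```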

@@ -12,7 +12,7 @@
#include "sstables/key.hh"
#include "utils/class_registrator.hh"
#include "cdc/generation.hh"
#include "keys.hh"
#include "keys/keys.hh"
namespace cdc {

View File

@@ -16,7 +16,7 @@
#include "gms/endpoint_state.hh"
#include "gms/versioned_value.hh"
#include "keys.hh"
#include "keys/keys.hh"
#include "replica/database.hh"
#include "db/system_keyspace.hh"
#include "db/system_distributed_keyspace.hh"

View File

@@ -56,17 +56,8 @@ using namespace std::chrono_literals;
logging::logger cdc_log("cdc");
namespace {
// When dropping a column from a CDC log table, we set the drop timestamp
// `column_drop_leeway` seconds into the future to ensure that for writes concurrent
// with column drop, the write timestamp is before the column drop timestamp.
constexpr auto column_drop_leeway = std::chrono::seconds(5);
} // anonymous namespace
namespace cdc {
static schema_ptr create_log_schema(const schema&, api::timestamp_type, std::optional<table_id> = {}, schema_ptr = nullptr);
static schema_ptr create_log_schema(const schema&, std::optional<table_id> = {}, schema_ptr = nullptr);
}
static constexpr auto cdc_group_name = "cdc";
@@ -167,7 +158,7 @@ public:
});
}
void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
if (schema.cdc_options().enabled()) {
auto& db = _ctxt._proxy.get_db().local();
auto logname = log_name(schema.cf_name());
@@ -176,7 +167,7 @@ public:
ensure_that_table_uses_vnodes(ksm, schema);
// in seastar thread
auto log_schema = create_log_schema(schema, timestamp);
auto log_schema = create_log_schema(schema);
auto log_mut = db::schema_tables::make_create_table_mutations(log_schema, timestamp);
@@ -184,7 +175,7 @@ public:
}
}
void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
void on_before_update_column_family(const schema& new_schema, const schema& old_schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
bool is_cdc = new_schema.cdc_options().enabled();
bool was_cdc = old_schema.cdc_options().enabled();
@@ -214,7 +205,7 @@ public:
ensure_that_table_has_no_counter_columns(new_schema);
ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema);
auto new_log_schema = create_log_schema(new_schema, timestamp, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
auto log_mut = log_schema
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp)
@@ -225,7 +216,7 @@ public:
}
}
void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
void on_before_drop_column_family(const schema& schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
auto logname = log_name(schema.cf_name());
auto& db = _ctxt._proxy.get_db().local();
auto has_cdc_log = db.has_schema(schema.ks_name(), logname);
@@ -240,15 +231,15 @@ public:
}
}
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>> augment_mutation_call(
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>> augment_mutation_call(
lowres_clock::time_point timeout,
std::vector<mutation>&& mutations,
utils::chunked_vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl
);
template<typename Iter>
future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, utils::chunked_vector<mutation>&);
private:
static void check_for_attempt_to_create_nested_cdc_log(replica::database& db, const schema& schema) {
@@ -505,7 +496,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
}
static schema_ptr create_log_schema(const schema& s, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old) {
static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uuid, schema_ptr old) {
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner(cdc::cdc_partitioner::classname);
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -540,28 +531,6 @@ static schema_ptr create_log_schema(const schema& s, api::timestamp_type timesta
b.with_column(log_meta_column_name_bytes("ttl"), long_type);
b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
b.set_caching_options(caching_options::get_disabled_caching_options());
auto validate_new_column = [&] (const sstring& name) {
// When dropping a column from a CDC log table, we set the drop timestamp to be
// `column_drop_leeway` seconds into the future (see `create_log_schema`).
// Therefore, when recreating a column with the same name, we need to validate
// that it's not recreated too soon and that the drop timestamp has passed.
if (old && old->dropped_columns().contains(name)) {
const auto& drop_info = old->dropped_columns().at(name);
auto create_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(timestamp));
auto drop_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(drop_info.timestamp));
if (drop_time > create_time) {
throw exceptions::invalid_request_exception(format("Cannot add column {} because a column with the same name was dropped too recently. Please retry after {} seconds",
name, std::chrono::duration_cast<std::chrono::seconds>(drop_time - create_time).count() + 1));
}
}
};
auto add_column = [&] (sstring name, data_type type) {
validate_new_column(name);
b.with_column(to_bytes(name), type);
};
auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
for (const auto& column : columns) {
auto type = column.type;
@@ -583,9 +552,9 @@ static schema_ptr create_log_schema(const schema& s, api::timestamp_type timesta
}
));
}
add_column(log_data_column_name(column.name_as_text()), type);
b.with_column(log_data_column_name_bytes(column.name()), type);
if (is_data_col) {
add_column(log_data_column_deleted_name(column.name_as_text()), boolean_type);
b.with_column(log_data_column_deleted_name_bytes(column.name()), boolean_type);
}
if (column.type->is_multi_cell()) {
auto dtype = visit(*type, make_visitor(
@@ -601,7 +570,7 @@ static schema_ptr create_log_schema(const schema& s, api::timestamp_type timesta
throw std::invalid_argument("Should not reach");
}
));
add_column(log_data_column_deleted_elements_name(column.name_as_text()), dtype);
b.with_column(log_data_column_deleted_elements_name_bytes(column.name()), dtype);
}
}
};
@@ -623,8 +592,7 @@ static schema_ptr create_log_schema(const schema& s, api::timestamp_type timesta
// not super efficient, but we don't do this often.
for (auto& col : old->all_columns()) {
if (!b.has_column({col.name(), col.name_as_text() })) {
auto drop_ts = api::timestamp_clock::now() + column_drop_leeway;
b.without_column(col.name_as_text(), col.type, drop_ts.time_since_epoch().count());
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
}
}
}
@@ -1508,7 +1476,7 @@ private:
row_states_map _clustering_row_states;
cell_map _static_row_state;
std::vector<mutation> _result_mutations;
utils::chunked_vector<mutation> _result_mutations;
std::optional<log_mutation_builder> _builder;
// When enabled, process_change will update _clustering_row_states and _static_row_state
@@ -1638,8 +1606,8 @@ public:
// Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
// The `transformer` object on which this method was called on should not be used anymore.
std::tuple<std::vector<mutation>, stats::part_type_set> finish() && {
return std::make_pair<std::vector<mutation>, stats::part_type_set>(std::move(_result_mutations), std::move(_touched_parts));
std::tuple<utils::chunked_vector<mutation>, stats::part_type_set> finish() && {
return std::make_pair<utils::chunked_vector<mutation>, stats::part_type_set>(std::move(_result_mutations), std::move(_touched_parts));
}
static db::timeout_clock::time_point default_timeout() {
@@ -1810,8 +1778,8 @@ public:
};
template <typename Func>
future<std::vector<mutation>>
transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
future<utils::chunked_vector<mutation>>
transform_mutations(utils::chunked_vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
return parallel_for_each(
boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
std::forward<Func>(f))
@@ -1820,8 +1788,8 @@ transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_siz
} // namespace cdc
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
// we do all this because in the case of batches, we can have mixed schemas.
auto e = mutations.end();
auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
@@ -1829,14 +1797,14 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
});
if (i == e) {
return make_ready_future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), lw_shared_ptr<cdc::operation_result_tracker>()));
return make_ready_future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), lw_shared_ptr<cdc::operation_result_tracker>()));
}
tracing::trace(tr_state, "CDC: Started generating mutations for log rows");
mutations.reserve(2 * mutations.size());
return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{},
[this, tr_state = std::move(tr_state), write_cl] (std::vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
[this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl] (int idx) mutable {
auto& m = mutations[idx];
auto s = m.schema();
@@ -1896,22 +1864,22 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
tracing::trace(tr_state, "CDC: Generated {} log mutations from {}", generated_count, mutations[idx].decorated_key());
details.touched_parts.add(touched_parts);
});
}).then([this, tr_state, &details](std::vector<mutation> mutations) {
}).then([this, tr_state, &details](utils::chunked_vector<mutation> mutations) {
tracing::trace(tr_state, "CDC: Finished generating all log mutations");
auto tracker = make_lw_shared<cdc::operation_result_tracker>(_ctxt._proxy.get_cdc_stats(), details);
return make_ready_future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), std::move(tracker)));
return make_ready_future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), std::move(tracker)));
});
});
}
bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
bool cdc::cdc_service::needs_cdc_augmentation(const utils::chunked_vector<mutation>& mutations) const {
return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
return m.schema()->cdc_options().enabled();
});
}
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);

View File
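In `augment_mutation_call` above, the batch is reserved at twice its size before CDC log mutations are appended alongside the CDC-enabled data mutations, so in-place growth cannot invalidate the index-based iteration. A simplified, non-Seastar sketch of that shape (names are illustrative, not Scylla's):

```cpp
#include <cstddef>
#include <string>
#include <vector>

struct mutation {
    std::string table;
    bool cdc_enabled;
};

// For every CDC-enabled data mutation, append a derived log mutation to
// the same batch. Reserving 2x up front guarantees push_back never
// reallocates, so indexing into muts stays valid while we append.
inline void augment_with_log_mutations(std::vector<mutation>& muts) {
    const std::size_t original = muts.size();
    muts.reserve(2 * original);
    for (std::size_t idx = 0; idx < original; ++idx) {
        if (muts[idx].cdc_enabled) {
            muts.push_back({muts[idx].table + "_scylla_cdc_log", true});
        }
    }
}
```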

@@ -75,13 +75,13 @@ public:
// appropriate augments to set the log entries.
// Iff post-image is enabled for any of these, a non-empty callback is also
// returned to be invoked post the mutation query.
future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
lowres_clock::time_point timeout,
std::vector<mutation>&& mutations,
utils::chunked_vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl
);
bool needs_cdc_augmentation(const std::vector<mutation>&) const;
bool needs_cdc_augmentation(const utils::chunked_vector<mutation>&) const;
};
struct db_context final {

View File

@@ -20,8 +20,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
set(kmip_arch "aarch64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64")
set(kmip_arch "64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(powerpc|ppc)64le")
set(kmip_arch "ppc64le")
endif()
set(kmip_ROOT "${PROJECT_SOURCE_DIR}/kmipc/kmipc-${kmip_ver}-${kmip_distrib}_${kmip_arch}")

View File

@@ -54,6 +54,62 @@
#include "replica/database.hh"
#include "timestamp.hh"
can_gc_fn always_gc = [] (tombstone, is_shadowable) { return true; };
can_gc_fn never_gc = [] (tombstone, is_shadowable) { return false; };
max_purgeable_fn can_always_purge = [] (const dht::decorated_key&, is_shadowable) -> max_purgeable { return max_purgeable(api::max_timestamp); };
max_purgeable_fn can_never_purge = [] (const dht::decorated_key&, is_shadowable) -> max_purgeable { return max_purgeable(api::min_timestamp); };
max_purgeable& max_purgeable::combine(max_purgeable other) {
if (!other) {
return *this;
}
if (!*this) {
*this = std::move(other);
return *this;
}
if (_timestamp > other._timestamp) {
_source = other._source;
_timestamp = other._timestamp;
}
if (_expiry_threshold && other._expiry_threshold) {
_expiry_threshold = std::min(*_expiry_threshold, *other._expiry_threshold);
} else {
_expiry_threshold = std::nullopt;
}
return *this;
}
max_purgeable::can_purge_result max_purgeable::can_purge(tombstone t) const {
if (!*this) {
return { };
}
return {
.can_purge = (t.deletion_time < _expiry_threshold.value_or(gc_clock::time_point::min()) || t.timestamp < _timestamp),
.timestamp_source = _source,
};
}
auto fmt::formatter<max_purgeable::timestamp_source>::format(max_purgeable::timestamp_source s, fmt::format_context& ctx) const -> decltype(ctx.out()) {
switch (s) {
case max_purgeable::timestamp_source::none:
return format_to(ctx.out(), "none");
case max_purgeable::timestamp_source::memtable_possibly_shadowing_data:
return format_to(ctx.out(), "memtable_possibly_shadowing_data");
case max_purgeable::timestamp_source::other_sstables_possibly_shadowing_data:
return format_to(ctx.out(), "other_sstables_possibly_shadowing_data");
}
}
auto fmt::formatter<max_purgeable>::format(max_purgeable mp, fmt::format_context& ctx) const -> decltype(ctx.out()) {
const sstring expiry_str = mp.expiry_threshold() ? fmt::format("{}", mp.expiry_threshold()->time_since_epoch().count()) : "nullopt";
return format_to(ctx.out(), "max_purgeable{{timestamp={}, expiry_threshold={}, source={}}}", mp.timestamp(), expiry_str, mp.source());
}
namespace sstables {
bool is_eligible_for_compaction(const shared_sstable& sst) noexcept {
@@ -135,18 +191,22 @@ std::string_view to_string(compaction_type_options::scrub::quarantine_mode quara
return "(invalid)";
}
static max_purgeable get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
const api::timestamp_type compacting_max_timestamp, const bool gc_check_only_compacting_sstables, const is_shadowable is_shadowable) {
if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
return { .timestamp = api::min_timestamp };
clogger.trace("get_max_purgeable_timestamp {}.{}: tombstone_gc_enabled=false, returning min_timestamp",
table_s.schema()->ks_name(), table_s.schema()->cf_name());
return max_purgeable(api::min_timestamp);
}
auto timestamp = api::max_timestamp;
if (gc_check_only_compacting_sstables) {
// If gc_check_only_compacting_sstables is enabled, do not
// check memtables and other sstables not being compacted.
return { .timestamp = timestamp };
clogger.trace("get_max_purgeable_timestamp {}.{}: gc_check_only_compacting_sstables=true, returning max_timestamp",
table_s.schema()->ks_name(), table_s.schema()->cf_name());
return max_purgeable(timestamp);
}
auto source = max_purgeable::timestamp_source::none;
@@ -167,7 +227,8 @@ static max_purgeable get_max_purgeable_timestamp(const table_state& table_s, sst
// See https://github.com/scylladb/scylladb/issues/20423
memtable_min_timestamp = table_s.min_memtable_live_timestamp();
}
clogger.trace("memtable_min_timestamp={} compacting_max_timestamp={} memtable_has_key={} is_shadowable={} min_memtable_live_timestamp={} min_memtable_live_row_marker_timestamp={}",
clogger.trace("get_max_purgeable_timestamp {}.{}: memtable_min_timestamp={} compacting_max_timestamp={} memtable_has_key={} is_shadowable={} min_memtable_live_timestamp={} min_memtable_live_row_marker_timestamp={}",
table_s.schema()->ks_name(), table_s.schema()->cf_name(),
memtable_min_timestamp, compacting_max_timestamp, table_s.memtable_has_key(dk), is_shadowable, table_s.min_memtable_live_timestamp(), table_s.min_memtable_live_row_marker_timestamp());
// Use memtable timestamp if it contains live data older than the sstables being compacted,
// and if the memtable also contains the key we're calculating max purgeable timestamp for.
@@ -222,10 +283,10 @@ static max_purgeable get_max_purgeable_timestamp(const table_state& table_s, sst
source = max_purgeable::timestamp_source::other_sstables_possibly_shadowing_data;
}
}
return { .timestamp = timestamp, .source = source };
return max_purgeable(timestamp, source);
}
static std::vector<shared_sstable> get_uncompacting_sstables(const table_state& table_s, std::vector<shared_sstable> sstables) {
static std::vector<shared_sstable> get_uncompacting_sstables(const compaction_group_view& table_s, std::vector<shared_sstable> sstables) {
auto sstable_set = table_s.sstable_set_for_tombstone_gc();
auto all_sstables = *sstable_set->all() | std::ranges::to<std::vector>();
auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
@@ -248,13 +309,13 @@ class compaction;
class compaction_write_monitor final : public sstables::write_monitor, public backlog_write_progress_manager {
sstables::shared_sstable _sst;
table_state& _table_s;
compaction_group_view& _table_s;
const sstables::writer_offset_tracker* _tracker = nullptr;
uint64_t _progress_seen = 0;
api::timestamp_type _maximum_timestamp;
unsigned _sstable_level;
public:
compaction_write_monitor(sstables::shared_sstable sst, table_state& table_s, api::timestamp_type max_timestamp, unsigned sstable_level)
compaction_write_monitor(sstables::shared_sstable sst, compaction_group_view& table_s, api::timestamp_type max_timestamp, unsigned sstable_level)
: _sst(sst)
, _table_s(table_s)
, _maximum_timestamp(max_timestamp)
@@ -390,7 +451,7 @@ using use_backlog_tracker = bool_class<class use_backlog_tracker_tag>;
struct compaction_read_monitor_generator final : public read_monitor_generator {
class compaction_read_monitor final : public sstables::read_monitor, public backlog_read_progress_manager {
sstables::shared_sstable _sst;
table_state& _table_s;
compaction_group_view& _table_s;
const sstables::reader_position_tracker* _tracker = nullptr;
uint64_t _last_position_seen = 0;
use_backlog_tracker _use_backlog_tracker;
@@ -423,7 +484,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
_sst = {};
}
compaction_read_monitor(sstables::shared_sstable sst, table_state& table_s, use_backlog_tracker use_backlog_tracker)
compaction_read_monitor(sstables::shared_sstable sst, compaction_group_view& table_s, use_backlog_tracker use_backlog_tracker)
: _sst(std::move(sst)), _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) { }
~compaction_read_monitor() {
@@ -442,7 +503,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
return p.first->second;
}
explicit compaction_read_monitor_generator(table_state& table_s, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
explicit compaction_read_monitor_generator(compaction_group_view& table_s, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
: _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) {}
uint64_t compacted() const {
@@ -458,7 +519,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
}
}
private:
table_state& _table_s;
compaction_group_view& _table_s;
std::unordered_map<generation_type, compaction_read_monitor> _generated_monitors;
use_backlog_tracker _use_backlog_tracker;
@@ -486,7 +547,7 @@ uint64_t compaction_progress_monitor::get_progress() const {
class compaction {
protected:
compaction_data& _cdata;
table_state& _table_s;
compaction_group_view& _table_s;
const compaction_sstable_creator_fn _sstable_creator;
const schema_ptr _schema;
const reader_permit _permit;
@@ -536,6 +597,7 @@ protected:
utils::observable<> _stop_request_observable;
// optional tombstone_gc_state that is used when gc has to check only the compacting sstables to collect tombstones.
std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
int64_t _output_repaired_at = 0;
private:
// Keeps track of monitors for input sstable.
// If _update_backlog_tracker is set to true, monitors are responsible for adjusting backlog as compaction progresses.
@@ -565,7 +627,7 @@ private:
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
}
protected:
compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
: _cdata(init_compaction_data(cdata, descriptor))
, _table_s(table_s)
, _sstable_creator(std::move(descriptor.creator))
@@ -618,6 +680,7 @@ protected:
}
void finish_new_sstable(compaction_writer* writer) {
writer->writer.set_repaired_at(_output_repaired_at);
writer->writer.consume_end_of_stream();
writer->sst->open_data().get();
_end_size += writer->sst->bytes_on_disk();
@@ -795,9 +858,13 @@ private:
double sum_of_estimated_droppable_tombstone_ratio = 0;
_input_sstable_generations.reserve(_sstables.size());
_input_sstables_basic_info.reserve(_sstables.size());
int64_t repaired_at = 0;
std::vector<int64_t> repaired_at_for_compacted_sstables;
for (auto& sst : _sstables) {
co_await coroutine::maybe_yield();
auto& sst_stats = sst->get_stats_metadata();
repaired_at_for_compacted_sstables.push_back(sst_stats.repaired_at);
repaired_at = std::max(sst_stats.repaired_at, repaired_at);
timestamp_tracker.update(sst_stats.min_timestamp);
timestamp_tracker.update(sst_stats.max_timestamp);
@@ -830,7 +897,11 @@ private:
_rp = std::max(_rp, sst_stats.position);
}
}
log_info("{} [{}]", report_start_desc(), fmt::join(_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
log_debug("{} [{}]", report_start_desc(), fmt::join(_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
if (repaired_at) {
_output_repaired_at = repaired_at;
}
log_debug("repaired_at_vec={} output_repaired_at={}", repaired_at_for_compacted_sstables, _output_repaired_at);
if (ssts->size() < _sstables.size()) {
log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
_sstables.size() - ssts->size(), _sstables.size());
@@ -951,7 +1022,7 @@ protected:
// - add support to merge summary (message: Partition merge counts were {%s}.).
// - there is no easy way, currently, to know the exact number of total partitions.
// For the time being, use the estimated key count.
log_info("{} {} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
log_debug("{} {} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
report_finish_desc(), _input_sstable_generations.size(),
fmt::join(ret.new_sstables | std::views::transform([] (auto sst) { return to_string(sst, false); }), ","),
utils::pretty_printed_data_size(_start_size), utils::pretty_printed_data_size(_end_size), int(ratio * 100),
@@ -1177,7 +1248,7 @@ void compacted_fragments_writer::consume_end_of_stream() {
class regular_compaction : public compaction {
seastar::semaphore _replacer_lock = {1};
public:
regular_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
regular_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
: compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker)
{
}
@@ -1319,7 +1390,7 @@ private:
return bool(_replacer);
}
public:
reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
reshape_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
: regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no) {
}
@@ -1387,7 +1458,7 @@ public:
class cleanup_compaction final : public regular_compaction {
public:
cleanup_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
cleanup_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
: regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor)
{
}
@@ -1404,7 +1475,7 @@ public:
class split_compaction final : public regular_compaction {
compaction_type_options::split _options;
public:
split_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::split options,
split_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::split options,
compaction_progress_monitor& progress_monitor)
: regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor)
, _options(std::move(options))
@@ -1663,7 +1734,7 @@ private:
uint64_t _validation_errors = 0;
public:
scrub_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
scrub_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
: regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
, _options(options)
, _scrub_start_description(fmt::format("Scrubbing in {} mode", _options.operation_mode))
@@ -1760,7 +1831,7 @@ private:
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
}
public:
resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
resharding_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
: compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
, _estimation_per_shard(smp::count)
, _run_identifiers(smp::count)
@@ -1875,9 +1946,9 @@ compaction_type compaction_type_options::type() const {
return index_to_type[_options.index()];
}
static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
struct {
table_state& table_s;
compaction_group_view& table_s;
sstables::compaction_descriptor&& descriptor;
compaction_data& cdata;
compaction_progress_monitor& progress_monitor;
@@ -1908,7 +1979,7 @@ static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstable
return descriptor.options.visit(visitor_factory);
}
static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, read_monitor_generator& monitor_generator) {
static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, read_monitor_generator& monitor_generator) {
auto schema = table_s.schema();
auto permit = table_s.make_compaction_reader_permit();
@@ -1933,11 +2004,7 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
using scrub = sstables::compaction_type_options::scrub;
if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
for (auto& sst : descriptor.sstables) {
try {
co_await sst->change_state(sstables::sstable_state::quarantine);
} catch (...) {
clogger.error("Moving {} to quarantine failed due to {}, continuing.", sst->get_filename(), std::current_exception());
}
co_await sst->change_state(sstables::sstable_state::quarantine);
}
}
@@ -1950,7 +2017,7 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
};
}
future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(table_s, use_backlog_tracker::no));
auto d = defer([&] { progress_monitor.reset_generator(); });
auto res = co_await scrub_sstables_validate_mode(descriptor, cdata, table_s, *progress_monitor._generator);
@@ -1958,7 +2025,7 @@ future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_desc
}
future<compaction_result>
compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
if (descriptor.sstables.empty()) {
return make_exception_future<compaction_result>(std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}",
compaction_name(descriptor.options.type()), table_s.schema()->ks_name(), table_s.schema()->cf_name())));
@@ -1972,7 +2039,7 @@ compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cd
}
std::unordered_set<sstables::shared_sstable>
get_fully_expired_sstables(const table_state& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point compaction_time) {
get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point compaction_time) {
clogger.debug("Checking droppable sstables in {}.{}", table_s.schema()->ks_name(), table_s.schema()->cf_name());
if (compacting.empty()) {
@@ -1980,6 +2047,8 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
}
std::unordered_set<sstables::shared_sstable> candidates;
// Note: This contains both repaired and unrepaired sstables, which means
// compaction consults both repaired and unrepaired sstables for tombstone gc.
auto uncompacting_sstables = get_uncompacting_sstables(table_s, compacting);
// Get list of uncompacting sstables that overlap the ones being compacted.
std::vector<sstables::shared_sstable> overlapping = leveled_manifest::overlapping(*table_s.schema(), compacting, uncompacting_sstables);
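
The overlap test behind leveled_manifest::overlapping() can be sketched standalone. This is a simplified model, not the real implementation: an sstable is reduced to its first and last token as plain integers, and two sstables overlap iff their token intervals intersect.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative stand-in: an sstable modeled by its token bounds.
struct sst_bounds {
    int64_t first_token;
    int64_t last_token;
};

// Closed intervals [a.first, a.last] and [b.first, b.last] intersect.
bool overlaps(const sst_bounds& a, const sst_bounds& b) {
    return a.first_token <= b.last_token && b.first_token <= a.last_token;
}

// Keep the uncompacting sstables whose token range intersects any of the
// sstables being compacted.
std::vector<sst_bounds> overlapping(const std::vector<sst_bounds>& compacting,
                                    const std::vector<sst_bounds>& uncompacting) {
    std::vector<sst_bounds> out;
    for (const auto& u : uncompacting) {
        for (const auto& c : compacting) {
            if (overlaps(u, c)) {
                out.push_back(u);
                break;  // one intersection is enough
            }
        }
    }
    return out;
}
```

Only these overlapping sstables (plus memtables) need to be consulted when deciding whether a tombstone in the compacting set may cover data elsewhere.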

View File

@@ -16,7 +16,7 @@
#include "mutation/mutation_tombstone_stats.hh"
#include "gc_clock.hh"
#include "utils/UUID.hh"
#include "table_state.hh"
#include "compaction_group_view.hh"
#include <seastar/core/abort_source.hh>
#include "sstables/basic_info.hh"
@@ -123,7 +123,7 @@ public:
uint64_t get_progress() const;
friend class compaction;
friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, table_state&, compaction_progress_monitor&);
friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, compaction_group_view&, compaction_progress_monitor&);
};
// Compact a list of N sstables into M sstables.
@@ -131,7 +131,7 @@ public:
//
// compaction_descriptor is responsible for specifying the type of compaction, and influencing
// compaction behavior through its available member fields.
future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor);
future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor);
// Return list of expired sstables for column family cf.
// A sstable is fully expired *iff* its max_local_deletion_time precedes gc_before and its
@@ -139,7 +139,7 @@ future<compaction_result> compact_sstables(sstables::compaction_descriptor descr
// In simpler words, an sstable is fully expired if all of its live cells with TTL are expired
// and its tombstones don't cover cells in other sstables.
std::unordered_set<sstables::shared_sstable>
get_fully_expired_sstables(const table_state& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);
get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);
// For tests, can drop after we virtualize sstables.
mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);

View File

@@ -15,7 +15,7 @@
namespace compaction {
class table_state;
class compaction_group_view;
class strategy_control;
struct compaction_state;

View File

@@ -22,17 +22,88 @@ using can_gc_fn = std::function<bool(tombstone, is_shadowable)>;
extern can_gc_fn always_gc;
extern can_gc_fn never_gc;
struct max_purgeable {
// For the purposes of overlap with live data, a tombstone is purgeable if:
// tombstone.timestamp ∈ (-inf, max_purgeable._timestamp)
//
// The above overlap check can be omitted iff:
// tombstone.deletion_time ∈ (-inf, max_purgeable._expiry_threshold.value_or(gc_clock::time_point::min()))
//
// So in other words, a tombstone is purgeable iff:
// tombstone.deletion_time < max_purgeable._expiry_threshold.value_or(gc_clock::time_point::min()) || tombstone.timestamp < max_purgeable._timestamp
//
// See can_purge() for more details.
class max_purgeable {
public:
enum class timestamp_source {
none,
memtable_possibly_shadowing_data,
other_sstables_possibly_shadowing_data
};
operator bool() const { return timestamp != api::missing_timestamp; }
using expiry_threshold_opt = std::optional<gc_clock::time_point>;
api::timestamp_type timestamp { api::missing_timestamp };
timestamp_source source { timestamp_source::none };
private:
api::timestamp_type _timestamp { api::missing_timestamp };
expiry_threshold_opt _expiry_threshold;
timestamp_source _source { timestamp_source::none };
public:
max_purgeable() = default;
explicit max_purgeable(api::timestamp_type timestamp, timestamp_source source = timestamp_source::none)
: _timestamp(timestamp), _source(source)
{ }
explicit max_purgeable(api::timestamp_type timestamp, expiry_threshold_opt expiry_threshold, timestamp_source source = timestamp_source::none)
: _timestamp(timestamp), _expiry_threshold(expiry_threshold), _source(source)
{ }
operator bool() const { return _timestamp != api::missing_timestamp; }
bool operator==(const max_purgeable&) const = default;
bool operator!=(const max_purgeable&) const = default;
api::timestamp_type timestamp() const noexcept { return _timestamp; }
expiry_threshold_opt expiry_threshold() const noexcept { return _expiry_threshold; }
timestamp_source source() const noexcept { return _source; }
max_purgeable& combine(max_purgeable other);
struct can_purge_result {
bool can_purge { true };
timestamp_source timestamp_source { timestamp_source::none };
// can purge?
operator bool() const noexcept {
return can_purge;
}
bool operator!() const noexcept {
return !can_purge;
}
};
// Determines whether the tombstone can be purged.
//
// If available, the expiry threshold is used to possibly elide the overlap
// check against the min live timestamp. The overlap check elision is
// possible if the tombstone's deletion time is earlier than the expiry
// threshold, or in other words: the tombstone was already expired when the data
// source(s) represented by this max_purgeable were created. Consequently,
// all writes in these data sources arrived *after* the tombstone was already
// expired and hence it is not relevant to these writes, even if they
// otherwise overlap with the tombstone's timestamp.
//
// The overlap check elision is an optimization; checking whether a tombstone
// can be purged by just looking at the timestamps is still correct (but
// stricter).
can_purge_result can_purge(tombstone) const;
};
template <>
struct fmt::formatter<max_purgeable::timestamp_source> : fmt::formatter<string_view> {
auto format(max_purgeable::timestamp_source, fmt::format_context& ctx) const -> decltype(ctx.out());
};
template <>
struct fmt::formatter<max_purgeable> : fmt::formatter<string_view> {
auto format(max_purgeable, fmt::format_context& ctx) const -> decltype(ctx.out());
};
using max_purgeable_fn = std::function<max_purgeable(const dht::decorated_key&, is_shadowable)>;

View File

@@ -30,16 +30,16 @@ class compaction_strategy_state;
namespace compaction {
class table_state {
class compaction_group_view {
public:
virtual ~table_state() {}
virtual ~compaction_group_view() {}
virtual dht::token_range token_range() const noexcept = 0;
virtual const schema_ptr& schema() const noexcept = 0;
// min threshold as defined by table.
virtual unsigned min_compaction_threshold() const noexcept = 0;
virtual bool compaction_enforce_min_threshold() const noexcept = 0;
virtual const sstables::sstable_set& main_sstable_set() const = 0;
virtual const sstables::sstable_set& maintenance_sstable_set() const = 0;
virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
@@ -61,6 +61,7 @@ public:
virtual const std::string get_group_id() const noexcept = 0;
virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
virtual dht::token_range get_token_range_after_split(const dht::token& t) const noexcept = 0;
virtual int64_t get_sstables_repaired_at() const noexcept = 0;
};
} // namespace compaction
@@ -68,9 +69,9 @@ public:
namespace fmt {
template <>
struct formatter<compaction::table_state> : formatter<string_view> {
struct formatter<compaction::compaction_group_view> : formatter<string_view> {
template <typename FormatContext>
auto format(const compaction::table_state& t, FormatContext& ctx) const {
auto format(const compaction::compaction_group_view& t, FormatContext& ctx) const {
auto s = t.schema();
return fmt::format_to(ctx.out(), "{}.{} compaction_group={}", s->ks_name(), s->cf_name(), t.get_group_id());
}

View File

@@ -197,7 +197,7 @@ unsigned compaction_manager::current_compaction_fan_in_threshold() const {
return std::min(unsigned(32), largest_fan_in);
}
bool compaction_manager::can_register_compaction(table_state& t, int weight, unsigned fan_in) const {
bool compaction_manager::can_register_compaction(compaction_group_view& t, int weight, unsigned fan_in) const {
// Only one weight is allowed if parallel compaction is disabled.
if (!t.get_compaction_strategy().parallel_compaction() && has_table_ongoing_compaction(t)) {
return false;
@@ -233,15 +233,17 @@ void compaction_manager::deregister_weight(int weight) {
reevaluate_postponed_compactions();
}
std::vector<sstables::shared_sstable> in_strategy_sstables(table_state& table_s) {
auto sstables = table_s.main_sstable_set().all();
return *sstables | std::views::filter([] (const sstables::shared_sstable& sst) {
future<std::vector<sstables::shared_sstable>> in_strategy_sstables(compaction_group_view& table_s) {
auto set = co_await table_s.main_sstable_set();
auto sstables = set->all();
co_return *sstables | std::views::filter([] (const sstables::shared_sstable& sst) {
return sstables::is_eligible_for_compaction(sst);
}) | std::ranges::to<std::vector>();
}
std::vector<sstables::shared_sstable> compaction_manager::get_candidates(table_state& t) const {
return get_candidates(t, *t.main_sstable_set().all());
future<std::vector<sstables::shared_sstable>> compaction_manager::get_candidates(compaction_group_view& t) const {
auto main_set = co_await t.main_sstable_set();
co_return get_candidates(t, *main_set->all());
}
bool compaction_manager::eligible_for_compaction(const sstables::shared_sstable& sstable) const {
@@ -256,7 +258,7 @@ bool compaction_manager::eligible_for_compaction(const sstables::frozen_sstable_
template <std::ranges::range Range>
requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
std::vector<std::ranges::range_value_t<Range>> compaction_manager::get_candidates(table_state& t, const Range& sstables) const {
std::vector<std::ranges::range_value_t<Range>> compaction_manager::get_candidates(compaction_group_view& t, const Range& sstables) const {
using range_candidates_t = std::ranges::range_value_t<Range>;
std::vector<range_candidates_t> candidates;
candidates.reserve(sstables.size());
@@ -314,7 +316,7 @@ private:
virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {}
};
compaction::compaction_state& compaction_manager::get_compaction_state(table_state* t) {
compaction::compaction_state& compaction_manager::get_compaction_state(compaction_group_view* t) {
try {
return _compaction_state.at(t);
} catch (std::out_of_range&) {
@@ -323,7 +325,7 @@ compaction::compaction_state& compaction_manager::get_compaction_state(table_sta
}
}
compaction_task_executor::compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, sstables::compaction_type type, sstring desc)
compaction_task_executor::compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, sstables::compaction_type type, sstring desc)
: _cm(mgr)
, _compacting_table(t)
, _compaction_state(_cm.get_compaction_state(t))
@@ -362,7 +364,7 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
co_return std::nullopt;
}
future<> compaction_manager::on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) {
future<> compaction_manager::on_compaction_completion(compaction_group_view& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) {
auto& cs = get_compaction_state(&t);
auto new_sstables = desc.new_sstables | std::ranges::to<std::unordered_set>();
for (const auto& sst : desc.old_sstables) {
@@ -392,7 +394,7 @@ future<sstables::compaction_result> compaction_task_executor::compact_sstables_a
co_return res;
}
future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombstone_gc(table_state& t) {
future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombstone_gc(compaction_group_view& t) {
auto compound_set = t.sstable_set_for_tombstone_gc();
// Compound set will be linearized into a single set, since compaction might add or remove sstables
// to it for incremental compaction to work.
@@ -408,7 +410,7 @@ future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombston
future<sstables::compaction_result> compaction_task_executor::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, compaction_manager::can_purge_tombstones can_purge,
sstables::offstrategy offstrategy) {
table_state& t = *_compacting_table;
compaction_group_view& t = *_compacting_table;
if (can_purge) {
descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
}
@@ -456,7 +458,7 @@ future<sstables::compaction_result> compaction_task_executor::compact_sstables(s
co_return co_await sstables::compact_sstables(std::move(descriptor), cdata, t, _progress_monitor);
}
future<> compaction_task_executor::update_history(table_state& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata) {
future<> compaction_task_executor::update_history(compaction_group_view& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata) {
auto started_at = std::chrono::duration_cast<std::chrono::milliseconds>(res.stats.started_at.time_since_epoch());
auto ended_at = std::chrono::duration_cast<std::chrono::milliseconds>(res.stats.ended_at.time_since_epoch());
@@ -510,7 +512,7 @@ protected:
sstables::shared_sstable consume_sstable();
public:
explicit sstables_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, sstables::compaction_type compaction_type, sstring desc, std::vector<sstables::shared_sstable> sstables, tasks::task_id parent_id, sstring entity = "")
explicit sstables_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, sstables::compaction_type compaction_type, sstring desc, std::vector<sstables::shared_sstable> sstables, tasks::task_id parent_id, sstring entity = "")
: compaction_task_executor(mgr, do_throw_if_stopping, t, compaction_type, std::move(desc))
, sstables_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), std::move(entity), parent_id)
{
@@ -539,7 +541,7 @@ class major_compaction_task_executor : public compaction_task_executor, public m
public:
major_compaction_task_executor(compaction_manager& mgr,
throw_if_stopping do_throw_if_stopping,
table_state* t,
compaction_group_view* t,
tasks::task_id parent_id,
bool consider_only_existing_data)
: compaction_task_executor(mgr, do_throw_if_stopping, t, sstables::compaction_type::Compaction, "Major compaction")
@@ -568,6 +570,8 @@ protected:
switch_state(state::pending);
auto units = co_await acquire_semaphore(_cm._maintenance_ops_sem);
// Write lock is used to synchronize selection of sstables for compaction and their registration.
// Also used to synchronize with regular compaction, so major waits for regular to cease before selecting candidates.
auto lock_holder = co_await _compaction_state.lock.hold_write_lock();
if (!can_proceed()) {
co_return std::nullopt;
@@ -575,9 +579,9 @@ protected:
// candidates are sstables that aren't being operated on by other compaction types.
// those are eligible for major compaction.
table_state* t = _compacting_table;
compaction_group_view* t = _compacting_table;
sstables::compaction_strategy cs = t->get_compaction_strategy();
sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, _cm.get_candidates(*t));
sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, co_await _cm.get_candidates(*t));
descriptor.gc_check_only_compacting_sstables = _consider_only_existing_data;
auto compacting = compacting_sstable_registration(_cm, _cm.get_compaction_state(t), descriptor.sstables);
auto on_replace = compacting.update_on_sstable_replacement();
@@ -628,7 +632,7 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_com
co_return task_executor->get_stats();
}
std::optional<gate::holder> compaction_manager::start_compaction(table_state& t) {
std::optional<gate::holder> compaction_manager::start_compaction(compaction_group_view& t) {
if (_state != state::enabled) {
return std::nullopt;
}
@@ -641,7 +645,7 @@ std::optional<gate::holder> compaction_manager::start_compaction(table_state& t)
return it->second.gate.hold();
}
future<> compaction_manager::perform_major_compaction(table_state& t, tasks::task_info info, bool consider_only_existing_data) {
future<> compaction_manager::perform_major_compaction(compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data) {
auto gh = start_compaction(t);
if (!gh) {
co_return;
@@ -656,7 +660,7 @@ class custom_compaction_task_executor : public compaction_task_executor, public
noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> _job;
public:
custom_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, tasks::task_id parent_id, sstables::compaction_type type, sstring desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job)
custom_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, tasks::task_id parent_id, sstables::compaction_type type, sstring desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job)
: compaction_task_executor(mgr, do_throw_if_stopping, t, type, std::move(desc))
, compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id)
, _job(std::move(job))
@@ -704,7 +708,7 @@ protected:
}
future<> compaction_manager::run_custom_job(table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping) {
future<> compaction_manager::run_custom_job(compaction_group_view& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping) {
auto gh = start_compaction(t);
if (!gh) {
co_return;
@@ -718,7 +722,7 @@ future<> compaction_manager::update_static_shares(float static_shares) {
return _compaction_controller.update_static_shares(static_shares);
}
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, table_state& t)
compaction_reenabler::compaction_reenabler(compaction_manager& cm, compaction_group_view& t)
: _cm(cm)
, _table(&t)
, _compaction_state(cm.get_compaction_state(_table))
@@ -729,14 +733,14 @@ compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manage
t, _compaction_state.compaction_disabled_counter);
}
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
: _cm(o._cm)
, _table(std::exchange(o._table, nullptr))
, _compaction_state(o._compaction_state)
, _holder(std::move(o._holder))
{}
compaction_manager::compaction_reenabler::~compaction_reenabler() {
compaction_reenabler::~compaction_reenabler() {
// Submit a compaction request if we're the last holder and the gate is still open.
if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
cmlog.debug("Reenabling compaction for {}", *_table);
@@ -749,16 +753,81 @@ compaction_manager::compaction_reenabler::~compaction_reenabler() {
}
}
future<compaction_manager::compaction_reenabler>
compaction_manager::stop_and_disable_compaction(table_state& t) {
future<> compaction_manager::await_ongoing_compactions(compaction_group_view* t) {
auto name = t ? t->schema()->ks_name() + "." + t->schema()->cf_name() : "ALL";
try {
auto tasks = _tasks
| std::views::filter([t] (const auto& task) {
return (!t || task.compacting_table() == t);
})
| std::views::transform([] (auto& task) { return task.shared_from_this(); })
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
auto sz = tasks.size();
cmlog.debug("Awaiting ongoing unrepaired compactions table={} tasks={}", name, sz);
bool task_stopped = false;
co_await await_tasks(std::move(tasks), task_stopped);
cmlog.debug("Awaiting ongoing unrepaired compactions table={} tasks={} done", name, sz);
} catch (...) {
cmlog.error("Awaiting ongoing unrepaired compactions table={} failed: {}", name, std::current_exception());
throw;
}
}
future<seastar::rwlock::holder>
compaction_manager::get_incremental_repair_read_lock(compaction::compaction_group_view& t, const sstring& reason) {
if (!reason.empty()) {
cmlog.debug("get_incremental_repair_read_lock for {} started", reason);
}
compaction::compaction_state& cs = get_compaction_state(&t);
auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
if (!reason.empty()) {
cmlog.debug("get_incremental_repair_read_lock for {} done", reason);
}
co_return ret;
}
future<seastar::rwlock::holder>
compaction_manager::get_incremental_repair_write_lock(compaction::compaction_group_view& t, const sstring& reason) {
if (!reason.empty()) {
cmlog.debug("get_incremental_repair_write_lock for {} started", reason);
}
compaction::compaction_state& cs = get_compaction_state(&t);
auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
if (!reason.empty()) {
cmlog.debug("get_incremental_repair_write_lock for {} done", reason);
}
co_return ret;
}
future<compaction_reenabler>
compaction_manager::await_and_disable_compaction(compaction_group_view& t) {
compaction_reenabler cre(*this, t);
co_await stop_ongoing_compactions("user-triggered operation", &t);
co_await await_ongoing_compactions(&t);
co_return cre;
}
compaction_reenabler
compaction_manager::stop_and_disable_compaction_no_wait(compaction_group_view& t, sstring reason) {
compaction_reenabler cre(*this, t);
try {
do_stop_ongoing_compactions(std::move(reason), &t, {});
} catch (...) {
cmlog.error("Stopping ongoing compactions failed: {}. Ignored", std::current_exception());
}
return cre;
}
future<compaction_reenabler>
compaction_manager::stop_and_disable_compaction(sstring reason, compaction_group_view& t) {
compaction_reenabler cre(*this, t);
co_await stop_ongoing_compactions(std::move(reason), &t);
co_return cre;
}
future<>
compaction_manager::run_with_compaction_disabled(table_state& t, std::function<future<> ()> func) {
compaction_reenabler cre = co_await stop_and_disable_compaction(t);
compaction_manager::run_with_compaction_disabled(compaction_group_view& t, std::function<future<> ()> func, sstring reason) {
compaction_reenabler cre = co_await stop_and_disable_compaction(std::move(reason), t);
co_await func();
}
@@ -920,7 +989,7 @@ class compaction_manager::strategy_control : public compaction::strategy_control
public:
explicit strategy_control(compaction_manager& cm) noexcept : _cm(cm) {}
bool has_ongoing_compaction(table_state& table_s) const noexcept override {
bool has_ongoing_compaction(compaction_group_view& table_s) const noexcept override {
return std::any_of(_cm._tasks.begin(), _cm._tasks.end(), [&s = table_s.schema()] (const compaction_task_executor& task) {
return task.compaction_running()
&& task.compacting_table()->schema()->ks_name() == s->ks_name()
@@ -928,12 +997,14 @@ public:
});
}
std::vector<sstables::shared_sstable> candidates(table_state& t) const override {
return _cm.get_candidates(t, *t.main_sstable_set().all());
future<std::vector<sstables::shared_sstable>> candidates(compaction_group_view& t) const override {
auto main_set = co_await t.main_sstable_set();
co_return _cm.get_candidates(t, *main_set->all());
}
std::vector<sstables::frozen_sstable_run> candidates_as_runs(table_state& t) const override {
return _cm.get_candidates(t, t.main_sstable_set().all_sstable_runs());
future<std::vector<sstables::frozen_sstable_run>> candidates_as_runs(compaction_group_view& t) const override {
auto main_set = co_await t.main_sstable_set();
co_return _cm.get_candidates(t, main_set->all_sstable_runs());
}
};
@@ -962,7 +1033,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
, _update_compaction_static_shares_action([this] { return update_static_shares(static_shares()); })
, _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
, _strategy_control(std::make_unique<strategy_control>(*this))
, _tombstone_gc_state(&_reconcile_history_maps) {
, _tombstone_gc_state(_shared_tombstone_gc_state) {
tm.register_module(_task_manager_module->get_name(), _task_manager_module);
register_metrics();
// Bandwidth throttling is node-wide, updater is needed on single shard
@@ -985,7 +1056,7 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
, _update_compaction_static_shares_action([] { return make_ready_future<>(); })
, _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
, _strategy_control(std::make_unique<strategy_control>(*this))
, _tombstone_gc_state(&_reconcile_history_maps) {
, _tombstone_gc_state(_shared_tombstone_gc_state) {
tm.register_module(_task_manager_module->get_name(), _task_manager_module);
// No metric registration because this constructor is supposed to be used only by the testing
// infrastructure.
@@ -1064,9 +1135,9 @@ future<> compaction_manager::postponed_compactions_reevaluation() {
auto postponed = std::exchange(_postponed, {});
try {
for (auto it = postponed.begin(); it != postponed.end();) {
table_state* t = *it;
compaction_group_view* t = *it;
it = postponed.erase(it);
// skip reevaluation of a table_state that became invalid post its removal
// skip reevaluation of a compaction_group_view that became invalid after its removal
if (!_compaction_state.contains(t)) {
continue;
}
@@ -1084,19 +1155,22 @@ void compaction_manager::reevaluate_postponed_compactions() noexcept {
_postponed_reevaluation.signal();
}
void compaction_manager::postpone_compaction_for_table(table_state* t) {
void compaction_manager::postpone_compaction_for_table(compaction_group_view* t) {
_postponed.insert(t);
}
future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) noexcept {
void compaction_manager::stop_tasks(const std::vector<shared_ptr<compaction_task_executor>>& tasks, sstring reason) noexcept {
// To prevent compaction from being postponed while tasks are being stopped,
// let's stop all tasks before the deferring point below.
for (auto& t : tasks) {
cmlog.debug("Stopping {}", *t);
t->stop_compaction(reason);
}
co_await coroutine::parallel_for_each(tasks, [] (auto& task) -> future<> {
auto unlink_task = deferred_action([task] { task->unlink(); });
}
future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, bool task_stopped) const noexcept {
co_await coroutine::parallel_for_each(tasks, [task_stopped] (auto& task) -> future<> {
auto unlink_task = deferred_action([task, task_stopped] { if (task_stopped) { task->unlink(); } });
try {
co_await task->compaction_done();
} catch (sstables::compaction_stopped_exception&) {
@@ -1104,38 +1178,46 @@ future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_e
// as it happens with reshard and reshape.
} catch (...) {
// just log any other errors as the callers have nothing to do with them.
cmlog.debug("Stopping {}: task returned error: {}", *task, std::current_exception());
cmlog.debug("Awaiting {}: task returned error: {}", *task, std::current_exception());
co_return;
}
cmlog.debug("Stopping {}: done", *task);
cmlog.debug("Awaiting {}: done", *task);
});
}
future<> compaction_manager::stop_ongoing_compactions(sstring reason, table_state* t, std::optional<sstables::compaction_type> type_opt) noexcept {
try {
auto ongoing_compactions = get_compactions(t).size();
auto tasks = _tasks
| std::views::filter([t, type_opt] (const auto& task) {
return (!t || task.compacting_table() == t) && (!type_opt || task.compaction_type() == *type_opt);
})
| std::views::transform([] (auto& task) { return task.shared_from_this(); })
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
if (cmlog.is_enabled(level)) {
std::string scope = "";
if (t) {
scope = fmt::format(" for table {}", *t);
}
if (type_opt) {
scope += fmt::format(" {} type={}", scope.size() ? "and" : "for", *type_opt);
}
cmlog.log(level, "Stopping {} tasks for {} ongoing compactions{} due to {}", tasks.size(), ongoing_compactions, scope, reason);
std::vector<shared_ptr<compaction_task_executor>>
compaction_manager::do_stop_ongoing_compactions(sstring reason, compaction_group_view* t, std::optional<sstables::compaction_type> type_opt) noexcept {
auto ongoing_compactions = get_compactions(t).size();
auto tasks = _tasks
| std::views::filter([t, type_opt] (const auto& task) {
return (!t || task.compacting_table() == t) && (!type_opt || task.compaction_type() == *type_opt);
})
| std::views::transform([] (auto& task) { return task.shared_from_this(); })
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
if (cmlog.is_enabled(level)) {
std::string scope = "";
if (t) {
scope = fmt::format(" for table {}", *t);
}
return stop_tasks(std::move(tasks), std::move(reason));
if (type_opt) {
scope += fmt::format(" {} type={}", scope.size() ? "and" : "for", *type_opt);
}
cmlog.log(level, "Stopping {} tasks for {} ongoing compactions{} due to {}", tasks.size(), ongoing_compactions, scope, reason);
}
stop_tasks(tasks, std::move(reason));
return tasks;
}
future<> compaction_manager::stop_ongoing_compactions(sstring reason, compaction_group_view* t, std::optional<sstables::compaction_type> type_opt) noexcept {
try {
auto tasks = do_stop_ongoing_compactions(std::move(reason), t, type_opt);
bool task_stopped = true;
co_await await_tasks(std::move(tasks), task_stopped);
} catch (...) {
cmlog.error("Stopping ongoing compactions failed: {}. Ignored", std::current_exception());
}
return make_ready_future();
co_return;
}
future<> compaction_manager::drain() {
@@ -1155,6 +1237,9 @@ future<> compaction_manager::drain() {
future<> compaction_manager::stop() {
do_stop();
if (auto cm = std::exchange(_task_manager_module, nullptr)) {
co_await cm->stop();
}
if (_stop_future) {
co_await std::exchange(*_stop_future, make_ready_future());
}
@@ -1165,15 +1250,14 @@ future<> compaction_manager::really_do_stop() noexcept {
// Reset the metrics registry
_metrics.clear();
co_await stop_ongoing_compactions("shutdown");
co_await _task_manager_module->stop();
if (!_tasks.empty()) {
on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
}
co_await coroutine::parallel_for_each(_compaction_state | std::views::values, [] (compaction_state& cs) -> future<> {
if (!cs.gate.is_closed()) {
co_await cs.gate.close();
}
});
if (!_tasks.empty()) {
on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
}
reevaluate_postponed_compactions();
co_await std::move(_waiting_reevalution);
co_await _sys_ks.close();
@@ -1199,7 +1283,7 @@ void compaction_manager::do_stop() noexcept {
}
}
inline bool compaction_manager::can_proceed(table_state* t) const {
inline bool compaction_manager::can_proceed(compaction_group_view* t) const {
if (_state != state::enabled) {
return false;
}
@@ -1262,7 +1346,7 @@ namespace compaction {
class regular_compaction_task_executor : public compaction_task_executor, public regular_compaction_task_impl {
public:
regular_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state& t)
regular_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view& t)
: compaction_task_executor(mgr, do_throw_if_stopping, &t, sstables::compaction_type::Compaction, "Compaction")
, regular_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), mgr._task_manager_module->new_sequence_number(), t.schema()->ks_name(), t.schema()->cf_name(), "", tasks::task_id::create_null_id())
{}
@@ -1283,20 +1367,26 @@ protected:
co_await coroutine::switch_to(_cm.compaction_sg());
for (;;) {
auto uuid = utils::make_random_uuid();
if (!can_proceed()) {
co_return std::nullopt;
}
switch_state(state::pending);
// take read lock for table, so major and regular compaction can't proceed in parallel.
auto lock_holder = co_await _compaction_state.lock.hold_read_lock();
// Write lock is used to synchronize selection of sstables for compaction and their registration.
auto lock_holder = co_await _compaction_state.lock.hold_write_lock();
if (!can_proceed()) {
co_return std::nullopt;
}
table_state& t = *_compacting_table;
compaction_group_view& t = *_compacting_table;
sstables::compaction_strategy cs = t.get_compaction_strategy();
sstables::compaction_descriptor descriptor = cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
sstables::compaction_descriptor descriptor = co_await cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
int weight = calculate_weight(descriptor);
cmlog.debug("Started minor compaction sstables={} sstables_repaired_at={} range={} uuid={} compaction_uuid={}",
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
auto old_sstables = ::format("{}", descriptor.sstables);
if (descriptor.sstables.empty() || !can_proceed() || t.is_auto_compaction_disabled_by_user()) {
cmlog.debug("{}: sstables={} can_proceed={} auto_compaction={}", *this, descriptor.sstables.size(), can_proceed(), t.is_auto_compaction_disabled_by_user());
@@ -1315,6 +1405,10 @@ protected:
cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}",
fmt::ptr(this), descriptor.sstables.size(), weight, t);
// Finished selecting and registering compacting sstables, so write lock can be released.
lock_holder.return_all();
lock_holder = co_await _compaction_state.lock.hold_read_lock();
setup_new_compaction(descriptor.run_identifier);
_compaction_state.last_regular_compaction = gc_clock::now();
std::exception_ptr ex;
@@ -1322,6 +1416,8 @@ protected:
try {
bool should_update_history = this->should_update_history(descriptor.options.type());
sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_repaired_at={} range={} uuid={} compaction_uuid={}",
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
finish_compaction();
if (should_update_history) {
// update_history can take a long time compared to
@@ -1357,7 +1453,7 @@ protected:
}
void compaction_manager::submit(table_state& t) {
void compaction_manager::submit(compaction_group_view& t) {
if (t.is_auto_compaction_disabled_by_user()) {
return;
}
@@ -1372,25 +1468,25 @@ void compaction_manager::submit(table_state& t) {
(void)perform_compaction<regular_compaction_task_executor>(throw_if_stopping::no, tasks::task_info{}, t).then_wrapped([gh = std::move(gh)] (auto f) { f.ignore_ready_future(); });
}
bool compaction_manager::can_perform_regular_compaction(table_state& t) {
bool compaction_manager::can_perform_regular_compaction(compaction_group_view& t) {
return can_proceed(&t) && !t.is_auto_compaction_disabled_by_user();
}
future<> compaction_manager::maybe_wait_for_sstable_count_reduction(table_state& t) {
future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_group_view& t) {
auto schema = t.schema();
if (!can_perform_regular_compaction(t)) {
cmlog.trace("maybe_wait_for_sstable_count_reduction in {}: cannot perform regular compaction", t);
co_return;
}
auto num_runs_for_compaction = [&, this] {
auto num_runs_for_compaction = [&, this] -> future<size_t> {
auto& cs = t.get_compaction_strategy();
auto desc = cs.get_sstables_for_compaction(t, get_strategy_control());
return std::ranges::size(desc.sstables
auto desc = co_await cs.get_sstables_for_compaction(t, get_strategy_control());
co_return std::ranges::size(desc.sstables
| std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
| std::ranges::to<std::unordered_set>());
};
const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
auto count = num_runs_for_compaction();
auto count = co_await num_runs_for_compaction();
if (count <= threshold) {
cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
t, count, threshold);
@@ -1403,9 +1499,11 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(table_state&
auto start = db_clock::now();
auto& cstate = get_compaction_state(&t);
try {
co_await cstate.compaction_done.wait([this, &num_runs_for_compaction, threshold, &t] {
return num_runs_for_compaction() <= threshold || !can_perform_regular_compaction(t);
});
while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
co_await cstate.compaction_done.wait([this, &t] {
return !can_perform_regular_compaction(t);
});
}
} catch (const broken_condition_variable&) {
co_return;
}
@@ -1420,7 +1518,7 @@ namespace compaction {
class offstrategy_compaction_task_executor : public compaction_task_executor, public offstrategy_compaction_task_impl {
bool& _performed;
public:
offstrategy_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, tasks::task_id parent_id, bool& performed)
offstrategy_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, tasks::task_id parent_id, bool& performed)
: compaction_task_executor(mgr, do_throw_if_stopping, t, sstables::compaction_type::Reshape, "Offstrategy compaction")
, offstrategy_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), parent_id ? 0 : mgr._task_manager_module->new_sequence_number(), "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id)
, _performed(performed)
@@ -1453,12 +1551,13 @@ private:
// efficient (avoiding a 100% overhead) as we will incrementally replace input
// SSTables from maintenance set by output ones into main set.
table_state& t = *_compacting_table;
compaction_group_view& t = *_compacting_table;
// Filter out sstables that require view building, to avoid a race between off-strategy
// and view building. Refs: #11882
auto get_reshape_candidates = [&t] () {
return *t.maintenance_sstable_set().all()
auto get_reshape_candidates = [&t] () -> future<std::vector<sstables::shared_sstable>> {
auto maintenance_set = co_await t.maintenance_sstable_set();
co_return *maintenance_set->all()
| std::views::filter([](const sstables::shared_sstable &sst) {
return !sst->requires_view_building();
})
@@ -1466,14 +1565,14 @@ private:
};
auto get_next_job = [&] () -> future<std::optional<sstables::compaction_descriptor>> {
auto candidates = get_reshape_candidates();
auto candidates = co_await get_reshape_candidates();
if (candidates.empty()) {
co_return std::nullopt;
}
// all sstables added to maintenance set share the same underlying storage.
auto& storage = candidates.front()->get_storage();
sstables::reshape_config cfg = co_await sstables::make_reshape_config(storage, sstables::reshape_mode::strict);
auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), cfg);
auto desc = t.get_compaction_strategy().get_reshaping_job(co_await get_reshape_candidates(), t.schema(), cfg);
co_return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
};
@@ -1500,7 +1599,7 @@ private:
// user has aborted off-strategy. So we can only integrate them into the main set, such that
// they become candidates for regular compaction. We cannot hold them forever in maintenance set,
// as that causes read and space amplification issues.
if (auto sstables = get_reshape_candidates(); sstables.size()) {
if (auto sstables = co_await get_reshape_candidates(); sstables.size()) {
auto completion_desc = sstables::compaction_completion_desc{
.old_sstables = sstables, // removes from maintenance set.
.new_sstables = sstables, // adds into main set.
@@ -1512,6 +1611,11 @@ private:
co_await coroutine::return_exception_ptr(std::move(err));
}
}
future<size_t> maintenance_set_size() const {
auto maintenance_set = co_await _compacting_table->maintenance_sstable_set();
co_return maintenance_set->size();
}
protected:
virtual future<compaction_manager::compaction_stats_opt> do_run() override {
co_await coroutine::switch_to(_cm.maintenance_sg());
@@ -1529,8 +1633,8 @@ protected:
std::exception_ptr ex;
try {
table_state& t = *_compacting_table;
auto size = t.maintenance_sstable_set().size();
compaction_group_view& t = *_compacting_table;
auto size = co_await maintenance_set_size();
if (!size) {
cmlog.debug("Skipping off-strategy compaction for {}, No candidates were found", t);
finish_compaction();
@@ -1557,7 +1661,7 @@ protected:
}
future<bool> compaction_manager::perform_offstrategy(table_state& t, tasks::task_info info) {
future<bool> compaction_manager::perform_offstrategy(compaction_group_view& t, tasks::task_info info) {
auto gh = start_compaction(t);
if (!gh) {
co_return false;
@@ -1577,7 +1681,7 @@ class rewrite_sstables_compaction_task_executor : public sstables_task_executor
compaction_manager::can_purge_tombstones _can_purge;
public:
rewrite_sstables_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, tasks::task_id parent_id, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
rewrite_sstables_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, tasks::task_id parent_id, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
std::vector<sstables::shared_sstable> sstables, compacting_sstable_registration compacting,
compaction_manager::can_purge_tombstones can_purge, sstring type_options_desc = "")
: sstables_task_executor(mgr, do_throw_if_stopping, t, options.type(), sstring(sstables::to_string(options.type())), std::move(sstables), parent_id, std::move(type_options_desc))
@@ -1659,7 +1763,7 @@ class split_compaction_task_executor final : public rewrite_sstables_compaction_
public:
split_compaction_task_executor(compaction_manager& mgr,
throw_if_stopping do_throw_if_stopping,
table_state* t,
compaction_group_view* t,
tasks::task_id parent_id,
sstables::compaction_type_options options,
owned_ranges_ptr owned_ranges,
@@ -1726,7 +1830,7 @@ protected:
template<typename TaskType, typename... Args>
requires std::derived_from<TaskType, compaction_task_executor> &&
std::derived_from<TaskType, compaction_task_impl>
future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task_on_all_files(tasks::task_info info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args) {
future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task_on_all_files(sstring reason, tasks::task_info info, compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args) {
auto gh = start_compaction(t);
if (!gh) {
co_return std::nullopt;
@@ -1750,7 +1854,7 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
std::sort(sstables.begin(), sstables.end(), [](sstables::shared_sstable& a, sstables::shared_sstable& b) {
return a->data_size() > b->data_size();
});
-});
+}, std::move(reason));
if (sstables.empty()) {
co_return std::nullopt;
}
@@ -1758,10 +1862,10 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
}
future<compaction_manager::compaction_stats_opt>
-compaction_manager::rewrite_sstables(table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
+compaction_manager::rewrite_sstables(compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
get_candidates_func get_func, tasks::task_info info, can_purge_tombstones can_purge,
sstring options_desc) {
-return perform_task_on_all_files<rewrite_sstables_compaction_task_executor>(info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_func), can_purge, std::move(options_desc));
+return perform_task_on_all_files<rewrite_sstables_compaction_task_executor>("rewrite", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_func), can_purge, std::move(options_desc));
}
namespace compaction {
@@ -1770,7 +1874,7 @@ class validate_sstables_compaction_task_executor : public sstables_task_executor
compaction_manager::quarantine_invalid_sstables _quarantine_sstables;
public:
validate_sstables_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping,
-table_state* t, tasks::task_id parent_id, std::vector<sstables::shared_sstable> sstables,
+compaction_group_view* t, tasks::task_id parent_id, std::vector<sstables::shared_sstable> sstables,
compaction_manager::quarantine_invalid_sstables quarantine_sstables)
: sstables_task_executor(mgr, do_throw_if_stopping, t, sstables::compaction_type::Scrub, "Scrub compaction in validate mode", std::move(sstables), parent_id)
, _quarantine_sstables(quarantine_sstables)
@@ -1825,33 +1929,23 @@ private:
}
-static std::vector<sstables::shared_sstable> get_all_sstables(table_state& t) {
-auto s = *t.main_sstable_set().all() | std::ranges::to<std::vector>();
-auto maintenance_set = t.maintenance_sstable_set().all();
-s.insert(s.end(), maintenance_set->begin(), maintenance_set->end());
-return s;
+static future<std::vector<sstables::shared_sstable>> get_all_sstables(compaction_group_view& t) {
+auto main_set = co_await t.main_sstable_set();
+auto maintenance_set = co_await t.maintenance_sstable_set();
+auto s = *main_set->all() | std::ranges::to<std::vector>();
+auto maintenance_sstables = maintenance_set->all();
+s.insert(s.end(), maintenance_sstables->begin(), maintenance_sstables->end());
+co_return s;
}
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub_validate_mode(table_state& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub_validate_mode(compaction_group_view& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables) {
auto gh = start_compaction(t);
if (!gh) {
co_return compaction_stats_opt{};
}
-// Collect and register all sstables as compacting while compaction is disabled, to avoid a race condition where
-// regular compaction runs in between and picks the same files.
-std::vector<sstables::shared_sstable> all_sstables;
-compacting_sstable_registration compacting(*this, get_compaction_state(&t));
-co_await run_with_compaction_disabled(t, [&all_sstables, &compacting, &t] () -> future<> {
-// All sstables must be included.
-all_sstables = get_all_sstables(t);
-compacting.register_compacting(all_sstables);
-return make_ready_future<>();
-});
-if (all_sstables.empty()) {
-co_return compaction_stats_opt{};
-}
+// All sstables must be included, even the ones being compacted, such that everything in table is validated.
+// No need to split sstables as repaired or unrepaired. No need to take any compaction and repair locks, since this compation does not modify the sstable.
+auto all_sstables = co_await get_all_sstables(t);
co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, std::move(all_sstables), quarantine_sstables);
}
@@ -1863,7 +1957,7 @@ class cleanup_sstables_compaction_task_executor : public compaction_task_executo
compacting_sstable_registration _compacting;
std::vector<sstables::compaction_descriptor> _pending_cleanup_jobs;
public:
-cleanup_sstables_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, table_state* t, tasks::task_id parent_id, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
+cleanup_sstables_compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, compaction_group_view* t, tasks::task_id parent_id, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
std::vector<sstables::shared_sstable> candidates, compacting_sstable_registration compacting)
: compaction_task_executor(mgr, do_throw_if_stopping, t, options.type(), sstring(sstables::to_string(options.type())))
, cleanup_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id)
@@ -1986,7 +2080,7 @@ bool needs_cleanup(const sstables::shared_sstable& sst,
return true;
}
-bool compaction_manager::update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges) {
+bool compaction_manager::update_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges) {
auto& cs = get_compaction_state(&t);
if (sst->is_shared()) {
throw std::runtime_error(format("Shared SSTable {} cannot be marked as requiring cleanup, as it can only be processed by resharding",
@@ -2001,22 +2095,22 @@ bool compaction_manager::update_sstable_cleanup_state(table_state& t, const ssta
}
}
-bool compaction_manager::erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst) {
+bool compaction_manager::erase_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst) {
auto& cs = get_compaction_state(&t);
return cs.sstables_requiring_cleanup.erase(sst);
}
-bool compaction_manager::requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const {
+bool compaction_manager::requires_cleanup(compaction_group_view& t, const sstables::shared_sstable& sst) const {
const auto& cs = get_compaction_state(&t);
return cs.sstables_requiring_cleanup.contains(sst);
}
-const std::unordered_set<sstables::shared_sstable>& compaction_manager::sstables_requiring_cleanup(table_state& t) const {
+const std::unordered_set<sstables::shared_sstable>& compaction_manager::sstables_requiring_cleanup(compaction_group_view& t) const {
const auto& cs = get_compaction_state(&t);
return cs.sstables_requiring_cleanup;
}
-future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, tasks::task_info info) {
+future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction_group_view& t, tasks::task_info info) {
auto gh = start_compaction(t);
if (!gh) {
co_return;
@@ -2061,7 +2155,7 @@ future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_range
}
}
-future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, tasks::task_info info) {
+future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction_group_view& t, tasks::task_info info) {
auto check_for_cleanup = [this, &t] {
return std::ranges::any_of(_tasks, [&t] (auto& task) {
return task.compacting_table() == &t && task.compaction_type() == sstables::compaction_type::Cleanup;
@@ -2073,16 +2167,16 @@ future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_r
auto& cs = get_compaction_state(&t);
co_await run_with_compaction_disabled(t, [&] () -> future<> {
-auto update_sstables_cleanup_state = [&] (const sstables::sstable_set& set) -> future<> {
-// Hold on to the sstable set since it may be overwritten
-// while we yield in this loop.
-auto set_holder = set.shared_from_this();
-co_await set.for_each_sstable_gently([&] (const sstables::shared_sstable& sst) {
+auto update_sstables_cleanup_state = [&] (lw_shared_ptr<const sstables::sstable_set> set) -> future<> {
+co_await set->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) {
update_sstable_cleanup_state(t, sst, *sorted_owned_ranges);
});
};
-co_await update_sstables_cleanup_state(t.main_sstable_set());
-co_await update_sstables_cleanup_state(t.maintenance_sstable_set());
+// No need to treat repaired and unrepaired sstables separtely here,
+// since it only inserts or deletes sstables into or from
+// sstables_requiring_cleanup.
+co_await update_sstables_cleanup_state(co_await t.main_sstable_set());
+co_await update_sstables_cleanup_state(co_await t.maintenance_sstable_set());
// Some sstables may remain in sstables_requiring_cleanup
// for later processing if they can't be cleaned up right now.
@@ -2090,14 +2184,15 @@ future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_r
if (!cs.sstables_requiring_cleanup.empty()) {
cs.owned_ranges_ptr = std::move(sorted_owned_ranges);
}
-});
+}, "cleanup");
if (cs.sstables_requiring_cleanup.empty()) {
cmlog.debug("perform_cleanup for {} found no sstables requiring cleanup", t);
co_return;
}
-auto found_maintenance_sstables = bool(t.maintenance_sstable_set().for_each_sstable_until([this, &t] (const sstables::shared_sstable& sst) {
+auto maintenance_set = co_await t.maintenance_sstable_set();
+auto found_maintenance_sstables = bool(maintenance_set->for_each_sstable_until([this, &t] (const sstables::shared_sstable& sst) {
return stop_iteration(requires_cleanup(t, sst));
}));
if (found_maintenance_sstables) {
@@ -2113,18 +2208,18 @@ future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_r
co_return get_candidates(t, cs.sstables_requiring_cleanup);
};
-co_await perform_task_on_all_files<cleanup_sstables_compaction_task_executor>(info, t, sstables::compaction_type_options::make_cleanup(), std::move(sorted_owned_ranges),
+co_await perform_task_on_all_files<cleanup_sstables_compaction_task_executor>("cleanup", info, t, sstables::compaction_type_options::make_cleanup(), std::move(sorted_owned_ranges),
std::move(get_sstables));
}
// Submit a table to be upgraded and wait for its termination.
-future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, table_state& t, bool exclude_current_version, tasks::task_info info) {
-auto get_sstables = [this, &t, exclude_current_version] {
+future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction_group_view& t, bool exclude_current_version, tasks::task_info info) {
+auto get_sstables = [this, &t, exclude_current_version] () -> future<std::vector<sstables::shared_sstable>> {
std::vector<sstables::shared_sstable> tables;
auto last_version = t.get_sstables_manager().get_highest_supported_format();
-for (auto& sst : get_candidates(t)) {
+for (auto& sst : co_await get_candidates(t)) {
// if we are a "normal" upgrade, we only care about
// tables with older versions, but potentially
// we are to actually rewrite everything. (-a)
@@ -2133,7 +2228,7 @@ future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_own
}
}
-return make_ready_future<std::vector<sstables::shared_sstable>>(tables);
+co_return std::move(tables);
};
// doing a "cleanup" is about as compacting as we need
@@ -2142,21 +2237,21 @@ future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_own
// Note that we potentially could be doing multiple
// upgrades here in parallel, but that is really the users
// problem.
-return rewrite_sstables(t, sstables::compaction_type_options::make_upgrade(), std::move(sorted_owned_ranges), std::move(get_sstables), info).discard_result();
+co_await rewrite_sstables(t, sstables::compaction_type_options::make_upgrade(), std::move(sorted_owned_ranges), std::move(get_sstables), info).discard_result();
}
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_split_compaction(table_state& t, sstables::compaction_type_options::split opt, tasks::task_info info) {
-auto get_sstables = [this, &t] {
-return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(t));
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_split_compaction(compaction_group_view& t, sstables::compaction_type_options::split opt, tasks::task_info info) {
+auto get_sstables = [this, &t] () -> future<std::vector<sstables::shared_sstable>> {
+return get_candidates(t);
};
owned_ranges_ptr owned_ranges_ptr = {};
auto options = sstables::compaction_type_options::make_split(std::move(opt.classifier));
-return perform_task_on_all_files<split_compaction_task_executor>(info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables));
+return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables));
}
future<std::vector<sstables::shared_sstable>>
-compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt) {
+compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, sstables::compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
co_return std::vector<sstables::shared_sstable>{sst};
}
@@ -2180,15 +2275,15 @@ compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, table_stat
}
// Submit a table to be scrubbed and wait for its termination.
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(table_state& t, sstables::compaction_type_options::scrub opts, tasks::task_info info) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(compaction_group_view& t, sstables::compaction_type_options::scrub opts, tasks::task_info info) {
auto scrub_mode = opts.operation_mode;
if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
-return perform_sstable_scrub_validate_mode(t, info, opts.quarantine_sstables);
+co_return co_await perform_sstable_scrub_validate_mode(t, info, opts.quarantine_sstables);
}
owned_ranges_ptr owned_ranges_ptr = {};
sstring option_desc = fmt::format("mode: {};\nquarantine_mode: {}\n", opts.operation_mode, opts.quarantine_operation_mode);
-return rewrite_sstables(t, sstables::compaction_type_options::make_scrub(scrub_mode), std::move(owned_ranges_ptr), [&t, opts] {
-auto all_sstables = get_all_sstables(t);
+co_return co_await rewrite_sstables(t, sstables::compaction_type_options::make_scrub(scrub_mode), std::move(owned_ranges_ptr), [&t, opts] -> future<std::vector<sstables::shared_sstable>> {
+auto all_sstables = co_await get_all_sstables(t);
std::vector<sstables::shared_sstable> sstables = all_sstables
| std::views::filter([&opts] (const sstables::shared_sstable& sst) {
if (sst->requires_view_building()) {
@@ -2205,27 +2300,31 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
on_internal_error(cmlog, "bad scrub quarantine mode");
})
| std::ranges::to<std::vector>();
-return make_ready_future<std::vector<sstables::shared_sstable>>(std::move(sstables));
+co_return std::vector<sstables::shared_sstable>(std::move(sstables));
}, info, can_purge_tombstones::no, std::move(option_desc));
}
-compaction::compaction_state::compaction_state(table_state& t)
+compaction::compaction_state::compaction_state(compaction_group_view& t)
: gate(format("compaction_state for table {}.{}", t.schema()->ks_name(), t.schema()->cf_name()))
, backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
{
}
-void compaction_manager::add(table_state& t) {
+void compaction_manager::add(compaction_group_view& t) {
auto [_, inserted] = _compaction_state.try_emplace(&t, t);
if (!inserted) {
on_internal_error(cmlog, format("compaction_state for table {} [{}] already exists", t, fmt::ptr(&t)));
}
}
-future<> compaction_manager::remove(table_state& t, sstring reason) noexcept {
+compaction_reenabler compaction_manager::add_with_compaction_disabled(compaction_group_view& view) {
+add(view);
+return compaction_reenabler(*this, view);
+}
+future<> compaction_manager::remove(compaction_group_view& t, sstring reason) noexcept {
auto& c_state = get_compaction_state(&t);
-auto erase_state = defer([&t, &c_state, this] () noexcept {
-c_state.backlog_tracker->disable();
+auto erase_state = defer([&t, this] () noexcept {
+t.get_backlog_tracker().disable();
_compaction_state.erase(&t);
});
@@ -2260,7 +2359,7 @@ future<> compaction_manager::remove(table_state& t, sstring reason) noexcept {
#endif
}
-const std::vector<sstables::compaction_info> compaction_manager::get_compactions(table_state* t) const {
+const std::vector<sstables::compaction_info> compaction_manager::get_compactions(compaction_group_view* t) const {
auto to_info = [] (const compaction_task_executor& task) {
sstables::compaction_info ret;
ret.compaction_uuid = task.compaction_data().compaction_uuid;
@@ -2276,13 +2375,13 @@ const std::vector<sstables::compaction_info> compaction_manager::get_compactions
}) | std::views::transform(to_info) | std::ranges::to<std::vector>();
}
-bool compaction_manager::has_table_ongoing_compaction(const table_state& t) const {
+bool compaction_manager::has_table_ongoing_compaction(const compaction_group_view& t) const {
return std::any_of(_tasks.begin(), _tasks.end(), [&t] (const compaction_task_executor& task) {
return task.compacting_table() == &t && task.compaction_running();
});
};
-bool compaction_manager::compaction_disabled(table_state& t) const {
+bool compaction_manager::compaction_disabled(compaction_group_view& t) const {
if (auto it = _compaction_state.find(&t); it != _compaction_state.end()) {
return it->second.compaction_disabled();
} else {
@@ -2294,7 +2393,7 @@ bool compaction_manager::compaction_disabled(table_state& t) const {
}
}
-future<> compaction_manager::stop_compaction(sstring type, table_state* table) {
+future<> compaction_manager::stop_compaction(sstring type, compaction_group_view* table) {
sstables::compaction_type target_type;
try {
target_type = sstables::to_compaction_type(type);
@@ -2313,7 +2412,7 @@ future<> compaction_manager::stop_compaction(sstring type, table_state* table) {
return stop_ongoing_compactions("user request", table, target_type);
}
-void compaction_manager::propagate_replacement(table_state& t,
+void compaction_manager::propagate_replacement(compaction_group_view& t,
const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
for (auto& task : _tasks) {
if (task.compacting_table() == &t && task.compaction_running()) {
@@ -2459,13 +2558,6 @@ compaction_backlog_manager::~compaction_backlog_manager() {
}
}
-void compaction_manager::register_backlog_tracker(table_state& t, compaction_backlog_tracker new_backlog_tracker) {
-auto& cs = get_compaction_state(&t);
-cs.backlog_tracker.emplace(std::move(new_backlog_tracker));
-register_backlog_tracker(*cs.backlog_tracker);
-}
-compaction_backlog_tracker& compaction_manager::get_backlog_tracker(table_state& t) {
-auto& cs = get_compaction_state(&t);
-return *cs.backlog_tracker;
+compaction_backlog_tracker& compaction_manager::get_backlog_tracker(compaction_group_view& t) {
+return t.get_backlog_tracker();
}


@@ -16,6 +16,7 @@
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/abort_source.hh>
#include <seastar/core/condition-variable.hh>
+#include <seastar/core/rwlock.hh>
#include "sstables/shared_sstable.hh"
#include "utils/exponential_backoff_retry.hh"
#include "utils/updateable_value.hh"
@@ -33,6 +34,7 @@
#include "sstables/exceptions.hh"
#include "tombstone_gc.hh"
#include "utils/pluggable.hh"
+#include "compaction/compaction_reenabler.hh"
namespace db {
class compaction_history_entry;
@@ -123,12 +125,12 @@ private:
future<> _waiting_reevalution = make_ready_future<>();
condition_variable _postponed_reevaluation;
// tables that wait for compaction but had its submission postponed due to ongoing compaction.
-std::unordered_set<compaction::table_state*> _postponed;
+std::unordered_set<compaction::compaction_group_view*> _postponed;
// tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
// weight is value assigned to a compaction job that is log base N of total size of all input sstables.
std::unordered_set<int> _weight_tracker;
-std::unordered_map<compaction::table_state*, compaction_state> _compaction_state;
+std::unordered_map<compaction::compaction_group_view*, compaction_state> _compaction_state;
// Purpose is to serialize all maintenance (non regular) compaction activity to reduce aggressiveness and space requirement.
// If the operation must be serialized with regular, then the per-table write lock must be taken.
@@ -160,14 +162,17 @@ private:
class strategy_control;
std::unique_ptr<strategy_control> _strategy_control;
per_table_history_maps _reconcile_history_maps;
shared_tombstone_gc_state _shared_tombstone_gc_state;
// TODO: tombstone_gc_state should now have value semantics, but the code
// still uses it with reference semantics (inconsistently though).
// Drop this member, once the code is converted into using value semantics.
tombstone_gc_state _tombstone_gc_state;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
// Return nullopt if compaction cannot be started
-std::optional<gate::holder> start_compaction(table_state& t);
+std::optional<gate::holder> start_compaction(compaction_group_view& t);
template<typename TaskExecutor, typename... Args>
requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
@@ -177,14 +182,15 @@ private:
}
future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);
-future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason) noexcept;
+void stop_tasks(const std::vector<shared_ptr<compaction::compaction_task_executor>>& tasks, sstring reason) noexcept;
+future<> await_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>>, bool task_stopped) const noexcept;
future<> update_throughput(uint32_t value_mbs);
// Return the largest fan-in of currently running compactions
unsigned current_compaction_fan_in_threshold() const;
// Return true if compaction can be initiated
-bool can_register_compaction(compaction::table_state& t, int weight, unsigned fan_in) const;
+bool can_register_compaction(compaction::compaction_group_view& t, int weight, unsigned fan_in) const;
// Register weight for a table. Do that only if can_register_weight()
// returned true.
void register_weight(int weight);
@@ -192,14 +198,14 @@ private:
void deregister_weight(int weight);
// Get candidates for compaction strategy, which are all sstables but the ones being compacted.
-std::vector<sstables::shared_sstable> get_candidates(compaction::table_state& t) const;
+future<std::vector<sstables::shared_sstable>> get_candidates(compaction::compaction_group_view& t) const;
bool eligible_for_compaction(const sstables::shared_sstable& sstable) const;
bool eligible_for_compaction(const sstables::frozen_sstable_run& sstable_run) const;
template <std::ranges::range Range>
requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
-std::vector<std::ranges::range_value_t<Range>> get_candidates(table_state& t, const Range& sstables) const;
+std::vector<std::ranges::range_value_t<Range>> get_candidates(compaction_group_view& t, const Range& sstables) const;
template <std::ranges::range Range>
requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
@@ -211,23 +217,23 @@ private:
// gets the table's compaction state
// throws std::out_of_range exception if not found.
-compaction_state& get_compaction_state(compaction::table_state* t);
-const compaction_state& get_compaction_state(compaction::table_state* t) const {
+compaction_state& get_compaction_state(compaction::compaction_group_view* t);
+const compaction_state& get_compaction_state(compaction::compaction_group_view* t) const {
return const_cast<compaction_manager*>(this)->get_compaction_state(t);
}
// Return true if compaction manager is enabled and
// table still exists and compaction is not disabled for the table.
-inline bool can_proceed(compaction::table_state* t) const;
+inline bool can_proceed(compaction::compaction_group_view* t) const;
future<> postponed_compactions_reevaluation();
void reevaluate_postponed_compactions() noexcept;
// Postpone compaction for a table that couldn't be executed due to ongoing
// similar-sized compaction.
-void postpone_compaction_for_table(compaction::table_state* t);
+void postpone_compaction_for_table(compaction::compaction_group_view* t);
using quarantine_invalid_sstables = sstables::compaction_type_options::scrub::quarantine_invalid_sstables;
-future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables);
+future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::compaction_group_view& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables);
future<> update_static_shares(float shares);
using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -237,9 +243,10 @@ private:
template<typename TaskType, typename... Args>
requires std::derived_from<TaskType, compaction_task_executor> &&
std::derived_from<TaskType, compaction_task_impl>
-future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(tasks::task_info info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
-future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
+future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(sstring reason, tasks::task_info info, compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
+future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");
// Stop all fibers, without waiting. Safe to be called multiple times.
@@ -247,7 +254,7 @@ private:
future<> really_do_stop() noexcept;
// Propagate replacement of sstables to all ongoing compaction of a given table
-void propagate_replacement(compaction::table_state& t, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added);
+void propagate_replacement(compaction::compaction_group_view& t, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added);
// This constructor is supposed to only be used for testing so lets be more explicit
// about invoking it. Ref #10146
@@ -301,27 +308,22 @@ public:
// unless it is moved back to enabled state.
future<> drain();
// Check if compaction manager is running, i.e. it was enabled or drained
bool is_running() const noexcept {
return _state == state::enabled || _state == state::disabled;
}
using compaction_history_consumer = noncopyable_function<future<>(const db::compaction_history_entry&)>;
future<> get_compaction_history(compaction_history_consumer&& f);
// Submit a table to be compacted.
-void submit(compaction::table_state& t);
+void submit(compaction::compaction_group_view& t);
// Can regular compaction be performed in the given table
-bool can_perform_regular_compaction(compaction::table_state& t);
+bool can_perform_regular_compaction(compaction::compaction_group_view& t);
// Maybe wait before adding more sstables
// if there are too many sstables.
-future<> maybe_wait_for_sstable_count_reduction(compaction::table_state& t);
+future<> maybe_wait_for_sstable_count_reduction(compaction::compaction_group_view& t);
// Submit a table to be off-strategy compacted.
// Returns true iff off-strategy compaction was required and performed.
-future<bool> perform_offstrategy(compaction::table_state& t, tasks::task_info info);
+future<bool> perform_offstrategy(compaction::compaction_group_view& t, tasks::task_info info);
// Submit a table to be cleaned up and wait for its termination.
//
@@ -330,34 +332,34 @@ public:
// Cleanup is about discarding keys that are no longer relevant for a
// given sstable, e.g. after node loses part of its token range because
// of a newly added node.
-future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);
+future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, tasks::task_info info);
private:
-future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);
+future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, tasks::task_info info);
// Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
-bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
+bool update_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
-future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
+future<> on_compaction_completion(compaction_group_view& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
public:
// Submit a table to be upgraded and wait for its termination.
-future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, tasks::task_info info);
+future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, bool exclude_current_version, tasks::task_info info);
// Submit a table to be scrubbed and wait for its termination.
-future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);
+future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);
// Submit a table for major compaction.
-future<> perform_major_compaction(compaction::table_state& t, tasks::task_info info, bool consider_only_existing_data = false);
+future<> perform_major_compaction(compaction::compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data = false);
// Splits a compaction group by segregating all its sstable according to the classifier[1].
// [1]: See sstables::compaction_type_options::splitting::classifier.
// Returns when all sstables in the main sstable set are split. The only exception is shutdown
// or user aborted splitting using stop API.
-future<compaction_stats_opt> perform_split_compaction(compaction::table_state& t, sstables::compaction_type_options::split opt, tasks::task_info info);
+future<compaction_stats_opt> perform_split_compaction(compaction::compaction_group_view& t, sstables::compaction_type_options::split opt, tasks::task_info info);
// Splits a single SSTable by segregating all its data according to the classifier.
// If the SSTable doesn't need splitting, the same input SSTable is returned as output.
// If it needs splitting, then the output SSTables are returned and the input SSTable is deleted.
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt);
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, sstables::compaction_type_options::split opt);
// Run a custom job for a given table, defined by a function.
// It completes when the future returned by the job is ready, or returns immediately.
@@ -366,35 +368,19 @@ public:
// parameter type is the compaction type the operation can most closely be
// associated with, use compaction_type::Compaction, if none apply.
// parameter job is a function that will carry the operation
future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);
class compaction_reenabler {
compaction_manager& _cm;
compaction::table_state* _table;
compaction::compaction_state& _compaction_state;
gate::holder _holder;
public:
compaction_reenabler(compaction_manager&, compaction::table_state&);
compaction_reenabler(compaction_reenabler&&) noexcept;
~compaction_reenabler();
compaction::table_state* compacting_table() const noexcept {
return _table;
}
const compaction::compaction_state& compaction_state() const noexcept {
return _compaction_state;
}
};
future<> run_custom_job(compaction::compaction_group_view& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);
// Temporarily disable compaction for a table t.
// The caller re-enables compaction by destroying the returned compaction_reenabler.
future<compaction_reenabler> stop_and_disable_compaction(compaction::table_state& t);
future<compaction_reenabler> stop_and_disable_compaction(sstring reason, compaction::compaction_group_view& t);
future<compaction_reenabler> await_and_disable_compaction(compaction::compaction_group_view& t);
future<seastar::rwlock::holder> get_incremental_repair_read_lock(compaction::compaction_group_view& t, const sstring& reason);
future<seastar::rwlock::holder> get_incremental_repair_write_lock(compaction::compaction_group_view& t, const sstring& reason);
// Run a function with compaction temporarily disabled for a table T.
future<> run_with_compaction_disabled(compaction::table_state& t, std::function<future<> ()> func);
future<> run_with_compaction_disabled(compaction::compaction_group_view& t, std::function<future<> ()> func, sstring reason = "custom operation");
void plug_system_keyspace(db::system_keyspace& sys_ks) noexcept;
future<> unplug_system_keyspace() noexcept;
@@ -402,28 +388,40 @@ public:
// Adds a table to the compaction manager.
// Creates a compaction_state structure that can be used for submitting
// compaction jobs of all types.
void add(compaction::table_state& t);
void add(compaction::compaction_group_view& t);
// Adds a group with compaction temporarily disabled. Compaction is only re-enabled
// when the returned compaction_reenabler is destroyed.
compaction_reenabler add_with_compaction_disabled(compaction::compaction_group_view& view);
// Remove a table from the compaction manager.
// Cancel requests on table and wait for possible ongoing compactions.
future<> remove(compaction::table_state& t, sstring reason = "table removal") noexcept;
future<> remove(compaction::compaction_group_view& t, sstring reason = "table removal") noexcept;
const stats& get_stats() const {
return _stats;
}
const std::vector<sstables::compaction_info> get_compactions(compaction::table_state* t = nullptr) const;
const std::vector<sstables::compaction_info> get_compactions(compaction::compaction_group_view* t = nullptr) const;
// Returns true if the table has an ongoing compaction running on its behalf
bool has_table_ongoing_compaction(const compaction::table_state& t) const;
bool has_table_ongoing_compaction(const compaction::compaction_group_view& t) const;
bool compaction_disabled(compaction::table_state& t) const;
bool compaction_disabled(compaction::compaction_group_view& t) const;
// Stops ongoing compaction of a given type.
future<> stop_compaction(sstring type, compaction::table_state* table = nullptr);
future<> stop_compaction(sstring type, compaction::compaction_group_view* table = nullptr);
private:
std::vector<shared_ptr<compaction_task_executor>>
do_stop_ongoing_compactions(sstring reason, compaction_group_view* t, std::optional<sstables::compaction_type> type_opt) noexcept;
public:
// Stops ongoing compaction of a given table and/or compaction_type.
future<> stop_ongoing_compactions(sstring reason, compaction::table_state* t = nullptr, std::optional<sstables::compaction_type> type_opt = {}) noexcept;
future<> stop_ongoing_compactions(sstring reason, compaction::compaction_group_view* t = nullptr, std::optional<sstables::compaction_type> type_opt = {}) noexcept;
future<> await_ongoing_compactions(compaction_group_view* t);
compaction_reenabler stop_and_disable_compaction_no_wait(compaction_group_view& t, sstring reason);
double backlog() {
return _backlog_manager.backlog();
@@ -432,29 +430,32 @@ public:
void register_backlog_tracker(compaction_backlog_tracker& backlog_tracker) {
_backlog_manager.register_backlog_tracker(backlog_tracker);
}
void register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker);
compaction_backlog_tracker& get_backlog_tracker(compaction::table_state& t);
compaction_backlog_tracker& get_backlog_tracker(compaction::compaction_group_view& t);
static sstables::compaction_data create_compaction_data();
compaction::strategy_control& get_strategy_control() const noexcept;
tombstone_gc_state& get_tombstone_gc_state() noexcept {
return _tombstone_gc_state;
};
const tombstone_gc_state& get_tombstone_gc_state() const noexcept {
return _tombstone_gc_state;
};
shared_tombstone_gc_state& get_shared_tombstone_gc_state() noexcept {
return _shared_tombstone_gc_state;
};
const shared_tombstone_gc_state& get_shared_tombstone_gc_state() const noexcept {
return _shared_tombstone_gc_state;
};
// Unconditionally erase sst from `sstables_requiring_cleanup`.
// Returns true iff sst was found and erased.
bool erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst);
bool erase_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst);
// Checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(table_state& t) const;
bool requires_cleanup(compaction_group_view& t, const sstables::shared_sstable& sst) const;
const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(compaction_group_view& t) const;
friend class compacting_sstable_registration;
friend class compaction_weight_registration;
@@ -470,6 +471,7 @@ public:
friend class compaction::rewrite_sstables_compaction_task_executor;
friend class compaction::cleanup_sstables_compaction_task_executor;
friend class compaction::validate_sstables_compaction_task_executor;
friend compaction_reenabler;
};
namespace compaction {
@@ -494,7 +496,7 @@ public:
};
protected:
compaction_manager& _cm;
::compaction::table_state* _compacting_table = nullptr;
::compaction::compaction_group_view* _compacting_table = nullptr;
compaction::compaction_state& _compaction_state;
sstables::compaction_data _compaction_data;
state _state = state::none;
@@ -510,7 +512,7 @@ private:
compaction_manager::compaction_stats_opt _stats = std::nullopt;
public:
explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::compaction_group_view* t, sstables::compaction_type type, sstring desc);
compaction_task_executor(compaction_task_executor&&) = delete;
compaction_task_executor(const compaction_task_executor&) = delete;
@@ -554,7 +556,7 @@ protected:
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> update_history(::compaction::table_state& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata);
future<> update_history(::compaction::compaction_group_view& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata);
bool should_update_history(sstables::compaction_type ct) {
return ct == sstables::compaction_type::Compaction;
}
@@ -565,7 +567,7 @@ public:
future<compaction_manager::compaction_stats_opt> run_compaction() noexcept;
const ::compaction::table_state* compacting_table() const noexcept {
const ::compaction::compaction_group_view* compacting_table() const noexcept {
return _compacting_table;
}
@@ -601,7 +603,7 @@ private:
return _compaction_done.get_future();
}
future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::table_state& t);
future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::compaction_group_view& t);
public:
bool stopping() const noexcept {
return _compaction_data.abort.abort_requested();
@@ -622,7 +624,8 @@ public:
friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);
friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
friend fmt::formatter<compaction_task_executor>;
friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) noexcept;
friend void compaction_manager::stop_tasks(const std::vector<shared_ptr<compaction_task_executor>>& tasks, sstring reason) noexcept;
friend future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_executor>>, bool task_stopped) const noexcept;
friend sstables::test_env_compaction_manager;
};
@@ -643,4 +646,4 @@ struct fmt::formatter<compaction::compaction_task_executor> {
bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);
// Return all sstables except those that are off-strategy, like the ones in the maintenance set and staging dir.
std::vector<sstables::shared_sstable> in_strategy_sstables(compaction::table_state& table_s);
future<std::vector<sstables::shared_sstable>> in_strategy_sstables(compaction::compaction_group_view& table_s);


@@ -0,0 +1,40 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/gate.hh>
class compaction_manager;
namespace compaction {
class compaction_group_view;
class compaction_state;
}
class compaction_reenabler {
compaction_manager& _cm;
compaction::compaction_group_view* _table;
compaction::compaction_state& _compaction_state;
seastar::gate::holder _holder;
public:
compaction_reenabler(compaction_manager&, compaction::compaction_group_view&);
compaction_reenabler(compaction_reenabler&&) noexcept;
~compaction_reenabler();
compaction::compaction_group_view* compacting_table() const noexcept {
return _table;
}
const compaction::compaction_state& compaction_state() const noexcept {
return _compaction_state;
}
};


@@ -19,28 +19,41 @@
namespace compaction {
// There's a 1:1 relationship between compaction_group_view and compaction_state.
// Two or more compaction_group_views can be served by the same instance of sstables::sstable_set,
// so it's not safe to track any sstable state here.
struct compaction_state {
// Used both by compaction tasks that refer to the compaction_state
// and by any function running under run_with_compaction_disabled().
seastar::named_gate gate;
// Prevents table from running major and minor compaction at the same time.
// Used for synchronizing selection of sstables for compaction.
// The write lock is held when getting the sstable list, feeding it into the strategy, and registering compacting sstables.
// The lock prevents two concurrent compaction tasks from picking the same sstables, and it also helps major
// compaction synchronize with minor, so that major doesn't miss any sstable.
seastar::rwlock lock;
// Compactions like major need to work on all sstables in the unrepaired
// set, no matter if the sstable is being repaired or not. The
// incremental_repair_lock is introduced to serialize repair with such
// compactions. This lock guarantees that no sstables are being repaired.
// Note that the minor compactions do not need to take this lock because
// they ignore sstables that are being repaired.
seastar::rwlock incremental_repair_lock;
// Raised by any function running under run_with_compaction_disabled().
long compaction_disabled_counter = 0;
// Signaled whenever a compaction task completes.
condition_variable compaction_done;
std::optional<compaction_backlog_tracker> backlog_tracker;
// Used only with vnodes, will not work with tablets. Can be removed once vnodes are gone.
std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
compaction::owned_ranges_ptr owned_ranges_ptr;
gc_clock::time_point last_regular_compaction;
explicit compaction_state(table_state& t);
explicit compaction_state(compaction_group_view& t);
compaction_state(compaction_state&&) = delete;
~compaction_state();


@@ -46,7 +46,7 @@ compaction_descriptor compaction_strategy_impl::make_major_compaction_job(std::v
return compaction_descriptor(std::move(candidates), level, max_sstable_bytes);
}
std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
// The default implementation is suboptimal and causes the write amplification problem described in issue #10097.
// A compaction strategy relying on it should strive to implement its own method, to make cleanup bucket-aware.
return candidates | std::views::transform([] (const shared_sstable& sst) {
@@ -55,7 +55,7 @@ std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compact
}) | std::ranges::to<std::vector>();
}
bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t) {
bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const compaction_group_view& t) {
if (_disable_tombstone_compaction) {
return false;
}
@@ -581,12 +581,12 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
//
class null_compaction_strategy : public compaction_strategy_impl {
public:
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override {
return sstables::compaction_descriptor();
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override {
return make_ready_future<sstables::compaction_descriptor>();
}
virtual int64_t estimated_pending_compactions(table_state& table_s) const override {
return 0;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override {
return make_ready_future<int64_t>(0);
}
virtual compaction_strategy_type type() const override {
@@ -700,19 +700,19 @@ compaction_strategy_type compaction_strategy::type() const {
return _compaction_strategy_impl->type();
}
compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
future<compaction_descriptor> compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control);
}
compaction_descriptor compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
compaction_descriptor compaction_strategy::get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
return _compaction_strategy_impl->get_major_compaction_job(table_s, std::move(candidates));
}
std::vector<compaction_descriptor> compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
return _compaction_strategy_impl->get_cleanup_compaction_jobs(table_s, std::move(candidates));
}
void compaction_strategy::notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
void compaction_strategy::notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
_compaction_strategy_impl->notify_completion(table_s, removed, added);
}
@@ -720,7 +720,7 @@ bool compaction_strategy::parallel_compaction() const {
return _compaction_strategy_impl->parallel_compaction();
}
int64_t compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
future<int64_t> compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
return _compaction_strategy_impl->estimated_pending_compactions(table_s);
}
@@ -789,7 +789,7 @@ future<reshape_config> make_reshape_config(const sstables::storage& storage, res
};
}
std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(const table_state& ts) const {
std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(const compaction_group_view& ts) const {
return std::make_unique<partitioned_sstable_set>(ts.schema(), ts.token_range());
}


@@ -12,7 +12,7 @@
#include "sstables/shared_sstable.hh"
#include "exceptions/exceptions.hh"
#include "compaction_strategy_type.hh"
#include "table_state.hh"
#include "compaction_group_view.hh"
#include "strategy_control.hh"
struct mutation_source_metadata;
@@ -41,15 +41,15 @@ public:
compaction_strategy& operator=(compaction_strategy&&);
// Return a list of sstables to be compacted after applying the strategy.
compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control);
future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control);
compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<shared_sstable> candidates);
compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<shared_sstable> candidates);
std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const;
std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const;
// Some strategies may look at the compacted and resulting sstables to
// get some useful information for subsequent compactions.
void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added);
void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added);
// Return if parallel compaction is allowed by strategy.
bool parallel_compaction() const;
@@ -58,7 +58,7 @@ public:
bool use_clustering_key_filter() const;
// An estimate of the number of compactions needed for the strategy to be satisfied.
int64_t estimated_pending_compactions(table_state& table_s) const;
future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const;
static sstring name(compaction_strategy_type type) {
switch (type) {
@@ -105,7 +105,7 @@ public:
return name(type());
}
sstable_set make_sstable_set(const table_state& ts) const;
sstable_set make_sstable_set(const compaction_group_view& ts) const;
compaction_backlog_tracker make_backlog_tracker() const;


@@ -45,18 +45,18 @@ protected:
uint64_t max_sstable_bytes = compaction_descriptor::default_max_sstable_bytes);
public:
virtual ~compaction_strategy_impl() {}
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) = 0;
virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) = 0;
virtual compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
return make_major_compaction_job(std::move(candidates));
}
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const;
virtual void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) { }
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const;
virtual void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) { }
virtual compaction_strategy_type type() const = 0;
virtual bool parallel_compaction() const {
return true;
}
virtual int64_t estimated_pending_compactions(table_state& table_s) const = 0;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const = 0;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const;
bool use_clustering_key_filter() const {
return _use_clustering_key_filter;
@@ -64,7 +64,7 @@ public:
// Check if a given sstable is eligible for tombstone compaction based on its
// droppable tombstone histogram and gc_before.
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t);
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const compaction_group_view& t);
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const = 0;


@@ -244,7 +244,7 @@ incremental_compaction_strategy::most_interesting_bucket(std::vector<std::vector
}
compaction_descriptor
incremental_compaction_strategy::find_garbage_collection_job(const compaction::table_state& t, std::vector<size_bucket_t>& buckets) {
incremental_compaction_strategy::find_garbage_collection_job(const compaction::compaction_group_view& t, std::vector<size_bucket_t>& buckets) {
auto worth_dropping_tombstones = [this, &t, now = db_clock::now()] (const sstable_run& run, gc_clock::time_point compaction_time) {
if (run.all().empty()) {
return false;
@@ -318,9 +318,9 @@ incremental_compaction_strategy::find_garbage_collection_job(const compaction::t
return compaction_descriptor(runs_to_sstables(std::move(input)), 0, _fragment_size);
}
compaction_descriptor
incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, strategy_control& control) {
auto candidates = control.candidates_as_runs(t);
future<compaction_descriptor>
incremental_compaction_strategy::get_sstables_for_compaction(compaction_group_view& t, strategy_control& control) {
auto candidates = co_await control.candidates_as_runs(t);
// make local copies so they can't be changed out from under us mid-method
size_t min_threshold = t.min_compaction_threshold();
@@ -330,28 +330,28 @@ incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, str
if (is_any_bucket_interesting(buckets, min_threshold)) {
std::vector<sstables::frozen_sstable_run> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
co_return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
}
// If we are not enforcing min_threshold explicitly, try any pair of sstable runs in the same tier.
if (!t.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
std::vector<sstables::frozen_sstable_run> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
co_return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
}
// The cross-tier behavior is only triggered once we're done with all the pending same-tier compactions,
// to increase overall efficiency.
if (control.has_ongoing_compaction(t)) {
return sstables::compaction_descriptor();
co_return sstables::compaction_descriptor();
}
auto desc = find_garbage_collection_job(t, buckets);
if (!desc.sstables.empty()) {
return desc;
co_return desc;
}
if (_space_amplification_goal) {
if (buckets.size() < 2) {
return sstables::compaction_descriptor();
co_return sstables::compaction_descriptor();
}
// Let S0 be the size of largest tier
// Let S1 be the size of second-largest tier,
@@ -383,33 +383,34 @@ incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, str
cross_tier_input.reserve(cross_tier_input.size() + s1.size());
std::move(s1.begin(), s1.end(), std::back_inserter(cross_tier_input));
return sstables::compaction_descriptor(runs_to_sstables(std::move(cross_tier_input)),
co_return sstables::compaction_descriptor(runs_to_sstables(std::move(cross_tier_input)),
0, _fragment_size);
}
}
return sstables::compaction_descriptor();
co_return sstables::compaction_descriptor();
}
compaction_descriptor
incremental_compaction_strategy::get_major_compaction_job(table_state& t, std::vector<sstables::shared_sstable> candidates) {
incremental_compaction_strategy::get_major_compaction_job(compaction_group_view& t, std::vector<sstables::shared_sstable> candidates) {
if (candidates.empty()) {
return compaction_descriptor();
}
return make_major_compaction_job(std::move(candidates), 0, _fragment_size);
}
int64_t incremental_compaction_strategy::estimated_pending_compactions(table_state& t) const {
future<int64_t> incremental_compaction_strategy::estimated_pending_compactions(compaction_group_view& t) const {
size_t min_threshold = t.schema()->min_compaction_threshold();
size_t max_threshold = t.schema()->max_compaction_threshold();
int64_t n = 0;
for (auto& bucket : get_buckets(t.main_sstable_set().all_sstable_runs())) {
auto main_set = co_await t.main_sstable_set();
for (auto& bucket : get_buckets(main_set->all_sstable_runs())) {
if (bucket.size() >= min_threshold) {
n += (bucket.size() + max_threshold - 1) / max_threshold;
}
}
return n;
co_return n;
}
std::vector<shared_sstable>
@@ -483,7 +484,7 @@ incremental_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
}
std::vector<compaction_descriptor>
incremental_compaction_strategy::get_cleanup_compaction_jobs(table_state& t, std::vector<shared_sstable> candidates) const {
incremental_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& t, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
const auto& schema = t.schema();
unsigned max_threshold = schema->max_compaction_threshold();


@@ -70,7 +70,7 @@ private:
bool is_any_bucket_interesting(const std::vector<std::vector<sstables::frozen_sstable_run>>& buckets, size_t min_threshold) const;
compaction_descriptor find_garbage_collection_job(const table_state& t, std::vector<size_bucket_t>& buckets);
compaction_descriptor find_garbage_collection_job(const compaction_group_view& t, std::vector<size_bucket_t>& buckets);
static std::vector<shared_sstable> runs_to_sstables(std::vector<frozen_sstable_run> runs);
static std::vector<frozen_sstable_run> sstables_to_runs(std::vector<shared_sstable> sstables);
@@ -82,13 +82,13 @@ public:
static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& t, strategy_control& control) override;
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& t, strategy_control& control) override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& t, std::vector<shared_sstable> candidates) const override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& t, std::vector<shared_sstable> candidates) const override;
virtual compaction_descriptor get_major_compaction_job(table_state& t, std::vector<sstables::shared_sstable> candidates) override;
virtual compaction_descriptor get_major_compaction_job(compaction_group_view& t, std::vector<sstables::shared_sstable> candidates) override;
virtual int64_t estimated_pending_compactions(table_state& t) const override;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& t) const override;
virtual compaction_strategy_type type() const override {
return compaction_strategy_type::incremental;
@@ -98,7 +98,7 @@ public:
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;
friend class ::incremental_backlog_tracker;
};


@@ -13,13 +13,13 @@
namespace sstables {
leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(table_state& table_s) const {
leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(compaction_group_view& table_s) const {
return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state>();
}
compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
auto& state = get_state(table_s);
auto candidates = control.candidates(table_s);
auto candidates = co_await control.candidates(table_s);
// NOTE: leveled_manifest creation may be slightly expensive, so later on,
// we may want to store it in the strategy itself. However, the sstable
// lists managed by the manifest may become outdated. For example, one
@@ -32,12 +32,13 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
auto candidate = manifest.get_compaction_candidates(*state.last_compacted_keys, state.compaction_counter);
if (!candidate.sstables.empty()) {
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), table_s.main_sstable_set().all()->size());
return candidate;
auto main_set = co_await table_s.main_sstable_set();
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
co_return candidate;
}
if (!table_s.tombstone_gc_enabled()) {
return compaction_descriptor();
co_return compaction_descriptor();
}
// if there is no sstable to compact in the standard way, try compacting based on droppable tombstone ratio
@@ -59,12 +60,12 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
auto ratio_j = j->estimate_droppable_tombstone_ratio(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
return ratio_i < ratio_j;
});
return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
co_return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
}
return {};
co_return compaction_descriptor();
}
compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
if (candidates.empty()) {
return compaction_descriptor();
}
@@ -75,7 +76,7 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
ideal_level, max_sstable_size_in_bytes);
}
void leveled_compaction_strategy::notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
void leveled_compaction_strategy::notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
auto& state = get_state(table_s);
// All the update here is only relevant for regular compaction's round-robin picking policy, and if
// last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
@@ -132,14 +133,15 @@ void leveled_compaction_strategy::generate_last_compacted_keys(leveled_compactio
state.last_compacted_keys = std::move(last_compacted_keys);
}
int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
future<int64_t> leveled_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
std::vector<sstables::shared_sstable> sstables;
auto all_sstables = table_s.main_sstable_set().all();
auto main_set = co_await table_s.main_sstable_set();
auto all_sstables = main_set->all();
sstables.reserve(all_sstables->size());
for (auto& entry : *all_sstables) {
sstables.push_back(entry);
}
return leveled_manifest::get_estimated_tasks(leveled_manifest::get_levels(sstables), _max_sstable_size_in_mb * 1024 * 1024);
co_return leveled_manifest::get_estimated_tasks(leveled_manifest::get_levels(sstables), _max_sstable_size_in_mb * 1024 * 1024);
}
compaction_descriptor
@@ -222,7 +224,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
}
std::vector<compaction_descriptor>
leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
leveled_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
auto levels = leveled_manifest::get_levels(candidates);

@@ -43,25 +43,25 @@ class leveled_compaction_strategy : public compaction_strategy_impl {
private:
int32_t calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const;
leveled_compaction_strategy_state& get_state(table_state& table_s) const;
leveled_compaction_strategy_state& get_state(compaction_group_view& table_s) const;
public:
static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
leveled_compaction_strategy(const std::map<sstring, sstring>& options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;
virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) override;
virtual compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) override;
virtual void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) override;
virtual void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) override;
// for each level > 0, get newest sstable and use its last key as last
// compacted key for the previous level.
void generate_last_compacted_keys(leveled_compaction_strategy_state&, leveled_manifest& manifest);
virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;
virtual bool parallel_compaction() const override {
return false;
@@ -70,7 +70,7 @@ public:
virtual compaction_strategy_type type() const override {
return compaction_strategy_type::leveled;
}
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

@@ -15,11 +15,11 @@
#include "utils/assert.hh"
#include "sstables/sstables.hh"
#include "size_tiered_compaction_strategy.hh"
#include "interval.hh"
#include "utils/interval.hh"
#include "utils/log.hh"
class leveled_manifest {
table_state& _table_s;
compaction_group_view& _table_s;
schema_ptr _schema;
std::vector<std::vector<sstables::shared_sstable>> _generations;
uint64_t _max_sstable_size_in_bytes;
@@ -52,7 +52,7 @@ public:
// level to be considered worth compacting.
static constexpr float TARGET_SCORE = 1.001f;
private:
leveled_manifest(table_state& table_s, int max_sstable_size_in_MB, const sstables::size_tiered_compaction_strategy_options& stcs_options)
leveled_manifest(compaction_group_view& table_s, int max_sstable_size_in_MB, const sstables::size_tiered_compaction_strategy_options& stcs_options)
: _table_s(table_s)
, _schema(table_s.schema())
, _max_sstable_size_in_bytes(max_sstable_size_in_MB * 1024 * 1024)
@@ -77,7 +77,7 @@ public:
return levels;
}
static leveled_manifest create(table_state& table_s, std::vector<sstables::shared_sstable>& sstables, int max_sstable_size_in_mb,
static leveled_manifest create(compaction_group_view& table_s, std::vector<sstables::shared_sstable>& sstables, int max_sstable_size_in_mb,
const sstables::size_tiered_compaction_strategy_options& stcs_options) {
leveled_manifest manifest = leveled_manifest(table_s, max_sstable_size_in_mb, stcs_options);

@@ -207,13 +207,13 @@ size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector
return std::move(max);
}
compaction_descriptor
size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
future<compaction_descriptor>
size_tiered_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
// make local copies so they can't be changed out from under us mid-method
int min_threshold = table_s.min_compaction_threshold();
int max_threshold = table_s.schema()->max_compaction_threshold();
auto compaction_time = gc_clock::now();
auto candidates = control.candidates(table_s);
auto candidates = co_await control.candidates(table_s);
// TODO: Add support to filter cold sstables (for reference: SizeTieredCompactionStrategy::filterColdSSTables).
@@ -221,17 +221,17 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
if (is_any_bucket_interesting(buckets, min_threshold)) {
std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
return sstables::compaction_descriptor(std::move(most_interesting));
co_return sstables::compaction_descriptor(std::move(most_interesting));
}
// If we are not enforcing min_threshold explicitly, try any pair of SSTables in the same tier.
if (!table_s.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
return sstables::compaction_descriptor(std::move(most_interesting));
co_return sstables::compaction_descriptor(std::move(most_interesting));
}
if (!table_s.tombstone_gc_enabled()) {
return compaction_descriptor();
co_return compaction_descriptor();
}
// if there is no sstable to compact in the standard way, try compacting a single sstable whose droppable tombstone
@@ -250,9 +250,9 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
auto it = std::min_element(sstables.begin(), sstables.end(), [] (auto& i, auto& j) {
return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
});
return sstables::compaction_descriptor({ *it });
co_return sstables::compaction_descriptor({ *it });
}
return sstables::compaction_descriptor();
co_return sstables::compaction_descriptor();
}
int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
@@ -266,18 +266,19 @@ int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std
return n;
}
int64_t size_tiered_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
future<int64_t> size_tiered_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
int min_threshold = table_s.min_compaction_threshold();
int max_threshold = table_s.schema()->max_compaction_threshold();
std::vector<sstables::shared_sstable> sstables;
auto all_sstables = table_s.main_sstable_set().all();
auto main_set = co_await table_s.main_sstable_set();
auto all_sstables = main_set->all();
sstables.reserve(all_sstables->size());
for (auto& entry : *all_sstables) {
sstables.push_back(entry);
}
return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
co_return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
}
std::vector<sstables::shared_sstable>
@@ -337,7 +338,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
}
std::vector<compaction_descriptor>
size_tiered_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
size_tiered_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
const auto& schema = table_s.schema();
unsigned max_threshold = schema->max_compaction_threshold();

@@ -75,13 +75,13 @@ public:
explicit size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options);
static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;
static int64_t estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options);
virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;
virtual compaction_strategy_type type() const override {
return compaction_strategy_type::size_tiered;

@@ -18,9 +18,9 @@ namespace compaction {
class strategy_control {
public:
virtual ~strategy_control() {}
virtual bool has_ongoing_compaction(table_state& table_s) const noexcept = 0;
virtual std::vector<sstables::shared_sstable> candidates(table_state&) const = 0;
virtual std::vector<sstables::frozen_sstable_run> candidates_as_runs(table_state&) const = 0;
virtual bool has_ongoing_compaction(compaction_group_view& table_s) const noexcept = 0;
virtual future<std::vector<sstables::shared_sstable>> candidates(compaction_group_view&) const = 0;
virtual future<std::vector<sstables::frozen_sstable_run>> candidates_as_runs(compaction_group_view&) const = 0;
};
}

@@ -158,7 +158,7 @@ future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::
// run_resharding_jobs holds a semaphore inside the compaction manager, so we use
// parallel_for_each to keep the pending-job statistics updated to reflect all
// jobs, even though only one job will run at a time.
auto& t = table.try_get_table_state_with_static_sharding();
auto& t = table.try_get_compaction_group_view_with_static_sharding();
co_await coroutine::parallel_for_each(buckets, [&] (std::vector<sstables::shared_sstable>& sstlist) mutable {
return table.get_compaction_manager().run_custom_job(t, sstables::compaction_type::Reshard, "Reshard compaction", [&] (sstables::compaction_data& info, sstables::compaction_progress_monitor& progress_monitor) -> future<> {
auto erm = table.get_effective_replication_map(); // keep alive around compaction.
@@ -453,7 +453,7 @@ future<> global_cleanup_compaction_task_impl::run() {
co_await coroutine::parallel_for_each(keyspaces, [&] (const sstring& ks) -> future<> {
const auto& keyspace = db.find_keyspace(ks);
const auto& replication_strategy = keyspace.get_replication_strategy();
if (replication_strategy.get_type() == locator::replication_strategy_type::local) {
if (replication_strategy.is_local()) {
// this keyspace does not require cleanup
co_return;
}
@@ -495,7 +495,7 @@ future<> table_cleanup_keyspace_compaction_task_impl::run() {
// it is the responsibility of the system operator to not
// perform additional incompatible range movements during cleanup.
auto get_owned_ranges = [&] (std::string_view ks_name) -> future<owned_ranges_ptr> {
const auto& erm = _db.find_keyspace(ks_name).get_vnode_effective_replication_map();
const auto& erm = _db.find_keyspace(ks_name).get_static_effective_replication_map();
co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
};
auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
@@ -575,14 +575,15 @@ future<> table_upgrade_sstables_compaction_task_impl::run() {
if (ks.get_replication_strategy().is_per_table()) {
co_return nullptr;
}
const auto& erm = ks.get_vnode_effective_replication_map();
const auto& erm = ks.get_static_effective_replication_map();
co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
};
auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
tasks::task_info info{_status.id, _status.shard};
co_await run_on_table("upgrade_sstables", _db, _status.keyspace, _ti, [&] (replica::table& t) -> future<> {
return t.parallel_foreach_table_state([&] (compaction::table_state& ts) -> future<> {
return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version, info);
return t.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) -> future<> {
auto lock_holder = co_await t.get_compaction_manager().get_incremental_repair_read_lock(ts, "upgrade_sstables_compaction");
co_await t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version, info);
});
});
}
@@ -620,7 +621,8 @@ future<> table_scrub_sstables_compaction_task_impl::run() {
auto& cm = _db.get_compaction_manager();
auto& cf = _db.find_column_family(_status.keyspace, _status.table);
tasks::task_info info{_status.id, _status.shard};
co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
co_await cf.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) mutable -> future<> {
auto lock_holder = co_await cm.get_incremental_repair_read_lock(ts, "scrub_sstables_compaction");
auto r = co_await cm.perform_sstable_scrub(ts, _opts, info);
_stats += r.value_or(sstables::compaction_stats{});
});
@@ -648,19 +650,20 @@ future<> shard_reshaping_compaction_task_impl::run() {
auto holder = table.async_gate().hold();
tasks::task_info info{_status.id, _status.shard};
std::unordered_map<compaction::table_state*, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
std::unordered_map<compaction::compaction_group_view*, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
for (auto& sstable : _dir.get_unshared_local_sstables()) {
auto& t = table.table_state_for_sstable(sstable);
auto& t = table.compaction_group_view_for_sstable(sstable);
sstables_grouped_by_compaction_group[&t].insert(sstable);
}
// reshape sstables individually within the compaction groups
for (auto& sstables_in_cg : sstables_grouped_by_compaction_group) {
auto lock_holder = co_await table.get_compaction_manager().get_incremental_repair_read_lock(*sstables_in_cg.first, "reshaping_compaction");
co_await reshape_compaction_group(*sstables_in_cg.first, sstables_in_cg.second, table, info);
}
}
future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {
future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(compaction::compaction_group_view& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {
while (true) {
auto reshape_candidates = sstables_in_cg

@@ -628,7 +628,7 @@ private:
std::function<bool (const sstables::shared_sstable&)> _filter;
uint64_t& _total_shard_size;
future<> reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
future<> reshape_compaction_group(compaction::compaction_group_view& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
public:
shard_reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
std::string keyspace,

@@ -21,7 +21,7 @@ extern logging::logger clogger;
using timestamp_type = api::timestamp_type;
time_window_compaction_strategy_state& time_window_compaction_strategy::get_state(table_state& table_s) const {
time_window_compaction_strategy_state& time_window_compaction_strategy::get_state(compaction_group_view& table_s) const {
return table_s.get_compaction_strategy_state().get<time_window_compaction_strategy_state>();
}
@@ -332,14 +332,14 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
return compaction_descriptor();
}
compaction_descriptor
time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
future<compaction_descriptor>
time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
auto& state = get_state(table_s);
auto compaction_time = gc_clock::now();
auto candidates = control.candidates(table_s);
auto candidates = co_await control.candidates(table_s);
if (candidates.empty()) {
return compaction_descriptor();
co_return compaction_descriptor();
}
auto now = db_clock::now();
@@ -350,7 +350,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
auto expired = table_s.fully_expired_sstables(candidates, compaction_time);
if (!expired.empty()) {
clogger.debug("[{}] Going to compact {} expired sstables", fmt::ptr(this), expired.size());
return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()));
co_return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()));
}
// Keep checking for fully_expired_sstables until we don't find
// any among the candidates, meaning they are either already compacted
@@ -362,7 +362,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
auto compaction_candidates = get_next_non_expired_sstables(table_s, control, std::move(candidates), compaction_time);
clogger.debug("[{}] Going to compact {} non-expired sstables", fmt::ptr(this), compaction_candidates.size());
return compaction_descriptor(std::move(compaction_candidates));
co_return compaction_descriptor(std::move(compaction_candidates));
}
time_window_compaction_strategy::bucket_compaction_mode
@@ -382,7 +382,7 @@ time_window_compaction_strategy::compaction_mode(const time_window_compaction_st
}
std::vector<shared_sstable>
time_window_compaction_strategy::get_next_non_expired_sstables(table_state& table_s, strategy_control& control,
time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time) {
auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables);
@@ -409,7 +409,7 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
}
std::vector<shared_sstable>
time_window_compaction_strategy::get_compaction_candidates(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables) {
time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables) {
auto& state = get_state(table_s);
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
// Update the highest window seen, if necessary
@@ -463,7 +463,7 @@ struct fmt::formatter<std::map<sstables::timestamp_type, std::vector<sstables::s
namespace sstables {
std::vector<shared_sstable>
time_window_compaction_strategy::newest_bucket(table_state& table_s, strategy_control& control, std::map<timestamp_type, std::vector<shared_sstable>> buckets,
time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<shared_sstable>> buckets,
int min_threshold, int max_threshold, timestamp_type now) {
auto& state = get_state(table_s);
clogger.debug("time_window_compaction_strategy::newest_bucket:\n now {}\n{}", now, buckets);
@@ -515,11 +515,12 @@ time_window_compaction_strategy::trim_to_threshold(std::vector<shared_sstable> b
return bucket;
}
int64_t time_window_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
auto& state = get_state(table_s);
auto min_threshold = table_s.min_compaction_threshold();
auto max_threshold = table_s.schema()->max_compaction_threshold();
auto candidate_sstables = *table_s.main_sstable_set().all() | std::ranges::to<std::vector>();
auto main_set = co_await table_s.main_sstable_set();
auto candidate_sstables = *main_set->all() | std::ranges::to<std::vector>();
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
int64_t n = 0;
@@ -535,11 +536,11 @@ int64_t time_window_compaction_strategy::estimated_pending_compactions(table_sta
break;
}
}
return n;
co_return n;
}
std::vector<compaction_descriptor>
time_window_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));

@@ -81,13 +81,13 @@ public:
enum class bucket_compaction_mode { none, size_tiered, major };
public:
time_window_compaction_strategy(const std::map<sstring, sstring>& options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;
static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
private:
time_window_compaction_strategy_state& get_state(table_state& table_s) const;
time_window_compaction_strategy_state& get_state(compaction_group_view& table_s) const;
static api::timestamp_type
to_timestamp_type(time_window_compaction_strategy_options::timestamp_resolutions resolution, int64_t timestamp_from_sstable) {
@@ -111,9 +111,9 @@ private:
compaction_mode(const time_window_compaction_strategy_state&, const bucket_t& bucket, api::timestamp_type bucket_key, api::timestamp_type now, size_t min_threshold) const;
std::vector<shared_sstable>
get_next_non_expired_sstables(table_state& table_s, strategy_control& control, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time);
get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time);
std::vector<shared_sstable> get_compaction_candidates(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables);
std::vector<shared_sstable> get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables);
public:
// Find the lowest timestamp for window of given size
static api::timestamp_type
@@ -126,7 +126,7 @@ public:
get_buckets(std::vector<shared_sstable> files, const time_window_compaction_strategy_options& options);
std::vector<shared_sstable>
newest_bucket(table_state& table_s, strategy_control& control, std::map<api::timestamp_type, std::vector<shared_sstable>> buckets,
newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<api::timestamp_type, std::vector<shared_sstable>> buckets,
int min_threshold, int max_threshold, api::timestamp_type now);
static std::vector<shared_sstable>
@@ -144,13 +144,13 @@ public:
private:
friend class time_window_backlog_tracker;
public:
virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;
virtual compaction_strategy_type type() const override {
return compaction_strategy_type::time_window;
}
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

@@ -776,6 +776,35 @@ maintenance_socket: ignore
# ...
#
#
# Azure Key Vault host(s).
#
# The unique name of the Azure host/account config that can be referenced in table schemas.
#
# host.yourdomain.com={ azure_tenant_id=<the tenant hosting your service principal>, azure_client_id=<ID of your service principal>, azure_client_secret=<secret of the service principal>, azure_client_certificate_path=<path to PEM-encoded certificate and private key of the service principal>, master_key=<vault name>/<keyname>, truststore=/path/to/truststore.pem, priority_string=<tls priority string>, key_cache_expiry=<cache expiry in ms>, key_cache_refresh=<cache refresh in ms>}:...
#
# Authentication can be explicit with Service Principal credentials. Either secret or certificate can be provided.
# If both are provided, the secret will be used. If no credentials are provided, the provider will try to detect them
# from the environment, the Azure CLI, and IMDS, in this specific order.
#
# master_key is a Vault key that will be used to wrap all keys used for actual encryption of scylla data.
# This key must be pre-created and the principal must have permissions for Wrapkey and Unwrapkey operations on this key.
#
# azure_hosts:
# <name>:
# azure_tenant_id: <the tenant hosting your service principal> (optional)
# azure_client_id: <ID of your service principal> (optional)
# azure_client_secret: <secret of the service principal> (optional)
# azure_client_certificate_path: <path to PEM-encoded certificate and private key of the service principal> (optional)
# master_key: <vault name>/<keyname> - named Vault key for key wrapping (optional)
# truststore: <PEM file with CA certificates for TLS connection> (optional)
# priority_string: <GnuTLS priority string for TLS handshake> (optional)
# key_cache_expiry: <key cache expiry period (ms)> (optional)
# key_cache_refresh: <key cache refresh/prune period (ms)> (optional)
# <name>:
# ...
#
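For concreteness, a hypothetical filled-in entry following the format above (every name and value below is a placeholder, not a working credential):

```yaml
# Hypothetical example only; names and values are placeholders.
azure_hosts:
  myvault.vault.azure.net:
    azure_tenant_id: 00000000-0000-0000-0000-000000000000
    azure_client_id: 11111111-1111-1111-1111-111111111111
    azure_client_secret: example-secret
    master_key: myvault/scylla-master-key
    key_cache_expiry: 600000
```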
#
# Server-global user information encryption settings
#
@@ -856,6 +885,13 @@ rf_rack_valid_keyspaces: false
# Note: DynamoDB has a hard-coded limit of 25.
# alternator_max_items_in_batch_write: 100
#
# Vector Store options
#
# URI for the vector store, given as a DNS name. Only the http scheme is supported. The port number is mandatory.
# Default is empty, which means that the vector store is not used.
# vector_store_uri: http://vector-store.dns.name:{port}
#
# io-streaming rate limiting
# When setting this value to be non-zero scylla throttles disk throughput for

@@ -469,6 +469,7 @@ scylla_tests = set([
'test/boost/chunked_vector_test',
'test/boost/clustering_ranges_walker_test',
'test/boost/compaction_group_test',
'test/boost/comparable_bytes_test',
'test/boost/compound_test',
'test/boost/compress_test',
'test/boost/config_test',
@@ -550,6 +551,7 @@ scylla_tests = set([
'test/boost/sstable_conforms_to_mutation_source_test',
'test/boost/sstable_datafile_test',
'test/boost/sstable_generation_test',
'test/boost/sstable_inexact_index_test',
'test/boost/sstable_move_test',
'test/boost/sstable_mutation_test',
'test/boost/sstable_partition_index_cache_test',
@@ -563,15 +565,21 @@ scylla_tests = set([
'test/boost/token_metadata_test',
'test/boost/top_k_test',
'test/boost/transport_test',
'test/boost/bti_key_translation_test',
'test/boost/bti_node_sink_test',
'test/boost/trie_traversal_test',
'test/boost/trie_writer_test',
'test/boost/symmetric_key_test',
'test/boost/types_test',
'test/boost/utf8_test',
'test/boost/vector_store_client_test',
'test/boost/vint_serialization_test',
'test/boost/virtual_table_mutation_source_test',
'test/boost/wasm_alloc_test',
'test/boost/wasm_test',
'test/boost/wrapping_interval_test',
'test/boost/unique_view_test',
'test/boost/scoped_item_list_test',
'test/manual/ec2_snitch_test',
'test/manual/enormous_table_scan_test',
'test/manual/gce_snitch_test',
@@ -769,6 +777,7 @@ scylla_raft_core = [
scylla_core = (['message/messaging_service.cc',
'replica/database.cc',
'replica/schema_describe_helper.cc',
'replica/table.cc',
'replica/tablets.cc',
'replica/distributed_loader.cc',
@@ -828,9 +837,8 @@ scylla_core = (['message/messaging_service.cc',
'readers/mutation_reader.cc',
'readers/mutation_readers.cc',
'mutation_query.cc',
'keys.cc',
'keys/keys.cc',
'counters.cc',
'compress.cc',
'sstable_dict_autotrainer.cc',
'sstables/sstables.cc',
'sstables/sstables_manager.cc',
@@ -842,6 +850,7 @@ scylla_core = (['message/messaging_service.cc',
'sstables/kl/reader.cc',
'sstables/sstable_version.cc',
'sstables/compress.cc',
'sstables/compressor.cc',
'sstables/checksummed_data_source.cc',
'sstables/sstable_mutation_reader.cc',
'compaction/compaction.cc',
@@ -860,6 +869,10 @@ scylla_core = (['message/messaging_service.cc',
'sstables/random_access_reader.cc',
'sstables/metadata_collector.cc',
'sstables/writer.cc',
'sstables/trie/bti_key_translation.cc',
'sstables/trie/bti_node_reader.cc',
'sstables/trie/bti_node_sink.cc',
'sstables/trie/trie_writer.cc',
'transport/cql_protocol_extension.cc',
'transport/event.cc',
'transport/event_notifier.cc',
@@ -957,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/murmur_hash.cc',
'utils/uuid.cc',
'utils/big_decimal.cc',
'types/comparable_bytes.cc',
'types/types.cc',
'validation.cc',
'service/migration_manager.cc',
@@ -1043,6 +1057,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/s3/client.cc',
'utils/s3/retryable_http_client.cc',
'utils/s3/retry_strategy.cc',
'utils/s3/s3_retry_strategy.cc',
'utils/s3/credentials_providers/aws_credentials_provider.cc',
'utils/s3/credentials_providers/environment_aws_credentials_provider.cc',
'utils/s3/credentials_providers/instance_profile_credentials_provider.cc',
@@ -1050,6 +1065,11 @@ scylla_core = (['message/messaging_service.cc',
'utils/s3/credentials_providers/aws_credentials_provider_chain.cc',
'utils/s3/utils/manip_s3.cc',
'utils/advanced_rpc_compressor.cc',
'utils/azure/identity/credentials.cc',
'utils/azure/identity/service_principal_credentials.cc',
'utils/azure/identity/managed_identity_credentials.cc',
'utils/azure/identity/azure_cli_credentials.cc',
'utils/azure/identity/default_credentials.cc',
'gms/version_generator.cc',
'gms/versioned_value.cc',
'gms/gossiper.cc',
@@ -1118,6 +1138,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/lister.cc',
'repair/repair.cc',
'repair/row_level.cc',
'repair/incremental.cc',
'streaming/table_check.cc',
'exceptions/exceptions.cc',
'auth/allow_all_authenticator.cc',
@@ -1156,7 +1177,6 @@ scylla_core = (['message/messaging_service.cc',
'utils/aws_sigv4.cc',
'duration.cc',
'vint-serialization.cc',
'utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc',
'querier.cc',
'mutation_writer/multishard_writer.cc',
'ent/encryption/encryption_config.cc',
@@ -1173,6 +1193,8 @@ scylla_core = (['message/messaging_service.cc',
'ent/encryption/gcp_host.cc',
'ent/encryption/gcp_key_provider.cc',
'ent/encryption/utils.cc',
'ent/encryption/azure_host.cc',
'ent/encryption/azure_key_provider.cc',
'ent/ldap/ldap_connection.cc',
'multishard_mutation_query.cc',
'reader_concurrency_semaphore.cc',
@@ -1217,6 +1239,7 @@ scylla_core = (['message/messaging_service.cc',
'node_ops/task_manager_module.cc',
'reader_concurrency_semaphore_group.cc',
'utils/disk_space_monitor.cc',
'service/vector_store_client.cc',
] + [Antlr3Grammar('cql3/Cql.g')] \
+ scylla_raft_core
)
@@ -1373,6 +1396,7 @@ scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_gener
'test/lib/exception_utils.cc',
'test/lib/random_schema.cc',
'test/lib/key_utils.cc',
'test/lib/proc_utils.cc',
]
scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc', 'utils/exceptions.cc']
@@ -1536,7 +1560,6 @@ deps['test/boost/combined_tests'] += [
'test/boost/query_processor_test.cc',
'test/boost/reader_concurrency_semaphore_test.cc',
'test/boost/repair_test.cc',
'test/boost/replicator_test.cc',
'test/boost/restrictions_test.cc',
'test/boost/role_manager_test.cc',
'test/boost/row_cache_test.cc',
@@ -1546,7 +1569,6 @@ deps['test/boost/combined_tests'] += [
'test/boost/sessions_test.cc',
'test/boost/sstable_compaction_test.cc',
'test/boost/sstable_compressor_factory_test.cc',
'test/boost/sstable_compression_config_test.cc',
'test/boost/sstable_directory_test.cc',
'test/boost/sstable_set_test.cc',
'test/boost/statement_restrictions_test.cc',
@@ -2131,7 +2153,6 @@ def kmip_arch():
kmipc_dir = f'kmipc/kmipc-2.1.0t-{kmiplib()}_{kmip_arch()}'
kmipc_lib = f'{kmipc_dir}/lib/libkmip.a'
libs += ' -lboost_filesystem'
if os.path.exists(kmipc_lib):
libs += f' {kmipc_lib}'
user_cflags += f' -I{kmipc_dir}/include -DHAVE_KMIP'
@@ -2407,7 +2428,6 @@ def write_build_file(f,
objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
for src in srcs
if src.endswith('.cc')]
objs.append('$builddir/../utils/arch/powerpc/crc32-vpmsum/crc32.S')
has_rust = False
for dep in deps[binary]:
if isinstance(dep, Antlr3Grammar):


@@ -413,6 +413,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
bool bypass_cache = false;
auto attrs = std::make_unique<cql3::attributes::raw>();
expression wclause = conjunction{};
bool is_ann_ordering = false;
}
: K_SELECT (
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
@@ -425,7 +426,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
)
( K_WHERE w=whereClause { wclause = std::move(w); } )?
( K_GROUP K_BY gbcolumns=listOfIdentifiers)?
( K_ORDER K_BY orderByClause[orderings] ( ',' orderByClause[orderings] )* )?
( K_ORDER K_BY orderByClause[orderings, is_ann_ordering] ( ',' orderByClause[orderings, is_ann_ordering] )* )?
( K_PER K_PARTITION K_LIMIT rows=intValue { per_partition_limit = std::move(rows); } )?
( K_LIMIT rows=intValue { limit = std::move(rows); } )?
( K_ALLOW K_FILTERING { allow_filtering = true; } )?
@@ -484,11 +485,37 @@ whereClause returns [uexpression clause]
{ clause = conjunction{std::move(terms)}; }
;
orderByClause[raw::select_statement::parameters::orderings_type& orderings]
orderByClause[raw::select_statement::parameters::orderings_type& orderings, bool& is_ann_ordering]
@init{
raw::select_statement::ordering ordering = raw::select_statement::ordering::ascending;
std::optional<expression> ann_ordering;
}
: c=cident (K_ANN K_OF t=term {ann_ordering=std::move(t);})? (K_ASC | K_DESC { ordering = raw::select_statement::ordering::descending; })?
{
if (!ann_ordering) {
if (is_ann_ordering) {
throw exceptions::invalid_request_exception(
"ANN ordering does not support any other ordering");
}
orderings.emplace_back(c, ordering);
} else {
if (ordering != raw::select_statement::ordering::ascending) {
throw exceptions::invalid_request_exception(
"Descending ANN ordering is not supported");
}
if (!orderings.empty()) {
if (is_ann_ordering) {
throw exceptions::invalid_request_exception(
"Cannot specify more than one ANN ordering");
} else {
throw exceptions::invalid_request_exception(
"ANN ordering does not support any other ordering");
}
}
is_ann_ordering = true;
orderings.emplace_back(c, ann_ordering.value());
}
}
: c=cident (K_ASC | K_DESC { ordering = raw::select_statement::ordering::descending; })? { orderings.emplace_back(c, ordering); }
;
jsonValue returns [uexpression value]
@@ -2243,6 +2270,7 @@ K_ORDER: O R D E R;
K_BY: B Y;
K_ASC: A S C;
K_DESC: D E S C;
K_ANN: A N N;
K_ALLOW: A L L O W;
K_FILTERING: F I L T E R I N G;
K_IF: I F;


@@ -18,21 +18,21 @@ static logging::logger dlogger{"description"};
namespace cql3 {
std::vector<bytes_opt> description::serialize(bool serialize_create_statement) const {
std::vector<bytes_opt> result{};
std::vector<managed_bytes_opt> description::serialize(bool serialize_create_statement) && {
std::vector<managed_bytes_opt> result{};
result.reserve(serialize_create_statement ? 4 : 3);
if (keyspace) {
result.push_back(to_bytes(cql3::util::maybe_quote(*keyspace)));
result.push_back(to_managed_bytes(cql3::util::maybe_quote(*keyspace)));
} else {
result.push_back(data_value::make_null(utf8_type).serialize());
result.push_back(to_managed_bytes_opt(data_value::make_null(utf8_type).serialize()));
}
result.push_back(to_bytes(type));
result.push_back(to_bytes(cql3::util::maybe_quote(name)));
result.push_back(to_managed_bytes(type));
result.push_back(to_managed_bytes(cql3::util::maybe_quote(name)));
if (serialize_create_statement && create_statement) {
result.push_back(to_bytes(*create_statement));
result.push_back(std::move(create_statement.value()).as_managed_bytes());
} else if (serialize_create_statement) {
on_internal_error(dlogger, "create_statement field is empty");
}


@@ -11,7 +11,7 @@
#include <seastar/core/sstring.hh>
#include <seastar/util/bool_class.hh>
#include "bytes_fwd.hh"
#include "utils/managed_string.hh"
#include <optional>
#include <vector>
@@ -69,8 +69,18 @@ struct description {
sstring type;
/// The name of the entity itself, e.g. a keyspace of name `ks` will be of name: ks
sstring name;
/// CQL statement that can be used to restore the entity.
std::optional<sstring> create_statement;
/// Encoded CQL statement that can be used to restore the entity.
///
/// Technical note:
/// ---------------
/// This field could (and used to) be an optional of `sstring`.
/// The reason why we use `managed_string` instead is that some create statements
/// may be quite large and lead to oversized allocations if we use a contiguous
/// memory buffer. That's a rare occurrence (in my own experience), but it has
/// happened: see issue scylladb/scylladb#24018. That's why we need to use
/// `managed_string` right away: it's less convenient to handle, but this struct
/// is pretty much only used for serialization purposes, so it's a good trade-off.
std::optional<managed_string> create_statement;
/// Serialize the description to represent multiple UTF-8 columns.
/// The number of columns will be equal to 4 unless `serialize_create_statement`
@@ -80,7 +90,7 @@ struct description {
///
/// Precondition: if `serialize_create_statement` is true, then `create_statement.has_value()`
/// is also true.
std::vector<bytes_opt> serialize(bool serialize_create_statement = true) const;
std::vector<managed_bytes_opt> serialize(bool serialize_create_statement = true) &&;
};
} // namespace cql3
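The technical note above (why `create_statement` holds a `managed_string` rather than an `sstring`) can be illustrated with a minimal, hypothetical sketch. This is not Scylla's actual `managed_string` from `utils/managed_string.hh`; it only demonstrates the underlying idea: store a large string as a sequence of fixed-size chunks so that no single allocation grows with the total length, avoiding the oversized contiguous allocations described in scylladb/scylladb#24018.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <deque>
#include <string>
#include <string_view>

// Sketch of a fragmented string: appends go into fixed-size chunks,
// so every individual allocation stays bounded regardless of total size.
class fragmented_string {
    static constexpr size_t chunk_size = 128 * 1024; // illustrative chunk bound
    std::deque<std::string> _chunks;
public:
    void append(std::string_view s) {
        while (!s.empty()) {
            if (_chunks.empty() || _chunks.back().size() == chunk_size) {
                _chunks.emplace_back();
                _chunks.back().reserve(chunk_size);
            }
            auto& back = _chunks.back();
            size_t n = std::min(s.size(), chunk_size - back.size());
            back.append(s.substr(0, n));
            s.remove_prefix(n);
        }
    }
    size_t size() const {
        size_t total = 0;
        for (const auto& c : _chunks) {
            total += c.size();
        }
        return total;
    }
    // Linearize only when a contiguous view is unavoidable.
    std::string to_string() const {
        std::string out;
        out.reserve(size());
        for (const auto& c : _chunks) {
            out += c;
        }
        return out;
    }
};
```

The trade-off is exactly the one the note describes: the fragmented form is less convenient to consume, which is acceptable for a struct used almost exclusively for serialization.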


@@ -6,8 +6,8 @@
#include "expression.hh"
#include "bytes.hh"
#include "keys.hh"
#include "interval.hh"
#include "keys/keys.hh"
#include "utils/interval.hh"
#include "cql3/expr/restrictions.hh"
#include "cql3/assignment_testable.hh"
#include "cql3/statements/bound.hh"


@@ -1349,7 +1349,7 @@ static managed_bytes reserialize_value(View value_bytes,
if (type.is_map()) {
std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes);
const map_type_impl& mapt = dynamic_cast<const map_type_impl&>(type);
const map_type_impl mapt = dynamic_cast<const map_type_impl&>(type);
const abstract_type& key_type = mapt.get_keys_type()->without_reversed();
const abstract_type& value_type = mapt.get_values_type()->without_reversed();
@@ -1391,7 +1391,7 @@ static managed_bytes reserialize_value(View value_bytes,
const vector_type_impl& vtype = dynamic_cast<const vector_type_impl&>(type);
std::vector<managed_bytes> elements = vtype.split_fragmented(value_bytes);
const auto& elements_type = vtype.get_elements_type()->without_reversed();
auto elements_type = vtype.get_elements_type()->without_reversed();
if (elements_type.bound_value_needs_to_be_reserialized()) {
for (size_t i = 0; i < elements.size(); i++) {


@@ -26,6 +26,7 @@
#include <cstdint>
#include <optional>
#include <type_traits>
#include "utils/managed_string.hh"
using namespace cql3;
using namespace functions;
@@ -357,7 +358,7 @@ user_aggregate::user_aggregate(function_name fname, bytes_opt initcond, ::shared
bool user_aggregate::has_finalfunc() const { return _agg.state_to_result_function != nullptr; }
description user_aggregate::describe(with_create_statement with_stmt) const {
auto maybe_create_statement = std::invoke([&] -> std::optional<sstring> {
auto maybe_create_statement = std::invoke([&] -> std::optional<managed_string> {
if (!with_stmt) {
return std::nullopt;
}
@@ -365,7 +366,7 @@ description user_aggregate::describe(with_create_statement with_stmt) const {
auto ks = cql3::util::maybe_quote(name().keyspace);
auto na = cql3::util::maybe_quote(name().name);
std::ostringstream os;
fragmented_ostringstream os;
os << "CREATE AGGREGATE " << ks << "." << na << "(";
auto a = arg_types();
@@ -390,7 +391,7 @@ description user_aggregate::describe(with_create_statement with_stmt) const {
}
os << ";";
return std::move(os).str();
return std::move(os).to_managed_string();
});
return description {


@@ -16,6 +16,7 @@
#include "cql3/functions/function_name.hh"
#include "schema/schema.hh"
#include <unordered_map>
#include "data_dictionary/user_types_metadata.hh"
namespace cql3 {
@@ -102,6 +103,13 @@ const functions& instance();
class change_batch : public functions {
public:
struct func_name_and_args {
function_name name;
std::vector<data_type> arg_types;
bool aggregate;
};
std::vector<func_name_and_args> removed_functions;
// Skip init as we copy data from static instance.
change_batch() : functions(skip_init{}) {
_declared = instance()._declared;
@@ -112,6 +120,15 @@ public:
// Used only by unittest.
void clear_functions() noexcept;
void remove_function(function_name& name, std::vector<data_type>& arg_types, bool aggregate = false) {
removed_functions.emplace_back(name, arg_types, aggregate);
functions::remove_function(name, arg_types);
};
void remove_aggregate(function_name& name, std::vector<data_type>& arg_types) {
remove_function(name, arg_types, true);
}
};
}


@@ -11,6 +11,7 @@
#include "cql3/util.hh"
#include "utils/log.hh"
#include "lang/wasm.hh"
#include "utils/managed_string.hh"
#include <seastar/core/thread.hh>
@@ -70,11 +71,13 @@ bytes_opt user_function::execute(std::span<const bytes_opt> parameters) {
}
description user_function::describe(with_create_statement with_stmt) const {
auto maybe_create_statement = std::invoke([&] -> std::optional<sstring> {
auto maybe_create_statement = std::invoke([&] -> std::optional<managed_string> {
if (!with_stmt) {
return std::nullopt;
}
fragmented_ostringstream stream;
auto arg_type_range = _arg_types | std::views::transform(std::mem_fn(&abstract_type::cql3_type_name_without_frozen));
auto arg_range = std::views::zip(_arg_names, arg_type_range)
| std::views::transform([] (std::tuple<std::string_view, std::string_view> arg) {
@@ -82,7 +85,7 @@ description user_function::describe(with_create_statement with_stmt) const {
return seastar::format("{} {}", name, type);
});
return seastar::format("CREATE FUNCTION {}.{}({})\n"
fmt::format_to(stream.to_iter(), "CREATE FUNCTION {}.{}({})\n"
"{} ON NULL INPUT\n"
"RETURNS {}\n"
"LANGUAGE {}\n"
@@ -92,6 +95,8 @@ description user_function::describe(with_create_statement with_stmt) const {
_return_type->cql3_type_name_without_frozen(),
_language,
_body);
return std::move(stream).to_managed_string();
});
return description {


@@ -18,7 +18,7 @@ namespace cql3 {
const cql_config default_cql_config(cql_config::default_tag{});
thread_local const query_options::specific_options query_options::specific_options::DEFAULT{
-1, {}, db::consistency_level::SERIAL, api::missing_timestamp};
-1, {}, db::consistency_level::SERIAL, api::missing_timestamp, service::node_local_only::no};
thread_local query_options query_options::DEFAULT{default_cql_config,
db::consistency_level::ONE, std::nullopt,


@@ -19,6 +19,7 @@
#include "service/pager/paging_state.hh"
#include "cql3/values.hh"
#include "utils/small_vector.hh"
#include "service/storage_proxy_fwd.hh"
namespace cql3 {
@@ -74,6 +75,7 @@ public:
const lw_shared_ptr<service::pager::paging_state> state;
const std::optional<db::consistency_level> serial_consistency;
const api::timestamp_type timestamp;
const service::node_local_only node_local_only;
};
private:
const cql_config& _cql_config;


@@ -27,6 +27,7 @@
#include "cql3/untyped_result_set.hh"
#include "db/config.hh"
#include "data_dictionary/data_dictionary.hh"
#include "service/vector_store_client.hh"
#include "utils/hashers.hh"
#include "utils/error_injection.hh"
#include "service/migration_manager.hh"
@@ -68,11 +69,12 @@ static service::query_state query_state_for_internal_call() {
return {service::client_state::for_internal_calls(), empty_service_permit()};
}
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, service::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _mnotifier(mn)
, _vector_store_client(vsc)
, _mcfg(mcfg)
, _cql_config(cql_cfg)
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
@@ -679,15 +681,32 @@ query_processor::prepare(sstring query_string, service::query_state& query_state
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
using namespace cql_transport::messages;
return prepare_one<result_message::prepared::cql>(
std::move(query_string),
client_state,
d,
[d] (std::string_view query_string, std::string_view keyspace) {
return compute_id(query_string, keyspace, d);
},
prepared_cache_key_type::cql_id);
try {
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
bound_terms,
std::numeric_limits<uint16_t>::max()));
}
SCYLLA_ASSERT(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
const auto& warnings = prep_ptr->warnings;
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
for (const auto& w : warnings) {
msg->add_warning(w);
}
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
} catch(typename prepared_statements_cache::statement_is_too_big&) {
throw prepared_statement_is_too_big(query_string);
}
}
static std::string hash_target(std::string_view query_string, std::string_view keyspace) {
@@ -783,7 +802,8 @@ query_options query_processor::make_internal_options(
const statements::prepared_statement::checked_weak_ptr& p,
const std::vector<data_value_or_unset>& values,
db::consistency_level cl,
int32_t page_size) const {
int32_t page_size,
service::node_local_only node_local_only) const {
if (p->bound_names.size() != values.size()) {
throw std::invalid_argument(
format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
@@ -810,16 +830,16 @@ query_options query_processor::make_internal_options(
}, var);
++ni;
}
if (page_size > 0) {
lw_shared_ptr<service::pager::paging_state> paging_state;
db::consistency_level serial_consistency = db::consistency_level::SERIAL;
api::timestamp_type ts = api::missing_timestamp;
return query_options(
cl,
std::move(bound_values),
cql3::query_options::specific_options{page_size, std::move(paging_state), serial_consistency, ts});
}
return query_options(cl, std::move(bound_values));
return query_options(
cl,
std::move(bound_values),
cql3::query_options::specific_options {
.page_size = page_size,
.state = {},
.serial_consistency = db::consistency_level::SERIAL,
.timestamp = api::missing_timestamp,
.node_local_only = node_local_only
});
}
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
@@ -939,7 +959,7 @@ query_processor::execute_internal(
}
}
future<std::vector<mutation>> query_processor::get_mutations_internal(
future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
const sstring query_string,
service::query_state& query_state,
api::timestamp_type timestamp,
@@ -1135,9 +1155,6 @@ void query_processor::migration_subscriber::on_update_view(
on_update_column_family(ks_name, view_name, columns_changed);
}
void query_processor::migration_subscriber::on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) {
}
void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
remove_invalid_prepared_statements(ks_name, std::nullopt);
}


@@ -28,11 +28,13 @@
#include "transport/messages/result_message.hh"
#include "service/client_state.hh"
#include "service/broadcast_tables/experimental/query_result.hh"
#include "service/vector_store_client.hh"
#include "utils/assert.hh"
#include "utils/observable.hh"
#include "service/raft/raft_group0_client.hh"
#include "types/types.hh"
#include "db/auth_version.hh"
#include "service/storage_proxy_fwd.hh"
namespace lang { class manager; }
@@ -107,6 +109,7 @@ private:
service::storage_proxy& _proxy;
data_dictionary::database _db;
service::migration_notifier& _mnotifier;
service::vector_store_client& _vector_store_client;
memory_config _mcfg;
const cql_config& _cql_config;
@@ -146,7 +149,7 @@ public:
static std::unique_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query, dialect d);
static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries, dialect d);
query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm);
query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, service::vector_store_client& vsc, memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm);
~query_processor();
@@ -176,6 +179,14 @@ public:
lang::manager& lang() { return _lang_manager; }
const service::vector_store_client& vector_store_client() const noexcept {
return _vector_store_client;
}
service::vector_store_client& vector_store_client() noexcept {
return _vector_store_client;
}
db::auth_version_t auth_version;
statements::prepared_statement::checked_weak_ptr get_prepared(const std::optional<auth::authenticated_user>& user, const prepared_cache_key_type& key) {
@@ -384,7 +395,7 @@ public:
// function enables putting multiple CQL queries into a single raft command
// and vice versa, split mutations from one query into separate commands.
// It supports write-only queries, read-modified-writes not supported.
future<std::vector<mutation>> get_mutations_internal(
future<utils::chunked_vector<mutation>> get_mutations_internal(
const sstring query_string,
service::query_state& query_state,
api::timestamp_type timestamp,
@@ -463,15 +474,16 @@ public:
bool topology_global_queue_empty();
private:
// Keep the holder until you stop using the `remote` services.
std::pair<std::reference_wrapper<remote>, gate::holder> remote();
query_options make_internal_options(
const statements::prepared_statement::checked_weak_ptr& p,
const std::vector<data_value_or_unset>& values,
db::consistency_level,
int32_t page_size = -1) const;
int32_t page_size = -1,
service::node_local_only node_local_only = service::node_local_only::no) const;
private:
// Keep the holder until you stop using the `remote` services.
std::pair<std::reference_wrapper<remote>, gate::holder> remote();
future<::shared_ptr<cql_transport::messages::result_message>>
process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard);
@@ -517,58 +529,6 @@ private:
future<::shared_ptr<cql_transport::messages::result_message>> execute_with_guard(
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)> fn,
::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options);
///
/// \tparam ResultMsgType type of the returned result message (CQL)
/// \tparam PreparedKeyGenerator a function that generates the prepared statement cache key for given query and
/// keyspace
/// \tparam IdGetter a function that returns the corresponding prepared statement ID (CQL) for a given
//// prepared statement cache key
/// \param query_string
/// \param client_state
/// \param id_gen prepared ID generator, called before the first deferring
/// \param id_getter prepared ID getter, passed to deferred context by reference. The caller must ensure its
//// liveness.
/// \return
template <typename ResultMsgType, typename PreparedKeyGenerator, typename IdGetter>
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
prepare_one(
sstring query_string,
const service::client_state& client_state,
dialect d,
PreparedKeyGenerator&& id_gen,
IdGetter&& id_getter) {
return do_with(
id_gen(query_string, client_state.get_raw_keyspace()),
std::move(query_string),
[this, &client_state, &id_getter, d](const prepared_cache_key_type& key, const sstring& query_string) {
return _prepared_cache.get(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
bound_terms,
std::numeric_limits<uint16_t>::max()));
}
SCYLLA_ASSERT(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
}).then([&key, &id_getter, &client_state] (auto prep_ptr) {
const auto& warnings = prep_ptr->warnings;
const auto msg =
::make_shared<ResultMsgType>(id_getter(key), std::move(prep_ptr),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
for (const auto& w : warnings) {
msg->add_warning(w);
}
return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(std::move(msg));
}).handle_exception_type([&query_string] (typename prepared_statements_cache::statement_is_too_big&) {
return make_exception_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(
prepared_statement_is_too_big(query_string));
});
});
};
};
class query_processor::migration_subscriber : public service::migration_listener {
@@ -590,7 +550,6 @@ public:
virtual void on_update_function(const sstring& ks_name, const sstring& function_name) override;
virtual void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override;
virtual void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override;
virtual void on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) override;
virtual void on_drop_keyspace(const sstring& ks_name) override;
virtual void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override;


@@ -26,6 +26,7 @@
#include "cql3/statements/request_validations.hh"
#include "cql3/functions/token_fct.hh"
#include "dht/i_partitioner.hh"
#include "db/schema_tables.hh"
#include "types/tuple.hh"
namespace {
@@ -1274,14 +1275,16 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
}
const auto& im = index_opt->metadata();
sstring index_table_name = im.name() + "_index";
schema_ptr view_schema = db.find_schema(schema->ks_name(), index_table_name);
_view_schema = view_schema;
if (db::schema_tables::view_should_exist(im)) {
sstring index_table_name = im.name() + "_index";
schema_ptr view_schema = db.find_schema(schema->ks_name(), index_table_name);
_view_schema = view_schema;
if (im.local()) {
prepare_indexed_local(*view_schema);
} else {
prepare_indexed_global(*view_schema);
if (im.local()) {
prepare_indexed_local(*view_schema);
} else {
prepare_indexed_global(*view_schema);
}
}
}
}


@@ -18,14 +18,22 @@ namespace cql3 {
metadata::metadata(std::vector<lw_shared_ptr<column_specification>> names_)
: _flags(flag_enum_set())
, _column_info(make_lw_shared<column_info>(std::move(names_)))
{ }
{
if (!_column_info->_names.empty() && column_specification::all_in_same_table(_column_info->_names)) {
_flags.set<flag::GLOBAL_TABLES_SPEC>();
}
}
metadata::metadata(flag_enum_set flags, std::vector<lw_shared_ptr<column_specification>> names_, uint32_t column_count,
lw_shared_ptr<const service::pager::paging_state> paging_state)
: _flags(flags)
, _column_info(make_lw_shared<column_info>(std::move(names_), column_count))
, _paging_state(std::move(paging_state))
{ }
{
if (!_column_info->_names.empty() && column_specification::all_in_same_table(_column_info->_names)) {
_flags.set<flag::GLOBAL_TABLES_SPEC>();
}
}
// The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
uint32_t metadata::value_count() const {
@@ -38,14 +46,6 @@ void metadata::add_non_serialized_column(lw_shared_ptr<column_specification> nam
_column_info->_names.emplace_back(std::move(name));
}
bool metadata::all_in_same_cf() const {
if (_flags.contains<flag::NO_METADATA>()) {
return false;
}
return column_specification::all_in_same_table(_column_info->_names);
}
void metadata::set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state) {
_flags.set<flag::HAS_MORE_PAGES>();
_paging_state = std::move(paging_state);


@@ -74,9 +74,6 @@ public:
void add_non_serialized_column(lw_shared_ptr<column_specification> name);
private:
bool all_in_same_cf() const;
public:
void set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state);
void maybe_set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state);


@@ -201,7 +201,7 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
const auto tmptr = qp.proxy().get_token_metadata_ptr();
const auto& feat = qp.proxy().features();
auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, *tmptr, feat);
-std::vector<mutation> muts;
+utils::chunked_vector<mutation> muts;
std::vector<sstring> warnings;
auto old_ks_options = get_old_options_flattened(ks);
auto ks_options = get_current_options_flattened(_attrs, feat);
@@ -276,44 +276,33 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
}
auto rs = locator::abstract_replication_strategy::create_replication_strategy(
ks_md_update->strategy_name(),
locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
// If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
// would lead to an RF-rack-invalid keyspace. Verify that this change does not.
// For more context, see: scylladb/scylladb#23071.
try {
// There are two things to note here:
// 1. We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
// 2. The replication strategy we use here does NOT represent the actual state
// we will arrive at after applying the schema change. For instance, if the user
// did not specify the RF for some of the DCs, it's equal to 0 in the replication
// strategy we pass to this function, while in reality that means that the RF
// will NOT change. That is not a problem:
// - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
// - the keyspace must've been RF-rack-valid before this change. We check that
// condition for all keyspaces at startup.
// The second hyphen is not really true because currently topological changes can
// disturb it (see scylladb/scylladb#23345), but we ignore that.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
if (qp.db().get_config().rf_rack_valid_keyspaces()) {
if (qp.db().get_config().rf_rack_valid_keyspaces()) {
auto rs = locator::abstract_replication_strategy::create_replication_strategy(
ks_md_update->strategy_name(),
locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
try {
// There are two things to note here:
// 1. We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
// 2. The replication strategy we use here does NOT represent the actual state
// we will arrive at after applying the schema change. For instance, if the user
// did not specify the RF for some of the DCs, it's equal to 0 in the replication
// strategy we pass to this function, while in reality that means that the RF
// will NOT change. That is not a problem:
// - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
// - the keyspace must've been RF-rack-valid before this change. We check that
// condition for all keyspaces at startup.
// The second hyphen is not really true because currently topological changes can
// disturb it (see scylladb/scylladb#23345), but we ignore that.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
// There's no guarantee what the type of the exception will be, so we need to
// wrap it manually here in a type that can be passed to the user.
throw exceptions::invalid_request_exception(e.what());
} else {
// Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
// we'd like to inform the user that the keyspace they're altering will not
// satisfy the restriction after the change--but just as a warning.
// For more context, see issue: scylladb/scylladb#23330.
warnings.push_back(seastar::format(
"Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
"the rack count in at least one datacenter. A rack failure may reduce availability. "
"For more context, see: "
"https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
_name));
}
}


@@ -460,7 +460,7 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
return make_pair(cfm.build(), std::move(view_updates));
}
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>>
alter_table_statement::prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type ts) const {
data_dictionary::database db = qp.db();
auto [s, view_updates] = prepare_schema_update(db, options);


@@ -64,7 +64,7 @@ public:
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
virtual future<::shared_ptr<messages::result_message>> execute(query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const override;
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
private:
void add_column(const query_options& options, const schema& schema, data_dictionary::table cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
void alter_column(const query_options& options, const schema& schema, data_dictionary::table cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;


@@ -46,8 +46,8 @@ const sstring& alter_type_statement::keyspace() const
return _name.get_keyspace();
}
-future<std::vector<mutation>> alter_type_statement::prepare_announcement_mutations(service::storage_proxy& sp, api::timestamp_type ts) const {
-    std::vector<mutation> m;
+future<utils::chunked_vector<mutation>> alter_type_statement::prepare_announcement_mutations(service::storage_proxy& sp, api::timestamp_type ts) const {
+    utils::chunked_vector<mutation> m;
auto&& ks = sp.data_dictionary().find_keyspace(keyspace());
auto&& all_types = ks.metadata()->user_types().get_all_types();
auto to_update = all_types.find(_name.get_user_type_name());
@@ -96,7 +96,7 @@ future<std::vector<mutation>> alter_type_statement::prepare_announcement_mutatio
co_return m;
}
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>>
alter_type_statement::prepare_schema_mutations(query_processor& qp, const query_options&, api::timestamp_type ts) const {
try {
auto m = co_await prepare_announcement_mutations(qp.proxy(), ts);


@@ -35,14 +35,14 @@ public:
virtual const sstring& keyspace() const override;
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
class add_or_alter;
class renames;
protected:
virtual user_type make_updated_type(data_dictionary::database db, user_type to_update) const = 0;
private:
-future<std::vector<mutation>> prepare_announcement_mutations(service::storage_proxy& sp, api::timestamp_type) const;
+future<utils::chunked_vector<mutation>> prepare_announcement_mutations(service::storage_proxy& sp, api::timestamp_type) const;
};
class alter_type_statement::add_or_alter : public alter_type_statement {


@@ -75,7 +75,7 @@ view_ptr alter_view_statement::prepare_view(data_dictionary::database db) const
return view_ptr(builder.build());
}
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> alter_view_statement::prepare_schema_mutations(query_processor& qp, const query_options&, api::timestamp_type ts) const {
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>> alter_view_statement::prepare_schema_mutations(query_processor& qp, const query_options&, api::timestamp_type ts) const {
auto m = co_await service::prepare_view_update_announcement(qp.proxy(), prepare_view(qp.db()), ts);
using namespace cql_transport;


@@ -33,7 +33,7 @@ public:
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
-future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
+future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chunked_vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const override;
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
};


@@ -155,7 +155,7 @@ const std::vector<batch_statement::single_statement>& batch_statement::get_state
return _statements;
}
-future<std::vector<mutation>> batch_statement::get_mutations(query_processor& qp, const query_options& options,
+future<utils::chunked_vector<mutation>> batch_statement::get_mutations(query_processor& qp, const query_options& options,
db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, service::query_state& query_state) const {
// Do not process in parallel because operations like list append/prepend depend on execution order.
using mutation_set_type = std::unordered_set<mutation, mutation_hash_by_key, mutation_equals_by_key>;
@@ -182,7 +182,7 @@ future<std::vector<mutation>> batch_statement::get_mutations(query_processor& qp
}
// can't use range adaptors, because we want to move
-auto vresult = std::vector<mutation>();
+auto vresult = utils::chunked_vector<mutation>();
vresult.reserve(result.size());
for (auto&& m : result) {
vresult.push_back(std::move(m));
@@ -190,7 +190,7 @@ future<std::vector<mutation>> batch_statement::get_mutations(query_processor& qp
co_return vresult;
}
-void batch_statement::verify_batch_size(query_processor& qp, const std::vector<mutation>& mutations) {
+void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) {
if (mutations.size() <= 1) {
return; // We only warn for batch spanning multiple mutations
}
@@ -273,7 +273,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
auto timeout = db::timeout_clock::now() + get_timeout(query_state.get_client_state(), options);
return get_mutations(qp, options, timeout, local, now, query_state).then([this, &qp, &options, timeout, tr_state = query_state.get_trace_state(),
-    permit = query_state.get_permit()] (std::vector<mutation> ms) mutable {
+    permit = query_state.get_permit()] (utils::chunked_vector<mutation> ms) mutable {
return execute_without_conditions(qp, std::move(ms), options.get_consistency(), timeout, std::move(tr_state), std::move(permit));
}).then([] (coordinator_result<> res) {
if (!res) {
@@ -287,7 +287,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
future<coordinator_result<>> batch_statement::execute_without_conditions(
query_processor& qp,
-    std::vector<mutation> mutations,
+    utils::chunked_vector<mutation> mutations,
db::consistency_level cl,
db::timeout_clock::time_point timeout,
tracing::trace_state_ptr tr_state,
@@ -367,14 +367,14 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
throw exceptions::invalid_request_exception(format("Unrestricted partition key in a conditional BATCH"));
}
-auto shard = service::storage_proxy::cas_shard(*_statements[0].statement->s, request->key()[0].start()->value().as_decorated_key().token());
-if (shard != this_shard_id()) {
+auto cas_shard = service::cas_shard(*_statements[0].statement->s, request->key()[0].start()->value().as_decorated_key().token());
+if (!cas_shard.this_shard()) {
     return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
-        qp.bounce_to_shard(shard, std::move(cached_fn_calls))
+        qp.bounce_to_shard(cas_shard.shard(), std::move(cached_fn_calls))
     );
 }
-return qp.proxy().cas(schema, request, request->read_command(qp), request->key(),
+return qp.proxy().cas(schema, std::move(cas_shard), request, request->read_command(qp), request->key(),
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
cl_for_paxos, cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
return request->build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied);


@@ -108,7 +108,7 @@ public:
const std::vector<single_statement>& get_statements();
private:
-future<std::vector<mutation>> get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout,
+future<utils::chunked_vector<mutation>> get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout,
bool local, api::timestamp_type now, service::query_state& query_state) const;
public:
@@ -116,7 +116,7 @@ public:
* Checks batch size to ensure threshold is met. If not, a warning is logged.
* @param cfs ColumnFamilies that will store the batch's mutations.
*/
-static void verify_batch_size(query_processor& qp, const std::vector<mutation>& mutations);
+static void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations);
virtual future<shared_ptr<cql_transport::messages::result_message>> execute(
query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const override;
@@ -132,7 +132,7 @@ private:
future<exceptions::coordinator_result<>> execute_without_conditions(
query_processor& qp,
-    std::vector<mutation> mutations,
+    utils::chunked_vector<mutation> mutations,
db::consistency_level cl,
db::timeout_clock::time_point timeout,
tracing::trace_state_ptr tr_state,


@@ -44,7 +44,7 @@ std::optional<mutation> cas_request::apply_updates(api::timestamp_type ts) const
for (const cas_row_update& op: _updates) {
update_parameters params(_schema, op.options, ts, op.statement.get_time_to_live(op.options), _rows);
-std::vector<mutation> statement_mutations = op.statement.apply_updates(_key, op.ranges, params, op.json_cache);
+auto statement_mutations = op.statement.apply_updates(_key, op.ranges, params, op.json_cache);
// Append all mutations (in fact only one) to the consolidated one.
for (mutation& m : statement_mutations) {
if (mutation_set.has_value() == false) {

Some files were not shown because too many files have changed in this diff.