Update urgent_issue_reminder.yml - run daily

The action will run daily, alerting about urgent issues not touched in the last 7 days.
Merge 'test.py: dtest: port next_gating tests from commitlog_test.py' from Evgeniy Naydanov
2025-08-20 14:49:33 +03:00 · 2025-08-19 17:25:07 +03:00 · 2025-08-19 17:21:18 +03:00 · 2025-08-19 13:17:29 +03:00 · 2025-08-19 13:13:22 +03:00 · 2025-08-19 13:09:18 +03:00
1155 changed files with 57963 additions and 16199 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -112,10 +112,15 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
                    is_draft = True
                    repo_local.git.add(A=True)
                    repo_local.git.cherry_pick('--continue')
-            repo_local.git.push(fork_repo, new_branch_name, force=True)
-            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
-                                is_draft, is_collaborator)
-
+            # Check if the branch already exists in the remote fork
+            remote_refs = repo_local.git.ls_remote('--heads', fork_repo, new_branch_name)
+            if not remote_refs:
+                # Branch does not exist, create it with a regular push
+                repo_local.git.push(fork_repo, new_branch_name)
+                create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
+                                    is_draft, is_collaborator)
+            else:
+                logging.info(f"Remote branch {new_branch_name} already exists in fork. Skipping push.")
        except GitCommandError as e:
            logging.warning(f"GitCommandError: {e}")

--- a/.github/workflows/call_jira_status_in_progress.yml
+++ b/.github/workflows/call_jira_status_in_progress.yml
@@ -0,0 +1,11 @@
+name: Call Jira Status In Progress
+
+on:
+  pull_request:
+    types: [opened]  # run only when a pull request is first opened
+
+jobs:
+  call-jira-status-in-progress:
+    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main  # reusable workflow, referenced at its main branch
+    secrets:  # secrets handed to the called workflow
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_jira_status_in_review.yml
+++ b/.github/workflows/call_jira_status_in_review.yml
@@ -0,0 +1,11 @@
+name: Call Jira Status In Review
+
+on:
+  pull_request:
+    types: [ready_for_review, review_requested]  # run when a PR is marked ready for review or a review is requested
+
+jobs:
+  call-jira-status-in-review:
+    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main  # reusable workflow, referenced at its main branch
+    secrets:  # secrets handed to the called workflow
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_jira_status_ready_for_merge.yml
+++ b/.github/workflows/call_jira_status_ready_for_merge.yml
@@ -0,0 +1,13 @@
+name: Call Jira Status Ready For Merge
+
+on:
+  pull_request:
+    types: [labeled]  # run whenever a label is added to a pull request
+
+jobs:
+  call-jira-status-update:
+    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main  # reusable workflow, referenced at its main branch
+    with:  # inputs handed to the called workflow
+      label_name: 'status/merge_candidate'  # presumably the label the called workflow reacts to - confirm in github-automation
+    secrets:  # secrets handed to the called workflow
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/conflict_reminder.yaml
+++ b/.github/workflows/conflict_reminder.yaml
@@ -1,9 +1,16 @@
 name: Notify PR Authors of Conflicts

+permissions:
+  issues: write
+  pull-requests: write
+
 on:
+  push:
+    branches:
+      - 'master'
+      - 'branch-*'
  schedule:
-    - cron: '0 10 * * 1,4'  # Runs every Monday and Thursday at 10:00am
-  workflow_dispatch:      # Manual trigger for testing
+    - cron: '0 10 * * 1'  # Runs every Monday at 10:00am

 jobs:
  notify_conflict_prs:
@@ -14,32 +21,134 @@ jobs:
        uses: actions/github-script@v7
        with:
          script: |
+            console.log("Starting conflict reminder script...");
+            // Print trigger event
+            if (process.env.GITHUB_EVENT_NAME) {
+              console.log(`Workflow triggered by: ${process.env.GITHUB_EVENT_NAME}`);
+            } else {
+              console.log("Could not determine workflow trigger event.");
+            }
+            const isPushEvent = process.env.GITHUB_EVENT_NAME === 'push';
+            console.log(`isPushEvent: ${isPushEvent}`);
+            const twoMonthsAgo = new Date();
+            twoMonthsAgo.setMonth(twoMonthsAgo.getMonth() - 2);
            const prs = await github.paginate(github.rest.pulls.list, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              per_page: 100
            });
+            console.log(`Fetched ${prs.length} open PRs`);
+            const recentPrs = prs.filter(pr => new Date(pr.created_at) >= twoMonthsAgo);
+            const validBaseBranches = ['master'];
            const branchPrefix = 'branch-';
-            const threeDaysAgo = new Date();
-            const conflictLabel = 'conflicts';          
-            threeDaysAgo.setDate(threeDaysAgo.getDate() - 3);
-            for (const pr of prs) {
-              if (!pr.base.ref.startsWith(branchPrefix)) continue;
-              const hasConflictLabel = pr.labels.some(label => label.name === conflictLabel);
-              if (!hasConflictLabel) continue;
+            const oneWeekAgo = new Date();
+            const conflictLabel = 'conflicts';
+            oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
+            console.log(`One week ago: ${oneWeekAgo.toISOString()}`);
+
+            for (const pr of recentPrs) {
+              console.log(`Checking PR #${pr.number} on base branch '${pr.base.ref}'`);
+              const isBranchX = pr.base.ref.startsWith(branchPrefix);
+              const isMaster = validBaseBranches.includes(pr.base.ref);
+              if (!(isBranchX || isMaster)) {
+                console.log(`PR #${pr.number} skipped: base branch is not 'master' or does not start with '${branchPrefix}'`);
+                continue;
+              }
              const updatedDate = new Date(pr.updated_at);
-              if (updatedDate >= threeDaysAgo) continue;
-              if (pr.assignee === null) continue;
-              const assignee = pr.assignee.login;
-              if (assignee) {
-                await github.rest.issues.createComment({
+              console.log(`PR #${pr.number} last updated at: ${updatedDate.toISOString()}`);
+              if (!isPushEvent && updatedDate >= oneWeekAgo) {
+                console.log(`PR #${pr.number} skipped: updated within last week`);
+                continue;
+              }
+              if (pr.assignee === null) {
+                console.log(`PR #${pr.number} skipped: no assignee`);
+                continue;
+              }
+
+              // Fetch PR details to check mergeability
+              let { data: prDetails } = await github.rest.pulls.get({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: pr.number,
+              });
+              console.log(`PR #${pr.number} mergeable: ${prDetails.mergeable}`);
+
+              // Wait and re-fetch if mergeable is null
+              if (prDetails.mergeable === null) {
+                console.log(`PR #${pr.number} mergeable is null, waiting 2 seconds and retrying...`);
+                await new Promise(resolve => setTimeout(resolve, 2000)); // wait 2 seconds
+                prDetails = (await github.rest.pulls.get({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  pull_number: pr.number,
+                })).data;
+                console.log(`PR #${pr.number} mergeable after retry: ${prDetails.mergeable}`);
+              }
+
+              if (prDetails.mergeable === false) {
+                const hasConflictLabel = pr.labels.some(label => label.name === conflictLabel);
+                console.log(`PR #${pr.number} has conflict label: ${hasConflictLabel}`);
+
+                // Fetch comments to check for existing notifications
+                const comments = await github.paginate(github.rest.issues.listComments, {
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: pr.number,
-                  body: `@${assignee}, this PR has been open with conflicts. Please resolve the conflicts so we can merge it.`,
+                  per_page: 100,
                });
-                console.log(`Notified @${assignee} for PR #${pr.number}`);
-              } 
+                
+                // Find last notification comment from the bot
+                const notificationPrefix = `@${pr.assignee.login}, this PR has merge conflicts with the base branch.`;
+                const lastNotification = comments
+                  .filter(c =>
+                    c.user.type === "Bot" &&
+                    c.body.startsWith(notificationPrefix)
+                  )
+                  .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
+
+                // Check if we should skip notification based on recent notification
+                let shouldSkipNotification = false;
+                if (lastNotification) {
+                  const lastNotified = new Date(lastNotification.created_at);
+                  if (lastNotified >= oneWeekAgo) {
+                    console.log(`PR #${pr.number} skipped: last notification was less than 1 week ago`);
+                    shouldSkipNotification = true;
+                  }
+                }
+
+                // Additional check for push events on draft PRs with conflict labels
+                if (
+                  isPushEvent &&
+                  pr.draft === true &&
+                  hasConflictLabel &&
+                  shouldSkipNotification
+                ) {
+                  continue;
+                }
+
+                if (!hasConflictLabel) {
+                  await github.rest.issues.addLabels({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: pr.number,
+                    labels: [conflictLabel],
+                  });
+                  console.log(`Added 'conflicts' label to PR #${pr.number}`);
+                }
+                
+                const assignee = pr.assignee.login;
+                if (assignee && !shouldSkipNotification) {
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: pr.number,
+                    body: `@${assignee}, this PR has merge conflicts with the base branch. Please resolve the conflicts so we can merge it.`,
+                  });
+                  console.log(`Notified @${assignee} for PR #${pr.number}`);
+                }
+              } else {
+                console.log(`PR #${pr.number} is mergeable, no action needed.`);
+              }
            }
            console.log(`Total PRs checked: ${prs.length}`);
--- a/.github/workflows/urgent_issue_reminder.yml
+++ b/.github/workflows/urgent_issue_reminder.yml
@@ -2,7 +2,7 @@ name: Urgent Issue Reminder

 on:
  schedule:
-    - cron: '10 8 * * 1' # Runs every Monday at 8 AM
+    - cron: '10 8 * * *' # Runs daily at 8 AM

 jobs:
  reminder:
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,5 @@ compile_commands.json
 .envrc
 clang_build
 .idea/
+nuke
+rust/target
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,7 +171,6 @@ target_sources(scylla-main
    client_data.cc
    clocks-impl.cc
    collection_mutation.cc
-    compress.cc
    converting_mutation_partition_applier.cc
    counters.cc
    sstable_dict_autotrainer.cc
@@ -181,7 +180,7 @@ target_sources(scylla-main
    generic_server.cc
    debug.cc
    init.cc
-    keys.cc
+    keys/keys.cc
    multishard_mutation_query.cc
    mutation_query.cc
    node_ops/task_manager_module.cc
@@ -363,3 +362,6 @@ endif()
 if(Scylla_BUILD_INSTRUMENTED)
  add_subdirectory(pgo)
 endif()
+
+add_executable(patchelf
+  tools/patchelf.cc)
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,9 +1,6 @@
 This project includes code developed by the Apache Software Foundation (http://www.apache.org/),
 especially Apache Cassandra.

-It includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
-These files are located in utils/arch/powerpc/crc32-vpmsum. Their license may be found in licenses/LICENSE-crc32-vpmsum.TXT.
-
 It includes modified code from https://gitbox.apache.org/repos/asf?p=cassandra-dtest.git (owned by The Apache Software Foundation)

 It includes modified tests from https://github.com/etcd-io/etcd.git (owned by The etcd Authors)
--- a/SCYLLA-VERSION-GEN
+++ b/SCYLLA-VERSION-GEN
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.2.0-dev
+VERSION=2025.4.0-dev

 if test -f version
 then
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -167,4 +167,8 @@ future<> controller::request_stop_server() {
    });
 }

+future<utils::chunked_vector<client_data>> controller::get_client_data() {
+    return _server.local().get_client_data();  // forward to the server instance returned by _server.local()
+}
+
 }
--- a/alternator/controller.hh
+++ b/alternator/controller.hh
@@ -90,6 +90,10 @@ public:
    virtual future<> start_server() override;
    virtual future<> stop_server() override;
    virtual future<> request_stop_server() override;
+    // This virtual function is called (on each shard separately) when the
+    // virtual table "system.clients" is read. It is expected to generate a
+    // list of clients connected to this server (on this shard).
+    virtual future<utils::chunked_vector<client_data>> get_client_data() override;
 };

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -10,8 +10,8 @@

 #include <seastar/core/future.hh>
 #include "seastarx.hh"
-#include <seastar/json/json_elements.hh>
 #include <seastar/core/sharded.hh>
+#include <seastar/util/noncopyable_function.hh>

 #include "service/migration_manager.hh"
 #include "service/client_state.hh"
@@ -58,29 +58,6 @@ namespace alternator {

 class rmw_operation;

-struct make_jsonable : public json::jsonable {
-    rjson::value _value;
-public:
-    explicit make_jsonable(rjson::value&& value);
-    std::string to_json() const override;
-};
-
-/**
- * Make return type for serializing the object "streamed",
- * i.e. direct to HTTP output stream. Note: only useful for
- * (very) large objects as there are overhead issues with this
- * as well, but for massive lists of return objects this can
- * help avoid large allocations/many re-allocs
- */
-json::json_return_type make_streamed(rjson::value&&);
-
-struct json_string : public json::jsonable {
-    std::string _value;
-public:
-    explicit json_string(std::string&& value);
-    std::string to_json() const override;
-};
-
 namespace parsed {
 class path;
 };
@@ -169,8 +146,23 @@ class executor : public peering_sharded_service<executor> {

 public:
    using client_state = service::client_state;
-    using request_return_type = std::variant<json::json_return_type, api_error>;
+    // request_return_type is the return type of the executor methods, which
+    // can be one of:
+    // 1. A string, which is the response body for the request.
+    // 2. A body_writer, an asynchronous function (returning future<>) that
+    //    takes an output_stream and writes the response body into it.
+    // 3. An api_error, which is an error response that should be returned to
+    //    the client.
+    // The body_writer is used for streaming responses, where the response body
+    // is written in chunks to the output_stream. This allows for efficient
+    // handling of large responses without needing to allocate a large buffer
+    // in memory.
+    using body_writer = noncopyable_function<future<>(output_stream<char>&&)>;
+    using request_return_type = std::variant<std::string, body_writer, api_error>;
    stats _stats;
+    // The metric_groups object keeps this stats object's metrics registered
+    // for as long as the stats object is alive.
+    seastar::metrics::metric_groups _metrics;
    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";
@@ -220,6 +212,7 @@ public:
 private:
    static thread_local utils::updateable_value<uint32_t> s_default_timeout_in_ms;
 public:
+    static schema_ptr find_table(service::storage_proxy&, std::string_view table_name);
    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);

 private:
@@ -251,7 +244,7 @@ public:
        uint64_t* item_length_in_bytes = nullptr,
        bool = false);

-    static void add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
+    static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
    static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
 };
@@ -272,4 +265,13 @@ bool is_big(const rjson::value& val, int big_size = 100'000);
 // appropriate user-readable api_error::access_denied is thrown.
 future<> verify_permission(bool enforce_authorization, const service::client_state&, const schema_ptr&, auth::permission);

+/**
+ * Make return type for serializing the object "streamed",
+ * i.e. direct to HTTP output stream. Note: only useful for
+ * (very) large objects as there are overhead issues with this
+ * as well, but for massive lists of return objects this can
+ * help avoid large allocations/many re-allocs
+ */
+executor::body_writer make_streamed(rjson::value&&);
+
 }
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -10,11 +10,12 @@

 #include "seastarx.hh"
 #include "service/paxos/cas_request.hh"
+#include "service/cas_shard.hh"
 #include "utils/rjson.hh"
 #include "consumed_capacity.hh"
 #include "executor.hh"
 #include "tracing/trace_state.hh"
-#include "keys.hh"
+#include "keys/keys.hh"

 namespace alternator {

@@ -114,13 +115,15 @@ public:
    const rjson::value& request() const { return _request; }
    rjson::value&& move_request() && { return std::move(_request); }
    future<executor::request_return_type> execute(service::storage_proxy& proxy,
+            std::optional<service::cas_shard> cas_shard,
            service::client_state& client_state,
            tracing::trace_state_ptr trace_state,
            service_permit permit,
            bool needs_read_before_write,
-            stats& stats,
+            stats& global_stats,
+            stats& per_table_stats,
            uint64_t& wcu_total);
-    std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
+    std::optional<service::cas_shard> shard_for_execute(bool needs_read_before_write);
 };

 } // namespace alternator
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -13,7 +13,7 @@
 #include <optional>
 #include "types/types.hh"
 #include "schema/schema_fwd.hh"
-#include "keys.hh"
+#include "keys/keys.hh"
 #include "utils/rjson.hh"
 #include "utils/big_decimal.hh"

--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -13,7 +13,6 @@
 #include <seastar/http/function_handlers.hh>
 #include <seastar/http/short_streams.hh>
 #include <seastar/core/coroutine.hh>
-#include <seastar/json/json_elements.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/util/short_streams.hh>
 #include "seastarx.hh"
@@ -31,6 +30,7 @@
 #include "gms/gossiper.hh"
 #include "utils/overloaded_functor.hh"
 #include "utils/aws_sigv4.hh"
+#include "client_data.hh"

 static logging::logger slogger("alternator-server");

@@ -124,22 +124,22 @@ public:
             }
             auto res = resf.get();
             std::visit(overloaded_functor {
-                 [&] (const json::json_return_type& json_return_value) {
-                     slogger.trace("api_handler success case");
-                     if (json_return_value._body_writer) {
-                         // Unfortunately, write_body() forces us to choose
-                         // from a fixed and irrelevant list of "mime-types"
-                         // at this point. But we'll override it with the
-                         // one (application/x-amz-json-1.0) below.
-                         rep->write_body("json", std::move(json_return_value._body_writer));
-                     } else {
-                         rep->_content += json_return_value._res;
-                     }
-                 },
-                 [&] (const api_error& err) {
-                     generate_error_reply(*rep, err);
-                 }
-             }, res);
+                [&] (std::string&& str) {
+                    // Note that despite the move, there is a copy here -
+                    // as str is std::string and rep->_content is sstring.
+                    rep->_content = std::move(str);
+                },
+                [&] (executor::body_writer&& body_writer) {
+                    // Unfortunately, write_body() forces us to choose
+                    // from a fixed and irrelevant list of "mime-types"
+                    // at this point. But we'll override it with the
+                    // correct one (application/x-amz-json-1.0) below.
+                    rep->write_body("json", std::move(body_writer));
+                },
+                [&] (const api_error& err) {
+                    generate_error_reply(*rep, err);
+                }
+             }, std::move(res));

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
@@ -431,6 +431,13 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    SCYLLA_ASSERT(req->content_stream);
    chunked_content content = co_await util::read_entire_stream(*req->content_stream);
    auto username = co_await verify_signature(*req, content);
+    // As long as the system_clients_entry object is alive, this request will
+    // be visible in the "system.clients" virtual table. When requested, this
+    // entry will be formatted by server::ongoing_request::make_client_data().
+    auto system_clients_entry = _ongoing_requests.emplace(
+        req->get_client_address(), req->get_header("User-Agent"),
+        username, current_scheduling_group(),
+        req->get_protocol_name() == "https");

    if (slogger.is_enabled(log_level::trace)) {
        std::string buf;
@@ -462,6 +469,9 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
            client_state = std::move(client_state), trace_state = std::move(trace_state),
            units = std::move(units), req = std::move(req)] () mutable -> future<executor::request_return_type> {
                rjson::value json_request = co_await _json_parser.parse(std::move(content));
+                if (!json_request.IsObject()) {
+                    co_return api_error::validation("Request content must be an object");
+                }
                co_return co_await callback(_executor, client_state, trace_state,
                    make_service_permit(std::move(units)), std::move(json_request), std::move(req));
    };
@@ -678,6 +688,37 @@ future<> server::json_parser::stop() {
    return std::move(_run_parse_json_thread);
 }

+// Convert an entry in the server's list of ongoing Alternator requests
+// (_ongoing_requests) into a client_data object. This client_data object
+// will then be used to produce a row for the "system.clients" virtual table.
+client_data server::ongoing_request::make_client_data() const {
+    client_data cd;
+    cd.ct = client_type::alternator;
+    cd.ip = _client_address.addr();
+    cd.port = _client_address.port();
+    cd.shard_id = this_shard_id();  // the shard on which this function is being called
+    cd.connection_stage = client_connection_stage::established;  // every tracked request is reported as established
+    cd.username = _username;
+    cd.scheduling_group_name = _scheduling_group.name();
+    cd.ssl_enabled = _is_https;
+    // For now, we save the full User-Agent header as the "driver name"
+    // and keep "driver_version" unset.
+    cd.driver_name = _user_agent;
+    // Leave "protocol_version" unset, it has no meaning in Alternator.
+    // Leave "hostname", "ssl_protocol" and "ssl_cipher_suite" unset.
+    // As reported in issue #9216, we never set these fields in CQL
+    // either (see cql_server::connection::make_client_data()).
+    return cd;
+}
+
+future<utils::chunked_vector<client_data>> server::get_client_data() {
+    utils::chunked_vector<client_data> ret;  // receives one client_data per entry visited in _ongoing_requests
+    co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {  // NOTE(review): "gently" presumably means it may yield between entries - confirm in utils/scoped_item_list.hh
+        ret.emplace_back(r.make_client_data());
+    });
+    co_return ret;
+}
+
 const char* api_error::what() const noexcept {
    if (_what_string.empty()) {
        _what_string = fmt::format("{} {}: {}", std::to_underlying(_http_code), _type, _msg);
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -9,6 +9,7 @@
 #pragma once

 #include "alternator/executor.hh"
+#include "utils/scoped_item_list.hh"
 #include <seastar/core/future.hh>
 #include <seastar/core/condition-variable.hh>
 #include <seastar/http/httpd.hh>
@@ -20,6 +21,8 @@
 #include "utils/updateable_value.hh"
 #include <seastar/core/units.hh>

+struct client_data;
+
 namespace alternator {

 using chunked_content = rjson::chunked_content;
@@ -74,12 +77,30 @@ class server : public peering_sharded_service<server> {
    };
    json_parser _json_parser;

+    // The server maintains a list of ongoing requests, that are being handled
+    // by handle_api_request(). It uses this list in get_client_data(), which
+    // is called when reading the "system.clients" virtual table.
+    struct ongoing_request {
+        socket_address _client_address;  // source of client_data::ip and client_data::port
+        sstring _user_agent;  // reported as client_data::driver_name
+        sstring _username;  // reported as client_data::username
+        scheduling_group _scheduling_group;  // its name() is reported as client_data::scheduling_group_name
+        bool _is_https;  // reported as client_data::ssl_enabled
+        client_data make_client_data() const;  // builds the client_data value from the fields above
+    };
+    utils::scoped_item_list<ongoing_request> _ongoing_requests;
+
 public:
    server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);

    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
            utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
    future<> stop();
+    // get_client_data() is called (on each shard separately) when the virtual
+    // table "system.clients" is read. It is expected to generate a list of
+    // clients connected to this server (on this shard). This function is
+    // called by alternator::controller::get_client_data().
+    future<utils::chunked_vector<client_data>> get_client_data();
 private:
    void set_routes(seastar::httpd::routes& r);
    // If verification succeeds, returns the authenticated user's username
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -28,27 +28,44 @@ static seastar::metrics::histogram estimated_histogram_to_metrics(const utils::e
    }
    return res;
 }
-stats::stats() : api_operations{} {
+
+static seastar::metrics::label column_family_label("cf");
+static seastar::metrics::label keyspace_label("ks");
+
+
+static void register_metrics_with_optional_table(seastar::metrics::metric_groups& metrics, const stats& stats, const sstring& ks, const sstring& table) {
+
    // Register the
    seastar::metrics::label op("op");
-
-    _metrics.add_group("alternator", {
+    bool has_table = table.length();
+    std::vector<seastar::metrics::label> aggregate_labels;
+    std::vector<seastar::metrics::label_instance> labels = {alternator_label};
+    sstring group_name = (has_table)? "alternator_table" : "alternator";
+    if (has_table) {
+        labels.push_back(column_family_label(table));
+        labels.push_back(keyspace_label(ks));
+        aggregate_labels.push_back(seastar::metrics::shard_label);
+    }
+    metrics.add_group(group_name, {
 #define OPERATION(name, CamelCaseName) \
-                seastar::metrics::make_total_operations("operation", api_operations.name, \
-                        seastar::metrics::description("number of operations via Alternator API"), {op(CamelCaseName), alternator_label, basic_level}).set_skip_when_empty(),
+                seastar::metrics::make_total_operations("operation", stats.api_operations.name, \
+                        seastar::metrics::description("number of operations via Alternator API"), labels)(basic_level)(op(CamelCaseName)).aggregate(aggregate_labels).set_skip_when_empty(),
 #define OPERATION_LATENCY(name, CamelCaseName) \
+		metrics.add_group(group_name, { \
                seastar::metrics::make_histogram("op_latency", \
-                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName), alternator_label, basic_level}, [this]{return to_metrics_histogram(api_operations.name.histogram());}).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(), \
+                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), labels, [&stats]{return to_metrics_histogram(stats.api_operations.name.histogram());})(op(CamelCaseName))(basic_level).aggregate({seastar::metrics::shard_label}).set_skip_when_empty()}); \
+            if (!has_table) {\
+            	metrics.add_group("alternator", { \
 				seastar::metrics::make_summary("op_latency_summary", \
-						                        seastar::metrics::description("Latency summary of an operation via Alternator API"), [this]{return to_metrics_summary(api_operations.name.summary());})(op(CamelCaseName))(basic_level)(alternator_label).set_skip_when_empty(),
+						                        seastar::metrics::description("Latency summary of an operation via Alternator API"), [&stats]{return to_metrics_summary(stats.api_operations.name.summary());})(op(CamelCaseName))(basic_level)(alternator_label).set_skip_when_empty()}); \
+            }
+
            OPERATION(batch_get_item, "BatchGetItem")
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
-            OPERATION(create_table, "CreateTable")
            OPERATION(delete_backup, "DeleteBackup")
            OPERATION(delete_item, "DeleteItem")
-            OPERATION(delete_table, "DeleteTable")
            OPERATION(describe_backup, "DescribeBackup")
            OPERATION(describe_continuous_backups, "DescribeContinuousBackups")
            OPERATION(describe_endpoints, "DescribeEndpoints")
@@ -77,59 +94,74 @@ stats::stats() : api_operations{} {
            OPERATION(update_item, "UpdateItem")
            OPERATION(update_table, "UpdateTable")
            OPERATION(update_time_to_live, "UpdateTimeToLive")
-            OPERATION_LATENCY(put_item_latency, "PutItem")
-            OPERATION_LATENCY(get_item_latency, "GetItem")
-            OPERATION_LATENCY(delete_item_latency, "DeleteItem")
-            OPERATION_LATENCY(update_item_latency, "UpdateItem")
-            OPERATION_LATENCY(batch_write_item_latency, "BatchWriteItem")
-            OPERATION_LATENCY(batch_get_item_latency, "BatchGetItem")
            OPERATION(list_streams, "ListStreams")
            OPERATION(describe_stream, "DescribeStream")
            OPERATION(get_shard_iterator, "GetShardIterator")
            OPERATION(get_records, "GetRecords")
-            OPERATION_LATENCY(get_records_latency, "GetRecords")
    });
-    _metrics.add_group("alternator", {
-            seastar::metrics::make_total_operations("unsupported_operations", unsupported_operations,
-                    seastar::metrics::description("number of unsupported operations via Alternator API"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("total_operations", total_operations,
-                    seastar::metrics::description("number of total operations via Alternator API"))(basic_level)(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
-                    seastar::metrics::description("number of performed read-before-write operations"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
-                    seastar::metrics::description("number of writes that used LWT"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
-                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
-                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure."))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("requests_shed", requests_shed,
-                    seastar::metrics::description("Counts a number of requests shed due to overload."))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
-                    seastar::metrics::description("number of rows read during filtering operations"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
-                    seastar::metrics::description("number of rows read and matched during filtering operations")),
-            seastar::metrics::make_counter("rcu_total", [this]{return 0.5 * rcu_half_units_total;},
-                    seastar::metrics::description("total number of consumed read units"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::PUT_ITEM],
-                    seastar::metrics::description("total number of consumed write units"),{op("PutItem")})(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::DELETE_ITEM],
-                    seastar::metrics::description("total number of consumed write units"),{op("DeleteItem")})(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::UPDATE_ITEM],
-                    seastar::metrics::description("total number of consumed write units"),{op("UpdateItem")})(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::INDEX],
-                    seastar::metrics::description("total number of consumed write units"),{op("Index")})(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_total_operations("filtered_rows_dropped_total", [this] { return cql_stats.filtered_rows_read_total - cql_stats.filtered_rows_matched_total; },
-                    seastar::metrics::description("number of rows read and dropped during filtering operations"))(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"),{op("BatchWriteItem")},
-                    api_operations.batch_write_item_batch_total)(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"),{op("BatchGetItem")},
-                    api_operations.batch_get_item_batch_total)(alternator_label).set_skip_when_empty(),
-            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"),{op("BatchGetItem")},
-                    [this]{ return estimated_histogram_to_metrics(api_operations.batch_get_item_histogram);})(alternator_label).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
-            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"),{op("BatchWriteItem")},
-                    [this]{ return estimated_histogram_to_metrics(api_operations.batch_write_item_histogram);})(alternator_label).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+    OPERATION_LATENCY(put_item_latency, "PutItem")
+    OPERATION_LATENCY(get_item_latency, "GetItem")
+    OPERATION_LATENCY(delete_item_latency, "DeleteItem")
+    OPERATION_LATENCY(update_item_latency, "UpdateItem")
+    OPERATION_LATENCY(batch_write_item_latency, "BatchWriteItem")
+    OPERATION_LATENCY(batch_get_item_latency, "BatchGetItem")
+    OPERATION_LATENCY(get_records_latency, "GetRecords")
+    if (!has_table) {
+        // CreateTable and DeleteTable are not applicable to per-table metrics;
+        // register them only for the global metrics.
+        metrics.add_group("alternator", {
+            OPERATION(create_table, "CreateTable")
+            OPERATION(delete_table, "DeleteTable")
+
+        });
+    }
+    metrics.add_group(group_name, {
+            seastar::metrics::make_total_operations("unsupported_operations", stats.unsupported_operations,
+                    seastar::metrics::description("number of unsupported operations via Alternator API"), labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("total_operations", stats.total_operations,
+                    seastar::metrics::description("number of total operations via Alternator API"), labels)(basic_level).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("reads_before_write", stats.reads_before_write,
+                    seastar::metrics::description("number of performed read-before-write operations"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("write_using_lwt", stats.write_using_lwt,
+                    seastar::metrics::description("number of writes that used LWT"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("shard_bounce_for_lwt", stats.shard_bounce_for_lwt,
+                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("requests_blocked_memory", stats.requests_blocked_memory,
+                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure."), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("requests_shed", stats.requests_shed,
+                    seastar::metrics::description("Counts a number of requests shed due to overload."), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("filtered_rows_read_total", stats.cql_stats.filtered_rows_read_total,
+                    seastar::metrics::description("number of rows read during filtering operations"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("filtered_rows_matched_total", stats.cql_stats.filtered_rows_matched_total,
+                    seastar::metrics::description("number of rows read and matched during filtering operations"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("rcu_total", [&stats]{return 0.5 * stats.rcu_half_units_total;},
+                    seastar::metrics::description("total number of consumed read units"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("wcu_total", stats.wcu_total[stats::wcu_types::PUT_ITEM],
+                    seastar::metrics::description("total number of consumed write units"), labels)(op("PutItem")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("wcu_total", stats.wcu_total[stats::wcu_types::DELETE_ITEM],
+                    seastar::metrics::description("total number of consumed write units"), labels)(op("DeleteItem")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("wcu_total", stats.wcu_total[stats::wcu_types::UPDATE_ITEM],
+                    seastar::metrics::description("total number of consumed write units"), labels)(op("UpdateItem")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("wcu_total", stats.wcu_total[stats::wcu_types::INDEX],
+                    seastar::metrics::description("total number of consumed write units"), labels)(op("Index")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_total_operations("filtered_rows_dropped_total", [&stats] { return stats.cql_stats.filtered_rows_read_total - stats.cql_stats.filtered_rows_matched_total; },
+                    seastar::metrics::description("number of rows read and dropped during filtering operations"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"), labels,
+                    stats.api_operations.batch_write_item_batch_total)(op("BatchWriteItem")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"), labels,
+                    stats.api_operations.batch_get_item_batch_total)(op("BatchGetItem")).aggregate(aggregate_labels).set_skip_when_empty(),
+            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
    });
 }

-
+void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats) { // Registers the metrics of `stats` into `metrics` without naming a keyspace or table.
+    register_metrics_with_optional_table(metrics, stats, "", ""); // empty keyspace and table names; NOTE(review): presumably this selects the global (non-per-table) registration - confirm against register_metrics_with_optional_table
+}
+// Builds the statistics holder for one table: allocates a fresh stats object
+// and registers its metrics in _metrics under the given keyspace and table.
+table_stats::table_stats(const sstring& ks, const sstring& table)
+        : _stats(make_lw_shared<stats>()) {
+    register_metrics_with_optional_table(_metrics, *_stats, ks, table);
+}
 }
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -22,7 +22,6 @@ namespace alternator {
 // visible by the metrics REST API, with the "alternator" prefix.
 class stats {
 public:
-    stats();
    // Count of DynamoDB API operations by types
    struct {
        uint64_t batch_get_item = 0;
@@ -102,10 +101,13 @@ public:
    uint64_t wcu_total[NUM_TYPES] = {0};
    // CQL-derived stats
    cql3::cql_stats cql_stats;
-private:
-    // The metric_groups object holds this stat object's metrics registered
-    // as long as the stats object is alive.
-    seastar::metrics::metric_groups _metrics;
 };

+// Statistics of a single table together with the registration of their metrics.
+struct table_stats {
+    // Allocates the stats object and registers its metrics for keyspace `ks`
+    // and table `table`.
+    table_stats(const sstring& ks, const sstring& table);
+    // Declared before _metrics on purpose: members are destroyed in reverse
+    // declaration order, so _metrics (whose callbacks reference *_stats) is
+    // torn down before this object drops its reference to the stats.
+    lw_shared_ptr<stats> _stats;
+    // Holds the metrics registered for *_stats by the constructor.
+    seastar::metrics::metric_groups _metrics;
+};
+void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);
+
 }
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -217,7 +217,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
        rjson::add(ret, "LastEvaluatedStreamArn", *last);
    }

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 struct shard_id {
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
    }

    // TODO: label
@@ -617,7 +617,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        rjson::add(stream_desc, "Shards", std::move(shards));
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
            
-        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
    });
 }

@@ -770,7 +770,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
    auto ret = rjson::empty_object();
    rjson::add(ret, "ShardIterator", iter);

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 struct event_id {
@@ -1021,7 +1021,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
            // will notice the end of shard and not return NextShardIterator.
            rjson::add(ret, "NextShardIterator", next_iter);
            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
        }

        // ugh. figure out if we are at end-of-shard
@@ -1047,12 +1047,12 @@ future<executor::request_return_type> executor::get_records(client_state& client
            if (is_big(ret)) {
                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
            }
-            return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
        });
    });
 }

-void executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
+bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
    auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
    if (!stream_enabled || !stream_enabled->IsBool()) {
        throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
@@ -1086,10 +1086,12 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
                break;
        }
        builder.with_cdc_options(opts);
+        return true;
    } else {
        cdc::options opts;
        opts.enabled(false);
        builder.with_cdc_options(opts);
+        return false;
    }
 }

--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -81,11 +81,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
        co_return api_error::validation("UpdateTimeToLive requires boolean Enabled");
    }
    bool enabled = v->GetBool();
-    // Alternator TTL doesn't yet work when the table uses tablets (#16567)
-    if (enabled && _proxy.local_db().find_keyspace(schema->ks_name()).get_replication_strategy().uses_tablets()) {
-        co_return api_error::validation("TTL not yet supported on a table using tablets (issue #16567). "
-            "Create a table with the tag 'experimental:initial_tablets' set to 'none' to use vnodes.");
-    }
    v = rjson::find(*spec, "AttributeName");
    if (!v || !v->IsString()) {
        co_return api_error::validation("UpdateTimeToLive requires string AttributeName");
@@ -123,7 +118,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    // basically identical to the request's
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveSpecification", std::move(*spec));
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
@@ -140,7 +135,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
    }
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveDescription", std::move(desc));
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 // expiration_service is a sharded service responsible for cleaning up expired
@@ -291,7 +286,7 @@ static future<> expire_item(service::storage_proxy& proxy,
        auto ck = clustering_key::from_exploded(exploded_ck);
        m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
    }
-    std::vector<mutation> mutations;
+    utils::chunked_vector<mutation> mutations;
    mutations.push_back(std::move(m));
    return proxy.mutate(std::move(mutations),
        db::consistency_level::LOCAL_QUORUM,
@@ -315,8 +310,10 @@ static size_t random_offset(size_t min, size_t max) {
 // this range's primary node is down. For this we need to return not just
 // a list of this node's secondary ranges - but also the primary owner of
 // each of those ranges.
+//
+// The function is to be used with vnodes only
 static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_secondary_ranges(
-        const locator::effective_replication_map_ptr& erm,
+        const locator::effective_replication_map* erm,
        locator::host_id ep) {
    const auto& tm = *erm->get_token_metadata_ptr();
    const auto& sorted_tokens = tm.sorted_tokens();
@@ -327,6 +324,7 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
    auto prev_tok = sorted_tokens.back();
    for (const auto& tok : sorted_tokens) {
        co_await coroutine::maybe_yield();
+        // FIXME: pass is_vnode=true to get_natural_replicas since the token is in tm.sorted_tokens()
        host_id_vector_replica_set eps = erm->get_natural_replicas(tok);
        if (eps.size() <= 1 || eps[1] != ep) {
            prev_tok = tok;
@@ -396,7 +394,7 @@ class ranges_holder_primary {
    dht::token_range_vector _token_ranges;
 public:
    explicit ranges_holder_primary(dht::token_range_vector token_ranges) : _token_ranges(std::move(token_ranges)) {}
-    static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map_ptr& erm, locator::host_id ep) {
+    static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep) { // Builds a holder from the primary ranges erm reports for ep; erm is a non-owning pointer - NOTE(review): the caller presumably keeps the map alive until the returned future resolves - confirm
        co_return ranges_holder_primary(co_await erm->get_primary_ranges(ep));
    }
    std::size_t size() const { return _token_ranges.size(); }
@@ -416,7 +414,7 @@ public:
    explicit ranges_holder_secondary(std::vector<std::pair<dht::token_range, locator::host_id>> token_ranges, const gms::gossiper& g)
        : _token_ranges(std::move(token_ranges))
        , _gossiper(g) {}
-    static future<ranges_holder_secondary> make(const locator::effective_replication_map_ptr& erm, locator::host_id ep, const gms::gossiper& g) {
+    static future<ranges_holder_secondary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep, const gms::gossiper& g) { // Builds a holder from get_secondary_ranges(erm, ep) and keeps a reference to g; erm is a non-owning pointer - NOTE(review): the caller presumably keeps the map alive until the returned future resolves - confirm
        co_return ranges_holder_secondary(co_await get_secondary_ranges(erm, ep), g);
    }
    std::size_t size() const { return _token_ranges.size(); }
@@ -429,6 +427,8 @@ public:
    }
 };

+// The token_ranges_owned_by_this_shard class is only used for vnodes, where the vnodes give a partition range for the entire node
+// and such range still needs to be divided between the shards.
 template<class primary_or_secondary_t>
 class token_ranges_owned_by_this_shard {
    schema_ptr _s;
@@ -522,7 +522,7 @@ struct scan_ranges_context {
        // should be possible (and a must for issue #7751!).
        lw_shared_ptr<service::pager::paging_state> paging_state = nullptr;
        auto regular_columns =
-            s->regular_columns() | std::views::transform([] (const column_definition& cdef) { return cdef.id; })
+            s->regular_columns() | std::views::transform(&column_definition::id)
            | std::ranges::to<query::column_id_vector>();
        selection = cql3::selection::selection::wildcard(s);
        query::partition_slice::option_set opts = selection->get_query_options();
@@ -655,6 +655,17 @@ static future<> scan_table_ranges(
    }
 }

+// Scans a single tablet of a table for expired items.
+// The token range that tablet_map reports for `tablet` is converted into one
+// partition range, which is then handed to scan_table_ranges().
+// NOTE(review): both bounds of the token range are dereferenced unchecked, so
+// get_token_range() is assumed to always return a range bounded on both
+// sides - confirm.
+static future<> scan_tablet(locator::tablet_id tablet, service::storage_proxy& proxy, abort_source& abort_source, named_semaphore& page_sem,
+            expiration_service::stats& expiration_stats, const scan_ranges_context& scan_ctx, const locator::tablet_map& tablet_map) {
+    auto token_range = tablet_map.get_token_range(tablet);
+    dht::ring_position first_pos(token_range.start()->value(), dht::ring_position::token_bound::start);
+    dht::ring_position last_pos(token_range.end()->value(), dht::ring_position::token_bound::end);
+    // Because of issue #9167 each partition range needs its own query, so the
+    // vector passed to scan_table_ranges() holds exactly one range.
+    dht::partition_range_vector ranges;
+    ranges.push_back(dht::partition_range::make(std::move(first_pos), std::move(last_pos)));
+    return scan_table_ranges(proxy, scan_ctx, std::move(ranges), abort_source, page_sem, expiration_stats);
+}
+
 // scan_table() scans, in one table, data "owned" by this shard, looking for
 // expired items and deleting them.
 // We consider each node to "own" its primary token ranges, i.e., the tokens
@@ -730,34 +741,69 @@ static future<bool> scan_table(
    expiration_stats.scan_table++;
    // FIXME: need to pace the scan, not do it all at once.
    scan_ranges_context scan_ctx{s, proxy, std::move(column_name), std::move(member)};
-    auto erm = db.real_database().find_keyspace(s->ks_name()).get_vnode_effective_replication_map();
-    auto my_host_id = erm->get_topology().my_host_id();
-    token_ranges_owned_by_this_shard my_ranges(s, co_await ranges_holder_primary::make(erm, my_host_id));
-    while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {
-        // Note that because of issue #9167 we need to run a separate
-        // query on each partition range, and can't pass several of
-        // them into one partition_range_vector.
-        dht::partition_range_vector partition_ranges;
-        partition_ranges.push_back(std::move(*range));
-        // FIXME: if scanning a single range fails, including network errors,
-        // we fail the entire scan (and rescan from the beginning). Need to
-        // reconsider this. Saving the scan position might be a good enough
-        // solution for this problem.
-        co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
-    }
-    // If each node only scans its own primary ranges, then when any node is
-    // down part of the token range will not get scanned. This can be viewed
-    // as acceptable (when the comes back online, it will resume its scan),
-    // but as noted in issue #9787, we can allow more prompt expiration
-    // by tasking another node to take over scanning of the dead node's primary
-    // ranges. What we do here is that this node will also check expiration
-    // on its *secondary* ranges - but only those whose primary owner is down.
-    token_ranges_owned_by_this_shard my_secondary_ranges(s, co_await ranges_holder_secondary::make(erm, my_host_id, gossiper));
-    while (std::optional<dht::partition_range> range = my_secondary_ranges.next_partition_range()) {
-        expiration_stats.secondary_ranges_scanned++;
-        dht::partition_range_vector partition_ranges;
-        partition_ranges.push_back(std::move(*range));
-        co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
+
+    if (s->table().uses_tablets()) {
+        locator::effective_replication_map_ptr erm = s->table().get_effective_replication_map();
+        auto my_host_id = erm->get_topology().my_host_id();
+        const auto &tablet_map = erm->get_token_metadata().tablets().get_tablet_map(s->id());
+        for (std::optional tablet = tablet_map.first_tablet(); tablet; tablet = tablet_map.next_tablet(*tablet)) {
+            auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet);
+            // check if this is the primary replica for the current tablet
+            if (tablet_primary_replica.host == my_host_id && tablet_primary_replica.shard == this_shard_id()) {
+                co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
+            } else if(erm->get_replication_factor() > 1) {
+                // Check if this is the secondary replica for the current tablet
+                // and if the primary replica is down which means we will take over this work.
+                // If each node only scans its own primary ranges, then when any node is
+                // down part of the token range will not get scanned. This can be viewed
+                // as acceptable (when the node comes back online, it will resume its scan),
+                // but as noted in issue #9787, we can allow more prompt expiration
+                // by tasking another node to take over scanning of the dead node's primary
+                // ranges. What we do here is that this node will also check expiration
+                // on its *secondary* ranges - but only those whose primary owner is down.
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
+                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
+                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
+                    }
+                }
+            }
+        }
+    } else {  // VNodes
+        locator::static_effective_replication_map_ptr ermp =
+                db.real_database().find_keyspace(s->ks_name()).get_static_effective_replication_map();
+        auto* erm = ermp->maybe_as_vnode_effective_replication_map();
+        if (!erm) {
+            on_internal_error(tlogger, format("Keyspace {} is local", s->ks_name()));
+        }
+        auto my_host_id = erm->get_topology().my_host_id();
+        token_ranges_owned_by_this_shard my_ranges(s, co_await ranges_holder_primary::make(erm, my_host_id));
+        while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {
+            // Note that because of issue #9167 we need to run a separate
+            // query on each partition range, and can't pass several of
+            // them into one partition_range_vector.
+            dht::partition_range_vector partition_ranges;
+            partition_ranges.push_back(std::move(*range));
+            // FIXME: if scanning a single range fails, including network errors,
+            // we fail the entire scan (and rescan from the beginning). Need to
+            // reconsider this. Saving the scan position might be a good enough
+            // solution for this problem.
+            co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
+        }
+        // If each node only scans its own primary ranges, then when any node is
+        // down part of the token range will not get scanned. This can be viewed
+        // as acceptable (when the node comes back online, it will resume its scan),
+        // but as noted in issue #9787, we can allow more prompt expiration
+        // by tasking another node to take over scanning of the dead node's primary
+        // ranges. What we do here is that this node will also check expiration
+        // on its *secondary* ranges - but only those whose primary owner is down.
+        token_ranges_owned_by_this_shard my_secondary_ranges(s, co_await ranges_holder_secondary::make(erm, my_host_id, gossiper));
+        while (std::optional<dht::partition_range> range = my_secondary_ranges.next_partition_range()) {
+            expiration_stats.secondary_ranges_scanned++;
+            dht::partition_range_vector partition_ranges;
+            partition_ranges.push_back(std::move(*range));
+            co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
+        }
    }
    co_return true;
 }
--- a/api/api-doc/compaction_manager.json
+++ b/api/api-doc/compaction_manager.json
@@ -246,6 +246,24 @@
            }
         }
      },
+      "sstableinfo":{
+         "id":"sstableinfo",
+         "description":"Compacted sstable information",
+         "properties":{
+            "generation":{
+               "type": "string",
+               "description":"Generation of the sstable"
+            },
+            "origin":{
+               "type":"string",
+               "description":"Origin of the sstable"
+            },
+            "size":{
+               "type":"long",
+               "description":"Size of the sstable"
+            }
+         }
+      },
      "compaction_info" :{
          "id": "compaction_info",
          "description":"A key value mapping",
@@ -327,6 +345,10 @@
               "type":"string",
               "description":"The UUID"
            },
+            "shard_id":{
+               "type":"int",
+               "description":"The shard id the compaction was executed on"
+            },
            "cf":{
               "type":"string",
               "description":"The column family name"
@@ -335,9 +357,17 @@
               "type":"string",
               "description":"The keyspace name"
            },
+            "compaction_type":{
+               "type":"string",
+               "description":"Type of compaction"
+            },
+            "started_at":{
+               "type":"long",
+               "description":"The time compaction started"
+            },
            "compacted_at":{
               "type":"long",
-               "description":"The time of compaction"
+               "description":"The time compaction completed"
            },
            "bytes_in":{
               "type":"long",
@@ -353,6 +383,32 @@
                  "type":"row_merged"
               },
               "description":"The merged rows"
+            },
+            "sstables_in": {
+               "type":"array",
+               "items":{
+                  "type":"sstableinfo"
+               },
+               "description":"List of input sstables for compaction"
+            },
+            "sstables_out": {
+               "type":"array",
+               "items":{
+                  "type":"sstableinfo"
+               },
+               "description":"List of output sstables from compaction"
+            },
+            "total_tombstone_purge_attempt":{
+               "type":"long",
+               "description":"Total number of tombstone purge attempts"
+            },
+            "total_tombstone_purge_failure_due_to_overlapping_with_memtable":{
+               "type":"long",
+               "description":"Number of tombstone purge failures due to data overlapping with memtables"
+            },
+            "total_tombstone_purge_failure_due_to_overlapping_with_uncompacting_sstable":{
+               "type":"long",
+               "description":"Number of tombstone purge failures due to data overlapping with non-compacting sstables"
            }
        }
      }
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2144,6 +2144,31 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"skip_cleanup",
+                     "description":"Don't cleanup keys from loaded sstables. Invalid if load_and_stream is true",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"skip_reshape",
+                     "description":"Don't reshape the loaded sstables. Invalid if load_and_stream is true",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"scope",
+                     "description":"Defines the set of nodes to which mutations can be streamed",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query",
+                     "enum": ["all", "dc", "rack", "node"]
                  }
               ]
            }
@@ -3136,6 +3161,54 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/storage_service/raft_topology/cmd_rpc_status",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get information about currently running topology cmd rpc",
+               "type":"string",
+               "nickname":"raft_topology_get_cmd_status",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/drop_quarantined_sstables",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Drops all quarantined sstables in all keyspaces or specified keyspace and tables",
+               "type":"void",
+               "nickname":"drop_quarantined_sstables",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"keyspace",
+                     "description":"The keyspace name to drop quarantined sstables from.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"tables",
+                     "description":"Comma-separated table names to drop quarantined sstables from.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
      }
   ],
   "models":{
--- a/api/api.cc
+++ b/api/api.cc
@@ -391,32 +391,5 @@ future<> unset_server_raft(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
 }

-void req_params::process(const request& req) {
-    // Process mandatory parameters
-    for (auto& [name, ent] : params) {
-        if (!ent.is_mandatory) {
-            continue;
-        }
-        try {
-            ent.value = req.get_path_param(name);
-        } catch (std::out_of_range&) {
-            throw httpd::bad_param_exception(fmt::format("Mandatory parameter '{}' was not provided", name));
-        }
-    }
-
-    // Process optional parameters
-    for (auto& [name, value] : req.query_parameters) {
-        try {
-            auto& ent = params.at(name);
-            if (ent.is_mandatory) {
-                throw httpd::bad_param_exception(fmt::format("Parameter '{}' is expected to be provided as part of the request url", name));
-            }
-            ent.value = value;
-        } catch (std::out_of_range&) {
-            throw httpd::bad_param_exception(fmt::format("Unsupported optional parameter '{}'", name));
-        }
-    }
-}
-
 }

--- a/api/api.hh
+++ b/api/api.hh
@@ -23,17 +23,6 @@

 namespace api {

-template<class T>
-std::vector<sstring> container_to_vec(const T& container) {
-    std::vector<sstring> res;
-    res.reserve(std::size(container));
-
-    for (const auto& i : container) {
-        res.push_back(fmt::to_string(i));
-    }
-    return res;
-}
-
 template<class T>
 std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
    std::vector<T> res;
@@ -67,17 +56,6 @@ T map_sum(T&& dest, const S& src) {
    return std::move(dest);
 }

-template <typename MAP>
-std::vector<sstring> map_keys(const MAP& map) {
-    std::vector<sstring> res;
-    res.reserve(std::size(map));
-
-    for (const auto& i : map) {
-        res.push_back(fmt::to_string(i.first));
-    }
-    return res;
-}
-
 /**
 * General sstring splitting function
 */
@@ -252,67 +230,6 @@ public:
    operator T() const { return value; }
 };

-using mandatory = bool_class<struct mandatory_tag>;
-
-class req_params {
-public:
-    struct def {
-        std::optional<sstring> value;
-        mandatory is_mandatory = mandatory::no;
-
-        def(std::optional<sstring> value_ = std::nullopt, mandatory is_mandatory_ = mandatory::no)
-            : value(std::move(value_))
-            , is_mandatory(is_mandatory_)
-        { }
-
-        def(mandatory is_mandatory_)
-            : is_mandatory(is_mandatory_)
-        { }
-    };
-
-private:
-    std::unordered_map<sstring, def> params;
-
-public:
-    req_params(std::initializer_list<std::pair<sstring, def>> l) {
-        for (const auto& [name, ent] : l) {
-            add(std::move(name), std::move(ent));
-        }
-    }
-
-    void add(sstring name, def ent) {
-        params.emplace(std::move(name), std::move(ent));
-    }
-
-    void process(const request& req);
-
-    const std::optional<sstring>& get(const char* name) const {
-        return params.at(name).value;
-    }
-
-    template <typename T = sstring>
-    const std::optional<T> get_as(const char* name) const {
-        return get(name);
-    }
-
-    template <typename T = sstring>
-    requires std::same_as<T, bool>
-    const std::optional<bool> get_as(const char* name) const {
-        auto value = get(name);
-        if (!value) {
-            return std::nullopt;
-        }
-        std::transform(value->begin(), value->end(), value->begin(), ::tolower);
-        if (value == "true" || value == "yes" || value == "1") {
-            return true;
-        }
-        if (value == "false" || value == "no" || value == "0") {
-            return false;
-        }
-        throw boost::bad_lexical_cast{};
-    }
-};
-
 httpd::utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimated_histogram& val);

 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -360,13 +360,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        });

    cf::get_column_family_name_keyspace.set(r, [&ctx] (const_req req){
-        std::vector<sstring> res;
-        const flat_hash_map<sstring, replica::keyspace>& keyspaces = ctx.db.local().get_keyspaces();
-        res.reserve(keyspaces.size());
-        for (const auto& i : keyspaces) {
-            res.push_back(i.first);
-        }
-        return res;
+        return ctx.db.local().get_all_keyspaces();
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -944,9 +938,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_built_indexes.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
-        auto ks_cf = parse_fully_qualified_cf_name(req->get_path_param("name"));
-        auto&& ks = std::get<0>(ks_cf);
-        auto&& cf_name = std::get<1>(ks_cf);
+        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->get_path_param("name"));
        // Use of load_built_views() as filtering table should be in sync with
        // built_indexes_virtual_reader filtering with BUILT_VIEWS table
        return sys_ks.local().load_built_views().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace::view_name>& vb) mutable {
@@ -1052,7 +1044,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
            a.merge(b);
            return a;
        }).then([](const std::unordered_set<sstring>& res) {
-            return make_ready_future<json::json_return_type>(container_to_vec(res));
+            return make_ready_future<json::json_return_type>(res | std::ranges::to<std::vector>());
        });
    });

@@ -1074,19 +1066,12 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto params = req_params({
-            std::pair("name", mandatory::yes),
-            std::pair("flush_memtables", mandatory::no),
-            std::pair("consider_only_existing_data", mandatory::no),
-            std::pair("split_output", mandatory::no),
-        });
-        params.process(*req);
-        if (params.get("split_output")) {
+        if (req->query_parameters.contains("split_output")) {
            fail(unimplemented::cause::API);
        }
-        auto [ks, cf] = parse_fully_qualified_cf_name(*params.get("name"));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        auto [ks, cf] = parse_fully_qualified_cf_name(req->get_path_param("name"));
+        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
+        auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
        apilog.info("column_family/force_major_compaction: name={} flush={} consider_only_existing_data={}", req->get_path_param("name"), flush, consider_only_existing_data);

        auto keyspace = validate_keyspace(ctx, ks);
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -28,10 +28,14 @@ template<class Mapper, class I, class Reducer>
 future<I> map_reduce_cf_raw(http_context& ctx, const sstring& name, I init,
        Mapper mapper, Reducer reducer) {
    auto uuid = parse_table_info(name, ctx.db.local()).id;
-    using mapper_type = std::function<std::unique_ptr<std::any>(replica::database&)>;
+    using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::database&)>;
    using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
    return ctx.db.map_reduce0(mapper_type([mapper, uuid](replica::database& db) {
-        return std::make_unique<std::any>(I(mapper(db.find_column_family(uuid))));
+        return futurize_invoke([mapper, &db, uuid] {
+            return mapper(db.find_column_family(uuid));
+        }).then([] (auto result) {
+            return std::make_unique<std::any>(I(std::move(result)));
+        });
    }), std::make_unique<std::any>(std::move(init)), reducer_type([reducer = std::move(reducer)] (std::unique_ptr<std::any> a, std::unique_ptr<std::any> b) mutable {
        return std::make_unique<std::any>(I(reducer(std::any_cast<I>(std::move(*a)), std::any_cast<I>(std::move(*b)))));
    })).then([] (std::unique_ptr<std::any> r) {
@@ -61,13 +65,12 @@ future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, c

 struct map_reduce_column_families_locally {
    std::any init;
-    std::function<std::unique_ptr<std::any>(replica::column_family&)> mapper;
+    std::function<future<std::unique_ptr<std::any>>(replica::column_family&)> mapper;
    std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
    future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
        auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
-        return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
-            *res = reducer(std::move(*res), mapper(*table.get()));
-            return make_ready_future();
+        return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) -> future<> {
+            *res = reducer(std::move(*res), co_await mapper(*table.get()));
        }).then([res] () {
            return std::move(*res);
        });
@@ -77,10 +80,14 @@ struct map_reduce_column_families_locally {
 template<class Mapper, class I, class Reducer>
 future<I> map_reduce_cf_raw(http_context& ctx, I init,
        Mapper mapper, Reducer reducer) {
-    using mapper_type = std::function<std::unique_ptr<std::any>(replica::column_family&)>;
+    using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::column_family&)>;
    using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
    auto wrapped_mapper = mapper_type([mapper = std::move(mapper)] (replica::column_family& cf) mutable {
-        return std::make_unique<std::any>(I(mapper(cf)));
+        return futurize_invoke([&cf, mapper] {
+            return mapper(cf);
+        }).then([] (auto result) {
+            return std::make_unique<std::any>(I(std::move(result)));
+        });
    });
    auto wrapped_reducer = reducer_type([reducer = std::move(reducer)] (std::unique_ptr<std::any> a, std::unique_ptr<std::any> b) mutable {
        return std::make_unique<std::any>(I(reducer(std::any_cast<I>(std::move(*a)), std::any_cast<I>(std::move(*b)))));
--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -14,6 +14,7 @@
 #include "api/api.hh"
 #include "api/api-doc/compaction_manager.json.hh"
 #include "api/api-doc/storage_service.json.hh"
+#include "db/compaction_history_entry.hh"
 #include "db/system_keyspace.hh"
 #include "column_family.hh"
 #include "unimplemented.hh"
@@ -71,10 +72,9 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
    cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return ctx.db.map_reduce0([](replica::database& db) {
            return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
-                return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
+                return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) -> future<> {
                    replica::table& cf = *table.get();
-                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
-                    return make_ready_future<>();
+                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = co_await cf.estimate_pending_compactions();
                }).then([&tasks] {
                    return std::move(tasks);
                });
@@ -117,7 +117,7 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
            auto& cm = db.get_compaction_manager();
            return parallel_for_each(tables, [&] (const table_info& ti) {
                auto& t = db.find_column_family(ti.id);
-                return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
+                return t.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) {
                    return cm.stop_compaction(type, &ts);
                });
            });
@@ -159,8 +159,11 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
                co_await cm.local().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable -> future<> {
                        cm::history h;
                        h.id = fmt::to_string(entry.id);
+                        h.shard_id = entry.shard_id;
                        h.ks = std::move(entry.ks);
                        h.cf = std::move(entry.cf);
+                        h.compaction_type = entry.compaction_type;
+                        h.started_at = entry.started_at;
                        h.compacted_at = entry.compacted_at;
                        h.bytes_in = entry.bytes_in;
                        h.bytes_out =  entry.bytes_out;
@@ -172,6 +175,24 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_man
                            e.value = it.second;
                            h.rows_merged.push(std::move(e));
                        }
+                        for (const auto& data : entry.sstables_in) { // one JSON sstableinfo per input sstable
+                            httpd::compaction_manager_json::sstableinfo sstable;
+                            sstable.generation = fmt::to_string(data.generation);
+                            sstable.origin = data.origin;
+                            sstable.size = data.size;
+                            h.sstables_in.push(std::move(sstable));
+                        }
+                        for (const auto& data : entry.sstables_out) { // one JSON sstableinfo per output sstable
+                            httpd::compaction_manager_json::sstableinfo sstable;
+                            sstable.generation = fmt::to_string(data.generation);
+                            sstable.origin = data.origin;
+                            sstable.size = data.size;
+                            h.sstables_out.push(std::move(sstable));
+                        }
+                        h.total_tombstone_purge_attempt = entry.total_tombstone_purge_attempt;
+                        h.total_tombstone_purge_failure_due_to_overlapping_with_memtable = entry.total_tombstone_purge_failure_due_to_overlapping_with_memtable;
+                        h.total_tombstone_purge_failure_due_to_overlapping_with_uncompacting_sstable = entry.total_tombstone_purge_failure_due_to_overlapping_with_uncompacting_sstable;
+
                        if (!first) {
                            co_await s.write(", ");
                        }
--- a/api/config.cc
+++ b/api/config.cc
@@ -23,22 +23,6 @@ using namespace seastar::httpd;
 namespace sp = httpd::storage_proxy_json;
 namespace ss = httpd::storage_service_json;

-template<class T>
-json::json_return_type get_json_return_type(const T& val) {
-    return json::json_return_type(val);
-}
-
-/*
- * As commented on db::seed_provider_type is not used
- * and probably never will.
- *
- * Just in case, we will return its name
- */
-template<>
-json::json_return_type get_json_return_type(const db::seed_provider_type& val) {
-    return json::json_return_type(val.class_name);
-}
-
 std::string_view format_type(std::string_view type) {
    if (type == "int") {
        return "integer";
@@ -187,7 +171,7 @@ void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx
    });

    ss::get_all_data_file_locations.set(r, [&cfg](const_req req) {
-        return container_to_vec(cfg.data_file_directories());
+        return cfg.data_file_directories();
    });

    ss::get_saved_caches_location.set(r, [&cfg](const_req req) {
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -40,7 +40,9 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
                }
                res.emplace_back(std::move(val));
            });
-            return make_ready_future<json::json_return_type>(res);
+            return make_ready_future<json::json_return_type>(json::stream_range_as_array(res, [](const fd::endpoint_state& i){
+                return i;
+            }));
        });
    });

@@ -64,11 +66,15 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
        return g.container().invoke_on(0, [] (gms::gossiper& g) {
-            std::map<sstring, sstring> nodes_status;
+            std::vector<fd::mapper> nodes_status;
+            nodes_status.reserve(g.num_endpoints());
            g.for_each_endpoint_state([&] (const gms::endpoint_state& es) {
-                nodes_status.emplace(fmt::to_string(es.get_ip()), g.is_alive(es.get_host_id()) ? "UP" : "DOWN");
+                fd::mapper val;
+                val.key = fmt::to_string(es.get_ip());
+                val.value = g.is_alive(es.get_host_id()) ? "UP" : "DOWN";
+                nodes_status.emplace_back(std::move(val));
            });
-            return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+            return make_ready_future<json::json_return_type>(std::move(nodes_status));
        });
    });

--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -21,14 +21,13 @@ using namespace json;
 void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
        auto res = co_await g.get_unreachable_members_synchronized();
-        co_return json::json_return_type(container_to_vec(res));
+        co_return json::json_return_type(res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>());
    });


-    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) {
-        return g.get_live_members_synchronized().then([] (auto res) {
-            return make_ready_future<json::json_return_type>(container_to_vec(res));
-        });
+    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        auto res = co_await g.get_live_members_synchronized();
+        co_return json::json_return_type(res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>());
    });

    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -127,6 +127,21 @@ bool validate_bool(const sstring& param) {
    }
 }

+bool validate_bool_x(const sstring& param, bool default_value) { // Parses a boolean request parameter; see the per-branch notes below.
+    if (param.empty()) { // an empty string selects the caller-supplied default
+        return default_value;
+    }
+
+    if (strcasecmp(param.c_str(), "true") == 0 || strcasecmp(param.c_str(), "yes") == 0 || param == "1") { // "true"/"yes" in any letter case, or exactly "1"
+        return true;
+    }
+    if (strcasecmp(param.c_str(), "false") == 0 || strcasecmp(param.c_str(), "no") == 0 || param == "0") { // "false"/"no" in any letter case, or exactly "0"
+        return false;
+    }
+
+    throw std::runtime_error("Invalid boolean parameter value"); // any other non-empty value is rejected
+}
+
 static
 int64_t validate_int(const sstring& param) {
    return std::atoll(param.c_str());
@@ -215,28 +230,19 @@ seastar::future<json::json_return_type> run_toppartitions_query(db::toppartition

 future<scrub_info> parse_scrub_options(const http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl, std::unique_ptr<http::request> req) {
    scrub_info info;
-    auto rp = req_params({
-        {"keyspace", {mandatory::yes}},
-        {"cf", {""}},
-        {"scrub_mode", {}},
-        {"skip_corrupted", {}},
-        {"disable_snapshot", {}},
-        {"quarantine_mode", {}},
-    });
-    rp.process(*req);
-    info.keyspace = validate_keyspace(ctx, *rp.get("keyspace"));
-    info.column_families = parse_table_infos(info.keyspace, ctx, *rp.get("cf")) | std::views::transform([] (auto ti) { return ti.name; }) | std::ranges::to<std::vector>();
-    auto scrub_mode_opt = rp.get("scrub_mode");
+    auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
+    info.keyspace = std::move(keyspace);
+    info.column_families = table_infos | std::views::transform(&table_info::name) | std::ranges::to<std::vector>();
+    auto scrub_mode_str = req->get_query_param("scrub_mode");
    auto scrub_mode = sstables::compaction_type_options::scrub::mode::abort;

-    if (!scrub_mode_opt) {
-        const auto skip_corrupted = rp.get_as<bool>("skip_corrupted").value_or(false);
+    if (scrub_mode_str.empty()) {
+        const auto skip_corrupted = validate_bool_x(req->get_query_param("skip_corrupted"), false);

        if (skip_corrupted) {
            scrub_mode = sstables::compaction_type_options::scrub::mode::skip;
        }
    } else {
-        auto scrub_mode_str = *scrub_mode_opt;
        if (scrub_mode_str == "ABORT") {
            scrub_mode = sstables::compaction_type_options::scrub::mode::abort;
        } else if (scrub_mode_str == "SKIP") {
@@ -353,6 +359,9 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair, s
            // if the option is not sane, repair_start() throws immediately, so
            // convert the exception to an HTTP error
            throw httpd::bad_param_exception(e.what());
+        } catch (const tablets_unsupported& e) {
+            throw base_exception("Cannot repair tablet keyspace. Use /storage_service/tablets/repair to repair tablet keyspaces.",
+                    http::reply::status_type::forbidden);
        }
    });

@@ -453,17 +462,27 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto cf = req->get_query_param("cf");
        auto stream = req->get_query_param("load_and_stream");
        auto primary_replica = req->get_query_param("primary_replica_only");
+        auto skip_cleanup_p = req->get_query_param("skip_cleanup");
        boost::algorithm::to_lower(stream);
        boost::algorithm::to_lower(primary_replica);
        bool load_and_stream = stream == "true" || stream == "1";
        bool primary_replica_only = primary_replica == "true" || primary_replica == "1";
+        bool skip_cleanup = strcasecmp(skip_cleanup_p.c_str(), "true") == 0 || skip_cleanup_p == "1"; // case-insensitive, matching load_and_stream / primary_replica_only
+        auto scope = parse_stream_scope(req->get_query_param("scope"));
+        auto skip_reshape_p = req->get_query_param("skip_reshape");
+        auto skip_reshape = strcasecmp(skip_reshape_p.c_str(), "true") == 0 || skip_reshape_p == "1"; // case-insensitive, matching load_and_stream / primary_replica_only
+
+        if (scope != sstables_loader::stream_scope::all && !load_and_stream) {
+            throw httpd::bad_param_exception("scope takes no effect without load-and-stream");
+        }
+
        // No need to add the keyspace, since all we want is to avoid always sending this to the same
        // CPU. Even then I am being overzealous here. This is not something that happens all the time.
        auto coordinator = std::hash<sstring>()(cf) % smp::count;
        return sst_loader.invoke_on(coordinator,
                [ks = std::move(ks), cf = std::move(cf),
-                load_and_stream, primary_replica_only] (sstables_loader& loader) {
-            return loader.load_new_sstables(ks, cf, load_and_stream, primary_replica_only, sstables_loader::stream_scope::all);
+                load_and_stream, primary_replica_only, skip_cleanup, skip_reshape, scope] (sstables_loader& loader) {
+            return loader.load_new_sstables(ks, cf, load_and_stream, primary_replica_only, skip_cleanup, skip_reshape, scope);
        }).then_wrapped([] (auto&& f) {
            if (f.failed()) {
                auto msg = fmt::format("Failed to load new sstables: {}", f.get_exception());
@@ -632,7 +651,7 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
            auto& ks = ctx.db.local().find_keyspace(keyspace);
            if (table.empty()) {
                ensure_tablets_disabled(ctx, keyspace, "storage_service/range_to_endpoint_map");
-                return ks.get_vnode_effective_replication_map();
+                return ks.get_static_effective_replication_map();
            } else {
                auto table_id = validate_table(ctx.db.local(), keyspace, table);
                auto& cf = ctx.db.local().find_column_family(table_id);
@@ -705,8 +724,8 @@ static
 json::json_return_type
 rest_get_natural_endpoints(http_context& ctx, sharded<service::storage_service>& ss, const_req req) {
        auto keyspace = validate_keyspace(ctx, req);
-        return container_to_vec(ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"),
-                req.get_query_param("key")));
+        auto res = ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"), req.get_query_param("key"));
+        return res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>();
 }

 static
@@ -723,13 +742,8 @@ static
 future<json::json_return_type>
 rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("flush_memtables", mandatory::no),
-            std::pair("consider_only_existing_data", mandatory::no),
-        });
-        params.process(*req);
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
+        auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
        apilog.info("force_compaction: flush={} consider_only_existing_data={}", flush, consider_only_existing_data);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
@@ -738,13 +752,7 @@ rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_compaction failed: {}", std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
 }

@@ -752,17 +760,9 @@ static
 future<json::json_return_type>
 rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("keyspace", mandatory::yes),
-            std::pair("cf", mandatory::no),
-            std::pair("flush_memtables", mandatory::no),
-            std::pair("consider_only_existing_data", mandatory::no),
-        });
-        params.process(*req);
-        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
-        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
+        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
+        auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
        apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
@@ -771,13 +771,7 @@ rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request>
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
 }

@@ -787,8 +781,8 @@ rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>
        auto& db = ctx.db;
        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
        const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
-        if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) {
-            auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support";
+        if (rs.is_local() || !rs.is_vnode_based()) {
+            auto reason = rs.is_local() ? "require" : "support";
            apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
            co_return json::json_return_type(0);
        }
@@ -802,13 +796,7 @@ rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
            {}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(0);
 }

@@ -830,12 +818,7 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("cleanup_all failed: {}", std::current_exception());
-            throw;
-        }
+        co_await task->done();
        co_return json::json_return_type(0);
 }

@@ -847,13 +830,7 @@ rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(res);
 }

@@ -868,13 +845,7 @@ rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(0);
 }

@@ -1026,7 +997,7 @@ rest_get_keyspaces(http_context& ctx, const_req req) {
        } else if (type == "non_local_strategy") {
            keyspaces = ctx.db.local().get_non_local_strategy_keyspaces();
        } else {
-            keyspaces = map_keys(ctx.db.local().get_keyspaces());
+            keyspaces = ctx.db.local().get_all_keyspaces();
        }
        if (replication.empty() || replication == "all") {
            return keyspaces;
@@ -1667,6 +1638,18 @@ rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::un
        co_return sstring(format("{}", ustate));
 }

+static
+future<json::json_return_type>
+rest_raft_topology_get_cmd_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        const auto status = co_await ss.invoke_on(0, [] (auto& ss) {
+            return ss.get_topology_cmd_status();
+        });
+        if (status.active_dst.empty()) {
+            co_return sstring("none");
+        }
+        co_return sstring(fmt::format("{}[{}]: {}", status.current, status.index, fmt::join(status.active_dst, ",")));
+}
+
 static
 future<json::json_return_type>
 rest_move_tablet(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1793,6 +1776,7 @@ future<json::json_return_type>
 rest_get_schema_versions(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
        return ss.local().describe_schema_versions().then([] (auto result) {
            std::vector<sp::mapper_list> res;
+            res.reserve(result.size());
            for (auto e : result) {
                sp::mapper_list entry;
                entry.key = std::move(e.first);
@@ -1803,6 +1787,36 @@ rest_get_schema_versions(sharded<service::storage_service>& ss, std::unique_ptr<
        });
 }

+static
+future<json::json_return_type>
+rest_drop_quarantined_sstables(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+    auto keyspace = req->get_query_param("keyspace");
+    try {
+        if (!keyspace.empty()) {
+            keyspace = validate_keyspace(ctx, keyspace);
+            auto it = req->query_parameters.find("tables");
+            auto table_infos = parse_table_infos(keyspace, ctx, it != req->query_parameters.end() ? it->second : "");
+
+            co_await ctx.db.invoke_on_all([&table_infos](replica::database& db) -> future<> {
+                return parallel_for_each(table_infos, [&db](const auto& table) -> future<> {
+                    const auto& [table_name, table_id] = table;
+                    return db.find_column_family(table_id).drop_quarantined_sstables();
+                });
+            });
+        } else {
+            co_await ctx.db.invoke_on_all([](replica::database& db) -> future<> {
+                return db.get_tables_metadata().parallel_for_each_table([](table_id, lw_shared_ptr<replica::table> t) -> future<> {
+                    return t->drop_quarantined_sstables();
+                });
+            });
+        }
+    } catch (...) {
+        apilog.error("drop_quarantined_sstables: failed with exception: {}", std::current_exception());
+        throw;
+    }
+
+    co_return json_void();
+}

 // Disambiguate between a function that returns a future and a function that returns a plain value, also
 // add std::ref() as a courtesy. Also handles ks_cf_func signatures.
@@ -1898,6 +1912,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
+    ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
    ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
    ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
    ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
@@ -1905,6 +1920,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
    ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
    sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
+    ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
 }

 void unset_storage_service(http_context& ctx, routes& r) {
@@ -1979,6 +1995,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::reload_raft_topology_state.unset(r);
    ss::upgrade_to_raft_topology.unset(r);
    ss::raft_topology_upgrade_status.unset(r);
+    ss::raft_topology_get_cmd_status.unset(r);
    ss::move_tablet.unset(r);
    ss::add_tablet_replica.unset(r);
    ss::del_tablet_replica.unset(r);
@@ -1986,6 +2003,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::tablet_balancing_enable.unset(r);
    ss::quiesce_topology.unset(r);
    sp::get_schema_versions.unset(r);
+    ss::drop_quarantined_sstables.unset(r);
 }

 void set_load_meter(http_context& ctx, routes& r, service::load_meter& lm) {
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -83,4 +83,11 @@ void set_load_meter(http_context& ctx, httpd::routes& r, service::load_meter& lm
 void unset_load_meter(http_context& ctx, httpd::routes& r);
 seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);

+// converts string value of boolean parameter into bool
+// maps (case insensitively)
+//     "true", "yes" and "1" into true
+//     "false", "no" and "0" into false
+// otherwise throws runtime_error
+bool validate_bool_x(const sstring& param, bool default_value);
+
 } // namespace api
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -39,15 +39,8 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
 void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
    t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("keyspace", mandatory::yes),
-            std::pair("cf", mandatory::no),
-            std::pair("flush_memtables", mandatory::no),
-        });
-        params.process(*req);
-        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
-        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
+        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
+        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
        apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
--- a/api/token_metadata.cc
+++ b/api/token_metadata.cc
@@ -54,12 +54,12 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
        for (const auto host_id: leaving_host_ids) {
            eps.insert(g.local().get_address_map().get(host_id));
        }
-        return container_to_vec(eps);
+        return eps | std::views::transform([] (auto& i) { return fmt::to_string(i); }) | std::ranges::to<std::vector>();
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
        std::unordered_set<sstring> addr;
-        return container_to_vec(addr);
+        return addr | std::ranges::to<std::vector>();
    });

    ss::get_joining_nodes.set(r, [&tm, &g](const_req req) {
@@ -70,15 +70,21 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
        for (const auto& [token, host_id]: points) {
            eps.insert(g.local().get_address_map().get(host_id));
        }
-        return container_to_vec(eps);
+        return eps | std::views::transform([] (auto& i) { return fmt::to_string(i); }) | std::ranges::to<std::vector>();
    });

    ss::get_host_id_map.set(r, [&tm, &g](const_req req) {
-        std::vector<ss::mapper> res;
-        auto map = tm.local().get()->get_host_ids() |
-            std::views::transform([&g] (locator::host_id id) { return std::make_pair(g.local().get_address_map().get(id), id); }) |
-            std::ranges::to<std::unordered_map>();
-        return map_to_key_value(std::move(map), res);
+        if (!g.local().is_enabled()) {
+            throw std::runtime_error("The gossiper is not ready yet");
+        }
+        return tm.local().get()->get_host_ids()
+            | std::views::transform([&g] (locator::host_id id) {
+                ss::mapper m;
+                m.key = fmt::to_string(g.local().get_address_map().get(id));
+                m.value = fmt::to_string(id);
+                return m;
+            })
+            | std::ranges::to<std::vector<ss::mapper>>();
    });

    static auto host_or_broadcast = [&tm](const_req req) {
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,6 +209,11 @@ future<> audit::log(const audit_info* audit_info, service::query_state& query_st
    static const sstring anonymous_username("anonymous");
    const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
    socket_address client_ip = client_state.get_client_address().addr();
+    if (logger.is_enabled(logging::log_level::debug)) {
+        logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
+            node_ip, audit_info->category_string(), cl, error, audit_info->keyspace(),
+            audit_info->query(), client_ip, audit_info->table(), username);
+    }
    return futurize_invoke(std::mem_fn(&storage_helper::write), _storage_helper_ptr, audit_info, node_ip, client_ip, cl, username, error)
        .handle_exception([audit_info, node_ip, client_ip, cl, username, error] (auto ep) {
            logger.error("Unexpected exception when writing log with: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {} exception {}",
@@ -219,6 +224,10 @@ future<> audit::log(const audit_info* audit_info, service::query_state& query_st

 future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
    socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
+    if (logger.is_enabled(logging::log_level::debug)) {
+        logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
+            node_ip, client_ip, username, error ? "true" : "false");
+    }
    return futurize_invoke(std::mem_fn(&storage_helper::write_login), _storage_helper_ptr, username, node_ip, client_ip, error)
        .handle_exception([username, node_ip, client_ip, error] (auto ep) {
            logger.error("Unexpected exception when writing login log with: node_ip {} client_ip {} username {} error {} exception {}",
--- a/audit/audit_syslog_storage_helper.cc
+++ b/audit/audit_syslog_storage_helper.cc
@@ -108,7 +108,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
    auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
    tm time;
    localtime_r(&now, &time);
-    sstring msg = seastar::format(R"(<{}>{:%h %e %T} scylla-audit: node="{}" category="{}" cl="{}" error="{}" keyspace="{}" query="{}" client_ip="{}" table="{}" username="{}")",
+    sstring msg = seastar::format(R"(<{}>{:%h %e %T} scylla-audit: node="{}", category="{}", cl="{}", error="{}", keyspace="{}", query="{}", client_ip="{}", table="{}", username="{}")",
                                    LOG_NOTICE | LOG_USER,
                                    time,
                                    node_ip,
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,6 +9,7 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
+#include "utils/alien_worker.hh"
 #include "utils/class_registrator.hh"

 namespace auth {
@@ -21,6 +22,7 @@ static const class_registrator<
        allow_all_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -13,6 +13,7 @@
 #include "auth/authenticated_user.hh"
 #include "auth/authenticator.hh"
 #include "auth/common.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
-    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
+    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&) {
    }

    virtual future<> start() override {
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -33,13 +33,14 @@ static const class_registrator<auth::authenticator
    , auth::certificate_authenticator
    , cql3::query_processor&
    , ::service::raft_group0_client&
-    , ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
+    , ::service::migration_manager&
+    , utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);

 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };

-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _queries([&] {
        auto& conf = qp.db().get_config();
        auto queries = conf.auth_certificate_role_queries();
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -10,6 +10,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"
 #include <boost/regex_fwd.hpp>  // IWYU pragma: keep

 namespace cql3 {
@@ -31,7 +32,7 @@ class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
 public:
-    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
    ~certificate_authenticator();

    future<> start() override;
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -119,9 +119,14 @@ future<> create_legacy_metadata_table_if_missing(
    return qs;
 }

+::service::raft_timeout get_raft_timeout() noexcept {
+    auto dur = internal_distributed_query_state().get_client_state().get_timeout_config().other_timeout;
+    return ::service::raft_timeout{.value = lowres_clock::now() + dur};
+}
+
 static future<> announce_mutations_with_guard(
        ::service::raft_group0_client& group0_client,
-        std::vector<canonical_mutation> muts,
+        utils::chunked_vector<canonical_mutation> muts,
        ::service::group0_guard group0_guard,
        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
@@ -149,7 +154,7 @@ future<> announce_mutations_with_batching(
    });

    size_t memory_usage = 0;
-    std::vector<canonical_mutation> muts;
+    utils::chunked_vector<canonical_mutation> muts;

    // guard has to be taken before we execute code in gen as
    // it can do read-before-write and we want announce_mutations
@@ -199,7 +204,7 @@ future<> announce_mutations(
            internal_distributed_query_state(),
            timestamp,
            std::move(values));
-    std::vector<canonical_mutation> cmuts = {muts.begin(), muts.end()};
+    utils::chunked_vector<canonical_mutation> cmuts = {muts.begin(), muts.end()};
    co_await announce_mutations_with_guard(group0_client, std::move(cmuts), std::move(group0_guard), as, timeout);
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -17,6 +17,7 @@

 #include "types/types.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "timeout_config.hh"

 using namespace std::chrono_literals;

@@ -77,6 +78,8 @@ future<> create_legacy_metadata_table_if_missing(
 ///
 ::service::query_state& internal_distributed_query_state() noexcept;

+::service::raft_timeout get_raft_timeout() noexcept;
+
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 // Use this function when need to perform read before write on a single guard or if
 // you have more than one mutation and potentially exceed single command size limit.
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -338,8 +338,7 @@ future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants()
 }

 future<> ldap_role_manager::ensure_superuser_is_created() {
-    // ldap is responsible for users
-    co_return;
+    return _std_mgr.ensure_superuser_is_created();
 }

 } // namespace auth
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -48,14 +48,14 @@ static const class_registrator<
        password_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

 static std::string_view get_config_value(std::string_view value, std::string_view def) {
    return value.empty() ? def : value;
 }
-
 std::string password_authenticator::default_superuser(const db::config& cfg) {
    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
 }
@@ -63,12 +63,13 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
 password_authenticator::~password_authenticator() {
 }

-password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
+password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
    , _stopped(make_ready_future<>()) 
    , _superuser(default_superuser(qp.db().get_config()))
+    , _hashing_worker(hashing_worker)
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -117,33 +118,95 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    });
 }

-future<> password_authenticator::create_default_if_missing() {
+future<> password_authenticator::legacy_create_default_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    const auto exists = co_await default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
    if (exists) {
        co_return;
    }
    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
    const auto query = update_row_query();
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
+    co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            {salted_pwd, _superuser},
            cql3::query_processor::cache_internal::no);
-        plogger.info("Created default superuser authentication record.");
-    } else {
-        co_await announce_mutations(_qp, _group0_client, query,
-            {salted_pwd, _superuser}, _as, ::service::raft_timeout{});
-        plogger.info("Created default superuser authentication record.");
+    plogger.info("Created default superuser authentication record.");
+}
+
+future<> password_authenticator::maybe_create_default_password() {
+    auto needs_password = [this] () -> future<bool> {
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto results = co_await _qp.execute_internal(query,
+                db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        // Don't add default password if
+        // - there is no default superuser
+        // - there is a superuser with a password.
+        bool has_default = false;
+        bool has_superuser_with_password = false;
+        for (auto& result : *results) {
+            if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
+                has_default = true;
+            }
+            if (has_salted_hash(result)) {
+                has_superuser_with_password = true;
+            }
+        }
+        co_return has_default && !has_superuser_with_password;
+    };
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // We don't want to start operation earlier to avoid quorum requirement in
+    // a common case.
+    ::service::group0_batch batch(
+            co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // Check again as the state may have changed before we took the guard (batch).
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // Set default superuser's password.
+    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    if (salted_pwd.empty()) {
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
+    }
+    const auto update_query = update_row_query();
+    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    plogger.info("Created default superuser authentication record.");
+}
+
+future<> password_authenticator::maybe_create_default_password_with_retries() {
+    size_t retries = _migration_manager.get_concurrent_ddl_retries();
+    while (true)  {
+        try {
+            co_return co_await maybe_create_default_password();
+        } catch (const ::service::group0_concurrent_modification& ex) {
+            plogger.warn("Failed to execute maybe_create_default_password due to guard conflict.{}.", retries ? " Retrying" : " Number of retries exceeded, giving up");
+            if (retries--) {
+                continue;
+            }
+            // Log error but don't crash the whole node startup sequence.
+            plogger.error("Failed to create default superuser password due to guard conflict.");
+            co_return;
+        } catch (const ::service::raft_operation_timeout_error& ex) {
+            plogger.error("Failed to create default superuser password due to exception: {}", ex.what());
+            co_return;
+        }
    }
 }

 future<> password_authenticator::start() {
    return once_among_shards([this] {
+        // Verify that at least one hashing scheme is supported.
+        passwords::detail::verify_scheme(_scheme);
+        plogger.info("Using password hashing scheme: {}", passwords::detail::prefix_for_scheme(_scheme));
+
        _stopped = do_after_system_ready(_as, [this] {
            return async([this] {
                if (legacy_mode(_qp)) {
@@ -164,11 +227,14 @@ future<> password_authenticator::start() {
                        migrate_legacy_metadata().get();
                        return;
                    }
+                    legacy_create_default_if_missing().get();
                }
                utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
-                create_default_if_missing().get();
                if (!legacy_mode(_qp)) {
-                    _superuser_created_promise.set_value();
+                    maybe_create_default_password_with_retries().get();
+                    if (!_superuser_created_promise.available()) {
+                        _superuser_created_promise.set_value();
+                    }
                }
            });
        });
@@ -228,7 +294,13 @@ future<authenticated_user> password_authenticator::authenticate(

    try {
        const std::optional<sstring> salted_hash = co_await get_password_hash(username);
-        if (!salted_hash || !passwords::check(password, *salted_hash)) {
+        if (!salted_hash) {
+            throw exceptions::authentication_exception("Username and/or password are incorrect");
+        }
+        const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash = std::move(salted_hash)]{
+            return passwords::check(password, *salted_hash);
+        });
+        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
        co_return username;
@@ -252,7 +324,7 @@ future<> password_authenticator::create(std::string_view role_name, const authen
    auto maybe_hash = options.credentials.transform([&] (const auto& creds) -> sstring {
        return std::visit(make_visitor(
                [&] (const password_option& opt) {
-                    return passwords::hash(opt.password, rng_for_salt);
+                    return passwords::hash(opt.password, rng_for_salt, _scheme);
                },
                [] (const hashed_password_option& opt) {
                    return opt.hashed_password;
@@ -295,11 +367,11 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
                query,
                consistency_for_user(role_name),
                internal_distributed_query_state(),
-                {passwords::hash(password, rng_for_salt), sstring(role_name)},
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
        co_await collect_mutations(_qp, mc, query,
-                {passwords::hash(password, rng_for_salt), sstring(role_name)});
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
    }
 }

--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -15,7 +15,9 @@

 #include "db/consistency_level_type.hh"
 #include "auth/authenticator.hh"
+#include "auth/passwords.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"

 namespace db {
    class config;
@@ -41,14 +43,17 @@ class password_authenticator : public authenticator {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    abort_source _as;
-    std::string _superuser;
+    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
    shared_promise<> _superuser_created_promise;
+    // We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
+    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
+    utils::alien_worker& _hashing_worker;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
    static std::string default_superuser(const db::config&);

-    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    ~password_authenticator();

@@ -89,7 +94,10 @@ private:

    future<> migrate_legacy_metadata() const;

-    future<> create_default_if_missing();
+    future<> legacy_create_default_if_missing();
+
+    future<> maybe_create_default_password();
+    future<> maybe_create_default_password_with_retries();

    sstring update_row_query() const;
 };
--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -21,18 +21,14 @@ static thread_local crypt_data tlcrypt = {};

 namespace detail {

-scheme identify_best_supported_scheme() {
-    const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
-    // "Random", for testing schemes.
+void verify_scheme(scheme scheme) {
    const sstring random_part_of_salt = "aaaabbbbccccdddd";

-    for (scheme c : all_schemes) {
-        const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
-        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+    const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
+    const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);

-        if (e && (e[0] != '*')) {
-            return c;
-        }
+    if (e && (e[0] != '*')) {
+        return;
    }

    throw no_supported_schemes();
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -21,10 +21,11 @@ class no_supported_schemes : public std::runtime_error {
 public:
    no_supported_schemes();
 };
-
 ///
-/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
-/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
+/// Apache Cassandra uses a library to provide the bcrypt scheme. In ScyllaDB, we use SHA-512
+/// instead of bcrypt for performance and for historical reasons (see scylladb#24524).
+/// Currently, SHA-512 is always chosen as the hashing scheme for new passwords, but the other
+/// algorithms remain supported for CREATE ROLE WITH HASHED PASSWORD and backward compatibility.
 ///
 enum class scheme {
    bcrypt_y,
@@ -51,11 +52,11 @@ sstring generate_random_salt_bytes(RandomNumberEngine& g) {
 }

 ///
-/// Test each allowed hashing scheme and report the best supported one on the current system.
+/// Test whether the given hashing scheme is supported on the current system.
 ///
-/// \throws \ref no_supported_schemes when none of the known schemes is supported.
+/// \throws \ref no_supported_schemes when scheme is unsupported.
 ///
-scheme identify_best_supported_scheme();
+void verify_scheme(scheme scheme);

 std::string_view prefix_for_scheme(scheme) noexcept;

@@ -67,8 +68,7 @@ std::string_view prefix_for_scheme(scheme) noexcept;
-/// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
+/// \note Does not check that \p scheme is supported on this system; see \ref verify_scheme.
 ///
 template <typename RandomNumberEngine>
-sstring generate_salt(RandomNumberEngine& g) {
-    static const scheme scheme = identify_best_supported_scheme();
-    static const sstring prefix = sstring(prefix_for_scheme(scheme));
-    return prefix + generate_random_salt_bytes(g);
+sstring generate_salt(RandomNumberEngine& g, scheme scheme) {
+    // Build the prefix on every call: a function-local static would latch the first scheme passed in.
+    return sstring(prefix_for_scheme(scheme)) + generate_random_salt_bytes(g);
 }
@@ -93,8 +93,8 @@ sstring hash_with_salt(const sstring& pass, const sstring& salt);
 /// \throws \ref std::system_error when the implementation-specific implementation fails to hash the cleartext.
 ///
 template <typename RandomNumberEngine>
-sstring hash(const sstring& pass, RandomNumberEngine& g) {
-    return detail::hash_with_salt(pass, detail::generate_salt(g));
+sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
+    return detail::hash_with_salt(pass, detail::generate_salt(g, scheme));
 }

 ///
--- a/auth/resource.cc
+++ b/auth/resource.cc
@@ -193,9 +193,7 @@ service_level_resource_view::service_level_resource_view(const resource &r) {

 sstring encode_signature(std::string_view name, std::vector<data_type> args) {
    return seastar::format("{}[{}]", name,
-            fmt::join(args | std::views::transform([] (const data_type t) {
-                return t->name();
-            }), "^"));
+            fmt::join(args | std::views::transform(&abstract_type::name), "^"));
 }

 std::pair<sstring, std::vector<data_type>> decode_signature(std::string_view encoded_signature) {
@@ -221,9 +219,7 @@ std::pair<sstring, std::vector<data_type>> decode_signature(std::string_view enc
 static sstring decoded_signature_string(std::string_view encoded_signature) {
    auto [function_name, arg_types] = decode_signature(encoded_signature);
    return seastar::format("{}({})", cql3::util::maybe_quote(sstring(function_name)),
-            fmt::join(arg_types | std::views::transform([] (data_type t) {
-                return t->cql3_type_name();
-            }), ", "));
+            fmt::join(arg_types | std::views::transform(&abstract_type::cql3_type_name), ", "));
 }

 resource make_functions_resource(const cql3::functions::function& f) {
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -34,9 +34,10 @@ static const class_registrator<
        saslauthd_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");

-saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}

--- a/auth/saslauthd_authenticator.hh
+++ b/auth/saslauthd_authenticator.hh
@@ -11,6 +11,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ namespace auth {
 class saslauthd_authenticator : public authenticator {
    sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
 public:
-    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    future<> start() override;

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -47,6 +47,7 @@
 #include "data_dictionary/keyspace_metadata.hh"
 #include "service/storage_service.hh"
 #include "service_permit.hh"
+#include "utils/managed_string.hh"

 using namespace std::chrono_literals;

@@ -83,7 +84,6 @@ private:
    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
-    void on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) override {}

    void on_drop_keyspace(const sstring& ks_name) override {
        if (!legacy_mode(_qp)) {
@@ -187,14 +187,15 @@ service::service(
        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc,
-        maintenance_socket_enabled used_by_maintenance_socket)
+        maintenance_socket_enabled used_by_maintenance_socket,
+        utils::alien_worker& hashing_worker)
            : service(
                      std::move(c),
                      qp,
                      g0,
                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, hashing_worker),
                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm),
                      used_by_maintenance_socket) {
 }
@@ -240,6 +241,13 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        });
    }
    co_await _role_manager->start();
+    if (this_shard_id() == 0) {
+        // Role manager and password authenticator have this odd startup
+        // mechanism where they asynchronously create the superuser role
+        // in the background. Correct password creation depends on role
+        // creation therefore we need to wait here.
+        co_await _role_manager->ensure_superuser_is_created();
+    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
    co_await once_among_shards([this] {
@@ -468,12 +476,14 @@ future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_
        const bool can_login = co_await _role_manager->can_login(role);
        const bool is_superuser = co_await _role_manager->is_superuser(role);

+        sstring create_statement = produce_create_statement(formatted_role_name, maybe_hashed_password, can_login, is_superuser);
+
        result.push_back(cql3::description {
            // Roles do not belong to any keyspace.
            .keyspace = std::nullopt,
            .type = "role",
            .name = role,
-            .create_statement = produce_create_statement(formatted_role_name, maybe_hashed_password, can_login, is_superuser)
+            .create_statement = managed_string(create_statement)
        });
    }

@@ -614,19 +624,21 @@ future<std::vector<cql3::description>> service::describe_permissions() const {

    for (const auto& permissions : permission_list) {
        for (const auto& permission : permissions.permissions) {
+            sstring create_statement = describe_resource_kind(permission, permissions.resource, permissions.role_name);
+
            result.push_back(cql3::description {
                // Permission grants do not belong to any keyspace.
                .keyspace = std::nullopt,
                .type = "grant_permission",
                .name = permissions.role_name,
-                .create_statement = describe_resource_kind(permission, permissions.resource, permissions.role_name)
+                .create_statement = managed_string(create_statement)
            });
        }

        co_await coroutine::maybe_yield();
    }

-    std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) noexcept {
+    std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) {
        return std::make_tuple(std::ref(desc.name), std::ref(*desc.create_statement));
    });

@@ -885,7 +897,7 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
                for (const auto& col : schema->all_columns()) {
                    if (row.has(col.name_as_text())) {
                        values.push_back(
-                                col.type->deserialize(row.get_blob(col.name_as_text())));
+                                col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
                    } else {
                        values.push_back(unset_value{});
                    }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -26,6 +26,7 @@
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"
 #include "utils/observable.hh"
 #include "utils/serialized_action.hh"
 #include "service/maintenance_mode.hh"
@@ -126,7 +127,8 @@ public:
            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&,
-            maintenance_socket_enabled);
+            maintenance_socket_enabled,
+            utils::alien_worker&);

    future<> start(::service::migration_manager&, db::system_keyspace&);

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -9,6 +9,7 @@
 #include "auth/standard_role_manager.hh"

 #include <optional>
+#include <stdexcept>
 #include <unordered_set>
 #include <vector>

@@ -28,6 +29,7 @@
 #include "cql3/util.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/error_injection.hh"
 #include "utils/log.hh"
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
@@ -35,6 +37,7 @@
 #include "utils/class_registrator.hh"
 #include "service/migration_manager.hh"
 #include "password_authenticator.hh"
+#include "utils/managed_string.hh"

 namespace auth {

@@ -126,7 +129,7 @@ static future<record> require_record(cql3::query_processor& qp, std::string_view
 }

 static bool has_can_login(const cql3::untyped_result_set_row& row) {
-    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
+    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob_unfragmented("can_login")).is_null());
 }

 standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
@@ -178,7 +181,8 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
                    _migration_manager)).discard_result();
 }

-future<> standard_role_manager::create_default_role_if_missing() {
+future<> standard_role_manager::legacy_create_default_role_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    try {
        const auto exists = co_await default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
@@ -188,16 +192,12 @@ future<> standard_role_manager::create_default_role_if_missing() {
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                meta::roles_table::role_col_name);
-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(
-                    query,
-                    db::consistency_level::QUORUM,
-                    internal_distributed_query_state(),
-                    {_superuser},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, _as, ::service::raft_timeout{});
-        }
+        co_await _qp.execute_internal(
+                query,
+                db::consistency_level::QUORUM,
+                internal_distributed_query_state(),
+                {_superuser},
+                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
    } catch(const exceptions::unavailable_exception& e) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
@@ -205,6 +205,60 @@ future<> standard_role_manager::create_default_role_if_missing() {
    }
 }

+future<> standard_role_manager::maybe_create_default_role() {
+    // Reports whether the roles table already holds a superuser row whose
+    // can_login column is set (to any value).
+    auto superuser_exists = [this] () -> future<bool> {
+        const sstring select_query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto rows = co_await _qp.execute_internal(select_query, db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        for (const auto& row : *rows) {
+            if (has_can_login(row)) {
+                co_return true;
+            }
+        }
+        co_return false;
+    };
+    // Fast path: no group0 operation (and thus no quorum) is needed when a
+    // superuser is already present, which is the common case.
+    if (co_await superuser_exists()) {
+        co_return;
+    }
+    ::service::group0_batch batch(co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // The state may have changed before the guard (batch) was taken, so look again.
+    if (co_await superuser_exists()) {
+        co_return;
+    }
+    // Still no superuser with a can_login column: insert the default role.
+    // Only the presence of can_login was checked, not whether it is true.
+    const sstring insert_query = seastar::format(
+            "INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
+            get_auth_ks_name(_qp), meta::roles_table::name, meta::roles_table::role_col_name);
+    co_await collect_mutations(_qp, batch, insert_query, {_superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    log.info("Created default superuser role '{}'.", _superuser);
+}
+
+future<> standard_role_manager::maybe_create_default_role_with_retries() {
+    // Guard conflicts are retried a bounded number of times; handled failures are logged, not rethrown.
+    for (size_t attempts_left = _migration_manager.get_concurrent_ddl_retries(); ; --attempts_left) {
+        try {
+            co_return co_await maybe_create_default_role();
+        } catch (const ::service::group0_concurrent_modification&) {
+            const bool will_retry = attempts_left != 0;
+            log.warn("Failed to execute maybe_create_default_role due to guard conflict.{}.", will_retry ? " Retrying" : " Number of retries exceeded, giving up");
+            if (!will_retry) {
+                // Log error but don't crash the whole node startup sequence.
+                log.error("Failed to create default superuser role due to guard conflict.");
+                co_return;
+            }
+        } catch (const ::service::raft_operation_timeout_error& timeout_err) {
+            log.error("Failed to create default superuser role due to exception: {}", timeout_err.what());
+            co_return;
+        }
+    }
+}
+
 static const sstring legacy_table_name{"users"};

 bool standard_role_manager::legacy_metadata_exists() {
@@ -266,10 +320,13 @@ future<> standard_role_manager::start() {
                    co_await migrate_legacy_metadata();
                    co_return;
                }
+                co_await legacy_create_default_role_if_missing();
            }
-            co_await create_default_role_if_missing();
            if (!legacy) {
-                _superuser_created_promise.set_value();
+                co_await maybe_create_default_role_with_retries();
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
            }
        };

@@ -619,6 +676,12 @@ future<role_set> standard_role_manager::query_all() {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

+    if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
+        if (legacy_mode(_qp)) {
+            throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
+        }
+    }
+
    const auto results = co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
@@ -722,18 +785,20 @@ future<std::vector<cql3::description>> standard_role_manager::describe_role_gran
        const auto formatted_grantee = cql3::util::maybe_quote(grantee_role);
        const auto formatted_granted = cql3::util::maybe_quote(granted_role);

+        sstring create_statement = seastar::format("GRANT {} TO {};", formatted_granted, formatted_grantee);
+
        result.push_back(cql3::description {
            // Role grants do not belong to any keyspace.
            .keyspace = std::nullopt,
            .type = "grant_role",
            .name = granted_role,
-            .create_statement = seastar::format("GRANT {} TO {};", formatted_granted, formatted_grantee)
+            .create_statement = managed_string(create_statement)
        });

        co_await coroutine::maybe_yield();
    }

-    std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) noexcept {
+    std::ranges::sort(result, std::less<>{}, [] (const cql3::description& desc) {
        return std::make_tuple(std::ref(desc.name), std::ref(*desc.create_statement));
    });

--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -95,7 +95,10 @@ private:

    future<> migrate_legacy_metadata();

-    future<> create_default_role_if_missing();
+    future<> legacy_create_default_role_if_missing();
+
+    future<> maybe_create_default_role();
+    future<> maybe_create_default_role_with_retries();

    future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);

--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -37,8 +37,8 @@ class transitional_authenticator : public authenticator {
 public:
    static const sstring PASSWORD_AUTHENTICATOR_NAME;

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm)) {
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, hashing_worker)) {
    }
    transitional_authenticator(std::unique_ptr<authenticator> a)
            : _authenticator(std::move(a)) {
@@ -239,7 +239,8 @@ static const class_registrator<
        auth::transitional_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");

 static const class_registrator<
        auth::authorizer,
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -139,7 +139,7 @@ private:
    // size must not be zero.
    [[gnu::always_inline]]
    value_type* alloc(size_type size) {
-        if (__builtin_expect(size <= current_space_left(), true)) {
+        if (size <= current_space_left()) [[likely]] {
            auto ret = _current->data + _current->frag_size;
            _current->frag_size += size;
            _size += size;
@@ -249,7 +249,7 @@ public:
        }

        auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (__builtin_expect(this_size, true)) {
+        if (this_size) [[likely]] {
            memcpy(_current->data + _current->frag_size, v.begin(), this_size);
            _current->frag_size += this_size;
            _size += this_size;
@@ -268,6 +268,14 @@ public:
        write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
    }

+    // Writes the fragmented view
+    template<FragmentedView View>
+    void write(View v) {
+        for (bytes_view f : fragment_range(v)) {
+            write(f);
+        }
+    }
+
    bool is_linearized() const {
        return !_begin || !_begin->next;
    }
--- a/cdc/cdc_partitioner.cc
+++ b/cdc/cdc_partitioner.cc
@@ -12,7 +12,7 @@
 #include "sstables/key.hh"
 #include "utils/class_registrator.hh"
 #include "cdc/generation.hh"
-#include "keys.hh"
+#include "keys/keys.hh"

 namespace cdc {

--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -16,7 +16,7 @@

 #include "gms/endpoint_state.hh"
 #include "gms/versioned_value.hh"
-#include "keys.hh"
+#include "keys/keys.hh"
 #include "replica/database.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -158,7 +158,7 @@ public:
        });
    }

-    void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+    void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
        if (schema.cdc_options().enabled()) {
            auto& db = _ctxt._proxy.get_db().local();
            auto logname = log_name(schema.cf_name());
@@ -175,7 +175,7 @@ public:
        }
    }

-    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
        bool is_cdc = new_schema.cdc_options().enabled();
        bool was_cdc = old_schema.cdc_options().enabled();

@@ -216,7 +216,7 @@ public:
        }
    }

-    void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+    void on_before_drop_column_family(const schema& schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
        auto logname = log_name(schema.cf_name());
        auto& db = _ctxt._proxy.get_db().local();
        auto has_cdc_log = db.has_schema(schema.ks_name(), logname);
@@ -231,15 +231,15 @@ public:
        }
    }

-    future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>> augment_mutation_call(
+    future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>> augment_mutation_call(
        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations,
+        utils::chunked_vector<mutation>&& mutations,
        tracing::trace_state_ptr tr_state,
        db::consistency_level write_cl
    );

    template<typename Iter>
-    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
+    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, utils::chunked_vector<mutation>&);

 private:
    static void check_for_attempt_to_create_nested_cdc_log(replica::database& db, const schema& schema) {
@@ -960,8 +960,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given value for the given row.
    void set_value(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes_view& value) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
-        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
    }

    // Each regular and static column in the base schema has a corresponding column in the log schema
@@ -969,7 +973,13 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to `true` for the given row. If not called, the column will be `null`.
    void set_deleted(const clustering_key& log_ck, const column_definition& base_cdef) {
-        _log_mut.set_cell(log_ck, log_data_column_deleted_name_bytes(base_cdef.name()), data_value(true), _ts, _ttl);
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*log_cdef.type, _ts, log_cdef.type->decompose(true), _ttl));
    }

    // Each regular and static non-atomic column in the base schema has a corresponding column in the log schema
@@ -978,7 +988,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given set of keys for the given row.
    void set_deleted_elements(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes& deleted_elements) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

@@ -1461,7 +1476,7 @@ private:
    row_states_map _clustering_row_states;
    cell_map _static_row_state;

-    std::vector<mutation> _result_mutations;
+    utils::chunked_vector<mutation> _result_mutations;
    std::optional<log_mutation_builder> _builder;

    // When enabled, process_change will update _clustering_row_states and _static_row_state
@@ -1591,8 +1606,8 @@ public:

    // Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
    // The `transformer` object on which this method was called on should not be used anymore.
-    std::tuple<std::vector<mutation>, stats::part_type_set> finish() && {
-        return std::make_pair<std::vector<mutation>, stats::part_type_set>(std::move(_result_mutations), std::move(_touched_parts));
+    std::tuple<utils::chunked_vector<mutation>, stats::part_type_set> finish() && {
+        return std::make_tuple(std::move(_result_mutations), std::move(_touched_parts)); // both members are moved out; builds the declared tuple directly rather than via a pair
    }

    static db::timeout_clock::time_point default_timeout() {
@@ -1763,8 +1778,8 @@ public:
 };

 template <typename Func>
-future<std::vector<mutation>>
-transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
+future<utils::chunked_vector<mutation>>
+transform_mutations(utils::chunked_vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
    return parallel_for_each(
            boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
            std::forward<Func>(f))
@@ -1773,8 +1788,8 @@ transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_siz

 } // namespace cdc

-future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
-cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
+cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
    // we do all this because in the case of batches, we can have mixed schemas.
    auto e = mutations.end();
    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
@@ -1782,14 +1797,14 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
    });

    if (i == e) {
-        return make_ready_future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), lw_shared_ptr<cdc::operation_result_tracker>()));
+        return make_ready_future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), lw_shared_ptr<cdc::operation_result_tracker>()));
    }

    tracing::trace(tr_state, "CDC: Started generating mutations for log rows");
    mutations.reserve(2 * mutations.size());

    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{},
-            [this, tr_state = std::move(tr_state), write_cl] (std::vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
+            [this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
        return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl] (int idx) mutable {
            auto& m = mutations[idx];
            auto s = m.schema();
@@ -1849,21 +1864,26 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                tracing::trace(tr_state, "CDC: Generated {} log mutations from {}", generated_count, mutations[idx].decorated_key());
                details.touched_parts.add(touched_parts);
            });
-        }).then([this, tr_state, &details](std::vector<mutation> mutations) {
+        }).then([this, tr_state, &details](utils::chunked_vector<mutation> mutations) {
            tracing::trace(tr_state, "CDC: Finished generating all log mutations");
            auto tracker = make_lw_shared<cdc::operation_result_tracker>(_ctxt._proxy.get_cdc_stats(), details);
-            return make_ready_future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), std::move(tracker)));
+            return make_ready_future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>(std::make_tuple(std::move(mutations), std::move(tracker)));
        });
    });
 }

-bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
+bool cdc::cdc_service::needs_cdc_augmentation(const utils::chunked_vector<mutation>& mutations) const { // true iff at least one mutation's schema has CDC enabled
    return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
        return m.schema()->cdc_options().enabled();
    });
 }

-future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
-cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
+cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+    if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) { // injection point: when entered, wait 100ms and then delegate exactly as the plain path below
+        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
+            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+        });
+    }
    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
 }
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -75,13 +75,13 @@ public:
    // appropriate augments to set the log entries.
    // Iff post-image is enabled for any of these, a non-empty callback is also
    // returned to be invoked post the mutation query.
-    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
+    future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations,
+        utils::chunked_vector<mutation>&& mutations,
        tracing::trace_state_ptr tr_state,
        db::consistency_level write_cl
        );
-    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
+    bool needs_cdc_augmentation(const utils::chunked_vector<mutation>&) const;
 };

 struct db_context final {
--- a/cmake/Findkmip.cmake
+++ b/cmake/Findkmip.cmake
@@ -20,8 +20,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
  set(kmip_arch "aarch64")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64")
  set(kmip_arch "64")
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(powerpc|ppc)64le")
-  set(kmip_arch "ppc64le")
 endif()

 set(kmip_ROOT "${PROJECT_SOURCE_DIR}/kmipc/kmipc-${kmip_ver}-${kmip_distrib}_${kmip_arch}")
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -80,7 +80,7 @@ function(get_padded_dynamic_linker_option output length)
  endif()
  # prefixing a path with "/"s does not actually change it means
  pad_at_begin(padded_dynamic_linker "/" "${dynamic_linker}" ${length})
-  set(${output} "${dynamic_linker_option}=${padded_dynamic_linker}" PARENT_SCOPE)
+  set(${output} "--dynamic-linker=${padded_dynamic_linker}" PARENT_SCOPE)
 endfunction()

 # We want to strip the absolute build paths from the binary,
@@ -147,13 +147,20 @@ macro(update_build_flags config)
  endif()
  string(TOUPPER ${config} CONFIG)
  set(cxx_flags "CMAKE_CXX_FLAGS_${CONFIG}")
+  set(linker_flags "CMAKE_EXE_LINKER_FLAGS_${CONFIG}")
  string(APPEND ${cxx_flags}
    " -O${parsed_args_OPTIMIZATION_LEVEL}")
  if(parsed_args_WITH_DEBUG_INFO)
    string(APPEND ${cxx_flags} " -g -gz")
+  else()
+    # If Scylla is compiled without debug info, strip the debug symbols from
+    # the result in case one of the linked static libraries happens to have
+    # some debug symbols. See issue #23834.
+    string(APPEND ${linker_flags} " -Wl,--strip-debug")
  endif()
  unset(CONFIG)
  unset(cxx_flags)
+  unset(linker_flags)
 endmacro()

 set(pgo_opts "")
@@ -287,7 +294,7 @@ else()
  # that. The 512 includes the null at the end, hence the 511 below.
  get_padded_dynamic_linker_option(dynamic_linker_option 511)
 endif()
-add_link_options("${dynamic_linker_option}")
+add_link_options("LINKER:${dynamic_linker_option}")

 if(Scylla_ENABLE_LTO)
  include(CheckIPOSupported)
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -54,6 +54,62 @@
 #include "replica/database.hh"
 #include "timestamp.hh"

+// Tombstone-GC predicates that disregard both arguments and answer a constant (true / false).
+can_gc_fn always_gc = [] (tombstone, is_shadowable) { return true; };
+can_gc_fn never_gc = [] (tombstone, is_shadowable) { return false; };
+// Purge-limit callbacks that disregard the key: api::max_timestamp for "always", api::min_timestamp for "never".
+max_purgeable_fn can_always_purge = [] (const dht::decorated_key&, is_shadowable) -> max_purgeable { return max_purgeable(api::max_timestamp); };
+max_purgeable_fn can_never_purge = [] (const dht::decorated_key&, is_shadowable) -> max_purgeable { return max_purgeable(api::min_timestamp); };
+
+max_purgeable& max_purgeable::combine(max_purgeable other) {
+    if (!other) {
+        return *this;
+    }
+    if (!*this) {
+        *this = std::move(other);
+        return *this;
+    }
+    // Both sides convert to true from here on: keep the smaller timestamp together with the source it came from.
+    if (_timestamp > other._timestamp) {
+        _source = other._source;
+        _timestamp = other._timestamp;
+    }
+    // The expiry threshold is kept only when both sides have one (the smaller wins); otherwise it is cleared.
+    if (_expiry_threshold && other._expiry_threshold) {
+        _expiry_threshold = std::min(*_expiry_threshold, *other._expiry_threshold);
+    } else {
+        _expiry_threshold = std::nullopt;
+    }
+    // Returns *this so calls can be chained.
+    return *this;
+}
+
+max_purgeable::can_purge_result max_purgeable::can_purge(tombstone t) const {
+    if (!*this) { // *this converts to false: hand back an empty-braced result
+        return { };
+    }
+    return { // purgeable if deleted before the expiry threshold (time_point::min() stands in when there is none) or written before _timestamp
+        .can_purge = (t.deletion_time < _expiry_threshold.value_or(gc_clock::time_point::min()) || t.timestamp < _timestamp),
+        .timestamp_source = _source,
+    };
+}
+
+auto fmt::formatter<max_purgeable::timestamp_source>::format(max_purgeable::timestamp_source s, fmt::format_context& ctx) const -> decltype(ctx.out()) {
+    // Writes the enumerator's own name. There is deliberately no `default:` label, so the compiler can still flag an unhandled enumerator.
+    switch (s) {
+        case max_purgeable::timestamp_source::none: return format_to(ctx.out(), "none");
+        case max_purgeable::timestamp_source::memtable_possibly_shadowing_data: return format_to(ctx.out(), "memtable_possibly_shadowing_data");
+        case max_purgeable::timestamp_source::other_sstables_possibly_shadowing_data: return format_to(ctx.out(), "other_sstables_possibly_shadowing_data");
+    }
+    // A value matching no case above would otherwise flow off the end of this non-void function (undefined behaviour).
+    return fmt::format_to(ctx.out(), "unknown({})", static_cast<int>(s));
+}
+
+auto fmt::formatter<max_purgeable>::format(max_purgeable mp, fmt::format_context& ctx) const -> decltype(ctx.out()) {
+    const sstring expiry_str = mp.expiry_threshold() ? fmt::format("{}", mp.expiry_threshold()->time_since_epoch().count()) : "nullopt"; // raw tick count of the threshold, or "nullopt" when it is absent
+    return format_to(ctx.out(), "max_purgeable{{timestamp={}, expiry_threshold={}, source={}}}", mp.timestamp(), expiry_str, mp.source());
+}
+
 namespace sstables {

 bool is_eligible_for_compaction(const shared_sstable& sst) noexcept {
@@ -135,20 +191,25 @@ std::string_view to_string(compaction_type_options::scrub::quarantine_mode quara
    return "(invalid)";
 }

-static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
+static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& table_s, sstable_set::incremental_selector& selector,
        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
        const api::timestamp_type compacting_max_timestamp, const bool gc_check_only_compacting_sstables, const is_shadowable is_shadowable) {
    if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
-        return api::min_timestamp;
+        clogger.trace("get_max_purgeable_timestamp {}.{}: tombstone_gc_enabled=false, returning min_timestamp",
+                table_s.schema()->ks_name(), table_s.schema()->cf_name());
+        return max_purgeable(api::min_timestamp);
    }

    auto timestamp = api::max_timestamp;
    if (gc_check_only_compacting_sstables) {
        // If gc_check_only_compacting_sstables is enabled, do not
        // check memtables and other sstables not being compacted.
-        return timestamp;
+        clogger.trace("get_max_purgeable_timestamp {}.{}: gc_check_only_compacting_sstables=true, returning max_timestamp",
+                table_s.schema()->ks_name(), table_s.schema()->cf_name());
+        return max_purgeable(timestamp);
    }

+    auto source = max_purgeable::timestamp_source::none;
    api::timestamp_type memtable_min_timestamp;
    if (is_shadowable) {
        // For shadowable tombstones, check the minimum live row_marker timestamp
@@ -166,7 +227,8 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
        // See https://github.com/scylladb/scylladb/issues/20423
        memtable_min_timestamp = table_s.min_memtable_live_timestamp();
    }
-    clogger.trace("memtable_min_timestamp={} compacting_max_timestamp={} memtable_has_key={} is_shadowable={} min_memtable_live_timestamp={} min_memtable_live_row_marker_timestamp={}",
+    clogger.trace("get_max_purgeable_timestamp {}.{}: memtable_min_timestamp={} compacting_max_timestamp={} memtable_has_key={} is_shadowable={} min_memtable_live_timestamp={} min_memtable_live_row_marker_timestamp={}",
+            table_s.schema()->ks_name(), table_s.schema()->cf_name(),
            memtable_min_timestamp, compacting_max_timestamp, table_s.memtable_has_key(dk), is_shadowable, table_s.min_memtable_live_timestamp(), table_s.min_memtable_live_row_marker_timestamp());
    // Use memtable timestamp if it contains live data older than the sstables being compacted,
    // and if the memtable also contains the key we're calculating max purgeable timestamp for.
@@ -174,6 +236,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
    // newer data.
    if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
        timestamp = memtable_min_timestamp;
+        source = max_purgeable::timestamp_source::memtable_possibly_shadowing_data;
    }
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -217,12 +280,13 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
        if (sst->filter_has_key(*hk)) {
            bloom_filter_checks++;
            timestamp = min_timestamp;
+            source = max_purgeable::timestamp_source::other_sstables_possibly_shadowing_data;
        }
    }
-    return timestamp;
+    return max_purgeable(timestamp, source);
 }

-static std::vector<shared_sstable> get_uncompacting_sstables(const table_state& table_s, std::vector<shared_sstable> sstables) {
+static std::vector<shared_sstable> get_uncompacting_sstables(const compaction_group_view& table_s, std::vector<shared_sstable> sstables) {
    auto sstable_set = table_s.sstable_set_for_tombstone_gc();
    auto all_sstables = *sstable_set->all() | std::ranges::to<std::vector>();
    auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
@@ -235,17 +299,23 @@ static std::vector<shared_sstable> get_uncompacting_sstables(const table_state&
    return not_compacted_sstables;
 }

+static std::vector<basic_info> extract_basic_info_from_sstables(const std::vector<shared_sstable>& sstables) {
+    return sstables | std::views::transform([] (auto&& sst) { // one basic_info (generation, origin, on-disk size) per input sstable, in input order
+        return sstables::basic_info{.generation = sst->generation(), .origin = sst->get_origin(), .size = sst->bytes_on_disk()};
+    }) | std::ranges::to<std::vector<basic_info>>();
+}
+
 class compaction;

 class compaction_write_monitor final : public sstables::write_monitor, public backlog_write_progress_manager {
    sstables::shared_sstable _sst;
-    table_state& _table_s;
+    compaction_group_view& _table_s;
    const sstables::writer_offset_tracker* _tracker = nullptr;
    uint64_t _progress_seen = 0;
    api::timestamp_type _maximum_timestamp;
    unsigned _sstable_level;
 public:
-    compaction_write_monitor(sstables::shared_sstable sst, table_state& table_s, api::timestamp_type max_timestamp, unsigned sstable_level)
+    compaction_write_monitor(sstables::shared_sstable sst, compaction_group_view& table_s, api::timestamp_type max_timestamp, unsigned sstable_level)
        : _sst(sst)
        , _table_s(table_s)
        , _maximum_timestamp(max_timestamp)
@@ -381,7 +451,7 @@ using use_backlog_tracker = bool_class<class use_backlog_tracker_tag>;
 struct compaction_read_monitor_generator final : public read_monitor_generator {
    class compaction_read_monitor final : public  sstables::read_monitor, public backlog_read_progress_manager {
        sstables::shared_sstable _sst;
-        table_state& _table_s;
+        compaction_group_view& _table_s;
        const sstables::reader_position_tracker* _tracker = nullptr;
        uint64_t _last_position_seen = 0;
        use_backlog_tracker _use_backlog_tracker;
@@ -414,7 +484,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
            _sst = {};
        }

-        compaction_read_monitor(sstables::shared_sstable sst, table_state& table_s, use_backlog_tracker use_backlog_tracker)
+        compaction_read_monitor(sstables::shared_sstable sst, compaction_group_view& table_s, use_backlog_tracker use_backlog_tracker)
            : _sst(std::move(sst)), _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) { }

        ~compaction_read_monitor() {
@@ -433,7 +503,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
        return p.first->second;
    }

-    explicit compaction_read_monitor_generator(table_state& table_s, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
+    explicit compaction_read_monitor_generator(compaction_group_view& table_s, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
        : _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) {}

    uint64_t compacted() const {
@@ -449,7 +519,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
        }
    }
 private:
-    table_state& _table_s;
+    compaction_group_view& _table_s;
    std::unordered_map<generation_type, compaction_read_monitor> _generated_monitors;
    use_backlog_tracker _use_backlog_tracker;

@@ -477,12 +547,13 @@ uint64_t compaction_progress_monitor::get_progress() const {
 class compaction {
 protected:
    compaction_data& _cdata;
-    table_state& _table_s;
+    compaction_group_view& _table_s;
    const compaction_sstable_creator_fn _sstable_creator;
    const schema_ptr _schema;
    const reader_permit _permit;
    std::vector<shared_sstable> _sstables;
    std::vector<generation_type> _input_sstable_generations;
+    std::vector<basic_info> _input_sstables_basic_info;
    // Unused sstables are tracked because if compaction is interrupted we can only delete them.
    // Deleting used sstables could potentially result in data loss.
    std::unordered_set<shared_sstable> _new_partial_sstables;
@@ -501,6 +572,7 @@ protected:
    double _estimated_droppable_tombstone_ratio = 0;
    uint64_t _bloom_filter_checks = 0;
    combined_reader_statistics _reader_statistics;
+    tombstone_purge_stats _tombstone_purge_stats;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
    const bool _can_split_large_partition = false;
@@ -525,6 +597,7 @@ protected:
    utils::observable<> _stop_request_observable;
    // optional tombstone_gc_state that is used when gc has to check only the compacting sstables to collect tombstones.
    std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
+    int64_t _output_repaired_at = 0;
 private:
    // Keeps track of monitors for input sstable.
    // If _update_backlog_tracker is set to true, monitors are responsible for adjusting backlog as compaction progresses.
@@ -554,7 +627,7 @@ private:
        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
    }
 protected:
-    compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
+    compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
        : _cdata(init_compaction_data(cdata, descriptor))
        , _table_s(table_s)
        , _sstable_creator(std::move(descriptor.creator))
@@ -607,6 +680,7 @@ protected:
    }

    void finish_new_sstable(compaction_writer* writer) {
+        writer->writer.set_repaired_at(_output_repaired_at);
        writer->writer.consume_end_of_stream();
        writer->sst->open_data().get();
        _end_size += writer->sst->bytes_on_disk();
@@ -762,7 +836,7 @@ private:
            return dht::to_partition_range(*r);
        };

-        return make_flat_multi_range_reader(_schema, _permit, std::move(source),
+        return make_multi_range_reader(_schema, _permit, std::move(source),
                                            std::move(owned_range_generator),
                                            _schema->full_slice(),
                                            tracing::trace_state_ptr());
@@ -783,12 +857,19 @@ private:

        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
+        _input_sstables_basic_info.reserve(_sstables.size());
+        int64_t repaired_at = 0;
+        std::vector<int64_t> repaired_at_for_compacted_sstables;
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
            auto& sst_stats = sst->get_stats_metadata();
+            repaired_at_for_compacted_sstables.push_back(sst_stats.repaired_at);
+            repaired_at = std::max(sst_stats.repaired_at, repaired_at);
            timestamp_tracker.update(sst_stats.min_timestamp);
            timestamp_tracker.update(sst_stats.max_timestamp);

+            _input_sstables_basic_info.emplace_back(sst->generation(), sst->get_origin(), sst->bytes_on_disk());
+
            // Compacted sstable keeps track of its ancestors.
            _input_sstable_generations.push_back(sst->generation());
            _start_size += sst->bytes_on_disk();
@@ -816,7 +897,11 @@ private:
                _rp = std::max(_rp, sst_stats.position);
            }
        }
-        log_info("{} [{}]", report_start_desc(), fmt::join(_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
+        log_debug("{} [{}]", report_start_desc(), fmt::join(_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
+        if (repaired_at) {
+            _output_repaired_at = repaired_at;
+        }
+        log_debug("repaired_at_vec={} output_repaired_at={}", repaired_at_for_compacted_sstables, _output_repaired_at);
        if (ssts->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->size(), _sstables.size());
@@ -842,7 +927,8 @@ private:
            });
        });
        const auto& gc_state = get_tombstone_gc_state();
-        return consumer(make_compacting_reader(setup_sstable_reader(), compaction_time, max_purgeable_func(), gc_state));
+        return consumer(make_compacting_reader(setup_sstable_reader(), compaction_time, max_purgeable_func(), gc_state,
+                                               streamed_mutation::forwarding::no, &_tombstone_purge_stats));
    }

    future<> consume() {
@@ -859,22 +945,24 @@ private:
                auto close_reader = deferred_close(reader);

                if (enable_garbage_collected_sstable_writer()) {
-                    using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, compacted_fragments_writer>;
+                    using compact_mutations = compact_for_compaction<compacted_fragments_writer, compacted_fragments_writer>;
                    auto cfc = compact_mutations(*schema(), now,
                        max_purgeable_func(),
                        get_tombstone_gc_state(),
                        get_compacted_fragments_writer(),
-                        get_gc_compacted_fragments_writer());
+                        get_gc_compacted_fragments_writer(),
+                        &_tombstone_purge_stats);

                    reader.consume_in_thread(std::move(cfc));
                    return;
                }
-                using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, noop_compacted_fragments_consumer>;
+                using compact_mutations = compact_for_compaction<compacted_fragments_writer, noop_compacted_fragments_consumer>;
                auto cfc = compact_mutations(*schema(), now,
                    max_purgeable_func(),
                    get_tombstone_gc_state(),
                    get_compacted_fragments_writer(),
-                    noop_compacted_fragments_consumer());
+                    noop_compacted_fragments_consumer(),
+                    &_tombstone_purge_stats);
                reader.consume_in_thread(std::move(cfc));
            });
        });
@@ -897,7 +985,7 @@ private:
    // if the derived compaction wants to opt in for this behavior, in addition
    // to overriding `make_interposer_consumer()`, it would have to override
    // `use_interposer_consumer()` so it returns true.
-    virtual reader_consumer_v2 make_interposer_consumer(reader_consumer_v2 end_consumer) {
+    virtual mutation_reader_consumer make_interposer_consumer(mutation_reader_consumer end_consumer) {
        return _table_s.get_compaction_strategy().make_interposer_consumer(_ms_metadata, std::move(end_consumer));
    }

@@ -907,13 +995,19 @@ private:
 protected:
    virtual compaction_result finish(std::chrono::time_point<db_clock> started_at, std::chrono::time_point<db_clock> ended_at) {
        compaction_result ret {
+            .shard_id = this_shard_id(),
+            .type = _type,
+            .sstables_in = std::move(_input_sstables_basic_info),
+            .sstables_out = extract_basic_info_from_sstables(_all_new_sstables),
            .new_sstables = std::move(_all_new_sstables),
            .stats {
+                .started_at = started_at,
                .ended_at = ended_at,
                .start_size = _start_size,
                .end_size = _end_size,
                .bloom_filter_checks = _bloom_filter_checks,
                .reader_statistics = std::move(_reader_statistics),
+                .tombstone_purge_stats = std::move(_tombstone_purge_stats),
            },
        };

@@ -928,7 +1022,7 @@ protected:
        // - add support to merge summary (message: Partition merge counts were {%s}.).
        // - there is no easy way, currently, to know the exact number of total partitions.
        // By the time being, using estimated key count.
-        log_info("{} {} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
+        log_debug("{} {} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(), _input_sstable_generations.size(),
                fmt::join(ret.new_sstables | std::views::transform([] (auto sst) { return to_string(sst, false); }), ","),
                utils::pretty_printed_data_size(_start_size), utils::pretty_printed_data_size(_end_size), int(ratio * 100),
@@ -1154,7 +1248,7 @@ void compacted_fragments_writer::consume_end_of_stream() {
 class regular_compaction : public compaction {
    seastar::semaphore _replacer_lock = {1};
 public:
-    regular_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
+    regular_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
        : compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker)
    {
    }
@@ -1296,7 +1390,7 @@ private:
        return bool(_replacer);
    }
 public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
+    reshape_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no) {
    }

@@ -1364,7 +1458,7 @@ public:

 class cleanup_compaction final : public regular_compaction {
 public:
-    cleanup_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
+    cleanup_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor)
    {
    }
@@ -1381,14 +1475,14 @@ public:
 class split_compaction final : public regular_compaction {
    compaction_type_options::split _options;
 public:
-    split_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::split options,
+    split_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::split options,
                         compaction_progress_monitor& progress_monitor)
            : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor)
            , _options(std::move(options))
    {
    }

-    reader_consumer_v2 make_interposer_consumer(reader_consumer_v2 end_consumer) override {
+    mutation_reader_consumer make_interposer_consumer(mutation_reader_consumer end_consumer) override {
        return [this, end_consumer = std::move(end_consumer)] (mutation_reader reader) mutable -> future<> {
            return mutation_writer::segregate_by_token_group(std::move(reader),
                    _options.classifier,
@@ -1640,7 +1734,7 @@ private:
    uint64_t _validation_errors = 0;

 public:
-    scrub_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
+    scrub_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
        , _options(options)
        , _scrub_start_description(fmt::format("Scrubbing in {} mode", _options.operation_mode))
@@ -1682,7 +1776,7 @@ public:
        }
    }

-    reader_consumer_v2 make_interposer_consumer(reader_consumer_v2 end_consumer) override {
+    mutation_reader_consumer make_interposer_consumer(mutation_reader_consumer end_consumer) override {
        if (!use_interposer_consumer()) {
            return end_consumer;
        }
@@ -1737,7 +1831,7 @@ private:
                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
    }
 public:
-    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
+    resharding_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
        : compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
        , _estimation_per_shard(smp::count)
        , _run_identifiers(smp::count)
@@ -1778,7 +1872,7 @@ public:

    }

-    reader_consumer_v2 make_interposer_consumer(reader_consumer_v2 end_consumer) override {
+    mutation_reader_consumer make_interposer_consumer(mutation_reader_consumer end_consumer) override {
        return [end_consumer = std::move(end_consumer)] (mutation_reader reader) mutable -> future<> {
            return mutation_writer::segregate_by_shard(std::move(reader), std::move(end_consumer));
        };
@@ -1852,9 +1946,9 @@ compaction_type compaction_type_options::type() const {
    return index_to_type[_options.index()];
 }

-static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
+static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
    struct {
-        table_state& table_s;
+        compaction_group_view& table_s;
        sstables::compaction_descriptor&& descriptor;
        compaction_data& cdata;
        compaction_progress_monitor& progress_monitor;
@@ -1885,7 +1979,7 @@ static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstable
    return descriptor.options.visit(visitor_factory);
 }

-static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, read_monitor_generator& monitor_generator) {
+static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, read_monitor_generator& monitor_generator) {
    auto schema = table_s.schema();
    auto permit = table_s.make_compaction_reader_permit();

@@ -1923,7 +2017,7 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    };
 }

-future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
+future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
    progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(table_s, use_backlog_tracker::no));
    auto d = defer([&] { progress_monitor.reset_generator(); });
    auto res = co_await scrub_sstables_validate_mode(descriptor, cdata, table_s, *progress_monitor._generator);
@@ -1931,7 +2025,7 @@ future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_desc
 }

 future<compaction_result>
-compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
+compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
    if (descriptor.sstables.empty()) {
        return make_exception_future<compaction_result>(std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}",
                compaction_name(descriptor.options.type()), table_s.schema()->ks_name(), table_s.schema()->cf_name())));
@@ -1945,7 +2039,7 @@ compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cd
 }

 std::unordered_set<sstables::shared_sstable>
-get_fully_expired_sstables(const table_state& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point compaction_time) {
+get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point compaction_time) {
    clogger.debug("Checking droppable sstables in {}.{}", table_s.schema()->ks_name(), table_s.schema()->cf_name());

    if (compacting.empty()) {
@@ -1953,6 +2047,8 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
    }

    std::unordered_set<sstables::shared_sstable> candidates;
+    // Note: This contains both repaired and unrepaired sstables, which means
+    // compaction consults both repaired and unrepaired sstables for tombstone gc.
    auto uncompacting_sstables = get_uncompacting_sstables(table_s, compacting);
    // Get list of uncompacting sstables that overlap the ones being compacted.
    std::vector<sstables::shared_sstable> overlapping = leveled_manifest::overlapping(*table_s.schema(), compacting, uncompacting_sstables);
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -11,11 +11,14 @@

 #include "readers/combined_reader_stats.hh"
 #include "sstables/shared_sstable.hh"
+#include "sstables/generation_type.hh"
 #include "compaction/compaction_descriptor.hh"
+#include "mutation/mutation_tombstone_stats.hh"
 #include "gc_clock.hh"
 #include "utils/UUID.hh"
-#include "table_state.hh"
+#include "compaction_group_view.hh"
 #include <seastar/core/abort_source.hh>
+#include "sstables/basic_info.hh"

 using namespace compaction;

@@ -72,6 +75,7 @@ struct compaction_data {
 };

 struct compaction_stats {
+    std::chrono::time_point<db_clock> started_at;
    std::chrono::time_point<db_clock> ended_at;
    uint64_t start_size = 0;
    uint64_t end_size = 0;
@@ -79,13 +83,16 @@ struct compaction_stats {
    // Bloom filter checks during max purgeable calculation
    uint64_t bloom_filter_checks = 0;
    combined_reader_statistics reader_statistics;
+    tombstone_purge_stats tombstone_purge_stats;

    compaction_stats& operator+=(const compaction_stats& r) {
+        started_at = std::max(started_at, r.started_at);
        ended_at = std::max(ended_at, r.ended_at);
        start_size += r.start_size;
        end_size += r.end_size;
        validation_errors += r.validation_errors;
        bloom_filter_checks += r.bloom_filter_checks;
+        tombstone_purge_stats += r.tombstone_purge_stats;
        return *this;
    }
    friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
@@ -96,6 +103,10 @@ struct compaction_stats {
 };

 struct compaction_result {
+    shard_id shard_id;
+    compaction_type type;
+    std::vector<sstables::basic_info> sstables_in;
+    std::vector<sstables::basic_info> sstables_out;
    std::vector<sstables::shared_sstable> new_sstables;
    compaction_stats stats;
 };
@@ -112,7 +123,7 @@ public:
    uint64_t get_progress() const;

    friend class compaction;
-    friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, table_state&, compaction_progress_monitor&);
+    friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, compaction_group_view&, compaction_progress_monitor&);
 };

 // Compact a list of N sstables into M sstables.
@@ -120,7 +131,7 @@ public:
 //
 // compaction_descriptor is responsible for specifying the type of compaction, and influencing
 // compaction behavior through its available member fields.
-future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor);
+future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor);

 // Return list of expired sstables for column family cf.
 // A sstable is fully expired *iff* its max_local_deletion_time precedes gc_before and its
@@ -128,7 +139,7 @@ future<compaction_result> compact_sstables(sstables::compaction_descriptor descr
 // In simpler words, a sstable is fully expired if all of its live cells with TTL is expired
 // and possibly doesn't contain any tombstone that covers cells in other sstables.
 std::unordered_set<sstables::shared_sstable>
-get_fully_expired_sstables(const table_state& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);
+get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);

 // For tests, can drop after we virtualize sstables.
 mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
--- a/compaction/compaction_fwd.hh
+++ b/compaction/compaction_fwd.hh
@@ -15,7 +15,7 @@

 namespace compaction {

-class table_state;
+class compaction_group_view;
 class strategy_control;
 struct compaction_state;

--- a/compaction/compaction_garbage_collector.hh
+++ b/compaction/compaction_garbage_collector.hh
@@ -22,7 +22,91 @@ using can_gc_fn = std::function<bool(tombstone, is_shadowable)>;
 extern can_gc_fn always_gc;
 extern can_gc_fn never_gc;

-using max_purgeable_fn = std::function<api::timestamp_type(const dht::decorated_key&, is_shadowable)>;
+// For the purposes of overlap with live data, a tombstone is purgeable if:
+//      tombstone.timestamp ∈ (-inf, max_purgeable._timestamp)
+//
+// The above overlap check can be omitted iff:
+//      tombstone.deletion_time ∈ (-inf, max_purgeable._expiry_threshold.value_or(gc_clock::time_point::min()))
+//
+// So in other words, a tombstone is purgeable iff:
+//      tombstone.deletion_time < max_purgeable._expiry_threshold.value_or(gc_clock::time_point::min()) || tombstone.timestamp < max_purgeable._timestamp
+//
+// See can_purge() for more details.
+class max_purgeable {
+public:
+    enum class timestamp_source {
+        none,
+        memtable_possibly_shadowing_data,
+        other_sstables_possibly_shadowing_data
+    };
+
+    using expiry_threshold_opt = std::optional<gc_clock::time_point>;
+
+private:
+    api::timestamp_type _timestamp { api::missing_timestamp };
+    expiry_threshold_opt _expiry_threshold;
+    timestamp_source _source { timestamp_source::none };
+
+public:
+    max_purgeable() = default;
+    explicit max_purgeable(api::timestamp_type timestamp, timestamp_source source = timestamp_source::none)
+        : _timestamp(timestamp), _source(source)
+    { }
+    explicit max_purgeable(api::timestamp_type timestamp, expiry_threshold_opt expiry_threshold, timestamp_source source = timestamp_source::none)
+        : _timestamp(timestamp), _expiry_threshold(expiry_threshold), _source(source)
+    { }
+
+    operator bool() const { return _timestamp != api::missing_timestamp; }
+    bool operator==(const max_purgeable&) const = default;
+    bool operator!=(const max_purgeable&) const = default;
+
+    api::timestamp_type timestamp() const noexcept { return _timestamp; }
+    expiry_threshold_opt expiry_threshold() const noexcept { return _expiry_threshold; }
+    timestamp_source source() const noexcept { return _source; }
+
+    max_purgeable& combine(max_purgeable other);
+
+    struct can_purge_result {
+        bool can_purge { true };
+        timestamp_source timestamp_source { timestamp_source::none };
+
+        // can purge?
+        operator bool() const noexcept {
+            return can_purge;
+        }
+        bool operator!() const noexcept {
+            return !can_purge;
+        }
+    };
+
+    // Determines whether the tombstone can be purged.
+    //
+    // If available, the expiry threshold is used to maybe elide the overlap
+    // check against the min live timestamp. The overlap check elision is
+    // possible if the tombstone's deletion time is less than the expiry threshold
+    // or in other words: the tombstone was already expired when the data
+    // source(s) represented by this max_purgeable were created. Consequently,
+    // all writes in these data sources arrived *after* the tombstone was already
+    // expired and hence it is not relevant to these writes, even if they
+    // otherwise overlap with the tombstone's timestamp.
+    //
+    // The overlap check elision is an optimization; checking whether a tombstone
+    // can be purged by just looking at the timestamps is still correct (but
+    // stricter).
+    can_purge_result can_purge(tombstone) const;
+};
+
+template <>
+struct fmt::formatter<max_purgeable::timestamp_source> : fmt::formatter<string_view> {
+    auto format(max_purgeable::timestamp_source, fmt::format_context& ctx) const -> decltype(ctx.out());
+};
+
+template <>
+struct fmt::formatter<max_purgeable> : fmt::formatter<string_view> {
+    auto format(max_purgeable, fmt::format_context& ctx) const -> decltype(ctx.out());
+};
+
+using max_purgeable_fn = std::function<max_purgeable(const dht::decorated_key&, is_shadowable)>;

 extern max_purgeable_fn can_always_purge;
 extern max_purgeable_fn can_never_purge;
--- a/compaction/compaction_group_view.hh
+++ b/compaction/compaction_group_view.hh
@@ -30,16 +30,16 @@ class compaction_strategy_state;

 namespace compaction {

-class table_state {
+class compaction_group_view {
 public:
-    virtual ~table_state() {}
+    virtual ~compaction_group_view() {}
    virtual dht::token_range token_range() const noexcept = 0;
    virtual const schema_ptr& schema() const noexcept = 0;
    // min threshold as defined by table.
    virtual unsigned min_compaction_threshold() const noexcept = 0;
    virtual bool compaction_enforce_min_threshold() const noexcept = 0;
-    virtual const sstables::sstable_set& main_sstable_set() const = 0;
-    virtual const sstables::sstable_set& maintenance_sstable_set() const = 0;
+    virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
+    virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
    virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
    virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
    virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
@@ -61,6 +61,7 @@ public:
    virtual const std::string get_group_id() const noexcept = 0;
    virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
    virtual dht::token_range get_token_range_after_split(const dht::token& t) const noexcept = 0;
+    virtual int64_t get_sstables_repaired_at() const noexcept = 0;
 };

 } // namespace compaction
@@ -68,9 +69,9 @@ public:
 namespace fmt {

 template <>
-struct formatter<compaction::table_state> : formatter<string_view> {
+struct formatter<compaction::compaction_group_view> : formatter<string_view> {
    template <typename FormatContext>
-    auto format(const compaction::table_state& t, FormatContext& ctx) const {
+    auto format(const compaction::compaction_group_view& t, FormatContext& ctx) const {
        auto s = t.schema();
        return fmt::format_to(ctx.out(), "{}.{} compaction_group={}", s->ks_name(), s->cf_name(), t.get_group_id());
    }
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -16,6 +16,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/condition-variable.hh>
+#include <seastar/core/rwlock.hh>
 #include "sstables/shared_sstable.hh"
 #include "utils/exponential_backoff_retry.hh"
 #include "utils/updateable_value.hh"
@@ -33,10 +34,11 @@
 #include "sstables/exceptions.hh"
 #include "tombstone_gc.hh"
 #include "utils/pluggable.hh"
+#include "compaction/compaction_reenabler.hh"

 namespace db {
-class system_keyspace;
 class compaction_history_entry;
+class system_keyspace;
 }

 namespace sstables { class test_env_compaction_manager; }
@@ -123,12 +125,12 @@ private:
    future<> _waiting_reevalution = make_ready_future<>();
    condition_variable _postponed_reevaluation;
    // tables that wait for compaction but had its submission postponed due to ongoing compaction.
-    std::unordered_set<compaction::table_state*> _postponed;
+    std::unordered_set<compaction::compaction_group_view*> _postponed;
    // tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
    // weight is value assigned to a compaction job that is log base N of total size of all input sstables.
    std::unordered_set<int> _weight_tracker;

-    std::unordered_map<compaction::table_state*, compaction_state> _compaction_state;
+    std::unordered_map<compaction::compaction_group_view*, compaction_state> _compaction_state;

    // Purpose is to serialize all maintenance (non regular) compaction activity to reduce aggressiveness and space requirement.
    // If the operation must be serialized with regular, then the per-table write lock must be taken.
@@ -160,14 +162,17 @@ private:
    class strategy_control;
    std::unique_ptr<strategy_control> _strategy_control;

-    per_table_history_maps _reconcile_history_maps;
+    shared_tombstone_gc_state _shared_tombstone_gc_state;
+    // TODO: tombstone_gc_state should now have value semantics, but the code
+    // still uses it with reference semantics (inconsistently though).
+    // Drop this member once the code is converted to use value semantics.
    tombstone_gc_state _tombstone_gc_state;
 private:
    // Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);

    // Return nullopt if compaction cannot be started
-    std::optional<gate::holder> start_compaction(table_state& t);
+    std::optional<gate::holder> start_compaction(compaction_group_view& t);

    template<typename TaskExecutor, typename... Args>
    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
@@ -177,14 +182,15 @@ private:
    }
    future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);

-    future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason) noexcept;
+    void stop_tasks(const std::vector<shared_ptr<compaction::compaction_task_executor>>& tasks, sstring reason) noexcept;
+    future<> await_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>>, bool task_stopped) const noexcept;
    future<> update_throughput(uint32_t value_mbs);

    // Return the largest fan-in of currently running compactions
    unsigned current_compaction_fan_in_threshold() const;

    // Return true if compaction can be initiated
-    bool can_register_compaction(compaction::table_state& t, int weight, unsigned fan_in) const;
+    bool can_register_compaction(compaction::compaction_group_view& t, int weight, unsigned fan_in) const;
    // Register weight for a table. Do that only if can_register_weight()
    // returned true.
    void register_weight(int weight);
@@ -192,14 +198,14 @@ private:
    void deregister_weight(int weight);

    // Get candidates for compaction strategy, which are all sstables but the ones being compacted.
-    std::vector<sstables::shared_sstable> get_candidates(compaction::table_state& t) const;
+    future<std::vector<sstables::shared_sstable>> get_candidates(compaction::compaction_group_view& t) const;

    bool eligible_for_compaction(const sstables::shared_sstable& sstable) const;
    bool eligible_for_compaction(const sstables::frozen_sstable_run& sstable_run) const;

    template <std::ranges::range Range>
    requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
-    std::vector<std::ranges::range_value_t<Range>> get_candidates(table_state& t, const Range& sstables) const;
+    std::vector<std::ranges::range_value_t<Range>> get_candidates(compaction_group_view& t, const Range& sstables) const;

    template <std::ranges::range Range>
    requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
@@ -211,23 +217,23 @@ private:

    // gets the table's compaction state
    // throws std::out_of_range exception if not found.
-    compaction_state& get_compaction_state(compaction::table_state* t);
-    const compaction_state& get_compaction_state(compaction::table_state* t) const {
+    compaction_state& get_compaction_state(compaction::compaction_group_view* t);
+    const compaction_state& get_compaction_state(compaction::compaction_group_view* t) const {
        return const_cast<compaction_manager*>(this)->get_compaction_state(t);
    }

    // Return true if compaction manager is enabled and
    // table still exists and compaction is not disabled for the table.
-    inline bool can_proceed(compaction::table_state* t) const;
+    inline bool can_proceed(compaction::compaction_group_view* t) const;

    future<> postponed_compactions_reevaluation();
    void reevaluate_postponed_compactions() noexcept;
    // Postpone compaction for a table that couldn't be executed due to ongoing
    // similar-sized compaction.
-    void postpone_compaction_for_table(compaction::table_state* t);
+    void postpone_compaction_for_table(compaction::compaction_group_view* t);

    using quarantine_invalid_sstables = sstables::compaction_type_options::scrub::quarantine_invalid_sstables;
-    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables);
+    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::compaction_group_view& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables);
    future<> update_static_shares(float shares);

    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -237,9 +243,10 @@ private:
    template<typename TaskType, typename... Args>
    requires std::derived_from<TaskType, compaction_task_executor> &&
            std::derived_from<TaskType, compaction_task_impl>
-    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(tasks::task_info info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);

-    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
+    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(sstring reason, tasks::task_info info, compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
+
+    future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
                                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");

    // Stop all fibers, without waiting. Safe to be called multiple times.
@@ -247,7 +254,7 @@ private:
    future<> really_do_stop() noexcept;

    // Propagate replacement of sstables to all ongoing compaction of a given table
-    void propagate_replacement(compaction::table_state& t, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added);
+    void propagate_replacement(compaction::compaction_group_view& t, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added);

    // This constructor is supposed to only be used for testing so lets be more explicit
    // about invoking it. Ref #10146
@@ -305,18 +312,18 @@ public:
    future<> get_compaction_history(compaction_history_consumer&& f);

    // Submit a table to be compacted.
-    void submit(compaction::table_state& t);
+    void submit(compaction::compaction_group_view& t);

    // Can regular compaction be performed in the given table
-    bool can_perform_regular_compaction(compaction::table_state& t);
+    bool can_perform_regular_compaction(compaction::compaction_group_view& t);

    // Maybe wait before adding more sstables
    // if there are too many sstables.
-    future<> maybe_wait_for_sstable_count_reduction(compaction::table_state& t);
+    future<> maybe_wait_for_sstable_count_reduction(compaction::compaction_group_view& t);

    // Submit a table to be off-strategy compacted.
    // Returns true iff off-strategy compaction was required and performed.
-    future<bool> perform_offstrategy(compaction::table_state& t, tasks::task_info info);
+    future<bool> perform_offstrategy(compaction::compaction_group_view& t, tasks::task_info info);

    // Submit a table to be cleaned up and wait for its termination.
    //
@@ -325,34 +332,34 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);
+    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, tasks::task_info info);
 private:
-    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);
+    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, tasks::task_info info);

    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
-    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
+    bool update_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);

-    future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
+    future<> on_compaction_completion(compaction_group_view& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
 public:
    // Submit a table to be upgraded and wait for its termination.
-    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, tasks::task_info info);
+    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, bool exclude_current_version, tasks::task_info info);

    // Submit a table to be scrubbed and wait for its termination.
-    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);
+    future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);

    // Submit a table for major compaction.
-    future<> perform_major_compaction(compaction::table_state& t, tasks::task_info info, bool consider_only_existing_data = false);
+    future<> perform_major_compaction(compaction::compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data = false);

    // Splits a compaction group by segregating all its sstable according to the classifier[1].
    // [1]: See sstables::compaction_type_options::splitting::classifier.
    // Returns when all sstables in the main sstable set are split. The only exception is shutdown
    // or user aborted splitting using stop API.
-    future<compaction_stats_opt> perform_split_compaction(compaction::table_state& t, sstables::compaction_type_options::split opt, tasks::task_info info);
+    future<compaction_stats_opt> perform_split_compaction(compaction::compaction_group_view& t, sstables::compaction_type_options::split opt, tasks::task_info info);

    // Splits a single SSTable by segregating all its data according to the classifier.
    // If SSTable doesn't need split, the same input SSTable is returned as output.
    // If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
-    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt);
+    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, sstables::compaction_type_options::split opt);

    // Run a custom job for a given table, defined by a function
    // it completes when future returned by job is ready or returns immediately
@@ -361,35 +368,19 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);
-
-    class compaction_reenabler {
-        compaction_manager& _cm;
-        compaction::table_state* _table;
-        compaction::compaction_state& _compaction_state;
-        gate::holder _holder;
-
-    public:
-        compaction_reenabler(compaction_manager&, compaction::table_state&);
-        compaction_reenabler(compaction_reenabler&&) noexcept;
-
-        ~compaction_reenabler();
-
-        compaction::table_state* compacting_table() const noexcept {
-            return _table;
-        }
-
-        const compaction::compaction_state& compaction_state() const noexcept {
-            return _compaction_state;
-        }
-    };
+    future<> run_custom_job(compaction::compaction_group_view& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);

    // Disable compaction temporarily for a table t.
    // Caller should call the compaction_reenabler::reenable
-    future<compaction_reenabler> stop_and_disable_compaction(compaction::table_state& t);
+    future<compaction_reenabler> stop_and_disable_compaction(sstring reason, compaction::compaction_group_view& t);
+
+    future<compaction_reenabler> await_and_disable_compaction(compaction::compaction_group_view& t);
+
+    future<seastar::rwlock::holder> get_incremental_repair_read_lock(compaction::compaction_group_view& t, const sstring& reason);
+    future<seastar::rwlock::holder> get_incremental_repair_write_lock(compaction::compaction_group_view& t, const sstring& reason);

    // Run a function with compaction temporarily disabled for a table T.
-    future<> run_with_compaction_disabled(compaction::table_state& t, std::function<future<> ()> func);
+    future<> run_with_compaction_disabled(compaction::compaction_group_view& t, std::function<future<> ()> func, sstring reason = "custom operation");

    void plug_system_keyspace(db::system_keyspace& sys_ks) noexcept;
    future<> unplug_system_keyspace() noexcept;
@@ -397,28 +388,40 @@ public:
    // Adds a table to the compaction manager.
    // Creates a compaction_state structure that can be used for submitting
    // compaction jobs of all types.
-    void add(compaction::table_state& t);
+    void add(compaction::compaction_group_view& t);
+    // Adds a group with compaction temporarily disabled. Compaction is only re-enabled
+    // when the returned compaction_reenabler is destroyed.
+    compaction_reenabler add_with_compaction_disabled(compaction::compaction_group_view& view);

    // Remove a table from the compaction manager.
    // Cancel requests on table and wait for possible ongoing compactions.
-    future<> remove(compaction::table_state& t, sstring reason = "table removal") noexcept;
+    future<> remove(compaction::compaction_group_view& t, sstring reason = "table removal") noexcept;

    const stats& get_stats() const {
        return _stats;
    }

-    const std::vector<sstables::compaction_info> get_compactions(compaction::table_state* t = nullptr) const;
+    const std::vector<sstables::compaction_info> get_compactions(compaction::compaction_group_view* t = nullptr) const;

    // Returns true if table has an ongoing compaction, running on its behalf
-    bool has_table_ongoing_compaction(const compaction::table_state& t) const;
+    bool has_table_ongoing_compaction(const compaction::compaction_group_view& t) const;

-    bool compaction_disabled(compaction::table_state& t) const;
+    bool compaction_disabled(compaction::compaction_group_view& t) const;

    // Stops ongoing compaction of a given type.
-    future<> stop_compaction(sstring type, compaction::table_state* table = nullptr);
+    future<> stop_compaction(sstring type, compaction::compaction_group_view* table = nullptr);

+private:
+    std::vector<shared_ptr<compaction_task_executor>>
+    do_stop_ongoing_compactions(sstring reason, compaction_group_view* t, std::optional<sstables::compaction_type> type_opt) noexcept;
+
+public:
    // Stops ongoing compaction of a given table and/or compaction_type.
-    future<> stop_ongoing_compactions(sstring reason, compaction::table_state* t = nullptr, std::optional<sstables::compaction_type> type_opt = {}) noexcept;
+    future<> stop_ongoing_compactions(sstring reason, compaction::compaction_group_view* t = nullptr, std::optional<sstables::compaction_type> type_opt = {}) noexcept;
+
+    future<> await_ongoing_compactions(compaction_group_view* t);
+
+    compaction_reenabler stop_and_disable_compaction_no_wait(compaction_group_view& t, sstring reason);

    double backlog() {
        return _backlog_manager.backlog();
@@ -427,29 +430,32 @@ public:
    void register_backlog_tracker(compaction_backlog_tracker& backlog_tracker) {
        _backlog_manager.register_backlog_tracker(backlog_tracker);
    }
-    void register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker);

-    compaction_backlog_tracker& get_backlog_tracker(compaction::table_state& t);
+    compaction_backlog_tracker& get_backlog_tracker(compaction::compaction_group_view& t);

    static sstables::compaction_data create_compaction_data();

    compaction::strategy_control& get_strategy_control() const noexcept;

-    tombstone_gc_state& get_tombstone_gc_state() noexcept {
-        return _tombstone_gc_state;
-    };
-
    const tombstone_gc_state& get_tombstone_gc_state() const noexcept {
        return _tombstone_gc_state;
    };

+    shared_tombstone_gc_state& get_shared_tombstone_gc_state() noexcept {
+        return _shared_tombstone_gc_state;
+    };
+
+    const shared_tombstone_gc_state& get_shared_tombstone_gc_state() const noexcept {
+        return _shared_tombstone_gc_state;
+    };
+
    // Uncoditionally erase sst from `sstables_requiring_cleanup`
    // Returns true iff sst was found and erased.
-    bool erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst);
+    bool erase_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst);

    // checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
-    bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
-    const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(table_state& t) const;
+    bool requires_cleanup(compaction_group_view& t, const sstables::shared_sstable& sst) const;
+    const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(compaction_group_view& t) const;

    friend class compacting_sstable_registration;
    friend class compaction_weight_registration;
@@ -465,6 +471,7 @@ public:
    friend class compaction::rewrite_sstables_compaction_task_executor;
    friend class compaction::cleanup_sstables_compaction_task_executor;
    friend class compaction::validate_sstables_compaction_task_executor;
+    friend compaction_reenabler;
 };

 namespace compaction {
@@ -489,7 +496,7 @@ public:
    };
 protected:
    compaction_manager& _cm;
-    ::compaction::table_state* _compacting_table = nullptr;
+    ::compaction::compaction_group_view* _compacting_table = nullptr;
    compaction::compaction_state& _compaction_state;
    sstables::compaction_data _compaction_data;
    state _state = state::none;
@@ -505,7 +512,7 @@ private:
    compaction_manager::compaction_stats_opt _stats = std::nullopt;

 public:
-    explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
+    explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::compaction_group_view* t, sstables::compaction_type type, sstring desc);

    compaction_task_executor(compaction_task_executor&&) = delete;
    compaction_task_executor(const compaction_task_executor&) = delete;
@@ -549,7 +556,7 @@ protected:
    future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
                                sstables::offstrategy offstrategy = sstables::offstrategy::no);
-    future<> update_history(::compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
+    future<> update_history(::compaction::compaction_group_view& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata);
    bool should_update_history(sstables::compaction_type ct) {
        return ct == sstables::compaction_type::Compaction;
    }
@@ -560,7 +567,7 @@ public:

    future<compaction_manager::compaction_stats_opt> run_compaction() noexcept;

-    const ::compaction::table_state* compacting_table() const noexcept {
+    const ::compaction::compaction_group_view* compacting_table() const noexcept {
        return _compacting_table;
    }

@@ -596,7 +603,7 @@ private:
        return _compaction_done.get_future();
    }

-    future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::table_state& t);
+    future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::compaction_group_view& t);
 public:
    bool stopping() const noexcept {
        return _compaction_data.abort.abort_requested();
@@ -617,7 +624,8 @@ public:
    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);
    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
    friend fmt::formatter<compaction_task_executor>;
-    friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) noexcept;
+    friend void compaction_manager::stop_tasks(const std::vector<shared_ptr<compaction_task_executor>>& tasks, sstring reason) noexcept;
+    friend future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_executor>>, bool task_stopped) const noexcept;
    friend sstables::test_env_compaction_manager;
 };

@@ -638,4 +646,4 @@ struct fmt::formatter<compaction::compaction_task_executor> {
 bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);

 // Return all sstables but those that are off-strategy like the ones in maintenance set and staging dir.
-std::vector<sstables::shared_sstable> in_strategy_sstables(compaction::table_state& table_s);
+future<std::vector<sstables::shared_sstable>> in_strategy_sstables(compaction::compaction_group_view& table_s);
--- a/compaction/compaction_reenabler.hh
+++ b/compaction/compaction_reenabler.hh
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+
+class compaction_manager;
+
+namespace compaction {
+    class compaction_group_view;
+    class compaction_state;
+}
+
+class compaction_reenabler {
+    compaction_manager& _cm;
+    compaction::compaction_group_view* _table;
+    compaction::compaction_state& _compaction_state;
+    seastar::gate::holder _holder;
+
+public:
+    compaction_reenabler(compaction_manager&, compaction::compaction_group_view&);
+    compaction_reenabler(compaction_reenabler&&) noexcept;
+
+    ~compaction_reenabler();
+
+    compaction::compaction_group_view* compacting_table() const noexcept {
+        return _table;
+    }
+
+    const compaction::compaction_state& compaction_state() const noexcept {
+        return _compaction_state;
+    }
+};
+
--- a/compaction/compaction_state.hh
+++ b/compaction/compaction_state.hh
@@ -19,28 +19,41 @@

 namespace compaction {

+// There's a 1:1 relationship between compaction_group_view and compaction_state.
+// Two or more compaction_group_view can be served by the same instance of sstables::sstable_set,
+// so it's not safe to track any sstable state here.
 struct compaction_state {
    // Used both by compaction tasks that refer to the compaction_state
    // and by any function running under run_with_compaction_disabled().
    seastar::named_gate gate;

-    // Prevents table from running major and minor compaction at the same time.
+    // Used for synchronizing selection of sstable for compaction.
+    // Write lock is held when getting sstable list, feeding them into strategy, and registering compacting sstables.
+    // The lock prevents two concurrent compaction tasks from picking the same sstables. And it also helps major
+    // to synchronize with minor, such that major doesn't miss any sstable.
    seastar::rwlock lock;

+    // Compactions like major need to work on all sstables in the unrepaired
+    // set, no matter if the sstable is being repaired or not. The
+    // incremental_repair_lock is introduced to serialize repair and such
+    // compactions. This lock guarantees that no sstables are being repaired.
+    // Note that the minor compactions do not need to take this lock because
+    // they ignore sstables that are being repaired.
+    seastar::rwlock incremental_repair_lock;
+
    // Raised by any function running under run_with_compaction_disabled();
    long compaction_disabled_counter = 0;

    // Signaled whenever a compaction task completes.
    condition_variable compaction_done;

-    std::optional<compaction_backlog_tracker> backlog_tracker;
-
+    // Used only with vnodes, will not work with tablets. Can be removed once vnodes are gone.
    std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
    compaction::owned_ranges_ptr owned_ranges_ptr;

    gc_clock::time_point last_regular_compaction;

-    explicit compaction_state(table_state& t);
+    explicit compaction_state(compaction_group_view& t);
    compaction_state(compaction_state&&) = delete;
    ~compaction_state();

--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -46,7 +46,7 @@ compaction_descriptor compaction_strategy_impl::make_major_compaction_job(std::v
    return compaction_descriptor(std::move(candidates), level, max_sstable_bytes);
 }

-std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
+std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
    // The default implementation is suboptimal and causes the writeamp problem described issue in #10097.
    // The compaction strategy relying on it should strive to implement its own method, to make cleanup bucket aware.
    return candidates | std::views::transform([] (const shared_sstable& sst) {
@@ -55,7 +55,7 @@ std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compact
    }) | std::ranges::to<std::vector>();
 }

-bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t) {
+bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const compaction_group_view& t) {
    if (_disable_tombstone_compaction) {
        return false;
    }
@@ -77,7 +77,7 @@ uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_sour
    return partition_estimate;
 }

-reader_consumer_v2 compaction_strategy_impl::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const {
+mutation_reader_consumer compaction_strategy_impl::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
    return end_consumer;
 }

@@ -581,12 +581,12 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
 //
 class null_compaction_strategy : public compaction_strategy_impl {
 public:
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override {
-        return sstables::compaction_descriptor();
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override {
+        return make_ready_future<sstables::compaction_descriptor>();
    }

-    virtual int64_t estimated_pending_compactions(table_state& table_s) const override {
-        return 0;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override {
+        return make_ready_future<int64_t>(0);
    }

    virtual compaction_strategy_type type() const override {
@@ -700,19 +700,19 @@ compaction_strategy_type compaction_strategy::type() const {
    return _compaction_strategy_impl->type();
 }

-compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+future<compaction_descriptor> compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
    return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control);
 }

-compaction_descriptor compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
+compaction_descriptor compaction_strategy::get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
    return _compaction_strategy_impl->get_major_compaction_job(table_s, std::move(candidates));
 }

-std::vector<compaction_descriptor> compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
+std::vector<compaction_descriptor> compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
    return _compaction_strategy_impl->get_cleanup_compaction_jobs(table_s, std::move(candidates));
 }

-void compaction_strategy::notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
+void compaction_strategy::notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
    _compaction_strategy_impl->notify_completion(table_s, removed, added);
 }

@@ -720,7 +720,7 @@ bool compaction_strategy::parallel_compaction() const {
    return _compaction_strategy_impl->parallel_compaction();
 }

-int64_t compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
+future<int64_t> compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
    return _compaction_strategy_impl->estimated_pending_compactions(table_s);
 }

@@ -741,7 +741,7 @@ uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_me
    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
 }

-reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const {
+mutation_reader_consumer compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
    return _compaction_strategy_impl->make_interposer_consumer(ms_meta, std::move(end_consumer));
 }

@@ -789,7 +789,7 @@ future<reshape_config> make_reshape_config(const sstables::storage& storage, res
    };
 }

-std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(const table_state& ts) const {
+std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(const compaction_group_view& ts) const {
    return std::make_unique<partitioned_sstable_set>(ts.schema(), ts.token_range());
 }

--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -12,7 +12,7 @@
 #include "sstables/shared_sstable.hh"
 #include "exceptions/exceptions.hh"
 #include "compaction_strategy_type.hh"
-#include "table_state.hh"
+#include "compaction_group_view.hh"
 #include "strategy_control.hh"

 struct mutation_source_metadata;
@@ -41,15 +41,15 @@ public:
    compaction_strategy& operator=(compaction_strategy&&);

    // Return a list of sstables to be compacted after applying the strategy.
-    compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control);
+    future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control);

-    compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<shared_sstable> candidates);
+    compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<shared_sstable> candidates);

-    std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const;
+    std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const;

    // Some strategies may look at the compacted and resulting sstables to
    // get some useful information for subsequent compactions.
-    void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added);
+    void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added);

    // Return if parallel compaction is allowed by strategy.
    bool parallel_compaction() const;
@@ -58,7 +58,7 @@ public:
    bool use_clustering_key_filter() const;

    // An estimation of number of compaction for strategy to be satisfied.
-    int64_t estimated_pending_compactions(table_state& table_s) const;
+    future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const;

    static sstring name(compaction_strategy_type type) {
        switch (type) {
@@ -105,13 +105,13 @@ public:
        return name(type());
    }

-    sstable_set make_sstable_set(const table_state& ts) const;
+    sstable_set make_sstable_set(const compaction_group_view& ts) const;

    compaction_backlog_tracker make_backlog_tracker() const;

    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr) const;

-    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const;
+    mutation_reader_consumer make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const;

    // Returns whether or not interposer consumer is used by a given strategy.
    bool use_interposer_consumer() const;
--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -45,18 +45,18 @@ protected:
            uint64_t max_sstable_bytes = compaction_descriptor::default_max_sstable_bytes);
 public:
    virtual ~compaction_strategy_impl() {}
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) = 0;
-    virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) = 0;
+    virtual compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
        return make_major_compaction_job(std::move(candidates));
    }
-    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const;
-    virtual void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) { }
+    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const;
+    virtual void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) { }
    virtual compaction_strategy_type type() const = 0;
    virtual bool parallel_compaction() const {
        return true;
    }
-    virtual int64_t estimated_pending_compactions(table_state& table_s) const = 0;
-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const = 0;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const;

    bool use_clustering_key_filter() const {
        return _use_clustering_key_filter;
@@ -64,7 +64,7 @@ public:

    // Check if a given sstable is entitled for tombstone compaction based on its
    // droppable tombstone histogram and gc_before.
-    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t);
+    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const compaction_group_view& t);

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const = 0;

@@ -82,7 +82,7 @@ public:
    /// @return A new functor that wraps the end consumer with additional processing capabilities
    /// @note The returned functor preserves the original consumer's semantics while allowing
    ///       preprocessing of data
-    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const;
+    virtual mutation_reader_consumer make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const;

    virtual bool use_interposer_consumer() const {
        return false;
--- a/compaction/incremental_compaction_strategy.cc
+++ b/compaction/incremental_compaction_strategy.cc
@@ -244,7 +244,7 @@ incremental_compaction_strategy::most_interesting_bucket(std::vector<std::vector
 }

 compaction_descriptor
-incremental_compaction_strategy::find_garbage_collection_job(const compaction::table_state& t, std::vector<size_bucket_t>& buckets) {
+incremental_compaction_strategy::find_garbage_collection_job(const compaction::compaction_group_view& t, std::vector<size_bucket_t>& buckets) {
    auto worth_dropping_tombstones = [this, &t, now = db_clock::now()] (const sstable_run& run, gc_clock::time_point compaction_time) {
        if (run.all().empty()) {
            return false;
@@ -318,9 +318,9 @@ incremental_compaction_strategy::find_garbage_collection_job(const compaction::t
    return compaction_descriptor(runs_to_sstables(std::move(input)), 0, _fragment_size);
 }

-compaction_descriptor
-incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, strategy_control& control) {
-    auto candidates = control.candidates_as_runs(t);
+future<compaction_descriptor>
+incremental_compaction_strategy::get_sstables_for_compaction(compaction_group_view& t, strategy_control& control) {
+    auto candidates = co_await control.candidates_as_runs(t);

    // make local copies so they can't be changed out from under us mid-method
    size_t min_threshold = t.min_compaction_threshold();
@@ -330,28 +330,28 @@ incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, str

    if (is_any_bucket_interesting(buckets, min_threshold)) {
        std::vector<sstables::frozen_sstable_run> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
-        return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
+        co_return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
    }
    // If we are not enforcing min_threshold explicitly, try any pair of sstable runs in the same tier.
    if (!t.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
        std::vector<sstables::frozen_sstable_run> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
-        return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
+        co_return sstables::compaction_descriptor(runs_to_sstables(std::move(most_interesting)), 0, _fragment_size);
    }

    // The cross-tier behavior is only triggered once we're done with all the pending same-tier compaction to
    // increase overall efficiency.
    if (control.has_ongoing_compaction(t)) {
-        return sstables::compaction_descriptor();
+        co_return sstables::compaction_descriptor();
    }

    auto desc = find_garbage_collection_job(t, buckets);
    if (!desc.sstables.empty()) {
-        return desc;
+        co_return desc;
    }

    if (_space_amplification_goal) {
        if (buckets.size() < 2) {
-            return sstables::compaction_descriptor();
+            co_return sstables::compaction_descriptor();
        }
        // Let S0 be the size of largest tier
        // Let S1 be the size of second-largest tier,
@@ -383,33 +383,34 @@ incremental_compaction_strategy::get_sstables_for_compaction(table_state& t, str
            cross_tier_input.reserve(cross_tier_input.size() + s1.size());
            std::move(s1.begin(), s1.end(), std::back_inserter(cross_tier_input));

-            return sstables::compaction_descriptor(runs_to_sstables(std::move(cross_tier_input)),
+            co_return sstables::compaction_descriptor(runs_to_sstables(std::move(cross_tier_input)),
                                                   0, _fragment_size);
        }
    }

-    return sstables::compaction_descriptor();
+    co_return sstables::compaction_descriptor();
 }

 compaction_descriptor
-incremental_compaction_strategy::get_major_compaction_job(table_state& t, std::vector<sstables::shared_sstable> candidates) {
+incremental_compaction_strategy::get_major_compaction_job(compaction_group_view& t, std::vector<sstables::shared_sstable> candidates) {
    if (candidates.empty()) {
        return compaction_descriptor();
    }
    return make_major_compaction_job(std::move(candidates), 0, _fragment_size);
 }

-int64_t incremental_compaction_strategy::estimated_pending_compactions(table_state& t) const {
+future<int64_t> incremental_compaction_strategy::estimated_pending_compactions(compaction_group_view& t) const {
    size_t min_threshold = t.schema()->min_compaction_threshold();
    size_t max_threshold = t.schema()->max_compaction_threshold();
    int64_t n = 0;

-    for (auto& bucket : get_buckets(t.main_sstable_set().all_sstable_runs())) {
+    auto main_set = co_await t.main_sstable_set();
+    for (auto& bucket : get_buckets(main_set->all_sstable_runs())) {
        if (bucket.size() >= min_threshold) {
            n += (bucket.size() + max_threshold - 1) / max_threshold;
        }
    }
-    return n;
+    co_return n;
 }

 std::vector<shared_sstable>
@@ -483,7 +484,7 @@ incremental_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
 }

 std::vector<compaction_descriptor>
-incremental_compaction_strategy::get_cleanup_compaction_jobs(table_state& t, std::vector<shared_sstable> candidates) const {
+incremental_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& t, std::vector<shared_sstable> candidates) const {
    std::vector<compaction_descriptor> ret;
    const auto& schema = t.schema();
    unsigned max_threshold = schema->max_compaction_threshold();
--- a/compaction/incremental_compaction_strategy.hh
+++ b/compaction/incremental_compaction_strategy.hh
@@ -70,7 +70,7 @@ private:

    bool is_any_bucket_interesting(const std::vector<std::vector<sstables::frozen_sstable_run>>& buckets, size_t min_threshold) const;

-    compaction_descriptor find_garbage_collection_job(const table_state& t, std::vector<size_bucket_t>& buckets);
+    compaction_descriptor find_garbage_collection_job(const compaction_group_view& t, std::vector<size_bucket_t>& buckets);

    static std::vector<shared_sstable> runs_to_sstables(std::vector<frozen_sstable_run> runs);
    static std::vector<frozen_sstable_run> sstables_to_runs(std::vector<shared_sstable> sstables);
@@ -82,13 +82,13 @@ public:

    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

-    virtual compaction_descriptor get_sstables_for_compaction(table_state& t, strategy_control& control) override;
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& t, strategy_control& control) override;

-    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& t, std::vector<shared_sstable> candidates) const override;
+    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& t, std::vector<shared_sstable> candidates) const override;

-    virtual compaction_descriptor get_major_compaction_job(table_state& t, std::vector<sstables::shared_sstable> candidates) override;
+    virtual compaction_descriptor get_major_compaction_job(compaction_group_view& t, std::vector<sstables::shared_sstable> candidates) override;

-    virtual int64_t estimated_pending_compactions(table_state& t) const override;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& t) const override;

    virtual compaction_strategy_type type() const override {
        return compaction_strategy_type::incremental;
@@ -98,7 +98,7 @@ public:

    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;

-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;

    friend class ::incremental_backlog_tracker;
 };
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -13,13 +13,13 @@

 namespace sstables {

-leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(table_state& table_s) const {
+leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(compaction_group_view& table_s) const {
    return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state>();
 }

-compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
    auto& state = get_state(table_s);
-    auto candidates = control.candidates(table_s);
+    auto candidates = co_await control.candidates(table_s);
    // NOTE: leveled_manifest creation may be slightly expensive, so later on,
    // we may want to store it in the strategy itself. However, the sstable
    // lists managed by the manifest may become outdated. For example, one
@@ -32,12 +32,13 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
    auto candidate = manifest.get_compaction_candidates(*state.last_compacted_keys, state.compaction_counter);

    if (!candidate.sstables.empty()) {
-        leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), table_s.main_sstable_set().all()->size());
-        return candidate;
+        auto main_set = co_await table_s.main_sstable_set();
+        leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
+        co_return candidate;
    }

    if (!table_s.tombstone_gc_enabled()) {
-        return compaction_descriptor();
+        co_return compaction_descriptor();
    }

    // if there is no sstable to compact in standard way, try compacting based on droppable tombstone ratio
@@ -59,12 +60,12 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
            auto ratio_j = j->estimate_droppable_tombstone_ratio(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
            return ratio_i < ratio_j;
        });
-        return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
+        co_return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
    }
-    return {};
+    co_return compaction_descriptor();
 }

-compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
+compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) {
    if (candidates.empty()) {
        return compaction_descriptor();
    }
@@ -75,7 +76,7 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
                                 ideal_level, max_sstable_size_in_bytes);
 }

-void leveled_compaction_strategy::notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
+void leveled_compaction_strategy::notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
    auto& state = get_state(table_s);
    // All the update here is only relevant for regular compaction's round-robin picking policy, and if
    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
@@ -132,14 +133,15 @@ void leveled_compaction_strategy::generate_last_compacted_keys(leveled_compactio
    state.last_compacted_keys = std::move(last_compacted_keys);
 }

-int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
+future<int64_t> leveled_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
    std::vector<sstables::shared_sstable> sstables;
-    auto all_sstables = table_s.main_sstable_set().all();
+    auto main_set = co_await table_s.main_sstable_set();
+    auto all_sstables = main_set->all();
    sstables.reserve(all_sstables->size());
    for (auto& entry : *all_sstables) {
        sstables.push_back(entry);
    }
-    return leveled_manifest::get_estimated_tasks(leveled_manifest::get_levels(sstables), _max_sstable_size_in_mb * 1024 * 1024);
+    co_return leveled_manifest::get_estimated_tasks(leveled_manifest::get_levels(sstables), _max_sstable_size_in_mb * 1024 * 1024);
 }

 compaction_descriptor
@@ -222,7 +224,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
 }

 std::vector<compaction_descriptor>
-leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
+leveled_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
    std::vector<compaction_descriptor> ret;

    auto levels = leveled_manifest::get_levels(candidates);
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -43,25 +43,25 @@ class leveled_compaction_strategy : public compaction_strategy_impl {
 private:
    int32_t calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const;

-    leveled_compaction_strategy_state& get_state(table_state& table_s) const;
+    leveled_compaction_strategy_state& get_state(compaction_group_view& table_s) const;
 public:
    static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

    leveled_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;

-    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
+    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;

-    virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) override;
+    virtual compaction_descriptor get_major_compaction_job(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) override;

-    virtual void notify_completion(table_state& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) override;
+    virtual void notify_completion(compaction_group_view& table_s, const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) override;

    // for each level > 0, get newest sstable and use its last key as last
    // compacted key for the previous level.
    void generate_last_compacted_keys(leveled_compaction_strategy_state&, leveled_manifest& manifest);

-    virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;

    virtual bool parallel_compaction() const override {
        return false;
@@ -70,7 +70,7 @@ public:
    virtual compaction_strategy_type type() const override {
        return compaction_strategy_type::leveled;
    }
-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

--- a/compaction/leveled_manifest.hh
+++ b/compaction/leveled_manifest.hh
@@ -15,11 +15,11 @@
 #include "utils/assert.hh"
 #include "sstables/sstables.hh"
 #include "size_tiered_compaction_strategy.hh"
-#include "interval.hh"
+#include "utils/interval.hh"
 #include "utils/log.hh"

 class leveled_manifest {
-    table_state& _table_s;
+    compaction_group_view& _table_s;
    schema_ptr _schema;
    std::vector<std::vector<sstables::shared_sstable>> _generations;
    uint64_t _max_sstable_size_in_bytes;
@@ -52,7 +52,7 @@ public:
    // level to be considered worth compacting.
    static constexpr float TARGET_SCORE = 1.001f;
 private:
-    leveled_manifest(table_state& table_s, int max_sstable_size_in_MB, const sstables::size_tiered_compaction_strategy_options& stcs_options)
+    leveled_manifest(compaction_group_view& table_s, int max_sstable_size_in_MB, const sstables::size_tiered_compaction_strategy_options& stcs_options)
        : _table_s(table_s)
        , _schema(table_s.schema())
        , _max_sstable_size_in_bytes(max_sstable_size_in_MB * 1024 * 1024)
@@ -77,7 +77,7 @@ public:
        return levels;
    }

-    static leveled_manifest create(table_state& table_s, std::vector<sstables::shared_sstable>& sstables, int max_sstable_size_in_mb,
+    static leveled_manifest create(compaction_group_view& table_s, std::vector<sstables::shared_sstable>& sstables, int max_sstable_size_in_mb,
            const sstables::size_tiered_compaction_strategy_options& stcs_options) {
        leveled_manifest manifest = leveled_manifest(table_s, max_sstable_size_in_mb, stcs_options);

--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -207,13 +207,13 @@ size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector
    return std::move(max);
 }

-compaction_descriptor
-size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+future<compaction_descriptor>
+size_tiered_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
    // make local copies so they can't be changed out from under us mid-method
    int min_threshold = table_s.min_compaction_threshold();
    int max_threshold = table_s.schema()->max_compaction_threshold();
    auto compaction_time = gc_clock::now();
-    auto candidates = control.candidates(table_s);
+    auto candidates = co_await control.candidates(table_s);

    // TODO: Add support to filter cold sstables (for reference: SizeTieredCompactionStrategy::filterColdSSTables).

@@ -221,17 +221,17 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_

    if (is_any_bucket_interesting(buckets, min_threshold)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        co_return sstables::compaction_descriptor(std::move(most_interesting));
    }

    // If we are not enforcing min_threshold explicitly, try any pair of SStables in the same tier.
    if (!table_s.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        co_return sstables::compaction_descriptor(std::move(most_interesting));
    }

    if (!table_s.tombstone_gc_enabled()) {
-        return compaction_descriptor();
+        co_return compaction_descriptor();
    }

    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
@@ -250,9 +250,9 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
        auto it = std::min_element(sstables.begin(), sstables.end(), [] (auto& i, auto& j) {
            return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
        });
-        return sstables::compaction_descriptor({ *it });
+        co_return sstables::compaction_descriptor({ *it });
    }
-    return sstables::compaction_descriptor();
+    co_return sstables::compaction_descriptor();
 }

 int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
@@ -266,18 +266,19 @@ int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std
    return n;
 }

-int64_t size_tiered_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
+future<int64_t> size_tiered_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
    int min_threshold = table_s.min_compaction_threshold();
    int max_threshold = table_s.schema()->max_compaction_threshold();
    std::vector<sstables::shared_sstable> sstables;

-    auto all_sstables = table_s.main_sstable_set().all();
+    auto main_set = co_await table_s.main_sstable_set();
+    auto all_sstables = main_set->all();
    sstables.reserve(all_sstables->size());
    for (auto& entry : *all_sstables) {
        sstables.push_back(entry);
    }

-    return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
+    co_return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
 }

 std::vector<sstables::shared_sstable>
@@ -337,7 +338,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
 }

 std::vector<compaction_descriptor>
-size_tiered_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
+size_tiered_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
    std::vector<compaction_descriptor> ret;
    const auto& schema = table_s.schema();
    unsigned max_threshold = schema->max_compaction_threshold();
--- a/compaction/size_tiered_compaction_strategy.hh
+++ b/compaction/size_tiered_compaction_strategy.hh
@@ -75,13 +75,13 @@ public:
    explicit size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options);
    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;

-    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
+    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;

    static int64_t estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
        int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options);
-    virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;

    virtual compaction_strategy_type type() const override {
        return compaction_strategy_type::size_tiered;
--- a/compaction/strategy_control.hh
+++ b/compaction/strategy_control.hh
@@ -18,9 +18,9 @@ namespace compaction {
 class strategy_control {
 public:
    virtual ~strategy_control() {}
-    virtual bool has_ongoing_compaction(table_state& table_s) const noexcept = 0;
-    virtual std::vector<sstables::shared_sstable> candidates(table_state&) const = 0;
-    virtual std::vector<sstables::frozen_sstable_run> candidates_as_runs(table_state&) const = 0;
+    virtual bool has_ongoing_compaction(compaction_group_view& table_s) const noexcept = 0;
+    virtual future<std::vector<sstables::shared_sstable>> candidates(compaction_group_view&) const = 0;
+    virtual future<std::vector<sstables::frozen_sstable_run>> candidates_as_runs(compaction_group_view&) const = 0;
 };

 }
--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -158,7 +158,7 @@ future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::
    // There is a semaphore inside the compaction manager in run_resharding_jobs. So we
    // parallel_for_each so the statistics about pending jobs are updated to reflect all
    // jobs. But only one will run in parallel at a time
-    auto& t = table.try_get_table_state_with_static_sharding();
+    auto& t = table.try_get_compaction_group_view_with_static_sharding();
    co_await coroutine::parallel_for_each(buckets, [&] (std::vector<sstables::shared_sstable>& sstlist) mutable {
        return table.get_compaction_manager().run_custom_job(t, sstables::compaction_type::Reshard, "Reshard compaction", [&] (sstables::compaction_data& info, sstables::compaction_progress_monitor& progress_monitor) -> future<> {
            auto erm = table.get_effective_replication_map(); // keep alive around compaction.
@@ -453,7 +453,7 @@ future<> global_cleanup_compaction_task_impl::run() {
        co_await coroutine::parallel_for_each(keyspaces, [&] (const sstring& ks) -> future<> {
            const auto& keyspace = db.find_keyspace(ks);
            const auto& replication_strategy = keyspace.get_replication_strategy();
-            if (replication_strategy.get_type() == locator::replication_strategy_type::local) {
+            if (replication_strategy.is_local()) {
                // this keyspace does not require cleanup
                co_return;
            }
@@ -495,7 +495,7 @@ future<> table_cleanup_keyspace_compaction_task_impl::run() {
    // it is the responsibility of the system operator to not
    // perform additional incompatible range movements during cleanup.
    auto get_owned_ranges = [&] (std::string_view ks_name) -> future<owned_ranges_ptr> {
-        const auto& erm = _db.find_keyspace(ks_name).get_vnode_effective_replication_map();
+        const auto& erm = _db.find_keyspace(ks_name).get_static_effective_replication_map();
        co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
    };
    auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
@@ -575,14 +575,15 @@ future<> table_upgrade_sstables_compaction_task_impl::run() {
        if (ks.get_replication_strategy().is_per_table()) {
            co_return nullptr;
        }
-        const auto& erm = ks.get_vnode_effective_replication_map();
+        const auto& erm = ks.get_static_effective_replication_map();
        co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
    };
    auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
    tasks::task_info info{_status.id, _status.shard};
    co_await run_on_table("upgrade_sstables", _db, _status.keyspace, _ti, [&] (replica::table& t) -> future<> {
-        return t.parallel_foreach_table_state([&] (compaction::table_state& ts) -> future<> {
-            return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version, info);
+        return t.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) -> future<> {
+            auto lock_holder = co_await t.get_compaction_manager().get_incremental_repair_read_lock(ts, "upgrade_sstables_compaction");
+            co_await t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version, info);
        });
    });
 }
@@ -620,7 +621,8 @@ future<> table_scrub_sstables_compaction_task_impl::run() {
    auto& cm = _db.get_compaction_manager();
    auto& cf = _db.find_column_family(_status.keyspace, _status.table);
    tasks::task_info info{_status.id, _status.shard};
-    co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
+    co_await cf.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) mutable -> future<> {
+        auto lock_holder = co_await cm.get_incremental_repair_read_lock(ts, "scrub_sstables_compaction");
        auto r = co_await cm.perform_sstable_scrub(ts, _opts, info);
        _stats += r.value_or(sstables::compaction_stats{});
    });
@@ -648,19 +650,20 @@ future<> shard_reshaping_compaction_task_impl::run() {
    auto holder = table.async_gate().hold();
    tasks::task_info info{_status.id, _status.shard};

-    std::unordered_map<compaction::table_state*, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
+    std::unordered_map<compaction::compaction_group_view*, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
    for (auto& sstable : _dir.get_unshared_local_sstables()) {
-        auto& t = table.table_state_for_sstable(sstable);
+        auto& t = table.compaction_group_view_for_sstable(sstable);
        sstables_grouped_by_compaction_group[&t].insert(sstable);
    }

    // reshape sstables individually within the compaction groups
    for (auto& sstables_in_cg : sstables_grouped_by_compaction_group) {
+        auto lock_holder = co_await table.get_compaction_manager().get_incremental_repair_read_lock(*sstables_in_cg.first, "reshaping_compaction");
        co_await reshape_compaction_group(*sstables_in_cg.first, sstables_in_cg.second, table, info);
    }
 }

-future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {
+future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(compaction::compaction_group_view& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {

    while (true) {
        auto reshape_candidates = sstables_in_cg
--- a/compaction/task_manager_module.hh
+++ b/compaction/task_manager_module.hh
@@ -628,7 +628,7 @@ private:
    std::function<bool (const sstables::shared_sstable&)> _filter;
    uint64_t& _total_shard_size;

-    future<> reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
+    future<> reshape_compaction_group(compaction::compaction_group_view& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
 public:
    shard_reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -21,7 +21,7 @@ extern logging::logger clogger;

 using timestamp_type = api::timestamp_type;

-time_window_compaction_strategy_state& time_window_compaction_strategy::get_state(table_state& table_s) const {
+time_window_compaction_strategy_state& time_window_compaction_strategy::get_state(compaction_group_view& table_s) const {
    return table_s.get_compaction_strategy_state().get<time_window_compaction_strategy_state>();
 }

@@ -208,7 +208,7 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
    return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
 }

-reader_consumer_v2 time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const {
+mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
    if (ms_meta.min_timestamp && ms_meta.max_timestamp
            && get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
        return end_consumer;
@@ -332,14 +332,14 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
    return compaction_descriptor();
 }

-compaction_descriptor
-time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+future<compaction_descriptor>
+time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
    auto& state = get_state(table_s);
    auto compaction_time = gc_clock::now();
-    auto candidates = control.candidates(table_s);
+    auto candidates = co_await control.candidates(table_s);

    if (candidates.empty()) {
-        return compaction_descriptor();
+        co_return compaction_descriptor();
    }

    auto now = db_clock::now();
@@ -350,7 +350,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
        auto expired = table_s.fully_expired_sstables(candidates, compaction_time);
        if (!expired.empty()) {
            clogger.debug("[{}] Going to compact {} expired sstables", fmt::ptr(this), expired.size());
-            return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()));
+            co_return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()));
        }
        // Keep checking for fully_expired_sstables until we don't find
        // any among the candidates, meaning they are either already compacted
@@ -362,7 +362,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_

    auto compaction_candidates = get_next_non_expired_sstables(table_s, control, std::move(candidates), compaction_time);
    clogger.debug("[{}] Going to compact {} non-expired sstables", fmt::ptr(this), compaction_candidates.size());
-    return compaction_descriptor(std::move(compaction_candidates));
+    co_return compaction_descriptor(std::move(compaction_candidates));
 }

 time_window_compaction_strategy::bucket_compaction_mode
@@ -382,7 +382,7 @@ time_window_compaction_strategy::compaction_mode(const time_window_compaction_st
 }

 std::vector<shared_sstable>
-time_window_compaction_strategy::get_next_non_expired_sstables(table_state& table_s, strategy_control& control,
+time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
        std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time) {
    auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables);

@@ -409,7 +409,7 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
 }

 std::vector<shared_sstable>
-time_window_compaction_strategy::get_compaction_candidates(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables) {
+time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables) {
    auto& state = get_state(table_s);
    auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
    // Update the highest window seen, if necessary
@@ -463,7 +463,7 @@ struct fmt::formatter<std::map<sstables::timestamp_type, std::vector<sstables::s
 namespace sstables {

 std::vector<shared_sstable>
-time_window_compaction_strategy::newest_bucket(table_state& table_s, strategy_control& control, std::map<timestamp_type, std::vector<shared_sstable>> buckets,
+time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<shared_sstable>> buckets,
        int min_threshold, int max_threshold, timestamp_type now) {
    auto& state = get_state(table_s);
    clogger.debug("time_window_compaction_strategy::newest_bucket:\n  now {}\n{}", now, buckets);
@@ -515,11 +515,12 @@ time_window_compaction_strategy::trim_to_threshold(std::vector<shared_sstable> b
    return bucket;
 }

-int64_t time_window_compaction_strategy::estimated_pending_compactions(table_state& table_s) const {
+future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
    auto& state = get_state(table_s);
    auto min_threshold = table_s.min_compaction_threshold();
    auto max_threshold = table_s.schema()->max_compaction_threshold();
-    auto candidate_sstables = *table_s.main_sstable_set().all() | std::ranges::to<std::vector>();
+    auto main_set = co_await table_s.main_sstable_set();
+    auto candidate_sstables = *main_set->all() | std::ranges::to<std::vector>();
    auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);

    int64_t n = 0;
@@ -535,11 +536,11 @@ int64_t time_window_compaction_strategy::estimated_pending_compactions(table_sta
            break;
        }
    }
-    return n;
+    co_return n;
 }

 std::vector<compaction_descriptor>
-time_window_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
+time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const {
    std::vector<compaction_descriptor> ret;
    for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
        auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -81,13 +81,13 @@ public:
    enum class bucket_compaction_mode { none, size_tiered, major };
 public:
    time_window_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual future<compaction_descriptor> get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) override;

-    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
+    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<shared_sstable> candidates) const override;

    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
 private:
-    time_window_compaction_strategy_state& get_state(table_state& table_s) const;
+    time_window_compaction_strategy_state& get_state(compaction_group_view& table_s) const;

    static api::timestamp_type
    to_timestamp_type(time_window_compaction_strategy_options::timestamp_resolutions resolution, int64_t timestamp_from_sstable) {
@@ -111,9 +111,9 @@ private:
    compaction_mode(const time_window_compaction_strategy_state&, const bucket_t& bucket, api::timestamp_type bucket_key, api::timestamp_type now, size_t min_threshold) const;

    std::vector<shared_sstable>
-    get_next_non_expired_sstables(table_state& table_s, strategy_control& control, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time);
+    get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time);

-    std::vector<shared_sstable> get_compaction_candidates(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables);
+    std::vector<shared_sstable> get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables);
 public:
    // Find the lowest timestamp for window of given size
    static api::timestamp_type
@@ -126,7 +126,7 @@ public:
    get_buckets(std::vector<shared_sstable> files, const time_window_compaction_strategy_options& options);

    std::vector<shared_sstable>
-    newest_bucket(table_state& table_s, strategy_control& control, std::map<api::timestamp_type, std::vector<shared_sstable>> buckets,
+    newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<api::timestamp_type, std::vector<shared_sstable>> buckets,
        int min_threshold, int max_threshold, api::timestamp_type now);

    static std::vector<shared_sstable>
@@ -144,19 +144,19 @@ public:
 private:
    friend class time_window_backlog_tracker;
 public:
-    virtual int64_t estimated_pending_compactions(table_state& table_s) const override;
+    virtual future<int64_t> estimated_pending_compactions(compaction_group_view& table_s) const override;

    virtual compaction_strategy_type type() const override {
        return compaction_strategy_type::time_window;
    }

-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const compaction_group_view& ts) const override;

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) const override;

-    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const override;
+    virtual mutation_reader_consumer make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const override;

    virtual bool use_interposer_consumer() const override {
        return true;
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -776,6 +776,35 @@ maintenance_socket: ignore
 #       ...
 #

+#
+# Azure Key Vault host(s).
+#
+# The unique name of azure host/account config that can be referenced in table schema.
+#
+# host.yourdomain.com={ azure_tenant_id=<the tenant hosting your service principal>, azure_client_id=<ID of your service principal>, azure_client_secret=<secret of the service principal>, azure_client_certificate_path=<path to PEM-encoded certificate and private key of the service principal>, master_key=<vault name>/<keyname>, truststore=/path/to/truststore.pem, priority_string=<tls priority string>, key_cache_expiry=<cache expiry in ms>, key_cache_refresh=<cache refresh in ms>}:...
+#
+# Authentication can be explicit with Service Principal credentials. Either secret or certificate can be provided.
+# If both are provided, the secret will be used. If no credentials are provided, the provider will try to detect them
+# from the environment, the Azure CLI, and IMDS, in this specific order.
+#
+# master_key is a Vault key that will be used to wrap all keys used for actual encryption of scylla data.
+# This key must be pre-created and the principal must have permissions for Wrapkey and Unwrapkey operations on this key.
+#
+# azure_hosts:
+#    <name>:
+#       azure_tenant_id: <the tenant hosting your service principal> (optional)
+#       azure_client_id: <ID of your service principal> (optional)
+#       azure_client_secret: <secret of the service principal> (optional)
+#       azure_client_certificate_path: <path to PEM-encoded certificate and private key of the service principal> (optional)
+#       master_key: <vault name>/<keyname> - named Vault key for key wrapping (optional)
+#       truststore: <PEM file with CA certificates for TLS connection> (optional)
+#       priority_string: <GnuTLS priority string for TLS handshake> (optional)
+#       key_cache_expiry: <key cache expiry period (ms)> (optional)
+#       key_cache_refresh: <key cache refresh/prune period (ms)> (optional)
+#   <name>:
+#       ...
+#
+
 #
 # Server-global user information encryption settings
 #
@@ -855,3 +884,25 @@ rf_rack_valid_keyspaces: false
 # Maximum number of items in single BatchWriteItem command. Default is 100.
 # Note: DynamoDB has a hard-coded limit of 25.
 # alternator_max_items_in_batch_write: 100
+
+#
+# Vector Store options
+#
+# URI of the vector store, given by DNS name. Only the http scheme is supported. The port number is mandatory.
+# Default is empty, which means that the vector store is not used.
+# vector_store_uri: http://vector-store.dns.name:{port}
+
+# 
+# io-streaming rate limiting
+# When this value is non-zero, Scylla throttles disk throughput for
+# streaming (network) activities such as backup, repair, tablet migration and more.
+# This limit protects user queries by keeping the network interface from
+# being saturated by streaming activities.
+# The recommended value is 75% of network bandwidth
+# E.g. for i4i.8xlarge (https://github.com/scylladb/scylla-machine-image/tree/next/common/aws_net_params.json):
+# network: 18.75 Gbit/s --> 18750 Mbit/s --> 1875 MB/s (from network bits to network bytes: divide by 10, not 8)
+# Converted to disk bytes: 1875 * 1000 / 1024 = 1831 MB/s (disk wise)
+# 75% of disk bytes is: 0.75 * 1831 = 1373 megabytes/s
+# stream_io_throughput_mb_per_sec: 1373
+# 
+
--- a/configure.py
+++ b/configure.py
@@ -274,8 +274,8 @@ def generate_compdb(compdb, ninja, buildfile, modes):
            mode_out = outdir + '/' + mode
            submodule_compdbs = [mode_out + '/' + submodule + '/' + compdb for submodule in ['seastar', 'abseil']]
            with open(mode_out + '/' + compdb, 'w+b') as combined_mode_specific_compdb:
-                subprocess.run(['./scripts/merge-compdb.py', outdir + '/' + mode,
-                                ninja_compdb.name] + submodule_compdbs, stdout=combined_mode_specific_compdb)
+                subprocess.run(['./scripts/merge-compdb.py', ninja_compdb.name + ':' + mode_out] + submodule_compdbs, 
+                               stdout=combined_mode_specific_compdb)

    # sort modes by supposed indexing speed
    for mode in ['dev', 'debug', 'release', 'sanitize']:
@@ -469,6 +469,7 @@ scylla_tests = set([
    'test/boost/chunked_vector_test',
    'test/boost/clustering_ranges_walker_test',
    'test/boost/compaction_group_test',
+    'test/boost/comparable_bytes_test',
    'test/boost/compound_test',
    'test/boost/compress_test',
    'test/boost/config_test',
@@ -550,6 +551,7 @@ scylla_tests = set([
    'test/boost/sstable_conforms_to_mutation_source_test',
    'test/boost/sstable_datafile_test',
    'test/boost/sstable_generation_test',
+    'test/boost/sstable_inexact_index_test',
    'test/boost/sstable_move_test',
    'test/boost/sstable_mutation_test',
    'test/boost/sstable_partition_index_cache_test',
@@ -563,15 +565,21 @@ scylla_tests = set([
    'test/boost/token_metadata_test',
    'test/boost/top_k_test',
    'test/boost/transport_test',
+    'test/boost/bti_key_translation_test',
+    'test/boost/bti_node_sink_test',
+    'test/boost/trie_traversal_test',
+    'test/boost/trie_writer_test',
    'test/boost/symmetric_key_test',
    'test/boost/types_test',
    'test/boost/utf8_test',
+    'test/boost/vector_store_client_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_table_mutation_source_test',
    'test/boost/wasm_alloc_test',
    'test/boost/wasm_test',
    'test/boost/wrapping_interval_test',
    'test/boost/unique_view_test',
+    'test/boost/scoped_item_list_test',
    'test/manual/ec2_snitch_test',
    'test/manual/enormous_table_scan_test',
    'test/manual/gce_snitch_test',
@@ -642,6 +650,7 @@ wasms = set([

 apps = set([
    'scylla',
+    'patchelf',
 ])

 lto_binaries = set([
@@ -768,6 +777,7 @@ scylla_raft_core = [

 scylla_core = (['message/messaging_service.cc',
                'replica/database.cc',
+                'replica/schema_describe_helper.cc',
                'replica/table.cc',
                'replica/tablets.cc',
                'replica/distributed_loader.cc',
@@ -827,9 +837,8 @@ scylla_core = (['message/messaging_service.cc',
                'readers/mutation_reader.cc',
                'readers/mutation_readers.cc',
                'mutation_query.cc',
-                'keys.cc',
+                'keys/keys.cc',
                'counters.cc',
-                'compress.cc',
                'sstable_dict_autotrainer.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
@@ -841,6 +850,7 @@ scylla_core = (['message/messaging_service.cc',
                'sstables/kl/reader.cc',
                'sstables/sstable_version.cc',
                'sstables/compress.cc',
+                'sstables/compressor.cc',
                'sstables/checksummed_data_source.cc',
                'sstables/sstable_mutation_reader.cc',
                'compaction/compaction.cc',
@@ -859,6 +869,10 @@ scylla_core = (['message/messaging_service.cc',
                'sstables/random_access_reader.cc',
                'sstables/metadata_collector.cc',
                'sstables/writer.cc',
+                'sstables/trie/bti_key_translation.cc',
+                'sstables/trie/bti_node_reader.cc',
+                'sstables/trie/bti_node_sink.cc',
+                'sstables/trie/trie_writer.cc',
                'transport/cql_protocol_extension.cc',
                'transport/event.cc',
                'transport/event_notifier.cc',
@@ -956,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/murmur_hash.cc',
                'utils/uuid.cc',
                'utils/big_decimal.cc',
+                'types/comparable_bytes.cc',
                'types/types.cc',
                'validation.cc',
                'service/migration_manager.cc',
@@ -981,6 +996,7 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/result_set.cc',
                'cql3/prepare_context.cc',
                'db/batchlog_manager.cc',
+                'db/corrupt_data_handler.cc',
                'db/commitlog/commitlog.cc',
                'db/commitlog/commitlog_entry.cc',
                'db/commitlog/commitlog_replayer.cc',
@@ -1021,6 +1037,7 @@ scylla_core = (['message/messaging_service.cc',
                'db/tablet_options.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
+                'index/vector_index.cc',
                'utils/UUID_gen.cc',
                'utils/i_filter.cc',
                'utils/bloom_filter.cc',
@@ -1034,6 +1051,8 @@ scylla_core = (['message/messaging_service.cc',
                'utils/multiprecision_int.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
+                'utils/http.cc',
+                'utils/rest/client.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
                'utils/s3/retryable_http_client.cc',
@@ -1046,6 +1065,11 @@ scylla_core = (['message/messaging_service.cc',
                'utils/s3/credentials_providers/aws_credentials_provider_chain.cc',
                'utils/s3/utils/manip_s3.cc',
                'utils/advanced_rpc_compressor.cc',
+                'utils/azure/identity/credentials.cc',
+                'utils/azure/identity/service_principal_credentials.cc',
+                'utils/azure/identity/managed_identity_credentials.cc',
+                'utils/azure/identity/azure_cli_credentials.cc',
+                'utils/azure/identity/default_credentials.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -1114,6 +1138,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
+                'repair/incremental.cc',
                'streaming/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
@@ -1152,7 +1177,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/aws_sigv4.cc',
                'duration.cc',
                'vint-serialization.cc',
-                'utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc',
                'querier.cc',
                'mutation_writer/multishard_writer.cc',
                'ent/encryption/encryption_config.cc',
@@ -1169,6 +1193,8 @@ scylla_core = (['message/messaging_service.cc',
                'ent/encryption/gcp_host.cc',
                'ent/encryption/gcp_key_provider.cc',
                'ent/encryption/utils.cc',
+                'ent/encryption/azure_host.cc',
+                'ent/encryption/azure_key_provider.cc',
                'ent/ldap/ldap_connection.cc',
                'multishard_mutation_query.cc',
                'reader_concurrency_semaphore.cc',
@@ -1213,6 +1239,7 @@ scylla_core = (['message/messaging_service.cc',
                'node_ops/task_manager_module.cc',
                'reader_concurrency_semaphore_group.cc',
                'utils/disk_space_monitor.cc',
+                'service/vector_store_client.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] \
                  + scylla_raft_core
               )
@@ -1338,6 +1365,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/replica_exception.idl.hh',
        'idl/per_partition_rate_limit_info.idl.hh',
        'idl/position_in_partition.idl.hh',
+        'idl/full_position.idl.hh',
        'idl/experimental/broadcast_tables_lang.idl.hh',
        'idl/storage_service.idl.hh',
        'idl/join_node.idl.hh',
@@ -1368,6 +1396,7 @@ scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_gener
    'test/lib/exception_utils.cc',
    'test/lib/random_schema.cc',
    'test/lib/key_utils.cc',
+    'test/lib/proc_utils.cc',
 ]

 scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc', 'utils/exceptions.cc']
@@ -1402,6 +1431,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',

 deps = {
    'scylla': idls + ['main.cc'] + scylla_core + api + alternator + redis + scylla_tools + scylla_perfs,
+    'patchelf': ['tools/patchelf.cc'],
 }

 pure_boost_tests = set([
@@ -1538,6 +1568,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/secondary_index_test.cc',
    'test/boost/sessions_test.cc',
    'test/boost/sstable_compaction_test.cc',
+    'test/boost/sstable_compressor_factory_test.cc',
    'test/boost/sstable_directory_test.cc',
    'test/boost/sstable_set_test.cc',
    'test/boost/statement_restrictions_test.cc',
@@ -2087,7 +2118,7 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
                 '-ldeflate',
                ])

-args.user_cflags += " " + pkg_config('p11-kit-1', '--cflags')
+user_cflags += " " + pkg_config('p11-kit-1', '--cflags')

 if not args.staticboost:
    user_cflags += ' -DBOOST_ALL_DYN_LINK'
@@ -2122,7 +2153,6 @@ def kmip_arch():

 kmipc_dir = f'kmipc/kmipc-2.1.0t-{kmiplib()}_{kmip_arch()}'
 kmipc_lib = f'{kmipc_dir}/lib/libkmip.a'
-libs += ' -lboost_filesystem'
 if os.path.exists(kmipc_lib):
    libs += f' {kmipc_lib}'
    user_cflags += f' -I{kmipc_dir}/include -DHAVE_KMIP'
@@ -2398,7 +2428,6 @@ def write_build_file(f,
            objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
                    for src in srcs
                    if src.endswith('.cc')]
-            objs.append('$builddir/../utils/arch/powerpc/crc32-vpmsum/crc32.S')
            has_rust = False
            for dep in deps[binary]:
                if isinstance(dep, Antlr3Grammar):
@@ -2610,8 +2639,8 @@ def write_build_file(f,
            include_scylla_and_iotune_stripped = ''
            include_scylla_and_iotune_debug = ''
        else:
-            include_scylla_and_iotune = f'$builddir/{mode}/scylla $builddir/{mode}/iotune'
-            include_scylla_and_iotune_stripped = f'$builddir/{mode}/scylla.stripped $builddir/{mode}/iotune.stripped'
+            include_scylla_and_iotune = f'$builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/{mode}/patchelf'
+            include_scylla_and_iotune_stripped = f'$builddir/{mode}/scylla.stripped $builddir/{mode}/iotune.stripped $builddir/{mode}/patchelf.stripped'
            include_scylla_and_iotune_debug = f'$builddir/{mode}/scylla.debug $builddir/{mode}/iotune.debug'
        f.write('build $builddir/{mode}/dist/tar/{scylla_product}-unstripped-{scylla_version}-{scylla_release}.{arch}.tar.gz: package {include_scylla_and_iotune} $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian $builddir/node_exporter/node_exporter | always\n'.format(**locals()))
        f.write('  mode = {mode}\n'.format(**locals()))
@@ -2806,6 +2835,12 @@ def create_build_system(args):
        mode_config.update(query_seastar_flags(f'{outdir}/{mode}/seastar/seastar.pc',
                                               mode_config['build_seastar_shared_libs'],
                                               args.staticcxx))
+    # If Scylla is compiled without -g, strip the debug symbols from
+    # the result in case one of the linked static libraries happens to
+    # have some debug symbols. See issue #23834.
+    for mode, mode_config in build_modes.items():
+        if '-g' not in user_cflags.split() + mode_config['cxxflags'].split():
+            mode_config['cxx_ld_flags'] += ' -Wl,--strip-debug'

    ninja = find_ninja()
    with open(args.buildfile, 'w') as f:
@@ -2839,13 +2874,7 @@ def generate_compdb_for_cmake_build(source_dir, build_dir):
    assert seastar_compdb_path, "Seasetar's building system is not configured yet."
    # if the file exists, just overwrite it so we can keep it updated
    with open(os.path.join(source_dir, compdb), 'w+b') as merged_compdb:
-        # "merge-compdb.py" considers all object files under the "--prefix"
-        # directory as relevant. Since CMake generates .o files in
-        # "CMakeFiles" directories, we preserve the compilation rules for
-        # these generated files.
-        prefix = ""
        subprocess.run([os.path.join(source_dir, 'scripts/merge-compdb.py'),
-                        prefix,
                        scylla_compdb_path,
                        seastar_compdb_path],
                       stdout=merged_compdb,
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -413,6 +413,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool bypass_cache = false;
        auto attrs = std::make_unique<cql3::attributes::raw>();
        expression wclause = conjunction{};
+        bool is_ann_ordering = false;
    }
    : K_SELECT (
                ( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
@@ -425,7 +426,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
             )
      ( K_WHERE w=whereClause { wclause = std::move(w); } )?
      ( K_GROUP K_BY gbcolumns=listOfIdentifiers)?
-      ( K_ORDER K_BY orderByClause[orderings] ( ',' orderByClause[orderings] )* )?
+      ( K_ORDER K_BY orderByClause[orderings, is_ann_ordering] ( ',' orderByClause[orderings, is_ann_ordering] )* )?
      ( K_PER K_PARTITION K_LIMIT rows=intValue { per_partition_limit = std::move(rows); } )?
      ( K_LIMIT rows=intValue { limit = std::move(rows); } )?
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
@@ -484,11 +485,37 @@ whereClause returns [uexpression clause]
        { clause = conjunction{std::move(terms)}; }
    ;

-orderByClause[raw::select_statement::parameters::orderings_type& orderings]
+orderByClause[raw::select_statement::parameters::orderings_type& orderings, bool& is_ann_ordering]
    @init{
        raw::select_statement::ordering ordering = raw::select_statement::ordering::ascending;
+        std::optional<expression> ann_ordering;
+    }
+    : c=cident (K_ANN K_OF t=term {ann_ordering=std::move(t);})? (K_ASC | K_DESC { ordering = raw::select_statement::ordering::descending; })?
+    {
+        if (!ann_ordering) {
+            if (is_ann_ordering) {
+                throw exceptions::invalid_request_exception(
+                    "ANN ordering does not support any other ordering");
+            }
+            orderings.emplace_back(c, ordering);
+        } else {
+            if (ordering != raw::select_statement::ordering::ascending) {
+                throw exceptions::invalid_request_exception(
+                    "Descending ANN ordering is not supported");
+            }
+            if (!orderings.empty()) {
+                if (is_ann_ordering) {
+                    throw exceptions::invalid_request_exception(
+                        "Cannot specify more than one ANN ordering");
+                } else {
+                    throw exceptions::invalid_request_exception(
+                        "ANN ordering does not support any other ordering");
+                }
+            }
+            is_ann_ordering = true;
+            orderings.emplace_back(c, ann_ordering.value());
+        }
    }
-    : c=cident (K_ASC | K_DESC { ordering = raw::select_statement::ordering::descending; })? { orderings.emplace_back(c, ordering); }
    ;

 jsonValue returns [uexpression value]
@@ -2243,6 +2270,7 @@ K_ORDER:       O R D E R;
 K_BY:          B Y;
 K_ASC:         A S C;
 K_DESC:        D E S C;
+K_ANN:         A N N;
 K_ALLOW:       A L L O W;
 K_FILTERING:   F I L T E R I N G;
 K_IF:          I F;
--- a/cql3/description.cc
+++ b/cql3/description.cc
@@ -18,21 +18,21 @@ static logging::logger dlogger{"description"};

 namespace cql3 {

-std::vector<bytes_opt> description::serialize(bool serialize_create_statement) const {
-    std::vector<bytes_opt> result{};
+std::vector<managed_bytes_opt> description::serialize(bool serialize_create_statement) && {
+    std::vector<managed_bytes_opt> result{};
    result.reserve(serialize_create_statement ? 4 : 3);

    if (keyspace) {
-        result.push_back(to_bytes(cql3::util::maybe_quote(*keyspace)));
+        result.push_back(to_managed_bytes(cql3::util::maybe_quote(*keyspace)));
    } else {
-        result.push_back(data_value::make_null(utf8_type).serialize());
+        result.push_back(to_managed_bytes_opt(data_value::make_null(utf8_type).serialize()));
    }

-    result.push_back(to_bytes(type));
-    result.push_back(to_bytes(cql3::util::maybe_quote(name)));
+    result.push_back(to_managed_bytes(type));
+    result.push_back(to_managed_bytes(cql3::util::maybe_quote(name)));

    if (serialize_create_statement && create_statement) {
-        result.push_back(to_bytes(*create_statement));
+        result.push_back(std::move(create_statement.value()).as_managed_bytes());
    } else if (serialize_create_statement) {
        on_internal_error(dlogger, "create_statement field is empty");
    }
--- a/cql3/description.hh
+++ b/cql3/description.hh
@@ -11,7 +11,7 @@
 #include <seastar/core/sstring.hh>
 #include <seastar/util/bool_class.hh>

-#include "bytes_fwd.hh"
+#include "utils/managed_string.hh"

 #include <optional>
 #include <vector>
@@ -69,8 +69,18 @@ struct description {
    sstring type;
    /// The name of the entity itself, e.g. a keyspace of name `ks` will be of name: ks
    sstring name;
-    /// CQL statement that can be used to restore the entity.
-    std::optional<sstring> create_statement;
+    /// Encoded CQL statement that can be used to restore the entity.
+    ///
+    /// Technical note:
+    /// ---------------
+    /// This field could (and used to) be an optional of `sstring`.
+    /// The reason why we use `managed_string` instead is that some create statements
+    /// may be quite large and lead to oversized allocations if we use a contiguous
+    /// memory buffer. That's a rare occurrence (in my own experience), but it has
+    /// happened: see issue scylladb/scylladb#24018. That's why we need to use
+    /// `managed_string` right away: it's less convenient to handle, but this struct
+    /// is pretty much only used for serialization purposes, so it's a good trade-off.
+    std::optional<managed_string> create_statement;

    /// Serialize the description to represent multiple UTF-8 columns.
    /// The number of columns will be equal to 4 unless `serialize_create_statement`
@@ -80,7 +90,7 @@ struct description {
    ///
    /// Precondition: if `serialize_create_statement` is true, then `create_statement.has_value()`
    ///               is also true.
-    std::vector<bytes_opt> serialize(bool serialize_create_statement = true) const;
+    std::vector<managed_bytes_opt> serialize(bool serialize_create_statement = true) &&;
 };

 } // namespace cql3
--- a/cql3/expr/expr-utils.hh
+++ b/cql3/expr/expr-utils.hh
@@ -6,8 +6,8 @@
 #include "expression.hh"

 #include "bytes.hh"
-#include "keys.hh"
-#include "interval.hh"
+#include "keys/keys.hh"
+#include "utils/interval.hh"
 #include "cql3/expr/restrictions.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/statements/bound.hh"
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -26,6 +26,7 @@
 #include <cstdint>
 #include <optional>
 #include <type_traits>
+#include "utils/managed_string.hh"

 using namespace cql3;
 using namespace functions;
@@ -357,7 +358,7 @@ user_aggregate::user_aggregate(function_name fname, bytes_opt initcond, ::shared
 bool user_aggregate::has_finalfunc() const { return _agg.state_to_result_function != nullptr; }

 description user_aggregate::describe(with_create_statement with_stmt) const {
-    auto maybe_create_statement = std::invoke([&] -> std::optional<sstring> {
+    auto maybe_create_statement = std::invoke([&] -> std::optional<managed_string> {
        if (!with_stmt) {
            return std::nullopt;
        }
@@ -365,7 +366,7 @@ description user_aggregate::describe(with_create_statement with_stmt) const {
        auto ks = cql3::util::maybe_quote(name().keyspace);
        auto na = cql3::util::maybe_quote(name().name);

-        std::ostringstream os;
+        fragmented_ostringstream os;

        os << "CREATE AGGREGATE " << ks << "." << na << "(";
        auto a = arg_types();
@@ -390,7 +391,7 @@ description user_aggregate::describe(with_create_statement with_stmt) const {
        }
        os << ";";

-        return std::move(os).str();
+        return std::move(os).to_managed_string();
    });

    return description {
--- a/cql3/functions/functions.hh
+++ b/cql3/functions/functions.hh
@@ -16,6 +16,7 @@
 #include "cql3/functions/function_name.hh"
 #include "schema/schema.hh"
 #include <unordered_map>
+#include "data_dictionary/user_types_metadata.hh"

 namespace cql3 {

@@ -102,6 +103,13 @@ const functions& instance();

 class change_batch : public functions {
 public:
+    struct func_name_and_args {
+        function_name name;
+        std::vector<data_type> arg_types;
+        bool aggregate;
+    };
+    std::vector<func_name_and_args> removed_functions;
+
    // Skip init as we copy data from static instance.
    change_batch() : functions(skip_init{}) {
        _declared = instance()._declared;
@@ -112,6 +120,15 @@ public:

    // Used only by unittest.
    void clear_functions() noexcept;
+
+    void remove_function(function_name& name, std::vector<data_type>& arg_types, bool aggregate = false) {
+        removed_functions.emplace_back(name, arg_types, aggregate);
+        functions::remove_function(name, arg_types);
+    };
+
+    void remove_aggregate(function_name& name, std::vector<data_type>& arg_types) {
+        remove_function(name, arg_types, true);
+    }
 };

 }
--- a/cql3/functions/user_function.cc
+++ b/cql3/functions/user_function.cc
@@ -11,6 +11,7 @@
 #include "cql3/util.hh"
 #include "utils/log.hh"
 #include "lang/wasm.hh"
+#include "utils/managed_string.hh"

 #include <seastar/core/thread.hh>

@@ -70,11 +71,13 @@ bytes_opt user_function::execute(std::span<const bytes_opt> parameters) {
 }

 description user_function::describe(with_create_statement with_stmt) const {
-    auto maybe_create_statement = std::invoke([&] -> std::optional<sstring> {
+    auto maybe_create_statement = std::invoke([&] -> std::optional<managed_string> {
        if (!with_stmt) {
            return std::nullopt;
        }

+        fragmented_ostringstream stream;
+
        auto arg_type_range = _arg_types | std::views::transform(std::mem_fn(&abstract_type::cql3_type_name_without_frozen));
        auto arg_range = std::views::zip(_arg_names, arg_type_range)
                | std::views::transform([] (std::tuple<std::string_view, std::string_view> arg) {
@@ -82,7 +85,7 @@ description user_function::describe(with_create_statement with_stmt) const {
                    return seastar::format("{} {}", name, type);
                });

-        return seastar::format("CREATE FUNCTION {}.{}({})\n"
+        fmt::format_to(stream.to_iter(), "CREATE FUNCTION {}.{}({})\n"
                "{} ON NULL INPUT\n"
                "RETURNS {}\n"
                "LANGUAGE {}\n"
@@ -92,6 +95,8 @@ description user_function::describe(with_create_statement with_stmt) const {
                _return_type->cql3_type_name_without_frozen(),
                _language,
                _body);
+
+        return std::move(stream).to_managed_string();
    });

    return description {
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -18,7 +18,7 @@ namespace cql3 {
 const cql_config default_cql_config(cql_config::default_tag{});

 thread_local const query_options::specific_options query_options::specific_options::DEFAULT{
-    -1, {}, db::consistency_level::SERIAL, api::missing_timestamp};
+    -1, {}, db::consistency_level::SERIAL, api::missing_timestamp, service::node_local_only::no};

 thread_local query_options query_options::DEFAULT{default_cql_config,
    db::consistency_level::ONE, std::nullopt,
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -19,6 +19,7 @@
 #include "service/pager/paging_state.hh"
 #include "cql3/values.hh"
 #include "utils/small_vector.hh"
+#include "service/storage_proxy_fwd.hh"

 namespace cql3 {

@@ -74,6 +75,7 @@ public:
        const lw_shared_ptr<service::pager::paging_state> state;
        const std::optional<db::consistency_level> serial_consistency;
        const api::timestamp_type timestamp;
+        const service::node_local_only node_local_only;
    };
 private:
    const cql_config& _cql_config;
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -27,6 +27,7 @@
 #include "cql3/untyped_result_set.hh"
 #include "db/config.hh"
 #include "data_dictionary/data_dictionary.hh"
+#include "service/vector_store_client.hh"
 #include "utils/hashers.hh"
 #include "utils/error_injection.hh"
 #include "service/migration_manager.hh"
@@ -68,11 +69,12 @@ static service::query_state query_state_for_internal_call() {
    return {service::client_state::for_internal_calls(), empty_service_permit()};
 }

-query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
+query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, service::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
        : _migration_subscriber{std::make_unique<migration_subscriber>(this)}
        , _proxy(proxy)
        , _db(db)
        , _mnotifier(mn)
+        , _vector_store_client(vsc)
        , _mcfg(mcfg)
        , _cql_config(cql_cfg)
        , _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
@@ -679,15 +681,32 @@ query_processor::prepare(sstring query_string, service::query_state& query_state

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
 query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
-    using namespace cql_transport::messages;
-    return prepare_one<result_message::prepared::cql>(
-            std::move(query_string),
-            client_state,
-            d,
-            [d] (std::string_view query_string, std::string_view keyspace) {
-                return compute_id(query_string, keyspace, d);
-            },
-            prepared_cache_key_type::cql_id);
+    try { // Prepares query_string via _prepared_cache and returns it wrapped in a result_message::prepared::cql.
+        auto key = compute_id(query_string, client_state.get_raw_keyspace(), d); // key is computed from the query text, the client's raw keyspace and the dialect
+        auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] { // NOTE(review): this callback presumably runs only when key is not cached yet - confirm against the cache's get() contract
+                auto prepared = get_statement(query_string, client_state, d);
+                prepared->calculate_metadata_id();
+                auto bound_terms = prepared->statement->get_bound_terms();
+                if (bound_terms > std::numeric_limits<uint16_t>::max()) { // reject statements whose marker count does not fit in a uint16_t
+                    throw exceptions::invalid_request_exception(
+                            format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
+                                bound_terms,
+                                std::numeric_limits<uint16_t>::max()));
+                }
+                SCYLLA_ASSERT(bound_terms == prepared->bound_names.size());
+                return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
+            });
+
+        const auto& warnings = prep_ptr->warnings; // reference into the object prep_ptr points to; it is still iterated after prep_ptr is moved into msg below
+        const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
+                    client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
+        for (const auto& w : warnings) { // attach every warning of the prepared statement to the response message
+            msg->add_warning(w);
+        }
+        co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg)); // NOTE(review): msg is declared const, so std::move(msg) yields a const rvalue and likely selects a copy - confirm this is intended
+    } catch(typename prepared_statements_cache::statement_is_too_big&) { // rethrow the cache's statement_is_too_big as prepared_statement_is_too_big built from the query text
+        throw prepared_statement_is_too_big(query_string);
+    }
 }

 static std::string hash_target(std::string_view query_string, std::string_view keyspace) {
@@ -783,7 +802,8 @@ query_options query_processor::make_internal_options(
        const statements::prepared_statement::checked_weak_ptr& p,
        const std::vector<data_value_or_unset>& values,
        db::consistency_level cl,
-        int32_t page_size) const {
+        int32_t page_size,
+        service::node_local_only node_local_only) const {
    if (p->bound_names.size() != values.size()) {
        throw std::invalid_argument(
                format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
@@ -810,16 +830,16 @@ query_options query_processor::make_internal_options(
        }, var);
        ++ni;
    }
-    if (page_size > 0) {
-        lw_shared_ptr<service::pager::paging_state> paging_state;
-        db::consistency_level serial_consistency = db::consistency_level::SERIAL;
-        api::timestamp_type ts = api::missing_timestamp;
-        return query_options(
-                cl,
-                std::move(bound_values),
-                cql3::query_options::specific_options{page_size, std::move(paging_state), serial_consistency, ts});
-    }
-    return query_options(cl, std::move(bound_values));
+    return query_options(
+            cl,
+            std::move(bound_values),
+            cql3::query_options::specific_options {
+                .page_size = page_size,
+                .state = {},
+                .serial_consistency = db::consistency_level::SERIAL,
+                .timestamp = api::missing_timestamp,
+                .node_local_only = node_local_only
+            });
 }

 statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
@@ -939,7 +959,7 @@ query_processor::execute_internal(
    }
 }

-future<std::vector<mutation>> query_processor::get_mutations_internal(
+future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
        const sstring query_string,
        service::query_state& query_state,
        api::timestamp_type timestamp,
@@ -1135,9 +1155,6 @@ void query_processor::migration_subscriber::on_update_view(
    on_update_column_family(ks_name, view_name, columns_changed);
 }

-void query_processor::migration_subscriber::on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) {
-}
-
 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
     remove_invalid_prepared_statements(ks_name, std::nullopt); // only the keyspace name is supplied; NOTE(review): std::nullopt presumably means "no specific table" - confirm against the helper's signature
 }
--- a/Show More
+++ b/Show More