Update seastar submodule

* seastar e45cef9c...1b299004 (3):
  > rpc: Abort server connection streams on stop
  > rpc: Do not register stream to dying parent
  > rpc: Fix client-side stream registration race

refs: #13100

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
scylla_fstrim_setup: start scylla-fstrim.timer on setup
2023-09-06 12:35:37 +03:00 · 2023-07-18 16:03:53 +03:00 · 2023-07-14 18:18:05 +03:00 · 2023-07-14 15:48:28 +03:00 · 2023-07-13 22:48:36 +03:00 · 2023-07-13 22:48:30 +03:00
1361 changed files with 40948 additions and 61571 deletions
--- a/.github/scripts/label_promoted_commits.py
+++ b/.github/scripts/label_promoted_commits.py
@@ -1,87 +0,0 @@
-from github import Github
-import argparse
-import re
-import sys
-import os
-
-try:
-    github_token = os.environ["GITHUB_TOKEN"]
-except KeyError:
-    print("Please set the 'GITHUB_TOKEN' environment variable")
-    sys.exit(1)
-
-
-def parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--repository', type=str, required=True,
-                        help='Github repository name (e.g., scylladb/scylladb)')
-    parser.add_argument('--commit_before_merge', type=str, required=True, help='Git commit ID to start labeling from ('
-                                                                               'newest commit).')
-    parser.add_argument('--commit_after_merge', type=str, required=True,
-                        help='Git commit ID to end labeling at (oldest '
-                             'commit, exclusive).')
-    parser.add_argument('--update_issue', type=bool, default=False, help='Set True to update issues when backport was '
-                                                                         'done')
-    parser.add_argument('--ref', type=str, required=True, help='PR target branch')
-    return parser.parse_args()
-
-
-def add_comment_and_close_pr(pr, comment):
-    if pr.state == 'open':
-        pr.create_issue_comment(comment)
-        pr.edit(state="closed")
-
-
-def mark_backport_done(repo, ref_pr_number, branch):
-    pr = repo.get_pull(int(ref_pr_number))
-    label_to_remove = f'backport/{branch}'
-    label_to_add = f'{label_to_remove}-done'
-    current_labels = [label.name for label in pr.get_labels()]
-    if label_to_remove in current_labels:
-        pr.remove_from_labels(label_to_remove)
-    if label_to_add not in current_labels:
-        pr.add_to_labels(label_to_add)
-
-
-def main():
-    # This script is triggered by a push event to either the master branch or a branch named branch-x.y (where x and y represent version numbers). Based on the pushed branch, the script performs the following actions:
-    # - When ref branch is `master`, it will add the `promoted-to-master` label, which we need later for the auto backport process
-    # - When ref branch is `branch-x.y` (which means we backported a patch), it will replace in the original PR the `backport/x.y` label with `backport/x.y-done` and will close the backport PR (Since GitHub close only the one referring to default branch)
-    args = parser()
-    pr_pattern = re.compile(r'Closes .*#([0-9]+)')
-    target_branch = re.search(r'branch-(\d+\.\d+)', args.ref)
-    g = Github(github_token)
-    repo = g.get_repo(args.repository, lazy=False)
-    commits = repo.compare(head=args.commit_after_merge, base=args.commit_before_merge)
-    processed_prs = set()
-    # Print commit information
-    for commit in commits.commits:
-        print(f'Commit sha is: {commit.sha}')
-        match = pr_pattern.search(commit.commit.message)
-        if match:
-            pr_number = int(match.group(1))
-            if pr_number in processed_prs:
-                continue
-            if target_branch:
-                pr = repo.get_pull(pr_number)
-                branch_name = target_branch[1]
-                refs_pr = re.findall(r'Refs (?:#|https.*?)(\d+)', pr.body)
-                if refs_pr:
-                    print(f'branch-{target_branch.group(1)}, pr number is: {pr_number}')
-                    # 1. change the backport label of the parent PR to note that
-                    #    we've merge the corresponding backport PR
-                    # 2. close the backport PR and leave a comment on it to note
-                    #    that it has been merged with a certain git commit,
-                    ref_pr_number = refs_pr[0]
-                    mark_backport_done(repo, ref_pr_number, branch_name)
-                    comment = f'Closed via {commit.sha}'
-                    add_comment_and_close_pr(pr, comment)
-            else:
-                print(f'master branch, pr number is: {pr_number}')
-                pr = repo.get_pull(pr_number)
-                pr.add_to_labels('promoted-to-master')
-            processed_prs.add(pr_number)
-
-
-if __name__ == "__main__":
-    main()
--- a/.github/workflows/add-label-when-promoted.yaml
+++ b/.github/workflows/add-label-when-promoted.yaml
@@ -1,36 +0,0 @@
-name: Check if commits are promoted
-
-on:
-  push:
-    branches:
-      - master
-      - branch-*.*
-
-env:
-  DEFAULT_BRANCH: 'master'
-
-jobs:
-  check-commit:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-      issues: write
-    steps:
-      - name: Dump GitHub context
-        env:
-          GITHUB_CONTEXT: ${{ toJson(github) }}
-        run: echo "$GITHUB_CONTEXT"
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ env.DEFAULT_BRANCH }}
-          fetch-depth: 0  # Fetch all history for all tags and branches
-
-      - name: Install dependencies
-        run: sudo apt-get install -y python3-github
-
-      - name: Run python script
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/label_promoted_commits.py --commit_before_merge ${{ github.event.before }} --commit_after_merge ${{ github.event.after }} --repository ${{ github.repository }} --ref ${{ github.ref }}
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,8 @@ tags
 testlog
 test/*/*.reject
 .vscode
+docs/_build
+docs/poetry.lock
 compile_commands.json
 .ccls-cache/
 .mypy_cache
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.27)
+cmake_minimum_required(VERSION 3.18)

 project(scylla)

@@ -8,19 +8,11 @@ list(APPEND CMAKE_MODULE_PATH
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
  ${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)

+set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE
+    STRING "Choose the type of build." FORCE)
 # Set the possible values of build type for cmake-gui
-set(scylla_build_types
-    "Debug" "Release" "Dev" "Sanitize" "Coverage")
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-  ${scylla_build_types})
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE
-        STRING "Choose the type of build." FORCE)
-    message(WARNING "CMAKE_BUILD_TYPE not specified, Using 'Release'")
-elseif(NOT CMAKE_BUILD_TYPE IN_LIST scylla_build_types)
-    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}. "
-        "Following types are supported: ${scylla_build_types}")
-endif()
+  "Debug" "Release" "Dev" "Sanitize")
 string(TOUPPER "${CMAKE_BUILD_TYPE}" build_mode)
 include(mode.${build_mode})
 include(mode.common)
@@ -34,9 +26,7 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
 set(CMAKE_CXX_VISIBILITY_PRESET hidden)

 set(Seastar_TESTING ON CACHE BOOL "" FORCE)
-set(Seastar_API_LEVEL 7 CACHE STRING "" FORCE)
-set(Seastar_APPS ON CACHE BOOL "" FORCE)
-set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
+set(Seastar_API_LEVEL 6 CACHE STRING "" FORCE)
 add_subdirectory(seastar)

 # System libraries dependencies
@@ -56,8 +46,6 @@ find_package(xxHash REQUIRED)
 set(scylla_gen_build_dir "${CMAKE_BINARY_DIR}/gen")
 file(MAKE_DIRECTORY "${scylla_gen_build_dir}")

-include(add_version_library)
-generate_scylla_version()

 add_library(scylla-main STATIC)
 target_sources(scylla-main
@@ -126,7 +114,6 @@ add_subdirectory(lang)
 add_subdirectory(locator)
 add_subdirectory(mutation)
 add_subdirectory(mutation_writer)
-add_subdirectory(node_ops)
 add_subdirectory(readers)
 add_subdirectory(redis)
 add_subdirectory(replica)
@@ -144,6 +131,7 @@ add_subdirectory(tracing)
 add_subdirectory(transport)
 add_subdirectory(types)
 add_subdirectory(utils)
+include(add_version_library)
 add_version_library(scylla_version
    release.cc)

@@ -225,5 +213,3 @@ set(CMAKE_EXE_LINKER_FLAGS "${default_linker_flags}" CACHE INTERNAL "")
 target_include_directories(scylla PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${scylla_gen_build_dir}")
-
-add_subdirectory(dist)
--- a/12
+++ b/12
@@ -7,7 +7,6 @@ Options:
  -h|--help show this help message.
  -o|--output-dir PATH specify destination path at which the version files are to be created.
  -d|--date-stamp DATE manually set date for release parameter
-  -v|--verbose also print out the version number

 By default, the script will attempt to parse 'version' file
 in the current directory, which should contain a string of
@@ -34,7 +33,6 @@ END
 )

 DATE=""
-PRINT_VERSION=false

 while [ $# -gt 0 ]; do
 	opt="$1"
@@ -53,10 +51,6 @@ while [ $# -gt 0 ]; do
 			shift
 			shift
 			;;
-		-v|--verbose)
-			PRINT_VERSION=true
-			shift
-			;;
 		*)
 			echo "Unexpected argument found: $1"
 			echo
@@ -78,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.4.10
+VERSION=5.3.0-rc1

 if test -f version
 then
@@ -108,9 +102,7 @@ if [ -f "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" ]; then
 	fi
 fi

-if $PRINT_VERSION; then
-	echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
-fi
+echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p "$OUTPUT_DIR"
 echo "$SCYLLA_VERSION" > "$OUTPUT_DIR/SCYLLA-VERSION-FILE"
 echo "$SCYLLA_RELEASE" > "$OUTPUT_DIR/SCYLLA-RELEASE-FILE"
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -38,6 +38,7 @@
 #include <seastar/json/json_elements.hh>
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include "collection_mutation.hh"
+#include "db/query_context.hh"
 #include "schema/schema.hh"
 #include "db/tags/extension.hh"
 #include "db/tags/utils.hh"
@@ -59,28 +60,7 @@ logging::logger elogger("alternator-executor");

 namespace alternator {

-enum class table_status {
-    active = 0,
-    creating,
-    updating,
-    deleting
-};
-
-static sstring_view table_status_to_sstring(table_status tbl_status) {
-    switch(tbl_status) {
-        case table_status::active:
-            return "ACTIVE";
-        case table_status::creating:
-            return "CREATING";
-        case table_status::updating:
-            return "UPDATING";
-        case table_status::deleting:
-            return "DELETING";
-    }
-    return "UKNOWN";
-}
-
-static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type);
+static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type);

 static map_type attrs_type() {
    static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
@@ -211,8 +191,9 @@ static std::string lsi_name(const std::string& table_name, std::string_view inde

 /** Extract table name from a request.
 *  Most requests expect the table's name to be listed in a "TableName" field.
- *  This convenience function returns the name or api_error in case the
- *  table name is missing or not a string.
+ *  This convenience function returns the name, with appropriate validation
+ *  and api_error in case the table name is missing or not a string, or
+ *  doesn't pass validate_table_name().
 */
 static std::optional<std::string> find_table_name(const rjson::value& request) {
    const rjson::value* table_name_value = rjson::find(request, "TableName");
@@ -223,6 +204,7 @@ static std::optional<std::string> find_table_name(const rjson::value& request) {
        throw api_error::validation("Non-string TableName field in request");
    }
    std::string table_name = table_name_value->GetString();
+    validate_table_name(table_name);
    return table_name;
 }

@@ -249,10 +231,6 @@ schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::valu
    try {
        return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(*table_name), *table_name);
    } catch(data_dictionary::no_such_column_family&) {
-        // DynamoDB returns validation error even when table does not exist
-        // and the table name is invalid.
-        validate_table_name(table_name.value());
-
        throw api_error::resource_not_found(
                format("Requested resource not found: Table: {} not found", *table_name));
    }
@@ -303,10 +281,6 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
        try {
            return { proxy.data_dictionary().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
        } catch (data_dictionary::no_such_column_family&) {
-            // DynamoDB returns validation error even when table does not exist
-            // and the table name is invalid.
-            validate_table_name(table_name);
-
            throw api_error::resource_not_found(
                format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
        }
@@ -442,8 +416,22 @@ static rjson::value generate_arn_for_index(const schema& schema, std::string_vie
        schema.ks_name(), schema.cf_name(), index_name));
 }

-static rjson::value fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy const& proxy)
-{
+bool is_alternator_keyspace(const sstring& ks_name) {
+    return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
+}
+
+sstring executor::table_name(const schema& s) {
+    return s.cf_name();
+}
+
+future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
+    _stats.api_operations.describe_table++;
+    elogger.trace("Describing table {}", request);
+
+    schema_ptr schema = get_table(_proxy, request);
+
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+
    rjson::value table_description = rjson::empty_object();
    rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
    // FIXME: take the tables creation time, not the current time!
@@ -454,8 +442,9 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
    // We don't currently do this in Alternator - instead CreateTable waits
    // until the table is really available. So/ DescribeTable returns either
    // ACTIVE or doesn't exist at all (and DescribeTable returns an error).
-    // The states CREATING and UPDATING are not currently returned.
-    rjson::add(table_description, "TableStatus", rjson::from_string(table_status_to_sstring(tbl_status)));
+    // The other states (CREATING, UPDATING, DELETING) are not currently
+    // returned.
+    rjson::add(table_description, "TableStatus", "ACTIVE");
    rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
    rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
    // FIXME: Instead of hardcoding, we should take into account which mode was chosen
@@ -472,9 +461,9 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
-    executor::describe_key_schema(table_description, *schema, key_attribute_types);
+    describe_key_schema(table_description, *schema, key_attribute_types);

-    data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
+    data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
    if (!t.views().empty()) {
        rjson::value gsi_array = rjson::empty_array();
        rjson::value lsi_array = rjson::empty_array();
@@ -490,7 +479,7 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
            rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
            rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
            // Add indexes's KeySchema and collect types for AttributeDefinitions:
-            executor::describe_key_schema(view_entry, *vptr, key_attribute_types);
+            describe_key_schema(view_entry, *vptr, key_attribute_types);
            // Add projection type
            rjson::value projection = rjson::empty_object();
            rjson::add(projection, "ProjectionType", "ALL");
@@ -518,29 +507,10 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
    }
    rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));

-    executor::supplement_table_stream_info(table_description, *schema, proxy);
-
+    supplement_table_stream_info(table_description, *schema, _proxy);
+    
    // FIXME: still missing some response fields (issue #5026)
-    return table_description;
-}

-bool is_alternator_keyspace(const sstring& ks_name) {
-    return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
-}
-
-sstring executor::table_name(const schema& s) {
-    return s.cf_name();
-}
-
-future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
-    _stats.api_operations.describe_table++;
-    elogger.trace("Describing table {}", request);
-
-    schema_ptr schema = get_table(_proxy, request);
-
-    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
-
-    rjson::value table_description = fill_table_description(schema, table_status::active, _proxy);
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
@@ -552,17 +522,10 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
    elogger.trace("Deleting table {}", request);

    std::string table_name = get_table_name(request);
-    // DynamoDB returns validation error even when table does not exist
-    // and the table name is invalid.
-    validate_table_name(table_name);
-
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    tracing::add_table_name(trace_state, keyspace_name, table_name);
    auto& p = _proxy.container();

-    schema_ptr schema = get_table(_proxy, request);
-    rjson::value table_description = fill_table_description(schema, table_status::deleting, _proxy);
-
    co_await _mm.container().invoke_on(0, [&] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
@@ -572,14 +535,18 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
            throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
        }

-        auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
-        auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy.local_db(), keyspace_name, group0_guard.write_timestamp());
+        auto m = co_await mm.prepare_column_family_drop_announcement(keyspace_name, table_name, group0_guard.write_timestamp(), service::migration_manager::drop_views::yes);
+        auto m2 = co_await mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());

        std::move(m2.begin(), m2.end(), std::back_inserter(m));

-        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: delete {} table", table_name));
+        co_await mm.announce(std::move(m), std::move(group0_guard));
    });

+    // FIXME: need more attributes?
+    rjson::value table_description = rjson::empty_object();
+    rjson::add(table_description, "TableName", rjson::from_string(table_name));
+    rjson::add(table_description, "TableStatus", "DELETING");
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableDescription", std::move(table_description));
    elogger.trace("returning {}", response);
@@ -864,6 +831,17 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
 }

+static future<> wait_for_schema_agreement(service::migration_manager& mm, db::timeout_clock::time_point deadline) {
+    return do_until([&mm, deadline] {
+        if (db::timeout_clock::now() > deadline) {
+            throw std::runtime_error("Unable to reach schema agreement");
+        }
+        return mm.have_schema_agreement();
+    }, [] {
+        return seastar::sleep(500ms);
+    });
+}
+
 static void verify_billing_mode(const rjson::value& request) {
        // Alternator does not yet support billing or throughput limitations, but
    // let's verify that BillingMode is at least legal.
@@ -881,38 +859,6 @@ static void verify_billing_mode(const rjson::value& request) {
    }
 }

-// Validate that a AttributeDefinitions parameter in CreateTable is valid, and
-// throws user-facing api_error::validation if it's not.
-// In particular, verify that the same AttributeName doesn't appear more than
-// once (Issue #13870).
-static void validate_attribute_definitions(const rjson::value& attribute_definitions){
-    if (!attribute_definitions.IsArray()) {
-        throw api_error::validation("AttributeDefinitions must be an array");
-    }
-    std::unordered_set<std::string> seen_attribute_names;
-    for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
-        const rjson::value* attribute_name = rjson::find(*it, "AttributeName");
-        if (!attribute_name) {
-            throw api_error::validation("AttributeName missing in AttributeDefinitions");
-        }
-        if (!attribute_name->IsString()) {
-            throw api_error::validation("AttributeName in AttributeDefinitions must be a string");
-        }
-        auto [it2, added] = seen_attribute_names.emplace(rjson::to_string_view(*attribute_name));
-        if (!added) {
-            throw api_error::validation(format("Duplicate AttributeName={} in AttributeDefinitions",
-                rjson::to_string_view(*attribute_name)));
-        }
-        const rjson::value* attribute_type = rjson::find(*it, "AttributeType");
-        if (!attribute_type) {
-            throw api_error::validation("AttributeType missing in AttributeDefinitions");
-        }
-        if (!attribute_type->IsString()) {
-            throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
-        }
-    }
-}
-
 static future<executor::request_return_type> create_table_on_shard0(tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
    assert(this_shard_id() == 0);

@@ -921,14 +867,11 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    // (e.g., verify that this table doesn't already exist) - we can only
    // do this further down - after taking group0_guard.
    std::string table_name = get_table_name(request);
-    validate_table_name(table_name);
-
    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
        co_return api_error::validation(format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
    }
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value& attribute_definitions = request["AttributeDefinitions"];
-    validate_attribute_definitions(attribute_definitions);

    tracing::add_table_name(trace_state, keyspace_name, table_name);

@@ -1119,7 +1062,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    auto ts = group0_guard.write_timestamp();
    std::vector<mutation> schema_mutations;
    try {
-        schema_mutations = co_await create_keyspace(keyspace_name, sp, gossiper, ts);
+        schema_mutations = co_await create_keyspace(keyspace_name, sp, mm, gossiper, ts);
    } catch (exceptions::already_exists_exception&) {
        if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
            co_return api_error::resource_in_use(format("Table {} already exists", table_name));
@@ -1142,9 +1085,9 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
        db::schema_tables::add_table_or_view_to_schema_mutation(
            view_ptr(view_builder.build()), ts, true, schema_mutations);
    }
-    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), format("alternator-executor: create {} table", table_name));
+    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard));

-    co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
+    co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, sp);
    rjson::add(status, "TableDescription", std::move(request));
@@ -1207,11 +1150,11 @@ future<executor::request_return_type> executor::update_table(client_state& clien

        auto schema = builder.build();

-        auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, false,  std::vector<view_ptr>(), group0_guard.write_timestamp());
+        auto m = co_await mm.prepare_column_family_update_announcement(schema, false,  std::vector<view_ptr>(), group0_guard.write_timestamp());

-        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
+        co_await mm.announce(std::move(m), std::move(group0_guard));

-        co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+        co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);

        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
@@ -1647,7 +1590,7 @@ static parsed::condition_expression get_parsed_condition_expression(rjson::value
        throw api_error::validation("ConditionExpression must not be empty");
    }
    try {
-        return parse_condition_expression(rjson::to_string_view(*condition_expression), "ConditionExpression");
+        return parse_condition_expression(rjson::to_string_view(*condition_expression));
    } catch(expressions_syntax_error& e) {
        throw api_error::validation(e.what());
    }
@@ -1662,16 +1605,17 @@ static bool check_needs_read_before_write(const parsed::condition_expression& co

 // Fail the expression if it has unused attribute names or values. This is
 // how DynamoDB behaves, so we do too.
-static void verify_all_are_used(const rjson::value* field,
-        const std::unordered_set<std::string>& used, const char* field_name, const char* operation) {
-    if (!field) {
+static void verify_all_are_used(const rjson::value& req, const char* field,
+        const std::unordered_set<std::string>& used, const char* operation) {
+    const rjson::value* attribute_names = rjson::find(req, field);
+    if (!attribute_names) {
        return;
    }
-    for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
+    for (auto it = attribute_names->MemberBegin(); it != attribute_names->MemberEnd(); ++it) {
        if (!used.contains(it->name.GetString())) {
            throw api_error::validation(
                format("{} has spurious '{}', not used in {}",
-                    field_name, it->name.GetString(), operation));
+                       field, it->name.GetString(), operation));
        }
    }
 }
@@ -1698,8 +1642,8 @@ public:
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
-            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "PutItem");
-            verify_all_are_used(expression_attribute_values, used_attribute_values,"ExpressionAttributeValues", "PutItem");
+            verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "PutItem");
+            verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "PutItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -1783,8 +1727,8 @@ public:
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
-            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "DeleteItem");
-            verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "DeleteItem");
+            verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "DeleteItem");
+            verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "DeleteItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -2557,8 +2501,8 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso
            expression_attribute_names, expression_attribute_values,
            used_attribute_names, used_attribute_values);

-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "UpdateItem");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "UpdateItem");
+    verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "UpdateItem");
+    verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "UpdateItem");

    // DynamoDB forbids having both old-style AttributeUpdates or Expected
    // and new-style UpdateExpression or ConditionExpression in the same request
@@ -3167,8 +3111,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st

    std::unordered_set<std::string> used_attribute_names;
    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "GetItem");

    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
@@ -3279,8 +3222,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
        rs.cl = get_read_consistency(it->value);
        std::unordered_set<std::string> used_attribute_names;
        rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, used_attribute_names));
-        const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames");
-        verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem");
+        verify_all_are_used(it->value, "ExpressionAttributeNames", used_attribute_names, "GetItem");
        auto& keys = (it->value)["Keys"];
        for (rjson::value& key : keys.GetArray()) {
            rs.add(key);
@@ -3449,7 +3391,7 @@ filter::filter(const rjson::value& request, request_type rt,
            throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet");
        }
        try {
-            auto parsed = parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression");
+            auto parsed = parse_condition_expression(rjson::to_string_view(*expression));
            const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
            const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
            resolve_condition_expression(parsed,
@@ -3853,10 +3795,8 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
    // optimized the filtering by modifying partition_ranges and/or
    // ck_bounds. We haven't done this optimization yet.

-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan");
+    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan");

    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
@@ -4077,7 +4017,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
    // sort-key range.
    parsed::condition_expression p;
    try {
-        p = parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression");
+        p = parse_condition_expression(rjson::to_string_view(expression));
    } catch(expressions_syntax_error& e) {
        throw api_error::validation(e.what());
    }
@@ -4297,17 +4237,13 @@ future<executor::request_return_type> executor::query(client_state& client_state
        throw api_error::validation("Query must have one of "
                "KeyConditions or KeyConditionExpression");
    }
-
-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
-
    // exactly one of key_conditions or key_condition_expression
    auto [partition_ranges, ck_bounds] = key_conditions
                ? calculate_bounds_conditions(schema, *key_conditions)
                : calculate_bounds_condition_expression(schema, *key_condition_expression,
-                        expression_attribute_values,
+                        rjson::find(request, "ExpressionAttributeValues"),
                        used_attribute_values,
-                        expression_attribute_names,
+                        rjson::find(request, "ExpressionAttributeNames"),
                        used_attribute_names);

    filter filter(request, filter::request_type::QUERY,
@@ -4334,8 +4270,8 @@ future<executor::request_return_type> executor::query(client_state& client_state
    select_type select = parse_select(request, table_type);

    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names, select);
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
+    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Query");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query");
    query::partition_slice::option_set opts;
    opts.set_if<query::partition_slice::option::reversed>(!forward);
    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
@@ -4396,17 +4332,6 @@ future<executor::request_return_type> executor::list_tables(client_state& client

 future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
    _stats.api_operations.describe_endpoints++;
-    // The alternator_describe_endpoints configuration can be used to disable
-    // the DescribeEndpoints operation, or set it to return a fixed string
-    std::string override = _proxy.data_dictionary().get_config().alternator_describe_endpoints();
-    if (!override.empty()) {
-        if (override == "disabled") {
-            _stats.unsupported_operations++;
-            return make_ready_future<request_return_type>(api_error::unknown_operation(
-                "DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)"));
-        }
-        host_header = std::move(override);
-    }
    rjson::value response = rjson::empty_object();
    // Without having any configuration parameter to say otherwise, we tell
    // the user to return to the same endpoint they used to reach us. The only
@@ -4444,10 +4369,6 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
    try {
        schema = _proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
    } catch(data_dictionary::no_such_column_family&) {
-        // DynamoDB returns validation error even when table does not exist
-        // and the table name is invalid.
-        validate_table_name(table_name);
-
        throw api_error::table_not_found(
                format("Table {} not found", table_name));
    }
@@ -4467,9 +4388,9 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
 // of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
 // A smaller cluster (presumably, a test only), gets RF=1. The user may
 // manually create the keyspace to override this predefined behavior.
-static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts) {
+static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type ts) {
    sstring keyspace_name_str(keyspace_name);
-    int endpoint_count = gossiper.num_endpoints();
+    int endpoint_count = gossiper.get_endpoint_states().size();
    int rf = 3;
    if (endpoint_count < rf) {
        rf = 1;
@@ -4479,7 +4400,7 @@ static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_n
    auto opts = get_network_topology_options(sp, gossiper, rf);
    auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);

-    co_return service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
+    co_return mm.prepare_new_keyspace_announcement(ksm, ts);
 }

 future<> executor::start() {
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -225,10 +225,9 @@ private:
    friend class rmw_operation;

    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
+    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
    
 public:
-    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
-
    static std::optional<rjson::value> describe_single_item(schema_ptr,
        const query::partition_slice&,
        const cql3::selection::selection&,
@@ -249,7 +248,7 @@ public:

    static void add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
    static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
-    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
+    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
 };

 // is_big() checks approximately if the given JSON value is "bigger" than
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -29,7 +29,7 @@
 namespace alternator {

 template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
-static Result do_with_parser(std::string_view input, Func&& f) {
+Result do_with_parser(std::string_view input, Func&& f) {
    expressionsLexer::InputStreamType input_stream{
        reinterpret_cast<const ANTLR_UINT8*>(input.data()),
        ANTLR_ENC_UTF8,
@@ -43,41 +43,31 @@ static Result do_with_parser(std::string_view input, Func&& f) {
    return result;
 }

-template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
-static Result parse(const char* input_name, std::string_view input, Func&& f) {
-    if (input.length() > 4096) {
-        throw expressions_syntax_error(format("{} expression size {} exceeds allowed maximum 4096.",
-            input_name, input.length()));
-    }
-    try {
-        return do_with_parser(input, f);
-    } catch (expressions_syntax_error& e) {
-        // If already an expressions_syntax_error, don't print the type's
-        // name (it's just ugly), just the message.
-        // TODO: displayRecognitionError could set a position inside the
-        // expressions_syntax_error in throws, and we could use it here to
-        // mark the broken position in 'input'.
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
-            input_name, input, e.what()));
-    } catch (...) {
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
-            input_name, input, std::current_exception()));
-    }
-}
-
 parsed::update_expression
 parse_update_expression(std::string_view query) {
-    return parse("UpdateExpression", query,  std::mem_fn(&expressionsParser::update_expression));
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::update_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing UpdateExpression '{}': {}", query, std::current_exception()));
+    }
 }

 std::vector<parsed::path>
 parse_projection_expression(std::string_view query) {
-    return parse ("ProjectionExpression", query,  std::mem_fn(&expressionsParser::projection_expression));
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::projection_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ProjectionExpression '{}': {}", query, std::current_exception()));
+    }
 }

 parsed::condition_expression
-parse_condition_expression(std::string_view query, const char* caller) {
-    return parse(caller, query,  std::mem_fn(&expressionsParser::condition_expression));
+parse_condition_expression(std::string_view query) {
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
 }

 namespace parsed {
@@ -428,14 +418,9 @@ void for_condition_expression_on(const parsed::condition_expression& ce, const n
 // calculate_size() is ConditionExpression's size() function, i.e., it takes
 // a JSON-encoded value and returns its "size" as defined differently for the
 // different types - also as a JSON-encoded number.
-// If the value's type (e.g. number) has no size defined, there are two cases:
-// 1. If from_data (the value came directly from an attribute of the data),
-//    It returns a JSON-encoded "null" value. Comparisons against this
-//    non-numeric value will later fail, so eventually the application will
-//    get a ConditionalCheckFailedException.
-// 2. Otherwise (the value came from a constant in the query or some other
-//    calculation), throw a ValidationException.
-static rjson::value calculate_size(const rjson::value& v, bool from_data) {
+// It returns a JSON-encoded "null" value if this value's type has no size
+// defined. Comparisons against this non-numeric value will later fail.
+static rjson::value calculate_size(const rjson::value& v) {
    // NOTE: If v is improperly formatted for our JSON value encoding, it
    // must come from the request itself, not from the database, so it makes
    // sense to throw a ValidationException if we see such a problem.
@@ -464,12 +449,10 @@ static rjson::value calculate_size(const rjson::value& v, bool from_data) {
            throw api_error::validation(format("invalid byte string: {}", v));
        }
        ret = base64_decoded_len(rjson::to_string_view(it->value));
-    } else if (from_data) {
+    } else {
        rjson::value json_ret = rjson::empty_object();
        rjson::add(json_ret, "null", rjson::value(true));
        return json_ret;
-    } else {
-        throw api_error::validation(format("Unsupported operand type {} for function size()", it->name));
    }
    rjson::value json_ret = rjson::empty_object();
    rjson::add(json_ret, "N", rjson::from_string(std::to_string(ret)));
@@ -551,7 +534,7 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                        format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
-            return calculate_size(v, f._parameters[0].is_path());
+            return calculate_size(v);
        }
    },
    {"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -679,7 +662,7 @@ static rjson::value extract_path(const rjson::value* item,
            // objects. But today Alternator does not validate the structure
            // of nested documents before storing them, so this can happen on
            // read.
-            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
        }
        const char* type = v->MemberBegin()->name.GetString();
        v = &(v->MemberBegin()->value);
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -74,22 +74,7 @@ options {
 */
@parser::context {
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
-        const char* err;
-        switch (ex->getType()) {
-        case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
-            err = "expression nested too deeply";
-            break;
-        default:
-            err = "syntax error";
-            break;
-        }
-        // Alternator expressions are always single line so ex->get_line()
-        // is always 1, no sense to print it.
-        // TODO: return the position as part of the exception, so the
-        // caller in expressions.cc that knows the expression string can
-        // mark the error position in the final error message.
-        throw expressions_syntax_error(format("{} at char {}", err,
-            ex->get_charPositionInLine()));
+        throw expressions_syntax_error("syntax error");
    }
 }
@lexer::context {
@@ -98,23 +83,6 @@ options {
    }
 }

-/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
- * expressions. To make things even worse, ANTLR has no way to limit the
- * depth of this recursion (unlike Yacc which has YYMAXDEPTH). So deeply-
- * nested expression like "(((((((((((((..." can easily crash Scylla on a
- * stack overflow (see issue #14477).
- *
- * We are lucky that in the grammar for DynamoDB expressions (below),
- * only a few specific rules can recurse, so it was fairly easy to add a
- * "depth" counter to a few specific rules, and then use a predicate
- * "{depth<MAX_DEPTH}?" to avoid parsing if the depth exceeds this limit,
- * and throw a FAILED_PREDICATE_EXCEPTION in that case, which we will
- * report to the user as a "expression nested too deeply" error.
- */
-@parser::members {
-    static constexpr int MAX_DEPTH = 400;
-}
-
 /*
 * Lexical analysis phase, i.e., splitting the input up to tokens.
 * Lexical analyzer rules have names starting in capital letters.
@@ -187,20 +155,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-/* See comment above why the "depth" counter was needed here */
-value[int depth] returns [parsed::value v]:
+value returns [parsed::value v]:
      VALREF       { $v.set_valref($VALREF.text); }
    | path         { $v.set_path($path.p); }
-    | {depth<MAX_DEPTH}? NAME { $v.set_func_name($NAME.text); }
-     '(' x=value[depth+1]    { $v.add_func_parameter($x.v); }
-     (',' x=value[depth+1]   { $v.add_func_parameter($x.v); })*
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=value[0]  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=value[0]  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=value[0]  { $rhs.set_minus(std::move($v.v)); }
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -238,7 +205,7 @@ projection_expression returns [std::vector<parsed::path> v]:


 primitive_condition returns [parsed::primitive_condition c]:
-      v=value[0]      { $c.add_value(std::move($v.v));
+      v=value         { $c.add_value(std::move($v.v));
                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
@@ -247,14 +214,14 @@ primitive_condition returns [parsed::primitive_condition c]:
          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
         )
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
         AND
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
-         v=value[0]   { $c.add_value(std::move($v.v)); }
-         (',' v=value[0] { $c.add_value(std::move($v.v)); })*
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
         ')'
      )?
    ;
@@ -264,20 +231,19 @@ primitive_condition returns [parsed::primitive_condition c]:
 // common rule prefixes, and (lack of) support for operator precedence.
 // These rules could have been written more clearly using a more powerful
 // parser generator - such as Yacc.
-// See comment above why the "depth" counter was needed here.
-boolean_expression[int depth] returns [parsed::condition_expression e]:
-	  b=boolean_expression_1[depth]       { $e.append(std::move($b.e), '|'); }
-	  (OR b=boolean_expression_1[depth]   { $e.append(std::move($b.e), '|'); } )*
+boolean_expression returns [parsed::condition_expression e]:
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
 	;
-boolean_expression_1[int depth] returns [parsed::condition_expression e]:
-	  b=boolean_expression_2[depth]       { $e.append(std::move($b.e), '&'); }
-	  (AND b=boolean_expression_2[depth]  { $e.append(std::move($b.e), '&'); } )*
+boolean_expression_1 returns [parsed::condition_expression e]:
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
 	;
-boolean_expression_2[int depth] returns [parsed::condition_expression e]:
+boolean_expression_2 returns [parsed::condition_expression e]:
 	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
-	| {depth<MAX_DEPTH}? NOT b=boolean_expression_2[depth+1]   { $e = std::move($b.e); $e.apply_not(); }
-	| {depth<MAX_DEPTH}? '(' b=boolean_expression[depth+1] ')' { $e = std::move($b.e); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
    ;

 condition_expression returns [parsed::condition_expression e]:
-    boolean_expression[0] { e=std::move($boolean_expression.e); } EOF;
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -28,7 +28,7 @@ public:

 parsed::update_expression parse_update_expression(std::string_view query);
 std::vector<parsed::path> parse_projection_expression(std::string_view query);
-parsed::condition_expression parse_condition_expression(std::string_view query, const char* caller);
+parsed::condition_expression parse_condition_expression(std::string_view query);

 void resolve_update_expression(parsed::update_expression& ue,
        const rjson::value* expression_attribute_names,
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -208,10 +208,7 @@ protected:
        sstring local_dc = topology.get_datacenter();
        std::unordered_set<gms::inet_address> local_dc_nodes = topology.get_datacenter_endpoints().at(local_dc);
        for (auto& ip : local_dc_nodes) {
-            // Note that it's not enough for the node to be is_alive() - a
-            // node joining the cluster is also "alive" but not responsive to
-            // requests. We need the node to be in normal state. See #19694.
-            if (_gossiper.is_normal(ip)) {
+            if (_gossiper.is_alive(ip)) {
                rjson::push_back(results, rjson::from_string(ip.to_sstring()));
            }
        }
@@ -427,7 +424,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    co_await client_state.maybe_update_per_service_level_params();

    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
-    tracing::trace(trace_state, "{}", op);
+    tracing::trace(trace_state, op);
    rjson::value json_request = co_await _json_parser.parse(std::move(content));
    co_return co_await callback_it->second(_executor, client_state, trace_state,
            make_service_permit(std::move(units)), std::move(json_request), std::move(req));
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -1096,7 +1096,7 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
    }
 }

-void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
+void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
    auto& opts = schema.cdc_options();
    if (opts.enabled()) {
        auto db = sp.data_dictionary();
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -430,7 +430,6 @@ class token_ranges_owned_by_this_shard {
    size_t _range_idx;
    size_t _end_idx;
    std::optional<dht::selective_token_range_sharder> _intersecter;
-    locator::effective_replication_map_ptr _erm;
 public:
    token_ranges_owned_by_this_shard(replica::database& db, gms::gossiper& g, schema_ptr s)
        :  _s(s)
@@ -438,7 +437,6 @@ public:
                g, utils::fb_utilities::get_broadcast_address())
        , _range_idx(random_offset(0, _token_ranges.size() - 1))
        , _end_idx(_range_idx + _token_ranges.size())
-        , _erm(s->table().get_effective_replication_map())
    {
        tlogger.debug("Generating token ranges starting from base range {} of {}", _range_idx, _token_ranges.size());
    }
@@ -471,7 +469,7 @@ public:
                    return std::nullopt;
                }
            }
-            _intersecter.emplace(_erm->get_sharder(*_s), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
+            _intersecter.emplace(_s->get_sharder(), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
        }
    }

--- a/api/CMakeLists.txt
+++ b/api/CMakeLists.txt
@@ -14,7 +14,6 @@ set(swagger_files
  api-doc/hinted_handoff.json
  api-doc/lsa.json
  api-doc/messaging_service.json
-  api-doc/metrics.json
  api-doc/storage_proxy.json
  api-doc/storage_service.json
  api-doc/stream_manager.json
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -84,14 +84,6 @@
                     "type":"string",
                     "paramType":"path"
                  },
-                  {
-                     "name":"flush_memtables",
-                     "description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when the table is flushed explicitly before invoking the compaction api.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
-                  },
                  {
                     "name":"split_output",
                     "description":"true if the output of the major compaction should be split in several sstables",
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -34,14 +34,6 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"parameters",
-                     "description":"dict of parameters to pass to the injection (json format)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"dict",
-                     "paramType":"body"
                  }
               ]
            },
@@ -66,30 +58,6 @@
            }
         ]
      },
-      {
-         "path":"/v2/error_injection/injection/{injection}/message",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Send message to trigger an event in injection's code",
-               "type":"void",
-               "nickname":"message_injection",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"injection",
-                     "description":"injection name, should correspond to an injection added in code",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
      {
         "path":"/v2/error_injection/injection",
         "operations":[
@@ -118,15 +86,5 @@
            }
         ]
      }
-   ],
-   "components":{
-      "schemas": {
-         "dict": {
-            "type": "object",
-            "additionalProperties": {
-               "type": "string"
-            }
-         }
-      }
-   }
+   ]
 }
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -245,7 +245,7 @@
                 "GOSSIP_SHUTDOWN",
                 "DEFINITIONS_UPDATE",
                 "TRUNCATE",
-                 "UNUSED__REPLICATION_FINISHED",
+                 "REPLICATION_FINISHED",
                 "MIGRATION_REQUEST",
                 "PREPARE_MESSAGE",
                 "PREPARE_DONE_MESSAGE",
--- a/api/api-doc/metrics.def.json
+++ b/api/api-doc/metrics.def.json
@@ -1,34 +0,0 @@
-    "metrics_config": {
-        "id": "metrics_config",
-        "summary": "An entry in the metrics configuration",
-        "properties": {
-            "source_labels": {
-                "type": "array",
-                "items": {
-                    "type": "string"
-                },
-                "description": "The source labels, a match is based on concatination of the labels"
-            },
-            "action": {
-                "type": "string",
-                "description": "The action to perfrom on match",
-                "enum": ["skip_when_empty", "report_when_empty", "replace", "keep", "drop", "drop_label"]
-            },
-            "target_label": {
-                "type": "string",
-                "description": "The application state version"
-            },
-            "replacement": {
-                "type": "string",
-                "description": "The replacement string to use when replacing a value"
-            },
-            "regex": {
-                "type": "string",
-                "description": "The regex string to use when replacing a value"
-            },
-            "separator": {
-                "type": "string",
-                "description": "The separator string to use when concatinating the labels"
-            }
-        }
-    }
--- a/api/api-doc/metrics.json
+++ b/api/api-doc/metrics.json
@@ -1,66 +0,0 @@
-    "/v2/metrics-config/":{
-        "get":{
-            "description":"Return the metrics layer configuration",
-            "operationId":"get_metrics_config",
-            "produces":[
-                "application/json"
-            ],
-            "tags":[
-                "metrics"
-            ],
-            "parameters":[
-            ],
-            "responses":{
-                "200":{
-                "schema": {
-                    "type":"array",
-                    "items":{
-                        "$ref":"#/definitions/metrics_config",
-                        "description":"metrics Config value"
-                    }
-                    }
-                },
-                "default":{
-                    "description":"unexpected error",
-                    "schema":{
-                        "$ref":"#/definitions/ErrorModel"
-                    }
-                }
-            }
-        },
-        "post": {
-             "description":"Set the metrics layer relabel configuration",
-            "operationId":"set_metrics_config",
-            "produces":[
-                "application/json"
-            ],
-            "tags":[
-                "metrics"
-            ],
-            "parameters":[
-               {
-                "in":"body",
-                "name":"conf",
-                "description":"An array of relabel_config objects",
-                "schema": {
-                    "type":"array",
-                    "items":{
-                        "$ref":"#/definitions/metrics_config",
-                        "description":"metrics Config value"
-                    }
-                }
-               }
-            ],
-            "responses":{
-                "200":{
-                    "description": "OK"
-                },
-                "default":{
-                    "description":"unexpected error",
-                    "schema":{
-                        "$ref":"#/definitions/ErrorModel"
-                    }
-                }
-            }
-        }
-    }
--- a/api/api-doc/raft.json
+++ b/api/api-doc/raft.json
@@ -1,43 +0,0 @@
-{
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/raft",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/raft/trigger_snapshot/{group_id}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Triggers snapshot creation and log truncation for the given Raft group",
-               "type":"string",
-               "nickname":"trigger_snapshot",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"group_id",
-                     "description":"The ID of the group which should get snapshotted",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"timeout",
-                     "description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"long",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      }
-   ]
-}
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -465,7 +465,7 @@
         "operations":[
            {
               "method":"GET",
-               "summary":"Retrieve the mapping of endpoint to host ID of all nodes that own tokens",
+               "summary":"Retrieve the mapping of endpoint to host ID",
               "type":"array",
               "items":{
                  "type":"mapper"
@@ -701,30 +701,6 @@
            }
         ]
      },
-      {
-         "path":"/storage_service/compact",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Forces major compaction in all keyspaces",
-               "type":"void",
-               "nickname":"force_compaction",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"flush_memtables",
-                     "description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when tables were flushed explicitly before invoking the compaction api.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
      {
         "path":"/storage_service/keyspace_compaction/{keyspace}",
         "operations":[
@@ -752,14 +728,6 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"flush_memtables",
-                     "description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when tables were flushed explicitly before invoking the compaction api.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
                  }
               ]
            }
@@ -944,21 +912,6 @@
            }
         ]
      },
-      {
-         "path":"/storage_service/flush",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Flush all memtables in all keyspaces.",
-               "type":"void",
-               "nickname":"force_flush",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[]
-            }
-         ]
-      },
      {
         "path":"/storage_service/keyspace_flush/{keyspace}",
         "operations":[
@@ -1161,14 +1114,6 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"ranges_parallelism",
-                     "description":"An integer specifying the number of ranges to repair in parallel by user request. If this number is bigger than the max_repair_ranges_in_parallel calculated by Scylla core, the smaller one will be used.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
                  }
               ]
            },
@@ -2001,7 +1946,7 @@
         "operations":[
            {
               "method":"POST",
-               "summary":"Forces this node to recalculate versions of schema objects.",
+               "summary":"Reset local schema",
               "type":"void",
               "nickname":"reset_local_schema",
               "produces":[
@@ -2542,23 +2487,7 @@
               ]
            }
         ]
-      },
-      {
-         "path":"/storage_service/raft_topology/reload",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Reload Raft topology state from disk.",
-               "type":"void",
-               "nickname":"reload_raft_topology_state",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      }
+      }
   ],
   "models":{
      "mapper":{
--- a/api/api-doc/swagger20_header.json
+++ b/api/api-doc/swagger20_header.json
@@ -16,7 +16,7 @@
    }
  },
  "host": "{{Host}}",
-  "basePath": "/",
+  "basePath": "/v2",
  "schemes": [
    "http"
  ],
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -1,182 +1,182 @@
 {
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/task_manager",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/task_manager/list_modules",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get all modules names",
-               "type":"array",
-               "items":{
-                  "type":"string"
-               },
-               "nickname":"get_modules",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/list_module_tasks/{module}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get a list of tasks",
-               "type":"array",
-               "items":{
-                  "type":"task_stats"
-               },
-               "nickname":"get_tasks",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"module",
-                     "description":"The module to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"internal",
-                     "description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"keyspace",
-                     "description":"The keyspace to query about",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"table",
-                     "description":"The table to query about",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/task_status/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get task status",
-               "type":"task_status",
-               "nickname":"get_task_status",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/abort_task/{task_id}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Abort running task and its descendants",
-               "type":"void",
-               "nickname":"abort_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to abort",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/wait_task/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Wait for a task to complete",
-               "type":"task_status",
-               "nickname":"wait_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to wait for",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/task_status_recursive/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get statuses of the task and all its descendants",
-               "type":"array",
-               "items":{
-                  "type":"task_status"
-               },
-               "nickname":"get_task_status_recursively",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
+    "apiVersion":"0.0.1",
+    "swaggerVersion":"1.2",
+    "basePath":"{{Protocol}}://{{Host}}",
+    "resourcePath":"/task_manager",
+    "produces":[
+       "application/json"
+    ],
+    "apis":[
+       {
+          "path":"/task_manager/list_modules",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get all module names",
+                "type":"array",
+                "items":{
+                   "type":"string"
+                },
+                "nickname":"get_modules",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/list_module_tasks/{module}",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get a list of tasks",
+                "type":"array",
+                "items":{
+                    "type":"task_stats"
+                },
+                "nickname":"get_tasks",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"module",
+                        "description":"The module to query about",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"path"
+                    },
+                    {
+                        "name":"internal",
+                        "description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"boolean",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"keyspace",
+                        "description":"The keyspace to query about",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"table",
+                        "description":"The table to query about",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/task_status/{task_id}",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get task status",
+                "type":"task_status",
+                "nickname":"get_task_status",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to query about",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"path"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/abort_task/{task_id}",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Abort running task and its descendants",
+                "type":"void",
+                "nickname":"abort_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                   {
+                      "name":"task_id",
+                      "description":"The uuid of a task to abort",
+                      "required":true,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"path"
+                   }
+                ]
+             }
+          ]
+       },
+       {
+        "path":"/task_manager/wait_task/{task_id}",
+        "operations":[
+           {
+              "method":"GET",
+              "summary":"Wait for a task to complete",
+              "type":"task_status",
+              "nickname":"wait_task",
+              "produces":[
+                 "application/json"
+              ],
+              "parameters":[
+                 {
+                    "name":"task_id",
+                    "description":"The uuid of a task to wait for",
+                    "required":true,
+                    "allowMultiple":false,
+                    "type":"string",
+                    "paramType":"path"
+                 }
+              ]
+           }
+        ]
+     },
+     {
+      "path":"/task_manager/task_status_recursive/{task_id}",
+      "operations":[
+         {
+            "method":"GET",
+            "summary":"Get statuses of the task and all its descendants",
+            "type":"array",
+            "items":{
+               "type":"task_status"
+            },
+            "nickname":"get_task_status_recursively",
+            "produces":[
+               "application/json"
+            ],
+            "parameters":[
+                {
+                    "name":"task_id",
+                    "description":"The uuid of a task to query about",
+                    "required":true,
+                    "allowMultiple":false,
+                    "type":"string",
+                    "paramType":"path"
+                }
+            ]
+         }
+      ]
+     },
+     {
         "path":"/task_manager/ttl",
         "operations":[
            {
@@ -199,96 +199,88 @@
               ]
            }
         ]
-      }
-   ],
-   "models":{
-      "task_stats" :{
-         "id": "task_stats",
-         "description":"A task statistics object",
-         "properties":{
-            "task_id":{
-               "type":"string",
-               "description":"The uuid of a task"
-            },
-            "state":{
-               "type":"string",
-               "enum":[
+     }
+    ],
+    "models":{
+       "task_stats" :{
+           "id": "task_stats",
+           "description":"A task statistics object",
+           "properties":{
+             "task_id":{
+                "type":"string",
+                "description":"The uuid of a task"
+             },
+             "state":{
+                "type":"string",
+                "enum":[
                  "created",
                  "running",
                  "done",
                  "failed"
-               ],
-               "description":"The state of a task"
-            },
-            "type":{
-               "type":"string",
-               "description":"The description of the task"
-            },
-            "scope":{
-               "type":"string",
-               "description":"The scope of the task"
-            },
-            "keyspace":{
-               "type":"string",
-               "description":"The keyspace the task is working on (if applicable)"
-            },
-            "table":{
-               "type":"string",
-               "description":"The table the task is working on (if applicable)"
-            },
-            "entity":{
-               "type":"string",
-               "description":"Task-specific entity description"
-            },
-            "sequence_number":{
-               "type":"long",
-               "description":"The running sequence number of the task"
-            }
-         }
-      },
-      "task_status":{
-         "id":"task_status",
-         "description":"A task status object",
-         "properties":{
-            "id":{
-               "type":"string",
-               "description":"The uuid of the task"
-            },
-            "type":{
-               "type":"string",
-               "description":"The description of the task"
-            },
-            "scope":{
-               "type":"string",
-               "description":"The scope of the task"
-            },
-            "state":{
+                ],
+                "description":"The state of a task"
+             },
+             "type":{
+                "type":"string",
+                "description":"The description of the task"
+             },
+             "keyspace":{
+                "type":"string",
+                "description":"The keyspace the task is working on (if applicable)"
+             },
+             "table":{
+                "type":"string",
+                "description":"The table the task is working on (if applicable)"
+             },
+             "entity":{
+                "type":"string",
+                "description":"Task-specific entity description"
+             },
+             "sequence_number":{
+                "type":"long",
+                "description":"The running sequence number of the task"
+             }
+           }
+       },
+       "task_status":{
+          "id":"task_status",
+          "description":"A task status object",
+          "properties":{
+             "id":{
+                "type":"string",
+                "description":"The uuid of the task"
+             },
+             "type":{
+                "type":"string",
+                "description":"The description of the task"
+             },
+             "state":{
               "type":"string",
               "enum":[
-                  "created",
-                  "running",
-                  "done",
-                  "failed"
+                 "created",
+                 "running",
+                 "done",
+                 "failed"
               ],
-               "description":"The state of the task"
-            },
-            "is_abortable":{
-               "type":"boolean",
-               "description":"Boolean flag indicating whether the task can be aborted"
-            },
-            "start_time":{
-               "type":"datetime",
-               "description":"The start time of the task"
-            },
-            "end_time":{
-               "type":"datetime",
-               "description":"The end time of the task (unspecified when the task is not completed)"
-            },
-            "error":{
-               "type":"string",
-               "description":"Error string, if the task failed"
-            },
-            "parent_id":{
+                "description":"The state of the task"
+             },
+             "is_abortable":{
+                "type":"boolean",
+                "description":"Boolean flag indicating whether the task can be aborted"
+             },
+             "start_time":{
+                "type":"datetime",
+                "description":"The start time of the task"
+             },
+             "end_time":{
+                "type":"datetime",
+                "description":"The end time of the task (unspecified when the task is not completed)"
+             },
+             "error":{
+                "type":"string",
+                "description":"Error string, if the task failed"
+             },
+             "parent_id":{
               "type":"string",
               "description":"The uuid of the parent task"
            },
@@ -326,12 +318,12 @@
            },
            "children_ids":{
               "type":"array",
-               "items":{
-                  "type":"string"
-               },
+                "items":{
+                    "type":"string"
+                },
               "description":"Task IDs of children of this task"
            }
-         }
-      }
-   }
-}
+          }
+       }
+    }
+ }
--- a/api/api-doc/task_manager_test.json
+++ b/api/api-doc/task_manager_test.json
@@ -1,153 +1,153 @@
 {
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/task_manager_test",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/task_manager_test/test_module",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Register test module in task manager",
-               "type":"void",
-               "nickname":"register_test_module",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Unregister test module in task manager",
-               "type":"void",
-               "nickname":"unregister_test_module",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager_test/test_task",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Register test task",
-               "type":"string",
-               "nickname":"register_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to register",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"shard",
-                     "description":"The shard of the task",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"long",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"parent_id",
-                     "description":"The uuid of a parent task",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"keyspace",
-                     "description":"The keyspace the task is working on",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"table",
-                     "description":"The table the task is working on",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"entity",
-                     "description":"Task-specific entity description",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Unregister test task",
-               "type":"void",
-               "nickname":"unregister_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to register",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager_test/finish_test_task/{task_id}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Finish test task",
-               "type":"void",
-               "nickname":"finish_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to finish",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"error",
-                     "description":"The error with which task fails (if it does)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      }
-   ]
-}
+    "apiVersion":"0.0.1",
+    "swaggerVersion":"1.2",
+    "basePath":"{{Protocol}}://{{Host}}",
+    "resourcePath":"/task_manager_test",
+    "produces":[
+       "application/json"
+    ],
+    "apis":[
+       {
+          "path":"/task_manager_test/test_module",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Register test module in task manager",
+                "type":"void",
+                "nickname":"register_test_module",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             },
+             {
+                "method":"DELETE",
+                "summary":"Unregister test module in task manager",
+                "type":"void",
+                "nickname":"unregister_test_module",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager_test/test_task",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Register test task",
+                "type":"string",
+                "nickname":"register_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to register",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"shard",
+                        "description":"The shard of the task",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"long",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"parent_id",
+                        "description":"The uuid of a parent task",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"keyspace",
+                        "description":"The keyspace the task is working on",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"table",
+                        "description":"The table the task is working on",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"entity",
+                        "description":"Task-specific entity description",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             },
+             {
+                "method":"DELETE",
+                "summary":"Unregister test task",
+                "type":"void",
+                "nickname":"unregister_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to unregister",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager_test/finish_test_task/{task_id}",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Finish test task",
+                "type":"void",
+                "nickname":"finish_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                   {
+                      "name":"task_id",
+                      "description":"The uuid of a task to finish",
+                      "required":true,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"path"
+                   },
+                   {
+                      "name":"error",
+                      "description":"The error with which task fails (if it does)",
+                      "required":false,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"query"
+                   }
+                ]
+             }
+          ]
+       }
+    ]
+ }
--- a/api/api.cc
+++ b/api/api.cc
@@ -31,7 +31,6 @@
 #include "api/config.hh"
 #include "task_manager.hh"
 #include "task_manager_test.hh"
-#include "raft.hh"

 logging::logger apilog("api");

@@ -61,10 +60,8 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        rb02->register_api_file(r, "metrics");
        rb->register_function(r, "system",
                "The system related API");
-        rb02->add_definitions_file(r, "metrics");
        set_system(ctx, r);
    });
 }
@@ -72,7 +69,7 @@ future<> set_server_init(http_context& ctx) {
 future<> set_server_config(http_context& ctx, const db::config& cfg) {
    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
    return ctx.http_server.set_routes([&ctx, &cfg, rb02](routes& r) {
-        set_config(rb02, ctx, r, cfg, false);
+        set_config(rb02, ctx, r, cfg);
    });
 }

@@ -103,16 +100,12 @@ future<> unset_rpc_controller(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_rpc_controller(ctx, r); });
 }

-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    return register_api(ctx, "storage_service", "The storage service API", [&ss, &group0_client] (http_context& ctx, routes& r) {
-            set_storage_service(ctx, r, ss, group0_client);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
+    return register_api(ctx, "storage_service", "The storage service API", [&ss, &g, &cdc_gs, &sys_ks] (http_context& ctx, routes& r) {
+            set_storage_service(ctx, r, ss, g.local(), cdc_gs, sys_ks);
        });
 }

-future<> unset_server_storage_service(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
-}
-
 future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader) {
    return ctx.http_server.set_routes([&ctx, &sst_loader] (routes& r) { set_sstables_loader(ctx, r, sst_loader); });
 }
@@ -194,10 +187,10 @@ future<> unset_server_messaging_service(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_messaging_service(ctx, r); });
 }

-future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy) {
+future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss) {
    return register_api(ctx, "storage_proxy",
-                "The storage proxy API", [&proxy] (http_context& ctx, routes& r) {
-                    set_storage_proxy(ctx, r, proxy);
+                "The storage proxy API", [&ss] (http_context& ctx, routes& r) {
+                    set_storage_proxy(ctx, r, ss);
                });
 }

@@ -221,10 +214,10 @@ future<> set_server_cache(http_context& ctx) {
            "The cache service API", set_cache_service);
 }

-future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& proxy) {
+future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g) {
    return register_api(ctx, "hinted_handoff",
-                "The hinted handoff API", [&proxy] (http_context& ctx, routes& r) {
-                    set_hinted_handoff(ctx, r, proxy);
+                "The hinted handoff API", [&g] (http_context& ctx, routes& r) {
+                    set_hinted_handoff(ctx, r, g.local());
                });
 }

@@ -295,18 +288,6 @@ future<> set_server_task_manager_test(http_context& ctx) {

 #endif

-future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) {
-    auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
-    return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) {
-        rb->register_function(r, "raft", "The Raft API");
-        set_raft(ctx, r, raft_gr);
-    });
-}
-
-future<> unset_server_raft(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
-}
-
 void req_params::process(const request& req) {
    // Process mandatory parameters
    for (auto& [name, ent] : params) {
@@ -314,7 +295,7 @@ void req_params::process(const request& req) {
            continue;
        }
        try {
-            ent.value = req.get_path_param(name);
+            ent.value = req.param[name];
        } catch (std::out_of_range&) {
            throw httpd::bad_param_exception(fmt::format("Mandatory parameter '{}' was not provided", name));
        }
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -22,8 +22,6 @@ namespace service {
 class load_meter;
 class storage_proxy;
 class storage_service;
-class raft_group0_client;
-class raft_group_registry;

 } // namespace service

@@ -53,6 +51,7 @@ class system_keyspace;
 }
 namespace netw { class messaging_service; }
 class repair_service;
+namespace cdc { class generation_service; }

 namespace gms {

@@ -69,13 +68,15 @@ struct http_context {
    sstring api_doc;
    httpd::http_server_control http_server;
    distributed<replica::database>& db;
+    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
    const sharded<locator::shared_token_metadata>& shared_token_metadata;
    sharded<tasks::task_manager>& tm;

    http_context(distributed<replica::database>& _db,
+            distributed<service::storage_proxy>& _sp,
            service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm, sharded<tasks::task_manager>& _tm)
-            : db(_db), lmeter(_lm), shared_token_metadata(_stm), tm(_tm) {
+            : db(_db), sp(_sp), lmeter(_lm), shared_token_metadata(_stm), tm(_tm) {
    }

    const locator::token_metadata& get_token_metadata();
@@ -85,8 +86,7 @@ future<> set_server_init(http_context& ctx);
 future<> set_server_config(http_context& ctx, const db::config& cfg);
 future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
 future<> unset_server_snitch(http_context& ctx);
-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
-future<> unset_server_storage_service(http_context& ctx);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks);
 future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
 future<> unset_server_sstables_loader(http_context& ctx);
 future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb);
@@ -106,11 +106,11 @@ future<> set_server_load_sstable(http_context& ctx, sharded<db::system_keyspace>
 future<> unset_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
 future<> unset_server_messaging_service(http_context& ctx);
-future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy);
+future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss);
 future<> unset_server_storage_proxy(http_context& ctx);
 future<> set_server_stream_manager(http_context& ctx, sharded<streaming::stream_manager>& sm);
 future<> unset_server_stream_manager(http_context& ctx);
-future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& p);
+future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g);
 future<> unset_hinted_handoff(http_context& ctx);
 future<> set_server_gossip_settle(http_context& ctx, sharded<gms::gossiper>& g);
 future<> set_server_cache(http_context& ctx);
@@ -118,7 +118,5 @@ future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx, lw_shared_ptr<db::config> cfg);
 future<> set_server_task_manager_test(http_context& ctx);
-future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
-future<> unset_server_raft(http_context&);

 }
--- a/api/authorization_cache.cc
+++ b/api/authorization_cache.cc
@@ -11,7 +11,6 @@
 #include "api/authorization_cache.hh"
 #include "api/api.hh"
 #include "auth/common.hh"
-#include "auth/service.hh"

 namespace api {
 using namespace json;
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -54,7 +54,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [](std::unique_ptr<request> req) {

-        auto id = ::make_shared<scollectd::type_instance_id>(req->get_path_param("pluginid"),
+        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

@@ -91,7 +91,7 @@ void set_collectd(http_context& ctx, routes& r) {
    });

    cd::enable_collectd.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
-        std::regex plugin(req->get_path_param("pluginid").c_str());
+        std::regex plugin(req->param["pluginid"].c_str());
        std::regex instance(str_to_regex(req->get_query_param("instance")));
        std::regex type(str_to_regex(req->get_query_param("type")));
        std::regex type_instance(str_to_regex(req->get_query_param("type_instance")));
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -43,7 +43,7 @@ std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name) {
    return std::make_tuple(name.substr(0, pos), name.substr(end));
 }

-table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
+const table_id& get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
    try {
        return db.find_uuid(ks, cf);
    } catch (replica::no_such_column_family& e) {
@@ -51,7 +51,7 @@ table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database&
    }
 }

-table_id get_uuid(const sstring& name, const replica::database& db) {
+const table_id& get_uuid(const sstring& name, const replica::database& db) {
    auto [ks, cf] = parse_fully_qualified_cf_name(name);
    return get_uuid(ks, cf, db);
 }
@@ -135,9 +135,9 @@ static future<json::json_return_type>  get_cf_histogram(http_context& ctx, const
 static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
    std::function<utils::ihistogram(const replica::database&)> fun = [f] (const replica::database& db)  {
        utils::ihistogram res;
-        db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) mutable {
-            res += (table->get_stats().*f).hist;
-        });
+        for (auto i : db.get_column_families()) {
+            res += (i.second->get_stats().*f).hist;
+        }
        return res;
    };
    return ctx.db.map(fun).then([](const std::vector<utils::ihistogram> &res) {
@@ -162,9 +162,9 @@ static future<json::json_return_type>  get_cf_rate_and_histogram(http_context& c
 static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
    std::function<utils::rate_moving_average_and_histogram(const replica::database&)> fun = [f] (const replica::database& db)  {
        utils::rate_moving_average_and_histogram res;
-        db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
-            res += (table->get_stats().*f).rate();
-        });
+        for (auto i : db.get_column_families()) {
+            res += (i.second->get_stats().*f).rate();
+        }
        return res;
    };
    return ctx.db.map(fun).then([](const std::vector<utils::rate_moving_average_and_histogram> &res) {
@@ -306,21 +306,21 @@ ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared
 void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace>& sys_ks) {
    cf::get_column_family_name.set(r, [&ctx] (const_req req){
        std::vector<sstring> res;
-        ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
-            res.push_back(kscf.first + ":" + kscf.second);
-        });
+        for (auto i: ctx.db.local().get_column_families_mapping()) {
+            res.push_back(i.first.first + ":" + i.first.second);
+        }
        return res;
    });

    cf::get_column_family.set(r, [&ctx] (std::unique_ptr<http::request> req){
-        std::list<cf::column_family_info> res;
-            ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
+            std::list<cf::column_family_info> res;
+            for (auto i: ctx.db.local().get_column_families_mapping()) {
                cf::column_family_info info;
-                info.ks = kscf.first;
-                info.cf =  kscf.second;
+                info.ks = i.first.first;
+                info.cf =  i.first.second;
                info.type = "ColumnFamilies";
                res.push_back(info);
-            });
+            }
            return make_ready_future<json::json_return_type>(json::stream_range_as_array(std::move(res), std::identity()));
        });

@@ -333,7 +333,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t{0}, [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](replica::column_family& cf) {
            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
        }, std::plus<>());
    });
@@ -353,7 +353,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
                return active_memtable->region().occupancy().total_space();
            }), uint64_t(0));
@@ -369,7 +369,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
                return active_memtable->region().occupancy().used_space();
            }), uint64_t(0));
@@ -394,7 +394,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::get_cf_all_memtables_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        warn(unimplemented::cause::INDEXES);
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            return cf.occupancy().total_space();
        }, std::plus<int64_t>());
    });
@@ -410,7 +410,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::get_cf_all_memtables_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        warn(unimplemented::cause::INDEXES);
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            return cf.occupancy().used_space();
        }, std::plus<int64_t>());
    });
@@ -425,7 +425,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(ctx,req->get_path_param("name") ,&replica::column_family_stats::memtable_switch_count);
+        return get_cf_stats(ctx,req->param["name"] ,&replica::column_family_stats::memtable_switch_count);
    });

    cf::get_all_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -434,7 +434,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    // FIXME: this refers to partitions, not rows.
    cf::get_estimated_row_size_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
            utils::estimated_histogram res(0);
            for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
                res.merge(i->get_stats_metadata().estimated_partition_size);
@@ -446,7 +446,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    // FIXME: this refers to partitions, not rows.
    cf::get_estimated_row_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            uint64_t res = 0;
            for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
                res += i->get_stats_metadata().estimated_partition_size.count();
@@ -457,7 +457,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_estimated_column_count_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
            utils::estimated_histogram res(0);
            for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
                res.merge(i->get_stats_metadata().estimated_cells_count);
@@ -474,7 +474,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_pending_flushes.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(ctx,req->get_path_param("name") ,&replica::column_family_stats::pending_flushes);
+        return get_cf_stats(ctx,req->param["name"] ,&replica::column_family_stats::pending_flushes);
    });

    cf::get_all_pending_flushes.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -482,7 +482,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_read.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats_count(ctx,req->get_path_param("name") ,&replica::column_family_stats::reads);
+        return get_cf_stats_count(ctx,req->param["name"] ,&replica::column_family_stats::reads);
    });

    cf::get_all_read.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -490,7 +490,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_write.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats_count(ctx, req->get_path_param("name") ,&replica::column_family_stats::writes);
+        return get_cf_stats_count(ctx, req->param["name"] ,&replica::column_family_stats::writes);
    });

    cf::get_all_write.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -498,19 +498,19 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::reads);
+        return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::reads);
    });

    cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_rate_and_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::reads);
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &replica::column_family_stats::reads);
    });

    cf::get_read_latency.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats_sum(ctx,req->get_path_param("name") ,&replica::column_family_stats::reads);
+        return get_cf_stats_sum(ctx,req->param["name"] ,&replica::column_family_stats::reads);
    });

    cf::get_write_latency.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats_sum(ctx, req->get_path_param("name") ,&replica::column_family_stats::writes);
+        return get_cf_stats_sum(ctx, req->param["name"] ,&replica::column_family_stats::writes);
    });

    cf::get_all_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -522,11 +522,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::writes);
+        return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::writes);
    });

    cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_rate_and_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::writes);
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &replica::column_family_stats::writes);
    });

    cf::get_all_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -538,7 +538,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
            return cf.estimate_pending_compactions();
        }, std::plus<int64_t>());
    });
@@ -550,7 +550,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(ctx, req->get_path_param("name"), &replica::column_family_stats::live_sstable_count);
+        return get_cf_stats(ctx, req->param["name"], &replica::column_family_stats::live_sstable_count);
    });

    cf::get_all_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -558,11 +558,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_unleveled_sstables.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_unleveled_sstables(ctx, req->get_path_param("name"));
+        return get_cf_unleveled_sstables(ctx, req->param["name"]);
    });

    cf::get_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return sum_sstable(ctx, req->get_path_param("name"), false);
+        return sum_sstable(ctx, req->param["name"], false);
    });

    cf::get_all_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -570,7 +570,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return sum_sstable(ctx, req->get_path_param("name"), true);
+        return sum_sstable(ctx, req->param["name"], true);
    });

    cf::get_all_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -579,7 +579,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    // FIXME: this refers to partitions, not rows.
    cf::get_min_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), INT64_MAX, min_partition_size, min_int64);
+        return map_reduce_cf(ctx, req->param["name"], INT64_MAX, min_partition_size, min_int64);
    });

    // FIXME: this refers to partitions, not rows.
@@ -589,7 +589,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    // FIXME: this refers to partitions, not rows.
    cf::get_max_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), max_partition_size, max_int64);
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), max_partition_size, max_int64);
    });

    // FIXME: this refers to partitions, not rows.
@@ -600,7 +600,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    // FIXME: this refers to partitions, not rows.
    cf::get_mean_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        // Cassandra 3.x mean values are truncated as integrals.
-        return map_reduce_cf(ctx, req->get_path_param("name"), integral_ratio_holder(), mean_partition_size, std::plus<integral_ratio_holder>());
+        return map_reduce_cf(ctx, req->param["name"], integral_ratio_holder(), mean_partition_size, std::plus<integral_ratio_holder>());
    });

    // FIXME: this refers to partitions, not rows.
@@ -610,7 +610,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
            auto sstables = cf.get_sstables();
            return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
                return s + sst->filter_get_false_positive();
@@ -628,7 +628,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_recent_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
            auto sstables = cf.get_sstables();
            return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
                return s + sst->filter_get_recent_false_positive();
@@ -646,7 +646,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), ratio_holder(), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (replica::column_family& cf) {
            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
        }, std::plus<>());
    });
@@ -658,7 +658,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), ratio_holder(), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (replica::column_family& cf) {
            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
        }, std::plus<>());
    });
@@ -670,7 +670,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
            auto sstables = cf.get_sstables();
            return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
                return s + sst->filter_size();
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
            auto sstables = cf.get_sstables();
            return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
                return s + sst->filter_memory_size();
@@ -706,7 +706,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
            auto sstables = cf.get_sstables();
            return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
                return s + sst->get_summary().memory_footprint();
@@ -729,7 +729,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        // We are missing the off heap memory calculation
        // Return 0 is the wrong value. It's a work around
        // until the memory calculation will be available
-        //auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
+        //auto id = get_uuid(req->param["name"], ctx.db.local());
        return make_ready_future<json::json_return_type>(0);
    });

@@ -742,7 +742,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    cf::get_speculative_retries.set(r, [] (std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        //auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
+        //auto id = get_uuid(req->param["name"], ctx.db.local());
        return make_ready_future<json::json_return_type>(0);
    });

@@ -755,7 +755,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    cf::get_key_cache_hit_rate.set(r, [] (std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        //auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
+        //auto id = get_uuid(req->param["name"], ctx.db.local());
        return make_ready_future<json::json_return_type>(0);
    });

@@ -780,7 +780,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    cf::get_row_cache_hit_out_of_range.set(r, [] (std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        //auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
+        //auto id = get_uuid(req->param["name"], ctx.db.local());
        return make_ready_future<json::json_return_type>(0);
    });

@@ -791,7 +791,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_row_cache_hit.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf_raw(ctx, req->get_path_param("name"), utils::rate_moving_average(), [](const replica::column_family& cf) {
+        return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const replica::column_family& cf) {
            return cf.get_row_cache().stats().hits.rate();
        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
            return make_ready_future<json::json_return_type>(meter_to_json(m));
@@ -807,7 +807,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_row_cache_miss.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf_raw(ctx, req->get_path_param("name"), utils::rate_moving_average(), [](const replica::column_family& cf) {
+        return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const replica::column_family& cf) {
            return cf.get_row_cache().stats().misses.rate();
        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
            return make_ready_future<json::json_return_type>(meter_to_json(m));
@@ -824,57 +824,57 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_cas_prepare.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
            return cf.get_stats().cas_prepare.histogram();
        });
    });

    cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
            return cf.get_stats().cas_accept.histogram();
        });
    });

    cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
            return cf.get_stats().cas_learn.histogram();
        });
    });

    cf::get_sstables_per_read_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
            return cf.get_stats().estimated_sstable_per_read;
        },
        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });

    cf::get_tombstone_scanned_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::tombstone_scanned);
+        return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::tombstone_scanned);
    });

    cf::get_live_scanned_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::live_scanned);
+        return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::live_scanned);
    });

    cf::get_col_update_time_delta_histogram.set(r, [] (std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        //auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
+        //auto id = get_uuid(req->param["name"], ctx.db.local());
        std::vector<double> res;
        return make_ready_future<json::json_return_type>(res);
    });

    cf::get_auto_compaction.set(r, [&ctx] (const_req req) {
-        auto uuid = get_uuid(req.get_path_param("name"), ctx.db.local());
+        auto uuid = get_uuid(req.param["name"], ctx.db.local());
        replica::column_family& cf = ctx.db.local().find_column_family(uuid);
        return !cf.is_auto_compaction_disabled_by_user();
    });

    cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        apilog.info("column_family/enable_auto_compaction: name={}", req->get_path_param("name"));
+        apilog.info("column_family/enable_auto_compaction: name={}", req->param["name"]);
        return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
            auto g = replica::database::autocompaction_toggle_guard(db);
-            return foreach_column_family(ctx, req->get_path_param("name"), [](replica::column_family &cf) {
+            return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
                cf.enable_auto_compaction();
            }).then([g = std::move(g)] {
                return make_ready_future<json::json_return_type>(json_void());
@@ -883,10 +883,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        apilog.info("column_family/disable_auto_compaction: name={}", req->get_path_param("name"));
+        apilog.info("column_family/disable_auto_compaction: name={}", req->param["name"]);
        return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
            auto g = replica::database::autocompaction_toggle_guard(db);
-            return foreach_column_family(ctx, req->get_path_param("name"), [](replica::column_family &cf) {
+            return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
                return cf.disable_auto_compaction();
            }).then([g = std::move(g)] {
                return make_ready_future<json::json_return_type>(json_void());
@@ -895,14 +895,14 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_tombstone_gc.set(r, [&ctx] (const_req req) {
-        auto uuid = get_uuid(req.get_path_param("name"), ctx.db.local());
+        auto uuid = get_uuid(req.param["name"], ctx.db.local());
        replica::table& t = ctx.db.local().find_column_family(uuid);
        return t.tombstone_gc_enabled();
    });

    cf::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        apilog.info("column_family/enable_tombstone_gc: name={}", req->get_path_param("name"));
-        return foreach_column_family(ctx, req->get_path_param("name"), [](replica::table& t) {
+        apilog.info("column_family/enable_tombstone_gc: name={}", req->param["name"]);
+        return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
            t.set_tombstone_gc_enabled(true);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -910,8 +910,8 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        apilog.info("column_family/disable_tombstone_gc: name={}", req->get_path_param("name"));
-        return foreach_column_family(ctx, req->get_path_param("name"), [](replica::table& t) {
+        apilog.info("column_family/disable_tombstone_gc: name={}", req->param["name"]);
+        return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
            t.set_tombstone_gc_enabled(false);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -919,7 +919,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_built_indexes.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
-        auto ks_cf = parse_fully_qualified_cf_name(req->get_path_param("name"));
+        auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
        auto&& ks = std::get<0>(ks_cf);
        auto&& cf_name = std::get<1>(ks_cf);
        return sys_ks.local().load_view_build_progress().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace_view_build_progress>& vb) mutable {
@@ -957,7 +957,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_compression_ratio.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        auto uuid = get_uuid(req->get_path_param("name"), ctx.db.local());
+        auto uuid = get_uuid(req->param["name"], ctx.db.local());

        return ctx.db.map_reduce(sum_ratio<double>(), [uuid](replica::database& db) {
            replica::column_family& cf = db.find_column_family(uuid);
@@ -968,21 +968,21 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_read_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
            return cf.get_stats().reads.histogram();
        });
    });

    cf::get_write_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
            return cf.get_stats().writes.histogram();
        });
    });

    cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<http::request> req) {
        sstring strategy = req->get_query_param("class_name");
-        apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->get_path_param("name"), strategy);
-        return foreach_column_family(ctx, req->get_path_param("name"), [strategy](replica::column_family& cf) {
+        apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->param["name"], strategy);
+        return foreach_column_family(ctx, req->param["name"], [strategy](replica::column_family& cf) {
            cf.set_compaction_strategy(sstables::compaction_strategy::type(strategy));
        }).then([] {
                return make_ready_future<json::json_return_type>(json_void());
@@ -990,7 +990,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_compaction_strategy_class.set(r, [&ctx](const_req req) {
-        return ctx.db.local().find_column_family(get_uuid(req.get_path_param("name"), ctx.db.local())).get_compaction_strategy().name();
+        return ctx.db.local().find_column_family(get_uuid(req.param["name"], ctx.db.local())).get_compaction_strategy().name();
    });

    cf::set_compression_parameters.set(r, [](std::unique_ptr<http::request> req) {
@@ -1006,7 +1006,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::get_sstable_count_per_level.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        return map_reduce_cf_raw(ctx, req->get_path_param("name"), std::vector<uint64_t>(), [](const replica::column_family& cf) {
+        return map_reduce_cf_raw(ctx, req->param["name"], std::vector<uint64_t>(), [](const replica::column_family& cf) {
            return cf.sstable_count_per_level();
        }, concat_sstable_count_per_level).then([](const std::vector<uint64_t>& res) {
            return make_ready_future<json::json_return_type>(res);
@@ -1015,14 +1015,13 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::get_sstables_for_key.set(r, [&ctx](std::unique_ptr<http::request> req) {
        auto key = req->get_query_param("key");
-        auto uuid = get_uuid(req->get_path_param("name"), ctx.db.local());
+        auto uuid = get_uuid(req->param["name"], ctx.db.local());

-        return ctx.db.map_reduce0([key, uuid] (replica::database& db) -> future<std::unordered_set<sstring>> {
-            auto sstables = co_await db.find_column_family(uuid).get_sstables_by_partition_key(key);
-            co_return boost::copy_range<std::unordered_set<sstring>>(sstables | boost::adaptors::transformed([] (auto s) { return s->get_filename(); }));
+        return ctx.db.map_reduce0([key, uuid] (replica::database& db) {
+            return db.find_column_family(uuid).get_sstables_by_partition_key(key);
        }, std::unordered_set<sstring>(),
-        [](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
-            a.merge(b);
+            [](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
+            a.insert(b.begin(),b.end());
            return a;
        }).then([](const std::unordered_set<sstring>& res) {
            return make_ready_future<json::json_return_type>(container_to_vec(res));
@@ -1031,7 +1030,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace


    cf::toppartitions.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        auto name = req->get_path_param("name");
+        auto name = req->param["name"];
        auto [ks, cf] = parse_fully_qualified_cf_name(name);

        api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
@@ -1047,31 +1046,17 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto params = req_params({
-            std::pair("name", mandatory::yes),
-            std::pair("flush_memtables", mandatory::no),
-            std::pair("split_output", mandatory::no),
-        });
-        params.process(*req);
-        if (params.get("split_output")) {
+        if (req->get_query_param("split_output") != "") {
            fail(unimplemented::cause::API);
        }
-        auto [ks, cf] = parse_fully_qualified_cf_name(*params.get("name"));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.info("column_family/force_major_compaction: name={} flush={}", req->get_path_param("name"), flush);

+        apilog.info("column_family/force_major_compaction: name={}", req->param["name"]);
+        auto [ks, cf] = parse_fully_qualified_cf_name(req->param["name"]);
        auto keyspace = validate_keyspace(ctx, ks);
-        std::vector<table_info> table_infos = {table_info{
-            .name = cf,
-            .id = ctx.db.local().find_uuid(ks, cf)
-        }};
+        std::vector<table_id> table_infos = {ctx.db.local().find_uuid(ks, cf)};

        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<major_compaction_task_impl::flush_mode> fmopt;
-        if (!flush) {
-            fmopt = major_compaction_task_impl::flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), ctx.db, std::move(table_infos), fmopt);
+        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, std::move(table_infos));
        co_await task->done();
        co_return json_void();
    });
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -23,7 +23,7 @@ namespace api {
 void set_column_family(http_context& ctx, httpd::routes& r, sharded<db::system_keyspace>& sys_ks);
 void unset_column_family(http_context& ctx, httpd::routes& r);

-table_id get_uuid(const sstring& name, const replica::database& db);
+const table_id& get_uuid(const sstring& name, const replica::database& db);
 future<> foreach_column_family(http_context& ctx, const sstring& name, std::function<void(replica::column_family&)> f);


@@ -68,10 +68,9 @@ struct map_reduce_column_families_locally {
    std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
    future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
        auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
-        return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
-            *res = reducer(std::move(*res), mapper(*table.get()));
-            return make_ready_future();
-        }).then([res] () {
+        return do_for_each(db.get_column_families(), [res, this](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
+            *res = reducer(std::move(*res), mapper(*i.second.get()));
+        }).then([res] {
            return std::move(*res);
        });
    }
--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -7,7 +7,6 @@
 */

 #include <seastar/core/coroutine.hh>
-#include <seastar/coroutine/exception.hh>

 #include "compaction_manager.hh"
 #include "compaction/compaction_manager.hh"
@@ -69,8 +68,8 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return ctx.db.map_reduce0([](replica::database& db) {
            return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
-                return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
-                    replica::table& cf = *table.get();
+                return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
+                    replica::table& cf = *i.second.get();
                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
                    return make_ready_future<>();
                }).then([&tasks] {
@@ -110,7 +109,7 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    });

    cm::stop_keyspace_compaction.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto ks_name = validate_keyspace(ctx, req);
+        auto ks_name = validate_keyspace(ctx, req->param);
        auto table_names = parse_tables(ks_name, ctx, req->query_parameters, "tables");
        if (table_names.empty()) {
            table_names = map_keys(ctx.db.local().find_keyspace(ks_name).metadata().get()->cf_meta_data());
@@ -153,13 +152,10 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    });

    cm::get_compaction_history.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        std::function<future<>(output_stream<char>&&)> f = [&ctx] (output_stream<char>&& out) -> future<> {
-            auto s = std::move(out);
-            bool first = true;
-            std::exception_ptr ex;
-            try {
-                co_await s.write("[");
-                co_await ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable -> future<> {
+        std::function<future<>(output_stream<char>&&)> f = [&ctx](output_stream<char>&& s) {
+            return do_with(output_stream<char>(std::move(s)), true, [&ctx] (output_stream<char>& s, bool& first){
+                return s.write("[").then([&ctx, &s, &first] {
+                    return ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable {
                        cm::history h;
                        h.id = entry.id.to_sstring();
                        h.ks = std::move(entry.ks);
@@ -173,21 +169,18 @@ void set_compaction_manager(http_context& ctx, routes& r) {
                            e.value = it.second;
                            h.rows_merged.push(std::move(e));
                        }
-                        if (!first) {
-                            co_await s.write(", ");
-                        }
+                        auto fut = first ? make_ready_future<>() : s.write(", ");
                        first = false;
-                        co_await formatter::write(s, h);
+                        return fut.then([&s, h = std::move(h)] {
+                            return formatter::write(s, h);
+                        });
+                    }).then([&s] {
+                        return s.write("]").then([&s] {
+                            return s.close();
+                        });
                    });
-                co_await s.write("]");
-                co_await s.flush();
-            } catch (...) {
-                ex = std::current_exception();
-            }
-            co_await s.close();
-            if (ex) {
-                co_await coroutine::return_exception_ptr(std::move(ex));
-            }
+                });
+            });
        };
        return make_ready_future<json::json_return_type>(std::move(f));
    });
--- a/api/config.cc
+++ b/api/config.cc
@@ -45,7 +45,7 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc
    } else {
        ss <<',';
    };
-    ss << "\"/v2/config/" << name <<"\": {"
+    ss << "\"/config/" << name <<"\": {"
      "\"get\": {"
        "\"description\": \"" << boost::replace_all_copy(boost::replace_all_copy(boost::replace_all_copy(description,"\n","\\n"),"\"", "''"), "\t", " ") <<"\","
        "\"operationId\": \"find_config_"<< name <<"\","
@@ -76,9 +76,9 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc

 namespace cs = httpd::config_json;

-void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg, bool first) {
-    rb->register_function(r, [&cfg, first] (output_stream<char>& os) {
-        return do_with(first, [&os, &cfg] (bool& first) {
+void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg) {
+    rb->register_function(r, [&cfg] (output_stream<char>& os) {
+        return do_with(true, [&os, &cfg] (bool& first) {
            auto f = make_ready_future();
            for (auto&& cfg_ref : cfg.values()) {
                auto&& cfg = cfg_ref.get();
@@ -91,7 +91,7 @@ void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx
    });

    cs::find_config_id.set(r, [&cfg] (const_req r) {
-        auto id = r.get_path_param("id");
+        auto id = r.param["id"];
        for (auto&& cfg_ref : cfg.values()) {
            auto&& cfg = cfg_ref.get();
            if (id == cfg.name()) {
--- a/api/config.hh
+++ b/api/config.hh
@@ -13,5 +13,5 @@

 namespace api {

-void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg, bool first = false);
+void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg);
 }
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -12,9 +12,7 @@
 #include <seastar/http/exception.hh>
 #include "log.hh"
 #include "utils/error_injection.hh"
-#include "utils/rjson.hh"
 #include <seastar/core/future-util.hh>
-#include <seastar/util/short_streams.hh>

 namespace api {
 using namespace seastar::httpd;
@@ -24,29 +22,12 @@ namespace hf = httpd::error_injection_json;
 void set_error_injection(http_context& ctx, routes& r) {

    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->get_path_param("injection");
+        sstring injection = req->param["injection"];
        bool one_shot = req->get_query_param("one_shot") == "True";
-        auto params = req->content;
-
-        const size_t max_params_size = 1024 * 1024;
-        if (params.size() > max_params_size) {
-            // This is a hard limit, because we don't want to allocate
-            // too much memory or block the thread for too long.
-            throw httpd::bad_param_exception(format("Injection parameters are too long, max length is {}", max_params_size));
-        }
-
-        try {
-            auto parameters = params.empty()
-                ? utils::error_injection_parameters{}
-                : rjson::parse_to_map<utils::error_injection_parameters>(params);
-
-            auto& errinj = utils::get_local_injector();
-            return errinj.enable_on_all(injection, one_shot, std::move(parameters)).then([] {
-                return make_ready_future<json::json_return_type>(json::json_void());
-            });
-        } catch (const rjson::error& e) {
-            throw httpd::bad_param_exception(format("Failed to parse injections parameters: {}", e.what()));
-        }
+        auto& errinj = utils::get_local_injector();
+        return errinj.enable_on_all(injection, one_shot).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
    });

    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
@@ -56,7 +37,7 @@ void set_error_injection(http_context& ctx, routes& r) {
    });

    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->get_path_param("injection");
+        sstring injection = req->param["injection"];

        auto& errinj = utils::get_local_injector();
        return errinj.disable_on_all(injection).then([] {
@@ -71,13 +52,6 @@ void set_error_injection(http_context& ctx, routes& r) {
        });
    });

-    hf::message_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->get_path_param("injection");
-        auto& errinj = utils::get_local_injector();
-        return errinj.receive_message_on_all(injection).then([] {
-            return make_ready_future<json::json_return_type>(json::json_void());
-        });
-    });
 }

 } // namespace api
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -18,43 +18,36 @@ namespace fd = httpd::failure_detector_json;

 void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
-        return g.container().invoke_on(0, [] (gms::gossiper& g) {
-            std::vector<fd::endpoint_state> res;
-            res.reserve(g.num_endpoints());
-            g.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& eps) {
-                fd::endpoint_state val;
-                val.addrs = fmt::to_string(addr);
-                val.is_alive = g.is_alive(addr);
-                val.generation = eps.get_heart_beat_state().get_generation().value();
-                val.version = eps.get_heart_beat_state().get_heart_beat_version().value();
-                val.update_time = eps.get_update_timestamp().time_since_epoch().count();
-                for (const auto& [as_type, app_state] : eps.get_application_state_map()) {
-                    fd::version_value version_val;
-                    // We return the enum index and not it's name to stay compatible to origin
-                    // method that the state index are static but the name can be changed.
-                    version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(as_type);
-                    version_val.value = app_state.value();
-                    version_val.version = app_state.version().value();
-                    val.application_state.push(version_val);
-                }
-                res.emplace_back(std::move(val));
-            });
-            return make_ready_future<json::json_return_type>(res);
-        });
+        std::vector<fd::endpoint_state> res;
+        for (auto i : g.get_endpoint_states()) {
+            fd::endpoint_state val;
+            val.addrs = fmt::to_string(i.first);
+            val.is_alive = i.second.is_alive();
+            val.generation = i.second.get_heart_beat_state().get_generation().value();
+            val.version = i.second.get_heart_beat_state().get_heart_beat_version().value();
+            val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
+            for (auto a : i.second.get_application_state_map()) {
+                fd::version_value version_val;
+                // We return the enum index and not it's name to stay compatible to origin
+                // method that the state index are static but the name can be changed.
+                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
+                version_val.value = a.second.value();
+                version_val.version = a.second.version().value();
+                val.application_state.push(version_val);
+            }
+            res.push_back(val);
+        }
+        return make_ready_future<json::json_return_type>(res);
    });

    fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        return g.container().invoke_on(0, [] (gms::gossiper& g) {
-            int res = g.get_up_endpoint_count();
-            return make_ready_future<json::json_return_type>(res);
-        });
+        int res = g.get_up_endpoint_count();
+        return make_ready_future<json::json_return_type>(res);
    });

    fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        return g.container().invoke_on(0, [] (gms::gossiper& g) {
-            int res = g.get_down_endpoint_count();
-            return make_ready_future<json::json_return_type>(res);
-        });
+        int res = g.get_down_endpoint_count();
+        return make_ready_future<json::json_return_type>(res);
    });

    fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -62,13 +55,11 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
-        return g.container().invoke_on(0, [] (gms::gossiper& g) {
-            std::map<sstring, sstring> nodes_status;
-            g.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state&) {
-                nodes_status.emplace(node.to_sstring(), g.is_alive(node) ? "UP" : "DOWN");
-            });
-            return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
-        });
+        std::map<sstring, sstring> nodes_status;
+        for (auto& entry : g.get_endpoint_states()) {
+            nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
+        }
+        return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
    });

    fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -79,15 +70,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
-        return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
-            auto state = g.get_endpoint_state_ptr(gms::inet_address(req->get_path_param("addr")));
-            if (!state) {
-                return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->get_path_param("addr")));
-            }
-            std::stringstream ss;
-            g.append_endpoint_state(ss, *state);
-            return make_ready_future<json::json_return_type>(sstring(ss.str()));
-        });
+        auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
+        if (!state) {
+            return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
+        }
+        std::stringstream ss;
+        g.append_endpoint_state(ss, *state);
+        return make_ready_future<json::json_return_type>(sstring(ss.str()));
    });

    fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -6,11 +6,8 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

-#include <seastar/core/coroutine.hh>
-
 #include "gossiper.hh"
 #include "api/api-doc/gossiper.json.hh"
-#include "gms/endpoint_state.hh"
 #include "gms/gossiper.hh"

 namespace api {
@@ -18,9 +15,9 @@ using namespace seastar::httpd;
 using namespace json;

 void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
-    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
-        auto res = co_await g.get_unreachable_members_synchronized();
-        co_return json::json_return_type(container_to_vec(res));
+    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
+        auto res = g.get_unreachable_members();
+        return container_to_vec(res);
    });


@@ -30,22 +27,20 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
        });
    });

-    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
-        gms::inet_address ep(req->get_path_param("addr"));
-        // synchronize unreachable_members on all shards
-        co_await g.get_unreachable_members_synchronized();
-        co_return g.get_endpoint_downtime(ep);
+    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
+        gms::inet_address ep(req.param["addr"]);
+        return g.get_endpoint_downtime(ep);
    });

    httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<http::request> req) {
-        gms::inet_address ep(req->get_path_param("addr"));
+        gms::inet_address ep(req->param["addr"]);
        return g.get_current_generation_number(ep).then([] (gms::generation_type res) {
            return make_ready_future<json::json_return_type>(res.value());
        });
    });

    httpd::gossiper_json::get_current_heart_beat_version.set(r, [&g] (std::unique_ptr<http::request> req) {
-        gms::inet_address ep(req->get_path_param("addr"));
+        gms::inet_address ep(req->param["addr"]);
        return g.get_current_heart_beat_version(ep).then([] (gms::version_type res) {
            return make_ready_future<json::json_return_type>(res.value());
        });
@@ -53,18 +48,18 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {

    httpd::gossiper_json::assassinate_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
        if (req->get_query_param("unsafe") != "True") {
-            return g.assassinate_endpoint(req->get_path_param("addr")).then([] {
+            return g.assassinate_endpoint(req->param["addr"]).then([] {
                return make_ready_future<json::json_return_type>(json_void());
            });
        }
-        return g.unsafe_assassinate_endpoint(req->get_path_param("addr")).then([] {
+        return g.unsafe_assassinate_endpoint(req->param["addr"]).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

    httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
-        gms::inet_address ep(req->get_path_param("addr"));
-        return g.force_remove_endpoint(ep, gms::null_permit_id).then([] {
+        gms::inet_address ep(req->param["addr"]);
+        return g.force_remove_endpoint(ep).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
--- a/api/hinted_handoff.cc
+++ b/api/hinted_handoff.cc
@@ -13,6 +13,7 @@
 #include "api/api-doc/hinted_handoff.json.hh"

 #include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
 #include "service/storage_proxy.hh"

 namespace api {
@@ -21,33 +22,38 @@ using namespace json;
 using namespace seastar::httpd;
 namespace hh = httpd::hinted_handoff_json;

-void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
-    hh::create_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto parse_hosts_list = [] (sstring arg) {
+void set_hinted_handoff(http_context& ctx, routes& r, gms::gossiper& g) {
+    hh::create_hints_sync_point.set(r, [&ctx, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto parse_hosts_list = [&g] (sstring arg) {
            std::vector<sstring> hosts_str = split(arg, ",");
            std::vector<gms::inet_address> hosts;
            hosts.reserve(hosts_str.size());

-            for (const auto& host_str : hosts_str) {
-                try {
-                    gms::inet_address host;
-                    host = gms::inet_address(host_str);
-                    hosts.push_back(host);
-                } catch (std::exception& e) {
-                    throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
+            if (hosts_str.empty()) {
+                // No target_hosts specified means that we should wait for hints for all nodes to be sent
+                const auto members_set = g.get_live_members();
+                std::copy(members_set.begin(), members_set.end(), std::back_inserter(hosts));
+            } else {
+                for (const auto& host_str : hosts_str) {
+                    try {
+                        gms::inet_address host;
+                        host = gms::inet_address(host_str);
+                        hosts.push_back(host);
+                    } catch (std::exception& e) {
+                        throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
+                    }
                }
            }
-
            return hosts;
        };

        std::vector<gms::inet_address> target_hosts = parse_hosts_list(req->get_query_param("target_hosts"));
-        return proxy.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
+        return ctx.sp.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
            return json::json_return_type(sync_point.encode());
        });
    });

-    hh::get_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    hh::get_hints_sync_point.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        db::hints::sync_point sync_point;
        const sstring encoded = req->get_query_param("id");
        try {
@@ -81,7 +87,7 @@ void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_p
        using return_type = hh::ns_get_hints_sync_point::get_hints_sync_point_return_type;
        using return_type_wrapper = hh::ns_get_hints_sync_point::return_type_wrapper;

-        return proxy.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
+        return ctx.sp.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
            return json::json_return_type(return_type_wrapper(return_type::DONE));
        }).handle_exception_type([] (const timed_out_error&) {
            return json::json_return_type(return_type_wrapper(return_type::IN_PROGRESS));
--- a/api/hinted_handoff.hh
+++ b/api/hinted_handoff.hh
@@ -8,14 +8,17 @@

 #pragma once

-#include <seastar/core/sharded.hh>
 #include "api.hh"

-namespace service { class storage_proxy; }
+namespace gms {
+
+class gossiper;
+
+}

 namespace api {

-void set_hinted_handoff(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& p);
+void set_hinted_handoff(http_context& ctx, httpd::routes& r, gms::gossiper& g);
 void unset_hinted_handoff(http_context& ctx, httpd::routes& r);

 }
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2024-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#include <seastar/core/coroutine.hh>
-
-#include "api/api.hh"
-#include "api/api-doc/raft.json.hh"
-
-#include "service/raft/raft_group_registry.hh"
-
-using namespace seastar::httpd;
-
-extern logging::logger apilog;
-
-namespace api {
-
-namespace r = httpd::raft_json;
-using namespace json;
-
-void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) {
-    r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
-        raft::group_id gid{utils::UUID{req->get_path_param("group_id")}};
-        auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] {
-            if (timeout_str.empty()) {
-                return std::chrono::seconds{60};
-            }
-            auto dur = std::stoll(timeout_str);
-            if (dur <= 0) {
-                throw std::runtime_error{"Timeout must be a positive number."};
-            }
-            return std::chrono::seconds{dur};
-        });
-
-        std::atomic<bool> found_srv{false};
-        co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& raft_gr) -> future<> {
-            auto* srv = raft_gr.find_server(gid);
-            if (!srv) {
-                co_return;
-            }
-
-            found_srv = true;
-            abort_on_expiry aoe(lowres_clock::now() + timeout_dur);
-            apilog.info("Triggering Raft group {} snapshot", gid);
-            auto result = co_await srv->trigger_snapshot(&aoe.abort_source());
-            if (result) {
-                apilog.info("New snapshot for Raft group {} created", gid);
-            } else {
-                apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
-            }
-        });
-
-        if (!found_srv) {
-            throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)};
-        }
-
-        co_return json_void{};
-    });
-}
-
-void unset_raft(http_context&, httpd::routes& r) {
-    r::trigger_snapshot.unset(r);
-}
-
-}
-
--- a/api/raft.hh
+++ b/api/raft.hh
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2023-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#pragma once
-
-#include "api_init.hh"
-
-namespace api {
-
-void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr);
-void unset_raft(http_context& ctx, httpd::routes& r);
-
-}
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -10,6 +10,7 @@
 #include "service/storage_proxy.hh"
 #include "api/api-doc/storage_proxy.json.hh"
 #include "api/api-doc/utils.json.hh"
+#include "service/storage_service.hh"
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "replica/database.hh"
@@ -115,17 +116,17 @@ utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimat
    return res;
 }

-static future<json::json_return_type>  sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
        return (stats.*f).histogram();
    }, utils::time_estimated_histogram_merge, utils::time_estimated_histogram()).then([](const utils::time_estimated_histogram& val) {
        return make_ready_future<json::json_return_type>(time_to_json_histogram(val));
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {

-    return two_dimensional_map_reduce(proxy, f, utils::estimated_histogram_merge,
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
@@ -133,8 +134,8 @@ static future<json::json_return_type>  sum_estimated_histogram(sharded<service::
    });
 }

-static future<json::json_return_type>  total_latency(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
            return (stats.*f).hist.mean * (stats.*f).hist.count;
        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
@@ -183,43 +184,43 @@ sum_timer_stats_storage_proxy(distributed<proxy>& d,
    });
 }

-void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
+void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_service>& ss) {
    sp::get_total_hints.set(r, [](std::unique_ptr<http::request> req)  {
        //TBD
        unimplemented();
        return make_ready_future<json::json_return_type>(0);
    });

-    sp::get_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        const auto& filter = proxy.local().get_hints_host_filter();
+    sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        const auto& filter = ctx.sp.local().get_hints_host_filter();
        return make_ready_future<json::json_return_type>(!filter.is_disabled_for_all());
    });

-    sp::set_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::set_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req)  {
        auto enable = req->get_query_param("enable");
        auto filter = (enable == "true" || enable == "1")
                ? db::hints::host_filter(db::hints::host_filter::enabled_for_all_tag {})
                : db::hints::host_filter(db::hints::host_filter::disabled_for_all_tag {});
-        return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
+        return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
            return sp.change_hints_host_filter(filter);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    sp::get_hinted_handoff_enabled_by_dc.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::get_hinted_handoff_enabled_by_dc.set(r, [&ctx](std::unique_ptr<http::request> req)  {
        std::vector<sstring> res;
-        const auto& filter = proxy.local().get_hints_host_filter();
+        const auto& filter = ctx.sp.local().get_hints_host_filter();
        const auto& dcs = filter.get_dcs();
        res.reserve(res.size());
        std::copy(dcs.begin(), dcs.end(), std::back_inserter(res));
        return make_ready_future<json::json_return_type>(res);
    });

-    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&ctx](std::unique_ptr<http::request> req)  {
        auto dcs = req->get_query_param("dcs");
        auto filter = db::hints::host_filter::parse_from_dc_list(std::move(dcs));
-        return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
+        return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
            return sp.change_hints_host_filter(filter);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -341,131 +342,144 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
        return make_ready_future<json::json_return_type>(json_void());
    });

-    sp::get_read_repair_attempted.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_attempts);
+    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

-    sp::get_read_repair_repaired_blocking.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
+    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

-    sp::get_read_repair_repaired_background.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_background);
+    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

-    sp::get_cas_read_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_timeouts);
+    sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req)  {
+        return ss.local().describe_schema_versions().then([] (auto result) {
+            std::vector<sp::mapper_list> res;
+            for (auto e : result) {
+                sp::mapper_list entry;
+                entry.key = std::move(e.first);
+                entry.value = std::move(e.second);
+                res.emplace_back(std::move(entry));
+            }
+            return make_ready_future<json::json_return_type>(std::move(res));
+        });
    });

-    sp::get_cas_read_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_unavailables);
+    sp::get_cas_read_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_timeouts);
    });

-    sp::get_cas_write_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_timeouts);
+    sp::get_cas_read_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_unavailables);
    });

-    sp::get_cas_write_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_unavailables);
+    sp::get_cas_write_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_timeouts);
    });

-    sp::get_cas_write_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_write_unfinished_commit);
+    sp::get_cas_write_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_unavailables);
    });

-    sp::get_cas_write_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &proxy::stats::cas_write_contention);
+    sp::get_cas_write_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_unfinished_commit);
    });

-    sp::get_cas_write_metrics_condition_not_met.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_write_condition_not_met);
+    sp::get_cas_write_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_write_contention);
    });

-    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_failed_read_round_optimization);
+    sp::get_cas_write_metrics_condition_not_met.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

-    sp::get_cas_read_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_read_unfinished_commit);
+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
    });

-    sp::get_cas_read_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &proxy::stats::cas_read_contention);
+    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });

-    sp::get_read_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_timeouts);
+    sp::get_cas_read_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_read_contention);
    });

-    sp::get_read_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_unavailables);
+    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

-    sp::get_range_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
+    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

-    sp::get_range_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
+    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

-    sp::get_write_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_timeouts);
+    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

-    sp::get_write_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_unavailables);
+    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

-    sp::get_read_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_timeouts);
+    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

-    sp::get_read_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_unavailables);
+    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

-    sp::get_range_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
+    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

-    sp::get_range_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
+    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

-    sp::get_write_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_timeouts);
+    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

-    sp::get_write_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_unavailables);
+    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

-    sp::get_range_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

-    sp::get_write_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_read_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_range_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
-    });
-    sp::get_cas_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats(proxy, &proxy::stats::cas_write);
+    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_cas_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats(proxy, &proxy::stats::cas_read);
+    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
+    });
+    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
+    });
+
+    sp::get_cas_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_read);
    });

    sp::get_view_write_metrics_latency_histogram.set(r, [](std::unique_ptr<http::request> req) {
@@ -476,31 +490,31 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
        return make_ready_future<json::json_return_type>(get_empty_moving_average());
    });

-    sp::get_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_read_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_read_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
-    sp::get_write_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_write_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_range_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_range_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

@@ -533,6 +547,7 @@ void unset_storage_proxy(http_context& ctx, routes& r) {
    sp::get_read_repair_attempted.unset(r);
    sp::get_read_repair_repaired_blocking.unset(r);
    sp::get_read_repair_repaired_background.unset(r);
+    sp::get_schema_versions.unset(r);
    sp::get_cas_read_timeouts.unset(r);
    sp::get_cas_read_unavailables.unset(r);
    sp::get_cas_write_timeouts.unset(r);
--- a/api/storage_proxy.hh
+++ b/api/storage_proxy.hh
@@ -11,11 +11,11 @@
 #include <seastar/core/sharded.hh>
 #include "api.hh"

-namespace service { class storage_proxy; }
+namespace service { class storage_service; }

 namespace api {

-void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& proxy);
+void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss);
 void unset_storage_proxy(http_context& ctx, httpd::routes& r);

 }
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -8,7 +8,6 @@

 #include "storage_service.hh"
 #include "api/api-doc/storage_service.json.hh"
-#include "api/api-doc/storage_proxy.json.hh"
 #include "db/config.hh"
 #include "db/schema_tables.hh"
 #include "utils/hash.hh"
@@ -43,6 +42,7 @@
 #include "thrift/controller.hh"
 #include "locator/token_metadata.hh"
 #include "cdc/generation_service.hh"
+#include "service/storage_proxy.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "sstables_loader.hh"
 #include "db/view/view_builder.hh"
@@ -52,25 +52,33 @@ using namespace std::chrono_literals;

 extern logging::logger apilog;

+namespace std {
+// Streams an api::table_info to os as "table{name=<name>, id=<id>}" and returns os.
+std::ostream& operator<<(std::ostream& os, const api::table_info& ti) {
+    fmt::print(os, "table{{name={}, id={}}}", ti.name, ti.id);
+    return os;
+}
+// NOTE(review): this overload lives in namespace std, not namespace api; the standard restricts adding declarations to std -- confirm the placement is required by how callers look it up.
+} // namespace std
+
 namespace api {

+const locator::token_metadata& http_context::get_token_metadata() {
+        return *shared_token_metadata.local().get(); // NOTE(review): dereferences the result of get() immediately; if get() returns an owning pointer by value, the reference is not kept alive by it -- confirm how long callers may hold it
+}
+
 namespace ss = httpd::storage_service_json;
-namespace sp = httpd::storage_proxy_json;
 using namespace json;

-sstring validate_keyspace(const http_context& ctx, sstring ks_name) {
+sstring validate_keyspace(http_context& ctx, sstring ks_name) {
    if (ctx.db.local().has_keyspace(ks_name)) {
        return ks_name;
    }
    throw bad_param_exception(replica::no_such_keyspace(ks_name).what());
 }

-sstring validate_keyspace(const http_context& ctx, const std::unique_ptr<http::request>& req) {
-    return validate_keyspace(ctx, req->get_path_param("keyspace"));
-}
-
-sstring validate_keyspace(const http_context& ctx, const http::request& req) {
-    return validate_keyspace(ctx, req.get_path_param("keyspace"));
+sstring validate_keyspace(http_context& ctx, const parameters& param) {
+    return validate_keyspace(ctx, param["keyspace"]);
 }

 locator::host_id validate_host_id(const sstring& param) {
@@ -175,7 +183,7 @@ using ks_cf_func = std::function<future<json::json_return_type>(http_context&, s

 static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
    return [&ctx, f = std::move(f)](std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
        return f(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
    };
@@ -254,21 +262,17 @@ future<json::json_return_type> set_tables_tombstone_gc(http_context& ctx, const
 }

 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
-    ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
+    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<http::request> req) {
        return smp::submit_to(0, [&] {
-            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
-                return ctl.start_server();
-            });
+            return ctl.start_server();
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
+    ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<http::request> req) {
        return smp::submit_to(0, [&] {
-            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
-                return ctl.request_stop_server();
-            });
+            return ctl.request_stop_server();
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -290,21 +294,17 @@ void unset_transport_controller(http_context& ctx, routes& r) {
 }

 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
-    ss::stop_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
+    ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<http::request> req) {
        return smp::submit_to(0, [&] {
-            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
-                return ctl.request_stop_server();
-            });
+            return ctl.request_stop_server();
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
+    ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<http::request> req) {
        return smp::submit_to(0, [&] {
-            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
-                return ctl.start_server();
-            });
+            return ctl.start_server();
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -329,7 +329,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair) {
    ss::repair_async.set(r, [&ctx, &repair](std::unique_ptr<http::request> req) {
        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "ignore_nodes", "trace",
-                "startToken", "endToken", "ranges_parallelism"};
+                "startToken", "endToken" };
        std::unordered_map<sstring, sstring> options_map;
        for (auto o : options) {
            auto s = req->get_query_param(o);
@@ -342,7 +342,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair) {
        // returns immediately, not waiting for the repair to finish. The user
        // then has other mechanisms to track the ongoing repair's progress,
        // or stop it.
-        return repair_start(repair, validate_keyspace(ctx, req),
+        return repair_start(repair, validate_keyspace(ctx, req->param),
                options_map).then([] (int i) {
                    return make_ready_future<json::json_return_type>(i);
                });
@@ -425,7 +425,7 @@ void unset_repair(http_context& ctx, routes& r) {

 void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>& sst_loader) {
    ss::load_new_ss_tables.set(r, [&ctx, &sst_loader](std::unique_ptr<http::request> req) {
-        auto ks = validate_keyspace(ctx, req);
+        auto ks = validate_keyspace(ctx, req->param);
        auto cf = req->get_query_param("cf");
        auto stream = req->get_query_param("load_and_stream");
        auto primary_replica = req->get_query_param("primary_replica_only");
@@ -456,8 +456,8 @@ void unset_sstables_loader(http_context& ctx, routes& r) {

 void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb) {
    ss::view_build_statuses.set(r, [&ctx, &vb] (std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
-        auto view = req->get_path_param("view");
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto view = req->param["view"];
        return vb.local().view_build_statuses(std::move(keyspace), std::move(view)).then([] (std::unordered_map<sstring, sstring> status) {
            std::vector<storage_service_json::mapper> res;
            return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
@@ -474,21 +474,29 @@ static future<json::json_return_type> describe_ring_as_json(sharded<service::sto
    co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring(keyspace), token_range_endpoints_to_json));
 }

-void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    ss::local_hostid.set(r, [&ss](std::unique_ptr<http::request> req) {
-        auto id = ss.local().get_token_metadata().get_my_id();
+// Returns the id of every entry in table_infos, in input order.
+static std::vector<table_id> get_table_ids(const std::vector<table_info>& table_infos) {
+    std::vector<table_id> table_ids;
+    table_ids.reserve(table_infos.size()); // explicit reserve: a braced `{n}` could bind to the initializer_list ctor
+    for (const auto& ti : table_infos) { table_ids.push_back(ti.id); }
+    return table_ids;
+}
+
+void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
+    ss::local_hostid.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        auto id = ctx.db.local().get_config().host_id;
        return make_ready_future<json::json_return_type>(id.to_sstring());
    });

-    ss::get_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().sorted_tokens(), [](const dht::token& i) {
           return fmt::to_string(i);
        }));
    });

-    ss::get_node_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().get_tokens(addr), [](const dht::token& i) {
           return fmt::to_string(i);
       }));
    });
@@ -556,8 +564,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    });

-    ss::get_leaving_nodes.set(r, [&ss](const_req req) {
-        return container_to_vec(ss.local().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.get_token_metadata().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -565,8 +573,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [&ss](const_req req) {
-        auto points = ss.local().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.get_token_metadata().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(fmt::to_string(i.second));
@@ -594,7 +602,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::get_range_to_endpoint_map.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        std::vector<ss::maplist_mapper> res;
        co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace),
                [](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
@@ -619,7 +627,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::get_pending_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        std::vector<ss::maplist_mapper> res;
        return make_ready_future<json::json_return_type>(res);
    });
@@ -635,12 +643,12 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::describe_ring.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) {
-        return describe_ring_as_json(ss, validate_keyspace(ctx, req));
+        return describe_ring_as_json(ss, validate_keyspace(ctx, req->param));
    });

-    ss::get_host_id_map.set(r, [&ss](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(ss.local().get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<http::request> req) {
@@ -660,71 +668,36 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    });

-    ss::get_current_generation_number.set(r, [&ss](std::unique_ptr<http::request> req) {
+    ss::get_current_generation_number.set(r, [&g](std::unique_ptr<http::request> req) {
        gms::inet_address ep(utils::fb_utilities::get_broadcast_address());
-        return ss.local().gossiper().get_current_generation_number(ep).then([](gms::generation_type res) {
+        return g.get_current_generation_number(ep).then([](gms::generation_type res) {
            return make_ready_future<json::json_return_type>(res.value());
        });
    });

    ss::get_natural_endpoints.set(r, [&ctx, &ss](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req.param);
        return container_to_vec(ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"),
                req.get_query_param("key")));
    });

-    ss::cdc_streams_check_and_repair.set(r, [&ss] (std::unique_ptr<http::request> req) {
-        return ss.invoke_on(0, [] (service::storage_service& ss) {
-            return ss.check_and_repair_cdc_streams();
-        }).then([] {
+    ss::cdc_streams_check_and_repair.set(r, [&cdc_gs] (std::unique_ptr<http::request> req) {
+        if (!cdc_gs.local_is_initialized()) {
+            throw std::runtime_error("get_cdc_generation_service: not initialized yet");
+        }
+        return cdc_gs.local().check_and_repair_cdc_streams().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::force_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("flush_memtables", mandatory::no),
-        });
-        params.process(*req);
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.info("force_compaction: flush={}", flush);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<major_compaction_task_impl::flush_mode> fmopt;
-        if (!flush) {
-            fmopt = major_compaction_task_impl::flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_compaction failed: {}", std::current_exception());
-            throw;
-        }
-
-        co_return json_void();
-    });
-
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("keyspace", mandatory::yes),
-            std::pair("cf", mandatory::no),
-            std::pair("flush_memtables", mandatory::no),
-        });
-        params.process(*req);
-        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
-        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.debug("force_keyspace_compaction: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, table_infos);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<major_compaction_task_impl::flush_mode> fmopt;
-        if (!flush) {
-            fmopt = major_compaction_task_impl::flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
+        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
        try {
            co_await task->done();
        } catch (...) {
@@ -737,7 +710,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& db = ctx.db;
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
@@ -747,7 +720,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        }

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos);
+        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
        try {
            co_await task->done();
        } catch (...) {
@@ -762,7 +735,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, res);
+        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, get_table_ids(table_infos), res);
        try {
            co_await task->done();
        } catch (...) {
@@ -780,7 +753,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
+        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos), exclude_current_version);
        try {
            co_await task->done();
        } catch (...) {
@@ -791,16 +764,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        co_return json::json_return_type(0);
    }));

-    ss::force_flush.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        apilog.info("flush all tables");
-        co_await ctx.db.invoke_on_all([] (replica::database& db) {
-            return db.flush_all_tables();
-        });
-        co_return json_void();
-    });
-
    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
        auto& db = ctx.db;
@@ -829,16 +794,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::remove_node.set(r, [&ss](std::unique_ptr<http::request> req) {
        auto host_id = validate_host_id(req->get_query_param("host_id"));
-        std::vector<sstring> ignore_nodes_strs = utils::split_comma_separated_list(req->get_query_param("ignore_nodes"));
+        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
        auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
-        for (const sstring& n : ignore_nodes_strs) {
+        for (std::string n : ignore_nodes_strs) {
            try {
-                auto hoep = locator::host_id_or_endpoint(n);
-                if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
-                    throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
+                std::replace(n.begin(), n.end(), '\"', ' ');
+                std::replace(n.begin(), n.end(), '\'', ' ');
+                boost::trim_all(n);
+                if (!n.empty()) {
+                    auto hoep = locator::host_id_or_endpoint(n);
+                    if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
+                        throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
+                    }
+                    ignore_nodes.push_back(std::move(hoep));
                }
-                ignore_nodes.push_back(std::move(hoep));
            } catch (...) {
                throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}: {}", ignore_nodes_strs, n, std::current_exception()));
            }
@@ -909,7 +879,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::truncate.set(r, [&ctx](std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_family = req->get_query_param("cf");
        return make_ready_future<json::json_return_type>(json_void());
    });
@@ -951,11 +921,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(json_void());
    });

-    ss::is_initialized.set(r, [&ss](std::unique_ptr<http::request> req) {
-        return ss.local().get_operation_mode().then([&ss] (auto mode) {
+    ss::is_initialized.set(r, [&ss, &g](std::unique_ptr<http::request> req) {
+        return ss.local().get_operation_mode().then([&g] (auto mode) {
            bool is_initialized = mode >= service::storage_service::mode::STARTING;
            if (mode == service::storage_service::mode::NORMAL) {
-                is_initialized = ss.local().gossiper().is_enabled();
+                is_initialized = g.is_enabled();
            }
            return make_ready_future<json::json_return_type>(is_initialized);
        });
@@ -1024,9 +994,10 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                ks.set_incremental_backups(value);
            }

-            db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
-                table->set_incremental_backups(value);
-            });
+            for (auto& pair: db.get_column_families()) {
+                auto cf_ptr = pair.second;
+                cf_ptr->set_incremental_backups(value);
+            }
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -1043,14 +1014,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::bulk_load.set(r, [](std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        auto path = req->get_path_param("path");
+        auto path = req->param["path"];
        return make_ready_future<json::json_return_type>(json_void());
    });

    ss::bulk_load_async.set(r, [](std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
-        auto path = req->get_path_param("path");
+        auto path = req->param["path"];
        return make_ready_future<json::json_return_type>(json_void());
    });

@@ -1067,11 +1038,13 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    ss::reset_local_schema.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
        // FIXME: We should truncate schema tables if more than one node in the cluster.
+        auto& fs = ctx.sp.local().features();
        apilog.info("reset_local_schema");
-        co_await ss.local().reload_schema();
-        co_return json_void();
+        return db::schema_tables::recalculate_schema_version(sys_ks, ctx.sp, fs).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<http::request> req) {
@@ -1138,7 +1111,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
@@ -1146,7 +1119,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
@@ -1154,7 +1127,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

        apilog.info("enable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
@@ -1162,7 +1135,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        auto keyspace = validate_keyspace(ctx, req);
+        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

        apilog.info("disable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
@@ -1176,12 +1149,12 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(json_void());
      });

-    ss::get_cluster_name.set(r, [&ss](const_req req) {
-        return ss.local().gossiper().get_cluster_name();
+    ss::get_cluster_name.set(r, [&g](const_req req) {
+        return g.get_cluster_name();
    });

-    ss::get_partitioner_name.set(r, [&ss](const_req req) {
-        return ss.local().gossiper().get_partitioner_name();
+    ss::get_partitioner_name.set(r, [&g](const_req req) {
+        return g.get_partitioner_name();
    });

    ss::get_tombstone_warn_threshold.set(r, [](std::unique_ptr<http::request> req) {
@@ -1258,7 +1231,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::get_effective_ownership.set(r, [&ctx, &ss] (std::unique_ptr<http::request> req) {
-        auto keyspace_name = req->get_path_param("keyspace") == "null" ? "" : validate_keyspace(ctx, req);
+        auto keyspace_name = req->param["keyspace"] == "null" ? "" : validate_keyspace(ctx, req->param);
        return ss.local().effective_ownership(keyspace_name).then([] (auto&& ownership) {
            std::vector<storage_service_json::mapper> res;
            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
@@ -1299,7 +1272,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

                auto& ext = db.get_config().extensions();

-                db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
+                for (auto& t : db.get_column_families() | boost::adaptors::map_values) {
                    auto& schema = t->schema();
                    if ((ks.empty() || ks == schema->ks_name()) && (cf.empty() || cf == schema->cf_name())) {
                        // at most Nsstables long
@@ -1380,7 +1353,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                        }
                        res.emplace_back(std::move(tst));
                    }
-                });
+                }
                std::sort(res.begin(), res.end(), [](const ss::table_sstables& t1, const ss::table_sstables& t2) {
                    return t1.keyspace() < t2.keyspace() || (t1.keyspace() == t2.keyspace() && t1.table() < t2.table());
                });
@@ -1390,125 +1363,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
            });
        });
    });
-
-    ss::reload_raft_topology_state.set(r,
-            [&ss, &group0_client] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        co_await ss.invoke_on(0, [&group0_client] (service::storage_service& ss) -> future<> {
-            apilog.info("Waiting for group 0 read/apply mutex before reloading Raft topology state...");
-            auto holder = co_await group0_client.hold_read_apply_mutex();
-            apilog.info("Reloading Raft topology state");
-            // Using topology_transition() instead of topology_state_load(), because the former notifies listeners
-            co_await ss.topology_transition();
-            apilog.info("Reloaded Raft topology state");
-        });
-        co_return json_void();
-    });
-
-    sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req)  {
-        return ss.local().describe_schema_versions().then([] (auto result) {
-            std::vector<sp::mapper_list> res;
-            for (auto e : result) {
-                sp::mapper_list entry;
-                entry.key = std::move(e.first);
-                entry.value = std::move(e.second);
-                res.emplace_back(std::move(entry));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-}
-
-void unset_storage_service(http_context& ctx, routes& r) {
-    ss::local_hostid.unset(r);
-    ss::get_tokens.unset(r);
-    ss::get_node_tokens.unset(r);
-    ss::get_commitlog.unset(r);
-    ss::get_token_endpoint.unset(r);
-    ss::toppartitions_generic.unset(r);
-    ss::get_leaving_nodes.unset(r);
-    ss::get_moving_nodes.unset(r);
-    ss::get_joining_nodes.unset(r);
-    ss::get_release_version.unset(r);
-    ss::get_scylla_release_version.unset(r);
-    ss::get_schema_version.unset(r);
-    ss::get_all_data_file_locations.unset(r);
-    ss::get_saved_caches_location.unset(r);
-    ss::get_range_to_endpoint_map.unset(r);
-    ss::get_pending_range_to_endpoint_map.unset(r);
-    ss::describe_any_ring.unset(r);
-    ss::describe_ring.unset(r);
-    ss::get_host_id_map.unset(r);
-    ss::get_load.unset(r);
-    ss::get_load_map.unset(r);
-    ss::get_current_generation_number.unset(r);
-    ss::get_natural_endpoints.unset(r);
-    ss::cdc_streams_check_and_repair.unset(r);
-    ss::force_compaction.unset(r);
-    ss::force_keyspace_compaction.unset(r);
-    ss::force_keyspace_cleanup.unset(r);
-    ss::perform_keyspace_offstrategy_compaction.unset(r);
-    ss::upgrade_sstables.unset(r);
-    ss::force_flush.unset(r);
-    ss::force_keyspace_flush.unset(r);
-    ss::decommission.unset(r);
-    ss::move.unset(r);
-    ss::remove_node.unset(r);
-    ss::get_removal_status.unset(r);
-    ss::force_remove_completion.unset(r);
-    ss::set_logging_level.unset(r);
-    ss::get_logging_levels.unset(r);
-    ss::get_operation_mode.unset(r);
-    ss::is_starting.unset(r);
-    ss::get_drain_progress.unset(r);
-    ss::drain.unset(r);
-    ss::truncate.unset(r);
-    ss::get_keyspaces.unset(r);
-    ss::stop_gossiping.unset(r);
-    ss::start_gossiping.unset(r);
-    ss::is_gossip_running.unset(r);
-    ss::stop_daemon.unset(r);
-    ss::is_initialized.unset(r);
-    ss::join_ring.unset(r);
-    ss::is_joined.unset(r);
-    ss::set_stream_throughput_mb_per_sec.unset(r);
-    ss::get_stream_throughput_mb_per_sec.unset(r);
-    ss::get_compaction_throughput_mb_per_sec.unset(r);
-    ss::set_compaction_throughput_mb_per_sec.unset(r);
-    ss::is_incremental_backups_enabled.unset(r);
-    ss::set_incremental_backups_enabled.unset(r);
-    ss::rebuild.unset(r);
-    ss::bulk_load.unset(r);
-    ss::bulk_load_async.unset(r);
-    ss::reschedule_failed_deletions.unset(r);
-    ss::sample_key_range.unset(r);
-    ss::reset_local_schema.unset(r);
-    ss::set_trace_probability.unset(r);
-    ss::get_trace_probability.unset(r);
-    ss::get_slow_query_info.unset(r);
-    ss::set_slow_query.unset(r);
-    ss::enable_auto_compaction.unset(r);
-    ss::disable_auto_compaction.unset(r);
-    ss::enable_tombstone_gc.unset(r);
-    ss::disable_tombstone_gc.unset(r);
-    ss::deliver_hints.unset(r);
-    ss::get_cluster_name.unset(r);
-    ss::get_partitioner_name.unset(r);
-    ss::get_tombstone_warn_threshold.unset(r);
-    ss::set_tombstone_warn_threshold.unset(r);
-    ss::get_tombstone_failure_threshold.unset(r);
-    ss::set_tombstone_failure_threshold.unset(r);
-    ss::get_batch_size_failure_threshold.unset(r);
-    ss::set_batch_size_failure_threshold.unset(r);
-    ss::set_hinted_handoff_throttle_in_kb.unset(r);
-    ss::get_metrics_load.unset(r);
-    ss::get_exceptions.unset(r);
-    ss::get_total_hints_in_progress.unset(r);
-    ss::get_total_hints.unset(r);
-    ss::get_ownership.unset(r);
-    ss::get_effective_ownership.unset(r);
-    ss::sstable_info.unset(r);
-    ss::reload_raft_topology_state.unset(r);
-    sp::get_schema_versions.unset(r);
 }

 enum class scrub_status {
@@ -1546,10 +1400,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
                        });
                    }).then([&s] {
                        return s.write("]").then([&s] {
-                            return s.flush();
+                            return s.close();
                        });
-                    }).finally([&s] {
-                        return s.close();
                    });
                });
            };
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -25,6 +25,7 @@ class system_keyspace;
 }
 namespace netw { class messaging_service; }
 class repair_service;
+namespace cdc { class generation_service; }
 class sstables_loader;

 namespace gms {
@@ -37,11 +38,11 @@ namespace api {

 // verify that the keyspace is found, otherwise a bad_param_exception exception is thrown
 // containing the description of the respective keyspace error.
-sstring validate_keyspace(const http_context& ctx, sstring ks_name);
+sstring validate_keyspace(http_context& ctx, sstring ks_name);

 // verify that the keyspace parameter is found, otherwise a bad_param_exception exception is thrown
 // containing the description of the respective keyspace error.
-sstring validate_keyspace(const http_context& ctx, const std::unique_ptr<http::request>& req);
+sstring validate_keyspace(http_context& ctx, const httpd::parameters& param);

 // splits a request parameter assumed to hold a comma-separated list of table names
 // verify that the tables are found, otherwise a bad_param_exception exception is thrown
@@ -50,6 +51,11 @@ sstring validate_keyspace(const http_context& ctx, const std::unique_ptr<http::r
 // If the parameter is found and empty, returns a list of all table names in the keyspace.
 std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);

+struct table_info {   // name/id pair describing one table; element type of the vector returned by parse_table_infos()
+    sstring name;     // the table's name
+    table_id id;      // the table's table_id
+};
+
 // splits a request parameter assumed to hold a comma-separated list of table names
 // verify that the tables are found, otherwise a bad_param_exception exception is thrown
 // containing the description of the respective no_such_column_family error.
@@ -57,8 +63,7 @@ std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, con
 // if the parameter is not found or is empty, returns a list of all table infos in the keyspace.
 std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);

-void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
-void unset_storage_service(http_context& ctx, httpd::routes& r);
+void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ls);
 void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
 void unset_sstables_loader(http_context& ctx, httpd::routes& r);
 void set_view_builder(http_context& ctx, httpd::routes& r, sharded<db::view::view_builder>& vb);
@@ -74,3 +79,9 @@ void unset_snapshot(http_context& ctx, httpd::routes& r);
 seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);

 } // namespace api
+
+namespace std { // NOTE(review): the C++ standard makes adding function overloads to namespace std undefined behavior; namespace api would be found via ADL -- confirm why std was chosen
+
+std::ostream& operator<<(std::ostream& os, const api::table_info& ti); // stream-insertion overload for api::table_info (declaration only; defined elsewhere)
+
+} // namespace std
--- a/api/stream_manager.cc
+++ b/api/stream_manager.cc
@@ -106,7 +106,7 @@ void set_stream_manager(http_context& ctx, routes& r, sharded<streaming::stream_
    });

    hs::get_total_incoming_bytes.set(r, [&sm](std::unique_ptr<request> req) {
-        gms::inet_address peer(req->get_path_param("peer"));
+        gms::inet_address peer(req->param["peer"]);
        return sm.map_reduce0([peer](streaming::stream_manager& sm) {
            return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
                return sbytes.bytes_received;
@@ -127,7 +127,7 @@ void set_stream_manager(http_context& ctx, routes& r, sharded<streaming::stream_
    });

    hs::get_total_outgoing_bytes.set(r, [&sm](std::unique_ptr<request> req) {
-        gms::inet_address peer(req->get_path_param("peer"));
+        gms::inet_address peer(req->param["peer"]);
        return sm.map_reduce0([peer] (streaming::stream_manager& sm) {
            return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
                return sbytes.bytes_sent;
--- a/api/system.cc
+++ b/api/system.cc
@@ -7,18 +7,10 @@
 */

 #include "api/api-doc/system.json.hh"
-#include "api/api-doc/metrics.json.hh"
-
 #include "api/api.hh"

 #include <seastar/core/reactor.hh>
-#include <seastar/core/metrics_api.hh>
-#include <seastar/core/relabel_config.hh>
 #include <seastar/http/exception.hh>
-#include <seastar/util/short_streams.hh>
-#include <seastar/http/short_streams.hh>
-#include "utils/rjson.hh"
-
 #include "log.hh"
 #include "replica/database.hh"

@@ -28,77 +20,8 @@ namespace api {
 using namespace seastar::httpd;

 namespace hs = httpd::system_json;
-namespace hm = httpd::metrics_json;

 void set_system(http_context& ctx, routes& r) {
-    hm::get_metrics_config.set(r, [](const_req req) {
-        std::vector<hm::metrics_config> res;
-        res.resize(seastar::metrics::get_relabel_configs().size());
-        size_t i = 0;
-        for (auto&& r : seastar::metrics::get_relabel_configs()) {
-            res[i].action = r.action;
-            res[i].target_label = r.target_label;
-            res[i].replacement = r.replacement;
-            res[i].separator = r.separator;
-            res[i].source_labels = r.source_labels;
-            res[i].regex = r.expr.str();
-            i++;
-        }
-        return res;
-    });
-
-    hm::set_metrics_config.set(r, [](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        rapidjson::Document doc;
-        doc.Parse(req->content.c_str());
-        if (!doc.IsArray()) {
-            throw bad_param_exception("Expected a json array");
-        }
-        std::vector<seastar::metrics::relabel_config> relabels;
-        relabels.resize(doc.Size());
-        for (rapidjson::SizeType i = 0; i < doc.Size(); i++) {
-            const auto& element = doc[i];
-            if (element.HasMember("source_labels")) {
-                std::vector<std::string> source_labels;
-                source_labels.resize(element["source_labels"].Size());
-
-                for (size_t j = 0; j < element["source_labels"].Size(); j++) {
-                    source_labels[j] = element["source_labels"][j].GetString();
-                }
-                relabels[i].source_labels = source_labels;
-            }
-            if (element.HasMember("action")) {
-                relabels[i].action = seastar::metrics::relabel_config_action(element["action"].GetString());
-            }
-            if (element.HasMember("replacement")) {
-                relabels[i].replacement = element["replacement"].GetString();
-            }
-            if (element.HasMember("separator")) {
-                relabels[i].separator = element["separator"].GetString();
-            }
-            if (element.HasMember("target_label")) {
-                relabels[i].target_label = element["target_label"].GetString();
-            }
-            if (element.HasMember("regex")) {
-                relabels[i].expr = element["regex"].GetString();
-            }
-        }
-        return do_with(std::move(relabels), false, [](const std::vector<seastar::metrics::relabel_config>& relabels, bool& failed) {
-            return smp::invoke_on_all([&relabels, &failed] {
-                return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
-                    if (result.metrics_relabeled_due_to_collision > 0) {
-                        failed = true;
-                    }
-                    return;
-                });
-            }).then([&failed](){
-                if (failed) {
-                    throw bad_param_exception("conflicts found during relabeling");
-                }
-                return make_ready_future<json::json_return_type>(seastar::json::json_void());
-            });
-        });
-    });
-
    hs::get_system_uptime.set(r, [](const_req req) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
    });
@@ -119,9 +42,9 @@ void set_system(http_context& ctx, routes& r) {

    hs::get_logger_level.set(r, [](const_req req) {
        try {
-            return logging::level_name(logging::logger_registry().get_logger_level(req.get_path_param("name")));
+            return logging::level_name(logging::logger_registry().get_logger_level(req.param["name"]));
        } catch (std::out_of_range& e) {
-            throw bad_param_exception("Unknown logger name " + req.get_path_param("name"));
+            throw bad_param_exception("Unknown logger name " + req.param["name"]);
        }
        // just to keep the compiler happy
        return sstring();
@@ -130,9 +53,9 @@ void set_system(http_context& ctx, routes& r) {
    hs::set_logger_level.set(r, [](const_req req) {
        try {
            logging::log_level level = boost::lexical_cast<logging::log_level>(std::string(req.get_query_param("level")));
-            logging::logger_registry().set_logger_level(req.get_path_param("name"), level);
+            logging::logger_registry().set_logger_level(req.param["name"], level);
        } catch (std::out_of_range& e) {
-            throw bad_param_exception("Unknown logger name " + req.get_path_param("name"));
+            throw bad_param_exception("Unknown logger name " + req.param["name"]);
        } catch (boost::bad_lexical_cast& e) {
            throw bad_param_exception("Unknown logging level " + req.get_query_param("level"));
        }
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -7,7 +7,6 @@
 */

 #include <seastar/core/coroutine.hh>
-#include <seastar/coroutine/exception.hh>

 #include "task_manager.hh"
 #include "api/api-doc/task_manager.json.hh"
@@ -45,7 +44,6 @@ struct task_stats {
        : task_id(task->id().to_sstring())
        , state(task->get_status().state)
        , type(task->type())
-        , scope(task->get_status().scope)
        , keyspace(task->get_status().keyspace)
        , table(task->get_status().table)
        , entity(task->get_status().entity)
@@ -55,7 +53,6 @@ struct task_stats {
    sstring task_id;
    tasks::task_manager::task_state state;
    std::string type;
-    std::string scope;
    std::string keyspace;
    std::string table;
    std::string entity;
@@ -72,7 +69,6 @@ tm::task_status make_status(full_task_status status) {
    tm::task_status res{};
    res.id = status.task_status.id.to_sstring();
    res.type = status.type;
-    res.scope = status.task_status.scope;
    res.state = status.task_status.state;
    res.is_abortable = bool(status.abortable);
    res.start_time = st;
@@ -123,12 +119,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
        auto internal = tasks::is_internal{req_param<bool>(*req, "internal", false)};
        std::vector<chunked_stats> res = co_await ctx.tm.map([&req, internal] (tasks::task_manager& tm) {
            chunked_stats local_res;
-            tasks::task_manager::module_ptr module;
-            try {
-                module = tm.find_module(req->get_path_param("module"));
-            } catch (...) {
-                throw bad_param_exception(fmt::format("{}", std::current_exception()));
-            }
+            auto module = tm.find_module(req->param["module"]);
            const auto& filtered_tasks = module->get_tasks() | boost::adaptors::filtered([&params = req->query_parameters, internal] (const auto& task) {
                return (internal || !task.second->is_internal()) && filter_tasks(task.second, params);
            });
@@ -140,110 +131,82 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {

        std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
            auto s = std::move(os);
-            std::exception_ptr ex;
-            try {
-                auto res = std::move(r);
-                co_await s.write("[");
-                std::string delim = "";
-                for (auto& v: res) {
-                    for (auto& stats: v) {
-                        co_await s.write(std::exchange(delim, ", "));
-                        tm::task_stats ts;
-                        ts = stats;
-                        co_await formatter::write(s, ts);
-                    }
+            auto res = std::move(r);
+            co_await s.write("[");
+            std::string delim = "";
+            for (auto& v: res) {
+                for (auto& stats: v) {
+                    co_await s.write(std::exchange(delim, ", "));
+                    tm::task_stats ts;
+                    ts = stats;
+                    co_await formatter::write(s, ts);
                }
-                co_await s.write("]");
-                co_await s.flush();
-            } catch (...) {
-                ex = std::current_exception();
            }
+            co_await s.write("]");
            co_await s.close();
-            if (ex) {
-                co_await coroutine::return_exception_ptr(std::move(ex));
-            }
        };
        co_return std::move(f);
    });

    tm::get_task_status.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
-                }
-                co_return std::move(task);
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
+        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
+            auto state = task->get_status().state;
+            if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
+                task->unregister_task();
+            }
+            co_return std::move(task);
+        }));
        auto s = co_await retrieve_status(task);
        co_return make_status(s);
    });

    tm::abort_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        try {
-            co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                if (!task->is_abortable()) {
-                    co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
-                }
-                co_await task->abort();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
+            if (!task->is_abortable()) {
+                co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
+            }
+            co_await task->abort();
+        });
        co_return json_void();
    });

    tm::wait_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
-                return task->done().then_wrapped([task] (auto f) {
-                    task->unregister_task();
-                    // done() is called only because we want the task to be complete before getting its status.
-                    // The future should be ignored here as the result does not matter.
-                    f.ignore_ready_future();
-                    return make_foreign(task);
-                });
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
+        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
+            return task->done().then_wrapped([task] (auto f) {
+                task->unregister_task();
+                f.get();
+                return make_foreign(task);
+            });
+        }));
        auto s = co_await retrieve_status(task);
        co_return make_status(s);
    });

    tm::get_task_status_recursively.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& _ctx = ctx;
-        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
        std::queue<tasks::task_manager::foreign_task_ptr> q;
        utils::chunked_vector<full_task_status> res;

-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            // Get requested task.
-            task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
-                }
-                co_return task;
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        // Get requested task.
+        auto task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
+            auto state = task->get_status().state;
+            if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
+                task->unregister_task();
+            }
+            co_return task;
+        }));

        // Push children's statuses in BFS order.
        q.push(co_await task.copy());   // Task cannot be moved since we need it to be alive during whole loop execution.
        while (!q.empty()) {
            auto& current = q.front();
            res.push_back(co_await retrieve_status(current));
-            for (auto& child: current->get_children()) {
-                q.push(co_await child.copy());
+            for (size_t i = 0; i < current->get_children().size(); ++i) {
+                q.push(co_await current->get_children()[i].copy());
            }
            q.pop();
        }
@@ -265,11 +228,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {

    tm::get_and_update_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        uint32_t ttl = cfg.task_ttl_seconds();
-        try {
-            co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
-        } catch (...) {
-            throw bad_param_exception(fmt::format("{}", std::current_exception()));
-        }
+        co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
        co_return json::json_return_type(ttl);
    });
 }
--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -71,36 +71,28 @@ void set_task_manager_test(http_context& ctx, routes& r) {

    tmt::unregister_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->query_parameters["task_id"]}};
-        try {
-            co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                tasks::test_task test_task{task};
-                co_await test_task.unregister_task();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
+            tasks::test_task test_task{task};
+            co_await test_task.unregister_task();
+        });
        co_return json_void();
    });

    tmt::finish_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
        auto it = req->query_parameters.find("error");
        bool fail = it != req->query_parameters.end();
        std::string error = fail ? it->second : "";

-        try {
-            co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
-                tasks::test_task test_task{task};
-                if (fail) {
-                    test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
-                } else {
-                    test_task.finish();
-                }
-                return make_ready_future<>();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
+            tasks::test_task test_task{task};
+            if (fail) {
+                test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
+            } else {
+                test_task.finish();
+            }
+            return make_ready_future<>();
+        });
        co_return json_void();
    });
 }
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -7,7 +7,6 @@ target_sources(scylla_auth
    allow_all_authorizer.cc
    authenticated_user.cc
    authenticator.cc
-    certificate_authenticator.cc
    common.cc
    default_authorizer.cc
    password_authenticator.cc
@@ -31,7 +30,6 @@ target_link_libraries(scylla_auth
  PRIVATE
    cql3
    idl
-    wasmtime_bindings
-    libxcrypt::libxcrypt)
+    wasmtime_bindings)

 add_whole_archive(auth scylla_auth)
--- a/auth/authenticator.cc
+++ b/auth/authenticator.cc
@@ -18,7 +18,3 @@

 const sstring auth::authenticator::USERNAME_KEY("username");
 const sstring auth::authenticator::PASSWORD_KEY("password");
-
-future<std::optional<auth::authenticated_user>> auth::authenticator::authenticate(session_dn_func) const {
-    return make_ready_future<std::optional<auth::authenticated_user>>(std::nullopt);
-}
--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -15,8 +15,6 @@
 #include <set>
 #include <stdexcept>
 #include <unordered_map>
-#include <optional>
-#include <functional>

 #include <seastar/core/enum.hh>
 #include <seastar/core/future.hh>
@@ -38,16 +36,6 @@ namespace auth {

 class authenticated_user;

-// Query alt name info as a single (subject style) string
-using alt_name_func = std::function<future<std::string>()>;
-
-struct certificate_info {
-    std::string subject;
-    alt_name_func get_alt_names;
-};
-
-using session_dn_func = std::function<future<std::optional<certificate_info>>()>;
-
 ///
 /// Abstract client for authenticating role identity.
 ///
@@ -99,13 +87,6 @@ public:
    ///
    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const = 0;

-    ///
-    /// Authenticate (early) using transport info
-    ///
-    /// \returns nullopt if not supported/required. exceptional future if failed
-    ///
-    virtual future<std::optional<authenticated_user>> authenticate(session_dn_func) const;
-
    ///
    /// Create an authentication record for a new user. This is required before the user can log-in.
    ///
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -1,181 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#include "auth/certificate_authenticator.hh"
-
-#include <regex>
-
-#include "utils/class_registrator.hh"
-#include "data_dictionary/data_dictionary.hh"
-#include "cql3/query_processor.hh"
-#include "db/config.hh"
-
-static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
-const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);
-
-static logging::logger clogger("certificate_authenticator");
-
-static const std::string cfg_source_attr = "source";
-static const std::string cfg_query_attr = "query";
-
-static const std::string cfg_source_subject = "SUBJECT";
-static const std::string cfg_source_altname = "ALTNAME";
-
-static const class_registrator<auth::authenticator
-    , auth::certificate_authenticator
-    , cql3::query_processor&
-    , ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
-
-enum class auth::certificate_authenticator::query_source {
-    subject, altname
-};
-
-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::migration_manager&)
-    : _queries([&] {
-        auto& conf = qp.db().get_config();
-        auto queries = conf.auth_certificate_role_queries();
-
-        if (queries.empty()) {
-            throw std::invalid_argument("No role extraction queries specified.");
-        }
-
-        std::vector<std::pair<query_source, boost::regex>> res;
-
-        for (auto& map : queries) {
-            // first, check for any invalid config keys
-            if (map.size() == 2) {
-                try {
-                    auto& source = map.at(cfg_source_attr);
-                    std::string query = map.at(cfg_query_attr);
-
-                    std::transform(source.begin(), source.end(), source.begin(), ::toupper);
-
-                    boost::regex ex(query);
-                    if (ex.mark_count() != 1) {
-                        throw std::invalid_argument("Role query must have exactly one mark expression");
-                    }
-
-                    clogger.debug("Append role query: {} : {}", source, query);
-
-                    if (source == cfg_source_subject) {
-                        res.emplace_back(query_source::subject, std::move(ex));
-                    } else if (source == cfg_source_altname) {
-                        res.emplace_back(query_source::altname, std::move(ex));
-                    } else {
-                        throw std::invalid_argument(fmt::format("Invalid source: {}", map.at(cfg_source_attr)));
-                    }
-                    continue;
-                } catch (std::out_of_range&) {
-                    // just fallthrough
-                } catch (std::regex_error&) {
-                    std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
-                }
-            }
-            throw std::invalid_argument(fmt::format("Invalid query: {}", map));
-        }
-        return res;
-    }())
-{}
-
-auth::certificate_authenticator::~certificate_authenticator() = default;
-
-future<> auth::certificate_authenticator::start() {
-    co_return;
-}
-
-future<> auth::certificate_authenticator::stop() {
-    co_return;
-}
-
-std::string_view auth::certificate_authenticator::qualified_java_name() const {
-    return certificate_authenticator_name;
-}
-
-bool auth::certificate_authenticator::require_authentication() const {
-    return true;
-}
-
-auth::authentication_option_set auth::certificate_authenticator::supported_options() const {
-    return {};
-}
-
-auth::authentication_option_set auth::certificate_authenticator::alterable_options() const {
-    return {};
-}
-
-future<std::optional<auth::authenticated_user>> auth::certificate_authenticator::authenticate(session_dn_func f) const {
-    if (!f) {
-        co_return std::nullopt;
-    }
-    auto dninfo = co_await f();
-    if (!dninfo) {
-        throw exceptions::authentication_exception("No valid certificate found");
-    }
-
-    auto& subject = dninfo->subject;
-    std::optional<std::string> altname ;
-
-    const std::string* source_str = nullptr;
-
-    for (auto& [source, expr] : _queries) {
-        switch (source) {
-            default:
-            case query_source::subject:
-                source_str = &subject;
-                break;
-            case query_source::altname:
-                if (!altname) {
-                    altname = dninfo->get_alt_names ? co_await dninfo->get_alt_names() : std::string{};
-                }
-                source_str = &*altname;
-                break;
-        }
-
-        clogger.debug("Checking {}: {}", int(source), *source_str);
-
-        boost::smatch m;
-        if (boost::regex_search(*source_str, m, expr)) {
-            auto username = m[1].str();
-            clogger.debug("Return username: {}", username);
-            co_return username;
-        }
-    }
-    throw exceptions::authentication_exception(format("Subject '{}'/'{}' does not match any query expression", subject, altname));
-}
-
-
-future<auth::authenticated_user> auth::certificate_authenticator::authenticate(const credentials_map&) const {
-    throw exceptions::authentication_exception("Cannot authenticate using attribute map");
-}
-
-future<> auth::certificate_authenticator::create(std::string_view role_name, const authentication_options& options) const {
-    // TODO: should we keep track of roles/enforce existence? Role manager should deal with this...
-    co_return;
-}
-
-future<> auth::certificate_authenticator::alter(std::string_view role_name, const authentication_options& options) const {
-    co_return;
-}
-
-future<> auth::certificate_authenticator::drop(std::string_view role_name) const {
-    co_return;
-}
-
-future<auth::custom_options> auth::certificate_authenticator::query_custom_options(std::string_view) const {
-    co_return auth::custom_options{};
-}
-
-const auth::resource_set& auth::certificate_authenticator::protected_resources() const {
-    static const resource_set resources;
-    return resources;
-}
-
-::shared_ptr<auth::sasl_challenge> auth::certificate_authenticator::new_sasl_challenge() const {
-    throw exceptions::authentication_exception("Login authentication not supported");
-}
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#pragma once
-
-#include <boost/regex.hpp>
-#include "auth/authenticator.hh"
-
-namespace cql3 {
-
-class query_processor;
-
-} // namespace cql3
-
-namespace service {
-class migration_manager;
-}
-
-namespace auth {
-
-extern const std::string_view certificate_authenticator_name;
-
-class certificate_authenticator : public authenticator {
-    enum class query_source;
-    std::vector<std::pair<query_source, boost::regex>> _queries;
-public:
-    certificate_authenticator(cql3::query_processor&, ::service::migration_manager&);
-    ~certificate_authenticator();
-
-    future<> start() override;
-    future<> stop() override;
-
-    std::string_view qualified_java_name() const override;
-
-    bool require_authentication() const override;
-
-    authentication_option_set supported_options() const override;
-    authentication_option_set alterable_options() const override;
-
-    future<authenticated_user> authenticate(const credentials_map& credentials) const override;
-    future<std::optional<authenticated_user>> authenticate(session_dn_func) const override;
-
-    future<> create(std::string_view role_name, const authentication_options& options) const override;
-    future<> alter(std::string_view role_name, const authentication_options& options) const override;
-    future<> drop(std::string_view role_name) const override;
-
-    future<custom_options> query_custom_options(std::string_view role_name) const override;
-
-    const resource_set& protected_resources() const override;
-
-    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override;
-private:
-};
-
-}
-
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -71,8 +71,7 @@ static future<> create_metadata_table_if_missing_impl(
        auto group0_guard = co_await mm.start_group0_operation();
        auto ts = group0_guard.write_timestamp();
        try {
-            co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
-                    std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
+            co_return co_await mm.announce(co_await mm.prepare_new_column_family_announcement(table, ts), std::move(group0_guard));
        } catch (exceptions::already_exists_exception&) {}
    }
 }
@@ -85,6 +84,20 @@ future<> create_metadata_table_if_missing(
    return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
 }

+future<> wait_for_schema_agreement(::service::migration_manager& mm, const replica::database& db, seastar::abort_source& as) {
+    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); }; // 500 ms delay between polls; NOTE(review): this sleep does not take `as`, so an abort is only noticed at the next poll
+    // Polls in two phases: until db's version differs from empty_version, then until mm reports schema agreement.
+    return do_until([&db, &as] { // mm, db and as are captured by reference and must outlive the returned future
+        as.check(); // presumably throws if an abort was requested, failing the returned future — confirm against seastar::abort_source
+        return db.get_version() != replica::database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
+    });
+}
+
 ::service::query_state& internal_distributed_query_state() noexcept {
 #ifdef DEBUG
    // Give the much slower debug tests more headroom for completing auth queries.
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -22,6 +22,7 @@
 #include "log.hh"
 #include "seastarx.hh"
 #include "utils/exponential_backoff_retry.hh"
+#include "service/query_state.hh"

 using namespace std::chrono_literals;

@@ -31,7 +32,6 @@ class database;

 namespace service {
 class migration_manager;
-class query_state;
 }

 namespace cql3 {
@@ -67,6 +67,8 @@ future<> create_metadata_table_if_missing(
        std::string_view cql,
        ::service::migration_manager&) noexcept;

+future<> wait_for_schema_agreement(::service::migration_manager&, const replica::database&, seastar::abort_source&);
+
 ///
 /// Time-outs for internal, non-local CQL queries.
 ///
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -129,7 +129,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -29,7 +29,6 @@
 #include "utils/class_registrator.hh"
 #include "replica/database.hh"
 #include "cql3/query_processor.hh"
-#include "db/config.hh"

 namespace auth {

@@ -51,23 +50,14 @@ static const class_registrator<

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

-static std::string_view get_config_value(std::string_view value, std::string_view def) {
-    return value.empty() ? def : value;
-}
-
-std::string password_authenticator::default_superuser(const db::config& cfg) {
-    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
-}
-
 password_authenticator::~password_authenticator() {
 }

 password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::migration_manager& mm)
    : _qp(qp)
    , _migration_manager(mm)
-    , _stopped(make_ready_future<>()) 
-    , _superuser(default_superuser(qp.db().get_config()))
-{}
+    , _stopped(make_ready_future<>()) { // starts out already resolved; start() later assigns the background task's future to _stopped
+}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
@@ -116,17 +106,13 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 }

 future<> password_authenticator::create_default_if_missing() const {
-    return default_role_row_satisfies(_qp, &has_salted_hash, _superuser).then([this](bool exists) {
+    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
-            if (salted_pwd.empty()) {
-                salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
-            }
            return _qp.execute_internal(
                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_query_state(),
-                    {salted_pwd, _superuser},
+                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME},
                    cql3::query_processor::cache_internal::no).then([](auto&&) {
                plogger.info("Created default superuser authentication record.");
            });
@@ -146,9 +132,9 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

-                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get0()) {
+                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
                         plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
                     }
@@ -175,8 +161,6 @@ future<> password_authenticator::stop() {
 }

 db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
-    // TODO: this is plain dung. Why treat hardcoded default special, but for example a user-created
-    // super user uses plain LOCAL_ONE?
    if (role_name == DEFAULT_USER_NAME) {
        return db::consistency_level::QUORUM;
    }
@@ -245,8 +229,6 @@ future<authenticated_user> password_authenticator::authenticate(
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
        } catch (exceptions::authentication_exception& e) {
            std::throw_with_nested(e);
-        } catch (exceptions::unavailable_exception& e) {
-            std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
        } catch (...) {
            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -14,10 +14,6 @@

 #include "auth/authenticator.hh"

-namespace db {
-    class config;
-}
-
 namespace cql3 {

 class query_processor;
@@ -37,11 +33,9 @@ class password_authenticator : public authenticator {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    seastar::abort_source _as;
-    std::string _superuser;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
-    static std::string default_superuser(const db::config&);

    password_authenticator(cql3::query_processor&, ::service::migration_manager&);

--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -46,43 +46,60 @@ constexpr std::string_view qualified_name("system_auth.roles");

 future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
+        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

-    for (auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
-        auto results = co_await qp.execute_internal(query, cl
-            , internal_distributed_query_state()
-            , {rolename.value_or(std::string(meta::DEFAULT_SUPERUSER_NAME))}
-            , cql3::query_processor::cache_internal::yes
-            );
-        if (!results->empty()) {
-            co_return p(results->one());
-        }
-    }
-    co_return false;
+    return do_with(std::move(p), [&qp](const auto& p) { // do_with owns the moved-in predicate so the &p captures below stay valid across the async chain
+        return qp.execute_internal(
+                query,
+                db::consistency_level::ONE, // first attempt: consistency level ONE
+                internal_distributed_query_state(),
+                {meta::DEFAULT_SUPERUSER_NAME}, // value bound to the `?` placeholder in the query
+                cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
+            if (results->empty()) { // no row seen at ONE: repeat the same query at QUORUM before giving up
+                return qp.execute_internal(
+                        query,
+                        db::consistency_level::QUORUM,
+                        internal_distributed_query_state(),
+                        {meta::DEFAULT_SUPERUSER_NAME},
+                        cql3::query_processor::cache_internal::yes).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
+                    if (results->empty()) {
+                        return make_ready_future<bool>(false); // default role row absent at both consistency levels
+                    }
+
+                    return make_ready_future<bool>(p(results->one())); // row found at QUORUM: result is the predicate's verdict
+                });
+            }
+
+            return make_ready_future<bool>(p(results->one())); // row found at ONE: result is the predicate's verdict
+        });
+    });
 }

 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
+        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name);

-    auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
-        , internal_distributed_query_state(), cql3::query_processor::cache_internal::no
-        );
-    if (results->empty()) {
-        co_return false;
-    }
-    static const sstring col_name = sstring(meta::roles_table::role_col_name);
+    return do_with(std::move(p), [&qp](const auto& p) { // do_with owns the moved-in predicate so the &p captures below stay valid across the async chain
+        return qp.execute_internal(
+                query,
+                db::consistency_level::QUORUM, // single full-table read at QUORUM (no ONE-first attempt here)
+                internal_distributed_query_state(),
+                cql3::query_processor::cache_internal::no).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
+            if (results->empty()) {
+                return false; // the roles table returned no rows at all
+            }

-    co_return boost::algorithm::any_of(*results, [&](const cql3::untyped_result_set_row& row) {
-        auto superuser = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;
-        const bool is_nondefault = row.get_as<sstring>(col_name) != superuser;
-        return is_nondefault && p(row);
+            static const sstring col_name = sstring(meta::roles_table::role_col_name); // built once and reused by every row check
+
+            return boost::algorithm::any_of(*results, [&p](const cql3::untyped_result_set_row& row) {
+                const bool is_nondefault = row.get_as<sstring>(col_name) != meta::DEFAULT_SUPERUSER_NAME; // rows for the default superuser never count
+                return is_nondefault && p(row); // p is only evaluated for nondefault rows
+            });
+        });
     });
 }

--- a/auth/roles-metadata.hh
+++ b/auth/roles-metadata.hh
@@ -43,17 +43,13 @@ constexpr std::string_view role_col_name{"role", 4};
 ///
 future<bool> default_role_row_satisfies(
        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
+        std::function<bool(const cql3::untyped_result_set_row&)>);

 ///
 /// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
 ///
 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
+        std::function<bool(const cql3::untyped_result_set_row&)>);

 }
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -178,8 +178,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c
                    opts,
                    true);

-            co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
-                    std::move(group0_guard), format("auth_service: create {} keyspace", meta::AUTH_KS));
+            co_return co_await mm.announce(mm.prepare_new_keyspace_announcement(ksm, ts), std::move(group0_guard));
        }
    }
 }
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -28,8 +28,6 @@
 #include "log.hh"
 #include "utils/class_registrator.hh"
 #include "replica/database.hh"
-#include "service/migration_manager.hh"
-#include "password_authenticator.hh"

 namespace auth {

@@ -129,13 +127,6 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
 }

-standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm)
-    : _qp(qp)
-    , _migration_manager(mm)
-    , _stopped(make_ready_future<>())
-    , _superuser(password_authenticator::default_superuser(qp.db().get_config()))
-{}
-
 std::string_view standard_role_manager::qualified_java_name() const noexcept {
    return "org.apache.cassandra.auth.CassandraRoleManager";
 }
@@ -177,7 +168,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
 }

 future<> standard_role_manager::create_default_role_if_missing() const {
-    return default_role_row_satisfies(_qp, &has_can_login, _superuser).then([this](bool exists) {
+    return default_role_row_satisfies(_qp, &has_can_login).then([this](bool exists) {
        if (!exists) {
            static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, true, true)",
                    meta::roles_table::qualified_name,
@@ -187,9 +178,9 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_query_state(),
-                    {_superuser},
-                    cql3::query_processor::cache_internal::no).then([this](auto&&) {
-                log.info("Created default superuser role '{}'.", _superuser);
+                    {meta::DEFAULT_SUPERUSER_NAME},
+                    cql3::query_processor::cache_internal::no).then([](auto&&) {
+                log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
                return make_ready_future<>();
            });
        }
@@ -241,7 +232,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -34,10 +34,13 @@ class standard_role_manager final : public role_manager {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    seastar::abort_source _as;
-    std::string _superuser;

 public:
-    standard_role_manager(cql3::query_processor&, ::service::migration_manager&);
+    standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm) // stores references to qp and mm; no other work is done here
+            : _qp(qp)
+            , _migration_manager(mm)
+            , _stopped(make_ready_future<>()) { // starts out already resolved; start() later assigns the background task's future
+    }

    virtual std::string_view qualified_java_name() const noexcept override;

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -37,8 +37,10 @@
 // The constants q1 and q2 are used to determine the proportional factor at each stage.
 class backlog_controller {
 public:
-    using scheduling_group = seastar::scheduling_group;
-
+    struct scheduling_group { // bundles a CPU scheduling group with an I/O priority class
+        seastar::scheduling_group cpu = default_scheduling_group(); // CPU side; falls back to the default scheduling group
+        seastar::io_priority_class io = default_priority_class(); // I/O side; falls back to the default priority class
+    };
    future<> shutdown() {
        _update_timer.cancel();
        return std::move(_inflight_update);
@@ -56,11 +58,11 @@ protected:
    };

    scheduling_group _scheduling_group;
+    timer<> _update_timer;

    std::vector<control_point> _control_points;

    std::function<float()> _current_backlog;
-    timer<> _update_timer;
    // updating shares for an I/O class may contact another shard and returns a future.
    future<> _inflight_update;

@@ -80,9 +82,9 @@ protected:
                       std::vector<control_point> control_points, std::function<float()> backlog,
                       float static_shares = 0)
        : _scheduling_group(std::move(sg))
+        , _update_timer([this] { adjust(); })
        , _control_points()
        , _current_backlog(std::move(backlog))
-        , _update_timer([this] { adjust(); })
        , _inflight_update(make_ready_future<>())
        , _static_shares(static_shares)
    {
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2023-present ScyllaDB
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-here=$(dirname "$0")
-exec "$here/../tools/cqlsh/bin/cqlsh" "$@"
-
--- a/bytes.hh
+++ b/bytes.hh
@@ -89,7 +89,7 @@ public:
        // get the delimeter if any
        auto it = ctx.begin();
        auto end = ctx.end();
-        if (it != end && *it != '}') {
+        if (it != end) {
            int group_size = *it++ - '0';
            if (group_size < 0 ||
                static_cast<size_t>(group_size) > sizeof(uint64_t)) {
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -98,16 +98,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    bool _next_row_in_range = false;
    bool _has_rt = false;

-    // True iff current population interval starts at before_all_clustered_rows
-    // and _last_row is unset. (And the read isn't reverse).
-    //
-    // Rationale: in the "most general" step of cache population,
-    // we mark the `(_last_row, ...] `range as continuous, which can involve doing something to `_last_row`.
-    // But when populating the range `(before_all_clustered_rows, ...)`,
-    // a rows_entry at `before_all_clustered_rows` needn't exist.
-    // Thus this case needs a special treatment which doesn't involve `_last_row`.
-    // And for that, this case it has to be recognized (via this flag).
-    //
+    // True iff current population interval, since the previous clustering row, starts before all clustered rows.
    // We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
    // because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
    // us from marking the interval as continuous.
@@ -119,9 +110,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    flat_mutation_reader_v2* _underlying = nullptr;
    flat_mutation_reader_v2_opt _underlying_holder;

-    gc_clock::time_point _read_time;
-    gc_clock::time_point _gc_before;
-
    future<> do_fill_buffer();
    future<> ensure_underlying();
    void copy_from_cache_to_buffer();
@@ -156,8 +144,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    bool maybe_add_to_cache(const range_tombstone_change& rtc);
    void maybe_add_to_cache(const static_row& sr);
    void maybe_set_static_row_continuous();
-    void set_rows_entry_continuous(rows_entry& e);
-    void restore_continuity_after_insertion(const mutation_partition::rows_type::iterator&);
    void finish_reader() {
        push_mutation_fragment(*_schema, _permit, partition_end());
        _end_of_stream = true;
@@ -192,20 +178,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    const schema& table_schema() {
        return *_snp->schema();
    }
-
-    gc_clock::time_point get_read_time() {
-        return _read_context.tombstone_gc_state() ? gc_clock::now() : gc_clock::time_point::min();
-    }
-
-    gc_clock::time_point get_gc_before(const schema& schema, dht::decorated_key dk, const gc_clock::time_point query_time) {
-        auto gc_state = _read_context.tombstone_gc_state();
-        if (gc_state) {
-            return gc_state->get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
-        }
-
-        return gc_clock::time_point::min();
-    }
-
 public:
    cache_flat_mutation_reader(schema_ptr s,
                               dht::decorated_key dk,
@@ -224,8 +196,6 @@ public:
        , _read_context_holder()
        , _read_context(ctx)    // ctx is owned by the caller, who's responsible for closing it.
        , _next_row(*_schema, *_snp, false, _read_context.is_reversed())
-        , _read_time(get_read_time())
-        , _gc_before(get_gc_before(*_schema, dk, _read_time))
    {
        clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
                      fmt::ptr(&*_snp));
@@ -352,7 +322,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer() {
            });
        }
        _state = state::reading_from_underlying;
-        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema) && !_read_context.is_reversed() && !_last_row;
+        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema) && !_read_context.is_reversed();
        _underlying_upper_bound = _next_row_in_range ? position_in_partition::before_key(_next_row.position())
                                                     : position_in_partition(_upper_bound);
        if (!_read_context.partition_exists()) {
@@ -453,10 +423,7 @@ future<> cache_flat_mutation_reader::read_from_underlying() {
                                auto e = alloc_strategy_unique_ptr<rows_entry>(
                                    current_allocator().construct<rows_entry>(_ck_ranges_curr->start()->value()));
                                // Use _next_row iterator only as a hint, because there could be insertions after _upper_bound.
-                                auto insert_result = rows.insert_before_hint(
-                                        _next_row.at_a_row() ? _next_row.get_iterator_in_latest_version() : rows.begin(),
-                                        std::move(e),
-                                        cmp);
+                                auto insert_result = rows.insert_before_hint(_next_row.get_iterator_in_latest_version(), std::move(e), cmp);
                                if (insert_result.second) {
                                    auto it = insert_result.first;
                                    _snp->tracker()->insert(*it);
@@ -473,22 +440,18 @@ future<> cache_flat_mutation_reader::read_from_underlying() {
                                auto e = alloc_strategy_unique_ptr<rows_entry>(
                                    current_allocator().construct<rows_entry>(table_s, to_table_domain(_upper_bound), is_dummy::yes, is_continuous::no));
                                // Use _next_row iterator only as a hint, because there could be insertions after _upper_bound.
-                                auto insert_result = rows.insert_before_hint(
-                                        _next_row.at_a_row() ? _next_row.get_iterator_in_latest_version() : rows.begin(),
-                                        std::move(e),
-                                        cmp);
+                                auto insert_result = rows.insert_before_hint(_next_row.get_iterator_in_latest_version(), std::move(e), cmp);
                                if (insert_result.second) {
                                    clogger.trace("csm {}: L{}: inserted dummy at {}", fmt::ptr(this), __LINE__, _upper_bound);
                                    _snp->tracker()->insert(*insert_result.first);
-                                    restore_continuity_after_insertion(insert_result.first);
                                }
                                if (_read_context.is_reversed()) [[unlikely]] {
                                    clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), _last_row.position(), insert_result.first->position(), _current_tombstone);
-                                    set_rows_entry_continuous(*_last_row);
+                                    _last_row->set_continuous(true);
                                    _last_row->set_range_tombstone(_current_tombstone);
                                } else {
                                    clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(), _last_row.position(), _current_tombstone);
-                                    set_rows_entry_continuous(*insert_result.first);
+                                    insert_result.first->set_continuous(true);
                                    insert_result.first->set_range_tombstone(_current_tombstone);
                                }
                                maybe_drop_last_entry(_current_tombstone);
@@ -523,11 +486,11 @@ bool cache_flat_mutation_reader::ensure_population_lower_bound() {
        rows_entry::tri_compare cmp(*_schema);
        partition_snapshot_row_cursor cur(*_schema, *_snp, false, _read_context.is_reversed());

-        if (!cur.advance_to(to_query_domain(_last_row.position()))) {
+        if (!cur.advance_to(_last_row.position())) {
            return false;
        }

-        if (cmp(cur.table_position(), _last_row.position()) != 0) {
+        if (cmp(cur.position(), _last_row.position()) != 0) {
            return false;
        }

@@ -549,7 +512,7 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
    position_in_partition::equal_compare eq(*_schema);
    if (can_populate()
            && ensure_population_lower_bound()
-            && !eq(_last_row.position(), _next_row.table_position())) {
+            && !eq(_last_row.position(), _next_row.position())) {
        with_allocator(_snp->region().allocator(), [&] {
            rows_entry& e = _next_row.ensure_entry_in_latest().row;
            auto& rows = _snp->version()->partition().mutable_clustered_rows();
@@ -571,14 +534,14 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
                        }
                        clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
                                      _last_row.position(), _current_tombstone);
-                        set_rows_entry_continuous(*insert_result.first);
+                        insert_result.first->set_continuous(true);
                        insert_result.first->set_range_tombstone(_current_tombstone);
                        clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), _last_row.position());
-                        set_rows_entry_continuous(*_last_row);
+                        _last_row->set_continuous(true);
                    });
                } else {
                    clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), _last_row.position(), _current_tombstone);
-                    set_rows_entry_continuous(*_last_row);
+                    _last_row->set_continuous(true);
                    _last_row->set_range_tombstone(_current_tombstone);
                }
            } else {
@@ -596,18 +559,18 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
                        if (insert_result.second) {
                            clogger.trace("csm {}: L{}: inserted dummy at {}", fmt::ptr(this), __LINE__, insert_result.first->position());
                            _snp->tracker()->insert(*insert_result.first);
-                            clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
-                                          _last_row.position(), _current_tombstone);
-                            set_rows_entry_continuous(*insert_result.first);
-                            insert_result.first->set_range_tombstone(_current_tombstone);
                        }
+                        clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
+                                      _last_row.position(), _current_tombstone);
+                        insert_result.first->set_continuous(true);
+                        insert_result.first->set_range_tombstone(_current_tombstone);
                        clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
-                        set_rows_entry_continuous(e);
+                        e.set_continuous(true);
                    });
                } else {
                    clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), e.position(), _current_tombstone);
                    e.set_range_tombstone(_current_tombstone);
-                    set_rows_entry_continuous(e);
+                    e.set_continuous(true);
                }
            }
            maybe_drop_last_entry(_current_tombstone);
@@ -637,27 +600,26 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
            current_allocator().construct<rows_entry>(table_schema(), cr.key(), cr.as_deletable_row()));
        new_entry->set_continuous(false);
        new_entry->set_range_tombstone(_current_tombstone);
-        auto it = _next_row.iterators_valid() && _next_row.at_a_row() ? _next_row.get_iterator_in_latest_version()
+        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version()
                                              : mp.clustered_rows().lower_bound(cr.key(), cmp);
        auto insert_result = mp.mutable_clustered_rows().insert_before_hint(it, std::move(new_entry), cmp);
        it = insert_result.first;
        if (insert_result.second) {
            _snp->tracker()->insert(*it);
-            restore_continuity_after_insertion(it);
        }

        rows_entry& e = *it;
        if (ensure_population_lower_bound()) {
            if (_read_context.is_reversed()) [[unlikely]] {
                clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), _last_row.position());
-                set_rows_entry_continuous(*_last_row);
+                _last_row->set_continuous(true);
                // _current_tombstone must also apply to _last_row itself (if it's non-dummy)
                // because otherwise there would be a rtc after it, either creating a different entry,
                // or clearing _last_row if population did not happen.
                _last_row->set_range_tombstone(_current_tombstone);
            } else {
                clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
-                set_rows_entry_continuous(e);
+                e.set_continuous(true);
                e.set_range_tombstone(_current_tombstone);
            }
        } else {
@@ -702,31 +664,26 @@ bool cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone_change

        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
                current_allocator().construct<rows_entry>(table_schema(), to_table_domain(rtc.position()), is_dummy::yes, is_continuous::no));
-        auto it = _next_row.iterators_valid() && _next_row.at_a_row() ? _next_row.get_iterator_in_latest_version()
+        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version()
                                              : mp.clustered_rows().lower_bound(to_table_domain(rtc.position()), cmp);
        auto insert_result = mp.mutable_clustered_rows().insert_before_hint(it, std::move(new_entry), cmp);
        it = insert_result.first;
        if (insert_result.second) {
            _snp->tracker()->insert(*it);
-            restore_continuity_after_insertion(it);
        }

        rows_entry& e = *it;
        if (ensure_population_lower_bound()) {
            // underlying may emit range_tombstone_change fragments with the same position.
            // In such case, the range to which the tombstone from the first fragment applies is empty and should be ignored.
-            //
-            // Note: we are using a query schema comparator to compare table schema positions here,
-            // but this is okay because we are only checking for equality,
-            // which is preserved by schema reversals.
-            if (q_cmp(_last_row.position(), it->position()) != 0) {
+            if (q_cmp(_last_row.position(), it->position()) < 0) {
                if (_read_context.is_reversed()) [[unlikely]] {
                    clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), _last_row.position(), prev);
-                    set_rows_entry_continuous(*_last_row);
+                    _last_row->set_continuous(true);
                    _last_row->set_range_tombstone(prev);
                } else {
                    clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), e.position(), prev);
-                    set_rows_entry_continuous(e);
+                    e.set_continuous(true);
                    e.set_range_tombstone(prev);
                }
            }
@@ -773,51 +730,9 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
        }
    }

+    // We add the row to the buffer even when it's full.
+    // This simplifies the code. For more info see #3139.
    if (_next_row_in_range) {
-        bool remove_row = false;
-
-        if (_read_context.tombstone_gc_state() // do not compact rows when tombstone_gc_state is not set (used in some unit tests)
-            && !_next_row.dummy()
-            && _snp->at_latest_version()
-            && _snp->at_oldest_version()) {
-            deletable_row& row = _next_row.latest_row();
-            tombstone range_tomb = _next_row.range_tombstone_for_row();
-            auto t = row.deleted_at();
-            t.apply(range_tomb);
-
-            auto row_tomb_expired = [&](row_tombstone tomb) {
-                return (tomb && tomb.max_deletion_time() < _gc_before);
-            };
-
-            auto is_row_dead = [&](const deletable_row& row) {
-                auto& m = row.marker();
-                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
-            };
-
-            if (row_tomb_expired(t) || is_row_dead(row)) {
-                can_gc_fn always_gc = [&](tombstone) { return true; };
-                const schema& row_schema = _next_row.latest_row_schema();
-
-                _read_context.cache()._tracker.on_row_compacted();
-
-                with_allocator(_snp->region().allocator(), [&] {
-                    deletable_row row_copy(row_schema, row);
-                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
-                    std::swap(row, row_copy);
-                });
-                remove_row = row.empty();
-
-                auto tomb_expired = [&](tombstone tomb) {
-                    return (tomb && tomb.deletion_time < _gc_before);
-                };
-
-                auto latests_range_tomb = _next_row.get_iterator_in_latest_version()->range_tombstone();
-                if (tomb_expired(latests_range_tomb)) {
-                    _next_row.get_iterator_in_latest_version()->set_range_tombstone({});
-                }
-            }
-        }
-
        if (_next_row.range_tombstone_for_row() != _current_tombstone) [[unlikely]] {
            auto tomb = _next_row.range_tombstone_for_row();
            auto new_lower_bound = position_in_partition::before_key(_next_row.position());
@@ -827,31 +742,8 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
            _current_tombstone = tomb;
            _read_context.cache()._tracker.on_range_tombstone_read();
        }
-
-        if (remove_row) {
-            _read_context.cache()._tracker.on_row_compacted_away();
-
-            _lower_bound = position_in_partition::after_key(*_schema, _next_row.position());
-
-            partition_snapshot_row_weakref row_ref(_next_row);
-            move_to_next_entry();
-
-            with_allocator(_snp->region().allocator(), [&] {
-                cache_tracker& tracker = _read_context.cache()._tracker;
-                if (row_ref->is_linked()) {
-                    tracker.get_lru().remove(*row_ref);
-                }
-                row_ref->on_evicted(tracker);
-            });
-
-            _snp->region().allocator().invalidate_references();
-            _next_row.force_valid();
-        } else {
-            // We add the row to the buffer even when it's full.
-            // This simplifies the code. For more info see #3139.
-            add_to_buffer(_next_row);
-            move_to_next_entry();
-        }
+        add_to_buffer(_next_row);
+        move_to_next_entry();
    } else {
        move_to_next_range();
    }
@@ -905,10 +797,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
                    auto& rows = _snp->version()->partition().mutable_clustered_rows();
                    auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(table_schema(),
                            to_table_domain(_lower_bound), is_dummy::yes, is_continuous::no));
-                    return rows.insert_before_hint(
-                            _next_row.at_a_row() ? _next_row.get_iterator_in_latest_version() : rows.begin(),
-                            std::move(new_entry),
-                            cmp);
+                    return rows.insert_before_hint(_next_row.get_iterator_in_latest_version(), std::move(new_entry), cmp);
                });
                auto it = insert_result.first;
                if (insert_result.second) {
@@ -1005,7 +894,7 @@ void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_curs
    if (!row.dummy()) {
        _read_context.cache().on_row_hit();
        if (_read_context.digest_requested()) {
-            row.latest_row_prepare_hash();
+            row.latest_row().cells().prepare_hash(table_schema(), column_kind::regular_column);
        }
        add_clustering_row_to_buffer(mutation_fragment_v2(*_schema, _permit, row.row()));
    } else {
@@ -1068,28 +957,6 @@ void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
    }
 }

-// Last dummies can exist in a quasi-evicted state, where they are unlinked from LRU,
-// but still alive.
-// But while in this state, they mustn't carry any information (i.e. continuity),
-// due to the "older versions are evicted first" rule of MVCC.
-// Thus, when we make an entry continuous, we must ensure that it isn't an
-// unlinked last dummy.
-inline
-void cache_flat_mutation_reader::set_rows_entry_continuous(rows_entry& e) {
-    e.set_continuous(true);
-    if (!e.is_linked()) [[unlikely]] {
-        _snp->tracker()->touch(e);
-    }
-}
-
-inline
-void cache_flat_mutation_reader::restore_continuity_after_insertion(const mutation_partition::rows_type::iterator& it) {
-    if (auto x = std::next(it); x->continuous()) {
-        it->set_continuous(true);
-        it->set_range_tombstone(x->range_tombstone());
-    }
-}
-
 inline
 bool cache_flat_mutation_reader::can_populate() const {
    return _snp->at_latest_version() && _read_context.cache().phase_of(_read_context.key()) == _read_context.phase();
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -13,7 +13,6 @@
 #include <seastar/core/sleep.hh>
 #include <seastar/core/coroutine.hh>

-#include "gms/endpoint_state.hh"
 #include "keys.hh"
 #include "schema/schema_builder.hh"
 #include "replica/database.hh"
@@ -26,7 +25,6 @@
 #include "gms/inet_address.hh"
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
-#include "utils/error_injection.hh"
 #include "utils/UUID_gen.hh"

 #include "cdc/generation.hh"
@@ -51,16 +49,8 @@ namespace db {

 namespace cdc {

-api::timestamp_clock::duration get_generation_leeway() {
-    static thread_local auto generation_leeway =
-            std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
-
-    utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] {
-        generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5));
-    });
-
-    return generation_leeway;
-}
+extern const api::timestamp_clock::duration generation_leeway =
+    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));

 static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    i = net::hton(i);
@@ -76,10 +66,10 @@ static constexpr auto stream_id_index_shift = stream_id_version_shift + stream_i
 static constexpr auto stream_id_random_shift = stream_id_index_shift + stream_id_index_bits;

 /**
- * Responsibility for encoding stream_id moved from the create_stream_ids
- * function to this constructor, to keep knowledge of composition in a
- * single place. Note the make_new_generation_description function
- * defines the "order" in which we view vnodes etc.
+ * Responsibility for encoding stream_id moved from the factory method to
+ * this constructor, to keep knowledge of composition in a single place.
+ * Note this is private and friended to topology_description_generator,
+ * because that class defines the "order" in which we view vnodes etc.
 */
 stream_id::stream_id(dht::token token, size_t vnode_index)
    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
@@ -163,18 +153,18 @@ bool token_range_description::operator==(const token_range_description& o) const
        && sharding_ignore_msb == o.sharding_ignore_msb;
 }

-topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
+topology_description::topology_description(std::vector<token_range_description> entries)
    : _entries(std::move(entries)) {}

 bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-utils::chunked_vector<token_range_description>&& topology_description::entries() && {
+std::vector<token_range_description>&& topology_description::entries() && {
    return std::move(_entries);
 }

@@ -193,48 +183,98 @@ static std::vector<stream_id> create_stream_ids(
    return result;
 }

+class topology_description_generator final {
+    const std::unordered_set<dht::token>& _bootstrap_tokens;
+    const locator::token_metadata_ptr _tmptr;
+    const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& _get_sharding_info;
+
+    // Compute a set of tokens that split the token ring into vnodes
+    auto get_tokens() const {
+        auto tokens = _tmptr->sorted_tokens();
+        auto it = tokens.insert(
+                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
+        std::sort(it, tokens.end());
+        std::inplace_merge(tokens.begin(), it, tokens.end());
+        tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
+        return tokens;
+    }
+
+    token_range_description create_description(size_t index, dht::token start, dht::token end) const {
+        token_range_description desc;
+
+        desc.token_range_end = end;
+
+        auto [shard_count, ignore_msb] = _get_sharding_info(end);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
+        desc.sharding_ignore_msb = ignore_msb;
+
+        return desc;
+    }
+public:
+    topology_description_generator(
+            const std::unordered_set<dht::token>& bootstrap_tokens,
+            const locator::token_metadata_ptr tmptr,
+            // This function must return sharding parameters for a node that owns the vnode ending with
+            // the given token. Returns <shard_count, ignore_msb> pair.
+            const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info)
+        : _bootstrap_tokens(bootstrap_tokens)
+        , _tmptr(std::move(tmptr))
+        , _get_sharding_info(get_sharding_info)
+    {}
+
+    /*
+     * Generate a set of CDC stream identifiers such that for each shard
+     * and vnode pair there exists a stream whose token falls into this vnode
+     * and is owned by this shard. It is sometimes not possible to generate
+     * a CDC stream identifier for some (vnode, shard) pair because not all
+     * shards have to own tokens in a vnode. A small vnode can be totally owned
+     * by a single shard. In such a case, a stream identifier that maps to
+     * the end of the vnode is generated.
+     *
+     * Then build a cdc::topology_description which maps tokens to generated
+     * stream identifiers, such that if token T is owned by shard S in vnode V,
+     * it gets mapped to the stream identifier generated for (S, V).
+     */
+    // Run in seastar::async context.
+    topology_description generate() const {
+        const auto tokens = get_tokens();
+
+        std::vector<token_range_description> vnode_descriptions;
+        vnode_descriptions.reserve(tokens.size());
+
+        vnode_descriptions.push_back(
+                create_description(0, tokens.back(), tokens.front()));
+        for (size_t idx = 1; idx < tokens.size(); ++idx) {
+            vnode_descriptions.push_back(
+                    create_description(idx, tokens[idx - 1], tokens[idx]));
+        }
+
+        return {std::move(vnode_descriptions)};
+    }
+};
+
 bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
    auto my_host_id = g.get_host_id(me);
-    return g.for_each_endpoint_state_until([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
-        return stop_iteration(my_host_id < g.get_host_id(node));
-    }) == stop_iteration::no;
+    auto& eps = g.get_endpoint_states();
+    return std::none_of(eps.begin(), eps.end(),
+            [&] (const std::pair<gms::inet_address, gms::endpoint_state>& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
 }

-bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
-    if (tm.sorted_tokens().size() != gen.entries().size()) {
-        // We probably have garbage streams from old generations
-        cdc_log.info("Generation size does not match the token ring");
-        return false;
-    } else {
-        std::unordered_set<dht::token> gen_ends;
-        for (const auto& entry : gen.entries()) {
-            gen_ends.insert(entry.token_range_end);
-        }
-        for (const auto& metadata_token : tm.sorted_tokens()) {
-            if (!gen_ends.contains(metadata_token)) {
-                cdc_log.warn("CDC generation missing token {}", metadata_token);
-                return false;
-            }
-        }
-        return true;
-    }
-}
-
-static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutations(
+future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
        schema_ptr s,
-        const partition_key& pkey,
-        noncopyable_function<clustering_key (dht::token)>&& get_ckey_from_range_end,
+        utils::UUID id,
        const cdc::topology_description& desc,
        size_t mutation_size_threshold,
        api::timestamp_type ts) {
    utils::chunked_vector<mutation> res;
-    res.emplace_back(s, pkey);
+    res.emplace_back(s, partition_key::from_singular(*s, id));
+    res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
    size_t size_estimate = 0;
-    size_t total_size_estimate = 0;
    for (auto& e : desc.entries()) {
        if (size_estimate >= mutation_size_threshold) {
-            total_size_estimate += size_estimate;
-            res.emplace_back(s, pkey);
+            res.emplace_back(s, partition_key::from_singular(*s, id));
            size_estimate = 0;
        }

@@ -245,60 +285,16 @@ static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutatio
        }

        size_estimate += e.streams.size() * 20;
-        auto ckey = get_ckey_from_range_end(e.token_range_end);
+        auto ckey = clustering_key::from_singular(*s, dht::token::to_int64(e.token_range_end));
        res.back().set_cell(ckey, to_bytes("streams"), make_set_value(db::cdc_streams_set_type, std::move(streams)), ts);
        res.back().set_cell(ckey, to_bytes("ignore_msb"), int8_t(e.sharding_ignore_msb), ts);

        co_await coroutine::maybe_yield();
    }

-    total_size_estimate += size_estimate;
-
-    // Copy mutations n times, where n is picked so that the memory size of all mutations together exceeds `max_command_size`.
-    utils::get_local_injector().inject("cdc_generation_mutations_replication", [&res, total_size_estimate, mutation_size_threshold] {
-        utils::chunked_vector<mutation> new_res;
-
-        size_t number_of_copies = (mutation_size_threshold / total_size_estimate + 1) * 2;
-        for (size_t i = 0; i < number_of_copies; ++i) {
-            std::copy(res.begin(), res.end(), std::back_inserter(new_res));
-        }
-
-        res = std::move(new_res);
-    });
-
    co_return res;
 }

-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
-        schema_ptr s,
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        size_t mutation_size_threshold,
-        api::timestamp_type ts) {
-    auto pkey = partition_key::from_singular(*s, id);
-    auto get_ckey = [s] (dht::token range_end) {
-        return clustering_key::from_singular(*s, dht::token::to_int64(range_end));
-    };
-
-    auto res = co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
-    res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
-    co_return res;
-}
-
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
-        schema_ptr s,
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        size_t mutation_size_threshold,
-        api::timestamp_type ts) {
-    auto pkey = partition_key::from_singular(*s, CDC_GENERATIONS_V3_KEY);
-    auto get_ckey = [&] (dht::token range_end) {
-        return clustering_key::from_exploded(*s, {timeuuid_type->decompose(id), long_type->decompose(dht::token::to_int64(range_end))}) ;
-    };
-
-    co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
-}
-
 // non-static for testing
 size_t limit_of_streams_in_topology_description() {
    // Each stream takes 16B and we don't want to exceed 4MB so we can have
@@ -331,47 +327,13 @@ topology_description limit_number_of_streams_if_needed(topology_description&& de
    return topology_description(std::move(entries));
 }

-// Compute a set of tokens that split the token ring into vnodes.
-static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
-    auto tokens = tmptr->sorted_tokens();
-    auto it = tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
-    std::sort(it, tokens.end());
-    std::inplace_merge(tokens.begin(), it, tokens.end());
-    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
-    return tokens;
-}
-
-static token_range_description create_token_range_description(
-        size_t index,
-        dht::token start,
-        dht::token end,
-        const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info) {
-    token_range_description desc;
-
-    desc.token_range_end = end;
-
-    auto [shard_count, ignore_msb] = get_sharding_info(end);
-    desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
-    desc.sharding_ignore_msb = ignore_msb;
-
-    return desc;
-}
-
-cdc::topology_description make_new_generation_description(
+std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
        const std::unordered_set<dht::token>& bootstrap_tokens,
        const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
        const locator::token_metadata_ptr tmptr) {
-    const auto tokens = get_tokens(bootstrap_tokens, tmptr);
-
-    utils::chunked_vector<token_range_description> vnode_descriptions;
-    vnode_descriptions.reserve(tokens.size());
-
-    vnode_descriptions.push_back(create_token_range_description(0, tokens.back(), tokens.front(), get_sharding_info));
-    for (size_t idx = 1; idx < tokens.size(); ++idx) {
-        vnode_descriptions.push_back(create_token_range_description(idx, tokens[idx - 1], tokens[idx], get_sharding_info));
-    }
-
-    return {std::move(vnode_descriptions)};
+    auto gen = topology_description_generator(bootstrap_tokens, tmptr, get_sharding_info).generate();
+    auto uuid = utils::make_random_uuid();
+    return {uuid, std::move(gen)};
 }

 db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milliseconds ring_delay) {
@@ -380,7 +342,7 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli

    auto ts = db_clock::now();
    if (add_delay && ring_delay != 0ms) {
-        ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
+        ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
    }
    return ts;
 }
@@ -403,9 +365,7 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
            return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
        }
    };
-
-    auto uuid = utils::make_random_uuid();
-    auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
+    auto [uuid, gen] = make_new_generation_data(bootstrap_tokens, get_sharding_info, tmptr);

    // Our caller should ensure that there are normal tokens in the token ring.
    auto normal_token_owners = tmptr->count_normal_token_owners();
@@ -459,12 +419,8 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
 * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
 * which means it will gossip the generation's timestamp.
 */
-static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::endpoint_state& eps) {
-    const auto* gen_id_ptr = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
-    if (!gen_id_ptr) {
-        return std::nullopt;
-    }
-    auto gen_id_string = gen_id_ptr->value();
+static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto gen_id_string = g.get_application_state_value(endpoint, gms::application_state::CDC_GENERATION_ID);
    cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, gen_id_string);
    return gms::versioned_value::cdc_generation_id_from_string(gen_id_string);
 }
@@ -668,21 +624,21 @@ future<> generation_service::maybe_rewrite_streams_descriptions() {

    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
    std::vector<time_and_ttl> times_and_ttls;
-    _db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
-        auto& s = *t->schema();
+    for (auto& [_, cf] : _db.get_column_families()) {
+        auto& s = *cf->schema();
        auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
        if (!base) {
            // Not a CDC log table.
-            return;
+            continue;
        }
        auto& cdc_opts = base->cdc_options();
        if (!cdc_opts.enabled()) {
            // This table is named like a CDC log table but it's not one.
-            return;
+            continue;
        }

        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
-    });
+    }

    if (times_and_ttls.empty()) {
        // There's no point in rewriting old generations' streams (they don't contain any data).
@@ -770,8 +726,8 @@ future<> generation_service::stop() {
        cdc_log.error("CDC stream rewrite failed: ", std::current_exception());
    }

-    if (_joined && (this_shard_id() == 0)) {
-        co_await leave_ring();
+    if (this_shard_id() == 0) {
+        co_await _gossiper.unregister_(shared_from_this());
    }

    _stopped = true;
@@ -783,6 +739,7 @@ generation_service::~generation_service() {

 future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
    assert_shard_zero(__PRETTY_FUNCTION__);
+    assert(_sys_ks.local().bootstrap_complete());

    _gen_id = std::move(startup_gen_id);
    _gossiper.register_(shared_from_this());
@@ -800,24 +757,18 @@ future<> generation_service::after_join(std::optional<cdc::generation_id>&& star
    _cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
 }

-future<> generation_service::leave_ring() {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-    _joined = false;
-    co_await _gossiper.unregister_(shared_from_this());
-}
-
-future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
+future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state ep_state) {
    assert_shard_zero(__PRETTY_FUNCTION__);

-    auto val = ep_state->get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
+    auto val = ep_state.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
    if (!val) {
        return make_ready_future();
    }

-    return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val, pid);
+    return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val);
 }

-future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v, gms::permit_id) {
+future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v) {
    assert_shard_zero(__PRETTY_FUNCTION__);

    if (app_state != gms::application_state::CDC_GENERATION_ID) {
@@ -837,21 +788,22 @@ future<> generation_service::check_and_repair_cdc_streams() {
    }

    std::optional<cdc::generation_id> latest = _gen_id;
-    _gossiper.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& state) {
+    const auto& endpoint_states = _gossiper.get_endpoint_states();
+    for (const auto& [addr, state] : endpoint_states) {
        if (_gossiper.is_left(addr)) {
            cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
-            return;
+            continue;
        }
        if (!_gossiper.is_normal(addr)) {
            throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
        }

-        const auto gen_id = get_generation_id_for(addr, state);
+        const auto gen_id = get_generation_id_for(addr, _gossiper);
        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
            latest = gen_id;
        }
-    });
+    }

    auto tmptr = _token_metadata.get();
    auto sys_dist_ks = get_sys_dist_ks();
@@ -906,9 +858,24 @@ future<> generation_service::check_and_repair_cdc_streams() {
                " even though some node gossiped about it.",
                latest, db_clock::now());
            should_regenerate = true;
-        } else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
-            should_regenerate = true;
-            cdc_log.info("CDC generation {} needs repair, regenerating", latest);
+        } else {
+            if (tmptr->sorted_tokens().size() != gen->entries().size()) {
+                // We probably have garbage streams from old generations
+                cdc_log.info("Generation size does not match the token ring, regenerating");
+                should_regenerate = true;
+            } else {
+                std::unordered_set<dht::token> gen_ends;
+                for (const auto& entry : gen->entries()) {
+                    gen_ends.insert(entry.token_range_end);
+                }
+                for (const auto& metadata_token : tmptr->sorted_tokens()) {
+                    if (!gen_ends.contains(metadata_token)) {
+                        cdc_log.warn("CDC generation {} missing token {}. Regenerating.", latest, metadata_token);
+                        should_regenerate = true;
+                        break;
+                    }
+                }
+            }
        }
    }

@@ -968,13 +935,17 @@ future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::gen
        co_return;
    }

-    if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
-        on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
+    if (!_sys_ks.local().bootstrap_complete() || !_sys_dist_ks.local_is_initialized()
+            || !_sys_dist_ks.local().started()) {
+        // The service should not be listening for generation changes until after the node
+        // is bootstrapped. Therefore we would previously assume that this condition
+        // can never become true and call on_internal_error here, but it turns out that
+        // it may become true on decommission: the node enters NEEDS_BOOTSTRAP
+        // state before leaving the token ring, so bootstrap_complete() becomes false.
+        // In that case we can simply return.
+        co_return;
    }

-    // The service should not be listening for generation changes until after the node
-    // is bootstrapped and since the node leaves the ring on decommission
-
    if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
        return !svc._cdc_metadata.prepare(ts);
    })) {
@@ -1037,12 +1008,12 @@ future<> generation_service::legacy_scan_cdc_generations() {
    assert_shard_zero(__PRETTY_FUNCTION__);

    std::optional<cdc::generation_id> latest;
-    _gossiper.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
-        auto gen_id = get_generation_id_for(node, eps);
+    for (const auto& ep: _gossiper.get_endpoint_states()) {
+        auto gen_id = get_generation_id_for(ep.first, _gossiper);
        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
            latest = gen_id;
        }
-    });
+    }

    if (latest) {
        cdc_log.info("Latest generation seen during startup: {}", *latest);
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -46,8 +46,6 @@ namespace gms {

 namespace cdc {

-api::timestamp_clock::duration get_generation_leeway();
-
 class stream_id final {
    bytes _value;
 public:
@@ -94,13 +92,13 @@ struct token_range_description {
 * in the `_entries` vector. See the comment above `token_range_description` for explanation.
 */
 class topology_description {
-    utils::chunked_vector<token_range_description> _entries;
+    std::vector<token_range_description> _entries;
 public:
-    topology_description(utils::chunked_vector<token_range_description> entries);
+    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const utils::chunked_vector<token_range_description>& entries() const&;
-    utils::chunked_vector<token_range_description>&& entries() &&;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -135,28 +133,7 @@ public:
 */
 bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);

-/*
- * Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
- * with `token_metadata`.
-*/
-bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm);
-
-/*
- * Generate a set of CDC stream identifiers such that for each shard
- * and vnode pair there exists a stream whose token falls into this vnode
- * and is owned by this shard. It is sometimes not possible to generate
- * a CDC stream identifier for some (vnode, shard) pair because not all
- * shards have to own tokens in a vnode. Small vnode can be totally owned
- * by a single shard. In such case, a stream identifier that maps to
- * end of the vnode is generated.
- *
- * Then build a cdc::topology_description which maps tokens to generated
- * stream identifiers, such that if token T is owned by shard S in vnode V,
- * it gets mapped to the stream identifier generated for (S, V).
- *
- * Run in seastar::async context.
- */
-cdc::topology_description make_new_generation_description(
+std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
    const std::unordered_set<dht::token>& bootstrap_tokens,
    const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info,
    const locator::token_metadata_ptr);
@@ -167,20 +144,9 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
 // using `mutation_size_threshold` to decide on the mutation sizes. The partition key of each mutation
 // is given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
 //
-// Works only for the CDC_GENERATIONS_V2 schema (in system_distributed keyspace).
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
-    schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
-    size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);
-
-// The partition key of all rows in the single-partition CDC_GENERATIONS_V3 schema (in system keyspace).
-static constexpr auto CDC_GENERATIONS_V3_KEY = "cdc_generations";
-
-// Translates the CDC generation data given by a `cdc::topology_description` into a vector of mutations,
-// using `mutation_size_threshold` to decide on the mutation sizes. The first clustering key column is
-// given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
-//
-// Works only for the CDC_GENERATIONS_V3 schema (in system keyspace).
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
+// Works only for specific schemas: CDC_GENERATIONS_V2 (in system_distributed_keyspace)
+// and CDC_GENERATIONS_V3 (in system_keyspace).
+future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
    schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
    size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);

--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -98,20 +98,19 @@ public:
     * Must be called on shard 0 - that's where the generation management happens.
     */
    future<> after_join(std::optional<cdc::generation_id>&& startup_gen_id);
-    future<> leave_ring();

    cdc::metadata& get_cdc_metadata() {
        return _cdc_metadata;
    }

-    virtual future<> before_change(gms::inet_address, gms::endpoint_state_ptr, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
-    virtual future<> on_alive(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_dead(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_remove(gms::inet_address, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_restart(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
+    virtual future<> before_change(gms::inet_address, gms::endpoint_state, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
+    virtual future<> on_alive(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
+    virtual future<> on_dead(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
+    virtual future<> on_remove(gms::inet_address) override { return make_ready_future(); }
+    virtual future<> on_restart(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }

-    virtual future<> on_join(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override;
-    virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&, gms::permit_id) override;
+    virtual future<> on_join(gms::inet_address, gms::endpoint_state) override;
+    virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&) override;

    future<> check_and_repair_cdc_streams();

--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -15,6 +15,10 @@

 extern logging::logger cdc_log;

+namespace cdc {
+    extern const api::timestamp_clock::duration generation_leeway;
+} // namespace cdc
+
 static api::timestamp_type to_ts(db_clock::time_point tp) {
    // This assumes that timestamp_clock and db_clock have the same epochs.
    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -36,7 +40,7 @@ static cdc::stream_id get_stream(

 // non-static for testing
 cdc::stream_id get_stream(
-        const utils::chunked_vector<cdc::token_range_description>& entries,
+        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
        on_internal_error(cdc_log, "get_stream: entries empty");
@@ -69,7 +73,7 @@ bool cdc::metadata::streams_available() const {

 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
-    if (ts > now + get_generation_leeway().count()) {
+    if (ts > now + generation_leeway.count()) {
        throw exceptions::invalid_request_exception(format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -82,43 +86,27 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // Nothing protects us from that until we start using transactions for generation switching.
    }

-    auto it = gen_used_at(now - get_generation_leeway().count());
-
-    if (it != _gens.end()) {
-        // Garbage-collect generations that will no longer be used.
-        it = _gens.erase(_gens.begin(), it);
-    }
-
-    if (ts <= now - get_generation_leeway().count()) {
-        // We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
-        // happens iff one of the following is true:
-        // - the write is to no generation,
-        // - the write is to a generation older than the generation under `it`,
-        // - the write is to the generation under `it` and that generation is not the current generation.
-        // Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
-        // but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
-        // are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
-        // the generation under `it` because that generation was operating at `now - generation_leeway`.
-        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
-        if (it == _gens.end() || ts < it->first || is_previous_gen) {
-            throw exceptions::invalid_request_exception(format(
-                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
-                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                    " consistency properties.\n"
-                    "We *do* allow sending writes into the near past, but our ability to do that is limited."
-                    " Are you using client-side timestamps? Make sure your clocks are well-synchronized"
-                    " with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
-        }
-    }
-
-    it = _gens.begin();
-    if (it == _gens.end() || ts < it->first) {
+    auto it = gen_used_at(now);
+    if (it == _gens.end()) {
        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
+                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
    }

-    // Find the generation operating at `ts`.
+    // Garbage-collect generations that will no longer be used.
+    it = _gens.erase(_gens.begin(), it);
+
+    if (it->first > ts) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream from an earlier generation than the currently used one."
+                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                " consistency properties (write timestamp: {}, current generation started at: {})",
+                format_timestamp(ts), format_timestamp(it->first)));
+    }
+
+    // With `generation_leeway` we allow sending writes to the near future. It might happen
+    // that `ts` doesn't belong to the current generation ("current" according to our clock),
+    // but to the next generation. Adjust for this case:
    {
        auto next_it = std::next(it);
        while (next_it != _gens.end() && next_it->first <= ts) {
@@ -159,8 +147,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
        ++it;
    }

-    // Check if the generation is obsolete.
-    return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
+    // Check if some new generation has already superseded this one.
+    return it != _gens.end() && it->first <= api::new_timestamp();
 }

 bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -169,7 +157,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
    }

    auto now = api::new_timestamp();
-    auto it = gen_used_at(now - get_generation_leeway().count());
+    auto it = gen_used_at(now);

    if (it != _gens.end()) {
        // Garbage-collect generations that will no longer be used.
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -42,9 +42,7 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
 public:
-    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
-     * it is older than the generation operating at `now - get_generation_leeway()`.
-     */
+    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
    bool known_or_obsolete(db_clock::time_point) const;

    /* Are there streams available. I.e. valid for time == now. If this is false, any writes to 
@@ -56,9 +54,8 @@ public:
     *
     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
-     * the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
-     * `get_generation_leeway()`.
+     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
+     * by the `cdc::generation_leeway` constant.
     */
    stream_id get_stream(api::timestamp_type ts, dht::token tok);

--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -21,27 +21,27 @@ public:
            : file_impl(*get_file_impl(f)),  _error_handler(error_handler), _file(f) {
    }

-    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, io_intent* intent) override {
+    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->write_dma(pos, buffer, len, intent);
+            return get_file_impl(_file)->write_dma(pos, buffer, len, pc);
        });
    }

-    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
+    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->write_dma(pos, iov, intent);
+            return get_file_impl(_file)->write_dma(pos, iov, pc);
        });
    }

-    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, io_intent* intent) override {
+    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->read_dma(pos, buffer, len, intent);
+            return get_file_impl(_file)->read_dma(pos, buffer, len, pc);
        });
    }

-    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
+    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->read_dma(pos, iov, intent);
+            return get_file_impl(_file)->read_dma(pos, iov, pc);
        });
    }

@@ -99,9 +99,9 @@ public:
        });
    }

-    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent) override {
+    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->dma_read_bulk(offset, range_size, intent);
+            return get_file_impl(_file)->dma_read_bulk(offset, range_size, pc);
        });
    }
 private:
--- a/cmake/add_version_library.cmake
+++ b/cmake/add_version_library.cmake
@@ -1,31 +1,20 @@
 ###
 ### Generate version file and supply appropriate compile definitions for release.cc
 ###
-function(generate_scylla_version)
+function(add_version_library name source)
  set(version_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-VERSION-FILE)
  set(release_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-RELEASE-FILE)
-  set(product_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-PRODUCT-FILE)
  execute_process(
    COMMAND ${CMAKE_SOURCE_DIR}/SCYLLA-VERSION-GEN --output-dir "${CMAKE_CURRENT_BINARY_DIR}"
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
-
  file(STRINGS ${version_file} scylla_version)
  file(STRINGS ${release_file} scylla_release)
-  file(STRINGS ${product_file} scylla_product)

-  string(REPLACE "-" "~" scylla_version_tilde ${scylla_version})
-
-  set(Scylla_VERSION "${scylla_version_tilde}" CACHE INTERNAL "")
-  set(Scylla_RELEASE "${scylla_release}" CACHE INTERNAL "")
-  set(Scylla_PRODUCT "${scylla_product}" CACHE INTERNAL "")
-endfunction(generate_scylla_version)
-
-function(add_version_library name source)
  add_library(${name} OBJECT ${source})
  target_compile_definitions(${name}
    PRIVATE
-      SCYLLA_VERSION=\"${Scylla_VERSION}\"
-      SCYLLA_RELEASE=\"${Scylla_RELEASE}\")
+      SCYLLA_VERSION=\"${scylla_version}\"
+      SCYLLA_RELEASE=\"${scylla_release}\")
  target_link_libraries(${name}
    PRIVATE
      Seastar::seastar)
--- a/cmake/add_whole_archive.cmake
+++ b/cmake/add_whole_archive.cmake
@@ -5,6 +5,15 @@
 # actually compiling a sample program.
 function(add_whole_archive name library)
  add_library(${name} INTERFACE)
-  target_link_libraries(${name} INTERFACE
-    "$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+    target_link_libraries(${name} INTERFACE
+      "$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
+  else()
+    add_dependencies(${name} ${library})
+    target_include_directories(${name} INTERFACE
+      ${CMAKE_SOURCE_DIR})
+    target_link_options(${name} INTERFACE
+      "$<$<CXX_COMPILER_ID:Clang>:SHELL:LINKER:-force_load $<TARGET_LINKER_FILE:${library}>>"
+      "$<$<CXX_COMPILER_ID:GNU>:SHELL:LINKER:--whole-archive $<TARGET_LINKER_FILE:${library}> LINKER:--no-whole-archive>")
+  endif()
 endfunction()
--- a/cmake/build_submodule.cmake
+++ b/cmake/build_submodule.cmake
@@ -1,50 +0,0 @@
-function(build_submodule name dir)
-  cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
-  set(version_release "${Scylla_VERSION}-${Scylla_RELEASE}")
-  set(product_version_release
-    "${Scylla_PRODUCT}-${Scylla_VERSION}-${Scylla_RELEASE}")
-  set(working_dir ${CMAKE_CURRENT_SOURCE_DIR}/${dir})
-  if(parsed_args_NOARCH)
-    set(arch "noarch")
-  else()
-    set(arch "${CMAKE_SYSTEM_PROCESSOR}")
-  endif()
-  set(reloc_args ${parsed_args_UNPARSED_ARGUMENTS})
-  set(reloc_pkg "${working_dir}/build/${Scylla_PRODUCT}-${name}-${version_release}.${arch}.tar.gz")
-  add_custom_command(
-    OUTPUT ${reloc_pkg}
-    COMMAND reloc/build_reloc.sh --version ${product_version_release} --nodeps ${reloc_args}
-    WORKING_DIRECTORY "${working_dir}"
-    JOB_POOL submodule_pool)
-  add_custom_target(dist-${name}-tar
-    DEPENDS ${reloc_pkg})
-  add_custom_target(dist-${name}-rpm
-    COMMAND reloc/build_rpm.sh --reloc-pkg ${reloc_pkg}
-    DEPENDS ${reloc_pkg}
-    WORKING_DIRECTORY "${working_dir}")
-  add_custom_target(dist-${name}-deb
-    COMMAND reloc/build_deb.sh --reloc-pkg ${reloc_pkg}
-    DEPENDS ${reloc_pkg}
-    WORKING_DIRECTORY "${working_dir}")
-  add_custom_target(dist-${name}
-    DEPENDS dist-${name}-tar dist-${name}-rpm dist-${name}-deb)
-endfunction()
-
-macro(dist_submodule name dir pkgs)
-  # defined as a macro, so that we can append the path to the dist tarball to
-  # specfied "pkgs"
-  cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
-  if(parsed_args_NOARCH)
-    set(arch "noarch")
-  else()
-    set(arch "${CMAKE_SYSTEM_PROCESSOR}")
-  endif()
-  set(pkg_name "${Scylla_PRODUCT}-${name}-${Scylla_VERSION}-${Scylla_RELEASE}.${arch}.tar.gz")
-  set(reloc_pkg "${CMAKE_SOURCE_DIR}/tools/${dir}/build/${pkg_name}")
-  set(dist_pkg "${CMAKE_CURRENT_BINARY_DIR}/${pkg_name}")
-  add_custom_command(
-    OUTPUT ${dist_pkg}
-    COMMAND ${CMAKE_COMMAND} -E copy ${reloc_pkg} ${dist_pkg}
-    DEPENDS dist-${name}-tar)
-  list(APPEND ${pkgs} "${dist_pkg}")
-endmacro()
--- a/cmake/generate_cql_grammar.cmake
+++ b/cmake/generate_cql_grammar.cmake
@@ -1,5 +1,7 @@
-find_program (ANTLR3 antlr3
-  REQUIRED)
+find_program (ANTLR3 antlr3)
+if(NOT ANTLR3)
+  message(FATAL_ERROR "antlr3 is required")
+endif()

 # Parse antlr3 grammar files and generate C++ sources
 function(generate_cql_grammar)
--- a/cmake/mode.COVERAGE.cmake
+++ b/cmake/mode.COVERAGE.cmake
@@ -1,23 +0,0 @@
-set(Seastar_OptimizationLevel_COVERAGE "g")
-set(CMAKE_CXX_FLAGS_COVERAGE
-  ""
-  CACHE
-  INTERNAL
-  "")
-string(APPEND CMAKE_CXX_FLAGS_COVERAGE
-  " -O${Seastar_OptimizationLevel_SANITIZE}")
-
-set(Seastar_DEFINITIONS_COVERAGE
-  SCYLLA_BUILD_MODE=debug
-  DEBUG
-  SANITIZE
-  DEBUG_LSA_SANITIZER
-  SCYLLA_ENABLE_ERROR_INJECTION)
-
-set(CMAKE_CXX_FLAGS_COVERAGE
-  " -O${Seastar_OptimizationLevel_COVERAGE} -fprofile-instr-generate -fcoverage-mapping -g -gz")
-
-set(CMAKE_STATIC_LINKER_FLAGS_COVERAGE
-  "-fprofile-instr-generate -fcoverage-mapping")
-
-set(stack_usage_threshold_in_KB 40)
--- a/cmake/mode.RELEASE.cmake
+++ b/cmake/mode.RELEASE.cmake
@@ -12,16 +12,16 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
 else()
  set(clang_inline_threshold 2500)
 endif()
-add_compile_options(
-  "$<$<CXX_COMPILER_ID:GNU>:--param;inline-unit-growth=300>"
-  "$<$<CXX_COMPILER_ID:Clang>:-mllvm;-inline-threshold=${clang_inline_threshold}>"
+string(APPEND CMAKE_CXX_FLAGS_RELEASE
+  " $<$<CXX_COMPILER_ID:GNU>:--param inline-unit-growth=300>"
+  " $<$<CXX_COMPILER_ID:Clang>:-mllvm -inline-threshold=${clang_inline_threshold}>"
  # clang generates 16-byte loads that break store-to-load forwarding
  # gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
-  "-fno-slp-vectorize")
-set(Seastar_DEFINITIONS_RELEASE
+  " -fno-slp-vectorize")
+set(Seastar_DEFINITIONS_RELEASE
  SCYLLA_BUILD_MODE=release)

-set(CMAKE_EXE_LINKER_FLAGS_RELEASE
+set(CMAKE_EXE_LINKER_FLAGS_RELEASE
  "-Wl,--gc-sections")

 set(stack_usage_threshold_in_KB 13)
--- a/cmake/mode.SANITIZE.cmake
+++ b/cmake/mode.SANITIZE.cmake
@@ -1,17 +0,0 @@
-set(Seastar_OptimizationLevel_SANITIZE "s")
-set(CMAKE_CXX_FLAGS_SANITIZE
-  ""
-  CACHE
-  INTERNAL
-  "")
-string(APPEND CMAKE_CXX_FLAGS_SANITIZE
-  " -O${Seastar_OptimizationLevel_SANITIZE}")
-
-set(Seastar_DEFINITIONS_SANITIZE
-  SCYLLA_BUILD_MODE=sanitize
-  DEBUG
-  SANITIZE
-  DEBUG_LSA_SANITIZER
-  SCYLLA_ENABLE_ERROR_INJECTION)
-
-set(stack_usage_threshold_in_KB 50)
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -15,19 +15,13 @@ string(JOIN " " CMAKE_CXX_FLAGS
  "-Wall"
  "-Werror"
  "-Wno-error=deprecated-declarations"
-  "-Wimplicit-fallthrough"
  ${_supported_warnings})

 function(default_target_arch arch)
  set(x86_instruction_sets i386 i686 x86_64)
  if(CMAKE_SYSTEM_PROCESSOR IN_LIST x86_instruction_sets)
    set(${arch} "westmere" PARENT_SCOPE)
-  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-    # we always use intrinsics like vmull.p64 for speeding up crc32 calculations
-    # on the aarch64 architectures, and they require the crypto extension, so
-    # we have to add "+crypto" in the architecture flags passed to -march. the
-    # same applies to crc32 instructions, which need the ARMv8-A CRC32 extension
-    # please note, Seastar also sets -march when compiled with DPDK enabled.
+  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")  # STREQUAL compares strings; EQUAL is numeric-only
    set(${arch} "armv8-a+crc+crypto" PARENT_SCOPE)
  else()
    set(${arch} "" PARENT_SCOPE)
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -29,27 +29,32 @@
 #include <seastar/core/shared_ptr.hh>

 #include "dht/i_partitioner.hh"
-#include "sstables/exceptions.hh"
 #include "sstables/sstables.hh"
 #include "sstables/sstable_writer.hh"
 #include "sstables/progress_monitor.hh"
 #include "sstables/sstables_manager.hh"
 #include "compaction.hh"
+#include "compaction_manager.hh"
 #include "schema/schema.hh"
 #include "db/system_keyspace.hh"
+#include "service/priority_manager.hh"
 #include "db_clock.hh"
 #include "mutation/mutation_compactor.hh"
 #include "leveled_manifest.hh"
+#include "dht/token.hh"
 #include "dht/partition_filter.hh"
 #include "mutation_writer/shard_based_splitting_writer.hh"
 #include "mutation_writer/partition_based_splitting_writer.hh"
 #include "mutation/mutation_source_metadata.hh"
 #include "mutation/mutation_fragment_stream_validator.hh"
+#include "utils/UUID_gen.hh"
+#include "utils/utf8.hh"
+#include "utils/fmt-compat.hh"
 #include "utils/error_injection.hh"
-#include "readers/multi_range.hh"
+#include "readers/filtering.hh"
 #include "readers/compacting.hh"
 #include "tombstone_gc.hh"
-#include "replica/database.hh"
+#include "keys.hh"

 namespace sstables {

@@ -143,22 +148,32 @@ std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::quara
    return os << to_string(quarantine_mode);
 }

+std::ostream& operator<<(std::ostream& os, pretty_printed_data_size data) {
+    static constexpr const char* suffixes[] = { " bytes", "kB", "MB", "GB", "TB", "PB" };
+    // Index of the largest unit; sizeof(suffixes) alone is a byte count, not an element count.
+    static constexpr unsigned max_exp = sizeof(suffixes) / sizeof(suffixes[0]) - 1;
+    unsigned exp = 0;
+    while ((data._size >= 1000) && (exp < max_exp)) {
+        exp++;
+        data._size /= 1000;
+    }
+    // Sizes of 1000 PB and above stay expressed in PB, so suffixes[exp] is always in bounds.
+    return os << data._size << suffixes[exp];
+}
+
+std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
+    const auto elapsed = tp._duration.count(); // a non-positive duration yields a rate of 0
+    const uint64_t rate = elapsed > 0 ? tp._size / elapsed : 0;
+    return os << pretty_printed_data_size(rate) << "/s";
+}
+
 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
-        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
-        const api::timestamp_type compacting_max_timestamp) {
+        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
    if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
        return api::min_timestamp;
    }

-    auto timestamp = api::max_timestamp;
-    auto memtable_min_timestamp = table_s.min_memtable_timestamp();
-    // Use memtable timestamp if it contains data older than the sstables being compacted,
-    // and if the memtable also contains the key we're calculating max purgeable timestamp for.
-    // First condition helps to not penalize the common scenario where memtable only contains
-    // newer data.
-    if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
-        timestamp = memtable_min_timestamp;
-    }
+    auto timestamp = table_s.min_memtable_timestamp();
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
        if (compacting_set.contains(sst)) {
@@ -432,9 +447,9 @@ class compaction {
 protected:
    compaction_data& _cdata;
    table_state& _table_s;
-    const compaction_sstable_creator_fn _sstable_creator;
-    const schema_ptr _schema;
-    const reader_permit _permit;
+    compaction_sstable_creator_fn _sstable_creator;
+    schema_ptr _schema;
+    reader_permit _permit;
    std::vector<shared_sstable> _sstables;
    std::vector<generation_type> _input_sstable_generations;
    // Unused sstables are tracked because if compaction is interrupted we can only delete them.
@@ -443,34 +458,29 @@ protected:
    std::vector<shared_sstable> _new_unused_sstables;
    std::vector<shared_sstable> _all_new_sstables;
    lw_shared_ptr<sstable_set> _compacting;
-    const sstables::compaction_type _type;
-    const uint64_t _max_sstable_size;
-    const uint32_t _sstable_level;
+    sstables::compaction_type _type;
+    uint64_t _max_sstable_size;
+    uint32_t _sstable_level;
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
-    // fully expired files, which are skipped, aren't taken into account.
-    uint64_t _compacting_data_file_size = 0;
-    api::timestamp_type _compacting_max_timestamp = api::min_timestamp;
    uint64_t _estimated_partitions = 0;
-    double _estimated_droppable_tombstone_ratio = 0;
    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
-    const bool _can_split_large_partition = false;
+    bool _can_split_large_partition = false;
    bool _contains_multi_fragment_runs = false;
    mutation_source_metadata _ms_metadata = {};
-    const compaction_sstable_replacer_fn _replacer;
-    const run_id _run_identifier;
+    compaction_sstable_replacer_fn _replacer;
+    run_id _run_identifier;
+    ::io_priority_class _io_priority;
    // optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
    std::optional<sstable_set> _sstable_set;
    // used to incrementally calculate max purgeable timestamp, as we iterate through decorated keys.
    std::optional<sstable_set::incremental_selector> _selector;
    std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
    // optional owned_ranges vector for cleanup;
-    const owned_ranges_ptr _owned_ranges = {};
-    // required for reshard compaction.
-    const dht::sharder* _sharder = nullptr;
-    const std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
+    owned_ranges_ptr _owned_ranges = {};
+    std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
    // Garbage collected sstables that are sealed but were not added to SSTable set yet.
    std::vector<shared_sstable> _unused_garbage_collected_sstables;
    // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
@@ -481,26 +491,6 @@ private:
        cdata.compaction_fan_in = descriptor.fan_in();
        return cdata;
    }
-
-    // Called in a seastar thread
-    dht::partition_range_vector
-    get_ranges_for_invalidation(const std::vector<shared_sstable>& sstables) {
-        // If owned ranges is disengaged, it means no cleanup work was done and
-        // so nothing needs to be invalidated.
-        if (!_owned_ranges) {
-            return dht::partition_range_vector{};
-        }
-        auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
-
-        auto non_owned_ranges = boost::copy_range<dht::partition_range_vector>(sstables
-                | boost::adaptors::transformed([] (const shared_sstable& sst) {
-            seastar::thread::maybe_yield();
-            return dht::partition_range::make({sst->get_first_decorated_key(), true},
-                                              {sst->get_last_decorated_key(), true});
-        }));
-
-        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
-    }
 protected:
    compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
        : _cdata(init_compaction_data(cdata, descriptor))
@@ -515,11 +505,11 @@ protected:
        , _can_split_large_partition(descriptor.can_split_large_partition)
        , _replacer(std::move(descriptor.replacer))
        , _run_identifier(descriptor.run_identifier)
+        , _io_priority(descriptor.io_priority)
        , _sstable_set(std::move(descriptor.all_sstables_snapshot))
        , _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstable_set::incremental_selector>{})
        , _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
        , _owned_ranges(std::move(descriptor.owned_ranges))
-        , _sharder(descriptor.sharder)
        , _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
    {
        for (auto& sst : _sstables) {
@@ -534,9 +524,9 @@ protected:
    virtual uint64_t partitions_per_sstable() const {
        // some tests use _max_sstable_size == 0 for force many one partition per sstable
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
-        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_compacting_data_file_size) / max_sstable_size)));
+        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
-                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
+                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
    }

    void setup_new_sstable(shared_sstable& sst) {
@@ -580,10 +570,9 @@ protected:
        return _stats_collector.get();
    }

-    compaction_completion_desc
+    virtual compaction_completion_desc
    get_compaction_completion_desc(std::vector<shared_sstable> input_sstables, std::vector<shared_sstable> output_sstables) {
-        auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
-        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables)};
    }

    // Tombstone expiration is enabled based on the presence of sstable set.
@@ -592,15 +581,15 @@ protected:
        return bool(_sstable_set) && _table_s.tombstone_gc_enabled();
    }

-    compaction_writer create_gc_compaction_writer(run_id gc_run) const {
+    compaction_writer create_gc_compaction_writer() const {
        auto sst = _sstable_creator(this_shard_id());

+        auto&& priority = _io_priority;
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
-        cfg.run_identifier = gc_run;
+        cfg.run_identifier = _run_identifier;
        cfg.monitor = monitor.get();
-        uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
-        auto writer = sst->get_writer(*schema(), estimated_partitions, cfg, get_encoding_stats());
+        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }

@@ -619,14 +608,8 @@ protected:
    // When compaction finishes, all the temporary sstables generated here will be deleted and removed
    // from table's sstable set.
    compacted_fragments_writer get_gc_compacted_fragments_writer() {
-        // because the temporary sstable run can overlap with the non-gc sstables run created by
-        // get_compacted_fragments_writer(), we have to use a different run_id. the gc_run_id is
-        // created here as:
-        // 1. it can be shared across all sstables created by this writer
-        // 2. it is optional, as gc writer is not always used
-        auto gc_run = run_id::create_random_id();
        return compacted_fragments_writer(*this,
-             [this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
+             [this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
             [this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
             _stop_request_observable);
    }
@@ -643,8 +626,18 @@ protected:
        return _used_garbage_collected_sstables;
    }

-    virtual bool enable_garbage_collected_sstable_writer() const noexcept {
-        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
+    bool enable_garbage_collected_sstable_writer() const noexcept { // on only for multi-fragment input runs with a bounded max sstable size
+        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
+    }
+
+    flat_mutation_reader_v2::filter make_partition_filter() const {
+        // _owned_ranges_checker is dereferenced unconditionally: callers must check it is engaged.
+        return [this] (const dht::decorated_key& dk) {
+            // Keep the partition only if its token belongs to this node; trace the ones dropped.
+            const bool owned = _owned_ranges_checker->belongs_to_current_node(dk.token());
+            if (!owned) { log_trace("Token {} does not belong to this node, skipping", dk.token()); }
+            return owned;
+        };
    }
 public:
    compaction& operator=(const compaction&) = delete;
@@ -657,55 +650,17 @@ public:
    }
 private:
    // Default range sstable reader that will only return mutation that belongs to current shard.
-    virtual flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                        reader_permit permit,
-                                                        const dht::partition_range& range,
-                                                        const query::partition_slice& slice,
-                                                        tracing::trace_state_ptr,
-                                                        streamed_mutation::forwarding fwd,
-                                                        mutation_reader::forwarding) const = 0;
+    virtual flat_mutation_reader_v2 make_sstable_reader() const = 0;

+    // Make a filtering reader if needed
+    // FIXME: the sstable reader itself should be passed the owned ranges
+    // so it can skip over the disowned ranges efficiently using the index.
+    // Ref https://github.com/scylladb/scylladb/issues/12998
    flat_mutation_reader_v2 setup_sstable_reader() const {
        if (!_owned_ranges_checker) {
-            return make_sstable_reader(_schema,
-                                       _permit,
-                                       query::full_partition_range,
-                                       _schema->full_slice(),
-                                       tracing::trace_state_ptr(),
-                                       ::streamed_mutation::forwarding::no,
-                                       ::mutation_reader::forwarding::no);
+            return make_sstable_reader();
        }
-
-        auto source = mutation_source([this] (schema_ptr s,
-                reader_permit permit,
-                const dht::partition_range& range,
-                const query::partition_slice& slice,
-                tracing::trace_state_ptr trace_state,
-                streamed_mutation::forwarding fwd,
-                mutation_reader::forwarding fwd_mr) {
-            log_trace("Creating sstable set reader with range {}", range);
-            return make_sstable_reader(std::move(s),
-                                       std::move(permit),
-                                       range,
-                                       slice,
-                                       std::move(trace_state),
-                                       fwd,
-                                       fwd_mr);
-        });
-
-        auto owned_range_generator = [this] () -> std::optional<dht::partition_range> {
-            auto r = _owned_ranges_checker->next_owned_range();
-            if (r == nullptr) {
-                return std::nullopt;
-            }
-            log_trace("Skipping to the next owned range {}", *r);
-            return dht::to_partition_range(*r);
-        };
-
-        return make_flat_multi_range_reader(_schema, _permit, std::move(source),
-                                            std::move(owned_range_generator),
-                                            _schema->full_slice(),
-                                            tracing::trace_state_ptr());
+        return make_filtering_reader(make_sstable_reader(), make_partition_filter());
    }

    virtual sstables::sstable_set make_sstable_set_for_input() const {
@@ -719,7 +674,6 @@ private:
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

-        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
@@ -746,10 +700,6 @@ private:
            // for a better estimate for the number of partitions in the merged
            // sstable than just adding up the lengths of individual sstables.
            _estimated_partitions += sst->get_estimated_key_count();
-            auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state(), _schema);
-            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
-            _compacting_data_file_size += sst->ondisk_data_size();
-
            // TODO:
            // Note that this is not fully correct. Since we might be merging sstables that originated on
            // another shard (#cpu changed), we might be comparing RP:s with differing shard ids,
@@ -758,16 +708,12 @@ private:
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            _rp = std::max(_rp, sst_stats.position);
-
-            _compacting_max_timestamp = std::max(_compacting_max_timestamp, sst->get_stats_metadata().max_timestamp);
        }
        log_info("{} {}", report_start_desc(), formatted_msg);
        if (ssts->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->size(), _sstables.size());
        }
-        // _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
-        _estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->size());

        _compacting = std::move(ssts);

@@ -782,7 +728,7 @@ private:
        auto consumer = make_interposer_consumer([this] (flat_mutation_reader_v2 reader) mutable {
            return seastar::async([this, reader = std::move(reader)] () mutable {
                auto close_reader = deferred_close(reader);
-                auto cfc = get_compacted_fragments_writer();
+                auto cfc = compacted_fragments_writer(get_compacted_fragments_writer());
                reader.consume_in_thread(std::move(cfc));
            });
        });
@@ -860,8 +806,8 @@ protected:
        // By the time being, using estimated key count.
        log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(),
-                _input_sstable_generations.size(), new_sstables_msg, utils::pretty_printed_data_size(_start_size), utils::pretty_printed_data_size(_end_size), int(ratio * 100),
-                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), utils::pretty_printed_throughput(_start_size, duration),
+                _input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
+                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
                _cdata.total_partitions, _cdata.total_keys_written);

        return ret;
@@ -882,7 +828,7 @@ private:
            };
        }
        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
        };
    }

@@ -998,7 +944,7 @@ void compacted_fragments_writer::split_large_partition() {
        _c.log_debug("Closing active tombstone {} with {} for partition {}", _current_partition.current_emitted_tombstone, rtc, *_current_partition.dk);
        _compaction_writer->writer.consume(std::move(rtc));
    }
-    _c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, utils::pretty_printed_data_size(_c._max_sstable_size));
+    _c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, pretty_printed_data_size(_c._max_sstable_size));
    // Close partition in current writer, and open it again in a new writer.
    do_consume_end_of_partition();
    stop_current_writer();
@@ -1082,6 +1028,51 @@ void compacted_fragments_writer::consume_end_of_stream() {
    }
 }

+// Compaction used for reshaping. Input is read via make_local_shard_sstable_reader() with the
+// default read monitor generator; non-null writers go to finish_new_sstable() when stopped.
+class reshape_compaction : public compaction {
+public:
+    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+        : compaction(table_s, std::move(descriptor), cdata)
+    {}
+
+    // The input sstables are held in a set made by make_partitioned_sstable_set().
+    virtual sstables::sstable_set make_sstable_set_for_input() const override {
+        return sstables::make_partitioned_sstable_set(_schema, false);
+    }
+
+    // Full partition range, full slice, no forwarding of either kind.
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(
+                _schema, _permit, query::full_partition_range, _schema->full_slice(), _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no, ::mutation_reader::forwarding::no,
+                default_read_monitor_generator());
+    }
+
+    std::string_view report_start_desc() const override { return "Reshaping"; }
+
+    std::string_view report_finish_desc() const override { return "Reshaped"; }
+
+    // Creates a new sstable for this shard and wraps its writer; dk is not used here.
+    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
+        auto new_sst = _sstable_creator(this_shard_id());
+        setup_new_sstable(new_sst);
+        sstable_writer_config writer_cfg = make_sstable_writer_config(compaction_type::Reshape);
+        return compaction_writer{
+            new_sst->get_writer(*_schema, partitions_per_sstable(), writer_cfg, get_encoding_stats(), _io_priority),
+            new_sst};
+    }
+
+    // A null writer means there is nothing to finish.
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (!writer) {
+            return;
+        }
+        finish_new_sstable(writer);
+    }
+};
+
 class regular_compaction : public compaction {
    // keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
    mutable compaction_read_monitor_generator _monitor_generator;
@@ -1093,20 +1084,15 @@ public:
    {
    }

-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_local_shard_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
-                std::move(trace),
-                sm_fwd,
-                mr_fwd,
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
                _monitor_generator);
    }

@@ -1125,7 +1111,7 @@ public:
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = make_sstable_writer_config(_type);
        cfg.monitor = monitor.get();
-        return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
+        return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
    }

    virtual void stop_sstable_writer(compaction_writer* writer) override {
@@ -1196,13 +1182,12 @@ private:
    }

    void update_pending_ranges() {
-        auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
-        if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
+        if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
            return;
        }
        // Releases reference to sstables compacted by this compaction or another, both of which belongs
        // to the same column family
-        for (auto& pending_replacement : pending_replacements) {
+        for (auto& pending_replacement : _cdata.pending_replacements) {
            for (auto& sst : pending_replacement.removed) {
                // Set may not contain sstable to be removed because this compaction may have started
                // before the creation of that sstable.
@@ -1216,79 +1201,33 @@ private:
            }
        }
        _selector.emplace(_sstable_set->make_incremental_selector());
-    }
-};
-
-class reshape_compaction : public regular_compaction {
-private:
-    bool has_sstable_replacer() const noexcept {
-        return bool(_replacer);
-    }
-public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
-            : regular_compaction(table_s, std::move(descriptor), cdata) {
-    }
-
-    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
-    }
-
-    // Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
-    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
-        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
-    }
-
-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_local_shard_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
-                std::move(trace),
-                sm_fwd,
-                mr_fwd,
-                default_read_monitor_generator());
-    }
-
-    std::string_view report_start_desc() const override {
-        return "Reshaping";
-    }
-
-    std::string_view report_finish_desc() const override {
-        return "Reshaped";
-    }
-
-    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto sst = _sstable_creator(this_shard_id());
-        setup_new_sstable(sst);
-
-        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
-    }
-
-    virtual void stop_sstable_writer(compaction_writer* writer) override {
-        if (writer) {
-            if (has_sstable_replacer()) {
-                regular_compaction::stop_sstable_writer(writer);
-            } else {
-                finish_new_sstable(writer);
-            }
-        }
-    }
-
-    virtual void on_end_of_compaction() override {
-        if (has_sstable_replacer()) {
-            regular_compaction::on_end_of_compaction();
-        }
+        _cdata.pending_replacements.clear();
    }
 };

 class cleanup_compaction final : public regular_compaction {
+private:
+    // Called in a seastar thread: it may yield per sstable and calls .get() on subtract_ranges().
+    dht::partition_range_vector
+    get_ranges_for_invalidation(const std::vector<shared_sstable>& sstables) {
+        auto owned = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
+        // One inclusive [first key, last key] range per sstable.
+        dht::partition_range_vector sstable_ranges;
+        sstable_ranges.reserve(sstables.size());
+        for (const shared_sstable& sst : sstables) {
+            seastar::thread::maybe_yield();
+            sstable_ranges.push_back(dht::partition_range::make({sst->get_first_decorated_key(), true},
+                                                                {sst->get_last_decorated_key(), true}));
+        }
+        return dht::subtract_ranges(*_schema, sstable_ranges, std::move(owned)).get();
+    }
+protected:
+    virtual compaction_completion_desc
+    get_compaction_completion_desc(std::vector<shared_sstable> input_sstables, std::vector<shared_sstable> output_sstables) override {
+        dht::partition_range_vector to_invalidate = get_ranges_for_invalidation(input_sstables); // computed before the inputs are moved from
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(to_invalidate)};
+    }
+
 public:
    cleanup_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
        : regular_compaction(table_s, std::move(descriptor), cdata)
@@ -1538,17 +1477,8 @@ public:
        return _scrub_finish_description;
    }

-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        if (!range.is_full()) {
-            on_internal_error(clogger, fmt::format("Scrub compaction in mode {} expected full partition range, but got {} instead", _options.operation_mode, range));
-        }
-        auto crawling_reader = _compacting->make_crawling_reader(std::move(s), std::move(permit), nullptr);
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        auto crawling_reader = _compacting->make_crawling_reader(_schema, _permit, _io_priority, nullptr);
        return make_flat_mutation_reader_v2<reader>(std::move(crawling_reader), _options.operation_mode, _validation_errors);
    }

@@ -1567,7 +1497,7 @@ public:
            return end_consumer;
        }
        return [this, end_consumer = std::move(end_consumer)] (flat_mutation_reader_v2 reader) mutable -> future<> {
-            auto cfg = mutation_writer::segregate_config{memory::stats().total_memory() / 10};
+            auto cfg = mutation_writer::segregate_config{_io_priority, memory::stats().total_memory() / 10};
            return mutation_writer::segregate_by_partition(std::move(reader), cfg,
                    [consumer = std::move(end_consumer), this] (flat_mutation_reader_v2 rd) {
                ++_bucket_count;
@@ -1614,7 +1544,7 @@ private:
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
+                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
    }
 public:
    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
@@ -1639,20 +1569,15 @@ public:
    ~resharding_compaction() { }

    // Use reader that makes sure no non-local mutation will not be filtered out.
-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_range_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_range_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
                nullptr,
-                sm_fwd,
-                mr_fwd);
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no);

    }

@@ -1675,14 +1600,14 @@ public:
    }

    compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto shard = _sharder->shard_of(dk.token());
+        auto shard = dht::shard_of(*_schema, dk.token());
        auto sst = _sstable_creator(shard);
        setup_new_sstable(sst);

        auto cfg = make_sstable_writer_config(compaction_type::Reshard);
        // sstables generated for a given shard will share the same run identifier.
        cfg.run_identifier = _run_identifiers.at(shard);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), shard), sst};
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), _io_priority, shard), sst};
    }

    void stop_sstable_writer(compaction_writer* writer) override {
@@ -1762,7 +1687,7 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    for (const auto& sst : descriptor.sstables) {
        clogger.info("Scrubbing in validate mode {}", sst->get_filename());

-        validation_errors += co_await sst->validate(permit, cdata.abort, [&schema] (sstring what) {
+        validation_errors += co_await sst->validate(permit, descriptor.io_priority, cdata.abort, [&schema] (sstring what) {
            scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
        });
        // Did validation actually finish because aborted?
@@ -1774,10 +1699,9 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
        clogger.info("Finished scrubbing in validate mode {} - sstable is {}", sst->get_filename(), validation_errors == 0 ? "valid" : "invalid");
    }

-    using scrub = sstables::compaction_type_options::scrub;
-    if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
+    if (validation_errors != 0) {
        for (auto& sst : descriptor.sstables) {
-            co_await sst->change_state(sstables::sstable_state::quarantine);
+            co_await sst->change_state(sstables::quarantine_dir);
        }
    }

@@ -1819,7 +1743,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
    int64_t min_timestamp = std::numeric_limits<int64_t>::max();

    for (auto& sstable : overlapping) {
-        auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+        auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
        if (sstable->get_max_local_deletion_time() >= gc_before) {
            min_timestamp = std::min(min_timestamp, sstable->get_stats_metadata().min_timestamp);
        }
@@ -1838,7 +1762,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable

    // SStables that do not contain live data is added to list of possibly expired sstables.
    for (auto& candidate : compacting) {
-        auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+        auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
        clogger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
                    candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
        // A fully expired sstable which has an ancestor undeleted shouldn't be compacted because
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -13,8 +13,8 @@
 #include "compaction/compaction_descriptor.hh"
 #include "gc_clock.hh"
 #include "compaction_weight_registration.hh"
+#include "service/priority_manager.hh"
 #include "utils/UUID.hh"
-#include "utils/pretty_printers.hh"
 #include "table_state.hh"
 #include <seastar/core/thread.hh>
 #include <seastar/core/abort_source.hh>
@@ -25,6 +25,21 @@ namespace sstables {

 bool is_eligible_for_compaction(const sstables::shared_sstable& sst) noexcept;

+class pretty_printed_data_size {
+    uint64_t _size;
+public:
+    pretty_printed_data_size(uint64_t size) : _size(size) {}
+    friend std::ostream& operator<<(std::ostream&, pretty_printed_data_size);
+};
+
+class pretty_printed_throughput {
+    uint64_t _size;
+    std::chrono::duration<float> _duration;
+public:
+    pretty_printed_throughput(uint64_t size, std::chrono::duration<float> dur) : _size(size), _duration(std::move(dur)) {}
+    friend std::ostream& operator<<(std::ostream&, pretty_printed_throughput);
+};
+
 // Return the name of the compaction type
 // as used over the REST api, e.g. "COMPACTION" or "CLEANUP".
 sstring compaction_name(compaction_type type);
--- a/compaction/compaction_backlog_manager.hh
+++ b/compaction/compaction_backlog_manager.hh
@@ -12,6 +12,7 @@
 #include <memory>
 #include <seastar/core/shared_ptr.hh>
 #include "sstables/shared_sstable.hh"
+#include "sstables/progress_monitor.hh"
 #include "timestamp.hh"

 class compaction_backlog_manager;
@@ -59,20 +60,18 @@ public:
    using ongoing_compactions = std::unordered_map<sstables::shared_sstable, backlog_read_progress_manager*>;

    struct impl {
-        // FIXME: Should provide strong exception safety guarantees
-        virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) = 0;
+        virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) = 0;
        virtual double backlog(const ongoing_writes& ow, const ongoing_compactions& oc) const = 0;
        virtual ~impl() { }
    };

    compaction_backlog_tracker(std::unique_ptr<impl> impl) : _impl(std::move(impl)) {}
    compaction_backlog_tracker(compaction_backlog_tracker&&);
-    compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) = delete;
+    compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) noexcept;
    compaction_backlog_tracker(const compaction_backlog_tracker&) = delete;
    ~compaction_backlog_tracker();

    double backlog() const;
-    // FIXME: Should provide strong exception safety guarantees
    void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts);
    void register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp);
    void register_compacting_sstable(sstables::shared_sstable sst, backlog_read_progress_manager& rp);
--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -18,6 +18,7 @@
 #include "sstables/sstable_set.hh"
 #include "utils/UUID.hh"
 #include "dht/i_partitioner.hh"
+#include "compaction_weight_registration.hh"
 #include "compaction_fwd.hh"

 namespace sstables {
@@ -72,12 +73,6 @@ public:
            only, // scrub only quarantined sstables
        };
        quarantine_mode quarantine_operation_mode = quarantine_mode::include;
-
-        using quarantine_invalid_sstables = bool_class<class quarantine_invalid_sstables_tag>;
-
-        // Should invalid sstables be moved into quarantine.
-        // Only applies to validate-mode.
-        quarantine_invalid_sstables quarantine_sstables = quarantine_invalid_sstables::yes;
    };
    struct reshard {
    };
@@ -114,8 +109,8 @@ public:
        return compaction_type_options(upgrade{});
    }

-    static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes) {
-        return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables});
+    static compaction_type_options make_scrub(scrub::mode mode) {
+        return compaction_type_options(scrub{mode});
    }

    template <typename... Visitor>
@@ -123,11 +118,6 @@ public:
        return std::visit(std::forward<Visitor>(visitor)..., _options);
    }

-    template <typename OptionType>
-    const auto& as() const {
-        return std::get<OptionType>(_options);
-    }
-
    const options_variant& options() const { return _options; }

    compaction_type type() const;
@@ -161,12 +151,12 @@ struct compaction_descriptor {
    compaction_type_options options = compaction_type_options::make_regular();
    // If engaged, compaction will cleanup the input sstables by skipping non-owned ranges.
    compaction::owned_ranges_ptr owned_ranges;
-    // Required for reshard compaction.
-    const dht::sharder* sharder;

    compaction_sstable_creator_fn creator;
    compaction_sstable_replacer_fn replacer;

+    ::io_priority_class io_priority = default_priority_class();
+
    // Denotes if this compaction task is comprised solely of completely expired SSTables
    sstables::has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;

@@ -176,6 +166,7 @@ struct compaction_descriptor {
    static constexpr uint64_t default_max_sstable_bytes = std::numeric_limits<uint64_t>::max();

    explicit compaction_descriptor(std::vector<sstables::shared_sstable> sstables,
+                                   ::io_priority_class io_priority,
                                   int level = default_level,
                                   uint64_t max_sstable_bytes = default_max_sstable_bytes,
                                   run_id run_identifier = run_id::create_random_id(),
@@ -187,15 +178,18 @@ struct compaction_descriptor {
        , run_identifier(run_identifier)
        , options(options)
        , owned_ranges(std::move(owned_ranges_))
+        , io_priority(io_priority)
    {}

    explicit compaction_descriptor(sstables::has_only_fully_expired has_only_fully_expired,
-                                   std::vector<sstables::shared_sstable> sstables)
+                                   std::vector<sstables::shared_sstable> sstables,
+                                   ::io_priority_class io_priority)
        : sstables(std::move(sstables))
        , level(default_level)
        , max_sstable_bytes(default_max_sstable_bytes)
        , run_identifier(run_id::create_random_id())
        , options(compaction_type_options::make_regular())
+        , io_priority(io_priority)
        , has_only_fully_expired(has_only_fully_expired)
    {}

--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -31,8 +31,8 @@
 #include <functional>
 #include <algorithm>
 #include "compaction.hh"
+#include "compaction_weight_registration.hh"
 #include "compaction_backlog_manager.hh"
-#include "compaction/compaction_descriptor.hh"
 #include "compaction/task_manager_module.hh"
 #include "compaction_state.hh"
 #include "strategy_control.hh"
@@ -46,14 +46,14 @@ class system_keyspace;
 class compaction_history_entry;
 }

+class compacting_sstable_registration;
+
 class repair_history_map {
 public:
    boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
 };

 namespace compaction {
-using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
-
 class compaction_task_executor;
 class sstables_task_executor;
 class major_compaction_task_executor;
@@ -64,6 +64,8 @@ class rewrite_sstables_compaction_task_executor;
 class cleanup_sstables_compaction_task_executor;
 class validate_sstables_compaction_task_executor;
 }
+class compaction_manager_test_task_executor;
+
 // Compaction manager provides facilities to submit and track compaction jobs on
 // behalf of existing tables.
 class compaction_manager {
@@ -161,21 +163,7 @@ private:
    per_table_history_maps _repair_history_maps;
    tombstone_gc_state _tombstone_gc_state;
 private:
-    // Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
-    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
-
-    // Return nullopt if compaction cannot be started
-    std::optional<gate::holder> start_compaction(table_state& t);
-
-    // parent_info set to std::nullopt means that task manager should not register this task executor.
-    // To create a task manager task with no parent, parent_info argument should contain empty task_info.
-    template<typename TaskExecutor, typename... Args>
-    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
-            std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
-    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
-        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
-    }
-    future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
+    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor>);

    future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason);
    future<> update_throughput(uint32_t value_mbs);
@@ -194,20 +182,17 @@ private:
    // Get candidates for compaction strategy, which are all sstables but the ones being compacted.
    std::vector<sstables::shared_sstable> get_candidates(compaction::table_state& t) const;

-    bool eligible_for_compaction(const sstables::shared_sstable& sstable) const;
-    bool eligible_for_compaction(const sstables::frozen_sstable_run& sstable_run) const;
-
    template <std::ranges::range Range>
-    requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
-    std::vector<std::ranges::range_value_t<Range>> get_candidates(table_state& t, const Range& sstables) const;
+    requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable>
+    std::vector<sstables::shared_sstable> get_candidates(table_state& t, const Range& sstables) const;

-    template <std::ranges::range Range>
-    requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
-    void register_compacting_sstables(const Range& range);
+    template <typename Iterator, typename Sentinel>
+    requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
+    void register_compacting_sstables(Iterator first, Sentinel last);

-    template <std::ranges::range Range>
-    requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
-    void deregister_compacting_sstables(const Range& range);
+    template <typename Iterator, typename Sentinel>
+    requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
+    void deregister_compacting_sstables(Iterator first, Sentinel last);

    // gets the table's compaction state
    // throws std::out_of_range exception if not found.
@@ -226,7 +211,7 @@ private:
    // similar-sized compaction.
    void postpone_compaction_for_table(compaction::table_state* t);

-    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t);
    future<> update_static_shares(float shares);

    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -234,11 +219,10 @@ private:
    // Guarantees that a maintenance task, e.g. cleanup, will be performed on all files available at the time
    // by retrieving set of candidates only after all compactions for table T were stopped, if any.
    template<typename TaskType, typename... Args>
-    requires std::derived_from<TaskType, compaction_task_executor> &&
-            std::derived_from<TaskType, compaction_task_impl>
-    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(std::optional<tasks::task_info> info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
+    requires std::derived_from<TaskType, compaction::compaction_task_executor>
+    future<compaction_stats_opt> perform_task_on_all_files(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, Args... args);

-    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, std::optional<tasks::task_info> info, can_purge_tombstones can_purge = can_purge_tombstones::yes);
+    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, can_purge_tombstones can_purge = can_purge_tombstones::yes);

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
@@ -310,7 +294,7 @@ public:

    // Submit a table to be off-strategy compacted.
    // Returns true iff off-strategy compaction was required and performed.
-    future<bool> perform_offstrategy(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<bool> perform_offstrategy(compaction::table_state& t);

    // Submit a table to be cleaned up and wait for its termination.
    //
@@ -319,23 +303,21 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);
 private:
-    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);

    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
-
-    future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
 public:
    // Submit a table to be upgraded and wait for its termination.
-    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version);

    // Submit a table to be scrubbed and wait for its termination.
-    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info = std::nullopt);
+    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts);

    // Submit a table for major compaction.
-    future<> perform_major_compaction(compaction::table_state& t, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_major_compaction(compaction::table_state& t);


    // Run a custom job for a given table, defined by a function
@@ -345,7 +327,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, std::optional<tasks::task_info> info, throw_if_stopping do_throw_if_stopping);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);

    class compaction_reenabler {
        compaction_manager& _cm;
@@ -433,7 +415,6 @@ public:

    // checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
    bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
-    const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(table_state& t) const;

    friend class compacting_sstable_registration;
    friend class compaction_weight_registration;
@@ -448,11 +429,12 @@ public:
    friend class compaction::rewrite_sstables_compaction_task_executor;
    friend class compaction::cleanup_sstables_compaction_task_executor;
    friend class compaction::validate_sstables_compaction_task_executor;
+    friend class compaction_manager_test_task_executor;
 };

 namespace compaction {

-class compaction_task_executor : public enable_shared_from_this<compaction_task_executor> {
+class compaction_task_executor {
 public:
    enum class state {
        none,       // initial and final state
@@ -460,54 +442,42 @@ public:
                    // counted in compaction_manager::stats::pending_tasks
        active,     // task initiated active compaction, may alternate with pending
                    // counted in compaction_manager::stats::active_tasks
-        done,       // task completed successfully (may transition only to state::none, or
-                    // state::pending for regular compaction)
+        done,       // task completed successfully (may transition only to state::none)
                    // counted in compaction_manager::stats::completed_tasks
        postponed,  // task was postponed (may transition only to state::none)
                    // represented by the postponed_compactions metric
        failed,     // task failed (may transition only to state::none)
                    // counted in compaction_manager::stats::errors
    };
+    static std::string_view to_string(state);
 protected:
    compaction_manager& _cm;
    ::compaction::table_state* _compacting_table = nullptr;
    compaction::compaction_state& _compaction_state;
    sstables::compaction_data _compaction_data;
    state _state = state::none;
-    throw_if_stopping _do_throw_if_stopping;

 private:
    shared_future<compaction_manager::compaction_stats_opt> _compaction_done = make_ready_future<compaction_manager::compaction_stats_opt>();
    exponential_backoff_retry _compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
    sstables::compaction_type _type;
    sstables::run_id _output_run_identifier;
+    gate::holder _gate_holder;
    sstring _description;
-    compaction_manager::compaction_stats_opt _stats = std::nullopt;

 public:
-    explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
+    explicit compaction_task_executor(compaction_manager& mgr, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);

    compaction_task_executor(compaction_task_executor&&) = delete;
    compaction_task_executor(const compaction_task_executor&) = delete;

-    virtual ~compaction_task_executor() = default;
-
-    // called when a compaction replaces the exhausted sstables with the new set
-    struct on_replacement {
-        virtual ~on_replacement() {}
-        // called after the replacement completes
-        // @param sstables the old sstable which are replaced in this replacement
-        virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
-        // called before the replacement happens
-        // @param sstables the new sstables to be added to the table's sstable set
-        virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
-    };
+    virtual ~compaction_task_executor();

 protected:
-    future<> perform();
-
    virtual future<compaction_manager::compaction_stats_opt> do_run() = 0;

+    using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
+
    state switch_state(state new_state);

    future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -524,27 +494,24 @@ protected:
    // otherwise, returns stop_iteration::no after sleep for exponential retry.
    future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);

-    future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
+    // Compacts set of SSTables according to the descriptor.
+    using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
+    future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
+                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
+    future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
-    future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
-                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
-                                sstables::offstrategy offstrategy = sstables::offstrategy::no);
    future<> update_history(::compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
    bool should_update_history(sstables::compaction_type ct) {
        return ct == sstables::compaction_type::Compaction;
    }
 public:
-    compaction_manager::compaction_stats_opt get_stats() const noexcept {
-        return _stats;
-    }
-
-    future<compaction_manager::compaction_stats_opt> run_compaction() noexcept;
+    future<compaction_manager::compaction_stats_opt> run() noexcept;

    const ::compaction::table_state* compacting_table() const noexcept {
        return _compacting_table;
    }

-    sstables::compaction_type compaction_type() const noexcept {
+    sstables::compaction_type type() const noexcept {
        return _type;
    }

@@ -570,46 +537,27 @@ public:
    const sstring& description() const noexcept {
        return _description;
    }
-private:
-    // Before _compaction_done is set in compaction_task_executor::run_compaction(), compaction_done() returns ready future.
+
    future<compaction_manager::compaction_stats_opt> compaction_done() noexcept {
        return _compaction_done.get_future();
    }
-public:
+
    bool stopping() const noexcept {
        return _compaction_data.abort.abort_requested();
    }

-    void stop_compaction(sstring reason) noexcept;
+    void stop(sstring reason) noexcept;

    sstables::compaction_stopped_exception make_compaction_stopped_exception() const;

-    template<typename TaskExecutor, typename... Args>
-    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
-            std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
-    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
-        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
-    }
-    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
-    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
-    friend fmt::formatter<compaction_task_executor>;
-    friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason);
+    std::string describe() const;
 };

+std::ostream& operator<<(std::ostream& os, compaction::compaction_task_executor::state s);
+std::ostream& operator<<(std::ostream& os, const compaction::compaction_task_executor& task);
+
 }

-template <>
-struct fmt::formatter<compaction::compaction_task_executor::state> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    auto format(compaction::compaction_task_executor::state c, fmt::format_context& ctx) const -> decltype(ctx.out());
-};
-
-template <>
-struct fmt::formatter<compaction::compaction_task_executor> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    auto format(const compaction::compaction_task_executor& ex, fmt::format_context& ctx) const  -> decltype(ctx.out());
-};
-
 bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);

 // Return all sstables but those that are off-strategy like the ones in maintenance set and staging dir.
--- a/compaction/compaction_state.hh
+++ b/compaction/compaction_state.hh
@@ -32,7 +32,7 @@ struct compaction_state {
    // Signaled whenever a compaction task completes.
    condition_variable compaction_done;

-    std::optional<compaction_backlog_tracker> backlog_tracker;
+    compaction_backlog_tracker backlog_tracker;

    std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
    compaction::owned_ranges_ptr owned_ranges_ptr;
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -12,8 +12,6 @@
 #include <vector>
 #include <chrono>
 #include <seastar/core/shared_ptr.hh>
-#include "seastar/core/on_internal_error.hh"
-#include "sstables/shared_sstable.hh"
 #include "sstables/sstables.hh"
 #include "compaction.hh"
 #include "compaction_strategy.hh"
@@ -26,6 +24,7 @@
 #include <boost/range/adaptors.hpp>
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include "size_tiered_compaction_strategy.hh"
+#include "date_tiered_compaction_strategy.hh"
 #include "leveled_compaction_strategy.hh"
 #include "time_window_compaction_strategy.hh"
 #include "backlog_controller.hh"
@@ -33,25 +32,26 @@
 #include "size_tiered_backlog_tracker.hh"
 #include "leveled_manifest.hh"

+logging::logger date_tiered_manifest::logger = logging::logger("DateTieredCompactionStrategy");
 logging::logger leveled_manifest::logger("LeveledManifest");

 namespace sstables {

 compaction_descriptor compaction_strategy_impl::make_major_compaction_job(std::vector<sstables::shared_sstable> candidates, int level, uint64_t max_sstable_bytes) {
    // run major compaction in maintenance priority
-    return compaction_descriptor(std::move(candidates), level, max_sstable_bytes);
+    return compaction_descriptor(std::move(candidates), service::get_local_streaming_priority(), level, max_sstable_bytes);
 }

 std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
    // The default implementation is suboptimal and causes the writeamp problem described issue in #10097.
    // The compaction strategy relying on it should strive to implement its own method, to make cleanup bucket aware.
    return boost::copy_range<std::vector<compaction_descriptor>>(candidates | boost::adaptors::transformed([] (const shared_sstable& sst) {
-        return compaction_descriptor({ sst },
+        return compaction_descriptor({ sst }, service::get_local_compaction_priority(),
            sst->get_sstable_level(), sstables::compaction_descriptor::default_max_sstable_bytes, sst->run_identifier());
    }));
 }

-bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t) {
+bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state) {
    if (_disable_tombstone_compaction) {
        return false;
    }
@@ -62,11 +62,11 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    if (db_clock::now()-_tombstone_compaction_interval < sst->data_file_write_time()) {
        return false;
    }
-    auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, t.get_tombstone_gc_state(), t.schema());
+    auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, gc_state);
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

-uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) const {
+uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) const {
    return partition_estimate;
 }

@@ -75,7 +75,7 @@ reader_consumer_v2 compaction_strategy_impl::make_interposer_consumer(const muta
 }

 compaction_descriptor
-compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
+compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
    return compaction_descriptor();
 }

@@ -87,96 +87,17 @@ std::optional<sstring> compaction_strategy_impl::get_value(const std::map<sstrin
    return it->second;
 }

-void compaction_strategy_impl::validate_min_max_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto min_threshold_key = "min_threshold", max_threshold_key = "max_threshold";
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, min_threshold_key);
-    auto min_threshold = cql3::statements::property_definitions::to_long(min_threshold_key, tmp_value, DEFAULT_MIN_COMPACTION_THRESHOLD);
-    if (min_threshold < 2) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be bigger or equal to 2", min_threshold_key, min_threshold));
-    }
-
-    tmp_value = compaction_strategy_impl::get_value(options, max_threshold_key);
-    auto max_threshold = cql3::statements::property_definitions::to_long(max_threshold_key, tmp_value, DEFAULT_MAX_COMPACTION_THRESHOLD);
-    if (max_threshold < 2) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be bigger or equal to 2", max_threshold_key, max_threshold));
-    }
-
-    unchecked_options.erase(min_threshold_key);
-    unchecked_options.erase(max_threshold_key);
-}
-
-static double validate_tombstone_threshold(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION);
-    auto tombstone_threshold = cql3::statements::property_definitions::to_double(compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION, tmp_value, compaction_strategy_impl::DEFAULT_TOMBSTONE_THRESHOLD);
-    if (tombstone_threshold < 0.0 || tombstone_threshold > 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION, tombstone_threshold));
-    }
-    return tombstone_threshold;
-}
-
-static double validate_tombstone_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto tombstone_threshold = validate_tombstone_threshold(options);
-    unchecked_options.erase(compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION);
-    return tombstone_threshold;
-}
-
-static db_clock::duration validate_tombstone_compaction_interval(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-    auto interval = cql3::statements::property_definitions::to_long(compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION, tmp_value, compaction_strategy_impl::DEFAULT_TOMBSTONE_COMPACTION_INTERVAL().count());
-    auto tombstone_compaction_interval = db_clock::duration(std::chrono::seconds(interval));
-    if (interval <= 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be positive", compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION, tombstone_compaction_interval));
-    }
-    return tombstone_compaction_interval;
-}
-
-static db_clock::duration validate_tombstone_compaction_interval(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto tombstone_compaction_interval = validate_tombstone_compaction_interval(options);
-    unchecked_options.erase(compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-    return tombstone_compaction_interval;
-}
-
-void compaction_strategy_impl::validate_options_for_strategy_type(const std::map<sstring, sstring>& options, sstables::compaction_strategy_type type) {
-    auto unchecked_options = options;
-    compaction_strategy_impl::validate_options(options, unchecked_options);
-    switch (type) {
-        case compaction_strategy_type::size_tiered:
-            size_tiered_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        case compaction_strategy_type::leveled:
-            leveled_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        case compaction_strategy_type::time_window:
-            time_window_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        default:
-            break;
-    }
-
-    unchecked_options.erase("class");
-    if (!unchecked_options.empty()) {
-        throw exceptions::configuration_exception(fmt::format("Invalid compaction strategy options {} for chosen strategy type", unchecked_options));
-    }
-}
-
-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void compaction_strategy_impl::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    validate_tombstone_threshold(options, unchecked_options);
-    validate_tombstone_compaction_interval(options, unchecked_options);
-
-    auto it = options.find("enabled");
-    if (it != options.end() && it->second != "true" && it->second != "false") {
-        throw exceptions::configuration_exception(fmt::format("enabled value ({}) must be \"true\" or \"false\"", it->second));
-    }
-    unchecked_options.erase("enabled");
-}
-
 compaction_strategy_impl::compaction_strategy_impl(const std::map<sstring, sstring>& options) {
-    _tombstone_threshold = validate_tombstone_threshold(options);
-    _tombstone_compaction_interval = validate_tombstone_compaction_interval(options);
+    using namespace cql3::statements;
+
+    auto tmp_value = get_value(options, TOMBSTONE_THRESHOLD_OPTION);
+    _tombstone_threshold = property_definitions::to_double(TOMBSTONE_THRESHOLD_OPTION, tmp_value, DEFAULT_TOMBSTONE_THRESHOLD);
+
+    tmp_value = get_value(options, TOMBSTONE_COMPACTION_INTERVAL_OPTION);
+    auto interval = property_definitions::to_long(TOMBSTONE_COMPACTION_INTERVAL_OPTION, tmp_value, DEFAULT_TOMBSTONE_COMPACTION_INTERVAL().count());
+    _tombstone_compaction_interval = db_clock::duration(std::chrono::seconds(interval));
+
+    // FIXME: validate options.
 }

 } // namespace sstables
@@ -188,7 +109,7 @@ size_tiered_backlog_tracker::compacted_backlog(const compaction_backlog_tracker:
        // A SSTable being compacted may not contribute to backlog if compaction strategy decided
        // to perform a low-efficiency compaction when system is under little load, or when user
        // performs major even though strategy is completely satisfied
-        if (!_contrib.sstables.contains(crp.first)) {
+        if (!_sstables_contributing_backlog.contains(crp.first)) {
            continue;
        }
        auto compacted = crp.second->compacted();
@@ -198,11 +119,11 @@ size_tiered_backlog_tracker::compacted_backlog(const compaction_backlog_tracker:
    return in;
 }

-// Provides strong exception safety guarantees.
-size_tiered_backlog_tracker::sstables_backlog_contribution size_tiered_backlog_tracker::calculate_sstables_backlog_contribution(const std::vector<sstables::shared_sstable>& all, const sstables::size_tiered_compaction_strategy_options& stcs_options) {
-    sstables_backlog_contribution contrib;
-    if (all.empty()) {
-        return contrib;
+void size_tiered_backlog_tracker::refresh_sstables_backlog_contribution() {
+    _sstables_backlog_contribution = 0.0f;
+    _sstables_contributing_backlog = {};
+    if (_all.empty()) {
+        return;
    }
    using namespace sstables;

@@ -212,27 +133,25 @@ size_tiered_backlog_tracker::sstables_backlog_contribution size_tiered_backlog_t
    // in efficient jobs acting more aggressive than they really have to.
    // TODO: potentially switch to compaction manager's fan-in threshold, so to account for the dynamic
    //  fan-in threshold behavior.
-    const auto& newest_sst = std::ranges::max(all, std::less<generation_type>(), std::mem_fn(&sstable::generation));
+    const auto& newest_sst = std::ranges::max(_all, std::less<generation_type>(), std::mem_fn(&sstable::generation));
    auto threshold = newest_sst->get_schema()->min_compaction_threshold();

-    for (auto& bucket : size_tiered_compaction_strategy::get_buckets(all, stcs_options)) {
+    for (auto& bucket : size_tiered_compaction_strategy::get_buckets(boost::copy_range<std::vector<shared_sstable>>(_all), _stcs_options)) {
        if (!size_tiered_compaction_strategy::is_bucket_interesting(bucket, threshold)) {
            continue;
        }
-        contrib.value += boost::accumulate(bucket | boost::adaptors::transformed([] (const shared_sstable& sst) -> double {
+        _sstables_backlog_contribution += boost::accumulate(bucket | boost::adaptors::transformed([this] (const shared_sstable& sst) -> double {
            return sst->data_size() * log4(sst->data_size());
        }), double(0.0f));
        // Controller is disabled if exception is caught during add / remove calls, so not making any effort to make this exception safe
-        contrib.sstables.insert(bucket.begin(), bucket.end());
+        _sstables_contributing_backlog.insert(bucket.begin(), bucket.end());
    }
-
-    return contrib;
 }

 double size_tiered_backlog_tracker::backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const {
    inflight_component compacted = compacted_backlog(oc);

-    auto total_backlog_bytes = boost::accumulate(_contrib.sstables | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::data_size)), uint64_t(0));
+    auto total_backlog_bytes = boost::accumulate(_sstables_contributing_backlog | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::data_size)), uint64_t(0));

    // Bail out if effective backlog is zero, which happens in a small window where ongoing compaction exhausted
    // input files but is still sealing output files or doing managerial stuff like updating history table
@@ -249,41 +168,26 @@ double size_tiered_backlog_tracker::backlog(const compaction_backlog_tracker::on
    auto effective_backlog_bytes = total_backlog_bytes - compacted.total_bytes;

    // Sum of (Si - Ci) * log (Si) for all SSTables contributing backlog
-    auto sstables_contribution = _contrib.value - compacted.contribution;
+    auto sstables_contribution = _sstables_backlog_contribution - compacted.contribution;
    // This is subtracting ((Si - Ci) * log (Si)) from ((Si - Ci) * log(T)), yielding the final backlog
    auto b = (effective_backlog_bytes * log4(_total_bytes)) - sstables_contribution;
    return b > 0 ? b : 0;
 }

-// Provides strong exception safety guarantees.
-void size_tiered_backlog_tracker::replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) {
-    auto tmp_all = _all;
-    auto tmp_total_bytes = _total_bytes;
-    tmp_all.reserve(_all.size() + new_ssts.size());
-
+void size_tiered_backlog_tracker::replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) {
    for (auto& sst : old_ssts) {
        if (sst->data_size() > 0) {
-            auto erased = tmp_all.erase(sst);
-            if (erased) {
-                tmp_total_bytes -= sst->data_size();
-            }
+            const auto erased = _all.erase(sst); // 0 when this sstable was never tracked
+            if (erased) { _total_bytes -= sst->data_size(); }
        }
    }
    for (auto& sst : new_ssts) {
        if (sst->data_size() > 0) {
-            auto [_, inserted] = tmp_all.insert(sst);
-            if (inserted) {
-                tmp_total_bytes += sst->data_size();
-            }
+            const auto bytes = sst->data_size(); // read the size before sst is moved from
+            if (_all.insert(std::move(sst)).second) { _total_bytes += bytes; }
        }
    }
-    auto tmp_contrib = calculate_sstables_backlog_contribution(boost::copy_range<std::vector<shared_sstable>>(tmp_all), _stcs_options);
-
-    std::invoke([&] () noexcept {
-        _all = std::move(tmp_all);
-        _total_bytes = tmp_total_bytes;
-        _contrib = std::move(tmp_contrib);
-    });
+    refresh_sstables_backlog_contribution();
 }

 namespace sstables {
@@ -361,25 +265,23 @@ public:
        return b;
    }

-    // Provides strong exception safety guarantees
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {
        struct replacement {
            std::vector<sstables::shared_sstable> old_ssts;
            std::vector<sstables::shared_sstable> new_ssts;
        };
        std::unordered_map<api::timestamp_type, replacement> per_window_replacement;
-        auto tmp_windows = _windows;

        for (auto& sst : new_ssts) {
            auto bound = lower_bound_of(sst->get_stats_metadata().max_timestamp);
-            if (!tmp_windows.contains(bound)) {
-                tmp_windows.emplace(bound, size_tiered_backlog_tracker(_stcs_options));
+            if (!_windows.contains(bound)) {
+                _windows.emplace(bound, size_tiered_backlog_tracker(_stcs_options));
            }
            per_window_replacement[bound].new_ssts.push_back(std::move(sst));
        }
        for (auto& sst : old_ssts) {
            auto bound = lower_bound_of(sst->get_stats_metadata().max_timestamp);
-            if (tmp_windows.contains(bound)) {
+            if (_windows.contains(bound)) {
                per_window_replacement[bound].old_ssts.push_back(std::move(sst));
            }
        }
@@ -387,20 +289,12 @@ public:
        for (auto& [bound, r] : per_window_replacement) {
            // All windows must exist here, as windows are created for new files and will
            // remain alive as long as there's a single file in them
-            auto it = tmp_windows.find(bound);
-            if (it == tmp_windows.end()) {
-                on_internal_error(clogger, fmt::format("window for bound {} not found", bound));
-            }
-            auto& w = it->second;
-            w.replace_sstables(r.old_ssts, r.new_ssts);
+            auto& w = _windows.at(bound);
+            w.replace_sstables(std::move(r.old_ssts), std::move(r.new_ssts));
            if (w.total_bytes() <= 0) {
-                tmp_windows.erase(bound);
+                _windows.erase(bound);
            }
        }
-
-        std::invoke([&] () noexcept {
-            _windows = std::move(tmp_windows);
-        });
    }
 };

@@ -500,31 +394,25 @@ public:
        return b;
    }

-    // Provides strong exception safety guarantees
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {
-        auto tmp_size_per_level = _size_per_level;
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {
        std::vector<sstables::shared_sstable> l0_old_ssts, l0_new_ssts;
        for (auto& sst : new_ssts) {
            auto level = sst->get_sstable_level();
-            tmp_size_per_level[level] += sst->data_size();
+            _size_per_level[level] += sst->data_size();
            if (level == 0) {
                l0_new_ssts.push_back(std::move(sst));
            }
        }
        for (auto& sst : old_ssts) {
            auto level = sst->get_sstable_level();
-            tmp_size_per_level[level] -= sst->data_size();
+            _size_per_level[level] -= sst->data_size();
            if (level == 0) {
                l0_old_ssts.push_back(std::move(sst));
            }
        }
        if (l0_old_ssts.size() || l0_new_ssts.size()) {
-            // stcs replace_sstables guarantees strong exception safety
            _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
        }
-        std::invoke([&] () noexcept {
-            _size_per_level = std::move(tmp_size_per_level);
-        });
    }
 };

@@ -532,14 +420,14 @@ struct unimplemented_backlog_tracker final : public compaction_backlog_tracker::
    virtual double backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const override {
        return compaction_controller::disable_backlog;
    }
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {}
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
 };

 struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
    virtual double backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const override {
        return 0;
    }
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {}
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
 };

 //
@@ -548,7 +436,7 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
 //
 class null_compaction_strategy : public compaction_strategy_impl {
 public:
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override {
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override {
        return sstables::compaction_descriptor();
    }

@@ -572,20 +460,6 @@ leveled_compaction_strategy::leveled_compaction_strategy(const std::map<sstring,
 {
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void leveled_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, SSTABLE_SIZE_OPTION);
-    auto min_sstables_size = cql3::statements::property_definitions::to_long(SSTABLE_SIZE_OPTION, tmp_value, DEFAULT_MAX_SSTABLE_SIZE_IN_MB);
-    if (min_sstables_size <= 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be positive", SSTABLE_SIZE_OPTION, min_sstables_size));
-    }
-    unchecked_options.erase(SSTABLE_SIZE_OPTION);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> leveled_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options);
 }
@@ -619,22 +493,201 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
    _use_clustering_key_filter = true;
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void time_window_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    time_window_compaction_strategy_options::validate(options, unchecked_options);
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> time_window_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<time_window_backlog_tracker>(_options, _stcs_options);
 }

 } // namespace sstables

+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_next_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& uncompacting, gc_clock::time_point compaction_time) {
+    if (table_s.main_sstable_set().all()->empty()) {
+        return {};
+    }
+
+    // Find fully expired SSTables. Those will be included no matter what.
+    auto expired = table_s.fully_expired_sstables(uncompacting, compaction_time);
+
+    if (!expired.empty()) {
+        auto is_expired = [&] (const sstables::shared_sstable& s) { return expired.contains(s); };
+        uncompacting.erase(boost::remove_if(uncompacting, is_expired), uncompacting.end());
+    }
+
+    auto compaction_candidates = get_next_non_expired_sstables(table_s, uncompacting, compaction_time);
+    if (!expired.empty()) {
+        compaction_candidates.insert(compaction_candidates.end(), expired.begin(), expired.end());
+    }
+    return compaction_candidates;
+}
+
+int64_t date_tiered_manifest::get_estimated_tasks(table_state& table_s) const {
+    int base = table_s.schema()->min_compaction_threshold();
+    int64_t now = get_now(table_s.main_sstable_set().all());
+    std::vector<sstables::shared_sstable> sstables;
+    int64_t n = 0;
+
+    auto all_sstables = table_s.main_sstable_set().all();
+    sstables.reserve(all_sstables->size());
+    for (auto& entry : *all_sstables) {
+        sstables.push_back(entry);
+    }
+    auto candidates = filter_old_sstables(sstables, _options.max_sstable_age, now);
+    auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
+
+    for (auto& bucket : buckets) {
+        if (bucket.size() >= size_t(table_s.schema()->min_compaction_threshold())) {
+            n += std::ceil(double(bucket.size()) / table_s.schema()->max_compaction_threshold());
+        }
+    }
+    return n;
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_next_non_expired_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& non_expiring_sstables, gc_clock::time_point compaction_time) {
+    int base = table_s.schema()->min_compaction_threshold();
+    int64_t now = get_now(table_s.main_sstable_set().all());
+    auto most_interesting = get_compaction_candidates(table_s, non_expiring_sstables, now, base);
+
+    return most_interesting;
+
+    // FIXME: implement the functionality below, which looks for a single sstable worth dropping tombstones from
+    // iff the strategy didn't find anything to compact. It only applies in that case, so it's not essential.
+#if 0
+    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
+    // ratio is greater than threshold.
+
+    List<SSTableReader> sstablesWithTombstones = Lists.newArrayList();
+    for (SSTableReader sstable : nonExpiringSSTables)
+    {
+        if (worthDroppingTombstones(sstable, gcBefore))
+            sstablesWithTombstones.add(sstable);
+    }
+    if (sstablesWithTombstones.isEmpty())
+        return Collections.emptyList();
+
+    return Collections.singletonList(Collections.min(sstablesWithTombstones, new SSTableReader.SizeComparator()));
+#endif
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_compaction_candidates(table_state& table_s, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base) {
+    int min_threshold = table_s.schema()->min_compaction_threshold();
+    int max_threshold = table_s.schema()->max_compaction_threshold();
+    auto candidates = filter_old_sstables(candidate_sstables, _options.max_sstable_age, now);
+
+    auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
+
+    return newest_bucket(buckets, min_threshold, max_threshold, now, _options.base_time);
+}
+
+int64_t date_tiered_manifest::get_now(lw_shared_ptr<const sstables::sstable_list> shared_set) {
+    int64_t max_timestamp = 0;
+    for (auto& sst : *shared_set) {
+        int64_t candidate = sst->get_stats_metadata().max_timestamp;
+        max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
+    }
+    return max_timestamp;
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now) {
+    if (max_sstable_age == 0) {
+        return sstables;
+    }
+    int64_t cutoff = now - max_sstable_age;
+
+    std::erase_if(sstables, [cutoff] (auto& sst) {
+        return sst->get_stats_metadata().max_timestamp < cutoff;
+    });
+
+    return sstables;
+}
+
+std::vector<std::pair<sstables::shared_sstable,int64_t>>
+date_tiered_manifest::create_sst_and_min_timestamp_pairs(const std::vector<sstables::shared_sstable>& sstables) {
+    std::vector<std::pair<sstables::shared_sstable,int64_t>> sstable_min_timestamp_pairs;
+    sstable_min_timestamp_pairs.reserve(sstables.size());
+    for (auto& sst : sstables) {
+        sstable_min_timestamp_pairs.emplace_back(sst, sst->get_stats_metadata().min_timestamp);
+    }
+    return sstable_min_timestamp_pairs;
+}
+
+date_tiered_compaction_strategy_options::date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
+    using namespace cql3::statements;
+
+    auto tmp_value = sstables::compaction_strategy_impl::get_value(options, TIMESTAMP_RESOLUTION_KEY);
+    auto target_unit = tmp_value ? tmp_value.value() : DEFAULT_TIMESTAMP_RESOLUTION;
+
+    tmp_value = sstables::compaction_strategy_impl::get_value(options, MAX_SSTABLE_AGE_KEY);
+    auto fractional_days = property_definitions::to_double(MAX_SSTABLE_AGE_KEY, tmp_value, DEFAULT_MAX_SSTABLE_AGE_DAYS);
+    int64_t max_sstable_age_in_hours = std::lround(fractional_days * 24);
+    max_sstable_age = duration_conversor::convert(target_unit, std::chrono::hours(max_sstable_age_in_hours));
+
+    tmp_value = sstables::compaction_strategy_impl::get_value(options, BASE_TIME_KEY);
+    auto base_time_seconds = property_definitions::to_long(BASE_TIME_KEY, tmp_value, DEFAULT_BASE_TIME_SECONDS);
+    base_time = duration_conversor::convert(target_unit, std::chrono::seconds(base_time_seconds));
+}
+
+date_tiered_compaction_strategy_options::date_tiered_compaction_strategy_options() {
+    auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
+    max_sstable_age = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::hours(max_sstable_age_in_hours)).count();
+    base_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS)).count();
+}
+
 namespace sstables {

+date_tiered_compaction_strategy::date_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
+    : compaction_strategy_impl(options)
+    , _manifest(options)
+{
+    clogger.warn("DateTieredCompactionStrategy is deprecated. Usually cases for which it is used are better handled by TimeWindowCompactionStrategy."
+            " Please change your compaction strategy to TWCS as DTCS will be retired in the near future");
+
+    // tombstone compaction is disabled by default because:
+    // - deletion shouldn't be used with DTCS; rather data is deleted through TTL.
+    // - with time series workloads, it's usually better to wait for whole sstable to be expired rather than
+    // compacting a single sstable when it's more than 20% (default value) expired.
+    // For more details, see CASSANDRA-9234
+    if (!options.contains(TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.contains(TOMBSTONE_THRESHOLD_OPTION)) {
+        _disable_tombstone_compaction = true;
+        date_tiered_manifest::logger.debug("Disabling tombstone compactions for DTCS");
+    } else {
+        date_tiered_manifest::logger.debug("Enabling tombstone compactions for DTCS");
+    }
+
+    _use_clustering_key_filter = true;
+}
+
+compaction_descriptor date_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
+    auto compaction_time = gc_clock::now();
+    auto sstables = _manifest.get_next_sstables(table_s, candidates, compaction_time);
+
+    if (!sstables.empty()) {
+        date_tiered_manifest::logger.debug("datetiered: Compacting {} out of {} sstables", sstables.size(), candidates.size());
+        return sstables::compaction_descriptor(std::move(sstables), service::get_local_compaction_priority());
+    }
+
+    // filter out sstables whose droppable tombstone ratio isn't greater than the defined threshold.
+    auto e = boost::range::remove_if(candidates, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
+        return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
+    });
+    candidates.erase(e, candidates.end());
+    if (candidates.empty()) {
+        return sstables::compaction_descriptor();
+    }
+    // find the oldest sstable that is worth dropping tombstones from, because its tombstones are less likely to
+    // shadow data from other sstables, and it also tends to be relatively big.
+    auto it = std::min_element(candidates.begin(), candidates.end(), [] (auto& i, auto& j) {
+        return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
+    });
+    return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
+}
+
+std::unique_ptr<compaction_backlog_tracker::impl> date_tiered_compaction_strategy::make_backlog_tracker() const {
+    return std::make_unique<unimplemented_backlog_tracker>();
+}
+
 size_tiered_compaction_strategy::size_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
    : compaction_strategy_impl(options)
    , _options(options)
@@ -644,13 +697,6 @@ size_tiered_compaction_strategy::size_tiered_compaction_strategy(const size_tier
    : _options(options)
 {}

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void size_tiered_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> size_tiered_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<size_tiered_backlog_tracker>(_options);
 }
@@ -667,8 +713,8 @@ compaction_strategy_type compaction_strategy::type() const {
    return _compaction_strategy_impl->type();
 }

-compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
-    return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control);
+compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
+    return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control, std::move(candidates));
 }

 compaction_descriptor compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
@@ -700,12 +746,12 @@ compaction_backlog_tracker compaction_strategy::make_backlog_tracker() const {
 }

 sstables::compaction_descriptor
-compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
-    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, cfg);
+compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
+    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

-uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) const {
-    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
+uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) const {
+    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
 }

 reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const {
@@ -729,6 +775,9 @@ compaction_strategy make_compaction_strategy(compaction_strategy_type strategy,
    case compaction_strategy_type::leveled:
        impl = ::make_shared<leveled_compaction_strategy>(options);
        break;
+    case compaction_strategy_type::date_tiered:
+        impl = ::make_shared<date_tiered_compaction_strategy>(options);
+        break;
    case compaction_strategy_type::time_window:
        impl = ::make_shared<time_window_compaction_strategy>(options);
        break;
@@ -739,13 +788,6 @@ compaction_strategy make_compaction_strategy(compaction_strategy_type strategy,
    return compaction_strategy(std::move(impl));
 }

-future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode) {
-    co_return sstables::reshape_config{
-        .mode = mode,
-        .free_storage_space = co_await storage.free_space() / smp::count,
-    };
-}
-
 }

 namespace compaction {
@@ -754,6 +796,7 @@ compaction_strategy_state compaction_strategy_state::make(const compaction_strat
    switch (cs.type()) {
        case compaction_strategy_type::null:
        case compaction_strategy_type::size_tiered:
+        case compaction_strategy_type::date_tiered:
            return compaction_strategy_state(default_empty_state{});
        case compaction_strategy_type::leveled:
            return compaction_strategy_state(leveled_compaction_strategy_state{});
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -31,7 +31,6 @@ class sstable;
 class sstable_set;
 struct compaction_descriptor;
 struct resharding_descriptor;
-class storage;

 class compaction_strategy {
    ::shared_ptr<compaction_strategy_impl> _compaction_strategy_impl;
@@ -45,7 +44,7 @@ public:
    compaction_strategy& operator=(compaction_strategy&&);

    // Return a list of sstables to be compacted after applying the strategy.
-    compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control);
+    compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidates);

    compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<shared_sstable> candidates);

@@ -72,6 +71,8 @@ public:
            return "SizeTieredCompactionStrategy";
        case compaction_strategy_type::leveled:
            return "LeveledCompactionStrategy";
+        case compaction_strategy_type::date_tiered:
+            return "DateTieredCompactionStrategy";
        case compaction_strategy_type::time_window:
            return "TimeWindowCompactionStrategy";
        default:
@@ -88,6 +89,8 @@ public:
            return compaction_strategy_type::size_tiered;
        } else if (short_name == "LeveledCompactionStrategy") {
            return compaction_strategy_type::leveled;
+        } else if (short_name == "DateTieredCompactionStrategy") {
+            return compaction_strategy_type::date_tiered;
        } else if (short_name == "TimeWindowCompactionStrategy") {
            return compaction_strategy_type::time_window;
        } else {
@@ -105,7 +108,7 @@ public:

    compaction_backlog_tracker make_backlog_tracker() const;

-    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr) const;
+    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) const;

    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const;

@@ -123,13 +126,11 @@ public:
    //
    // The caller should also pass a maximum number of SSTables which is the maximum amount of
    // SSTables that can be added into a single job.
-    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;
+    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const;

 };

 // Creates a compaction_strategy object from one of the strategies available.
 compaction_strategy make_compaction_strategy(compaction_strategy_type strategy, const std::map<sstring, sstring>& options);

-future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode);
-
 }
--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -21,23 +21,20 @@ class sstable_set_impl;
 class resharding_descriptor;

 class compaction_strategy_impl {
-public:
    static constexpr float DEFAULT_TOMBSTONE_THRESHOLD = 0.2f;
    // minimum interval needed to perform tombstone removal compaction in seconds, default 86400 or 1 day.
    static constexpr std::chrono::seconds DEFAULT_TOMBSTONE_COMPACTION_INTERVAL() { return std::chrono::seconds(86400); }
-    static constexpr auto TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
-    static constexpr auto TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
 protected:
+    const sstring TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
+    const sstring TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
+
    bool _use_clustering_key_filter = false;
    bool _disable_tombstone_compaction = false;
    float _tombstone_threshold = DEFAULT_TOMBSTONE_THRESHOLD;
    db_clock::duration _tombstone_compaction_interval = DEFAULT_TOMBSTONE_COMPACTION_INTERVAL();
 public:
    static std::optional<sstring> get_value(const std::map<sstring, sstring>& options, const sstring& name);
-    static void validate_min_max_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
-    static void validate_options_for_strategy_type(const std::map<sstring, sstring>& options, sstables::compaction_strategy_type type);
 protected:
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
    compaction_strategy_impl() = default;
    explicit compaction_strategy_impl(const std::map<sstring, sstring>& options);
    static compaction_descriptor make_major_compaction_job(std::vector<sstables::shared_sstable> candidates,
@@ -45,7 +42,7 @@ protected:
            uint64_t max_sstable_bytes = compaction_descriptor::default_max_sstable_bytes);
 public:
    virtual ~compaction_strategy_impl() {}
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) = 0;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) = 0;
    virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
        return make_major_compaction_job(std::move(candidates));
    }
@@ -64,11 +61,11 @@ public:

    // Check if a given sstable is entitled for tombstone compaction based on its
    // droppable tombstone histogram and gc_before.
-    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t);
+    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state);

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const = 0;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) const;
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) const;

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) const;

@@ -76,6 +73,6 @@ public:
        return false;
    }

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const;
 };
 }
--- a/compaction/compaction_strategy_type.hh
+++ b/compaction/compaction_strategy_type.hh
@@ -8,22 +8,15 @@

 #pragma once

-#include <cstdint>
-
 namespace sstables {

 enum class compaction_strategy_type {
    null,
    size_tiered,
    leveled,
+    date_tiered,
    time_window,
 };

 enum class reshape_mode { strict, relaxed };
-
-struct reshape_config {
-    reshape_mode mode;
-    const uint64_t free_storage_space;
-};
-
 }
--- a/compaction/date_tiered_compaction_strategy.hh
+++ b/compaction/date_tiered_compaction_strategy.hh
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2016-present-2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
+ */
+
+#pragma once
+
+#include <map>
+#include <chrono>
+#include <algorithm>
+#include <vector>
+#include <iterator>
+#include "sstables/sstables.hh"
+#include "compaction.hh"
+#include "timestamp.hh"
+#include "cql3/statements/property_definitions.hh"
+#include "compaction_strategy_impl.hh"
+
+static constexpr double DEFAULT_MAX_SSTABLE_AGE_DAYS = 365;
+static constexpr int64_t DEFAULT_BASE_TIME_SECONDS = 60;
+
+struct duration_conversor {
+    // Converts d to TargetDuration with std::chrono::duration_cast and returns the resulting tick count.
+    template <typename TargetDuration, typename SourceDuration>
+    static api::timestamp_type convert(SourceDuration d) {
+        return std::chrono::duration_cast<TargetDuration>(d).count();
+    }
+
+    // Same conversion with the target unit selected at run time by its exact upper-case name (HOURS, MINUTES,
+    // SECONDS, MILLISECONDS, MICROSECONDS or NANOSECONDS). Throws std::runtime_error for any other name.
+    template <typename SourceDuration>
+    static api::timestamp_type convert(const sstring& target_duration, SourceDuration d) {
+        if (target_duration == "HOURS") {
+            return convert<std::chrono::hours>(d);
+        } else if (target_duration == "MICROSECONDS") {
+            return convert<std::chrono::microseconds>(d);
+        } else if (target_duration == "MILLISECONDS") {
+            return convert<std::chrono::milliseconds>(d);
+        } else if (target_duration == "MINUTES") {
+            return convert<std::chrono::minutes>(d);
+        } else if (target_duration == "NANOSECONDS") {
+            return convert<std::chrono::nanoseconds>(d);
+        } else if (target_duration == "SECONDS") {
+            return convert<std::chrono::seconds>(d);
+        } else {
+            throw std::runtime_error(format("target duration {} is not available", target_duration));
+        }
+    }
+};
+
+class date_tiered_compaction_strategy_options {
+    const sstring DEFAULT_TIMESTAMP_RESOLUTION = "MICROSECONDS";
+    const sstring TIMESTAMP_RESOLUTION_KEY = "timestamp_resolution";
+    const sstring MAX_SSTABLE_AGE_KEY = "max_sstable_age_days";
+    const sstring BASE_TIME_KEY = "base_time_seconds";
+
+    api::timestamp_type max_sstable_age;
+    api::timestamp_type base_time;
+public:
+    date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options);
+
+    date_tiered_compaction_strategy_options();
+private:
+
+    friend class date_tiered_manifest;
+};
+
+class date_tiered_manifest {
+    date_tiered_compaction_strategy_options _options;
+public:
+    static logging::logger logger;
+
+    date_tiered_manifest() = delete;
+
+    date_tiered_manifest(const std::map<sstring, sstring>& options)
+        : _options(options) {}
+
+    std::vector<sstables::shared_sstable>
+    get_next_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& uncompacting, gc_clock::time_point compaction_time);
+
+    int64_t get_estimated_tasks(table_state& table_s) const;
+private:
+    std::vector<sstables::shared_sstable>
+    get_next_non_expired_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& non_expiring_sstables, gc_clock::time_point compaction_time);
+
+    std::vector<sstables::shared_sstable>
+    get_compaction_candidates(table_state& table_s, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base);
+
+    /**
+     * Gets the timestamp that DateTieredCompactionStrategy considers to be the "current time".
+     * @return the maximum timestamp across all SSTables.
+     */
+    static int64_t get_now(lw_shared_ptr<const sstables::sstable_list> shared_set);
+
+    /**
+     * Removes all sstables with max timestamp older than maxSSTableAge.
+     * @return a list of sstables with the oldest sstables excluded
+     */
+    static std::vector<sstables::shared_sstable>
+    filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now);
+
+    /**
+     *
+     * @param sstables
+     * @return
+     */
+    static std::vector<std::pair<sstables::shared_sstable,int64_t>>
+    create_sst_and_min_timestamp_pairs(const std::vector<sstables::shared_sstable>& sstables);
+
+    /**
+     * A target time span used for bucketing SSTables based on timestamps.
+     */
+    struct target {
+        // Width of the range of timestamps that fits inside the target.
+        int64_t size;
+        // A timestamp t hits the target iff t / size == div_position.
+        int64_t div_position;
+
+        target() = delete;
+        target(int64_t size, int64_t div_position) : size(size), div_position(div_position) {}
+
+        /**
+         * Compares the target to a timestamp.
+         * @param timestamp the timestamp to compare.
+         * @return a negative integer, zero, or a positive integer as the target lies before, covers, or lies after the timestamp.
+         */
+        int compare_to_timestamp(int64_t timestamp) const {
+            auto ts1 = div_position;
+            auto ts2 = timestamp / size;
+            return (ts1 > ts2 ? 1 : (ts1 == ts2 ? 0 : -1));
+        }
+
+        /**
+         * Tells if the timestamp hits the target.
+         * @param timestamp the timestamp to test.
+         * @return <code>true</code> iff timestamp / size == div_position.
+         */
+        bool on_target(int64_t timestamp) const {
+            return compare_to_timestamp(timestamp) == 0;
+        }
+
+        /**
+         * Gets the next target, which represents an earlier time span.
+         * @param base The number of contiguous targets that will have the same size. Targets following those will be <code>base</code> times as big.
+         * @return a same-sized target one position earlier while div_position % base > 0, otherwise a target <code>base</code> times as big.
+         */
+        target next_target(int base) const
+        {
+            if (div_position % base > 0) {
+                return target(size, div_position - 1);
+            } else {
+                return target(size * base, div_position / base - 1);
+            }
+        }
+    };
+
+
+    /**
+     * Group files with similar min timestamp into buckets. Files with recent min timestamps are grouped together into
+     * buckets designated to short timespans while files with older timestamps are grouped into buckets representing
+     * longer timespans.
+     * @param files pairs consisting of a file and its min timestamp; sorted in place by this function
+     * @param time_unit size of the initial (newest) target window
+     * @param base number of contiguous same-sized targets; passed to target::next_target() when switching to older targets
+     * @param now timestamp that positions the initial target (see get_initial_target())
+     * @return a list of buckets of files. The list is ordered such that the files with newest timestamps come first.
+     *         Each bucket is also a list of files ordered from newest to oldest.
+     */
+    std::vector<std::vector<sstables::shared_sstable>>
+    get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, api::timestamp_type time_unit, int base, int64_t now) const {
+        // Sort files by age. Newest first.
+        std::sort(files.begin(), files.end(), [] (const auto& i, const auto& j) {
+            return i.second > j.second;
+        });
+
+        std::vector<std::vector<sstables::shared_sstable>> buckets;
+        auto target = get_initial_target(now, time_unit);
+        auto it = files.begin();
+
+        while (it != files.end()) {
+            bool finish = false;
+            while (!target.on_target(it->second)) {
+                // If the file is too new for the target, skip it.
+                if (target.compare_to_timestamp(it->second) < 0) {
+                    ++it;
+                    if (it == files.end()) {
+                        finish = true;
+                        break;
+                    }
+                } else { // If the file is too old for the target, switch targets.
+                    target = target.next_target(base);
+                }
+            }
+            if (finish) {
+                break;
+            }
+
+            std::vector<sstables::shared_sstable> bucket;
+            while (target.on_target(it->second)) {
+                bucket.push_back(it->first);
+                ++it;
+                if (it == files.end()) {
+                    break;
+                }
+            }
+            buckets.push_back(std::move(bucket)); // bucket is not used again: move instead of copying every sstable pointer
+        }
+
+        return buckets;
+    }
+
+    target get_initial_target(int64_t now, int64_t time_unit) const { // target of width time_unit that contains now
+        return target(time_unit, now / time_unit); // signed division, consistent with target::compare_to_timestamp()
+    }
+
+    /**
+     * @param buckets list of buckets, sorted from newest to oldest, from which to return the newest bucket within thresholds.
+     * @param min_threshold minimum number of sstables in a bucket to qualify.
+     * @param max_threshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this).
+     * @return a bucket (list) of sstables to compact, or an empty list if no bucket qualifies.
+     */
+    std::vector<sstables::shared_sstable>
+    newest_bucket(std::vector<std::vector<sstables::shared_sstable>>& buckets, int min_threshold, int max_threshold,
+            int64_t now, api::timestamp_type base_time) {
+
+        // The "incoming window" is the base_time-wide target containing now. A bucket whose first sstable's
+        // min timestamp falls inside it needs at least min_threshold SSTables; any other bucket needs only 2.
+        // The chosen bucket is trimmed in place to max_threshold SSTables before a copy of it is returned.
+        target incoming_window = get_initial_target(now, base_time);
+        for (auto& bucket : buckets) {
+            auto min_timestamp = bucket.front()->get_stats_metadata().min_timestamp;
+            if (bucket.size() >= size_t(min_threshold) ||
+                    (bucket.size() >= 2 && !incoming_window.on_target(min_timestamp))) {
+                trim_to_threshold(bucket, max_threshold);
+                return bucket;
+            }
+        }
+        return {};
+    }
+
+
+    /**
+     * @param bucket list of sstables, ordered from newest to oldest by min timestamp; modified in place.
+     * @param max_threshold maximum number of sstables in a single compaction task.
+     * Nothing is returned: the bucket itself is shrunk to its first <code>max_threshold</code> (newest) sstables.
+     */
+    static void trim_to_threshold(std::vector<sstables::shared_sstable>& bucket, int max_threshold) {
+        // Trim the oldest sstables off the end to meet max_threshold
+        bucket.resize(std::min(bucket.size(), size_t(max_threshold)));
+    }
+};
+
+namespace sstables {
+
+class date_tiered_compaction_strategy : public compaction_strategy_impl {
+    date_tiered_manifest _manifest;
+public:
+    date_tiered_compaction_strategy(const std::map<sstring, sstring>& options);
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;
+
+    virtual int64_t estimated_pending_compactions(table_state& table_s) const override {
+        return _manifest.get_estimated_tasks(table_s);
+    }
+
+    virtual compaction_strategy_type type() const override {
+        return compaction_strategy_type::date_tiered;
+    }
+
+    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;
+};
+
+}
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -19,9 +19,8 @@ leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(table_
    return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state>();
 }

-compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
    auto& state = get_state(table_s);
-    auto candidates = control.candidates(table_s);
    // NOTE: leveled_manifest creation may be slightly expensive, so later on,
    // we may want to store it in the strategy itself. However, the sstable
    // lists managed by the manifest may become outdated. For example, one
@@ -51,18 +50,18 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
        auto& sstables = manifest.get_level(level);
        // filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
        auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
-            return !worth_dropping_tombstones(sst, compaction_time, table_s);
+            return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
        });
        sstables.erase(e, sstables.end());
        if (sstables.empty()) {
            continue;
        }
        auto& sst = *std::max_element(sstables.begin(), sstables.end(), [&] (auto& i, auto& j) {
-            auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
-            auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+            auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
+            auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
            return i->estimate_droppable_tombstone_ratio(gc_before1) < j->estimate_droppable_tombstone_ratio(gc_before2);
        });
-        return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
+        return sstables::compaction_descriptor({ sst }, service::get_local_compaction_priority(), sst->get_sstable_level());
    }
    return {};
 }
@@ -146,8 +145,7 @@ int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state&
 }

 compaction_descriptor
-leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
-    auto mode = cfg.mode;
+leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
    std::array<std::vector<shared_sstable>, leveled_manifest::MAX_LEVELS> level_info;

    auto is_disjoint = [schema] (const std::vector<shared_sstable>& sstables, unsigned tolerance) -> std::tuple<bool, unsigned> {
@@ -157,8 +155,6 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;

-    clogger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
-
    for (auto& sst : input) {
        auto sst_level = sst->get_sstable_level();
        if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -166,7 +162,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

            // This is really unexpected, so we'll just compact it all to fix it
            auto ideal_level = ideal_level_for_input(input, max_sstable_size_in_bytes);
-            compaction_descriptor desc(std::move(input), ideal_level, max_sstable_size_in_bytes);
+            compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -197,14 +193,14 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        unsigned ideal_level = ideal_level_for_input(level_info[0], max_sstable_size_in_bytes);

        leveled_manifest::logger.info("Reshaping {} disjoint sstables in level 0 into level {}", level_info[0].size(), ideal_level);
-        compaction_descriptor desc(std::move(input), ideal_level, max_sstable_size_in_bytes);
+        compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
        desc.options = compaction_type_options::make_reshape();
        return desc;
    }

    if (level_info[0].size() > offstrategy_threshold) {
        size_tiered_compaction_strategy stcs(_stcs_options);
-        return stcs.get_reshaping_job(std::move(level_info[0]), schema, cfg);
+        return stcs.get_reshaping_job(std::move(level_info[0]), schema, iop, mode);
    }

    for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {
@@ -215,7 +211,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        auto [disjoint, overlapping_sstables] = is_disjoint(level_info[level], tolerance(level));
        if (!disjoint) {
            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
-            compaction_descriptor desc(std::move(level_info[level]), level, max_sstable_size_in_bytes);
+            compaction_descriptor desc(std::move(level_info[level]), iop, level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -235,15 +231,12 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
        if (levels[level].empty()) {
            continue;
        }
-        ret.push_back(compaction_descriptor(std::move(levels[level]), level, _max_sstable_size_in_mb * 1024 * 1024));
+        ret.push_back(compaction_descriptor(std::move(levels[level]), service::get_local_compaction_priority(), level, _max_sstable_size_in_mb * 1024 * 1024));
    }
    return ret;
 }

 unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
-    if (!max_sstable_size) {
-        return 1;
-    }
    auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
        double inv_log_fanout = 1.0f / std::log(fanout);
        return log(x) * inv_log_fanout;
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -36,7 +36,7 @@ struct leveled_compaction_strategy_state {

 class leveled_compaction_strategy : public compaction_strategy_impl {
    static constexpr int32_t DEFAULT_MAX_SSTABLE_SIZE_IN_MB = 160;
-    static constexpr auto SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
+    const sstring SSTABLE_SIZE_OPTION = "sstable_size_in_mb";

    int32_t _max_sstable_size_in_mb = DEFAULT_MAX_SSTABLE_SIZE_IN_MB;
    size_tiered_compaction_strategy_options _stcs_options;
@@ -46,10 +46,9 @@ private:
    leveled_compaction_strategy_state& get_state(table_state& table_s) const;
 public:
    static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

    leveled_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;

    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;

@@ -74,7 +73,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const override;
 };

 }
--- a/compaction/leveled_manifest.hh
+++ b/compaction/leveled_manifest.hh
@@ -11,11 +11,13 @@
 #pragma once

 #include "sstables/sstables.hh"
+#include "compaction.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include "range.hh"
 #include "log.hh"
 #include <boost/range/algorithm/sort.hpp>
 #include <boost/range/algorithm/partial_sort.hpp>
+#include "service/priority_manager.hh"

 class leveled_manifest {
    table_state& _table_s;
@@ -147,7 +149,8 @@ public:
            if (info.can_promote) {
                info.candidates = get_overlapping_starved_sstables(next_level, std::move(info.candidates), compaction_counter);
            }
-            return sstables::compaction_descriptor(std::move(info.candidates), next_level, _max_sstable_size_in_bytes);
+            return sstables::compaction_descriptor(std::move(info.candidates),
+                                                   service::get_local_compaction_priority(), next_level, _max_sstable_size_in_bytes);
        } else {
            logger.debug("No compaction candidates for L{}", level);
            return sstables::compaction_descriptor();
@@ -211,7 +214,8 @@ public:
                    _table_s.min_compaction_threshold(), _schema->max_compaction_threshold(), _stcs_options);
                if (!most_interesting.empty()) {
                    logger.debug("L0 is too far behind, performing size-tiering there first");
-                    return sstables::compaction_descriptor(std::move(most_interesting));
+                    return sstables::compaction_descriptor(std::move(most_interesting),
+                                                           service::get_local_compaction_priority());
                }
            }
            auto descriptor = get_descriptor_for_level(i, last_compacted_keys, compaction_counter);
@@ -225,7 +229,8 @@ public:
            auto info = get_candidates_for(0, last_compacted_keys);
            if (!info.candidates.empty()) {
                auto next_level = get_next_level(info.candidates, info.can_promote);
-                return sstables::compaction_descriptor(std::move(info.candidates), next_level, _max_sstable_size_in_bytes);
+                return sstables::compaction_descriptor(std::move(info.candidates),
+                                                       service::get_local_compaction_priority(), next_level, _max_sstable_size_in_bytes);
            }
        }

--- a/compaction/size_tiered_backlog_tracker.hh
+++ b/compaction/size_tiered_backlog_tracker.hh
@@ -9,6 +9,7 @@
 #include "compaction_backlog_manager.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include <cmath>
+#include <ctgmath>

 // Backlog for one SSTable under STCS:
 //
@@ -63,14 +64,10 @@
 // certain point in time, whose size is the amount of bytes currently written. So all we need
 // to do is keep track of them too, and add the current estimate to the static part of (4).
 class size_tiered_backlog_tracker final : public compaction_backlog_tracker::impl {
-    struct sstables_backlog_contribution {
-        double value = 0.0f;
-        std::unordered_set<sstables::shared_sstable> sstables;
-    };
-
    sstables::size_tiered_compaction_strategy_options _stcs_options;
    int64_t _total_bytes = 0;
-    sstables_backlog_contribution _contrib;
+    double _sstables_backlog_contribution = 0.0f;
+    std::unordered_set<sstables::shared_sstable> _sstables_contributing_backlog;
    std::unordered_set<sstables::shared_sstable> _all;

    struct inflight_component {
@@ -80,12 +77,12 @@ class size_tiered_backlog_tracker final : public compaction_backlog_tracker::imp

    inflight_component compacted_backlog(const compaction_backlog_tracker::ongoing_compactions& ongoing_compactions) const;

-    static double log4(double x) {
+    double log4(double x) const {
        double inv_log_4 = 1.0f / std::log(4);
        return log(x) * inv_log_4;
    }

-    static sstables_backlog_contribution calculate_sstables_backlog_contribution(const std::vector<sstables::shared_sstable>& all, const sstables::size_tiered_compaction_strategy_options& stcs_options);
+    void refresh_sstables_backlog_contribution();
 public:
    size_tiered_backlog_tracker(sstables::size_tiered_compaction_strategy_options stcs_options) : _stcs_options(stcs_options) {}

@@ -93,8 +90,7 @@ public:

    // Removing could be the result of a failure of an in progress write, successful finish of a
    // compaction, or some one-off operation, like drop
-    // Provides strong exception safety guarantees.
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override;
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override;

    int64_t total_bytes() const {
        return _total_bytes;
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -15,73 +15,20 @@

 namespace sstables {

-static long validate_sstable_size(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY);
-    auto min_sstables_size = cql3::statements::property_definitions::to_long(size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_MIN_SSTABLE_SIZE);
-    if (min_sstables_size < 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be non negative", size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY, min_sstables_size));
-    }
-    return min_sstables_size;
-}
-
-static long validate_sstable_size(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto min_sstables_size = validate_sstable_size(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY);
-    return min_sstables_size;
-}
-
-static double validate_bucket_low(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::BUCKET_LOW_KEY);
-    auto bucket_low = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::BUCKET_LOW_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_BUCKET_LOW);
-    if (bucket_low <= 0.0 || bucket_low >= 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", size_tiered_compaction_strategy_options::BUCKET_LOW_KEY, bucket_low));
-    }
-    return bucket_low;
-}
-
-static double validate_bucket_low(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto bucket_low = validate_bucket_low(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::BUCKET_LOW_KEY);
-    return bucket_low;
-}
-
-static double validate_bucket_high(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY);
-    auto bucket_high = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_BUCKET_HIGH);
-    if (bucket_high <= 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1.0", size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY, bucket_high));
-    }
-    return bucket_high;
-}
-
-static double validate_bucket_high(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto bucket_high = validate_bucket_high(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY);
-    return bucket_high;
-}
-
-static double validate_cold_reads_to_omit(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY);
-    auto cold_reads_to_omit = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_COLD_READS_TO_OMIT);
-    if (cold_reads_to_omit < 0.0 || cold_reads_to_omit > 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY, cold_reads_to_omit));
-    }
-    return cold_reads_to_omit;
-}
-
-static double validate_cold_reads_to_omit(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto cold_reads_to_omit = validate_cold_reads_to_omit(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY);
-    return cold_reads_to_omit;
-}
-
 size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
    using namespace cql3::statements;

-    min_sstable_size = validate_sstable_size(options);
-    bucket_low = validate_bucket_low(options);
-    bucket_high = validate_bucket_high(options);
-    cold_reads_to_omit = validate_cold_reads_to_omit(options);
+    auto tmp_value = compaction_strategy_impl::get_value(options, MIN_SSTABLE_SIZE_KEY);
+    min_sstable_size = property_definitions::to_long(MIN_SSTABLE_SIZE_KEY, tmp_value, DEFAULT_MIN_SSTABLE_SIZE);
+
+    tmp_value = compaction_strategy_impl::get_value(options, BUCKET_LOW_KEY);
+    bucket_low = property_definitions::to_double(BUCKET_LOW_KEY, tmp_value, DEFAULT_BUCKET_LOW);
+
+    tmp_value = compaction_strategy_impl::get_value(options, BUCKET_HIGH_KEY);
+    bucket_high = property_definitions::to_double(BUCKET_HIGH_KEY, tmp_value, DEFAULT_BUCKET_HIGH);
+
+    tmp_value = compaction_strategy_impl::get_value(options, COLD_READS_TO_OMIT_KEY);
+    cold_reads_to_omit = property_definitions::to_double(COLD_READS_TO_OMIT_KEY, tmp_value, DEFAULT_COLD_READS_TO_OMIT);
 }

 size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options() {
@@ -91,20 +38,6 @@ size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options
    cold_reads_to_omit = DEFAULT_COLD_READS_TO_OMIT;
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void size_tiered_compaction_strategy_options::validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    validate_sstable_size(options, unchecked_options);
-    auto bucket_low = validate_bucket_low(options, unchecked_options);
-    auto bucket_high = validate_bucket_high(options, unchecked_options);
-    if (bucket_high <= bucket_low) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) is less than or equal to the {} value ({})", BUCKET_HIGH_KEY, bucket_high, BUCKET_LOW_KEY, bucket_low));
-    }
-    validate_cold_reads_to_omit(options, unchecked_options);
-    compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
-}
-
 std::vector<std::pair<sstables::shared_sstable, uint64_t>>
 size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) {

@@ -210,12 +143,11 @@ size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector
 }

 compaction_descriptor
-size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
    // make local copies so they can't be changed out from under us mid-method
    int min_threshold = table_s.min_compaction_threshold();
    int max_threshold = table_s.schema()->max_compaction_threshold();
    auto compaction_time = gc_clock::now();
-    auto candidates = control.candidates(table_s);

    // TODO: Add support to filter cold sstables (for reference: SizeTieredCompactionStrategy::filterColdSSTables).

@@ -223,13 +155,13 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_

    if (is_any_bucket_interesting(buckets, min_threshold)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        return sstables::compaction_descriptor(std::move(most_interesting), service::get_local_compaction_priority());
    }

    // If we are not enforcing min_threshold explicitly, try any pair of SStables in the same tier.
    if (!table_s.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        return sstables::compaction_descriptor(std::move(most_interesting), service::get_local_compaction_priority());
    }

    if (!table_s.tombstone_gc_enabled()) {
@@ -243,7 +175,7 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
    for (auto&& sstables : buckets | boost::adaptors::reversed) {
        // filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
        auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
-            return !worth_dropping_tombstones(sst, compaction_time, table_s);
+            return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
        });
        sstables.erase(e, sstables.end());
        if (sstables.empty()) {
@@ -253,7 +185,7 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
        auto it = std::min_element(sstables.begin(), sstables.end(), [] (auto& i, auto& j) {
            return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
        });
-        return sstables::compaction_descriptor({ *it });
+        return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
    }
    return sstables::compaction_descriptor();
 }
@@ -297,9 +229,8 @@ size_tiered_compaction_strategy::most_interesting_bucket(const std::vector<sstab
 }

 compaction_descriptor
-size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const
+size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const
 {
-    auto mode = cfg.mode;
    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));

@@ -314,7 +245,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
        // All sstables can be reshaped at once if the amount of overlapping will not cause memory usage to be high,
        // which is possible because partitioned set is able to incrementally open sstables during compaction
        if (sstable_set_overlapping_count(schema, input) <= max_sstables) {
-            compaction_descriptor desc(std::move(input));
+            compaction_descriptor desc(std::move(input), iop);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -330,7 +261,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
                });
                bucket.resize(max_sstables);
            }
-            compaction_descriptor desc(std::move(bucket));
+            compaction_descriptor desc(std::move(bucket), iop);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -358,7 +289,7 @@ size_tiered_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_
            unsigned needed = std::min(remaining, max_threshold);
            std::vector<shared_sstable> sstables;
            std::move(it, it + needed, std::back_inserter(sstables));
-            ret.push_back(compaction_descriptor(std::move(sstables)));
+            ret.push_back(compaction_descriptor(std::move(sstables), service::get_local_compaction_priority()));
            std::advance(it, needed);
        }
    }
--- a/compaction/size_tiered_compaction_strategy.hh
+++ b/compaction/size_tiered_compaction_strategy.hh
@@ -18,16 +18,15 @@ class size_tiered_backlog_tracker;
 namespace sstables {

 class size_tiered_compaction_strategy_options {
-public:
    static constexpr uint64_t DEFAULT_MIN_SSTABLE_SIZE = 50L * 1024L * 1024L;
    static constexpr double DEFAULT_BUCKET_LOW = 0.5;
    static constexpr double DEFAULT_BUCKET_HIGH = 1.5;
    static constexpr double DEFAULT_COLD_READS_TO_OMIT = 0.05;
-    static constexpr auto MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
-    static constexpr auto BUCKET_LOW_KEY = "bucket_low";
-    static constexpr auto BUCKET_HIGH_KEY = "bucket_high";
-    static constexpr auto COLD_READS_TO_OMIT_KEY = "cold_reads_to_omit";
-private:
+    const sstring MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
+    const sstring BUCKET_LOW_KEY = "bucket_low";
+    const sstring BUCKET_HIGH_KEY = "bucket_high";
+    const sstring COLD_READS_TO_OMIT_KEY = "cold_reads_to_omit";
+
    uint64_t min_sstable_size = DEFAULT_MIN_SSTABLE_SIZE;
    double bucket_low = DEFAULT_BUCKET_LOW;
    double bucket_high = DEFAULT_BUCKET_HIGH;
@@ -36,13 +35,48 @@ public:
    size_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options);

    size_tiered_compaction_strategy_options();
-    size_tiered_compaction_strategy_options(const size_tiered_compaction_strategy_options&) = default;
-    size_tiered_compaction_strategy_options(size_tiered_compaction_strategy_options&&) = default;
-    size_tiered_compaction_strategy_options& operator=(const size_tiered_compaction_strategy_options&) = default;
-    size_tiered_compaction_strategy_options& operator=(size_tiered_compaction_strategy_options&&) = default;

-    static void validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
+    // FIXME: convert java code below.
+#if 0
+    public static Map<String, String> validateOptions(Map<String, String> options, Map<String, String> uncheckedOptions) throws ConfigurationException
+    {
+        String optionValue = options.get(MIN_SSTABLE_SIZE_KEY);
+        try
+        {
+            long minSSTableSize = optionValue == null ? DEFAULT_MIN_SSTABLE_SIZE : Long.parseLong(optionValue);
+            if (minSSTableSize < 0)
+            {
+                throw new ConfigurationException(String.format("%s must be non negative: %d", MIN_SSTABLE_SIZE_KEY, minSSTableSize));
+            }
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", optionValue, MIN_SSTABLE_SIZE_KEY), e);
+        }

+        double bucketLow = parseDouble(options, BUCKET_LOW_KEY, DEFAULT_BUCKET_LOW);
+        double bucketHigh = parseDouble(options, BUCKET_HIGH_KEY, DEFAULT_BUCKET_HIGH);
+        if (bucketHigh <= bucketLow)
+        {
+            throw new ConfigurationException(String.format("%s value (%s) is less than or equal to the %s value (%s)",
+                                                           BUCKET_HIGH_KEY, bucketHigh, BUCKET_LOW_KEY, bucketLow));
+        }
+
+        double maxColdReadsRatio = parseDouble(options, COLD_READS_TO_OMIT_KEY, DEFAULT_COLD_READS_TO_OMIT);
+        if (maxColdReadsRatio < 0.0 || maxColdReadsRatio > 1.0)
+        {
+            throw new ConfigurationException(String.format("%s value (%s) should be between between 0.0 and 1.0",
+                                                           COLD_READS_TO_OMIT_KEY, optionValue));
+        }
+
+        uncheckedOptions.remove(MIN_SSTABLE_SIZE_KEY);
+        uncheckedOptions.remove(BUCKET_LOW_KEY);
+        uncheckedOptions.remove(BUCKET_HIGH_KEY);
+        uncheckedOptions.remove(COLD_READS_TO_OMIT_KEY);
+
+        return uncheckedOptions;
+    }
+#endif
    friend class size_tiered_compaction_strategy;
 };

@@ -75,9 +109,8 @@ public:

    size_tiered_compaction_strategy(const std::map<sstring, sstring>& options);
    explicit size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options);
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;

    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;

@@ -96,7 +129,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const override;

    friend class ::size_tiered_backlog_tracker;
 };
--- a/compaction/strategy_control.hh
+++ b/compaction/strategy_control.hh
@@ -10,7 +10,6 @@
 #pragma once

 #include "compaction/compaction_fwd.hh"
-#include "sstables/sstable_set.hh"

 namespace compaction {

@@ -19,8 +18,6 @@ class strategy_control {
 public:
    virtual ~strategy_control() {}
    virtual bool has_ongoing_compaction(table_state& table_s) const noexcept = 0;
-    virtual std::vector<sstables::shared_sstable> candidates(table_state&) const = 0;
-    virtual std::vector<sstables::frozen_sstable_run> candidates_as_runs(table_state&) const = 0;
 };

 }
--- a/Show More
+++ b/Show More