Update seastar submodule

* seastar e45cef9c...1b299004 (3):
  > rpc: Abort server connection streams on stop
  > rpc: Do not register stream to dying parent
  > rpc: Fix client-side stream registration race

refs: #13100

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
scylla_fstrim_setup: start scylla-fstrim.timer on setup
2023-09-06 12:35:37 +03:00 · 2023-07-18 16:03:53 +03:00 · 2023-07-14 18:18:05 +03:00 · 2023-07-14 15:48:28 +03:00 · 2023-07-13 22:48:36 +03:00 · 2023-07-13 22:48:30 +03:00
1250 changed files with 37283 additions and 55418 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,8 @@ tags
 testlog
 test/*/*.reject
 .vscode
+docs/_build
+docs/poetry.lock
 compile_commands.json
 .ccls-cache/
 .mypy_cache
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.27)
+cmake_minimum_required(VERSION 3.18)

 project(scylla)

@@ -8,19 +8,11 @@ list(APPEND CMAKE_MODULE_PATH
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
  ${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)

+set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE
+    STRING "Choose the type of build." FORCE)
 # Set the possible values of build type for cmake-gui
-set(scylla_build_types
-    "Debug" "Release" "Dev" "Sanitize" "Coverage")
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-  ${scylla_build_types})
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE
-        STRING "Choose the type of build." FORCE)
-    message(WARNING "CMAKE_BUILD_TYPE not specified, Using 'Release'")
-elseif(NOT CMAKE_BUILD_TYPE IN_LIST scylla_build_types)
-    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}. "
-        "Following types are supported: ${scylla_build_types}")
-endif()
+  "Debug" "Release" "Dev" "Sanitize")
 string(TOUPPER "${CMAKE_BUILD_TYPE}" build_mode)
 include(mode.${build_mode})
 include(mode.common)
@@ -34,9 +26,7 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
 set(CMAKE_CXX_VISIBILITY_PRESET hidden)

 set(Seastar_TESTING ON CACHE BOOL "" FORCE)
-set(Seastar_API_LEVEL 7 CACHE STRING "" FORCE)
-set(Seastar_APPS ON CACHE BOOL "" FORCE)
-set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
+set(Seastar_API_LEVEL 6 CACHE STRING "" FORCE)
 add_subdirectory(seastar)

 # System libraries dependencies
@@ -56,8 +46,6 @@ find_package(xxHash REQUIRED)
 set(scylla_gen_build_dir "${CMAKE_BINARY_DIR}/gen")
 file(MAKE_DIRECTORY "${scylla_gen_build_dir}")

-include(add_version_library)
-generate_scylla_version()

 add_library(scylla-main STATIC)
 target_sources(scylla-main
@@ -78,6 +66,7 @@ target_sources(scylla-main
    debug.cc
    init.cc
    keys.cc
+    message/messaging_service.cc
    multishard_mutation_query.cc
    mutation_query.cc
    partition_slice_builder.cc
@@ -123,10 +112,8 @@ add_subdirectory(index)
 add_subdirectory(interface)
 add_subdirectory(lang)
 add_subdirectory(locator)
-add_subdirectory(message)
 add_subdirectory(mutation)
 add_subdirectory(mutation_writer)
-add_subdirectory(node_ops)
 add_subdirectory(readers)
 add_subdirectory(redis)
 add_subdirectory(replica)
@@ -144,6 +131,7 @@ add_subdirectory(tracing)
 add_subdirectory(transport)
 add_subdirectory(types)
 add_subdirectory(utils)
+include(add_version_library)
 add_version_library(scylla_version
    release.cc)

@@ -165,7 +153,6 @@ target_link_libraries(scylla PRIVATE
    index
    lang
    locator
-    message
    mutation
    mutation_writer
    raft
@@ -194,8 +181,35 @@ target_link_libraries(scylla PRIVATE
    seastar
    Boost::program_options)

+# Force SHA1 build-id generation
+set(default_linker_flags "-Wl,--build-id=sha1")
+include(CheckLinkerFlag)
+set(Scylla_USE_LINKER
+    ""
+    CACHE
+    STRING
+    "Use specified linker instead of the default one")
+if(Scylla_USE_LINKER)
+    set(linkers "${Scylla_USE_LINKER}")
+else()
+    set(linkers "lld" "gold")
+endif()
+
+foreach(linker ${linkers})
+    set(linker_flag "-fuse-ld=${linker}")
+    check_linker_flag(CXX ${linker_flag} "CXX_LINKER_HAVE_${linker}")
+    if(CXX_LINKER_HAVE_${linker})
+        string(APPEND default_linker_flags " ${linker_flag}")
+        break()
+    elseif(Scylla_USE_LINKER)
+        message(FATAL_ERROR "${Scylla_USE_LINKER} is not supported.")
+    endif()
+endforeach()
+
+set(CMAKE_EXE_LINKER_FLAGS "${default_linker_flags}" CACHE INTERNAL "")
+
+# TODO: patch dynamic linker to match configure.py behavior
+
 target_include_directories(scylla PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${scylla_gen_build_dir}")
-
-add_subdirectory(dist)
--- a/12
+++ b/12
@@ -7,7 +7,6 @@ Options:
  -h|--help show this help message.
  -o|--output-dir PATH specify destination path at which the version files are to be created.
  -d|--date-stamp DATE manually set date for release parameter
-  -v|--verbose also print out the version number

 By default, the script will attempt to parse 'version' file
 in the current directory, which should contain a string of
@@ -34,7 +33,6 @@ END
 )

 DATE=""
-PRINT_VERSION=false

 while [ $# -gt 0 ]; do
 	opt="$1"
@@ -53,10 +51,6 @@ while [ $# -gt 0 ]; do
 			shift
 			shift
 			;;
-		-v|--verbose)
-			PRINT_VERSION=true
-			shift
-			;;
 		*)
 			echo "Unexpected argument found: $1"
 			echo
@@ -78,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.5.0-dev
+VERSION=5.3.0-rc1

 if test -f version
 then
@@ -108,9 +102,7 @@ if [ -f "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" ]; then
 	fi
 fi

-if $PRINT_VERSION; then
-	echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
-fi
+echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p "$OUTPUT_DIR"
 echo "$SCYLLA_VERSION" > "$OUTPUT_DIR/SCYLLA-VERSION-FILE"
 echo "$SCYLLA_RELEASE" > "$OUTPUT_DIR/SCYLLA-RELEASE-FILE"
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -38,6 +38,7 @@
 #include <seastar/json/json_elements.hh>
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include "collection_mutation.hh"
+#include "db/query_context.hh"
 #include "schema/schema.hh"
 #include "db/tags/extension.hh"
 #include "db/tags/utils.hh"
@@ -59,28 +60,7 @@ logging::logger elogger("alternator-executor");

 namespace alternator {

-enum class table_status {
-    active = 0,
-    creating,
-    updating,
-    deleting
-};
-
-static sstring_view table_status_to_sstring(table_status tbl_status) {
-    switch(tbl_status) {
-        case table_status::active:
-            return "ACTIVE";
-        case table_status::creating:
-            return "CREATING";
-        case table_status::updating:
-            return "UPDATING";
-        case table_status::deleting:
-            return "DELETING";
-    }
-    return "UKNOWN";
-}
-
-static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type);
+static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type);

 static map_type attrs_type() {
    static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
@@ -211,8 +191,9 @@ static std::string lsi_name(const std::string& table_name, std::string_view inde

 /** Extract table name from a request.
 *  Most requests expect the table's name to be listed in a "TableName" field.
- *  This convenience function returns the name or api_error in case the
- *  table name is missing or not a string.
+ *  This convenience function returns the name, with appropriate validation
+ *  and api_error in case the table name is missing or not a string, or
+ *  doesn't pass validate_table_name().
 */
 static std::optional<std::string> find_table_name(const rjson::value& request) {
    const rjson::value* table_name_value = rjson::find(request, "TableName");
@@ -223,6 +204,7 @@ static std::optional<std::string> find_table_name(const rjson::value& request) {
        throw api_error::validation("Non-string TableName field in request");
    }
    std::string table_name = table_name_value->GetString();
+    validate_table_name(table_name);
    return table_name;
 }

@@ -249,10 +231,6 @@ schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::valu
    try {
        return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(*table_name), *table_name);
    } catch(data_dictionary::no_such_column_family&) {
-        // DynamoDB returns validation error even when table does not exist
-        // and the table name is invalid.
-        validate_table_name(table_name.value());
-
        throw api_error::resource_not_found(
                format("Requested resource not found: Table: {} not found", *table_name));
    }
@@ -303,10 +281,6 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
        try {
            return { proxy.data_dictionary().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
        } catch (data_dictionary::no_such_column_family&) {
-            // DynamoDB returns validation error even when table does not exist
-            // and the table name is invalid.
-            validate_table_name(table_name);
-
            throw api_error::resource_not_found(
                format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
        }
@@ -442,91 +416,6 @@ static rjson::value generate_arn_for_index(const schema& schema, std::string_vie
        schema.ks_name(), schema.cf_name(), index_name));
 }

-static rjson::value fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy const& proxy)
-{
-    rjson::value table_description = rjson::empty_object();
-    rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
-    // FIXME: take the tables creation time, not the current time!
-    size_t creation_date_seconds = std::chrono::duration_cast<std::chrono::seconds>(gc_clock::now().time_since_epoch()).count();
-    // FIXME: In DynamoDB the CreateTable implementation is asynchronous, and
-    // the table may be in "Creating" state until creating is finished.
-    // We don't currently do this in Alternator - instead CreateTable waits
-    // until the table is really available. So/ DescribeTable returns either
-    // ACTIVE or doesn't exist at all (and DescribeTable returns an error).
-    // The states CREATING and UPDATING are not currently returned.
-    rjson::add(table_description, "TableStatus", rjson::from_string(table_status_to_sstring(tbl_status)));
-    rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
-    rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
-    // FIXME: Instead of hardcoding, we should take into account which mode was chosen
-    // when the table was created. But, Spark jobs expect something to be returned
-    // and PAY_PER_REQUEST seems closer to reality than PROVISIONED.
-    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
-    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
-    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
-    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
-    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
-    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
-    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
-    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
-
-   
-
-    data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
-    
-    if (tbl_status != table_status::deleting) {
-        rjson::add(table_description, "CreationDateTime", rjson::value(creation_date_seconds));
-        std::unordered_map<std::string,std::string> key_attribute_types;
-        // Add base table's KeySchema and collect types for AttributeDefinitions:
-        executor::describe_key_schema(table_description, *schema, key_attribute_types);
-        if (!t.views().empty()) {
-            rjson::value gsi_array = rjson::empty_array();
-            rjson::value lsi_array = rjson::empty_array();
-            for (const view_ptr& vptr : t.views()) {
-                rjson::value view_entry = rjson::empty_object();
-                const sstring& cf_name = vptr->cf_name();
-                size_t delim_it = cf_name.find(':');
-                if (delim_it == sstring::npos) {
-                    elogger.error("Invalid internal index table name: {}", cf_name);
-                    continue;
-                }
-                sstring index_name = cf_name.substr(delim_it + 1);
-                rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
-                rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
-                // Add indexes's KeySchema and collect types for AttributeDefinitions:
-                executor::describe_key_schema(view_entry, *vptr, key_attribute_types);
-                // Add projection type
-                rjson::value projection = rjson::empty_object();
-                rjson::add(projection, "ProjectionType", "ALL");
-                // FIXME: we have to get ProjectionType from the schema when it is added
-                rjson::add(view_entry, "Projection", std::move(projection));
-                // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
-                rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
-                rjson::push_back(index_array, std::move(view_entry));
-            }
-            if (!lsi_array.Empty()) {
-                rjson::add(table_description, "LocalSecondaryIndexes", std::move(lsi_array));
-            }
-            if (!gsi_array.Empty()) {
-                rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
-            }
-        }
-        // Use map built by describe_key_schema() for base and indexes to produce
-        // AttributeDefinitions for all key columns:
-        rjson::value attribute_definitions = rjson::empty_array();
-        for (auto& type : key_attribute_types) {
-            rjson::value key = rjson::empty_object();
-            rjson::add(key, "AttributeName", rjson::from_string(type.first));
-            rjson::add(key, "AttributeType", rjson::from_string(type.second));
-            rjson::push_back(attribute_definitions, std::move(key));
-        }
-        rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
-    }
-    executor::supplement_table_stream_info(table_description, *schema, proxy);
-
-    // FIXME: still missing some response fields (issue #5026)
-    return table_description;
-}
-
 bool is_alternator_keyspace(const sstring& ks_name) {
    return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
 }
@@ -543,7 +432,85 @@ future<executor::request_return_type> executor::describe_table(client_state& cli

    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

-    rjson::value table_description = fill_table_description(schema, table_status::active, _proxy);
+    rjson::value table_description = rjson::empty_object();
+    rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
+    // FIXME: take the tables creation time, not the current time!
+    size_t creation_date_seconds = std::chrono::duration_cast<std::chrono::seconds>(gc_clock::now().time_since_epoch()).count();
+    rjson::add(table_description, "CreationDateTime", rjson::value(creation_date_seconds));
+    // FIXME: In DynamoDB the CreateTable implementation is asynchronous, and
+    // the table may be in "Creating" state until creating is finished.
+    // We don't currently do this in Alternator - instead CreateTable waits
+    // until the table is really available. So DescribeTable returns either
+    // ACTIVE or doesn't exist at all (and DescribeTable returns an error).
+    // The other states (CREATING, UPDATING, DELETING) are not currently
+    // returned.
+    rjson::add(table_description, "TableStatus", "ACTIVE");
+    rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
+    rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
+    // FIXME: Instead of hardcoding, we should take into account which mode was chosen
+    // when the table was created. But, Spark jobs expect something to be returned
+    // and PAY_PER_REQUEST seems closer to reality than PROVISIONED.
+    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
+    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
+    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
+
+    std::unordered_map<std::string,std::string> key_attribute_types;
+    // Add base table's KeySchema and collect types for AttributeDefinitions:
+    describe_key_schema(table_description, *schema, key_attribute_types);
+
+    data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
+    if (!t.views().empty()) {
+        rjson::value gsi_array = rjson::empty_array();
+        rjson::value lsi_array = rjson::empty_array();
+        for (const view_ptr& vptr : t.views()) {
+            rjson::value view_entry = rjson::empty_object();
+            const sstring& cf_name = vptr->cf_name();
+            size_t delim_it = cf_name.find(':');
+            if (delim_it == sstring::npos) {
+                elogger.error("Invalid internal index table name: {}", cf_name);
+                continue;
+            }
+            sstring index_name = cf_name.substr(delim_it + 1);
+            rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
+            rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
+            // Add the index's KeySchema and collect types for AttributeDefinitions:
+            describe_key_schema(view_entry, *vptr, key_attribute_types);
+            // Add projection type
+            rjson::value projection = rjson::empty_object();
+            rjson::add(projection, "ProjectionType", "ALL");
+            // FIXME: we have to get ProjectionType from the schema when it is added
+            rjson::add(view_entry, "Projection", std::move(projection));
+            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
+            rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
+            rjson::push_back(index_array, std::move(view_entry));
+        }
+        if (!lsi_array.Empty()) {
+            rjson::add(table_description, "LocalSecondaryIndexes", std::move(lsi_array));
+        }
+        if (!gsi_array.Empty()) {
+            rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
+        }
+    }
+    // Use map built by describe_key_schema() for base and indexes to produce
+    // AttributeDefinitions for all key columns:
+    rjson::value attribute_definitions = rjson::empty_array();
+    for (auto& type : key_attribute_types) {
+        rjson::value key = rjson::empty_object();
+        rjson::add(key, "AttributeName", rjson::from_string(type.first));
+        rjson::add(key, "AttributeType", rjson::from_string(type.second));
+        rjson::push_back(attribute_definitions, std::move(key));
+    }
+    rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
+
+    supplement_table_stream_info(table_description, *schema, _proxy);
+    
+    // FIXME: still missing some response fields (issue #5026)
+
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
@@ -555,17 +522,10 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
    elogger.trace("Deleting table {}", request);

    std::string table_name = get_table_name(request);
-    // DynamoDB returns validation error even when table does not exist
-    // and the table name is invalid.
-    validate_table_name(table_name);
-
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    tracing::add_table_name(trace_state, keyspace_name, table_name);
    auto& p = _proxy.container();

-    schema_ptr schema = get_table(_proxy, request);
-    rjson::value table_description = fill_table_description(schema, table_status::deleting, _proxy);
-
    co_await _mm.container().invoke_on(0, [&] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
@@ -575,14 +535,18 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
            throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
        }

-        auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
-        auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy.local_db(), keyspace_name, group0_guard.write_timestamp());
+        auto m = co_await mm.prepare_column_family_drop_announcement(keyspace_name, table_name, group0_guard.write_timestamp(), service::migration_manager::drop_views::yes);
+        auto m2 = co_await mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());

        std::move(m2.begin(), m2.end(), std::back_inserter(m));

-        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: delete {} table", table_name));
+        co_await mm.announce(std::move(m), std::move(group0_guard));
    });

+    // FIXME: need more attributes?
+    rjson::value table_description = rjson::empty_object();
+    rjson::add(table_description, "TableName", rjson::from_string(table_name));
+    rjson::add(table_description, "TableStatus", "DELETING");
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableDescription", std::move(table_description));
    elogger.trace("returning {}", response);
@@ -867,6 +831,17 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
 }

+static future<> wait_for_schema_agreement(service::migration_manager& mm, db::timeout_clock::time_point deadline) {
+    return do_until([&mm, deadline] {
+        if (db::timeout_clock::now() > deadline) {
+            throw std::runtime_error("Unable to reach schema agreement");
+        }
+        return mm.have_schema_agreement();
+    }, [] {
+        return seastar::sleep(500ms);
+    });
+}
+
 static void verify_billing_mode(const rjson::value& request) {
        // Alternator does not yet support billing or throughput limitations, but
    // let's verify that BillingMode is at least legal.
@@ -884,38 +859,6 @@ static void verify_billing_mode(const rjson::value& request) {
    }
 }

-// Validate that a AttributeDefinitions parameter in CreateTable is valid, and
-// throws user-facing api_error::validation if it's not.
-// In particular, verify that the same AttributeName doesn't appear more than
-// once (Issue #13870).
-static void validate_attribute_definitions(const rjson::value& attribute_definitions){
-    if (!attribute_definitions.IsArray()) {
-        throw api_error::validation("AttributeDefinitions must be an array");
-    }
-    std::unordered_set<std::string> seen_attribute_names;
-    for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
-        const rjson::value* attribute_name = rjson::find(*it, "AttributeName");
-        if (!attribute_name) {
-            throw api_error::validation("AttributeName missing in AttributeDefinitions");
-        }
-        if (!attribute_name->IsString()) {
-            throw api_error::validation("AttributeName in AttributeDefinitions must be a string");
-        }
-        auto [it2, added] = seen_attribute_names.emplace(rjson::to_string_view(*attribute_name));
-        if (!added) {
-            throw api_error::validation(format("Duplicate AttributeName={} in AttributeDefinitions",
-                rjson::to_string_view(*attribute_name)));
-        }
-        const rjson::value* attribute_type = rjson::find(*it, "AttributeType");
-        if (!attribute_type) {
-            throw api_error::validation("AttributeType missing in AttributeDefinitions");
-        }
-        if (!attribute_type->IsString()) {
-            throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
-        }
-    }
-}
-
 static future<executor::request_return_type> create_table_on_shard0(tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
    assert(this_shard_id() == 0);

@@ -924,14 +867,11 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    // (e.g., verify that this table doesn't already exist) - we can only
    // do this further down - after taking group0_guard.
    std::string table_name = get_table_name(request);
-    validate_table_name(table_name);
-
    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
        co_return api_error::validation(format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
    }
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value& attribute_definitions = request["AttributeDefinitions"];
-    validate_attribute_definitions(attribute_definitions);

    tracing::add_table_name(trace_state, keyspace_name, table_name);

@@ -1121,9 +1061,8 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    auto group0_guard = co_await mm.start_group0_operation();
    auto ts = group0_guard.write_timestamp();
    std::vector<mutation> schema_mutations;
-    auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts);
    try {
-        schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
+        schema_mutations = co_await create_keyspace(keyspace_name, sp, mm, gossiper, ts);
    } catch (exceptions::already_exists_exception&) {
        if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
            co_return api_error::resource_in_use(format("Table {} already exists", table_name));
@@ -1133,14 +1072,22 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
        // This should never happen, the ID is supposed to be unique
        co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
    }
-    co_await service::prepare_new_column_family_announcement(schema_mutations, sp, *ksm, schema, ts);
+    db::schema_tables::add_table_or_view_to_schema_mutation(schema, ts, true, schema_mutations);
+    // we must call before_create_column_family callbacks - which allow
+    // listeners to modify our schema_mutations. For example, CDC may add
+    // another table (the CDC log table) to the same keyspace.
+    // Unfortunately the convention is that this callback must be run in
+    // a Seastar thread.
+    co_await seastar::async([&] {
+        mm.get_notifier().before_create_column_family(*schema, schema_mutations, ts);
+    });
    for (schema_builder& view_builder : view_builders) {
        db::schema_tables::add_table_or_view_to_schema_mutation(
            view_ptr(view_builder.build()), ts, true, schema_mutations);
    }
-    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), format("alternator-executor: create {} table", table_name));
+    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard));

-    co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
+    co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, sp);
    rjson::add(status, "TableDescription", std::move(request));
@@ -1203,11 +1150,11 @@ future<executor::request_return_type> executor::update_table(client_state& clien

        auto schema = builder.build();

-        auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, false,  std::vector<view_ptr>(), group0_guard.write_timestamp());
+        auto m = co_await mm.prepare_column_family_update_announcement(schema, false,  std::vector<view_ptr>(), group0_guard.write_timestamp());

-        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
+        co_await mm.announce(std::move(m), std::move(group0_guard));

-        co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+        co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);

        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
@@ -1643,7 +1590,7 @@ static parsed::condition_expression get_parsed_condition_expression(rjson::value
        throw api_error::validation("ConditionExpression must not be empty");
    }
    try {
-        return parse_condition_expression(rjson::to_string_view(*condition_expression), "ConditionExpression");
+        return parse_condition_expression(rjson::to_string_view(*condition_expression));
    } catch(expressions_syntax_error& e) {
        throw api_error::validation(e.what());
    }
@@ -1658,16 +1605,17 @@ static bool check_needs_read_before_write(const parsed::condition_expression& co

 // Fail the expression if it has unused attribute names or values. This is
 // how DynamoDB behaves, so we do too.
-static void verify_all_are_used(const rjson::value* field,
-        const std::unordered_set<std::string>& used, const char* field_name, const char* operation) {
-    if (!field) {
+static void verify_all_are_used(const rjson::value& req, const char* field,
+        const std::unordered_set<std::string>& used, const char* operation) {
+    const rjson::value* attribute_names = rjson::find(req, field);
+    if (!attribute_names) {
        return;
    }
-    for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
+    for (auto it = attribute_names->MemberBegin(); it != attribute_names->MemberEnd(); ++it) {
        if (!used.contains(it->name.GetString())) {
            throw api_error::validation(
                format("{} has spurious '{}', not used in {}",
-                    field_name, it->name.GetString(), operation));
+                       field, it->name.GetString(), operation));
        }
    }
 }
@@ -1694,8 +1642,8 @@ public:
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
-            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "PutItem");
-            verify_all_are_used(expression_attribute_values, used_attribute_values,"ExpressionAttributeValues", "PutItem");
+            verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "PutItem");
+            verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "PutItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -1779,8 +1727,8 @@ public:
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
-            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "DeleteItem");
-            verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "DeleteItem");
+            verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "DeleteItem");
+            verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "DeleteItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -2553,8 +2501,8 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso
            expression_attribute_names, expression_attribute_values,
            used_attribute_names, used_attribute_values);

-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "UpdateItem");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "UpdateItem");
+    verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "UpdateItem");
+    verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "UpdateItem");

    // DynamoDB forbids having both old-style AttributeUpdates or Expected
    // and new-style UpdateExpression or ConditionExpression in the same request
@@ -3163,8 +3111,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st

    std::unordered_set<std::string> used_attribute_names;
    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "GetItem");

    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
@@ -3275,8 +3222,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
        rs.cl = get_read_consistency(it->value);
        std::unordered_set<std::string> used_attribute_names;
        rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, used_attribute_names));
-        const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames");
-        verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem");
+        verify_all_are_used(it->value, "ExpressionAttributeNames", used_attribute_names, "GetItem"); // names are per-table, not top-level
        auto& keys = (it->value)["Keys"];
        for (rjson::value& key : keys.GetArray()) {
            rs.add(key);
@@ -3445,7 +3391,7 @@ filter::filter(const rjson::value& request, request_type rt,
            throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet");
        }
        try {
-            auto parsed = parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression");
+            auto parsed = parse_condition_expression(rjson::to_string_view(*expression));
            const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
            const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
            resolve_condition_expression(parsed,
@@ -3849,10 +3795,8 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
    // optimized the filtering by modifying partition_ranges and/or
    // ck_bounds. We haven't done this optimization yet.

-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan");
+    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan");

    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
@@ -4073,7 +4017,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
    // sort-key range.
    parsed::condition_expression p;
    try {
-        p = parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression");
+        p = parse_condition_expression(rjson::to_string_view(expression));
    } catch(expressions_syntax_error& e) {
        throw api_error::validation(e.what());
    }
@@ -4293,17 +4237,13 @@ future<executor::request_return_type> executor::query(client_state& client_state
        throw api_error::validation("Query must have one of "
                "KeyConditions or KeyConditionExpression");
    }
-
-    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
-    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
-
    // exactly one of key_conditions or key_condition_expression
    auto [partition_ranges, ck_bounds] = key_conditions
                ? calculate_bounds_conditions(schema, *key_conditions)
                : calculate_bounds_condition_expression(schema, *key_condition_expression,
-                        expression_attribute_values,
+                        rjson::find(request, "ExpressionAttributeValues"),
                        used_attribute_values,
-                        expression_attribute_names,
+                        rjson::find(request, "ExpressionAttributeNames"),
                        used_attribute_names);

    filter filter(request, filter::request_type::QUERY,
@@ -4330,8 +4270,8 @@ future<executor::request_return_type> executor::query(client_state& client_state
    select_type select = parse_select(request, table_type);

    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names, select);
-    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
-    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
+    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Query");
+    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query");
    query::partition_slice::option_set opts;
    opts.set_if<query::partition_slice::option::reversed>(!forward);
    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
@@ -4392,17 +4332,6 @@ future<executor::request_return_type> executor::list_tables(client_state& client

 future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
    _stats.api_operations.describe_endpoints++;
-    // The alternator_describe_endpoints configuration can be used to disable
-    // the DescribeEndpoints operation, or set it to return a fixed string
-    std::string override = _proxy.data_dictionary().get_config().alternator_describe_endpoints();
-    if (!override.empty()) {
-        if (override == "disabled") {
-            _stats.unsupported_operations++;
-            return make_ready_future<request_return_type>(api_error::unknown_operation(
-                "DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)"));
-        }
-        host_header = std::move(override);
-    }
    rjson::value response = rjson::empty_object();
    // Without having any configuration parameter to say otherwise, we tell
    // the user to return to the same endpoint they used to reach us. The only
@@ -4440,10 +4369,6 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
    try {
        schema = _proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
    } catch(data_dictionary::no_such_column_family&) {
-        // DynamoDB returns validation error even when table does not exist
-        // and the table name is invalid.
-        validate_table_name(table_name);
-
        throw api_error::table_not_found(
                format("Table {} not found", table_name));
    }
@@ -4457,23 +4382,25 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
    co_return make_jsonable(std::move(response));
 }

-// Create the metadata for the keyspace in which we put the alternator
-// table if it doesn't already exist.
+// Create the keyspace in which we put the alternator table, if it doesn't
+// already exist.
 // Currently, we automatically configure the keyspace based on the number
 // of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
 // A smaller cluster (presumably, a test only), gets RF=1. The user may
 // manually create the keyspace to override this predefined behavior.
-static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts) {
-    int endpoint_count = gossiper.num_endpoints();
+static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type ts) {
+    sstring keyspace_name_str(keyspace_name);
+    int endpoint_count = gossiper.get_endpoint_states().size();
    int rf = 3;
    if (endpoint_count < rf) {
        rf = 1;
        elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
-                keyspace_name, rf, endpoint_count);
+                keyspace_name_str, rf, endpoint_count);
    }
    auto opts = get_network_topology_options(sp, gossiper, rf);
+    auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);

-    return keyspace_metadata::new_keyspace(keyspace_name, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
+    co_return mm.prepare_new_keyspace_announcement(ksm, ts);
 }

 future<> executor::start() {
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -225,10 +225,9 @@ private:
    friend class rmw_operation;

    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
+    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
    
 public:
-    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
-
    static std::optional<rjson::value> describe_single_item(schema_ptr,
        const query::partition_slice&,
        const cql3::selection::selection&,
@@ -249,7 +248,7 @@ public:

    static void add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
    static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
-    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
+    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
 };

 // is_big() checks approximately if the given JSON value is "bigger" than
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -29,7 +29,7 @@
 namespace alternator {

 template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
-static Result do_with_parser(std::string_view input, Func&& f) {
+Result do_with_parser(std::string_view input, Func&& f) {
    expressionsLexer::InputStreamType input_stream{
        reinterpret_cast<const ANTLR_UINT8*>(input.data()),
        ANTLR_ENC_UTF8,
@@ -43,41 +43,31 @@ static Result do_with_parser(std::string_view input, Func&& f) {
    return result;
 }

-template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
-static Result parse(const char* input_name, std::string_view input, Func&& f) {
-    if (input.length() > 4096) {
-        throw expressions_syntax_error(format("{} expression size {} exceeds allowed maximum 4096.",
-            input_name, input.length()));
-    }
-    try {
-        return do_with_parser(input, f);
-    } catch (expressions_syntax_error& e) {
-        // If already an expressions_syntax_error, don't print the type's
-        // name (it's just ugly), just the message.
-        // TODO: displayRecognitionError could set a position inside the
-        // expressions_syntax_error in throws, and we could use it here to
-        // mark the broken position in 'input'.
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
-            input_name, input, e.what()));
-    } catch (...) {
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
-            input_name, input, std::current_exception()));
-    }
-}
-
 parsed::update_expression
 parse_update_expression(std::string_view query) {
-    return parse("UpdateExpression", query,  std::mem_fn(&expressionsParser::update_expression));
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::update_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing UpdateExpression '{}': {}", query, std::current_exception()));
+    }
 }

 std::vector<parsed::path>
 parse_projection_expression(std::string_view query) {
-    return parse ("ProjectionExpression", query,  std::mem_fn(&expressionsParser::projection_expression));
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::projection_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ProjectionExpression '{}': {}", query, std::current_exception()));
+    }
 }

 parsed::condition_expression
-parse_condition_expression(std::string_view query, const char* caller) {
-    return parse(caller, query,  std::mem_fn(&expressionsParser::condition_expression));
+parse_condition_expression(std::string_view query) {
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
 }

 namespace parsed {
@@ -428,14 +418,9 @@ void for_condition_expression_on(const parsed::condition_expression& ce, const n
 // calculate_size() is ConditionExpression's size() function, i.e., it takes
 // a JSON-encoded value and returns its "size" as defined differently for the
 // different types - also as a JSON-encoded number.
-// If the value's type (e.g. number) has no size defined, there are two cases:
-// 1. If from_data (the value came directly from an attribute of the data),
-//    It returns a JSON-encoded "null" value. Comparisons against this
-//    non-numeric value will later fail, so eventually the application will
-//    get a ConditionalCheckFailedException.
-// 2. Otherwise (the value came from a constant in the query or some other
-//    calculation), throw a ValidationException.
-static rjson::value calculate_size(const rjson::value& v, bool from_data) {
+// It returns a JSON-encoded "null" value if this value's type has no size
+// defined. Comparisons against this non-numeric value will later fail.
+static rjson::value calculate_size(const rjson::value& v) {
    // NOTE: If v is improperly formatted for our JSON value encoding, it
    // must come from the request itself, not from the database, so it makes
    // sense to throw a ValidationException if we see such a problem.
@@ -464,12 +449,10 @@ static rjson::value calculate_size(const rjson::value& v, bool from_data) {
            throw api_error::validation(format("invalid byte string: {}", v));
        }
        ret = base64_decoded_len(rjson::to_string_view(it->value));
-    } else if (from_data) {
+    } else {
        rjson::value json_ret = rjson::empty_object();
        rjson::add(json_ret, "null", rjson::value(true));
        return json_ret;
-    } else {
-        throw api_error::validation(format("Unsupported operand type {} for function size()", it->name));
    }
    rjson::value json_ret = rjson::empty_object();
    rjson::add(json_ret, "N", rjson::from_string(std::to_string(ret)));
@@ -551,7 +534,7 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                        format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
-            return calculate_size(v, f._parameters[0].is_path());
+            return calculate_size(v);
        }
    },
    {"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -679,7 +662,7 @@ static rjson::value extract_path(const rjson::value* item,
            // objects. But today Alternator does not validate the structure
            // of nested documents before storing them, so this can happen on
            // read.
-            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
        }
        const char* type = v->MemberBegin()->name.GetString();
        v = &(v->MemberBegin()->value);
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -74,22 +74,7 @@ options {
 */
@parser::context {
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
-        const char* err;
-        switch (ex->getType()) {
-        case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
-            err = "expression nested too deeply";
-            break;
-        default:
-            err = "syntax error";
-            break;
-        }
-        // Alternator expressions are always single line so ex->get_line()
-        // is always 1, no sense to print it.
-        // TODO: return the position as part of the exception, so the
-        // caller in expressions.cc that knows the expression string can
-        // mark the error position in the final error message.
-        throw expressions_syntax_error(format("{} at char {}", err,
-            ex->get_charPositionInLine()));
+        throw expressions_syntax_error("syntax error");
    }
 }
@lexer::context {
@@ -98,23 +83,6 @@ options {
    }
 }

-/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
- * expressions. To make things even worse, ANTLR has no way to limit the
- * depth of this recursion (unlike Yacc which has YYMAXDEPTH). So deeply-
- * nested expression like "(((((((((((((..." can easily crash Scylla on a
- * stack overflow (see issue #14477).
- *
- * We are lucky that in the grammar for DynamoDB expressions (below),
- * only a few specific rules can recurse, so it was fairly easy to add a
- * "depth" counter to a few specific rules, and then use a predicate
- * "{depth<MAX_DEPTH}?" to avoid parsing if the depth exceeds this limit,
- * and throw a FAILED_PREDICATE_EXCEPTION in that case, which we will
- * report to the user as a "expression nested too deeply" error.
- */
-@parser::members {
-    static constexpr int MAX_DEPTH = 400;
-}
-
 /*
 * Lexical analysis phase, i.e., splitting the input up to tokens.
 * Lexical analyzer rules have names starting in capital letters.
@@ -187,20 +155,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-/* See comment above why the "depth" counter was needed here */
-value[int depth] returns [parsed::value v]:
+value returns [parsed::value v]:
      VALREF       { $v.set_valref($VALREF.text); }
    | path         { $v.set_path($path.p); }
-    | {depth<MAX_DEPTH}? NAME { $v.set_func_name($NAME.text); }
-     '(' x=value[depth+1]    { $v.add_func_parameter($x.v); }
-     (',' x=value[depth+1]   { $v.add_func_parameter($x.v); })*
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=value[0]  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=value[0]  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=value[0]  { $rhs.set_minus(std::move($v.v)); }
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -238,7 +205,7 @@ projection_expression returns [std::vector<parsed::path> v]:


 primitive_condition returns [parsed::primitive_condition c]:
-      v=value[0]      { $c.add_value(std::move($v.v));
+      v=value         { $c.add_value(std::move($v.v));
                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
@@ -247,14 +214,14 @@ primitive_condition returns [parsed::primitive_condition c]:
          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
         )
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
         AND
-         v=value[0]   { $c.add_value(std::move($v.v)); }
+         v=value      { $c.add_value(std::move($v.v)); }
       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
-         v=value[0]   { $c.add_value(std::move($v.v)); }
-         (',' v=value[0] { $c.add_value(std::move($v.v)); })*
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
         ')'
      )?
    ;
@@ -264,20 +231,19 @@ primitive_condition returns [parsed::primitive_condition c]:
 // common rule prefixes, and (lack of) support for operator precedence.
 // These rules could have been written more clearly using a more powerful
 // parser generator - such as Yacc.
-// See comment above why the "depth" counter was needed here.
-boolean_expression[int depth] returns [parsed::condition_expression e]:
-	  b=boolean_expression_1[depth]       { $e.append(std::move($b.e), '|'); }
-	  (OR b=boolean_expression_1[depth]   { $e.append(std::move($b.e), '|'); } )*
+boolean_expression returns [parsed::condition_expression e]:
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
 	;
-boolean_expression_1[int depth] returns [parsed::condition_expression e]:
-	  b=boolean_expression_2[depth]       { $e.append(std::move($b.e), '&'); }
-	  (AND b=boolean_expression_2[depth]  { $e.append(std::move($b.e), '&'); } )*
+boolean_expression_1 returns [parsed::condition_expression e]:
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
 	;
-boolean_expression_2[int depth] returns [parsed::condition_expression e]:
+boolean_expression_2 returns [parsed::condition_expression e]:
 	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
-	| {depth<MAX_DEPTH}? NOT b=boolean_expression_2[depth+1]   { $e = std::move($b.e); $e.apply_not(); }
-	| {depth<MAX_DEPTH}? '(' b=boolean_expression[depth+1] ')' { $e = std::move($b.e); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
    ;

 condition_expression returns [parsed::condition_expression e]:
-    boolean_expression[0] { e=std::move($boolean_expression.e); } EOF;
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -28,7 +28,7 @@ public:

 parsed::update_expression parse_update_expression(std::string_view query);
 std::vector<parsed::path> parse_projection_expression(std::string_view query);
-parsed::condition_expression parse_condition_expression(std::string_view query, const char* caller);
+parsed::condition_expression parse_condition_expression(std::string_view query);

 void resolve_update_expression(parsed::update_expression& ue,
        const rjson::value* expression_attribute_names,
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -424,7 +424,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    co_await client_state.maybe_update_per_service_level_params();

    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
-    tracing::trace(trace_state, "{}", op);
+    tracing::trace(trace_state, op);
    rjson::value json_request = co_await _json_parser.parse(std::move(content));
    co_return co_await callback_it->second(_executor, client_state, trace_state,
            make_service_permit(std::move(units)), std::move(json_request), std::move(req));
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -1096,7 +1096,7 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
    }
 }

-void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
+void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
    auto& opts = schema.cdc_options();
    if (opts.enabled()) {
        auto db = sp.data_dictionary();
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -430,7 +430,6 @@ class token_ranges_owned_by_this_shard {
    size_t _range_idx;
    size_t _end_idx;
    std::optional<dht::selective_token_range_sharder> _intersecter;
-    locator::effective_replication_map_ptr _erm;
 public:
    token_ranges_owned_by_this_shard(replica::database& db, gms::gossiper& g, schema_ptr s)
        :  _s(s)
@@ -438,7 +437,6 @@ public:
                g, utils::fb_utilities::get_broadcast_address())
        , _range_idx(random_offset(0, _token_ranges.size() - 1))
        , _end_idx(_range_idx + _token_ranges.size())
-        , _erm(s->table().get_effective_replication_map())
    {
        tlogger.debug("Generating token ranges starting from base range {} of {}", _range_idx, _token_ranges.size());
    }
@@ -471,7 +469,7 @@ public:
                    return std::nullopt;
                }
            }
-            _intersecter.emplace(_erm->get_sharder(*_s), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
+            _intersecter.emplace(_s->get_sharder(), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
        }
    }

--- a/api/CMakeLists.txt
+++ b/api/CMakeLists.txt
@@ -14,7 +14,6 @@ set(swagger_files
  api-doc/hinted_handoff.json
  api-doc/lsa.json
  api-doc/messaging_service.json
-  api-doc/metrics.json
  api-doc/storage_proxy.json
  api-doc/storage_service.json
  api-doc/stream_manager.json
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -34,14 +34,6 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"parameters",
-                     "description":"dict of parameters to pass to the injection (json format)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"dict",
-                     "paramType":"body"
                  }
               ]
            },
@@ -66,30 +58,6 @@
            }
         ]
      },
-      {
-         "path":"/v2/error_injection/injection/{injection}/message",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Send message to trigger an event in injection's code",
-               "type":"void",
-               "nickname":"message_injection",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"injection",
-                     "description":"injection name, should correspond to an injection added in code",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
      {
         "path":"/v2/error_injection/injection",
         "operations":[
@@ -118,15 +86,5 @@
            }
         ]
      }
-   ],
-   "components":{
-      "schemas": {
-         "dict": {
-            "type": "object",
-            "additionalProperties": {
-               "type": "string"
-            }
-         }
-      }
-   }
+   ]
 }
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -245,7 +245,7 @@
                 "GOSSIP_SHUTDOWN",
                 "DEFINITIONS_UPDATE",
                 "TRUNCATE",
-                 "UNUSED__REPLICATION_FINISHED",
+                 "REPLICATION_FINISHED",
                 "MIGRATION_REQUEST",
                 "PREPARE_MESSAGE",
                 "PREPARE_DONE_MESSAGE",
--- a/api/api-doc/metrics.def.json
+++ b/api/api-doc/metrics.def.json
@@ -1,34 +0,0 @@
-    "metrics_config": {
-        "id": "metrics_config",
-        "summary": "An entry in the metrics configuration",
-        "properties": {
-            "source_labels": {
-                "type": "array",
-                "items": {
-                    "type": "string"
-                },
-                "description": "The source labels, a match is based on concatination of the labels"
-            },
-            "action": {
-                "type": "string",
-                "description": "The action to perfrom on match",
-                "enum": ["skip_when_empty", "report_when_empty", "replace", "keep", "drop", "drop_label"]
-            },
-            "target_label": {
-                "type": "string",
-                "description": "The application state version"
-            },
-            "replacement": {
-                "type": "string",
-                "description": "The replacement string to use when replacing a value"
-            },
-            "regex": {
-                "type": "string",
-                "description": "The regex string to use when replacing a value"
-            },
-            "separator": {
-                "type": "string",
-                "description": "The separator string to use when concatinating the labels"
-            }
-        }
-    }
--- a/api/api-doc/metrics.json
+++ b/api/api-doc/metrics.json
@@ -1,66 +0,0 @@
-    "/v2/metrics-config/":{
-        "get":{
-            "description":"Return the metrics layer configuration",
-            "operationId":"get_metrics_config",
-            "produces":[
-                "application/json"
-            ],
-            "tags":[
-                "metrics"
-            ],
-            "parameters":[
-            ],
-            "responses":{
-                "200":{
-                "schema": {
-                    "type":"array",
-                    "items":{
-                        "$ref":"#/definitions/metrics_config",
-                        "description":"metrics Config value"
-                    }
-                    }
-                },
-                "default":{
-                    "description":"unexpected error",
-                    "schema":{
-                        "$ref":"#/definitions/ErrorModel"
-                    }
-                }
-            }
-        },
-        "post": {
-             "description":"Set the metrics layer relabel configuration",
-            "operationId":"set_metrics_config",
-            "produces":[
-                "application/json"
-            ],
-            "tags":[
-                "metrics"
-            ],
-            "parameters":[
-               {
-                "in":"body",
-                "name":"conf",
-                "description":"An array of relabel_config objects",
-                "schema": {
-                    "type":"array",
-                    "items":{
-                        "$ref":"#/definitions/metrics_config",
-                        "description":"metrics Config value"
-                    }
-                }
-               }
-            ],
-            "responses":{
-                "200":{
-                    "description": "OK"
-                },
-                "default":{
-                    "description":"unexpected error",
-                    "schema":{
-                        "$ref":"#/definitions/ErrorModel"
-                    }
-                }
-            }
-        }
-    }
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -465,7 +465,7 @@
         "operations":[
            {
               "method":"GET",
-               "summary":"Retrieve the mapping of endpoint to host ID of all nodes that own tokens",
+               "summary":"Retrieve the mapping of endpoint to host ID",
               "type":"array",
               "items":{
                  "type":"mapper"
@@ -1114,14 +1114,6 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"ranges_parallelism",
-                     "description":"An integer specifying the number of ranges to repair in parallel by user request. If this number is bigger than the max_repair_ranges_in_parallel calculated by Scylla core, the smaller one will be used.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
                  }
               ]
            },
@@ -1954,7 +1946,7 @@
         "operations":[
            {
               "method":"POST",
-               "summary":"Forces this node to recalculate versions of schema objects.",
+               "summary":"Reset local schema",
               "type":"void",
               "nickname":"reset_local_schema",
               "produces":[
@@ -2495,23 +2487,7 @@
               ]
            }
         ]
-      },
-      {
-         "path":"/storage_service/raft_topology/reload",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Reload Raft topology state from disk.",
-               "type":"void",
-               "nickname":"reload_raft_topology_state",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      }
+      }      
   ],
   "models":{
      "mapper":{
--- a/api/api-doc/swagger20_header.json
+++ b/api/api-doc/swagger20_header.json
@@ -16,7 +16,7 @@
    }
  },
  "host": "{{Host}}",
-  "basePath": "/",
+  "basePath": "/v2",
  "schemes": [
    "http"
  ],
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -1,182 +1,182 @@
 {
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/task_manager",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/task_manager/list_modules",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get all modules names",
-               "type":"array",
-               "items":{
-                  "type":"string"
-               },
-               "nickname":"get_modules",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/list_module_tasks/{module}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get a list of tasks",
-               "type":"array",
-               "items":{
-                  "type":"task_stats"
-               },
-               "nickname":"get_tasks",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"module",
-                     "description":"The module to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"internal",
-                     "description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"keyspace",
-                     "description":"The keyspace to query about",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"table",
-                     "description":"The table to query about",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/task_status/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get task status",
-               "type":"task_status",
-               "nickname":"get_task_status",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/abort_task/{task_id}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Abort running task and its descendants",
-               "type":"void",
-               "nickname":"abort_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to abort",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/wait_task/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Wait for a task to complete",
-               "type":"task_status",
-               "nickname":"wait_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to wait for",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager/task_status_recursive/{task_id}",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"Get statuses of the task and all its descendants",
-               "type":"array",
-               "items":{
-                  "type":"task_status"
-               },
-               "nickname":"get_task_status_recursively",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to query about",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
+    "apiVersion":"0.0.1",
+    "swaggerVersion":"1.2",
+    "basePath":"{{Protocol}}://{{Host}}",
+    "resourcePath":"/task_manager",
+    "produces":[
+       "application/json"
+    ],
+    "apis":[
+       {
+          "path":"/task_manager/list_modules",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get all modules names",
+                "type":"array",
+                "items":{
+                   "type":"string"
+                },
+                "nickname":"get_modules",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/list_module_tasks/{module}",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get a list of tasks",
+                "type":"array",
+                "items":{
+                    "type":"task_stats"
+                },
+                "nickname":"get_tasks",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"module",
+                        "description":"The module to query about",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"path"
+                    },
+                    {
+                        "name":"internal",
+                        "description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"boolean",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"keyspace",
+                        "description":"The keyspace to query about",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"table",
+                        "description":"The table to query about",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/task_status/{task_id}",
+          "operations":[
+             {
+                "method":"GET",
+                "summary":"Get task status",
+                "type":"task_status",
+                "nickname":"get_task_status",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to query about",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"path"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager/abort_task/{task_id}",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Abort running task and its descendants",
+                "type":"void",
+                "nickname":"abort_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                   {
+                      "name":"task_id",
+                      "description":"The uuid of a task to abort",
+                      "required":true,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"path"
+                   }
+                ]
+             }
+          ]
+       },
+       {
+        "path":"/task_manager/wait_task/{task_id}",
+        "operations":[
+           {
+              "method":"GET",
+              "summary":"Wait for a task to complete",
+              "type":"task_status",
+              "nickname":"wait_task",
+              "produces":[
+                 "application/json"
+              ],
+              "parameters":[
+                 {
+                    "name":"task_id",
+                    "description":"The uuid of a task to wait for",
+                    "required":true,
+                    "allowMultiple":false,
+                    "type":"string",
+                    "paramType":"path"
+                 }
+              ]
+           }
+        ]
+     },
+     {
+      "path":"/task_manager/task_status_recursive/{task_id}",
+      "operations":[
+         {
+            "method":"GET",
+            "summary":"Get statuses of the task and all its descendants",
+            "type":"array",
+            "items":{
+               "type":"task_status"
+            },
+            "nickname":"get_task_status_recursively",
+            "produces":[
+               "application/json"
+            ],
+            "parameters":[
+                {
+                    "name":"task_id",
+                    "description":"The uuid of a task to query about",
+                    "required":true,
+                    "allowMultiple":false,
+                    "type":"string",
+                    "paramType":"path"
+                }
+            ]
+         }
+      ]
+     },
+     {
         "path":"/task_manager/ttl",
         "operations":[
            {
@@ -199,96 +199,88 @@
               ]
            }
         ]
-      }
-   ],
-   "models":{
-      "task_stats" :{
-         "id": "task_stats",
-         "description":"A task statistics object",
-         "properties":{
-            "task_id":{
-               "type":"string",
-               "description":"The uuid of a task"
-            },
-            "state":{
-               "type":"string",
-               "enum":[
+     }
+    ],
+    "models":{
+       "task_stats" :{
+           "id": "task_stats",
+           "description":"A task statistics object",
+           "properties":{
+             "task_id":{
+                "type":"string",
+                "description":"The uuid of a task"
+             },
+             "state":{
+                "type":"string",
+                "enum":[
                  "created",
                  "running",
                  "done",
                  "failed"
-               ],
-               "description":"The state of a task"
-            },
-            "type":{
-               "type":"string",
-               "description":"The description of the task"
-            },
-            "scope":{
-               "type":"string",
-               "description":"The scope of the task"
-            },
-            "keyspace":{
-               "type":"string",
-               "description":"The keyspace the task is working on (if applicable)"
-            },
-            "table":{
-               "type":"string",
-               "description":"The table the task is working on (if applicable)"
-            },
-            "entity":{
-               "type":"string",
-               "description":"Task-specific entity description"
-            },
-            "sequence_number":{
-               "type":"long",
-               "description":"The running sequence number of the task"
-            }
-         }
-      },
-      "task_status":{
-         "id":"task_status",
-         "description":"A task status object",
-         "properties":{
-            "id":{
-               "type":"string",
-               "description":"The uuid of the task"
-            },
-            "type":{
-               "type":"string",
-               "description":"The description of the task"
-            },
-            "scope":{
-               "type":"string",
-               "description":"The scope of the task"
-            },
-            "state":{
+                ],
+                "description":"The state of a task"
+             },
+             "type":{
+                "type":"string",
+                "description":"The description of the task"
+             },
+             "keyspace":{
+                "type":"string",
+                "description":"The keyspace the task is working on (if applicable)"
+             },
+             "table":{
+                "type":"string",
+                "description":"The table the task is working on (if applicable)"
+             },
+             "entity":{
+                "type":"string",
+                "description":"Task-specific entity description"
+             },
+             "sequence_number":{
+                "type":"long",
+                "description":"The running sequence number of the task"
+             }
+           }
+       },
+       "task_status":{
+          "id":"task_status",
+          "description":"A task status object",
+          "properties":{
+             "id":{
+                "type":"string",
+                "description":"The uuid of the task"
+             },
+             "type":{
+                "type":"string",
+                "description":"The description of the task"
+             },
+             "state":{
               "type":"string",
               "enum":[
-                  "created",
-                  "running",
-                  "done",
-                  "failed"
+                 "created",
+                 "running",
+                 "done",
+                 "failed"
               ],
-               "description":"The state of the task"
-            },
-            "is_abortable":{
-               "type":"boolean",
-               "description":"Boolean flag indicating whether the task can be aborted"
-            },
-            "start_time":{
-               "type":"datetime",
-               "description":"The start time of the task"
-            },
-            "end_time":{
-               "type":"datetime",
-               "description":"The end time of the task (unspecified when the task is not completed)"
-            },
-            "error":{
-               "type":"string",
-               "description":"Error string, if the task failed"
-            },
-            "parent_id":{
+                "description":"The state of the task"
+             },
+             "is_abortable":{
+                "type":"boolean",
+                "description":"Boolean flag indicating whether the task can be aborted"
+             },
+             "start_time":{
+                "type":"datetime",
+                "description":"The start time of the task"
+             },
+             "end_time":{
+                "type":"datetime",
+                "description":"The end time of the task (unspecified when the task is not completed)"
+             },
+             "error":{
+                "type":"string",
+                "description":"Error string, if the task failed"
+             },
+             "parent_id":{
               "type":"string",
               "description":"The uuid of the parent task"
            },
@@ -326,12 +318,12 @@
            },
            "children_ids":{
               "type":"array",
-               "items":{
-                  "type":"string"
-               },
+                "items":{
+                    "type":"string"
+                },
               "description":"Task IDs of children of this task"
            }
-         }
-      }
-   }
-}
+          }
+       }
+    }
+ }
--- a/api/api-doc/task_manager_test.json
+++ b/api/api-doc/task_manager_test.json
@@ -1,153 +1,153 @@
 {
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/task_manager_test",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/task_manager_test/test_module",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Register test module in task manager",
-               "type":"void",
-               "nickname":"register_test_module",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Unregister test module in task manager",
-               "type":"void",
-               "nickname":"unregister_test_module",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager_test/test_task",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Register test task",
-               "type":"string",
-               "nickname":"register_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to register",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"shard",
-                     "description":"The shard of the task",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"long",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"parent_id",
-                     "description":"The uuid of a parent task",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"keyspace",
-                     "description":"The keyspace the task is working on",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"table",
-                     "description":"The table the task is working on",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"entity",
-                     "description":"Task-specific entity description",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Unregister test task",
-               "type":"void",
-               "nickname":"unregister_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to register",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/task_manager_test/finish_test_task/{task_id}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Finish test task",
-               "type":"void",
-               "nickname":"finish_test_task",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"task_id",
-                     "description":"The uuid of a task to finish",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"error",
-                     "description":"The error with which task fails (if it does)",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      }
-   ]
-}
+    "apiVersion":"0.0.1",
+    "swaggerVersion":"1.2",
+    "basePath":"{{Protocol}}://{{Host}}",
+    "resourcePath":"/task_manager_test",
+    "produces":[
+       "application/json"
+    ],
+    "apis":[
+       {
+          "path":"/task_manager_test/test_module",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Register test module in task manager",
+                "type":"void",
+                "nickname":"register_test_module",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             },
+             {
+                "method":"DELETE",
+                "summary":"Unregister test module in task manager",
+                "type":"void",
+                "nickname":"unregister_test_module",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager_test/test_task",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Register test task",
+                "type":"string",
+                "nickname":"register_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to register",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"shard",
+                        "description":"The shard of the task",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"long",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"parent_id",
+                        "description":"The uuid of a parent task",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"keyspace",
+                        "description":"The keyspace the task is working on",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"table",
+                        "description":"The table the task is working on",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    },
+                    {
+                        "name":"entity",
+                        "description":"Task-specific entity description",
+                        "required":false,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             },
+             {
+                "method":"DELETE",
+                "summary":"Unregister test task",
+                "type":"void",
+                "nickname":"unregister_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                    {
+                        "name":"task_id",
+                        "description":"The uuid of a task to register",
+                        "required":true,
+                        "allowMultiple":false,
+                        "type":"string",
+                        "paramType":"query"
+                    }
+                ]
+             }
+          ]
+       },
+       {
+          "path":"/task_manager_test/finish_test_task/{task_id}",
+          "operations":[
+             {
+                "method":"POST",
+                "summary":"Finish test task",
+                "type":"void",
+                "nickname":"finish_test_task",
+                "produces":[
+                   "application/json"
+                ],
+                "parameters":[
+                   {
+                      "name":"task_id",
+                      "description":"The uuid of a task to finish",
+                      "required":true,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"path"
+                   },
+                   {
+                      "name":"error",
+                      "description":"The error with which task fails (if it does)",
+                      "required":false,
+                      "allowMultiple":false,
+                      "type":"string",
+                      "paramType":"query"
+                   }
+                ]
+             }
+          ]
+       }
+    ]
+ }
--- a/api/api.cc
+++ b/api/api.cc
@@ -60,10 +60,8 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        rb02->register_api_file(r, "metrics");
        rb->register_function(r, "system",
                "The system related API");
-        rb02->add_definitions_file(r, "metrics");
        set_system(ctx, r);
    });
 }
@@ -71,7 +69,7 @@ future<> set_server_init(http_context& ctx) {
 future<> set_server_config(http_context& ctx, const db::config& cfg) {
    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
    return ctx.http_server.set_routes([&ctx, &cfg, rb02](routes& r) {
-        set_config(rb02, ctx, r, cfg, false);
+        set_config(rb02, ctx, r, cfg);
    });
 }

@@ -102,16 +100,12 @@ future<> unset_rpc_controller(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_rpc_controller(ctx, r); });
 }

-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    return register_api(ctx, "storage_service", "The storage service API", [&ss, &group0_client] (http_context& ctx, routes& r) {
-            set_storage_service(ctx, r, ss, group0_client);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
+    return register_api(ctx, "storage_service", "The storage service API", [&ss, &g, &cdc_gs, &sys_ks] (http_context& ctx, routes& r) {
+            set_storage_service(ctx, r, ss, g.local(), cdc_gs, sys_ks);
        });
 }

-future<> unset_server_storage_service(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
-}
-
 future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader) {
    return ctx.http_server.set_routes([&ctx, &sst_loader] (routes& r) { set_sstables_loader(ctx, r, sst_loader); });
 }
@@ -193,10 +187,10 @@ future<> unset_server_messaging_service(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_messaging_service(ctx, r); });
 }

-future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy) {
+future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss) {
    return register_api(ctx, "storage_proxy",
-                "The storage proxy API", [&proxy] (http_context& ctx, routes& r) {
-                    set_storage_proxy(ctx, r, proxy);
+                "The storage proxy API", [&ss] (http_context& ctx, routes& r) {
+                    set_storage_proxy(ctx, r, ss);
                });
 }

@@ -220,10 +214,10 @@ future<> set_server_cache(http_context& ctx) {
            "The cache service API", set_cache_service);
 }

-future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& proxy) {
+future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g) {
    return register_api(ctx, "hinted_handoff",
-                "The hinted handoff API", [&proxy] (http_context& ctx, routes& r) {
-                    set_hinted_handoff(ctx, r, proxy);
+                "The hinted handoff API", [&g] (http_context& ctx, routes& r) {
+                    set_hinted_handoff(ctx, r, g.local());
                });
 }

@@ -270,36 +264,28 @@ future<> set_server_done(http_context& ctx) {
    });
 }

-future<> set_server_task_manager(http_context& ctx, sharded<tasks::task_manager>& tm, lw_shared_ptr<db::config> cfg) {
+future<> set_server_task_manager(http_context& ctx, lw_shared_ptr<db::config> cfg) {
    auto rb = std::make_shared < api_registry_builder > (ctx.api_doc);

-    return ctx.http_server.set_routes([rb, &ctx, &tm, &cfg = *cfg](routes& r) {
+    return ctx.http_server.set_routes([rb, &ctx, &cfg = *cfg](routes& r) {
        rb->register_function(r, "task_manager",
                "The task manager API");
-        set_task_manager(ctx, r, tm, cfg);
+        set_task_manager(ctx, r, cfg);
    });
 }

-future<> unset_server_task_manager(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_task_manager(ctx, r); });
-}
-
 #ifndef SCYLLA_BUILD_MODE_RELEASE

-future<> set_server_task_manager_test(http_context& ctx, sharded<tasks::task_manager>& tm) {
+future<> set_server_task_manager_test(http_context& ctx) {
    auto rb = std::make_shared < api_registry_builder > (ctx.api_doc);

-    return ctx.http_server.set_routes([rb, &ctx, &tm](routes& r) mutable {
+    return ctx.http_server.set_routes([rb, &ctx](routes& r) mutable {
        rb->register_function(r, "task_manager_test",
                "The task manager test API");
-        set_task_manager_test(ctx, r, tm);
+        set_task_manager_test(ctx, r);
    });
 }

-future<> unset_server_task_manager_test(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_task_manager_test(ctx, r); });
-}
-
 #endif

 void req_params::process(const request& req) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -22,7 +22,6 @@ namespace service {
 class load_meter;
 class storage_proxy;
 class storage_service;
-class raft_group0_client;

 } // namespace service

@@ -52,6 +51,7 @@ class system_keyspace;
 }
 namespace netw { class messaging_service; }
 class repair_service;
+namespace cdc { class generation_service; }

 namespace gms {

@@ -61,10 +61,6 @@ class gossiper;

 namespace auth { class service; }

-namespace tasks {
-class task_manager;
-}
-
 namespace api {

 struct http_context {
@@ -72,12 +68,15 @@ struct http_context {
    sstring api_doc;
    httpd::http_server_control http_server;
    distributed<replica::database>& db;
+    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
    const sharded<locator::shared_token_metadata>& shared_token_metadata;
+    sharded<tasks::task_manager>& tm;

    http_context(distributed<replica::database>& _db,
-            service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm)
-            : db(_db), lmeter(_lm), shared_token_metadata(_stm) {
+            distributed<service::storage_proxy>& _sp,
+            service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm, sharded<tasks::task_manager>& _tm)
+            : db(_db), sp(_sp), lmeter(_lm), shared_token_metadata(_stm), tm(_tm) {
    }

    const locator::token_metadata& get_token_metadata();
@@ -87,8 +86,7 @@ future<> set_server_init(http_context& ctx);
 future<> set_server_config(http_context& ctx, const db::config& cfg);
 future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
 future<> unset_server_snitch(http_context& ctx);
-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
-future<> unset_server_storage_service(http_context& ctx);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks);
 future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
 future<> unset_server_sstables_loader(http_context& ctx);
 future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb);
@@ -108,19 +106,17 @@ future<> set_server_load_sstable(http_context& ctx, sharded<db::system_keyspace>
 future<> unset_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
 future<> unset_server_messaging_service(http_context& ctx);
-future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy);
+future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss);
 future<> unset_server_storage_proxy(http_context& ctx);
 future<> set_server_stream_manager(http_context& ctx, sharded<streaming::stream_manager>& sm);
 future<> unset_server_stream_manager(http_context& ctx);
-future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& p);
+future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g);
 future<> unset_hinted_handoff(http_context& ctx);
 future<> set_server_gossip_settle(http_context& ctx, sharded<gms::gossiper>& g);
 future<> set_server_cache(http_context& ctx);
 future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
-future<> set_server_task_manager(http_context& ctx, sharded<tasks::task_manager>& tm, lw_shared_ptr<db::config> cfg);
-future<> unset_server_task_manager(http_context& ctx);
-future<> set_server_task_manager_test(http_context& ctx, sharded<tasks::task_manager>& tm);
-future<> unset_server_task_manager_test(http_context& ctx);
+future<> set_server_task_manager(http_context& ctx, lw_shared_ptr<db::config> cfg);
+future<> set_server_task_manager_test(http_context& ctx);

 }
--- a/api/authorization_cache.cc
+++ b/api/authorization_cache.cc
@@ -11,7 +11,6 @@
 #include "api/authorization_cache.hh"
 #include "api/api.hh"
 #include "auth/common.hh"
-#include "auth/service.hh"

 namespace api {
 using namespace json;
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -43,7 +43,7 @@ std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name) {
    return std::make_tuple(name.substr(0, pos), name.substr(end));
 }

-table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
+const table_id& get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
    try {
        return db.find_uuid(ks, cf);
    } catch (replica::no_such_column_family& e) {
@@ -51,7 +51,7 @@ table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database&
    }
 }

-table_id get_uuid(const sstring& name, const replica::database& db) {
+const table_id& get_uuid(const sstring& name, const replica::database& db) {
    auto [ks, cf] = parse_fully_qualified_cf_name(name);
    return get_uuid(ks, cf, db);
 }
@@ -135,9 +135,9 @@ static future<json::json_return_type>  get_cf_histogram(http_context& ctx, const
 static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
    std::function<utils::ihistogram(const replica::database&)> fun = [f] (const replica::database& db)  {
        utils::ihistogram res;
-        db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) mutable {
-            res += (table->get_stats().*f).hist;
-        });
+        for (auto i : db.get_column_families()) {
+            res += (i.second->get_stats().*f).hist;
+        }
        return res;
    };
    return ctx.db.map(fun).then([](const std::vector<utils::ihistogram> &res) {
@@ -162,9 +162,9 @@ static future<json::json_return_type>  get_cf_rate_and_histogram(http_context& c
 static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
    std::function<utils::rate_moving_average_and_histogram(const replica::database&)> fun = [f] (const replica::database& db)  {
        utils::rate_moving_average_and_histogram res;
-        db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
-            res += (table->get_stats().*f).rate();
-        });
+        for (auto i : db.get_column_families()) {
+            res += (i.second->get_stats().*f).rate();
+        }
        return res;
    };
    return ctx.db.map(fun).then([](const std::vector<utils::rate_moving_average_and_histogram> &res) {
@@ -306,21 +306,21 @@ ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared
 void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace>& sys_ks) {
    cf::get_column_family_name.set(r, [&ctx] (const_req req){
        std::vector<sstring> res;
-        ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
-            res.push_back(kscf.first + ":" + kscf.second);
-        });
+        for (auto i: ctx.db.local().get_column_families_mapping()) {
+            res.push_back(i.first.first + ":" + i.first.second);
+        }
        return res;
    });

    cf::get_column_family.set(r, [&ctx] (std::unique_ptr<http::request> req){
-        std::list<cf::column_family_info> res;
-            ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
+            std::list<cf::column_family_info> res;
+            for (auto i: ctx.db.local().get_column_families_mapping()) {
                cf::column_family_info info;
-                info.ks = kscf.first;
-                info.cf =  kscf.second;
+                info.ks = i.first.first;
+                info.cf =  i.first.second;
                info.type = "ColumnFamilies";
                res.push_back(info);
-            });
+            }
            return make_ready_future<json::json_return_type>(json::stream_range_as_array(std::move(res), std::identity()));
        });

@@ -1017,12 +1017,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        auto key = req->get_query_param("key");
        auto uuid = get_uuid(req->param["name"], ctx.db.local());

-        return ctx.db.map_reduce0([key, uuid] (replica::database& db) -> future<std::unordered_set<sstring>> {
-            auto sstables = co_await db.find_column_family(uuid).get_sstables_by_partition_key(key);
-            co_return boost::copy_range<std::unordered_set<sstring>>(sstables | boost::adaptors::transformed([] (auto s) { return s->get_filename(); }));
+        return ctx.db.map_reduce0([key, uuid] (replica::database& db) {
+            return db.find_column_family(uuid).get_sstables_by_partition_key(key);
        }, std::unordered_set<sstring>(),
-        [](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
-            a.merge(b);
+            [](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
+            a.insert(b.begin(),b.end());
            return a;
        }).then([](const std::unordered_set<sstring>& res) {
            return make_ready_future<json::json_return_type>(container_to_vec(res));
@@ -1054,10 +1053,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        apilog.info("column_family/force_major_compaction: name={}", req->param["name"]);
        auto [ks, cf] = parse_fully_qualified_cf_name(req->param["name"]);
        auto keyspace = validate_keyspace(ctx, ks);
-        std::vector<table_info> table_infos = {table_info{
-            .name = cf,
-            .id = ctx.db.local().find_uuid(ks, cf)
-        }};
+        std::vector<table_id> table_infos = {ctx.db.local().find_uuid(ks, cf)};

        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, std::move(table_infos));
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -23,7 +23,7 @@ namespace api {
 void set_column_family(http_context& ctx, httpd::routes& r, sharded<db::system_keyspace>& sys_ks);
 void unset_column_family(http_context& ctx, httpd::routes& r);

-table_id get_uuid(const sstring& name, const replica::database& db);
+const table_id& get_uuid(const sstring& name, const replica::database& db);
 future<> foreach_column_family(http_context& ctx, const sstring& name, std::function<void(replica::column_family&)> f);


@@ -68,10 +68,9 @@ struct map_reduce_column_families_locally {
    std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
    future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
        auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
-        return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
-            *res = reducer(std::move(*res), mapper(*table.get()));
-            return make_ready_future();
-        }).then([res] () {
+        return do_for_each(db.get_column_families(), [res, this](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
+            *res = reducer(std::move(*res), mapper(*i.second.get()));
+        }).then([res] {
            return std::move(*res);
        });
    }
--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -68,8 +68,8 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return ctx.db.map_reduce0([](replica::database& db) {
            return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
-                return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
-                    replica::table& cf = *table.get();
+                return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
+                    replica::table& cf = *i.second.get();
                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
                    return make_ready_future<>();
                }).then([&tasks] {
--- a/api/config.cc
+++ b/api/config.cc
@@ -45,7 +45,7 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc
    } else {
        ss <<',';
    };
-    ss << "\"/v2/config/" << name <<"\": {"
+    ss << "\"/config/" << name <<"\": {"
      "\"get\": {"
        "\"description\": \"" << boost::replace_all_copy(boost::replace_all_copy(boost::replace_all_copy(description,"\n","\\n"),"\"", "''"), "\t", " ") <<"\","
        "\"operationId\": \"find_config_"<< name <<"\","
@@ -76,9 +76,9 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc

 namespace cs = httpd::config_json;

-void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg, bool first) {
-    rb->register_function(r, [&cfg, first] (output_stream<char>& os) {
-        return do_with(first, [&os, &cfg] (bool& first) {
+void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg) {
+    rb->register_function(r, [&cfg] (output_stream<char>& os) {
+        return do_with(true, [&os, &cfg] (bool& first) {
            auto f = make_ready_future();
            for (auto&& cfg_ref : cfg.values()) {
                auto&& cfg = cfg_ref.get();
--- a/api/config.hh
+++ b/api/config.hh
@@ -13,5 +13,5 @@

 namespace api {

-void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg, bool first = false);
+void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg);
 }
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -12,9 +12,7 @@
 #include <seastar/http/exception.hh>
 #include "log.hh"
 #include "utils/error_injection.hh"
-#include "utils/rjson.hh"
 #include <seastar/core/future-util.hh>
-#include <seastar/util/short_streams.hh>

 namespace api {
 using namespace seastar::httpd;
@@ -26,27 +24,10 @@ void set_error_injection(http_context& ctx, routes& r) {
    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
        sstring injection = req->param["injection"];
        bool one_shot = req->get_query_param("one_shot") == "True";
-        auto params = req->content;
-
-        const size_t max_params_size = 1024 * 1024;
-        if (params.size() > max_params_size) {
-            // This is a hard limit, because we don't want to allocate
-            // too much memory or block the thread for too long.
-            throw httpd::bad_param_exception(format("Injection parameters are too long, max length is {}", max_params_size));
-        }
-
-        try {
-            auto parameters = params.empty()
-                ? utils::error_injection_parameters{}
-                : rjson::parse_to_map<utils::error_injection_parameters>(params);
-
-            auto& errinj = utils::get_local_injector();
-            return errinj.enable_on_all(injection, one_shot, std::move(parameters)).then([] {
-                return make_ready_future<json::json_return_type>(json::json_void());
-            });
-        } catch (const rjson::error& e) {
-            throw httpd::bad_param_exception(format("Failed to parse injections parameters: {}", e.what()));
-        }
+        auto& errinj = utils::get_local_injector();
+        return errinj.enable_on_all(injection, one_shot).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
    });

    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
@@ -71,13 +52,6 @@ void set_error_injection(http_context& ctx, routes& r) {
        });
    });

-    hf::message_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->param["injection"];
-        auto& errinj = utils::get_local_injector();
-        return errinj.receive_message_on_all(injection).then([] {
-            return make_ready_future<json::json_return_type>(json::json_void());
-        });
-    });
 }

 } // namespace api
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -19,25 +19,24 @@ namespace fd = httpd::failure_detector_json;
 void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
        std::vector<fd::endpoint_state> res;
-        res.reserve(g.num_endpoints());
-        g.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& eps) {
+        for (auto i : g.get_endpoint_states()) {
            fd::endpoint_state val;
-            val.addrs = fmt::to_string(addr);
-            val.is_alive = g.is_alive(addr);
-            val.generation = eps.get_heart_beat_state().get_generation().value();
-            val.version = eps.get_heart_beat_state().get_heart_beat_version().value();
-            val.update_time = eps.get_update_timestamp().time_since_epoch().count();
-            for (const auto& [as_type, app_state] : eps.get_application_state_map()) {
+            val.addrs = fmt::to_string(i.first);
+            val.is_alive = i.second.is_alive();
+            val.generation = i.second.get_heart_beat_state().get_generation().value();
+            val.version = i.second.get_heart_beat_state().get_heart_beat_version().value();
+            val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
+            for (auto a : i.second.get_application_state_map()) {
                fd::version_value version_val;
                // We return the enum index and not it's name to stay compatible to origin
                // method that the state index are static but the name can be changed.
-                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(as_type);
-                version_val.value = app_state.value();
-                version_val.version = app_state.version().value();
+                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
+                version_val.value = a.second.value();
+                version_val.version = a.second.version().value();
                val.application_state.push(version_val);
            }
-            res.emplace_back(std::move(val));
-        });
+            res.push_back(val);
+        }
        return make_ready_future<json::json_return_type>(res);
    });

@@ -57,9 +56,9 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
        std::map<sstring, sstring> nodes_status;
-        g.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state&) {
-            nodes_status.emplace(node.to_sstring(), g.is_alive(node) ? "UP" : "DOWN");
-        });
+        for (auto& entry : g.get_endpoint_states()) {
+            nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
+        }
        return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
    });

@@ -71,7 +70,7 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
-        auto state = g.get_endpoint_state_ptr(gms::inet_address(req->param["addr"]));
+        auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
        if (!state) {
            return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
        }
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -6,11 +6,8 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

-#include <seastar/core/coroutine.hh>
-
 #include "gossiper.hh"
 #include "api/api-doc/gossiper.json.hh"
-#include "gms/endpoint_state.hh"
 #include "gms/gossiper.hh"

 namespace api {
@@ -18,9 +15,9 @@ using namespace seastar::httpd;
 using namespace json;

 void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
-    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
-        auto res = co_await g.get_unreachable_members_synchronized();
-        co_return json::json_return_type(container_to_vec(res));
+    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
+        auto res = g.get_unreachable_members();
+        return container_to_vec(res);
    });


@@ -30,11 +27,9 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
        });
    });

-    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
-        gms::inet_address ep(req->param["addr"]);
-        // synchronize unreachable_members on all shards
-        co_await g.get_unreachable_members_synchronized();
-        co_return g.get_endpoint_downtime(ep);
+    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
+        gms::inet_address ep(req.param["addr"]);
+        return g.get_endpoint_downtime(ep);
    });

    httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<http::request> req) {
@@ -64,7 +59,7 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {

    httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
        gms::inet_address ep(req->param["addr"]);
-        return g.force_remove_endpoint(ep, gms::null_permit_id).then([] {
+        return g.force_remove_endpoint(ep).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
--- a/api/hinted_handoff.cc
+++ b/api/hinted_handoff.cc
@@ -13,6 +13,7 @@
 #include "api/api-doc/hinted_handoff.json.hh"

 #include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
 #include "service/storage_proxy.hh"

 namespace api {
@@ -21,33 +22,38 @@ using namespace json;
 using namespace seastar::httpd;
 namespace hh = httpd::hinted_handoff_json;

-void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
-    hh::create_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto parse_hosts_list = [] (sstring arg) {
+void set_hinted_handoff(http_context& ctx, routes& r, gms::gossiper& g) {
+    hh::create_hints_sync_point.set(r, [&ctx, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto parse_hosts_list = [&g] (sstring arg) {
            std::vector<sstring> hosts_str = split(arg, ",");
            std::vector<gms::inet_address> hosts;
            hosts.reserve(hosts_str.size());

-            for (const auto& host_str : hosts_str) {
-                try {
-                    gms::inet_address host;
-                    host = gms::inet_address(host_str);
-                    hosts.push_back(host);
-                } catch (std::exception& e) {
-                    throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
+            if (hosts_str.empty()) {
+                // No target_hosts specified means that we should wait for hints for all nodes to be sent
+                const auto members_set = g.get_live_members();
+                std::copy(members_set.begin(), members_set.end(), std::back_inserter(hosts));
+            } else {
+                for (const auto& host_str : hosts_str) {
+                    try {
+                        gms::inet_address host;
+                        host = gms::inet_address(host_str);
+                        hosts.push_back(host);
+                    } catch (std::exception& e) {
+                        throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
+                    }
                }
            }
-
            return hosts;
        };

        std::vector<gms::inet_address> target_hosts = parse_hosts_list(req->get_query_param("target_hosts"));
-        return proxy.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
+        return ctx.sp.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
            return json::json_return_type(sync_point.encode());
        });
    });

-    hh::get_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    hh::get_hints_sync_point.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        db::hints::sync_point sync_point;
        const sstring encoded = req->get_query_param("id");
        try {
@@ -81,7 +87,7 @@ void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_p
        using return_type = hh::ns_get_hints_sync_point::get_hints_sync_point_return_type;
        using return_type_wrapper = hh::ns_get_hints_sync_point::return_type_wrapper;

-        return proxy.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
+        return ctx.sp.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
            return json::json_return_type(return_type_wrapper(return_type::DONE));
        }).handle_exception_type([] (const timed_out_error&) {
            return json::json_return_type(return_type_wrapper(return_type::IN_PROGRESS));
--- a/api/hinted_handoff.hh
+++ b/api/hinted_handoff.hh
@@ -8,14 +8,17 @@

 #pragma once

-#include <seastar/core/sharded.hh>
 #include "api.hh"

-namespace service { class storage_proxy; }
+namespace gms {
+
+class gossiper;
+
+}

 namespace api {

-void set_hinted_handoff(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& p);
+void set_hinted_handoff(http_context& ctx, httpd::routes& r, gms::gossiper& g);
 void unset_hinted_handoff(http_context& ctx, httpd::routes& r);

 }
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -10,6 +10,7 @@
 #include "service/storage_proxy.hh"
 #include "api/api-doc/storage_proxy.json.hh"
 #include "api/api-doc/utils.json.hh"
+#include "service/storage_service.hh"
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "replica/database.hh"
@@ -115,17 +116,17 @@ utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimat
    return res;
 }

-static future<json::json_return_type>  sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
        return (stats.*f).histogram();
    }, utils::time_estimated_histogram_merge, utils::time_estimated_histogram()).then([](const utils::time_estimated_histogram& val) {
        return make_ready_future<json::json_return_type>(time_to_json_histogram(val));
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {

-    return two_dimensional_map_reduce(proxy, f, utils::estimated_histogram_merge,
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
@@ -133,8 +134,8 @@ static future<json::json_return_type>  sum_estimated_histogram(sharded<service::
    });
 }

-static future<json::json_return_type>  total_latency(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
            return (stats.*f).hist.mean * (stats.*f).hist.count;
        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
@@ -183,43 +184,43 @@ sum_timer_stats_storage_proxy(distributed<proxy>& d,
    });
 }

-void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
+void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_service>& ss) {
    sp::get_total_hints.set(r, [](std::unique_ptr<http::request> req)  {
        //TBD
        unimplemented();
        return make_ready_future<json::json_return_type>(0);
    });

-    sp::get_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        const auto& filter = proxy.local().get_hints_host_filter();
+    sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        const auto& filter = ctx.sp.local().get_hints_host_filter();
        return make_ready_future<json::json_return_type>(!filter.is_disabled_for_all());
    });

-    sp::set_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::set_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req)  {
        auto enable = req->get_query_param("enable");
        auto filter = (enable == "true" || enable == "1")
                ? db::hints::host_filter(db::hints::host_filter::enabled_for_all_tag {})
                : db::hints::host_filter(db::hints::host_filter::disabled_for_all_tag {});
-        return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
+        return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
            return sp.change_hints_host_filter(filter);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    sp::get_hinted_handoff_enabled_by_dc.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::get_hinted_handoff_enabled_by_dc.set(r, [&ctx](std::unique_ptr<http::request> req)  { // replies with the DC names held by the hints host filter
        std::vector<sstring> res;
-        const auto& filter = proxy.local().get_hints_host_filter();
+        const auto& filter = ctx.sp.local().get_hints_host_filter();
        const auto& dcs = filter.get_dcs();
-        res.reserve(res.size());
+        res.reserve(dcs.size()); // res is still empty here, so its own size() would reserve nothing
        std::copy(dcs.begin(), dcs.end(), std::back_inserter(res));
        return make_ready_future<json::json_return_type>(res);
    });

-    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&proxy](std::unique_ptr<http::request> req)  {
+    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&ctx](std::unique_ptr<http::request> req)  {
        auto dcs = req->get_query_param("dcs");
        auto filter = db::hints::host_filter::parse_from_dc_list(std::move(dcs));
-        return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
+        return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
            return sp.change_hints_host_filter(filter);
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -341,131 +342,144 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
        return make_ready_future<json::json_return_type>(json_void());
    });

-    sp::get_read_repair_attempted.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_attempts);
+    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

-    sp::get_read_repair_repaired_blocking.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
+    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

-    sp::get_read_repair_repaired_background.set(r, [&proxy](std::unique_ptr<http::request> req)  {
-        return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_background);
+    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<http::request> req)  {
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

-    sp::get_cas_read_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_timeouts);
+    sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req)  { // replies with describe_schema_versions() as a list of key/value entries
+        return ss.local().describe_schema_versions().then([] (auto result) {
+            std::vector<sp::mapper_list> res;
+            for (auto& e : result) { // by reference: result is this lambda's own copy, so the moves below act on its elements instead of on a per-iteration copy
+                sp::mapper_list entry;
+                entry.key = std::move(e.first);
+                entry.value = std::move(e.second);
+                res.emplace_back(std::move(entry));
+            }
+            return make_ready_future<json::json_return_type>(std::move(res));
+        });
    });

-    sp::get_cas_read_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_unavailables);
+    sp::get_cas_read_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_timeouts);
    });

-    sp::get_cas_write_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_timeouts);
+    sp::get_cas_read_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_unavailables);
    });

-    sp::get_cas_write_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_unavailables);
+    sp::get_cas_write_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_timeouts);
    });

-    sp::get_cas_write_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_write_unfinished_commit);
+    sp::get_cas_write_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_unavailables);
    });

-    sp::get_cas_write_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &proxy::stats::cas_write_contention);
+    sp::get_cas_write_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_unfinished_commit);
    });

-    sp::get_cas_write_metrics_condition_not_met.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_write_condition_not_met);
+    sp::get_cas_write_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_write_contention);
    });

-    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_failed_read_round_optimization);
+    sp::get_cas_write_metrics_condition_not_met.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

-    sp::get_cas_read_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_stats(proxy, &proxy::stats::cas_read_unfinished_commit);
+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
    });

-    sp::get_cas_read_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &proxy::stats::cas_read_contention);
+    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });

-    sp::get_read_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_timeouts);
+    sp::get_cas_read_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_read_contention);
    });

-    sp::get_read_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_unavailables);
+    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

-    sp::get_range_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
+    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

-    sp::get_range_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
+    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

-    sp::get_write_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_timeouts);
+    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

-    sp::get_write_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_unavailables);
+    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

-    sp::get_read_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_timeouts);
+    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

-    sp::get_read_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_unavailables);
+    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

-    sp::get_range_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
+    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

-    sp::get_range_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
+    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

-    sp::get_write_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_timeouts);
+    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

-    sp::get_write_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_unavailables);
+    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

-    sp::get_range_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

-    sp::get_write_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_read_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_range_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
-    });
-    sp::get_cas_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats(proxy, &proxy::stats::cas_write);
+    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_cas_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats(proxy, &proxy::stats::cas_read);
+    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
+    });
+    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
+    });
+
+    sp::get_cas_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_read);
    });

    sp::get_view_write_metrics_latency_histogram.set(r, [](std::unique_ptr<http::request> req) {
@@ -476,31 +490,31 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
        return make_ready_future<json::json_return_type>(get_empty_moving_average());
    });

-    sp::get_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_read_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::read);
    });

-    sp::get_read_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::read);
+    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
-    sp::get_write_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_write_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::write);
+    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

-    sp::get_range_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

-    sp::get_range_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
-        return total_latency(proxy, &service::storage_proxy_stats::stats::range);
+    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

@@ -533,6 +547,7 @@ void unset_storage_proxy(http_context& ctx, routes& r) {
    sp::get_read_repair_attempted.unset(r);
    sp::get_read_repair_repaired_blocking.unset(r);
    sp::get_read_repair_repaired_background.unset(r);
+    sp::get_schema_versions.unset(r);
    sp::get_cas_read_timeouts.unset(r);
    sp::get_cas_read_unavailables.unset(r);
    sp::get_cas_write_timeouts.unset(r);
--- a/api/storage_proxy.hh
+++ b/api/storage_proxy.hh
@@ -11,11 +11,11 @@
 #include <seastar/core/sharded.hh>
 #include "api.hh"

-namespace service { class storage_proxy; }
+namespace service { class storage_service; }

 namespace api {

-void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& proxy);
+void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss);
 void unset_storage_proxy(http_context& ctx, httpd::routes& r);

 }
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -8,7 +8,6 @@

 #include "storage_service.hh"
 #include "api/api-doc/storage_service.json.hh"
-#include "api/api-doc/storage_proxy.json.hh"
 #include "db/config.hh"
 #include "db/schema_tables.hh"
 #include "utils/hash.hh"
@@ -43,6 +42,7 @@
 #include "thrift/controller.hh"
 #include "locator/token_metadata.hh"
 #include "cdc/generation_service.hh"
+#include "service/storage_proxy.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "sstables_loader.hh"
 #include "db/view/view_builder.hh"
@@ -52,10 +52,22 @@ using namespace std::chrono_literals;

 extern logging::logger apilog;

+namespace std { // NOTE(review): adding declarations to namespace std is undefined behavior unless the standard explicitly permits it; presumably placed here so lookup finds it when containers of table_info are formatted — confirm before moving
+
+std::ostream& operator<<(std::ostream& os, const api::table_info& ti) { // writes ti as table{name=..., id=...}
+    fmt::print(os, "table{{name={}, id={}}}", ti.name, ti.id); // doubled braces are fmt escapes for literal '{' and '}'
+    return os; // hand the stream back so insertions can be chained
+}
+
+} // namespace std
+
 namespace api {

+const locator::token_metadata& http_context::get_token_metadata() { // returns the token metadata reached through shared_token_metadata.local()
+        return *shared_token_metadata.local().get(); // NOTE(review): the reference is taken from the temporary result of get(); presumably a shared-pointer snapshot, so validity depends on something else keeping that snapshot alive — confirm callers do not hold it across an update
+}
+
 namespace ss = httpd::storage_service_json;
-namespace sp = httpd::storage_proxy_json;
 using namespace json;

 sstring validate_keyspace(http_context& ctx, sstring ks_name) {
@@ -317,7 +329,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair) {
    ss::repair_async.set(r, [&ctx, &repair](std::unique_ptr<http::request> req) {
        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "ignore_nodes", "trace",
-                "startToken", "endToken", "ranges_parallelism"};
+                "startToken", "endToken" };
        std::unordered_map<sstring, sstring> options_map;
        for (auto o : options) {
            auto s = req->get_query_param(o);
@@ -462,21 +474,29 @@ static future<json::json_return_type> describe_ring_as_json(sharded<service::sto
    co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring(keyspace), token_range_endpoints_to_json));
 }

-void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    ss::local_hostid.set(r, [&ss](std::unique_ptr<http::request> req) {
-        auto id = ss.local().get_token_metadata().get_my_id();
+// Returns the id of every entry in table_infos, in the same order.
+static std::vector<table_id> get_table_ids(const std::vector<table_info>& table_infos) {
+    std::vector<table_id> ids(table_infos.size()); // one default-constructed slot per input entry
+    for (std::size_t i = 0; i < ids.size(); ++i) {
+        ids[i] = table_infos[i].id;
+    }
+    return ids;
+}
+
+void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
+    ss::local_hostid.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        auto id = ctx.db.local().get_config().host_id;
        return make_ready_future<json::json_return_type>(id.to_sstring());
    });

-    ss::get_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().sorted_tokens(), [](const dht::token& i) {
           return fmt::to_string(i);
        }));
    });

-    ss::get_node_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().get_tokens(addr), [](const dht::token& i) {
           return fmt::to_string(i);
       }));
    });
@@ -544,8 +564,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    });

-    ss::get_leaving_nodes.set(r, [&ss](const_req req) {
-        return container_to_vec(ss.local().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.get_token_metadata().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -553,8 +573,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [&ss](const_req req) {
-        auto points = ss.local().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.get_token_metadata().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(fmt::to_string(i.second));
@@ -626,9 +646,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return describe_ring_as_json(ss, validate_keyspace(ctx, req->param));
    });

-    ss::get_host_id_map.set(r, [&ss](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(ss.local().get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<http::request> req) {
@@ -648,9 +668,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    });

-    ss::get_current_generation_number.set(r, [&ss](std::unique_ptr<http::request> req) {
+    ss::get_current_generation_number.set(r, [&g](std::unique_ptr<http::request> req) {
        gms::inet_address ep(utils::fb_utilities::get_broadcast_address());
-        return ss.local().gossiper().get_current_generation_number(ep).then([](gms::generation_type res) {
+        return g.get_current_generation_number(ep).then([](gms::generation_type res) {
            return make_ready_future<json::json_return_type>(res.value());
        });
    });
@@ -661,10 +681,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                req.get_query_param("key")));
    });

-    ss::cdc_streams_check_and_repair.set(r, [&ss] (std::unique_ptr<http::request> req) {
-        return ss.invoke_on(0, [] (service::storage_service& ss) {
-            return ss.check_and_repair_cdc_streams();
-        }).then([] {
+    ss::cdc_streams_check_and_repair.set(r, [&cdc_gs] (std::unique_ptr<http::request> req) {
+        if (!cdc_gs.local_is_initialized()) {
+            throw std::runtime_error("get_cdc_generation_service: not initialized yet");
+        }
+        return cdc_gs.local().check_and_repair_cdc_streams().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
@@ -676,7 +697,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, table_infos);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos);
+        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
        try {
            co_await task->done();
        } catch (...) {
@@ -699,7 +720,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        }

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos);
+        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
        try {
            co_await task->done();
        } catch (...) {
@@ -714,7 +735,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, res);
+        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, get_table_ids(table_infos), res);
        try {
            co_await task->done();
        } catch (...) {
@@ -732,7 +753,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
+        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos), exclude_current_version);
        try {
            co_await task->done();
        } catch (...) {
@@ -773,16 +794,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::remove_node.set(r, [&ss](std::unique_ptr<http::request> req) {
        auto host_id = validate_host_id(req->get_query_param("host_id"));
-        std::vector<sstring> ignore_nodes_strs = utils::split_comma_separated_list(req->get_query_param("ignore_nodes"));
+        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
        auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
-        for (const sstring& n : ignore_nodes_strs) {
+        for (std::string n : ignore_nodes_strs) {
            try {
-                auto hoep = locator::host_id_or_endpoint(n);
-                if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
-                    throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
+                std::replace(n.begin(), n.end(), '\"', ' ');
+                std::replace(n.begin(), n.end(), '\'', ' ');
+                boost::trim_all(n);
+                if (!n.empty()) {
+                    auto hoep = locator::host_id_or_endpoint(n);
+                    if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
+                        throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
+                    }
+                    ignore_nodes.push_back(std::move(hoep));
                }
-                ignore_nodes.push_back(std::move(hoep));
            } catch (...) {
                throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}: {}", ignore_nodes_strs, n, std::current_exception()));
            }
@@ -895,11 +921,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(json_void());
    });

-    ss::is_initialized.set(r, [&ss](std::unique_ptr<http::request> req) {
-        return ss.local().get_operation_mode().then([&ss] (auto mode) {
+    ss::is_initialized.set(r, [&ss, &g](std::unique_ptr<http::request> req) {
+        return ss.local().get_operation_mode().then([&g] (auto mode) {
            bool is_initialized = mode >= service::storage_service::mode::STARTING;
            if (mode == service::storage_service::mode::NORMAL) {
-                is_initialized = ss.local().gossiper().is_enabled();
+                is_initialized = g.is_enabled();
            }
            return make_ready_future<json::json_return_type>(is_initialized);
        });
@@ -968,9 +994,10 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                ks.set_incremental_backups(value);
            }

-            db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
-                table->set_incremental_backups(value);
-            });
+            for (auto& pair: db.get_column_families()) {
+                auto cf_ptr = pair.second;
+                cf_ptr->set_incremental_backups(value);
+            }
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -1011,11 +1038,13 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    ss::reset_local_schema.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
        // FIXME: We should truncate schema tables if more than one node in the cluster.
+        auto& fs = ctx.sp.local().features();
        apilog.info("reset_local_schema");
-        co_await ss.local().reload_schema();
-        co_return json_void();
+        return db::schema_tables::recalculate_schema_version(sys_ks, ctx.sp, fs).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<http::request> req) {
@@ -1120,12 +1149,12 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(json_void());
      });

-    ss::get_cluster_name.set(r, [&ss](const_req req) {
-        return ss.local().gossiper().get_cluster_name();
+    ss::get_cluster_name.set(r, [&g](const_req req) {
+        return g.get_cluster_name();
    });

-    ss::get_partitioner_name.set(r, [&ss](const_req req) {
-        return ss.local().gossiper().get_partitioner_name();
+    ss::get_partitioner_name.set(r, [&g](const_req req) {
+        return g.get_partitioner_name();
    });

    ss::get_tombstone_warn_threshold.set(r, [](std::unique_ptr<http::request> req) {
@@ -1243,7 +1272,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

                auto& ext = db.get_config().extensions();

-                db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
+                for (auto& t : db.get_column_families() | boost::adaptors::map_values) {
                    auto& schema = t->schema();
                    if ((ks.empty() || ks == schema->ks_name()) && (cf.empty() || cf == schema->cf_name())) {
                        // at most Nsstables long
@@ -1324,7 +1353,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                        }
                        res.emplace_back(std::move(tst));
                    }
-                });
+                }
                std::sort(res.begin(), res.end(), [](const ss::table_sstables& t1, const ss::table_sstables& t2) {
                    return t1.keyspace() < t2.keyspace() || (t1.keyspace() == t2.keyspace() && t1.table() < t2.table());
                });
@@ -1334,123 +1363,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
            });
        });
    });
-
-    ss::reload_raft_topology_state.set(r,
-            [&ss, &group0_client] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        co_await ss.invoke_on(0, [&group0_client] (service::storage_service& ss) -> future<> {
-            apilog.info("Waiting for group 0 read/apply mutex before reloading Raft topology state...");
-            auto holder = co_await group0_client.hold_read_apply_mutex();
-            apilog.info("Reloading Raft topology state");
-            // Using topology_transition() instead of topology_state_load(), because the former notifies listeners
-            co_await ss.topology_transition();
-            apilog.info("Reloaded Raft topology state");
-        });
-        co_return json_void();
-    });
-
-    sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req)  {
-        return ss.local().describe_schema_versions().then([] (auto result) {
-            std::vector<sp::mapper_list> res;
-            for (auto e : result) {
-                sp::mapper_list entry;
-                entry.key = std::move(e.first);
-                entry.value = std::move(e.second);
-                res.emplace_back(std::move(entry));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-}
-
-void unset_storage_service(http_context& ctx, routes& r) {
-    ss::local_hostid.unset(r);
-    ss::get_tokens.unset(r);
-    ss::get_node_tokens.unset(r);
-    ss::get_commitlog.unset(r);
-    ss::get_token_endpoint.unset(r);
-    ss::toppartitions_generic.unset(r);
-    ss::get_leaving_nodes.unset(r);
-    ss::get_moving_nodes.unset(r);
-    ss::get_joining_nodes.unset(r);
-    ss::get_release_version.unset(r);
-    ss::get_scylla_release_version.unset(r);
-    ss::get_schema_version.unset(r);
-    ss::get_all_data_file_locations.unset(r);
-    ss::get_saved_caches_location.unset(r);
-    ss::get_range_to_endpoint_map.unset(r);
-    ss::get_pending_range_to_endpoint_map.unset(r);
-    ss::describe_any_ring.unset(r);
-    ss::describe_ring.unset(r);
-    ss::get_host_id_map.unset(r);
-    ss::get_load.unset(r);
-    ss::get_load_map.unset(r);
-    ss::get_current_generation_number.unset(r);
-    ss::get_natural_endpoints.unset(r);
-    ss::cdc_streams_check_and_repair.unset(r);
-    ss::force_keyspace_compaction.unset(r);
-    ss::force_keyspace_cleanup.unset(r);
-    ss::perform_keyspace_offstrategy_compaction.unset(r);
-    ss::upgrade_sstables.unset(r);
-    ss::force_keyspace_flush.unset(r);
-    ss::decommission.unset(r);
-    ss::move.unset(r);
-    ss::remove_node.unset(r);
-    ss::get_removal_status.unset(r);
-    ss::force_remove_completion.unset(r);
-    ss::set_logging_level.unset(r);
-    ss::get_logging_levels.unset(r);
-    ss::get_operation_mode.unset(r);
-    ss::is_starting.unset(r);
-    ss::get_drain_progress.unset(r);
-    ss::drain.unset(r);
-    ss::truncate.unset(r);
-    ss::get_keyspaces.unset(r);
-    ss::stop_gossiping.unset(r);
-    ss::start_gossiping.unset(r);
-    ss::is_gossip_running.unset(r);
-    ss::stop_daemon.unset(r);
-    ss::is_initialized.unset(r);
-    ss::join_ring.unset(r);
-    ss::is_joined.unset(r);
-    ss::set_stream_throughput_mb_per_sec.unset(r);
-    ss::get_stream_throughput_mb_per_sec.unset(r);
-    ss::get_compaction_throughput_mb_per_sec.unset(r);
-    ss::set_compaction_throughput_mb_per_sec.unset(r);
-    ss::is_incremental_backups_enabled.unset(r);
-    ss::set_incremental_backups_enabled.unset(r);
-    ss::rebuild.unset(r);
-    ss::bulk_load.unset(r);
-    ss::bulk_load_async.unset(r);
-    ss::reschedule_failed_deletions.unset(r);
-    ss::sample_key_range.unset(r);
-    ss::reset_local_schema.unset(r);
-    ss::set_trace_probability.unset(r);
-    ss::get_trace_probability.unset(r);
-    ss::get_slow_query_info.unset(r);
-    ss::set_slow_query.unset(r);
-    ss::enable_auto_compaction.unset(r);
-    ss::disable_auto_compaction.unset(r);
-    ss::enable_tombstone_gc.unset(r);
-    ss::disable_tombstone_gc.unset(r);
-    ss::deliver_hints.unset(r);
-    ss::get_cluster_name.unset(r);
-    ss::get_partitioner_name.unset(r);
-    ss::get_tombstone_warn_threshold.unset(r);
-    ss::set_tombstone_warn_threshold.unset(r);
-    ss::get_tombstone_failure_threshold.unset(r);
-    ss::set_tombstone_failure_threshold.unset(r);
-    ss::get_batch_size_failure_threshold.unset(r);
-    ss::set_batch_size_failure_threshold.unset(r);
-    ss::set_hinted_handoff_throttle_in_kb.unset(r);
-    ss::get_metrics_load.unset(r);
-    ss::get_exceptions.unset(r);
-    ss::get_total_hints_in_progress.unset(r);
-    ss::get_total_hints.unset(r);
-    ss::get_ownership.unset(r);
-    ss::get_effective_ownership.unset(r);
-    ss::sstable_info.unset(r);
-    ss::reload_raft_topology_state.unset(r);
-    sp::get_schema_versions.unset(r);
 }

 enum class scrub_status {
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -25,6 +25,7 @@ class system_keyspace;
 }
 namespace netw { class messaging_service; }
 class repair_service;
+namespace cdc { class generation_service; }
 class sstables_loader;

 namespace gms {
@@ -50,6 +51,11 @@ sstring validate_keyspace(http_context& ctx, const httpd::parameters& param);
 // If the parameter is found and empty, returns a list of all table names in the keyspace.
 std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);

+struct table_info {  // a table's name together with its table_id; element type of the vector returned by parse_table_infos()
+    sstring name;  // table name
+    table_id id;   // table identifier
+};
+
 // splits a request parameter assumed to hold a comma-separated list of table names
 // verify that the tables are found, otherwise a bad_param_exception exception is thrown
 // containing the description of the respective no_such_column_family error.
@@ -57,8 +63,7 @@ std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, con
 // if the parameter is not found or is empty, returns a list of all table infos in the keyspace.
 std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);

-void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
-void unset_storage_service(http_context& ctx, httpd::routes& r);
+void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks);  // installs the storage_service REST handlers on r
 void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
 void unset_sstables_loader(http_context& ctx, httpd::routes& r);
 void set_view_builder(http_context& ctx, httpd::routes& r, sharded<db::view::view_builder>& vb);
@@ -74,3 +79,9 @@ void unset_snapshot(http_context& ctx, httpd::routes& r);
 seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);

 } // namespace api
+
+namespace std {  // NOTE(review): adding a new overload to namespace std is formally undefined behavior ([namespace.std]); consider declaring it in namespace api instead — confirm no caller depends on std:: lookup
+
+std::ostream& operator<<(std::ostream& os, const api::table_info& ti);  // stream-insertion overload for api::table_info
+
+} // namespace std
--- a/api/system.cc
+++ b/api/system.cc
@@ -7,18 +7,10 @@
 */

 #include "api/api-doc/system.json.hh"
-#include "api/api-doc/metrics.json.hh"
-
 #include "api/api.hh"

 #include <seastar/core/reactor.hh>
-#include <seastar/core/metrics_api.hh>
-#include <seastar/core/relabel_config.hh>
 #include <seastar/http/exception.hh>
-#include <seastar/util/short_streams.hh>
-#include <seastar/http/short_streams.hh>
-#include "utils/rjson.hh"
-
 #include "log.hh"
 #include "replica/database.hh"

@@ -28,77 +20,8 @@ namespace api {
 using namespace seastar::httpd;

 namespace hs = httpd::system_json;
-namespace hm = httpd::metrics_json;

 void set_system(http_context& ctx, routes& r) {
-    hm::get_metrics_config.set(r, [](const_req req) {
-        std::vector<hm::metrics_config> res;
-        res.resize(seastar::metrics::get_relabel_configs().size());
-        size_t i = 0;
-        for (auto&& r : seastar::metrics::get_relabel_configs()) {
-            res[i].action = r.action;
-            res[i].target_label = r.target_label;
-            res[i].replacement = r.replacement;
-            res[i].separator = r.separator;
-            res[i].source_labels = r.source_labels;
-            res[i].regex = r.expr.str();
-            i++;
-        }
-        return res;
-    });
-
-    hm::set_metrics_config.set(r, [](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        rapidjson::Document doc;
-        doc.Parse(req->content.c_str());
-        if (!doc.IsArray()) {
-            throw bad_param_exception("Expected a json array");
-        }
-        std::vector<seastar::metrics::relabel_config> relabels;
-        relabels.resize(doc.Size());
-        for (rapidjson::SizeType i = 0; i < doc.Size(); i++) {
-            const auto& element = doc[i];
-            if (element.HasMember("source_labels")) {
-                std::vector<std::string> source_labels;
-                source_labels.resize(element["source_labels"].Size());
-
-                for (size_t j = 0; j < element["source_labels"].Size(); j++) {
-                    source_labels[j] = element["source_labels"][j].GetString();
-                }
-                relabels[i].source_labels = source_labels;
-            }
-            if (element.HasMember("action")) {
-                relabels[i].action = seastar::metrics::relabel_config_action(element["action"].GetString());
-            }
-            if (element.HasMember("replacement")) {
-                relabels[i].replacement = element["replacement"].GetString();
-            }
-            if (element.HasMember("separator")) {
-                relabels[i].separator = element["separator"].GetString();
-            }
-            if (element.HasMember("target_label")) {
-                relabels[i].target_label = element["target_label"].GetString();
-            }
-            if (element.HasMember("regex")) {
-                relabels[i].expr = element["regex"].GetString();
-            }
-        }
-        return do_with(std::move(relabels), false, [](const std::vector<seastar::metrics::relabel_config>& relabels, bool& failed) {
-            return smp::invoke_on_all([&relabels, &failed] {
-                return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
-                    if (result.metrics_relabeled_due_to_collision > 0) {
-                        failed = true;
-                    }
-                    return;
-                });
-            }).then([&failed](){
-                if (failed) {
-                    throw bad_param_exception("conflicts found during relabeling");
-                }
-                return make_ready_future<json::json_return_type>(seastar::json::json_void());
-            });
-        });
-    });
-
    hs::get_system_uptime.set(r, [](const_req req) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
    });
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -44,7 +44,6 @@ struct task_stats {
        : task_id(task->id().to_sstring())
        , state(task->get_status().state)
        , type(task->type())
-        , scope(task->get_status().scope)
        , keyspace(task->get_status().keyspace)
        , table(task->get_status().table)
        , entity(task->get_status().entity)
@@ -54,7 +53,6 @@ struct task_stats {
    sstring task_id;
    tasks::task_manager::task_state state;
    std::string type;
-    std::string scope;
    std::string keyspace;
    std::string table;
    std::string entity;
@@ -71,7 +69,6 @@ tm::task_status make_status(full_task_status status) {
    tm::task_status res{};
    res.id = status.task_status.id.to_sstring();
    res.type = status.type;
-    res.scope = status.task_status.scope;
    res.state = status.task_status.state;
    res.is_abortable = bool(status.abortable);
    res.start_time = st;
@@ -111,23 +108,18 @@ future<full_task_status> retrieve_status(const tasks::task_manager::foreign_task
    co_return s;
 }

-void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>& tm, db::config& cfg) {
-    tm::get_modules.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        std::vector<std::string> v = boost::copy_range<std::vector<std::string>>(tm.local().get_modules() | boost::adaptors::map_keys);
+void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
+    tm::get_modules.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        std::vector<std::string> v = boost::copy_range<std::vector<std::string>>(ctx.tm.local().get_modules() | boost::adaptors::map_keys);
        co_return v;
    });

-    tm::get_tasks.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tm::get_tasks.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        using chunked_stats = utils::chunked_vector<task_stats>;
        auto internal = tasks::is_internal{req_param<bool>(*req, "internal", false)};
-        std::vector<chunked_stats> res = co_await tm.map([&req, internal] (tasks::task_manager& tm) {
+        std::vector<chunked_stats> res = co_await ctx.tm.map([&req, internal] (tasks::task_manager& tm) {
            chunked_stats local_res;
-            tasks::task_manager::module_ptr module;
-            try {
-                module = tm.find_module(req->param["module"]);
-            } catch (...) {
-                throw bad_param_exception(fmt::format("{}", std::current_exception()));
-            }
+            auto module = tm.find_module(req->param["module"]);
            const auto& filtered_tasks = module->get_tasks() | boost::adaptors::filtered([&params = req->query_parameters, internal] (const auto& task) {
                return (internal || !task.second->is_internal()) && filter_tasks(task.second, params);
            });
@@ -156,76 +148,57 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        co_return std::move(f);
    });

-    tm::get_task_status.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tm::get_task_status.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
-                }
-                co_return std::move(task);
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
+            auto state = task->get_status().state;
+            if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
+                task->unregister_task();
+            }
+            co_return std::move(task);
+        }));
        auto s = co_await retrieve_status(task);
        co_return make_status(s);
    });

-    tm::abort_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tm::abort_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
-        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                if (!task->is_abortable()) {
-                    co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
-                }
-                co_await task->abort();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
+            if (!task->is_abortable()) {
+                co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
+            }
+            co_await task->abort();
+        });
        co_return json_void();
    });

-    tm::wait_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tm::wait_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) {
-                return task->done().then_wrapped([task] (auto f) {
-                    task->unregister_task();
-                    // done() is called only because we want the task to be complete before getting its status.
-                    // The future should be ignored here as the result does not matter.
-                    f.ignore_ready_future();
-                    return make_foreign(task);
-                });
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
+            return task->done().then_wrapped([task] (auto f) {
+                task->unregister_task();
+                f.ignore_ready_future();  // done() is awaited only for completion; f.get() would rethrow a failed task's error and skip the status retrieval below
+                return make_foreign(task);
+            });
+        }));
        auto s = co_await retrieve_status(task);
        co_return make_status(s);
    });

-    tm::get_task_status_recursively.set(r, [&_tm = tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& tm = _tm;
+    tm::get_task_status_recursively.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto& _ctx = ctx;
        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
        std::queue<tasks::task_manager::foreign_task_ptr> q;
        utils::chunked_vector<full_task_status> res;

-        tasks::task_manager::foreign_task_ptr task;
-        try {
-            // Get requested task.
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
-                }
-                co_return task;
-            }));
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        // Get requested task.
+        auto task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
+            auto state = task->get_status().state;
+            if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
+                task->unregister_task();
+            }
+            co_return task;
+        }));

        // Push children's statuses in BFS order.
        q.push(co_await task.copy());   // Task cannot be moved since we need it to be alive during whole loop execution.
@@ -255,23 +228,9 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>

    tm::get_and_update_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        uint32_t ttl = cfg.task_ttl_seconds();
-        try {
-            co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
-        } catch (...) {
-            throw bad_param_exception(fmt::format("{}", std::current_exception()));
-        }
+        co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
        co_return json::json_return_type(ttl);
    });
 }

-void unset_task_manager(http_context& ctx, routes& r) {
-    tm::get_modules.unset(r);
-    tm::get_tasks.unset(r);
-    tm::get_task_status.unset(r);
-    tm::abort_task.unset(r);
-    tm::wait_task.unset(r);
-    tm::get_task_status_recursively.unset(r);
-    tm::get_and_update_ttl.unset(r);
-}
-
 }
--- a/api/task_manager.hh
+++ b/api/task_manager.hh
@@ -8,17 +8,11 @@

 #pragma once

-#include <seastar/core/sharded.hh>
 #include "api.hh"
 #include "db/config.hh"

-namespace tasks {
-    class task_manager;
-}
-
 namespace api {

-void set_task_manager(http_context& ctx, httpd::routes& r, sharded<tasks::task_manager>& tm, db::config& cfg);
-void unset_task_manager(http_context& ctx, httpd::routes& r);
+void set_task_manager(http_context& ctx, httpd::routes& r, db::config& cfg);

 }
--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -20,17 +20,17 @@ namespace tmt = httpd::task_manager_test_json;
 using namespace json;
 using namespace seastar::httpd;

-void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_manager>& tm) {
-    tmt::register_test_module.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        co_await tm.invoke_on_all([] (tasks::task_manager& tm) {
+void set_task_manager_test(http_context& ctx, routes& r) {
+    tmt::register_test_module.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        co_await ctx.tm.invoke_on_all([] (tasks::task_manager& tm) {
            auto m = make_shared<tasks::test_module>(tm);
            tm.register_module("test", m);
        });
        co_return json_void();
    });

-    tmt::unregister_test_module.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        co_await tm.invoke_on_all([] (tasks::task_manager& tm) -> future<> {
+    tmt::unregister_test_module.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        co_await ctx.tm.invoke_on_all([] (tasks::task_manager& tm) -> future<> {
            auto module_name = "test";
            auto module = tm.find_module(module_name);
            co_await module->stop();
@@ -38,8 +38,8 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        co_return json_void();
    });

-    tmt::register_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        sharded<tasks::task_manager>& tms = tm;
+    tmt::register_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        sharded<tasks::task_manager>& tms = ctx.tm;
        auto it = req->query_parameters.find("task_id");
        auto id = it != req->query_parameters.end() ? tasks::task_id{utils::UUID{it->second}} : tasks::task_id::create_null_id();
        it = req->query_parameters.find("shard");
@@ -54,7 +54,7 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        tasks::task_info data;
        if (it != req->query_parameters.end()) {
            data.id = tasks::task_id{utils::UUID{it->second}};
-            auto parent_ptr = co_await tasks::task_manager::lookup_task_on_all_shards(tm, data.id);
+            auto parent_ptr = co_await tasks::task_manager::lookup_task_on_all_shards(ctx.tm, data.id);
            data.shard = parent_ptr->get_status().shard;
        }

@@ -69,50 +69,34 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        co_return id.to_sstring();
    });

-    tmt::unregister_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tmt::unregister_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->query_parameters["task_id"]}};
-        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                tasks::test_task test_task{task};
-                co_await test_task.unregister_task();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
+            tasks::test_task test_task{task};
+            co_await test_task.unregister_task();
+        });
        co_return json_void();
    });

-    tmt::finish_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    tmt::finish_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
        auto it = req->query_parameters.find("error");
        bool fail = it != req->query_parameters.end();
        std::string error = fail ? it->second : "";

-        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
-                tasks::test_task test_task{task};
-                if (fail) {
-                    test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
-                } else {
-                    test_task.finish();
-                }
-                return make_ready_future<>();
-            });
-        } catch (tasks::task_manager::task_not_found& e) {
-            throw bad_param_exception(e.what());
-        }
+        co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
+            tasks::test_task test_task{task};
+            if (fail) {
+                test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
+            } else {
+                test_task.finish();
+            }
+            return make_ready_future<>();
+        });
        co_return json_void();
    });
 }

-void unset_task_manager_test(http_context& ctx, routes& r) {
-    tmt::register_test_module.unset(r);
-    tmt::unregister_test_module.unset(r);
-    tmt::register_test_task.unset(r);
-    tmt::unregister_test_task.unset(r);
-    tmt::finish_test_task.unset(r);
-}
-
 }

 #endif
--- a/api/task_manager_test.hh
+++ b/api/task_manager_test.hh
@@ -10,17 +10,11 @@

 #pragma once

-#include <seastar/core/sharded.hh>
 #include "api.hh"

-namespace tasks {
-class task_manager;
-}
-
 namespace api {

-void set_task_manager_test(http_context& ctx, httpd::routes& r, sharded<tasks::task_manager>& tm);
-void unset_task_manager_test(http_context& ctx, httpd::routes& r);
+void set_task_manager_test(http_context& ctx, httpd::routes& r);

 }

--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -7,7 +7,6 @@ target_sources(scylla_auth
    allow_all_authorizer.cc
    authenticated_user.cc
    authenticator.cc
-    certificate_authenticator.cc
    common.cc
    default_authorizer.cc
    password_authenticator.cc
@@ -31,7 +30,6 @@ target_link_libraries(scylla_auth
  PRIVATE
    cql3
    idl
-    wasmtime_bindings
-    libxcrypt::libxcrypt)
+    wasmtime_bindings)

 add_whole_archive(auth scylla_auth)
--- a/auth/authenticator.cc
+++ b/auth/authenticator.cc
@@ -18,7 +18,3 @@

 const sstring auth::authenticator::USERNAME_KEY("username");
 const sstring auth::authenticator::PASSWORD_KEY("password");
-
-future<std::optional<auth::authenticated_user>> auth::authenticator::authenticate(session_dn_func) const {
-    return make_ready_future<std::optional<auth::authenticated_user>>(std::nullopt);
-}
--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -15,8 +15,6 @@
 #include <set>
 #include <stdexcept>
 #include <unordered_map>
-#include <optional>
-#include <functional>

 #include <seastar/core/enum.hh>
 #include <seastar/core/future.hh>
@@ -38,16 +36,6 @@ namespace auth {

 class authenticated_user;

-// Query alt name info as a single (subject style) string
-using alt_name_func = std::function<future<std::string>()>;
-
-struct certificate_info {
-    std::string subject;
-    alt_name_func get_alt_names;
-};
-
-using session_dn_func = std::function<future<std::optional<certificate_info>>()>;
-
 ///
 /// Abstract client for authenticating role identity.
 ///
@@ -99,13 +87,6 @@ public:
    ///
    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const = 0;

-    ///
-    /// Authenticate (early) using transport info
-    ///
-    /// \returns nullopt if not supported/required. exceptional future if failed
-    ///
-    virtual future<std::optional<authenticated_user>> authenticate(session_dn_func) const;
-
    ///
    /// Create an authentication record for a new user. This is required before the user can log-in.
    ///
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -1,181 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#include "auth/certificate_authenticator.hh"
-
-#include <regex>
-
-#include "utils/class_registrator.hh"
-#include "data_dictionary/data_dictionary.hh"
-#include "cql3/query_processor.hh"
-#include "db/config.hh"
-
-static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
-const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);
-
-static logging::logger clogger("certificate_authenticator");
-
-static const std::string cfg_source_attr = "source";
-static const std::string cfg_query_attr = "query";
-
-static const std::string cfg_source_subject = "SUBJECT";
-static const std::string cfg_source_altname = "ALTNAME";
-
-static const class_registrator<auth::authenticator
-    , auth::certificate_authenticator
-    , cql3::query_processor&
-    , ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
-
-enum class auth::certificate_authenticator::query_source {
-    subject, altname
-};
-
-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::migration_manager&)
-    : _queries([&] {
-        auto& conf = qp.db().get_config();
-        auto queries = conf.auth_certificate_role_queries();
-
-        if (queries.empty()) {
-            throw std::invalid_argument("No role extraction queries specified.");
-        }
-
-        std::vector<std::pair<query_source, boost::regex>> res;
-
-        for (auto& map : queries) {
-            // first, check for any invalid config keys
-            if (map.size() == 2) {
-                try {
-                    auto& source = map.at(cfg_source_attr);
-                    std::string query = map.at(cfg_query_attr);
-
-                    std::transform(source.begin(), source.end(), source.begin(), ::toupper);
-
-                    boost::regex ex(query);
-                    if (ex.mark_count() != 1) {
-                        throw std::invalid_argument("Role query must have exactly one mark expression");
-                    }
-
-                    clogger.debug("Append role query: {} : {}", source, query);
-
-                    if (source == cfg_source_subject) {
-                        res.emplace_back(query_source::subject, std::move(ex));
-                    } else if (source == cfg_source_altname) {
-                        res.emplace_back(query_source::altname, std::move(ex));
-                    } else {
-                        throw std::invalid_argument(fmt::format("Invalid source: {}", map.at(cfg_source_attr)));
-                    }
-                    continue;
-                } catch (std::out_of_range&) {
-                    // just fallthrough
-                } catch (std::regex_error&) {
-                    std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
-                }
-            }
-            throw std::invalid_argument(fmt::format("Invalid query: {}", map));
-        }
-        return res;
-    }())
-{}
-
-auth::certificate_authenticator::~certificate_authenticator() = default;
-
-future<> auth::certificate_authenticator::start() {
-    co_return;
-}
-
-future<> auth::certificate_authenticator::stop() {
-    co_return;
-}
-
-std::string_view auth::certificate_authenticator::qualified_java_name() const {
-    return certificate_authenticator_name;
-}
-
-bool auth::certificate_authenticator::require_authentication() const {
-    return true;
-}
-
-auth::authentication_option_set auth::certificate_authenticator::supported_options() const {
-    return {};
-}
-
-auth::authentication_option_set auth::certificate_authenticator::alterable_options() const {
-    return {};
-}
-
-future<std::optional<auth::authenticated_user>> auth::certificate_authenticator::authenticate(session_dn_func f) const {
-    if (!f) {
-        co_return std::nullopt;
-    }
-    auto dninfo = co_await f();
-    if (!dninfo) {
-        throw exceptions::authentication_exception("No valid certificate found");
-    }
-
-    auto& subject = dninfo->subject;
-    std::optional<std::string> altname ;
-
-    const std::string* source_str = nullptr;
-
-    for (auto& [source, expr] : _queries) {
-        switch (source) {
-            default:
-            case query_source::subject:
-                source_str = &subject;
-                break;
-            case query_source::altname:
-                if (!altname) {
-                    altname = dninfo->get_alt_names ? co_await dninfo->get_alt_names() : std::string{};
-                }
-                source_str = &*altname;
-                break;
-        }
-
-        clogger.debug("Checking {}: {}", int(source), *source_str);
-
-        boost::smatch m;
-        if (boost::regex_search(*source_str, m, expr)) {
-            auto username = m[1].str();
-            clogger.debug("Return username: {}", username);
-            co_return username;
-        }
-    }
-    throw exceptions::authentication_exception(format("Subject '{}'/'{}' does not match any query expression", subject, altname));
-}
-
-
-future<auth::authenticated_user> auth::certificate_authenticator::authenticate(const credentials_map&) const {
-    throw exceptions::authentication_exception("Cannot authenticate using attribute map");
-}
-
-future<> auth::certificate_authenticator::create(std::string_view role_name, const authentication_options& options) const {
-    // TODO: should we keep track of roles/enforce existence? Role manager should deal with this...
-    co_return;
-}
-
-future<> auth::certificate_authenticator::alter(std::string_view role_name, const authentication_options& options) const {
-    co_return;
-}
-
-future<> auth::certificate_authenticator::drop(std::string_view role_name) const {
-    co_return;
-}
-
-future<auth::custom_options> auth::certificate_authenticator::query_custom_options(std::string_view) const {
-    co_return auth::custom_options{};
-}
-
-const auth::resource_set& auth::certificate_authenticator::protected_resources() const {
-    static const resource_set resources;
-    return resources;
-}
-
-::shared_ptr<auth::sasl_challenge> auth::certificate_authenticator::new_sasl_challenge() const {
-    throw exceptions::authentication_exception("Login authentication not supported");
-}
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- */
-
-/*
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-#pragma once
-
-#include <boost/regex.hpp>
-#include "auth/authenticator.hh"
-
-namespace cql3 {
-
-class query_processor;
-
-} // namespace cql3
-
-namespace service {
-class migration_manager;
-}
-
-namespace auth {
-
-extern const std::string_view certificate_authenticator_name;
-
-class certificate_authenticator : public authenticator {
-    enum class query_source;
-    std::vector<std::pair<query_source, boost::regex>> _queries;
-public:
-    certificate_authenticator(cql3::query_processor&, ::service::migration_manager&);
-    ~certificate_authenticator();
-
-    future<> start() override;
-    future<> stop() override;
-
-    std::string_view qualified_java_name() const override;
-
-    bool require_authentication() const override;
-
-    authentication_option_set supported_options() const override;
-    authentication_option_set alterable_options() const override;
-
-    future<authenticated_user> authenticate(const credentials_map& credentials) const override;
-    future<std::optional<authenticated_user>> authenticate(session_dn_func) const override;
-
-    future<> create(std::string_view role_name, const authentication_options& options) const override;
-    future<> alter(std::string_view role_name, const authentication_options& options) const override;
-    future<> drop(std::string_view role_name) const override;
-
-    future<custom_options> query_custom_options(std::string_view role_name) const override;
-
-    const resource_set& protected_resources() const override;
-
-    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override;
-private:
-};
-
-}
-
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -71,8 +71,7 @@ static future<> create_metadata_table_if_missing_impl(
        auto group0_guard = co_await mm.start_group0_operation();
        auto ts = group0_guard.write_timestamp();
        try {
-            co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
-                    std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
+            co_return co_await mm.announce(co_await mm.prepare_new_column_family_announcement(table, ts), std::move(group0_guard));
        } catch (exceptions::already_exists_exception&) {}
    }
 }
@@ -85,6 +84,20 @@ future<> create_metadata_table_if_missing(
    return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
 }

+future<> wait_for_schema_agreement(::service::migration_manager& mm, const replica::database& db, seastar::abort_source& as) {
+    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
+
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != replica::database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
+    });
+}
+
 ::service::query_state& internal_distributed_query_state() noexcept {
 #ifdef DEBUG
    // Give the much slower debug tests more headroom for completing auth queries.
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -22,6 +22,7 @@
 #include "log.hh"
 #include "seastarx.hh"
 #include "utils/exponential_backoff_retry.hh"
+#include "service/query_state.hh"

 using namespace std::chrono_literals;

@@ -31,7 +32,6 @@ class database;

 namespace service {
 class migration_manager;
-class query_state;
 }

 namespace cql3 {
@@ -67,6 +67,8 @@ future<> create_metadata_table_if_missing(
        std::string_view cql,
        ::service::migration_manager&) noexcept;

+future<> wait_for_schema_agreement(::service::migration_manager&, const replica::database&, seastar::abort_source&);
+
 ///
 /// Time-outs for internal, non-local CQL queries.
 ///
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -129,7 +129,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -29,7 +29,6 @@
 #include "utils/class_registrator.hh"
 #include "replica/database.hh"
 #include "cql3/query_processor.hh"
-#include "db/config.hh"

 namespace auth {

@@ -51,23 +50,14 @@ static const class_registrator<

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

-static std::string_view get_config_value(std::string_view value, std::string_view def) {
-    return value.empty() ? def : value;
-}
-
-std::string password_authenticator::default_superuser(const db::config& cfg) {
-    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
-}
-
 password_authenticator::~password_authenticator() {
 }

 password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::migration_manager& mm)
    : _qp(qp)
    , _migration_manager(mm)
-    , _stopped(make_ready_future<>()) 
-    , _superuser(default_superuser(qp.db().get_config()))
-{}
+    , _stopped(make_ready_future<>()) {
+}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
@@ -116,17 +106,13 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 }

 future<> password_authenticator::create_default_if_missing() const {
-    return default_role_row_satisfies(_qp, &has_salted_hash, _superuser).then([this](bool exists) {
+    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
-            if (salted_pwd.empty()) {
-                salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
-            }
            return _qp.execute_internal(
                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_query_state(),
-                    {salted_pwd, _superuser},
+                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME},
                    cql3::query_processor::cache_internal::no).then([](auto&&) {
                plogger.info("Created default superuser authentication record.");
            });
@@ -146,9 +132,9 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

-                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get0()) {
+                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
                         plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
                     }
@@ -175,8 +161,6 @@ future<> password_authenticator::stop() {
 }

 db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
-    // TODO: this is plain dung. Why treat hardcoded default special, but for example a user-created
-    // super user uses plain LOCAL_ONE?
    if (role_name == DEFAULT_USER_NAME) {
        return db::consistency_level::QUORUM;
    }
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -14,10 +14,6 @@

 #include "auth/authenticator.hh"

-namespace db {
-    class config;
-}
-
 namespace cql3 {

 class query_processor;
@@ -37,11 +33,9 @@ class password_authenticator : public authenticator {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    seastar::abort_source _as;
-    std::string _superuser;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
-    static std::string default_superuser(const db::config&);

    password_authenticator(cql3::query_processor&, ::service::migration_manager&);

--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -46,43 +46,60 @@ constexpr std::string_view qualified_name("system_auth.roles");

 future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
+        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

-    for (auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
-        auto results = co_await qp.execute_internal(query, cl
-            , internal_distributed_query_state()
-            , {rolename.value_or(std::string(meta::DEFAULT_SUPERUSER_NAME))}
-            , cql3::query_processor::cache_internal::yes
-            );
-        if (!results->empty()) {
-            co_return p(results->one());
-        }
-    }
-    co_return false;
+    return do_with(std::move(p), [&qp](const auto& p) {
+        return qp.execute_internal(
+                query,
+                db::consistency_level::ONE,
+                internal_distributed_query_state(),
+                {meta::DEFAULT_SUPERUSER_NAME},
+                cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
+            if (results->empty()) {
+                return qp.execute_internal(
+                        query,
+                        db::consistency_level::QUORUM,
+                        internal_distributed_query_state(),
+                        {meta::DEFAULT_SUPERUSER_NAME},
+                        cql3::query_processor::cache_internal::yes).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
+                    if (results->empty()) {
+                        return make_ready_future<bool>(false);
+                    }
+
+                    return make_ready_future<bool>(p(results->one()));
+                });
+            }
+
+            return make_ready_future<bool>(p(results->one()));
+        });
+    });
 }

 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
+        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name);

-    auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
-        , internal_distributed_query_state(), cql3::query_processor::cache_internal::no
-        );
-    if (results->empty()) {
-        co_return false;
-    }
-    static const sstring col_name = sstring(meta::roles_table::role_col_name);
+    return do_with(std::move(p), [&qp](const auto& p) {
+        return qp.execute_internal(
+                query,
+                db::consistency_level::QUORUM,
+                internal_distributed_query_state(),
+                cql3::query_processor::cache_internal::no).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
+            if (results->empty()) {
+                return false;
+            }

-    co_return boost::algorithm::any_of(*results, [&](const cql3::untyped_result_set_row& row) {
-        auto superuser = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;
-        const bool is_nondefault = row.get_as<sstring>(col_name) != superuser;
-        return is_nondefault && p(row);
+            static const sstring col_name = sstring(meta::roles_table::role_col_name);
+
+            return boost::algorithm::any_of(*results, [&p](const cql3::untyped_result_set_row& row) {
+                const bool is_nondefault = row.get_as<sstring>(col_name) != meta::DEFAULT_SUPERUSER_NAME;
+                return is_nondefault && p(row);
+            });
+        });
    });
 }

--- a/auth/roles-metadata.hh
+++ b/auth/roles-metadata.hh
@@ -43,17 +43,13 @@ constexpr std::string_view role_col_name{"role", 4};
 ///
 future<bool> default_role_row_satisfies(
        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
+        std::function<bool(const cql3::untyped_result_set_row&)>);

 ///
 /// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
 ///
 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
+        std::function<bool(const cql3::untyped_result_set_row&)>);

 }
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -178,8 +178,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c
                    opts,
                    true);

-            co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
-                    std::move(group0_guard), format("auth_service: create {} keyspace", meta::AUTH_KS));
+            co_return co_await mm.announce(mm.prepare_new_keyspace_announcement(ksm, ts), std::move(group0_guard));
        }
    }
 }
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -28,8 +28,6 @@
 #include "log.hh"
 #include "utils/class_registrator.hh"
 #include "replica/database.hh"
-#include "service/migration_manager.hh"
-#include "password_authenticator.hh"

 namespace auth {

@@ -129,13 +127,6 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
 }

-standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm)
-    : _qp(qp)
-    , _migration_manager(mm)
-    , _stopped(make_ready_future<>())
-    , _superuser(password_authenticator::default_superuser(qp.db().get_config()))
-{}
-
 std::string_view standard_role_manager::qualified_java_name() const noexcept {
    return "org.apache.cassandra.auth.CassandraRoleManager";
 }
@@ -177,7 +168,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
 }

 future<> standard_role_manager::create_default_role_if_missing() const {
-    return default_role_row_satisfies(_qp, &has_can_login, _superuser).then([this](bool exists) {
+    return default_role_row_satisfies(_qp, &has_can_login).then([this](bool exists) {
        if (!exists) {
            static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, true, true)",
                    meta::roles_table::qualified_name,
@@ -187,9 +178,9 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_query_state(),
-                    {_superuser},
-                    cql3::query_processor::cache_internal::no).then([this](auto&&) {
-                log.info("Created default superuser role '{}'.", _superuser);
+                    {meta::DEFAULT_SUPERUSER_NAME},
+                    cql3::query_processor::cache_internal::no).then([](auto&&) {
+                log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
                return make_ready_future<>();
            });
        }
@@ -241,7 +232,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -34,10 +34,13 @@ class standard_role_manager final : public role_manager {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    seastar::abort_source _as;
-    std::string _superuser;

 public:
-    standard_role_manager(cql3::query_processor&, ::service::migration_manager&);
+    standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm)
+            : _qp(qp)
+            , _migration_manager(mm)
+            , _stopped(make_ready_future<>()) {
+    }

    virtual std::string_view qualified_java_name() const noexcept override;

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -37,8 +37,10 @@
 // The constants q1 and q2 are used to determine the proportional factor at each stage.
 class backlog_controller {
 public:
-    using scheduling_group = seastar::scheduling_group;
-
+    struct scheduling_group {
+        seastar::scheduling_group cpu = default_scheduling_group();
+        seastar::io_priority_class io = default_priority_class();
+    };
    future<> shutdown() {
        _update_timer.cancel();
        return std::move(_inflight_update);
@@ -56,11 +58,11 @@ protected:
    };

    scheduling_group _scheduling_group;
+    timer<> _update_timer;

    std::vector<control_point> _control_points;

    std::function<float()> _current_backlog;
-    timer<> _update_timer;
    // updating shares for an I/O class may contact another shard and returns a future.
    future<> _inflight_update;

@@ -80,9 +82,9 @@ protected:
                       std::vector<control_point> control_points, std::function<float()> backlog,
                       float static_shares = 0)
        : _scheduling_group(std::move(sg))
+        , _update_timer([this] { adjust(); })
        , _control_points()
        , _current_backlog(std::move(backlog))
-        , _update_timer([this] { adjust(); })
        , _inflight_update(make_ready_future<>())
        , _static_shares(static_shares)
    {
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2023-present ScyllaDB
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-here=$(dirname "$0")
-exec "$here/../tools/cqlsh/bin/cqlsh" "$@"
-
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -110,9 +110,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    flat_mutation_reader_v2* _underlying = nullptr;
    flat_mutation_reader_v2_opt _underlying_holder;

-    gc_clock::time_point _read_time;
-    gc_clock::time_point _gc_before;
-
    future<> do_fill_buffer();
    future<> ensure_underlying();
    void copy_from_cache_to_buffer();
@@ -181,20 +178,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
    const schema& table_schema() {
        return *_snp->schema();
    }
-
-    gc_clock::time_point get_read_time() {
-        return _read_context.tombstone_gc_state() ? gc_clock::now() : gc_clock::time_point::min();
-    }
-
-    gc_clock::time_point get_gc_before(const schema& schema, dht::decorated_key dk, const gc_clock::time_point query_time) {
-        auto gc_state = _read_context.tombstone_gc_state();
-        if (gc_state) {
-            return gc_state->get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
-        }
-
-        return gc_clock::time_point::min();
-    }
-
 public:
    cache_flat_mutation_reader(schema_ptr s,
                               dht::decorated_key dk,
@@ -213,8 +196,6 @@ public:
        , _read_context_holder()
        , _read_context(ctx)    // ctx is owned by the caller, who's responsible for closing it.
        , _next_row(*_schema, *_snp, false, _read_context.is_reversed())
-        , _read_time(get_read_time())
-        , _gc_before(get_gc_before(*_schema, dk, _read_time))
    {
        clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
                      fmt::ptr(&*_snp));
@@ -749,51 +730,9 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
        }
    }

+    // We add the row to the buffer even when it's full.
+    // This simplifies the code. For more info see #3139.
    if (_next_row_in_range) {
-        bool remove_row = false;
-
-        if (_read_context.tombstone_gc_state() // do not compact rows when tombstone_gc_state is not set (used in some unit tests)
-            && !_next_row.dummy()
-            && _snp->at_latest_version()
-            && _snp->at_oldest_version()) {
-            deletable_row& row = _next_row.latest_row();
-            tombstone range_tomb = _next_row.range_tombstone_for_row();
-            auto t = row.deleted_at();
-            t.apply(range_tomb);
-
-            auto row_tomb_expired = [&](row_tombstone tomb) {
-                return (tomb && tomb.max_deletion_time() < _gc_before);
-            };
-
-            auto is_row_dead = [&](const deletable_row& row) {
-                auto& m = row.marker();
-                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
-            };
-
-            if (row_tomb_expired(t) || is_row_dead(row)) {
-                can_gc_fn always_gc = [&](tombstone) { return true; };
-                const schema& row_schema = _next_row.latest_row_schema();
-
-                _read_context.cache()._tracker.on_row_compacted();
-
-                with_allocator(_snp->region().allocator(), [&] {
-                    deletable_row row_copy(row_schema, row);
-                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
-                    std::swap(row, row_copy);
-                });
-                remove_row = row.empty();
-
-                auto tomb_expired = [&](tombstone tomb) {
-                    return (tomb && tomb.deletion_time < _gc_before);
-                };
-
-                auto latests_range_tomb = _next_row.get_iterator_in_latest_version()->range_tombstone();
-                if (tomb_expired(latests_range_tomb)) {
-                    _next_row.get_iterator_in_latest_version()->set_range_tombstone({});
-                }
-            }
-        }
-
        if (_next_row.range_tombstone_for_row() != _current_tombstone) [[unlikely]] {
            auto tomb = _next_row.range_tombstone_for_row();
            auto new_lower_bound = position_in_partition::before_key(_next_row.position());
@@ -803,31 +742,8 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
            _current_tombstone = tomb;
            _read_context.cache()._tracker.on_range_tombstone_read();
        }
-
-        if (remove_row) {
-            _read_context.cache()._tracker.on_row_compacted_away();
-
-            _lower_bound = position_in_partition::after_key(*_schema, _next_row.position());
-
-            partition_snapshot_row_weakref row_ref(_next_row);
-            move_to_next_entry();
-
-            with_allocator(_snp->region().allocator(), [&] {
-                cache_tracker& tracker = _read_context.cache()._tracker;
-                if (row_ref->is_linked()) {
-                    tracker.get_lru().remove(*row_ref);
-                }
-                row_ref->on_evicted(tracker);
-            });
-
-            _snp->region().allocator().invalidate_references();
-            _next_row.force_valid();
-        } else {
-            // We add the row to the buffer even when it's full.
-            // This simplifies the code. For more info see #3139.
-            add_to_buffer(_next_row);
-            move_to_next_entry();
-        }
+        add_to_buffer(_next_row);
+        move_to_next_entry();
    } else {
        move_to_next_range();
    }
@@ -978,7 +894,7 @@ void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_curs
    if (!row.dummy()) {
        _read_context.cache().on_row_hit();
        if (_read_context.digest_requested()) {
-            row.latest_row_prepare_hash();
+            row.latest_row().cells().prepare_hash(table_schema(), column_kind::regular_column);
        }
        add_clustering_row_to_buffer(mutation_fragment_v2(*_schema, _permit, row.row()));
    } else {
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -13,7 +13,6 @@
 #include <seastar/core/sleep.hh>
 #include <seastar/core/coroutine.hh>

-#include "gms/endpoint_state.hh"
 #include "keys.hh"
 #include "schema/schema_builder.hh"
 #include "replica/database.hh"
@@ -26,7 +25,6 @@
 #include "gms/inet_address.hh"
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
-#include "utils/error_injection.hh"
 #include "utils/UUID_gen.hh"

 #include "cdc/generation.hh"
@@ -68,10 +66,10 @@ static constexpr auto stream_id_index_shift = stream_id_version_shift + stream_i
 static constexpr auto stream_id_random_shift = stream_id_index_shift + stream_id_index_bits;

 /**
- * Responsibility for encoding stream_id moved from the create_stream_ids
- * function to this constructor, to keep knowledge of composition in a
- * single place. Note the make_new_generation_description function
- * defines the "order" in which we view vnodes etc.
+ * Responsibility for encoding stream_id moved from the factory method to
+ * this constructor, to keep knowledge of composition in a single place.
+ * Note this is private and friended to topology_description_generator,
+ * because it is the one that defines the "order" in which we view vnodes etc.
 */
 stream_id::stream_id(dht::token token, size_t vnode_index)
    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
@@ -155,18 +153,18 @@ bool token_range_description::operator==(const token_range_description& o) const
        && sharding_ignore_msb == o.sharding_ignore_msb;
 }

-topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
+topology_description::topology_description(std::vector<token_range_description> entries)
    : _entries(std::move(entries)) {}

 bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-utils::chunked_vector<token_range_description>&& topology_description::entries() && {
+std::vector<token_range_description>&& topology_description::entries() && {
    return std::move(_entries);
 }

@@ -185,48 +183,98 @@ static std::vector<stream_id> create_stream_ids(
    return result;
 }

+class topology_description_generator final {
+    const std::unordered_set<dht::token>& _bootstrap_tokens;
+    const locator::token_metadata_ptr _tmptr;
+    const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& _get_sharding_info;
+
+    // Compute a set of tokens that split the token ring into vnodes
+    auto get_tokens() const {
+        auto tokens = _tmptr->sorted_tokens();
+        auto it = tokens.insert(
+                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
+        std::sort(it, tokens.end());
+        std::inplace_merge(tokens.begin(), it, tokens.end());
+        tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
+        return tokens;
+    }
+
+    token_range_description create_description(size_t index, dht::token start, dht::token end) const {
+        token_range_description desc;
+
+        desc.token_range_end = end;
+
+        auto [shard_count, ignore_msb] = _get_sharding_info(end);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
+        desc.sharding_ignore_msb = ignore_msb;
+
+        return desc;
+    }
+public:
+    topology_description_generator(
+            const std::unordered_set<dht::token>& bootstrap_tokens,
+            const locator::token_metadata_ptr tmptr,
+            // This function must return sharding parameters for a node that owns the vnode ending with
+            // the given token. Returns <shard_count, ignore_msb> pair.
+            const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info)
+        : _bootstrap_tokens(bootstrap_tokens)
+        , _tmptr(std::move(tmptr))
+        , _get_sharding_info(get_sharding_info)
+    {}
+
+    /*
+     * Generate a set of CDC stream identifiers such that for each shard
+     * and vnode pair there exists a stream whose token falls into this vnode
+     * and is owned by this shard. It is sometimes not possible to generate
+     * a CDC stream identifier for some (vnode, shard) pair because not all
+     * shards have to own tokens in a vnode. A small vnode can be entirely owned
+     * by a single shard. In such a case, a stream identifier that maps to
+     * the end of the vnode is generated.
+     *
+     * Then build a cdc::topology_description which maps tokens to generated
+     * stream identifiers, such that if token T is owned by shard S in vnode V,
+     * it gets mapped to the stream identifier generated for (S, V).
+     */
+    // Run in seastar::async context.
+    topology_description generate() const {
+        const auto tokens = get_tokens();
+
+        std::vector<token_range_description> vnode_descriptions;
+        vnode_descriptions.reserve(tokens.size());
+
+        vnode_descriptions.push_back(
+                create_description(0, tokens.back(), tokens.front()));
+        for (size_t idx = 1; idx < tokens.size(); ++idx) {
+            vnode_descriptions.push_back(
+                    create_description(idx, tokens[idx - 1], tokens[idx]));
+        }
+
+        return {std::move(vnode_descriptions)};
+    }
+};
+
 bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
    auto my_host_id = g.get_host_id(me);
-    return g.for_each_endpoint_state_until([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
-        return stop_iteration(my_host_id < g.get_host_id(node));
-    }) == stop_iteration::no;
+    auto& eps = g.get_endpoint_states();
+    return std::none_of(eps.begin(), eps.end(),
+            [&] (const std::pair<gms::inet_address, gms::endpoint_state>& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
 }

-bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
-    if (tm.sorted_tokens().size() != gen.entries().size()) {
-        // We probably have garbage streams from old generations
-        cdc_log.info("Generation size does not match the token ring");
-        return false;
-    } else {
-        std::unordered_set<dht::token> gen_ends;
-        for (const auto& entry : gen.entries()) {
-            gen_ends.insert(entry.token_range_end);
-        }
-        for (const auto& metadata_token : tm.sorted_tokens()) {
-            if (!gen_ends.contains(metadata_token)) {
-                cdc_log.warn("CDC generation missing token {}", metadata_token);
-                return false;
-            }
-        }
-        return true;
-    }
-}
-
-static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutations(
+future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
        schema_ptr s,
-        const partition_key& pkey,
-        noncopyable_function<clustering_key (dht::token)>&& get_ckey_from_range_end,
+        utils::UUID id,
        const cdc::topology_description& desc,
        size_t mutation_size_threshold,
        api::timestamp_type ts) {
    utils::chunked_vector<mutation> res;
-    res.emplace_back(s, pkey);
+    res.emplace_back(s, partition_key::from_singular(*s, id));
+    res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
    size_t size_estimate = 0;
-    size_t total_size_estimate = 0;
    for (auto& e : desc.entries()) {
        if (size_estimate >= mutation_size_threshold) {
-            total_size_estimate += size_estimate;
-            res.emplace_back(s, pkey);
+            res.emplace_back(s, partition_key::from_singular(*s, id));
            size_estimate = 0;
        }

@@ -237,60 +285,16 @@ static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutatio
        }

        size_estimate += e.streams.size() * 20;
-        auto ckey = get_ckey_from_range_end(e.token_range_end);
+        auto ckey = clustering_key::from_singular(*s, dht::token::to_int64(e.token_range_end));
        res.back().set_cell(ckey, to_bytes("streams"), make_set_value(db::cdc_streams_set_type, std::move(streams)), ts);
        res.back().set_cell(ckey, to_bytes("ignore_msb"), int8_t(e.sharding_ignore_msb), ts);

        co_await coroutine::maybe_yield();
    }

-    total_size_estimate += size_estimate;
-
-    // Copy mutations n times, where n is picked so that the memory size of all mutations together exceeds `max_command_size`.
-    utils::get_local_injector().inject("cdc_generation_mutations_replication", [&res, total_size_estimate, mutation_size_threshold] {
-        utils::chunked_vector<mutation> new_res;
-
-        size_t number_of_copies = (mutation_size_threshold / total_size_estimate + 1) * 2;
-        for (size_t i = 0; i < number_of_copies; ++i) {
-            std::copy(res.begin(), res.end(), std::back_inserter(new_res));
-        }
-
-        res = std::move(new_res);
-    });
-
    co_return res;
 }

-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
-        schema_ptr s,
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        size_t mutation_size_threshold,
-        api::timestamp_type ts) {
-    auto pkey = partition_key::from_singular(*s, id);
-    auto get_ckey = [s] (dht::token range_end) {
-        return clustering_key::from_singular(*s, dht::token::to_int64(range_end));
-    };
-
-    auto res = co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
-    res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
-    co_return res;
-}
-
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
-        schema_ptr s,
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        size_t mutation_size_threshold,
-        api::timestamp_type ts) {
-    auto pkey = partition_key::from_singular(*s, CDC_GENERATIONS_V3_KEY);
-    auto get_ckey = [&] (dht::token range_end) {
-        return clustering_key::from_exploded(*s, {timeuuid_type->decompose(id), long_type->decompose(dht::token::to_int64(range_end))}) ;
-    };
-
-    co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
-}
-
 // non-static for testing
 size_t limit_of_streams_in_topology_description() {
    // Each stream takes 16B and we don't want to exceed 4MB so we can have
@@ -323,47 +327,13 @@ topology_description limit_number_of_streams_if_needed(topology_description&& de
    return topology_description(std::move(entries));
 }

-// Compute a set of tokens that split the token ring into vnodes.
-static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
-    auto tokens = tmptr->sorted_tokens();
-    auto it = tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
-    std::sort(it, tokens.end());
-    std::inplace_merge(tokens.begin(), it, tokens.end());
-    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
-    return tokens;
-}
-
-static token_range_description create_token_range_description(
-        size_t index,
-        dht::token start,
-        dht::token end,
-        const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info) {
-    token_range_description desc;
-
-    desc.token_range_end = end;
-
-    auto [shard_count, ignore_msb] = get_sharding_info(end);
-    desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
-    desc.sharding_ignore_msb = ignore_msb;
-
-    return desc;
-}
-
-cdc::topology_description make_new_generation_description(
+std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
        const std::unordered_set<dht::token>& bootstrap_tokens,
        const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
        const locator::token_metadata_ptr tmptr) {
-    const auto tokens = get_tokens(bootstrap_tokens, tmptr);
-
-    utils::chunked_vector<token_range_description> vnode_descriptions;
-    vnode_descriptions.reserve(tokens.size());
-
-    vnode_descriptions.push_back(create_token_range_description(0, tokens.back(), tokens.front(), get_sharding_info));
-    for (size_t idx = 1; idx < tokens.size(); ++idx) {
-        vnode_descriptions.push_back(create_token_range_description(idx, tokens[idx - 1], tokens[idx], get_sharding_info));
-    }
-
-    return {std::move(vnode_descriptions)};
+    auto gen = topology_description_generator(bootstrap_tokens, tmptr, get_sharding_info).generate();
+    auto uuid = utils::make_random_uuid();
+    return {uuid, std::move(gen)};
 }

 db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milliseconds ring_delay) {
@@ -395,9 +365,7 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
            return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
        }
    };
-
-    auto uuid = utils::make_random_uuid();
-    auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
+    auto [uuid, gen] = make_new_generation_data(bootstrap_tokens, get_sharding_info, tmptr);

    // Our caller should ensure that there are normal tokens in the token ring.
    auto normal_token_owners = tmptr->count_normal_token_owners();
@@ -451,12 +419,8 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
 * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
 * which means it will gossip the generation's timestamp.
 */
-static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::endpoint_state& eps) {
-    const auto* gen_id_ptr = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
-    if (!gen_id_ptr) {
-        return std::nullopt;
-    }
-    auto gen_id_string = gen_id_ptr->value();
+static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto gen_id_string = g.get_application_state_value(endpoint, gms::application_state::CDC_GENERATION_ID);
    cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, gen_id_string);
    return gms::versioned_value::cdc_generation_id_from_string(gen_id_string);
 }
@@ -660,21 +624,21 @@ future<> generation_service::maybe_rewrite_streams_descriptions() {

    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
    std::vector<time_and_ttl> times_and_ttls;
-    _db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
-        auto& s = *t->schema();
+    for (auto& [_, cf] : _db.get_column_families()) {
+        auto& s = *cf->schema();
        auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
        if (!base) {
            // Not a CDC log table.
-            return;
+            continue;
        }
        auto& cdc_opts = base->cdc_options();
        if (!cdc_opts.enabled()) {
            // This table is named like a CDC log table but it's not one.
-            return;
+            continue;
        }

        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
-    });
+    }

    if (times_and_ttls.empty()) {
        // There's no point in rewriting old generations' streams (they don't contain any data).
@@ -762,8 +726,8 @@ future<> generation_service::stop() {
        cdc_log.error("CDC stream rewrite failed: ", std::current_exception());
    }

-    if (_joined && (this_shard_id() == 0)) {
-        co_await leave_ring();
+    if (this_shard_id() == 0) {
+        co_await _gossiper.unregister_(shared_from_this());
    }

    _stopped = true;
@@ -775,6 +739,7 @@ generation_service::~generation_service() {

 future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
    assert_shard_zero(__PRETTY_FUNCTION__);
+    assert(_sys_ks.local().bootstrap_complete());

    _gen_id = std::move(startup_gen_id);
    _gossiper.register_(shared_from_this());
@@ -792,24 +757,18 @@ future<> generation_service::after_join(std::optional<cdc::generation_id>&& star
    _cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
 }

-future<> generation_service::leave_ring() {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-    _joined = false;
-    co_await _gossiper.unregister_(shared_from_this());
-}
-
-future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
+future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state ep_state) {
    assert_shard_zero(__PRETTY_FUNCTION__);

-    auto val = ep_state->get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
+    auto val = ep_state.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
    if (!val) {
        return make_ready_future();
    }

-    return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val, pid);
+    return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val);
 }

-future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v, gms::permit_id) {
+future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v) {
    assert_shard_zero(__PRETTY_FUNCTION__);

    if (app_state != gms::application_state::CDC_GENERATION_ID) {
@@ -829,21 +788,22 @@ future<> generation_service::check_and_repair_cdc_streams() {
    }

    std::optional<cdc::generation_id> latest = _gen_id;
-    _gossiper.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& state) {
+    const auto& endpoint_states = _gossiper.get_endpoint_states();
+    for (const auto& [addr, state] : endpoint_states) {
        if (_gossiper.is_left(addr)) {
            cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
-            return;
+            continue;
        }
        if (!_gossiper.is_normal(addr)) {
            throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
        }

-        const auto gen_id = get_generation_id_for(addr, state);
+        const auto gen_id = get_generation_id_for(addr, _gossiper);
        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
            latest = gen_id;
        }
-    });
+    }

    auto tmptr = _token_metadata.get();
    auto sys_dist_ks = get_sys_dist_ks();
@@ -898,9 +858,24 @@ future<> generation_service::check_and_repair_cdc_streams() {
                " even though some node gossiped about it.",
                latest, db_clock::now());
            should_regenerate = true;
-        } else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
-            should_regenerate = true;
-            cdc_log.info("CDC generation {} needs repair, regenerating", latest);
+        } else {
+            if (tmptr->sorted_tokens().size() != gen->entries().size()) {
+                // We probably have garbage streams from old generations
+                cdc_log.info("Generation size does not match the token ring, regenerating");
+                should_regenerate = true;
+            } else {
+                std::unordered_set<dht::token> gen_ends;
+                for (const auto& entry : gen->entries()) {
+                    gen_ends.insert(entry.token_range_end);
+                }
+                for (const auto& metadata_token : tmptr->sorted_tokens()) {
+                    if (!gen_ends.contains(metadata_token)) {
+                        cdc_log.warn("CDC generation {} missing token {}. Regenerating.", latest, metadata_token);
+                        should_regenerate = true;
+                        break;
+                    }
+                }
+            }
        }
    }

@@ -960,13 +935,17 @@ future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::gen
        co_return;
    }

-    if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
-        on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
+    if (!_sys_ks.local().bootstrap_complete() || !_sys_dist_ks.local_is_initialized()
+            || !_sys_dist_ks.local().started()) {
+        // The service should not be listening for generation changes until after the node
+        // is bootstrapped. We therefore used to assume that this condition could never
+        // become true, and called on_internal_error here. It turns out, however, that
+        // it may become true on decommission: the node enters NEEDS_BOOTSTRAP
+        // state before leaving the token ring, so bootstrap_complete() becomes false.
+        // In that case we can simply return.
+        co_return;
    }

-    // The service should not be listening for generation changes until after the node
-    // is bootstrapped and since the node leaves the ring on decommission
-
    if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
        return !svc._cdc_metadata.prepare(ts);
    })) {
@@ -1029,12 +1008,12 @@ future<> generation_service::legacy_scan_cdc_generations() {
    assert_shard_zero(__PRETTY_FUNCTION__);

    std::optional<cdc::generation_id> latest;
-    _gossiper.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
-        auto gen_id = get_generation_id_for(node, eps);
+    for (const auto& ep: _gossiper.get_endpoint_states()) {
+        auto gen_id = get_generation_id_for(ep.first, _gossiper);
        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
            latest = gen_id;
        }
-    });
+    }

    if (latest) {
        cdc_log.info("Latest generation seen during startup: {}", *latest);
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -92,13 +92,13 @@ struct token_range_description {
 * in the `_entries` vector. See the comment above `token_range_description` for explanation.
 */
 class topology_description {
-    utils::chunked_vector<token_range_description> _entries;
+    std::vector<token_range_description> _entries;
 public:
-    topology_description(utils::chunked_vector<token_range_description> entries);
+    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const utils::chunked_vector<token_range_description>& entries() const&;
-    utils::chunked_vector<token_range_description>&& entries() &&;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -133,28 +133,7 @@ public:
 */
 bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);

-/*
- * Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
- * with `token_metadata`.
-*/
-bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm);
-
-/*
- * Generate a set of CDC stream identifiers such that for each shard
- * and vnode pair there exists a stream whose token falls into this vnode
- * and is owned by this shard. It is sometimes not possible to generate
- * a CDC stream identifier for some (vnode, shard) pair because not all
- * shards have to own tokens in a vnode. Small vnode can be totally owned
- * by a single shard. In such case, a stream identifier that maps to
- * end of the vnode is generated.
- *
- * Then build a cdc::topology_description which maps tokens to generated
- * stream identifiers, such that if token T is owned by shard S in vnode V,
- * it gets mapped to the stream identifier generated for (S, V).
- *
- * Run in seastar::async context.
- */
-cdc::topology_description make_new_generation_description(
+std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
    const std::unordered_set<dht::token>& bootstrap_tokens,
    const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info,
    const locator::token_metadata_ptr);
@@ -165,20 +144,9 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
 // using `mutation_size_threshold` to decide on the mutation sizes. The partition key of each mutation
 // is given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
 //
-// Works only for the CDC_GENERATIONS_V2 schema (in system_distributed keyspace).
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
-    schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
-    size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);
-
-// The partition key of all rows in the single-partition CDC_GENERATIONS_V3 schema (in system keyspace).
-static constexpr auto CDC_GENERATIONS_V3_KEY = "cdc_generations";
-
-// Translates the CDC generation data given by a `cdc::topology_description` into a vector of mutations,
-// using `mutation_size_threshold` to decide on the mutation sizes. The first clustering key column is
-// given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
-//
-// Works only for the CDC_GENERATIONS_V3 schema (in system keyspace).
-future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
+// Works only for specific schemas: CDC_GENERATIONS_V2 (in system_distributed_keyspace)
+// and CDC_GENERATIONS_V3 (in system_keyspace).
+future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
    schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
    size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);

--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -98,20 +98,19 @@ public:
     * Must be called on shard 0 - that's where the generation management happens.
     */
    future<> after_join(std::optional<cdc::generation_id>&& startup_gen_id);
-    future<> leave_ring();

    cdc::metadata& get_cdc_metadata() {
        return _cdc_metadata;
    }

-    virtual future<> before_change(gms::inet_address, gms::endpoint_state_ptr, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
-    virtual future<> on_alive(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_dead(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_remove(gms::inet_address, gms::permit_id) override { return make_ready_future(); }
-    virtual future<> on_restart(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
+    virtual future<> before_change(gms::inet_address, gms::endpoint_state, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
+    virtual future<> on_alive(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
+    virtual future<> on_dead(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
+    virtual future<> on_remove(gms::inet_address) override { return make_ready_future(); }
+    virtual future<> on_restart(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }

-    virtual future<> on_join(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override;
-    virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&, gms::permit_id) override;
+    virtual future<> on_join(gms::inet_address, gms::endpoint_state) override;
+    virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&) override;

    future<> check_and_repair_cdc_streams();

--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -160,7 +160,7 @@ public:
        });
    }

-    void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+    void on_before_create_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
        if (schema.cdc_options().enabled()) {
            auto& db = _ctxt._proxy.get_db().local();
            auto logname = log_name(schema.cf_name());
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -40,7 +40,7 @@ static cdc::stream_id get_stream(

 // non-static for testing
 cdc::stream_id get_stream(
-        const utils::chunked_vector<cdc::token_range_description>& entries,
+        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
        on_internal_error(cdc_log, "get_stream: entries empty");
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -21,27 +21,27 @@ public:
            : file_impl(*get_file_impl(f)),  _error_handler(error_handler), _file(f) {
    }

-    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, io_intent* intent) override {
+    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->write_dma(pos, buffer, len, intent);
+            return get_file_impl(_file)->write_dma(pos, buffer, len, pc);
        });
    }

-    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
+    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->write_dma(pos, iov, intent);
+            return get_file_impl(_file)->write_dma(pos, iov, pc);
        });
    }

-    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, io_intent* intent) override {
+    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->read_dma(pos, buffer, len, intent);
+            return get_file_impl(_file)->read_dma(pos, buffer, len, pc);
        });
    }

-    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
+    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->read_dma(pos, iov, intent);
+            return get_file_impl(_file)->read_dma(pos, iov, pc);
        });
    }

@@ -99,9 +99,9 @@ public:
        });
    }

-    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent) override {
+    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
        return do_io_check(_error_handler, [&] {
-            return get_file_impl(_file)->dma_read_bulk(offset, range_size, intent);
+            return get_file_impl(_file)->dma_read_bulk(offset, range_size, pc);
        });
    }
 private:
--- a/cmake/Findrapidxml.cmake
+++ b/cmake/Findrapidxml.cmake
@@ -1,27 +0,0 @@
-#
-# Copyright 2023-present ScyllaDB
-#
-
-#
-# SPDX-License-Identifier: AGPL-3.0-or-later
-#
-find_path(rapidxml_INCLUDE_DIR
-  NAMES rapidxml.h rapidxml/rapidxml.hpp)
-
-mark_as_advanced(
-  rapidxml_INCLUDE_DIR)
-
-include(FindPackageHandleStandardArgs)
-
-find_package_handle_standard_args(rapidxml
-  REQUIRED_VARS
-    rapidxml_INCLUDE_DIR)
-
-if(rapidxml_FOUND)
-  if(NOT TARGET rapidxml::rapidxml)
-    add_library(rapidxml::rapidxml INTERFACE IMPORTED)
-    set_target_properties(rapidxml::rapidxml
-      PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES ${rapidxml_INCLUDE_DIR})
-  endif()
-endif()
--- a/cmake/add_version_library.cmake
+++ b/cmake/add_version_library.cmake
@@ -1,31 +1,20 @@
 ###
 ### Generate version file and supply appropriate compile definitions for release.cc
 ###
-function(generate_scylla_version)
+function(add_version_library name source)
  set(version_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-VERSION-FILE)
  set(release_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-RELEASE-FILE)
-  set(product_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-PRODUCT-FILE)
  execute_process(
    COMMAND ${CMAKE_SOURCE_DIR}/SCYLLA-VERSION-GEN --output-dir "${CMAKE_CURRENT_BINARY_DIR}"
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
-
  file(STRINGS ${version_file} scylla_version)
  file(STRINGS ${release_file} scylla_release)
-  file(STRINGS ${product_file} scylla_product)

-  string(REPLACE "-" "~" scylla_version_tilde ${scylla_version})
-
-  set(Scylla_VERSION "${scylla_version_tilde}" CACHE INTERNAL "")
-  set(Scylla_RELEASE "${scylla_release}" CACHE INTERNAL "")
-  set(Scylla_PRODUCT "${scylla_product}" CACHE INTERNAL "")
-endfunction(generate_scylla_version)
-
-function(add_version_library name source)
  add_library(${name} OBJECT ${source})
  target_compile_definitions(${name}
    PRIVATE
-      SCYLLA_VERSION=\"${Scylla_VERSION}\"
-      SCYLLA_RELEASE=\"${Scylla_RELEASE}\")
+      SCYLLA_VERSION=\"${scylla_version}\"
+      SCYLLA_RELEASE=\"${scylla_release}\")
  target_link_libraries(${name}
    PRIVATE
      Seastar::seastar)
--- a/cmake/add_whole_archive.cmake
+++ b/cmake/add_whole_archive.cmake
@@ -5,6 +5,15 @@
 # actually compiling a sample program.
 function(add_whole_archive name library)
   add_library(${name} INTERFACE)
-  target_link_libraries(${name} INTERFACE
-    "$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+    target_link_libraries(${name} INTERFACE
+      "$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
+  else()
+    add_dependencies(${name} ${library})
+    target_include_directories(${name} INTERFACE
+      ${CMAKE_SOURCE_DIR})
+    target_link_options(${name} INTERFACE  # attach to the wrapper being defined, not a fixed target
+      "$<$<CXX_COMPILER_ID:Clang>:SHELL:LINKER:-force_load $<TARGET_LINKER_FILE:${library}>>"
+      "$<$<CXX_COMPILER_ID:GNU>:SHELL:LINKER:--whole-archive $<TARGET_LINKER_FILE:${library}> LINKER:--no-whole-archive>")
+  endif()
 endfunction()
--- a/cmake/build_submodule.cmake
+++ b/cmake/build_submodule.cmake
@@ -1,50 +0,0 @@
-function(build_submodule name dir)
-  cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
-  set(version_release "${Scylla_VERSION}-${Scylla_RELEASE}")
-  set(product_version_release
-    "${Scylla_PRODUCT}-${Scylla_VERSION}-${Scylla_RELEASE}")
-  set(working_dir ${CMAKE_CURRENT_SOURCE_DIR}/${dir})
-  if(parsed_args_NOARCH)
-    set(arch "noarch")
-  else()
-    set(arch "${CMAKE_SYSTEM_PROCESSOR}")
-  endif()
-  set(reloc_args ${parsed_args_UNPARSED_ARGUMENTS})
-  set(reloc_pkg "${working_dir}/build/${Scylla_PRODUCT}-${name}-${version_release}.${arch}.tar.gz")
-  add_custom_command(
-    OUTPUT ${reloc_pkg}
-    COMMAND reloc/build_reloc.sh --version ${product_version_release} --nodeps ${reloc_args}
-    WORKING_DIRECTORY "${working_dir}"
-    JOB_POOL submodule_pool)
-  add_custom_target(dist-${name}-tar
-    DEPENDS ${reloc_pkg})
-  add_custom_target(dist-${name}-rpm
-    COMMAND reloc/build_rpm.sh --reloc-pkg ${reloc_pkg}
-    DEPENDS ${reloc_pkg}
-    WORKING_DIRECTORY "${working_dir}")
-  add_custom_target(dist-${name}-deb
-    COMMAND reloc/build_deb.sh --reloc-pkg ${reloc_pkg}
-    DEPENDS ${reloc_pkg}
-    WORKING_DIRECTORY "${working_dir}")
-  add_custom_target(dist-${name}
-    DEPENDS dist-${name}-tar dist-${name}-rpm dist-${name}-deb)
-endfunction()
-
-macro(dist_submodule name dir pkgs)
-  # defined as a macro, so that we can append the path to the dist tarball to
-  # specfied "pkgs"
-  cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
-  if(parsed_args_NOARCH)
-    set(arch "noarch")
-  else()
-    set(arch "${CMAKE_SYSTEM_PROCESSOR}")
-  endif()
-  set(pkg_name "${Scylla_PRODUCT}-${name}-${Scylla_VERSION}-${Scylla_RELEASE}.${arch}.tar.gz")
-  set(reloc_pkg "${CMAKE_SOURCE_DIR}/tools/${dir}/build/${pkg_name}")
-  set(dist_pkg "${CMAKE_CURRENT_BINARY_DIR}/${pkg_name}")
-  add_custom_command(
-    OUTPUT ${dist_pkg}
-    COMMAND ${CMAKE_COMMAND} -E copy ${reloc_pkg} ${dist_pkg}
-    DEPENDS dist-${name}-tar)
-  list(APPEND ${pkgs} "${dist_pkg}")
-endmacro()
--- a/cmake/generate_cql_grammar.cmake
+++ b/cmake/generate_cql_grammar.cmake
@@ -1,5 +1,7 @@
-find_program (ANTLR3 antlr3
-  REQUIRED)
+find_program (ANTLR3 antlr3)
+if(NOT ANTLR3)
+  message(FATAL_ERROR "antlr3 is required")  # FATAL_ERROR is the mode that aborts configuration
+endif()

 # Parse antlr3 grammar files and generate C++ sources
 function(generate_cql_grammar)
--- a/cmake/mode.COVERAGE.cmake
+++ b/cmake/mode.COVERAGE.cmake
@@ -1,23 +0,0 @@
-set(Seastar_OptimizationLevel_COVERAGE "g")
-set(CMAKE_CXX_FLAGS_COVERAGE
-  ""
-  CACHE
-  INTERNAL
-  "")
-string(APPEND CMAKE_CXX_FLAGS_COVERAGE
-  " -O${Seastar_OptimizationLevel_SANITIZE}")
-
-set(Seastar_DEFINITIONS_COVERAGE
-  SCYLLA_BUILD_MODE=debug
-  DEBUG
-  SANITIZE
-  DEBUG_LSA_SANITIZER
-  SCYLLA_ENABLE_ERROR_INJECTION)
-
-set(CMAKE_CXX_FLAGS_COVERAGE
-  " -O${Seastar_OptimizationLevel_COVERAGE} -fprofile-instr-generate -fcoverage-mapping -g -gz")
-
-set(CMAKE_STATIC_LINKER_FLAGS_COVERAGE
-  "-fprofile-instr-generate -fcoverage-mapping")
-
-set(stack_usage_threshold_in_KB 40)
--- a/cmake/mode.RELEASE.cmake
+++ b/cmake/mode.RELEASE.cmake
@@ -12,15 +12,22 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
 else()
   set(clang_inline_threshold 2500)
 endif()
-add_compile_options(
-  "$<$<CXX_COMPILER_ID:GNU>:--param;inline-unit-growth=300>"
-  "$<$<CXX_COMPILER_ID:Clang>:-mllvm;-inline-threshold=${clang_inline_threshold}>"
+# CMAKE_CXX_FLAGS_<CONFIG> is handed to the compiler verbatim (generator
+# expressions are not evaluated in it), so choose the per-compiler inlining
+# flags at configure time instead.
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  string(APPEND CMAKE_CXX_FLAGS_RELEASE " --param inline-unit-growth=300")
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+  string(APPEND CMAKE_CXX_FLAGS_RELEASE
+    " -mllvm -inline-threshold=${clang_inline_threshold}")
+endif()
+string(APPEND CMAKE_CXX_FLAGS_RELEASE
   # clang generates 16-byte loads that break store-to-load forwarding
   # gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
-  "-fno-slp-vectorize")
+  " -fno-slp-vectorize")
 set(Seastar_DEFINITIONS_RELEASE
   SCYLLA_BUILD_MODE=release)
 
 add_link_options("LINKER:--gc-sections")

 set(stack_usage_threshold_in_KB 13)
--- a/cmake/mode.SANITIZE.cmake
+++ b/cmake/mode.SANITIZE.cmake
@@ -1,17 +0,0 @@
-set(Seastar_OptimizationLevel_SANITIZE "s")
-set(CMAKE_CXX_FLAGS_SANITIZE
-  ""
-  CACHE
-  INTERNAL
-  "")
-string(APPEND CMAKE_CXX_FLAGS_SANITIZE
-  " -O${Seastar_OptimizationLevel_SANITIZE}")
-
-set(Seastar_DEFINITIONS_SANITIZE
-  SCYLLA_BUILD_MODE=sanitize
-  DEBUG
-  SANITIZE
-  DEBUG_LSA_SANITIZER
-  SCYLLA_ENABLE_ERROR_INJECTION)
-
-set(stack_usage_threshold_in_KB 50)
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -11,117 +11,31 @@ foreach(warning ${disabled_warnings})
  endif()
 endforeach()
 list(TRANSFORM _supported_warnings PREPEND "-Wno-")
-add_compile_options(
+string(JOIN " " CMAKE_CXX_FLAGS
  "-Wall"
  "-Werror"
  "-Wno-error=deprecated-declarations"
-  "-Wimplicit-fallthrough"
  ${_supported_warnings})

 function(default_target_arch arch)
   set(x86_instruction_sets i386 i686 x86_64)
   if(CMAKE_SYSTEM_PROCESSOR IN_LIST x86_instruction_sets)
     set(${arch} "westmere" PARENT_SCOPE)
-  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-    # we always use intrinsics like vmull.p64 for speeding up crc32 calculations
-    # on the aarch64 architectures, and they require the crypto extension, so
-    # we have to add "+crypto" in the architecture flags passed to -march. the
-    # same applies to crc32 instructions, which need the ARMv8-A CRC32 extension
-    # please note, Seastar also sets -march when compiled with DPDK enabled.
+  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
     set(${arch} "armv8-a+crc+crypto" PARENT_SCOPE)
   else()
     set(${arch} "" PARENT_SCOPE)
   endif()
 endfunction()

-function(pad_at_begin output fill str length)
-  # pad the given `${str} with `${fill}`, right aligned. with the syntax of
-  # fmtlib:
-  #   fmt::print("{:#>{}}", str, length)
-  # where `#` is the `${fill}` char
-  string(LENGTH "${str}" str_len)
-  math(EXPR padding_len "${length} - ${str_len}")
-  if(padding_len GREATER 0)
-    string(REPEAT ${fill} ${padding_len} padding)
-  endif()
-  set(${output} "${padding}${str}" PARENT_SCOPE)
-endfunction()
-
-# The relocatable package includes its own dynamic linker. We don't
-# know the path it will be installed to, so for now use a very long
-# path so that patchelf doesn't need to edit the program headers.  The
-# kernel imposes a limit of 4096 bytes including the null. The other
-# constraint is that the build-id has to be in the first page, so we
-# can't use all 4096 bytes for the dynamic linker.
-# In here we just guess that 2000 extra / should be enough to cover
-# any path we get installed to but not so large that the build-id is
-# pushed to the second page.
-# At the end of the build we check that the build-id is indeed in the
-# first page. At install time we check that patchelf doesn't modify
-# the program headers.
-function(get_padded_dynamic_linker_option output length)
-  set(dynamic_linker_option "-dynamic-linker")
-  # capture the drive-generated command line first
-  execute_process(
-    COMMAND ${CMAKE_C_COMPILER} "-###" /dev/null -o t
-    ERROR_VARIABLE driver_command_line
-    ERROR_STRIP_TRAILING_WHITESPACE)
-  # extract the argument for the "-dynamic-linker" option
-  if(driver_command_line MATCHES ".*\"?${dynamic_linker_option}\"? \"?([^ \"]*)\"? .*")
-    set(dynamic_linker ${CMAKE_MATCH_1})
-  else()
-    message(FATAL_ERROR "Unable to find ${dynamic_linker_option} in driver-generated command: "
-      "${driver_command_line}")
-  endif()
-  # prefixing a path with "/"s does not actually change it means
-  pad_at_begin(padded_dynamic_linker "/" "${dynamic_linker}" ${length})
-  set(${output} "${dynamic_linker_option}=${padded_dynamic_linker}" PARENT_SCOPE)
-endfunction()
-
-add_compile_options("-ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
-
 default_target_arch(target_arch)
 if(target_arch)
-  add_compile_options("-march=${target_arch}")
+    string(APPEND CMAKE_CXX_FLAGS " -march=${target_arch}")
 endif()

 math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
 set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")
 check_cxx_compiler_flag(${_stack_usage_threshold_flag} _stack_usage_flag_supported)
 if(_stack_usage_flag_supported)
-  add_compile_options("${_stack_usage_threshold_flag}")
+  string(APPEND CMAKE_CXX_FLAGS " ${_stack_usage_threshold_flag}")
 endif()
-
-# Force SHA1 build-id generation
-add_link_options("LINKER:--build-id=sha1")
-include(CheckLinkerFlag)
-set(Scylla_USE_LINKER
-    ""
-    CACHE
-    STRING
-    "Use specified linker instead of the default one")
-if(Scylla_USE_LINKER)
-    set(linkers "${Scylla_USE_LINKER}")
-else()
-    set(linkers "lld" "gold")
-endif()
-
-foreach(linker ${linkers})
-    set(linker_flag "-fuse-ld=${linker}")
-    check_linker_flag(CXX ${linker_flag} "CXX_LINKER_HAVE_${linker}")
-    if(CXX_LINKER_HAVE_${linker})
-        add_link_options("${linker_flag}")
-        break()
-    elseif(Scylla_USE_LINKER)
-        message(FATAL_ERROR "${Scylla_USE_LINKER} is not supported.")
-    endif()
-endforeach()
-
-if(DEFINED ENV{NIX_CC})
-  get_padded_dynamic_linker_option(dynamic_linker_option 0)
-else()
-  # gdb has a SO_NAME_MAX_PATH_SIZE of 512, so limit the path size to
-  # that. The 512 includes the null at the end, hence the 511 bellow.
-  get_padded_dynamic_linker_option(dynamic_linker_option 511)
-endif()
-add_link_options("${dynamic_linker_option}")
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -29,27 +29,32 @@
 #include <seastar/core/shared_ptr.hh>

 #include "dht/i_partitioner.hh"
-#include "sstables/exceptions.hh"
 #include "sstables/sstables.hh"
 #include "sstables/sstable_writer.hh"
 #include "sstables/progress_monitor.hh"
 #include "sstables/sstables_manager.hh"
 #include "compaction.hh"
+#include "compaction_manager.hh"
 #include "schema/schema.hh"
 #include "db/system_keyspace.hh"
+#include "service/priority_manager.hh"
 #include "db_clock.hh"
 #include "mutation/mutation_compactor.hh"
 #include "leveled_manifest.hh"
+#include "dht/token.hh"
 #include "dht/partition_filter.hh"
 #include "mutation_writer/shard_based_splitting_writer.hh"
 #include "mutation_writer/partition_based_splitting_writer.hh"
 #include "mutation/mutation_source_metadata.hh"
 #include "mutation/mutation_fragment_stream_validator.hh"
+#include "utils/UUID_gen.hh"
+#include "utils/utf8.hh"
+#include "utils/fmt-compat.hh"
 #include "utils/error_injection.hh"
-#include "readers/multi_range.hh"
+#include "readers/filtering.hh"
 #include "readers/compacting.hh"
 #include "tombstone_gc.hh"
-#include "replica/database.hh"
+#include "keys.hh"

 namespace sstables {

@@ -143,6 +148,25 @@ std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::quara
    return os << to_string(quarantine_mode);
 }

+std::ostream& operator<<(std::ostream& os, pretty_printed_data_size data) {
+    static constexpr const char* suffixes[] = { " bytes", "kB", "MB", "GB", "TB", "PB" };
+
+    unsigned exp = 0;
+    while ((data._size >= 1000) && (exp < (sizeof(suffixes) / sizeof(suffixes[0])) - 1)) { // stop at the last suffix
+        exp++;
+        data._size /= 1000;
+    }
+
+    os << data._size << suffixes[exp];
+    return os;
+}
+
+std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
+    uint64_t throughput = tp._duration.count() > 0 ? tp._size / tp._duration.count() : 0;
+    os << pretty_printed_data_size(throughput) << "/s";
+    return os;
+}
+
 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
    if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
@@ -325,21 +349,16 @@ public:
    void consume_end_of_stream();
 };

-using use_backlog_tracker = bool_class<class use_backlog_tracker_tag>;
-
 struct compaction_read_monitor_generator final : public read_monitor_generator {
    class compaction_read_monitor final : public  sstables::read_monitor, public backlog_read_progress_manager {
        sstables::shared_sstable _sst;
        table_state& _table_s;
        const sstables::reader_position_tracker* _tracker = nullptr;
        uint64_t _last_position_seen = 0;
-        use_backlog_tracker _use_backlog_tracker;
    public:
        virtual void on_read_started(const sstables::reader_position_tracker& tracker) override {
            _tracker = &tracker;
-            if (_use_backlog_tracker) {
-                _table_s.get_backlog_tracker().register_compacting_sstable(_sst, *this);
-            }
+            _table_s.get_backlog_tracker().register_compacting_sstable(_sst, *this);
        }

        virtual void on_read_completed() override {
@@ -357,19 +376,19 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
        }

        void remove_sstable() {
-            if (_sst && _use_backlog_tracker) {
+            if (_sst) {
                _table_s.get_backlog_tracker().revert_charges(_sst);
            }
            _sst = {};
        }

-        compaction_read_monitor(sstables::shared_sstable sst, table_state& table_s, use_backlog_tracker use_backlog_tracker)
-            : _sst(std::move(sst)), _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) { }
+        compaction_read_monitor(sstables::shared_sstable sst, table_state& table_s)
+            : _sst(std::move(sst)), _table_s(table_s) { }

        ~compaction_read_monitor() {
            // We failed to finish handling this SSTable, so we have to update the backlog_tracker
            // about it.
-            if (_sst && _use_backlog_tracker) {
+            if (_sst) {
                _table_s.get_backlog_tracker().revert_charges(_sst);
            }
        }
@@ -378,16 +397,12 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
    };

    virtual sstables::read_monitor& operator()(sstables::shared_sstable sst) override {
-        auto p = _generated_monitors.emplace(sst->generation(), compaction_read_monitor(sst, _table_s, _use_backlog_tracker));
+        auto p = _generated_monitors.emplace(sst->generation(), compaction_read_monitor(sst, _table_s));
        return p.first->second;
    }

-    explicit compaction_read_monitor_generator(table_state& table_s, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
-        : _table_s(table_s), _use_backlog_tracker(use_backlog_tracker) {}
-
-    uint64_t compacted() const {
-        return boost::accumulate(_generated_monitors | boost::adaptors::map_values | boost::adaptors::transformed([](auto& monitor) { return monitor.compacted(); }), uint64_t(0));
-    }
+    explicit compaction_read_monitor_generator(table_state& table_s)
+        : _table_s(table_s) {}

    void remove_exhausted_sstables(const std::vector<sstables::shared_sstable>& exhausted_sstables) {
        for (auto& sst : exhausted_sstables) {
@@ -400,29 +415,8 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
 private:
    table_state& _table_s;
    std::unordered_map<generation_type, compaction_read_monitor> _generated_monitors;
-    use_backlog_tracker _use_backlog_tracker;
-
-    friend class compaction_progress_monitor;
 };

-void compaction_progress_monitor::set_generator(std::unique_ptr<read_monitor_generator> generator) {
-    _generator = std::move(generator);
-}
-
-void compaction_progress_monitor::reset_generator() {
-    if (_generator) {
-        _progress = dynamic_cast<compaction_read_monitor_generator&>(*_generator).compacted();
-    }
-    _generator = nullptr;
-}
-
-uint64_t compaction_progress_monitor::get_progress() const {
-    if (_generator) {
-        return dynamic_cast<compaction_read_monitor_generator&>(*_generator).compacted();
-    }
-    return _progress;
-}
-
 class formatted_sstables_list {
    bool _include_origin = true;
    std::vector<std::string> _ssts;
@@ -453,9 +447,9 @@ class compaction {
 protected:
    compaction_data& _cdata;
    table_state& _table_s;
-    const compaction_sstable_creator_fn _sstable_creator;
-    const schema_ptr _schema;
-    const reader_permit _permit;
+    compaction_sstable_creator_fn _sstable_creator;
+    schema_ptr _schema;
+    reader_permit _permit;
    std::vector<shared_sstable> _sstables;
    std::vector<generation_type> _input_sstable_generations;
    // Unused sstables are tracked because if compaction is interrupted we can only delete them.
@@ -464,47 +458,41 @@ protected:
    std::vector<shared_sstable> _new_unused_sstables;
    std::vector<shared_sstable> _all_new_sstables;
    lw_shared_ptr<sstable_set> _compacting;
-    const sstables::compaction_type _type;
-    const uint64_t _max_sstable_size;
-    const uint32_t _sstable_level;
+    sstables::compaction_type _type;
+    uint64_t _max_sstable_size;
+    uint32_t _sstable_level;
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
-    // fully expired files, which are skipped, aren't taken into account.
-    uint64_t _compacting_data_file_size = 0;
    uint64_t _estimated_partitions = 0;
    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
-    const bool _can_split_large_partition = false;
+    bool _can_split_large_partition = false;
    bool _contains_multi_fragment_runs = false;
    mutation_source_metadata _ms_metadata = {};
-    const compaction_sstable_replacer_fn _replacer;
-    const run_id _run_identifier;
+    compaction_sstable_replacer_fn _replacer;
+    run_id _run_identifier;
+    ::io_priority_class _io_priority;
    // optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
    std::optional<sstable_set> _sstable_set;
    // used to incrementally calculate max purgeable timestamp, as we iterate through decorated keys.
    std::optional<sstable_set::incremental_selector> _selector;
    std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
    // optional owned_ranges vector for cleanup;
-    const owned_ranges_ptr _owned_ranges = {};
-    // required for reshard compaction.
-    const dht::sharder* _sharder = nullptr;
-    const std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
+    owned_ranges_ptr _owned_ranges = {};
+    std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
    // Garbage collected sstables that are sealed but were not added to SSTable set yet.
    std::vector<shared_sstable> _unused_garbage_collected_sstables;
    // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
    std::vector<shared_sstable> _used_garbage_collected_sstables;
    utils::observable<> _stop_request_observable;
 private:
-    // Keeps track of monitors for input sstable.
-    // If _update_backlog_tracker is set to true, monitors are responsible for adjusting backlog as compaction progresses.
-    compaction_progress_monitor& _progress_monitor;
    compaction_data& init_compaction_data(compaction_data& cdata, const compaction_descriptor& descriptor) const {
        cdata.compaction_fan_in = descriptor.fan_in();
        return cdata;
    }
 protected:
-    compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
+    compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
        : _cdata(init_compaction_data(cdata, descriptor))
        , _table_s(table_s)
        , _sstable_creator(std::move(descriptor.creator))
@@ -517,13 +505,12 @@ protected:
        , _can_split_large_partition(descriptor.can_split_large_partition)
        , _replacer(std::move(descriptor.replacer))
        , _run_identifier(descriptor.run_identifier)
+        , _io_priority(descriptor.io_priority)
        , _sstable_set(std::move(descriptor.all_sstables_snapshot))
        , _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstable_set::incremental_selector>{})
        , _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
        , _owned_ranges(std::move(descriptor.owned_ranges))
-        , _sharder(descriptor.sharder)
        , _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
-        , _progress_monitor(progress_monitor)
    {
        for (auto& sst : _sstables) {
            _stats_collector.update(sst->get_encoding_stats_for_compaction());
@@ -532,20 +519,12 @@ protected:
        _contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (shared_sstable& sst) {
            return !ssts_run_ids.insert(sst->run_identifier()).second;
        });
-        _progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(_table_s, use_backlog_tracker));
-    }
-
-    read_monitor_generator& unwrap_monitor_generator() const {
-        if (_progress_monitor._generator) {
-            return *_progress_monitor._generator;
-        }
-        return default_read_monitor_generator();
    }

    virtual uint64_t partitions_per_sstable() const {
        // some tests use _max_sstable_size == 0 for force many one partition per sstable
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
-        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_compacting_data_file_size) / max_sstable_size)));
+        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
    }
@@ -602,14 +581,15 @@ protected:
        return bool(_sstable_set) && _table_s.tombstone_gc_enabled();
    }

-    compaction_writer create_gc_compaction_writer(run_id gc_run) const {
+    compaction_writer create_gc_compaction_writer() const {
        auto sst = _sstable_creator(this_shard_id());

+        auto&& priority = _io_priority;
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
-        cfg.run_identifier = gc_run;
+        cfg.run_identifier = _run_identifier;
        cfg.monitor = monitor.get();
-        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats());
+        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }

@@ -628,14 +608,8 @@ protected:
    // When compaction finishes, all the temporary sstables generated here will be deleted and removed
    // from table's sstable set.
    compacted_fragments_writer get_gc_compacted_fragments_writer() {
-        // because the temporary sstable run can overlap with the non-gc sstables run created by
-        // get_compacted_fragments_writer(), we have to use a different run_id. the gc_run_id is
-        // created here as:
-        // 1. it can be shared across all sstables created by this writer
-        // 2. it is optional, as gc writer is not always used
-        auto gc_run = run_id::create_random_id();
        return compacted_fragments_writer(*this,
-             [this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
+             [this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
             [this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
             _stop_request_observable);
    }
@@ -652,8 +626,18 @@ protected:
        return _used_garbage_collected_sstables;
    }

-    virtual bool enable_garbage_collected_sstable_writer() const noexcept {
-        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
+    bool enable_garbage_collected_sstable_writer() const noexcept {
+        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
+    }
+
+    flat_mutation_reader_v2::filter make_partition_filter() const {
+        return [this] (const dht::decorated_key& dk) {
+            if (!_owned_ranges_checker->belongs_to_current_node(dk.token())) {
+                log_trace("Token {} does not belong to this node, skipping", dk.token());
+                return false;
+            }
+            return true;
+        };
    }
 public:
    compaction& operator=(const compaction&) = delete;
@@ -663,59 +647,20 @@ public:
    compaction& operator=(compaction&& other) = delete;

    virtual ~compaction() {
-        _progress_monitor.reset_generator();
    }
 private:
    // Default range sstable reader that will only return mutations that belong to the current shard.
-    virtual flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                        reader_permit permit,
-                                                        const dht::partition_range& range,
-                                                        const query::partition_slice& slice,
-                                                        tracing::trace_state_ptr,
-                                                        streamed_mutation::forwarding fwd,
-                                                        mutation_reader::forwarding) const = 0;
+    virtual flat_mutation_reader_v2 make_sstable_reader() const = 0;

+    // Make a filtering reader if needed
+    // FIXME: the sstable reader itself should be passed the owned ranges
+    // so it can skip over the disowned ranges efficiently using the index.
+    // Ref https://github.com/scylladb/scylladb/issues/12998
    flat_mutation_reader_v2 setup_sstable_reader() const {
        if (!_owned_ranges_checker) {
-            return make_sstable_reader(_schema,
-                                       _permit,
-                                       query::full_partition_range,
-                                       _schema->full_slice(),
-                                       tracing::trace_state_ptr(),
-                                       ::streamed_mutation::forwarding::no,
-                                       ::mutation_reader::forwarding::no);
+            return make_sstable_reader();
        }
-
-        auto source = mutation_source([this] (schema_ptr s,
-                reader_permit permit,
-                const dht::partition_range& range,
-                const query::partition_slice& slice,
-                tracing::trace_state_ptr trace_state,
-                streamed_mutation::forwarding fwd,
-                mutation_reader::forwarding fwd_mr) {
-            log_trace("Creating sstable set reader with range {}", range);
-            return make_sstable_reader(std::move(s),
-                                       std::move(permit),
-                                       range,
-                                       slice,
-                                       std::move(trace_state),
-                                       fwd,
-                                       fwd_mr);
-        });
-
-        auto owned_range_generator = [this] () -> std::optional<dht::partition_range> {
-            auto r = _owned_ranges_checker->next_owned_range();
-            if (r == nullptr) {
-                return std::nullopt;
-            }
-            log_trace("Skipping to the next owned range {}", *r);
-            return dht::to_partition_range(*r);
-        };
-
-        return make_flat_multi_range_reader(_schema, _permit, std::move(source),
-                                            std::move(owned_range_generator),
-                                            _schema->full_slice(),
-                                            tracing::trace_state_ptr());
+        return make_filtering_reader(make_sstable_reader(), make_partition_filter());
    }

    virtual sstables::sstable_set make_sstable_set_for_input() const {
@@ -749,14 +694,12 @@ private:
                continue;
            }

-            _cdata.compaction_size += sst->data_size();
            // We also capture the sstable, so we keep it alive while the read isn't done
            ssts->insert(sst);
            // FIXME: If the sstables have cardinality estimation bitmaps, use that
            // for a better estimate for the number of partitions in the merged
            // sstable than just adding up the lengths of individual sstables.
            _estimated_partitions += sst->get_estimated_key_count();
-            _compacting_data_file_size += sst->ondisk_data_size();
            // TODO:
            // Note that this is not fully correct. Since we might be merging sstables that originated on
            // another shard (#cpu changed), we might be comparing RP:s with differing shard ids,
@@ -785,7 +728,7 @@ private:
        auto consumer = make_interposer_consumer([this] (flat_mutation_reader_v2 reader) mutable {
            return seastar::async([this, reader = std::move(reader)] () mutable {
                auto close_reader = deferred_close(reader);
-                auto cfc = get_compacted_fragments_writer();
+                auto cfc = compacted_fragments_writer(get_compacted_fragments_writer());
                reader.consume_in_thread(std::move(cfc));
            });
        });
@@ -863,8 +806,8 @@ protected:
        // By the time being, using estimated key count.
        log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(),
-                _input_sstable_generations.size(), new_sstables_msg, utils::pretty_printed_data_size(_start_size), utils::pretty_printed_data_size(_end_size), int(ratio * 100),
-                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), utils::pretty_printed_throughput(_start_size, duration),
+                _input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
+                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
                _cdata.total_partitions, _cdata.total_keys_written);

        return ret;
@@ -1001,7 +944,7 @@ void compacted_fragments_writer::split_large_partition() {
        _c.log_debug("Closing active tombstone {} with {} for partition {}", _current_partition.current_emitted_tombstone, rtc, *_current_partition.dk);
        _compaction_writer->writer.consume(std::move(rtc));
    }
-    _c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, utils::pretty_printed_data_size(_c._max_sstable_size));
+    _c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, pretty_printed_data_size(_c._max_sstable_size));
    // Close partition in current writer, and open it again in a new writer.
    do_consume_end_of_partition();
    stop_current_writer();
@@ -1085,29 +1028,72 @@ void compacted_fragments_writer::consume_end_of_stream() {
    }
 }

+class reshape_compaction : public compaction {
+public:
+    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+        : compaction(table_s, std::move(descriptor), cdata) {
+    }
+
+    virtual sstables::sstable_set make_sstable_set_for_input() const override {
+        return sstables::make_partitioned_sstable_set(_schema, false);
+    }
+
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
+                default_read_monitor_generator());
+    }
+
+    std::string_view report_start_desc() const override {
+        return "Reshaping";
+    }
+
+    std::string_view report_finish_desc() const override {
+        return "Reshaped";
+    }
+
+    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
+        auto sst = _sstable_creator(this_shard_id());
+        setup_new_sstable(sst);
+
+        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
+    }
+
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (writer) {
+            finish_new_sstable(writer);
+        }
+    }
+};
+
 class regular_compaction : public compaction {
+    // keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
+    mutable compaction_read_monitor_generator _monitor_generator;
    seastar::semaphore _replacer_lock = {1};
 public:
-    regular_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker = use_backlog_tracker::yes)
-        : compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker)
+    regular_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+        : compaction(table_s, std::move(descriptor), cdata)
+        , _monitor_generator(_table_s)
    {
    }

-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_local_shard_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
-                std::move(trace),
-                sm_fwd,
-                mr_fwd,
-                unwrap_monitor_generator());
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
+                _monitor_generator);
    }

    std::string_view report_start_desc() const override {
@@ -1125,7 +1111,7 @@ public:
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = make_sstable_writer_config(_type);
        cfg.monitor = monitor.get();
-        return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
+        return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
    }

    virtual void stop_sstable_writer(compaction_writer* writer) override {
@@ -1178,7 +1164,7 @@ private:
            log_debug("Replacing earlier exhausted sstable(s) {} by new sstable(s) {}", formatted_sstables_list(exhausted_ssts, false), formatted_sstables_list(_new_unused_sstables, true));
            _replacer(get_compaction_completion_desc(exhausted_ssts, std::move(_new_unused_sstables)));
            _sstables.erase(exhausted, _sstables.end());
-            dynamic_cast<compaction_read_monitor_generator&>(unwrap_monitor_generator()).remove_exhausted_sstables(exhausted_ssts);
+            _monitor_generator.remove_exhausted_sstables(exhausted_ssts);
        }
    }

@@ -1196,13 +1182,12 @@ private:
    }

    void update_pending_ranges() {
-        auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
-        if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
+        if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
            return;
        }
        // Releases reference to sstables compacted by this compaction or another, both of which belongs
        // to the same column family
-        for (auto& pending_replacement : pending_replacements) {
+        for (auto& pending_replacement : _cdata.pending_replacements) {
            for (auto& sst : pending_replacement.removed) {
                // Set may not contain sstable to be removed because this compaction may have started
                // before the creation of that sstable.
@@ -1216,75 +1201,7 @@ private:
            }
        }
        _selector.emplace(_sstable_set->make_incremental_selector());
-    }
-};
-
-class reshape_compaction : public regular_compaction {
-private:
-    bool has_sstable_replacer() const noexcept {
-        return bool(_replacer);
-    }
-public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
-        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no) {
-    }
-
-    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
-    }
-
-    // Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
-    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
-        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
-    }
-
-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_local_shard_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
-                std::move(trace),
-                sm_fwd,
-                mr_fwd,
-                unwrap_monitor_generator());
-    }
-
-    std::string_view report_start_desc() const override {
-        return "Reshaping";
-    }
-
-    std::string_view report_finish_desc() const override {
-        return "Reshaped";
-    }
-
-    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto sst = _sstable_creator(this_shard_id());
-        setup_new_sstable(sst);
-
-        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
-    }
-
-    virtual void stop_sstable_writer(compaction_writer* writer) override {
-        if (writer) {
-            if (has_sstable_replacer()) {
-                regular_compaction::stop_sstable_writer(writer);
-            } else {
-                finish_new_sstable(writer);
-            }
-        }
-    }
-
-    virtual void on_end_of_compaction() override {
-        if (has_sstable_replacer()) {
-            regular_compaction::on_end_of_compaction();
-        }
+        _cdata.pending_replacements.clear();
    }
 };

@@ -1312,8 +1229,8 @@ protected:
    }

 public:
-    cleanup_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
-        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor)
+    cleanup_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+        : regular_compaction(table_s, std::move(descriptor), cdata)
    {
    }

@@ -1542,8 +1459,8 @@ private:
    mutable uint64_t _validation_errors = 0;

 public:
-    scrub_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
-        : regular_compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
+    scrub_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options)
+        : regular_compaction(table_s, std::move(descriptor), cdata)
        , _options(options)
        , _scrub_start_description(fmt::format("Scrubbing in {} mode", _options.operation_mode))
        , _scrub_finish_description(fmt::format("Finished scrubbing in {} mode", _options.operation_mode)) {
@@ -1560,17 +1477,8 @@ public:
        return _scrub_finish_description;
    }

-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        if (!range.is_full()) {
-            on_internal_error(clogger, fmt::format("Scrub compaction in mode {} expected full partition range, but got {} instead", _options.operation_mode, range));
-        }
-        auto crawling_reader = _compacting->make_crawling_reader(std::move(s), std::move(permit), nullptr, unwrap_monitor_generator());
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        auto crawling_reader = _compacting->make_crawling_reader(_schema, _permit, _io_priority, nullptr);
        return make_flat_mutation_reader_v2<reader>(std::move(crawling_reader), _options.operation_mode, _validation_errors);
    }

@@ -1589,7 +1497,7 @@ public:
            return end_consumer;
        }
        return [this, end_consumer = std::move(end_consumer)] (flat_mutation_reader_v2 reader) mutable -> future<> {
-            auto cfg = mutation_writer::segregate_config{memory::stats().total_memory() / 10};
+            auto cfg = mutation_writer::segregate_config{_io_priority, memory::stats().total_memory() / 10};
            return mutation_writer::segregate_by_partition(std::move(reader), cfg,
                    [consumer = std::move(end_consumer), this] (flat_mutation_reader_v2 rd) {
                ++_bucket_count;
@@ -1639,8 +1547,8 @@ private:
                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
    }
 public:
-    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
-        : compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
+    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
+        : compaction(table_s, std::move(descriptor), cdata)
        , _estimation_per_shard(smp::count)
        , _run_identifiers(smp::count)
    {
@@ -1661,21 +1569,15 @@ public:
    ~resharding_compaction() { }

    // Use a reader that makes sure non-local mutations are not filtered out.
-    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
-                                                reader_permit permit,
-                                                const dht::partition_range& range,
-                                                const query::partition_slice& slice,
-                                                tracing::trace_state_ptr trace,
-                                                streamed_mutation::forwarding sm_fwd,
-                                                mutation_reader::forwarding mr_fwd) const override {
-        return _compacting->make_range_sstable_reader(std::move(s),
-                std::move(permit),
-                range,
-                slice,
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_range_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
                nullptr,
-                sm_fwd,
-                mr_fwd,
-                unwrap_monitor_generator());
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no);

    }

@@ -1698,14 +1600,14 @@ public:
    }

    compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto shard = _sharder->shard_of(dk.token());
+        auto shard = dht::shard_of(*_schema, dk.token());
        auto sst = _sstable_creator(shard);
        setup_new_sstable(sst);

        auto cfg = make_sstable_writer_config(compaction_type::Reshard);
        // sstables generated for a given shard will share the same run identifier.
        cfg.run_identifier = _run_identifiers.at(shard);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), shard), sst};
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), _io_priority, shard), sst};
    }

    void stop_sstable_writer(compaction_writer* writer) override {
@@ -1747,49 +1649,47 @@ compaction_type compaction_type_options::type() const {
    return index_to_type[_options.index()];
 }

-static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
+static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata) {
    struct {
        table_state& table_s;
        sstables::compaction_descriptor&& descriptor;
        compaction_data& cdata;
-        compaction_progress_monitor& progress_monitor;

        std::unique_ptr<compaction> operator()(compaction_type_options::reshape) {
-            return std::make_unique<reshape_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+            return std::make_unique<reshape_compaction>(table_s, std::move(descriptor), cdata);
        }
        std::unique_ptr<compaction> operator()(compaction_type_options::reshard) {
-            return std::make_unique<resharding_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+            return std::make_unique<resharding_compaction>(table_s, std::move(descriptor), cdata);
        }
        std::unique_ptr<compaction> operator()(compaction_type_options::regular) {
-            return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+            return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata);
        }
        std::unique_ptr<compaction> operator()(compaction_type_options::cleanup) {
-            return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+            return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata);
        }
        std::unique_ptr<compaction> operator()(compaction_type_options::upgrade) {
-            return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+            return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata);
        }
        std::unique_ptr<compaction> operator()(compaction_type_options::scrub scrub_options) {
-            return std::make_unique<scrub_compaction>(table_s, std::move(descriptor), cdata, scrub_options, progress_monitor);
+            return std::make_unique<scrub_compaction>(table_s, std::move(descriptor), cdata, scrub_options);
        }
-    } visitor_factory{table_s, std::move(descriptor), cdata, progress_monitor};
+    } visitor_factory{table_s, std::move(descriptor), cdata};

    return descriptor.options.visit(visitor_factory);
 }

-static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, read_monitor_generator& monitor_generator) {
+static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s) {
    auto schema = table_s.schema();
    auto permit = table_s.make_compaction_reader_permit();

    uint64_t validation_errors = 0;
-    cdata.compaction_size = boost::accumulate(descriptor.sstables | boost::adaptors::transformed([] (auto& sst) { return sst->data_size(); }), int64_t(0));

    for (const auto& sst : descriptor.sstables) {
        clogger.info("Scrubbing in validate mode {}", sst->get_filename());

-        validation_errors += co_await sst->validate(permit, cdata.abort, [&schema] (sstring what) {
+        validation_errors += co_await sst->validate(permit, descriptor.io_priority, cdata.abort, [&schema] (sstring what) {
            scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
-        }, monitor_generator(sst));
+        });
        // Did validation actually finish because aborted?
        if (cdata.is_stop_requested()) {
            // Compaction manager will catch this exception and re-schedule the compaction.
@@ -1799,10 +1699,9 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
        clogger.info("Finished scrubbing in validate mode {} - sstable is {}", sst->get_filename(), validation_errors == 0 ? "valid" : "invalid");
    }

-    using scrub = sstables::compaction_type_options::scrub;
-    if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
+    if (validation_errors != 0) {
        for (auto& sst : descriptor.sstables) {
-            co_await sst->change_state(sstables::sstable_state::quarantine);
+            co_await sst->change_state(sstables::quarantine_dir);
        }
    }

@@ -1815,15 +1714,8 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    };
 }

-future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
-    progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(table_s, use_backlog_tracker::no));
-    auto d = defer([&] { progress_monitor.reset_generator(); });
-    auto res = co_await scrub_sstables_validate_mode(descriptor, cdata, table_s, *progress_monitor._generator);
-    co_return res;
-}
-
 future<compaction_result>
-compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor) {
+compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s) {
    if (descriptor.sstables.empty()) {
        return make_exception_future<compaction_result>(std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}",
                compaction_name(descriptor.options.type()), table_s.schema()->ks_name(), table_s.schema()->cf_name())));
@@ -1831,9 +1723,9 @@ compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cd
    if (descriptor.options.type() == compaction_type::Scrub
            && std::get<compaction_type_options::scrub>(descriptor.options.options()).operation_mode == compaction_type_options::scrub::mode::validate) {
        // Bypass the usual compaction machinery for dry-mode scrub
-        return scrub_sstables_validate_mode(std::move(descriptor), cdata, table_s, progress_monitor);
+        return scrub_sstables_validate_mode(std::move(descriptor), cdata, table_s);
    }
-    return compaction::run(make_compaction(table_s, std::move(descriptor), cdata, progress_monitor));
+    return compaction::run(make_compaction(table_s, std::move(descriptor), cdata));
 }

 std::unordered_set<sstables::shared_sstable>
@@ -1851,7 +1743,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
    int64_t min_timestamp = std::numeric_limits<int64_t>::max();

    for (auto& sstable : overlapping) {
-        auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+        auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
        if (sstable->get_max_local_deletion_time() >= gc_before) {
            min_timestamp = std::min(min_timestamp, sstable->get_stats_metadata().min_timestamp);
        }
@@ -1870,7 +1762,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable

    // SSTables that do not contain live data are added to the list of possibly expired sstables.
    for (auto& candidate : compacting) {
-        auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+        auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
        clogger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
                    candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
        // A fully expired sstable which has an ancestor undeleted shouldn't be compacted because
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -13,8 +13,8 @@
 #include "compaction/compaction_descriptor.hh"
 #include "gc_clock.hh"
 #include "compaction_weight_registration.hh"
+#include "service/priority_manager.hh"
 #include "utils/UUID.hh"
-#include "utils/pretty_printers.hh"
 #include "table_state.hh"
 #include <seastar/core/thread.hh>
 #include <seastar/core/abort_source.hh>
@@ -25,6 +25,21 @@ namespace sstables {

 bool is_eligible_for_compaction(const sstables::shared_sstable& sst) noexcept;

+class pretty_printed_data_size {
+    uint64_t _size;
+public:
+    pretty_printed_data_size(uint64_t size) : _size(size) {}
+    friend std::ostream& operator<<(std::ostream&, pretty_printed_data_size);
+};
+
+class pretty_printed_throughput {
+    uint64_t _size;
+    std::chrono::duration<float> _duration;
+public:
+    pretty_printed_throughput(uint64_t size, std::chrono::duration<float> dur) : _size(size), _duration(std::move(dur)) {}
+    friend std::ostream& operator<<(std::ostream&, pretty_printed_throughput);
+};
+
 // Return the name of the compaction type
 // as used over the REST api, e.g. "COMPACTION" or "CLEANUP".
 sstring compaction_name(compaction_type type);
@@ -48,7 +63,6 @@ struct compaction_info {
 };

 struct compaction_data {
-    uint64_t compaction_size = 0;
    uint64_t total_partitions = 0;
    uint64_t total_keys_written = 0;
    sstring stop_requested;
@@ -101,27 +115,12 @@ struct compaction_result {
    compaction_stats stats;
 };

-class read_monitor_generator;
-
-class compaction_progress_monitor {
-    std::unique_ptr<read_monitor_generator> _generator = nullptr;
-    uint64_t _progress = 0;
-public:
-    void set_generator(std::unique_ptr<read_monitor_generator> generator);
-    void reset_generator();
-    // Returns number of bytes processed with _generator.
-    uint64_t get_progress() const;
-
-    friend class compaction;
-    friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, table_state&, compaction_progress_monitor&);
-};
-
 // Compact a list of N sstables into M sstables.
 // Returns info about the finished compaction, which includes vector to new sstables.
 //
 // compaction_descriptor is responsible for specifying the type of compaction, and influencing
 // compaction behavior through its available member fields.
-future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s, compaction_progress_monitor& progress_monitor);
+future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s);

 // Return list of expired sstables for column family cf.
 // A sstable is fully expired *iff* its max_local_deletion_time precedes gc_before and its
--- a/compaction/compaction_backlog_manager.hh
+++ b/compaction/compaction_backlog_manager.hh
@@ -12,6 +12,7 @@
 #include <memory>
 #include <seastar/core/shared_ptr.hh>
 #include "sstables/shared_sstable.hh"
+#include "sstables/progress_monitor.hh"
 #include "timestamp.hh"

 class compaction_backlog_manager;
@@ -59,20 +60,18 @@ public:
    using ongoing_compactions = std::unordered_map<sstables::shared_sstable, backlog_read_progress_manager*>;

    struct impl {
-        // FIXME: Should provide strong exception safety guarantees
-        virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) = 0;
+        virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) = 0;
        virtual double backlog(const ongoing_writes& ow, const ongoing_compactions& oc) const = 0;
        virtual ~impl() { }
    };

    compaction_backlog_tracker(std::unique_ptr<impl> impl) : _impl(std::move(impl)) {}
    compaction_backlog_tracker(compaction_backlog_tracker&&);
-    compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) = delete;
+    compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) noexcept;
    compaction_backlog_tracker(const compaction_backlog_tracker&) = delete;
    ~compaction_backlog_tracker();

    double backlog() const;
-    // FIXME: Should provide strong exception safety guarantees
    void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts);
    void register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp);
    void register_compacting_sstable(sstables::shared_sstable sst, backlog_read_progress_manager& rp);
--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -18,6 +18,7 @@
 #include "sstables/sstable_set.hh"
 #include "utils/UUID.hh"
 #include "dht/i_partitioner.hh"
+#include "compaction_weight_registration.hh"
 #include "compaction_fwd.hh"

 namespace sstables {
@@ -72,12 +73,6 @@ public:
            only, // scrub only quarantined sstables
        };
        quarantine_mode quarantine_operation_mode = quarantine_mode::include;
-
-        using quarantine_invalid_sstables = bool_class<class quarantine_invalid_sstables_tag>;
-
-        // Should invalid sstables be moved into quarantine.
-        // Only applies to validate-mode.
-        quarantine_invalid_sstables quarantine_sstables = quarantine_invalid_sstables::yes;
    };
    struct reshard {
    };
@@ -114,8 +109,8 @@ public:
        return compaction_type_options(upgrade{});
    }

-    static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes) {
-        return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables});
+    static compaction_type_options make_scrub(scrub::mode mode) {
+        return compaction_type_options(scrub{mode});
    }

    template <typename... Visitor>
@@ -123,11 +118,6 @@ public:
        return std::visit(std::forward<Visitor>(visitor)..., _options);
    }

-    template <typename OptionType>
-    const auto& as() const {
-        return std::get<OptionType>(_options);
-    }
-
    const options_variant& options() const { return _options; }

    compaction_type type() const;
@@ -161,12 +151,12 @@ struct compaction_descriptor {
    compaction_type_options options = compaction_type_options::make_regular();
    // If engaged, compaction will cleanup the input sstables by skipping non-owned ranges.
    compaction::owned_ranges_ptr owned_ranges;
-    // Required for reshard compaction.
-    const dht::sharder* sharder;

    compaction_sstable_creator_fn creator;
    compaction_sstable_replacer_fn replacer;

+    ::io_priority_class io_priority = default_priority_class();
+
    // Denotes if this compaction task is comprised solely of completely expired SSTables
    sstables::has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;

@@ -176,6 +166,7 @@ struct compaction_descriptor {
    static constexpr uint64_t default_max_sstable_bytes = std::numeric_limits<uint64_t>::max();

    explicit compaction_descriptor(std::vector<sstables::shared_sstable> sstables,
+                                   ::io_priority_class io_priority,
                                   int level = default_level,
                                   uint64_t max_sstable_bytes = default_max_sstable_bytes,
                                   run_id run_identifier = run_id::create_random_id(),
@@ -187,15 +178,18 @@ struct compaction_descriptor {
        , run_identifier(run_identifier)
        , options(options)
        , owned_ranges(std::move(owned_ranges_))
+        , io_priority(io_priority)
    {}

    explicit compaction_descriptor(sstables::has_only_fully_expired has_only_fully_expired,
-                                   std::vector<sstables::shared_sstable> sstables)
+                                   std::vector<sstables::shared_sstable> sstables,
+                                   ::io_priority_class io_priority)
        : sstables(std::move(sstables))
        , level(default_level)
        , max_sstable_bytes(default_max_sstable_bytes)
        , run_identifier(run_id::create_random_id())
        , options(compaction_type_options::make_regular())
+        , io_priority(io_priority)
        , has_only_fully_expired(has_only_fully_expired)
    {}

--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -31,8 +31,8 @@
 #include <functional>
 #include <algorithm>
 #include "compaction.hh"
+#include "compaction_weight_registration.hh"
 #include "compaction_backlog_manager.hh"
-#include "compaction/compaction_descriptor.hh"
 #include "compaction/task_manager_module.hh"
 #include "compaction_state.hh"
 #include "strategy_control.hh"
@@ -46,14 +46,14 @@ class system_keyspace;
 class compaction_history_entry;
 }

+class compacting_sstable_registration;
+
 class repair_history_map {
 public:
    boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
 };

 namespace compaction {
-using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
-
 class compaction_task_executor;
 class sstables_task_executor;
 class major_compaction_task_executor;
@@ -64,6 +64,8 @@ class rewrite_sstables_compaction_task_executor;
 class cleanup_sstables_compaction_task_executor;
 class validate_sstables_compaction_task_executor;
 }
+class compaction_manager_test_task_executor;
+
 // Compaction manager provides facilities to submit and track compaction jobs on
 // behalf of existing tables.
 class compaction_manager {
@@ -161,21 +163,7 @@ private:
    per_table_history_maps _repair_history_maps;
    tombstone_gc_state _tombstone_gc_state;
 private:
-    // Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
-    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
-
-    // Return nullopt if compaction cannot be started
-    std::optional<gate::holder> start_compaction(table_state& t);
-
-    // parent_info set to std::nullopt means that task manager should not register this task executor.
-    // To create a task manager task with no parent, parent_info argument should contain empty task_info.
-    template<typename TaskExecutor, typename... Args>
-    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
-            std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
-    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
-        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
-    }
-    future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
+    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor>);

    future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason);
    future<> update_throughput(uint32_t value_mbs);
@@ -194,20 +182,17 @@ private:
    // Get candidates for compaction strategy, which are all sstables but the ones being compacted.
    std::vector<sstables::shared_sstable> get_candidates(compaction::table_state& t) const;

-    bool eligible_for_compaction(const sstables::shared_sstable& sstable) const;
-    bool eligible_for_compaction(const sstables::frozen_sstable_run& sstable_run) const;
-
    template <std::ranges::range Range>
-    requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
-    std::vector<std::ranges::range_value_t<Range>> get_candidates(table_state& t, const Range& sstables) const;
+    requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable>
+    std::vector<sstables::shared_sstable> get_candidates(table_state& t, const Range& sstables) const;

-    template <std::ranges::range Range>
-    requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
-    void register_compacting_sstables(const Range& range);
+    template <typename Iterator, typename Sentinel>
+    requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
+    void register_compacting_sstables(Iterator first, Sentinel last);

-    template <std::ranges::range Range>
-    requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
-    void deregister_compacting_sstables(const Range& range);
+    template <typename Iterator, typename Sentinel>
+    requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
+    void deregister_compacting_sstables(Iterator first, Sentinel last);

    // gets the table's compaction state
    // throws std::out_of_range exception if not found.
@@ -226,7 +211,7 @@ private:
    // similar-sized compaction.
    void postpone_compaction_for_table(compaction::table_state* t);

-    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t);
    future<> update_static_shares(float shares);

    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -234,11 +219,10 @@ private:
    // Guarantees that a maintenance task, e.g. cleanup, will be performed on all files available at the time
    // by retrieving set of candidates only after all compactions for table T were stopped, if any.
    template<typename TaskType, typename... Args>
-    requires std::derived_from<TaskType, compaction_task_executor> &&
-            std::derived_from<TaskType, compaction_task_impl>
-    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(std::optional<tasks::task_info> info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
+    requires std::derived_from<TaskType, compaction::compaction_task_executor>
+    future<compaction_stats_opt> perform_task_on_all_files(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, Args... args);

-    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, std::optional<tasks::task_info> info, can_purge_tombstones can_purge = can_purge_tombstones::yes);
+    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, can_purge_tombstones can_purge = can_purge_tombstones::yes);

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
@@ -310,7 +294,7 @@ public:

    // Submit a table to be off-strategy compacted.
    // Returns true iff off-strategy compaction was required and performed.
-    future<bool> perform_offstrategy(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<bool> perform_offstrategy(compaction::table_state& t);

    // Submit a table to be cleaned up and wait for its termination.
    //
@@ -319,23 +303,21 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);
 private:
-    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);

    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
-
-    future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
 public:
    // Submit a table to be upgraded and wait for its termination.
-    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version);

    // Submit a table to be scrubbed and wait for its termination.
-    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info = std::nullopt);
+    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts);

    // Submit a table for major compaction.
-    future<> perform_major_compaction(compaction::table_state& t, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_major_compaction(compaction::table_state& t);


    // Run a custom job for a given table, defined by a function
@@ -345,7 +327,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, std::optional<tasks::task_info> info, throw_if_stopping do_throw_if_stopping);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);

    class compaction_reenabler {
        compaction_manager& _cm;
@@ -433,7 +415,6 @@ public:

    // checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
    bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
-    const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(table_state& t) const;

    friend class compacting_sstable_registration;
    friend class compaction_weight_registration;
@@ -448,11 +429,12 @@ public:
    friend class compaction::rewrite_sstables_compaction_task_executor;
    friend class compaction::cleanup_sstables_compaction_task_executor;
    friend class compaction::validate_sstables_compaction_task_executor;
+    friend class compaction_manager_test_task_executor;
 };

 namespace compaction {

-class compaction_task_executor : public enable_shared_from_this<compaction_task_executor> {
+class compaction_task_executor {
 public:
    enum class state {
        none,       // initial and final state
@@ -460,55 +442,42 @@ public:
                    // counted in compaction_manager::stats::pending_tasks
        active,     // task initiated active compaction, may alternate with pending
                    // counted in compaction_manager::stats::active_tasks
-        done,       // task completed successfully (may transition only to state::none, or
-                    // state::pending for regular compaction)
+        done,       // task completed successfully (may transition only to state::none)
                    // counted in compaction_manager::stats::completed_tasks
        postponed,  // task was postponed (may transition only to state::none)
                    // represented by the postponed_compactions metric
        failed,     // task failed (may transition only to state::none)
                    // counted in compaction_manager::stats::errors
    };
+    static std::string_view to_string(state);
 protected:
    compaction_manager& _cm;
    ::compaction::table_state* _compacting_table = nullptr;
    compaction::compaction_state& _compaction_state;
    sstables::compaction_data _compaction_data;
    state _state = state::none;
-    throw_if_stopping _do_throw_if_stopping;
-    sstables::compaction_progress_monitor _progress_monitor;

 private:
    shared_future<compaction_manager::compaction_stats_opt> _compaction_done = make_ready_future<compaction_manager::compaction_stats_opt>();
    exponential_backoff_retry _compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
    sstables::compaction_type _type;
    sstables::run_id _output_run_identifier;
+    gate::holder _gate_holder;
    sstring _description;
-    compaction_manager::compaction_stats_opt _stats = std::nullopt;

 public:
-    explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
+    explicit compaction_task_executor(compaction_manager& mgr, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);

    compaction_task_executor(compaction_task_executor&&) = delete;
    compaction_task_executor(const compaction_task_executor&) = delete;

-    virtual ~compaction_task_executor() = default;
-
-    // called when a compaction replaces the exhausted sstables with the new set
-    struct on_replacement {
-        virtual ~on_replacement() {}
-        // called after the replacement completes
-        // @param sstables the old sstable which are replaced in this replacement
-        virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
-        // called before the replacement happens
-        // @param sstables the new sstables to be added to the table's sstable set
-        virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
-    };
+    virtual ~compaction_task_executor();

 protected:
-    future<> perform();
-
    virtual future<compaction_manager::compaction_stats_opt> do_run() = 0;

+    using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
+
    state switch_state(state new_state);

    future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -525,27 +494,24 @@ protected:
    // otherwise, returns stop_iteration::no after sleep for exponential retry.
    future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);

-    future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
+    // Compacts set of SSTables according to the descriptor.
+    using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
+    future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
+                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
+    future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
-    future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
-                                compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
-                                sstables::offstrategy offstrategy = sstables::offstrategy::no);
    future<> update_history(::compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
    bool should_update_history(sstables::compaction_type ct) {
        return ct == sstables::compaction_type::Compaction;
    }
 public:
-    compaction_manager::compaction_stats_opt get_stats() const noexcept {
-        return _stats;
-    }
-
-    future<compaction_manager::compaction_stats_opt> run_compaction() noexcept;
+    future<compaction_manager::compaction_stats_opt> run() noexcept;

    const ::compaction::table_state* compacting_table() const noexcept {
        return _compacting_table;
    }

-    sstables::compaction_type compaction_type() const noexcept {
+    sstables::compaction_type type() const noexcept {
        return _type;
    }

@@ -571,46 +537,27 @@ public:
    const sstring& description() const noexcept {
        return _description;
    }
-private:
-    // Before _compaction_done is set in compaction_task_executor::run_compaction(), compaction_done() returns ready future.
+
    future<compaction_manager::compaction_stats_opt> compaction_done() noexcept {
        return _compaction_done.get_future();
    }
-public:
+
    bool stopping() const noexcept {
        return _compaction_data.abort.abort_requested();
    }

-    void stop_compaction(sstring reason) noexcept;
+    void stop(sstring reason) noexcept;

    sstables::compaction_stopped_exception make_compaction_stopped_exception() const;

-    template<typename TaskExecutor, typename... Args>
-    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
-            std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
-    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
-        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
-    }
-    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
-    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
-    friend fmt::formatter<compaction_task_executor>;
-    friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason);
+    std::string describe() const;
 };

+std::ostream& operator<<(std::ostream& os, compaction::compaction_task_executor::state s);
+std::ostream& operator<<(std::ostream& os, const compaction::compaction_task_executor& task);
+
 }

-template <>
-struct fmt::formatter<compaction::compaction_task_executor::state> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    auto format(compaction::compaction_task_executor::state c, fmt::format_context& ctx) const -> decltype(ctx.out());
-};
-
-template <>
-struct fmt::formatter<compaction::compaction_task_executor> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    auto format(const compaction::compaction_task_executor& ex, fmt::format_context& ctx) const  -> decltype(ctx.out());
-};
-
 bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);

 // Return all sstables but those that are off-strategy like the ones in maintenance set and staging dir.
--- a/compaction/compaction_state.hh
+++ b/compaction/compaction_state.hh
@@ -32,7 +32,7 @@ struct compaction_state {
    // Signaled whenever a compaction task completes.
    condition_variable compaction_done;

-    std::optional<compaction_backlog_tracker> backlog_tracker;
+    compaction_backlog_tracker backlog_tracker;

    std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
    compaction::owned_ranges_ptr owned_ranges_ptr;
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -12,8 +12,6 @@
 #include <vector>
 #include <chrono>
 #include <seastar/core/shared_ptr.hh>
-#include "seastar/core/on_internal_error.hh"
-#include "sstables/shared_sstable.hh"
 #include "sstables/sstables.hh"
 #include "compaction.hh"
 #include "compaction_strategy.hh"
@@ -26,6 +24,7 @@
 #include <boost/range/adaptors.hpp>
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include "size_tiered_compaction_strategy.hh"
+#include "date_tiered_compaction_strategy.hh"
 #include "leveled_compaction_strategy.hh"
 #include "time_window_compaction_strategy.hh"
 #include "backlog_controller.hh"
@@ -33,25 +32,26 @@
 #include "size_tiered_backlog_tracker.hh"
 #include "leveled_manifest.hh"

+logging::logger date_tiered_manifest::logger = logging::logger("DateTieredCompactionStrategy");
 logging::logger leveled_manifest::logger("LeveledManifest");

 namespace sstables {

 compaction_descriptor compaction_strategy_impl::make_major_compaction_job(std::vector<sstables::shared_sstable> candidates, int level, uint64_t max_sstable_bytes) {
    // run major compaction in maintenance priority
-    return compaction_descriptor(std::move(candidates), level, max_sstable_bytes);
+    return compaction_descriptor(std::move(candidates), service::get_local_streaming_priority(), level, max_sstable_bytes);
 }

 std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const {
    // The default implementation is suboptimal and causes the writeamp problem described issue in #10097.
    // The compaction strategy relying on it should strive to implement its own method, to make cleanup bucket aware.
    return boost::copy_range<std::vector<compaction_descriptor>>(candidates | boost::adaptors::transformed([] (const shared_sstable& sst) {
-        return compaction_descriptor({ sst },
+        return compaction_descriptor({ sst }, service::get_local_compaction_priority(),
            sst->get_sstable_level(), sstables::compaction_descriptor::default_max_sstable_bytes, sst->run_identifier());
    }));
 }

-bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t) {
+bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state) {
    if (_disable_tombstone_compaction) {
        return false;
    }
@@ -62,7 +62,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    if (db_clock::now()-_tombstone_compaction_interval < sst->data_file_write_time()) {
        return false;
    }
-    auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, t.get_tombstone_gc_state(), t.schema());
+    auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, gc_state);
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

@@ -75,7 +75,7 @@ reader_consumer_v2 compaction_strategy_impl::make_interposer_consumer(const muta
 }

 compaction_descriptor
-compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
    return compaction_descriptor();
 }

@@ -87,96 +87,17 @@ std::optional<sstring> compaction_strategy_impl::get_value(const std::map<sstrin
    return it->second;
 }

-void compaction_strategy_impl::validate_min_max_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto min_threshold_key = "min_threshold", max_threshold_key = "max_threshold";
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, min_threshold_key);
-    auto min_threshold = cql3::statements::property_definitions::to_long(min_threshold_key, tmp_value, DEFAULT_MIN_COMPACTION_THRESHOLD);
-    if (min_threshold < 2) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be bigger or equal to 2", min_threshold_key, min_threshold));
-    }
-
-    tmp_value = compaction_strategy_impl::get_value(options, max_threshold_key);
-    auto max_threshold = cql3::statements::property_definitions::to_long(max_threshold_key, tmp_value, DEFAULT_MAX_COMPACTION_THRESHOLD);
-    if (max_threshold < 2) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be bigger or equal to 2", max_threshold_key, max_threshold));
-    }
-
-    unchecked_options.erase(min_threshold_key);
-    unchecked_options.erase(max_threshold_key);
-}
-
-static double validate_tombstone_threshold(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION);
-    auto tombstone_threshold = cql3::statements::property_definitions::to_double(compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION, tmp_value, compaction_strategy_impl::DEFAULT_TOMBSTONE_THRESHOLD);
-    if (tombstone_threshold < 0.0 || tombstone_threshold > 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION, tombstone_threshold));
-    }
-    return tombstone_threshold;
-}
-
-static double validate_tombstone_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto tombstone_threshold = validate_tombstone_threshold(options);
-    unchecked_options.erase(compaction_strategy_impl::TOMBSTONE_THRESHOLD_OPTION);
-    return tombstone_threshold;
-}
-
-static db_clock::duration validate_tombstone_compaction_interval(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-    auto interval = cql3::statements::property_definitions::to_long(compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION, tmp_value, compaction_strategy_impl::DEFAULT_TOMBSTONE_COMPACTION_INTERVAL().count());
-    auto tombstone_compaction_interval = db_clock::duration(std::chrono::seconds(interval));
-    if (interval <= 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be positive", compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION, tombstone_compaction_interval));
-    }
-    return tombstone_compaction_interval;
-}
-
-static db_clock::duration validate_tombstone_compaction_interval(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto tombstone_compaction_interval = validate_tombstone_compaction_interval(options);
-    unchecked_options.erase(compaction_strategy_impl::TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-    return tombstone_compaction_interval;
-}
-
-void compaction_strategy_impl::validate_options_for_strategy_type(const std::map<sstring, sstring>& options, sstables::compaction_strategy_type type) {
-    auto unchecked_options = options;
-    compaction_strategy_impl::validate_options(options, unchecked_options);
-    switch (type) {
-        case compaction_strategy_type::size_tiered:
-            size_tiered_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        case compaction_strategy_type::leveled:
-            leveled_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        case compaction_strategy_type::time_window:
-            time_window_compaction_strategy::validate_options(options, unchecked_options);
-            break;
-        default:
-            break;
-    }
-
-    unchecked_options.erase("class");
-    if (!unchecked_options.empty()) {
-        throw exceptions::configuration_exception(fmt::format("Invalid compaction strategy options {} for chosen strategy type", unchecked_options));
-    }
-}
-
-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void compaction_strategy_impl::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    validate_tombstone_threshold(options, unchecked_options);
-    validate_tombstone_compaction_interval(options, unchecked_options);
-
-    auto it = options.find("enabled");
-    if (it != options.end() && it->second != "true" && it->second != "false") {
-        throw exceptions::configuration_exception(fmt::format("enabled value ({}) must be \"true\" or \"false\"", it->second));
-    }
-    unchecked_options.erase("enabled");
-}
-
 compaction_strategy_impl::compaction_strategy_impl(const std::map<sstring, sstring>& options) {
-    _tombstone_threshold = validate_tombstone_threshold(options);
-    _tombstone_compaction_interval = validate_tombstone_compaction_interval(options);
+    using namespace cql3::statements;
+
+    auto tmp_value = get_value(options, TOMBSTONE_THRESHOLD_OPTION);
+    _tombstone_threshold = property_definitions::to_double(TOMBSTONE_THRESHOLD_OPTION, tmp_value, DEFAULT_TOMBSTONE_THRESHOLD);
+
+    tmp_value = get_value(options, TOMBSTONE_COMPACTION_INTERVAL_OPTION);
+    auto interval = property_definitions::to_long(TOMBSTONE_COMPACTION_INTERVAL_OPTION, tmp_value, DEFAULT_TOMBSTONE_COMPACTION_INTERVAL().count());
+    _tombstone_compaction_interval = db_clock::duration(std::chrono::seconds(interval));
+
+    // FIXME: validate options.
 }

 } // namespace sstables
@@ -188,7 +109,7 @@ size_tiered_backlog_tracker::compacted_backlog(const compaction_backlog_tracker:
        // A SSTable being compacted may not contribute to backlog if compaction strategy decided
        // to perform a low-efficiency compaction when system is under little load, or when user
        // performs major even though strategy is completely satisfied
-        if (!_contrib.sstables.contains(crp.first)) {
+        if (!_sstables_contributing_backlog.contains(crp.first)) {
            continue;
        }
        auto compacted = crp.second->compacted();
@@ -198,11 +119,11 @@ size_tiered_backlog_tracker::compacted_backlog(const compaction_backlog_tracker:
    return in;
 }

-// Provides strong exception safety guarantees.
-size_tiered_backlog_tracker::sstables_backlog_contribution size_tiered_backlog_tracker::calculate_sstables_backlog_contribution(const std::vector<sstables::shared_sstable>& all, const sstables::size_tiered_compaction_strategy_options& stcs_options) {
-    sstables_backlog_contribution contrib;
-    if (all.empty()) {
-        return contrib;
+void size_tiered_backlog_tracker::refresh_sstables_backlog_contribution() {
+    _sstables_backlog_contribution = 0.0f;
+    _sstables_contributing_backlog = {};
+    if (_all.empty()) {
+        return;
    }
    using namespace sstables;

@@ -212,27 +133,25 @@ size_tiered_backlog_tracker::sstables_backlog_contribution size_tiered_backlog_t
    // in efficient jobs acting more aggressive than they really have to.
    // TODO: potentially switch to compaction manager's fan-in threshold, so to account for the dynamic
    //  fan-in threshold behavior.
-    const auto& newest_sst = std::ranges::max(all, std::less<generation_type>(), std::mem_fn(&sstable::generation));
+    const auto& newest_sst = std::ranges::max(_all, std::less<generation_type>(), std::mem_fn(&sstable::generation));
    auto threshold = newest_sst->get_schema()->min_compaction_threshold();

-    for (auto& bucket : size_tiered_compaction_strategy::get_buckets(all, stcs_options)) {
+    for (auto& bucket : size_tiered_compaction_strategy::get_buckets(boost::copy_range<std::vector<shared_sstable>>(_all), _stcs_options)) {
        if (!size_tiered_compaction_strategy::is_bucket_interesting(bucket, threshold)) {
            continue;
        }
-        contrib.value += boost::accumulate(bucket | boost::adaptors::transformed([] (const shared_sstable& sst) -> double {
+        _sstables_backlog_contribution += boost::accumulate(bucket | boost::adaptors::transformed([this] (const shared_sstable& sst) -> double {
            return sst->data_size() * log4(sst->data_size());
        }), double(0.0f));
        // Controller is disabled if exception is caught during add / remove calls, so not making any effort to make this exception safe
-        contrib.sstables.insert(bucket.begin(), bucket.end());
+        _sstables_contributing_backlog.insert(bucket.begin(), bucket.end());
    }
-
-    return contrib;
 }

 double size_tiered_backlog_tracker::backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const {
    inflight_component compacted = compacted_backlog(oc);

-    auto total_backlog_bytes = boost::accumulate(_contrib.sstables | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::data_size)), uint64_t(0));
+    auto total_backlog_bytes = boost::accumulate(_sstables_contributing_backlog | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::data_size)), uint64_t(0));

    // Bail out if effective backlog is zero, which happens in a small window where ongoing compaction exhausted
    // input files but is still sealing output files or doing managerial stuff like updating history table
@@ -249,41 +168,26 @@ double size_tiered_backlog_tracker::backlog(const compaction_backlog_tracker::on
    auto effective_backlog_bytes = total_backlog_bytes - compacted.total_bytes;

    // Sum of (Si - Ci) * log (Si) for all SSTables contributing backlog
-    auto sstables_contribution = _contrib.value - compacted.contribution;
+    auto sstables_contribution = _sstables_backlog_contribution - compacted.contribution;
    // This is subtracting ((Si - Ci) * log (Si)) from ((Si - Ci) * log(T)), yielding the final backlog
    auto b = (effective_backlog_bytes * log4(_total_bytes)) - sstables_contribution;
    return b > 0 ? b : 0;
 }

-// Provides strong exception safety guarantees.
-void size_tiered_backlog_tracker::replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) {
-    auto tmp_all = _all;
-    auto tmp_total_bytes = _total_bytes;
-    tmp_all.reserve(_all.size() + new_ssts.size());
-
+void size_tiered_backlog_tracker::replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) {
    for (auto& sst : old_ssts) {
        if (sst->data_size() > 0) {
-            auto erased = tmp_all.erase(sst);
-            if (erased) {
-                tmp_total_bytes -= sst->data_size();
-            }
+            _total_bytes -= sst->data_size();
+            _all.erase(sst);
        }
    }
    for (auto& sst : new_ssts) {
        if (sst->data_size() > 0) {
-            auto [_, inserted] = tmp_all.insert(sst);
-            if (inserted) {
-                tmp_total_bytes += sst->data_size();
-            }
+            _total_bytes += sst->data_size();
+            _all.insert(std::move(sst));
        }
    }
-    auto tmp_contrib = calculate_sstables_backlog_contribution(boost::copy_range<std::vector<shared_sstable>>(tmp_all), _stcs_options);
-
-    std::invoke([&] () noexcept {
-        _all = std::move(tmp_all);
-        _total_bytes = tmp_total_bytes;
-        _contrib = std::move(tmp_contrib);
-    });
+    refresh_sstables_backlog_contribution();
 }

 namespace sstables {
@@ -361,25 +265,23 @@ public:
        return b;
    }

-    // Provides strong exception safety guarantees
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {
        struct replacement {
            std::vector<sstables::shared_sstable> old_ssts;
            std::vector<sstables::shared_sstable> new_ssts;
        };
        std::unordered_map<api::timestamp_type, replacement> per_window_replacement;
-        auto tmp_windows = _windows;

        for (auto& sst : new_ssts) {
            auto bound = lower_bound_of(sst->get_stats_metadata().max_timestamp);
-            if (!tmp_windows.contains(bound)) {
-                tmp_windows.emplace(bound, size_tiered_backlog_tracker(_stcs_options));
+            if (!_windows.contains(bound)) {
+                _windows.emplace(bound, size_tiered_backlog_tracker(_stcs_options));
            }
            per_window_replacement[bound].new_ssts.push_back(std::move(sst));
        }
        for (auto& sst : old_ssts) {
            auto bound = lower_bound_of(sst->get_stats_metadata().max_timestamp);
-            if (tmp_windows.contains(bound)) {
+            if (_windows.contains(bound)) {
                per_window_replacement[bound].old_ssts.push_back(std::move(sst));
            }
        }
@@ -387,20 +289,12 @@ public:
        for (auto& [bound, r] : per_window_replacement) {
            // All windows must exist here, as windows are created for new files and will
            // remain alive as long as there's a single file in them
-            auto it = tmp_windows.find(bound);
-            if (it == tmp_windows.end()) {
-                on_internal_error(clogger, fmt::format("window for bound {} not found", bound));
-            }
-            auto& w = it->second;
-            w.replace_sstables(r.old_ssts, r.new_ssts);
+            auto& w = _windows.at(bound);
+            w.replace_sstables(std::move(r.old_ssts), std::move(r.new_ssts));
            if (w.total_bytes() <= 0) {
-                tmp_windows.erase(bound);
+                _windows.erase(bound);
            }
        }
-
-        std::invoke([&] () noexcept {
-            _windows = std::move(tmp_windows);
-        });
    }
 };

@@ -500,31 +394,25 @@ public:
        return b;
    }

-    // Provides strong exception safety guarantees
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {
-        auto tmp_size_per_level = _size_per_level;
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {
        std::vector<sstables::shared_sstable> l0_old_ssts, l0_new_ssts;
        for (auto& sst : new_ssts) {
            auto level = sst->get_sstable_level();
-            tmp_size_per_level[level] += sst->data_size();
+            _size_per_level[level] += sst->data_size();
            if (level == 0) {
                l0_new_ssts.push_back(std::move(sst));
            }
        }
        for (auto& sst : old_ssts) {
            auto level = sst->get_sstable_level();
-            tmp_size_per_level[level] -= sst->data_size();
+            _size_per_level[level] -= sst->data_size();
            if (level == 0) {
                l0_old_ssts.push_back(std::move(sst));
            }
        }
        if (l0_old_ssts.size() || l0_new_ssts.size()) {
-            // stcs replace_sstables guarantees strong exception safety
            _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
        }
-        std::invoke([&] () noexcept {
-            _size_per_level = std::move(tmp_size_per_level);
-        });
    }
 };

@@ -532,14 +420,14 @@ struct unimplemented_backlog_tracker final : public compaction_backlog_tracker::
    virtual double backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const override {
        return compaction_controller::disable_backlog;
    }
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {}
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
 };

 struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
    virtual double backlog(const compaction_backlog_tracker::ongoing_writes& ow, const compaction_backlog_tracker::ongoing_compactions& oc) const override {
        return 0;
    }
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override {}
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
 };

 //
@@ -548,7 +436,7 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
 //
 class null_compaction_strategy : public compaction_strategy_impl {
 public:
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override {
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override {
        return sstables::compaction_descriptor();
    }

@@ -572,20 +460,6 @@ leveled_compaction_strategy::leveled_compaction_strategy(const std::map<sstring,
 {
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void leveled_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, SSTABLE_SIZE_OPTION);
-    auto min_sstables_size = cql3::statements::property_definitions::to_long(SSTABLE_SIZE_OPTION, tmp_value, DEFAULT_MAX_SSTABLE_SIZE_IN_MB);
-    if (min_sstables_size <= 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be positive", SSTABLE_SIZE_OPTION, min_sstables_size));
-    }
-    unchecked_options.erase(SSTABLE_SIZE_OPTION);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> leveled_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options);
 }
@@ -619,22 +493,201 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
    _use_clustering_key_filter = true;
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void time_window_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    time_window_compaction_strategy_options::validate(options, unchecked_options);
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> time_window_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<time_window_backlog_tracker>(_options, _stcs_options);
 }

 } // namespace sstables

+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_next_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& uncompacting, gc_clock::time_point compaction_time) {
+    if (table_s.main_sstable_set().all()->empty()) {
+        return {};
+    }
+
+    // Find fully expired SSTables. Those will be included no matter what.
+    auto expired = table_s.fully_expired_sstables(uncompacting, compaction_time);
+
+    if (!expired.empty()) {
+        auto is_expired = [&] (const sstables::shared_sstable& s) { return expired.contains(s); };
+        uncompacting.erase(boost::remove_if(uncompacting, is_expired), uncompacting.end());
+    }
+
+    auto compaction_candidates = get_next_non_expired_sstables(table_s, uncompacting, compaction_time);
+    if (!expired.empty()) {
+        compaction_candidates.insert(compaction_candidates.end(), expired.begin(), expired.end());
+    }
+    return compaction_candidates;
+}
+
+int64_t date_tiered_manifest::get_estimated_tasks(table_state& table_s) const {
+    int base = table_s.schema()->min_compaction_threshold();
+    int64_t now = get_now(table_s.main_sstable_set().all());
+    std::vector<sstables::shared_sstable> sstables;
+    int64_t n = 0;
+
+    auto all_sstables = table_s.main_sstable_set().all();
+    sstables.reserve(all_sstables->size());
+    for (auto& entry : *all_sstables) {
+        sstables.push_back(entry);
+    }
+    auto candidates = filter_old_sstables(sstables, _options.max_sstable_age, now);
+    auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
+
+    for (auto& bucket : buckets) {
+        if (bucket.size() >= size_t(table_s.schema()->min_compaction_threshold())) {
+            n += std::ceil(double(bucket.size()) / table_s.schema()->max_compaction_threshold());
+        }
+    }
+    return n;
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_next_non_expired_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& non_expiring_sstables, gc_clock::time_point compaction_time) {
+    int base = table_s.schema()->min_compaction_threshold();
+    int64_t now = get_now(table_s.main_sstable_set().all());
+    auto most_interesting = get_compaction_candidates(table_s, non_expiring_sstables, now, base);
+
+    return most_interesting;
+
+    // FIXME: implement the functionality below, which looks for a single sstable that is worth compacting
+    // to drop its tombstones, iff the strategy didn't find anything else to compact. It's not essential.
+#if 0
+    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
+    // ratio is greater than threshold.
+
+    List<SSTableReader> sstablesWithTombstones = Lists.newArrayList();
+    for (SSTableReader sstable : nonExpiringSSTables)
+    {
+        if (worthDroppingTombstones(sstable, gcBefore))
+            sstablesWithTombstones.add(sstable);
+    }
+    if (sstablesWithTombstones.isEmpty())
+        return Collections.emptyList();
+
+    return Collections.singletonList(Collections.min(sstablesWithTombstones, new SSTableReader.SizeComparator()));
+#endif
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::get_compaction_candidates(table_state& table_s, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base) {
+    int min_threshold = table_s.schema()->min_compaction_threshold();
+    int max_threshold = table_s.schema()->max_compaction_threshold();
+    auto candidates = filter_old_sstables(candidate_sstables, _options.max_sstable_age, now);
+
+    auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
+
+    return newest_bucket(buckets, min_threshold, max_threshold, now, _options.base_time);
+}
+
+int64_t date_tiered_manifest::get_now(lw_shared_ptr<const sstables::sstable_list> shared_set) {
+    int64_t max_timestamp = 0;
+    for (auto& sst : *shared_set) {
+        int64_t candidate = sst->get_stats_metadata().max_timestamp;
+        max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
+    }
+    return max_timestamp;
+}
+
+std::vector<sstables::shared_sstable>
+date_tiered_manifest::filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now) {
+    if (max_sstable_age == 0) {
+        return sstables;
+    }
+    int64_t cutoff = now - max_sstable_age;
+
+    std::erase_if(sstables, [cutoff] (auto& sst) {
+        return sst->get_stats_metadata().max_timestamp < cutoff;
+    });
+
+    return sstables;
+}
+
+std::vector<std::pair<sstables::shared_sstable,int64_t>>
+date_tiered_manifest::create_sst_and_min_timestamp_pairs(const std::vector<sstables::shared_sstable>& sstables) {
+    std::vector<std::pair<sstables::shared_sstable,int64_t>> sstable_min_timestamp_pairs;
+    sstable_min_timestamp_pairs.reserve(sstables.size());
+    for (auto& sst : sstables) {
+        sstable_min_timestamp_pairs.emplace_back(sst, sst->get_stats_metadata().min_timestamp);
+    }
+    return sstable_min_timestamp_pairs;
+}
+
+date_tiered_compaction_strategy_options::date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
+    using namespace cql3::statements;
+
+    auto tmp_value = sstables::compaction_strategy_impl::get_value(options, TIMESTAMP_RESOLUTION_KEY);
+    auto target_unit = tmp_value ? tmp_value.value() : DEFAULT_TIMESTAMP_RESOLUTION;
+
+    tmp_value = sstables::compaction_strategy_impl::get_value(options, MAX_SSTABLE_AGE_KEY);
+    auto fractional_days = property_definitions::to_double(MAX_SSTABLE_AGE_KEY, tmp_value, DEFAULT_MAX_SSTABLE_AGE_DAYS);
+    int64_t max_sstable_age_in_hours = std::lround(fractional_days * 24);
+    max_sstable_age = duration_conversor::convert(target_unit, std::chrono::hours(max_sstable_age_in_hours));
+
+    tmp_value = sstables::compaction_strategy_impl::get_value(options, BASE_TIME_KEY);
+    auto base_time_seconds = property_definitions::to_long(BASE_TIME_KEY, tmp_value, DEFAULT_BASE_TIME_SECONDS);
+    base_time = duration_conversor::convert(target_unit, std::chrono::seconds(base_time_seconds));
+}
+
+date_tiered_compaction_strategy_options::date_tiered_compaction_strategy_options() {
+    auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
+    max_sstable_age = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::hours(max_sstable_age_in_hours)).count();
+    base_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS)).count();
+}
+
 namespace sstables {

+date_tiered_compaction_strategy::date_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
+    : compaction_strategy_impl(options)
+    , _manifest(options)
+{
+    clogger.warn("DateTieredCompactionStrategy is deprecated. Usually cases for which it is used are better handled by TimeWindowCompactionStrategy."
+            " Please change your compaction strategy to TWCS as DTCS will be retired in the near future");
+
+    // tombstone compaction is disabled by default because:
+    // - deletion shouldn't be used with DTCS; rather data is deleted through TTL.
+    // - with time series workloads, it's usually better to wait for whole sstable to be expired rather than
+    // compacting a single sstable when it's more than 20% (default value) expired.
+    // For more details, see CASSANDRA-9234
+    if (!options.contains(TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.contains(TOMBSTONE_THRESHOLD_OPTION)) {
+        _disable_tombstone_compaction = true;
+        date_tiered_manifest::logger.debug("Disabling tombstone compactions for DTCS");
+    } else {
+        date_tiered_manifest::logger.debug("Enabling tombstone compactions for DTCS");
+    }
+
+    _use_clustering_key_filter = true;
+}
+
+compaction_descriptor date_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
+    auto compaction_time = gc_clock::now();
+    auto sstables = _manifest.get_next_sstables(table_s, candidates, compaction_time);
+
+    if (!sstables.empty()) {
+        date_tiered_manifest::logger.debug("datetiered: Compacting {} out of {} sstables", sstables.size(), candidates.size());
+        return sstables::compaction_descriptor(std::move(sstables), service::get_local_compaction_priority());
+    }
+
+    // filter out sstables whose droppable tombstone ratio is below the defined threshold.
+    auto e = boost::range::remove_if(candidates, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
+        return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
+    });
+    candidates.erase(e, candidates.end());
+    if (candidates.empty()) {
+        return sstables::compaction_descriptor();
+    }
+    // find the oldest sstable that is worth dropping tombstones from, because its tombstones are less likely
+    // to shadow data from other sstables, and it also tends to be relatively big.
+    auto it = std::min_element(candidates.begin(), candidates.end(), [] (auto& i, auto& j) {
+        return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
+    });
+    return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
+}
+
+std::unique_ptr<compaction_backlog_tracker::impl> date_tiered_compaction_strategy::make_backlog_tracker() const {
+    return std::make_unique<unimplemented_backlog_tracker>();
+}
+
 size_tiered_compaction_strategy::size_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
    : compaction_strategy_impl(options)
    , _options(options)
@@ -644,13 +697,6 @@ size_tiered_compaction_strategy::size_tiered_compaction_strategy(const size_tier
    : _options(options)
 {}

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void size_tiered_compaction_strategy::validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    size_tiered_compaction_strategy_options::validate(options, unchecked_options);
-}
-
 std::unique_ptr<compaction_backlog_tracker::impl> size_tiered_compaction_strategy::make_backlog_tracker() const {
    return std::make_unique<size_tiered_backlog_tracker>(_options);
 }
@@ -667,8 +713,8 @@ compaction_strategy_type compaction_strategy::type() const {
    return _compaction_strategy_impl->type();
 }

-compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
-    return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control);
+compaction_descriptor compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
+    return _compaction_strategy_impl->get_sstables_for_compaction(table_s, control, std::move(candidates));
 }

 compaction_descriptor compaction_strategy::get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
@@ -700,8 +746,8 @@ compaction_backlog_tracker compaction_strategy::make_backlog_tracker() const {
 }

 sstables::compaction_descriptor
-compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
-    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, mode);
+compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
+    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

 uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) const {
@@ -729,6 +775,9 @@ compaction_strategy make_compaction_strategy(compaction_strategy_type strategy,
    case compaction_strategy_type::leveled:
        impl = ::make_shared<leveled_compaction_strategy>(options);
        break;
+    case compaction_strategy_type::date_tiered:
+        impl = ::make_shared<date_tiered_compaction_strategy>(options);
+        break;
    case compaction_strategy_type::time_window:
        impl = ::make_shared<time_window_compaction_strategy>(options);
        break;
@@ -747,6 +796,7 @@ compaction_strategy_state compaction_strategy_state::make(const compaction_strat
    switch (cs.type()) {
        case compaction_strategy_type::null:
        case compaction_strategy_type::size_tiered:
+        case compaction_strategy_type::date_tiered:
            return compaction_strategy_state(default_empty_state{});
        case compaction_strategy_type::leveled:
            return compaction_strategy_state(leveled_compaction_strategy_state{});
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -44,7 +44,7 @@ public:
    compaction_strategy& operator=(compaction_strategy&&);

    // Return a list of sstables to be compacted after applying the strategy.
-    compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control);
+    compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidates);

    compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<shared_sstable> candidates);

@@ -71,6 +71,8 @@ public:
            return "SizeTieredCompactionStrategy";
        case compaction_strategy_type::leveled:
            return "LeveledCompactionStrategy";
+        case compaction_strategy_type::date_tiered:
+            return "DateTieredCompactionStrategy";
        case compaction_strategy_type::time_window:
            return "TimeWindowCompactionStrategy";
        default:
@@ -87,6 +89,8 @@ public:
            return compaction_strategy_type::size_tiered;
        } else if (short_name == "LeveledCompactionStrategy") {
            return compaction_strategy_type::leveled;
+        } else if (short_name == "DateTieredCompactionStrategy") {
+            return compaction_strategy_type::date_tiered;
        } else if (short_name == "TimeWindowCompactionStrategy") {
            return compaction_strategy_type::time_window;
        } else {
@@ -122,7 +126,7 @@ public:
    //
    // The caller should also pass a maximum number of SSTables which is the maximum amount of
    // SSTables that can be added into a single job.
-    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
+    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const;

 };

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -21,23 +21,20 @@ class sstable_set_impl;
 class resharding_descriptor;

 class compaction_strategy_impl {
-public:
    static constexpr float DEFAULT_TOMBSTONE_THRESHOLD = 0.2f;
    // minimum interval needed to perform tombstone removal compaction in seconds, default 86400 or 1 day.
    static constexpr std::chrono::seconds DEFAULT_TOMBSTONE_COMPACTION_INTERVAL() { return std::chrono::seconds(86400); }
-    static constexpr auto TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
-    static constexpr auto TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
 protected:
+    const sstring TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
+    const sstring TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
+
    bool _use_clustering_key_filter = false;
    bool _disable_tombstone_compaction = false;
    float _tombstone_threshold = DEFAULT_TOMBSTONE_THRESHOLD;
    db_clock::duration _tombstone_compaction_interval = DEFAULT_TOMBSTONE_COMPACTION_INTERVAL();
 public:
    static std::optional<sstring> get_value(const std::map<sstring, sstring>& options, const sstring& name);
-    static void validate_min_max_threshold(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
-    static void validate_options_for_strategy_type(const std::map<sstring, sstring>& options, sstables::compaction_strategy_type type);
 protected:
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
    compaction_strategy_impl() = default;
    explicit compaction_strategy_impl(const std::map<sstring, sstring>& options);
    static compaction_descriptor make_major_compaction_job(std::vector<sstables::shared_sstable> candidates,
@@ -45,7 +42,7 @@ protected:
            uint64_t max_sstable_bytes = compaction_descriptor::default_max_sstable_bytes);
 public:
    virtual ~compaction_strategy_impl() {}
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) = 0;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) = 0;
    virtual compaction_descriptor get_major_compaction_job(table_state& table_s, std::vector<sstables::shared_sstable> candidates) {
        return make_major_compaction_job(std::move(candidates));
    }
@@ -64,7 +61,7 @@ public:

    // Check if a given sstable is entitled for tombstone compaction based on its
    // droppable tombstone histogram and gc_before.
-    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t);
+    bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state);

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const = 0;

@@ -76,6 +73,6 @@ public:
        return false;
    }

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const;
 };
 }
--- a/compaction/compaction_strategy_type.hh
+++ b/compaction/compaction_strategy_type.hh
@@ -14,6 +14,7 @@ enum class compaction_strategy_type {
    null,
    size_tiered,
    leveled,
+    date_tiered,
    time_window,
 };

--- a/compaction/date_tiered_compaction_strategy.hh
+++ b/compaction/date_tiered_compaction_strategy.hh
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2016-present-2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
+ */
+
+#pragma once
+
+#include <map>
+#include <chrono>
+#include <algorithm>
+#include <vector>
+#include <iterator>
+#include "sstables/sstables.hh"
+#include "compaction.hh"
+#include "timestamp.hh"
+#include "cql3/statements/property_definitions.hh"
+#include "compaction_strategy_impl.hh"
+
+static constexpr double DEFAULT_MAX_SSTABLE_AGE_DAYS = 365;
+static constexpr int64_t DEFAULT_BASE_TIME_SECONDS = 60;
+
+struct duration_conversor {
+    // Re-express d in units of TargetDuration and return the resulting tick count.
+    template <typename TargetDuration, typename SourceDuration>
+    static api::timestamp_type convert(SourceDuration d) {
+        const auto converted = std::chrono::duration_cast<TargetDuration>(d);
+        return converted.count();
+    }
+
+    // Runtime-dispatched variant: target_duration names the unit (e.g. "SECONDS").
+    // Throws std::runtime_error when the name matches none of the units below.
+    template <typename SourceDuration>
+    static api::timestamp_type convert(const sstring& target_duration, SourceDuration d) {
+        if (target_duration == "NANOSECONDS") {
+            return convert<std::chrono::nanoseconds>(d);
+        } else if (target_duration == "MICROSECONDS") {
+            return convert<std::chrono::microseconds>(d);
+        } else if (target_duration == "MILLISECONDS") {
+            return convert<std::chrono::milliseconds>(d);
+        } else if (target_duration == "SECONDS") {
+            return convert<std::chrono::seconds>(d);
+        } else if (target_duration == "MINUTES") {
+            return convert<std::chrono::minutes>(d);
+        } else if (target_duration == "HOURS") {
+            return convert<std::chrono::hours>(d);
+        }
+        throw std::runtime_error(format("target duration {} is not available", target_duration));
+    }
+};
+
+class date_tiered_compaction_strategy_options { // carries max_sstable_age and base_time for date_tiered_manifest
+    const sstring DEFAULT_TIMESTAMP_RESOLUTION = "MICROSECONDS"; // per-instance constant (not static)
+    const sstring TIMESTAMP_RESOLUTION_KEY = "timestamp_resolution"; // presumably an options-map key; confirm in the .cc
+    const sstring MAX_SSTABLE_AGE_KEY = "max_sstable_age_days"; // presumably an options-map key; confirm in the .cc
+    const sstring BASE_TIME_KEY = "base_time_seconds"; // presumably an options-map key; confirm in the .cc
+
+    api::timestamp_type max_sstable_age; // no default initializer here; NOTE(review): unit is not visible in this header
+    api::timestamp_type base_time; // no default initializer here; NOTE(review): unit is not visible in this header
+public:
+    date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options); // defined out of line
+
+    date_tiered_compaction_strategy_options(); // defined out of line
+private:
+
+    friend class date_tiered_manifest; // lets the manifest read the private fields above
+};
+
+class date_tiered_manifest {
+    date_tiered_compaction_strategy_options _options;
+public:
+    static logging::logger logger;
+
+    date_tiered_manifest() = delete;
+
+    date_tiered_manifest(const std::map<sstring, sstring>& options)
+        : _options(options) {}
+
+    std::vector<sstables::shared_sstable>
+    get_next_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& uncompacting, gc_clock::time_point compaction_time);
+
+    int64_t get_estimated_tasks(table_state& table_s) const;
+private:
+    std::vector<sstables::shared_sstable>
+    get_next_non_expired_sstables(table_state& table_s, std::vector<sstables::shared_sstable>& non_expiring_sstables, gc_clock::time_point compaction_time);
+
+    std::vector<sstables::shared_sstable>
+    get_compaction_candidates(table_state& table_s, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base);
+
+    /**
+     * Gets the timestamp that DateTieredCompactionStrategy considers to be the "current time".
+     * @return the maximum timestamp across all SSTables.
+     */
+    static int64_t get_now(lw_shared_ptr<const sstables::sstable_list> shared_set);
+
+    /**
+     * Removes all sstables with max timestamp older than maxSSTableAge.
+     * @return a list of sstables with the oldest sstables excluded
+     */
+    static std::vector<sstables::shared_sstable>
+    filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now);
+
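+    /**
+     * Pairs sstables with a timestamp (presumably each one's min timestamp, going by the name; confirm in the .cc).
+     * @param sstables the sstables to build pairs for
+     * @return a list of (sstable, int64_t timestamp) pairs
+     */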
+    static std::vector<std::pair<sstables::shared_sstable,int64_t>>
+    create_sst_and_min_timestamp_pairs(const std::vector<sstables::shared_sstable>& sstables);
+
+    /**
+     * A target time span used for bucketing SSTables based on timestamps.
+     */
+    struct target {
+        // How big a range of timestamps fit inside the target.
+        int64_t size;
+        // A timestamp t hits the target iff t / size == div_position.
+        int64_t div_position;
+
+        target() = delete;
+        target(int64_t size, int64_t div_position) : size(size), div_position(div_position) {}
+
+        /**
+         * Compares the target to a timestamp; does not modify the target.
+         * @param timestamp the timestamp to compare.
+         * @return a negative integer, zero, or a positive integer as the target lies before, covers, or lies after the timestamp.
+         */
+        int compare_to_timestamp(int64_t timestamp) const {
+            auto ts1 = div_position;
+            auto ts2 = timestamp / size;
+            return (ts1 > ts2 ? 1 : (ts1 == ts2 ? 0 : -1));
+        }
+
+        /**
+         * Tells if the timestamp hits the target.
+         * @param timestamp the timestamp to test.
+         * @return <code>true</code> iff timestamp / size == div_position.
+         */
+        bool on_target(int64_t timestamp) const {
+            return compare_to_timestamp(timestamp) == 0;
+        }
+
+        /**
+         * Gets the next target, which represents an earlier time span.
+         * @param base The number of contiguous targets that will have the same size. Targets following those will be <code>base</code> times as big.
+         * @return a new target; this one is left unchanged.
+         */
+        target next_target(int base) const
+        {
+            if (div_position % base > 0) {
+                return target(size, div_position - 1);
+            } else {
+                return target(size * base, div_position / base - 1);
+            }
+        }
+    };
+
+
+    /**
+     * Partitions files into buckets keyed by time window, using each file's min timestamp.
+     *
+     * The newest window is the time_unit-wide span containing now. Older windows are obtained
+     * through target::next_target(base), which keeps the window size or multiplies it by base,
+     * so recent files land in short windows and older files in progressively longer ones.
+     *
+     * Files are first sorted in place, newest first, and then walked once: a file newer than
+     * the current window is skipped, a file older than it moves the walk to the next window,
+     * and every consecutive file that hits the window is appended to the same bucket.
+     *
+     * @param files pairs consisting of a file and its min timestamp; reordered by this call
+     * @param time_unit size of the newest window, in the same unit as the timestamps
+     * @param base window growth factor handed to target::next_target()
+     * @param now timestamp that anchors the newest window
+     * @return buckets ordered from newest window to oldest; within a bucket files are ordered
+     *         from newest to oldest. Empty buckets are never produced.
+     */
+    std::vector<std::vector<sstables::shared_sstable>>
+    get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, api::timestamp_type time_unit, int base, int64_t now) const {
+        // Newest first: descending min timestamp.
+        std::sort(files.begin(), files.end(), [] (auto& lhs, auto& rhs) {
+            return lhs.second > rhs.second;
+        });
+
+        std::vector<std::vector<sstables::shared_sstable>> buckets;
+        auto window = get_initial_target(now, time_unit);
+        auto cur = files.begin();
+        const auto last = files.end();
+
+        while (cur != last) {
+            const int order = window.compare_to_timestamp(cur->second);
+            if (order < 0) {
+                // File is newer than the current window: it belongs to no bucket, skip it.
+                ++cur;
+                continue;
+            }
+            if (order > 0) {
+                // File is older than the current window: move on to the next (earlier) window.
+                window = window.next_target(base);
+                continue;
+            }
+
+            // The window covers this file: gather the whole run of files that hit it.
+            auto& bucket = buckets.emplace_back();
+            do {
+                bucket.push_back(cur->first);
+                ++cur;
+            } while (cur != last && window.on_target(cur->second));
+        }
+
+        return buckets;
+    }
+
+    // Newest window: the time_unit-wide span containing now. now is signed so that a negative
+    // timestamp divides the same way as timestamp / size in target::compare_to_timestamp().
+    target get_initial_target(int64_t now, int64_t time_unit) const { return target(time_unit, now / time_unit); }
+
+    /**
+     * Returns the newest bucket worth compacting. The chosen bucket is trimmed in place before being copied out.
+     * @param buckets list of buckets, sorted from newest to oldest, from which to return the newest bucket within thresholds.
+     * @param min_threshold minimum number of sstables in a bucket to qualify.
+     * @param max_threshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this).
+     * @param now, base_time locate the "incoming window": the base_time-wide span that contains now.
+     * @return a bucket (list) of sstables to compact, or an empty list when no bucket qualifies.
+     */
+    std::vector<sstables::shared_sstable>
+    newest_bucket(std::vector<std::vector<sstables::shared_sstable>>& buckets, int min_threshold, int max_threshold,
+            int64_t now, api::timestamp_type base_time) {
+        // Any bucket with at least min_threshold SSTables qualifies; a bucket outside the
+        // "incoming window" needs only 2. In any case, limit to max_threshold SSTables.
+        target incoming_window = get_initial_target(now, base_time);
+        for (auto& bucket : buckets) {
+            // Empty buckets never qualify, and front() is only reached once size() >= 2 is known.
+            if ((!bucket.empty() && bucket.size() >= size_t(min_threshold)) ||
+                    (bucket.size() >= 2 && !incoming_window.on_target(bucket.front()->get_stats_metadata().min_timestamp))) {
+                trim_to_threshold(bucket, max_threshold);
+                return bucket;
+            }
+        }
+        return {};
+    }
+
+
+    /**
+     * Shrinks bucket in place so that it holds at most max_threshold sstables.
+     * @param bucket list of sstables, expected newest to oldest, so the tail dropped from the back is the oldest.
+     * @param max_threshold maximum number of sstables in a single compaction task.
+     */
+    static void trim_to_threshold(std::vector<sstables::shared_sstable>& bucket, int max_threshold) {
+        const size_t keep = bucket.size() < size_t(max_threshold) ? bucket.size() : size_t(max_threshold);
+        bucket.resize(keep);
+    }
+};
+
+namespace sstables {
+
+class date_tiered_compaction_strategy : public compaction_strategy_impl { // compaction_strategy_impl whose type() is date_tiered
+    date_tiered_manifest _manifest; // estimated_pending_compactions() forwards to it
+public:
+    date_tiered_compaction_strategy(const std::map<sstring, sstring>& options); // defined out of line
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;
+
+    virtual int64_t estimated_pending_compactions(table_state& table_s) const override {
+        return _manifest.get_estimated_tasks(table_s); // pure delegation; the estimate itself is computed by the manifest
+    }
+
+    virtual compaction_strategy_type type() const override {
+        return compaction_strategy_type::date_tiered; // fixed tag identifying this strategy
+    }
+
+    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override; // defined out of line
+};
+
+}
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -19,9 +19,8 @@ leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(table_
    return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state>();
 }

-compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
    auto& state = get_state(table_s);
-    auto candidates = control.candidates(table_s);
    // NOTE: leveled_manifest creation may be slightly expensive, so later on,
    // we may want to store it in the strategy itself. However, the sstable
    // lists managed by the manifest may become outdated. For example, one
@@ -51,18 +50,18 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
        auto& sstables = manifest.get_level(level);
        // filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
        auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
-            return !worth_dropping_tombstones(sst, compaction_time, table_s);
+            return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
        });
        sstables.erase(e, sstables.end());
        if (sstables.empty()) {
            continue;
        }
        auto& sst = *std::max_element(sstables.begin(), sstables.end(), [&] (auto& i, auto& j) {
-            auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
-            auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
+            auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
+            auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
            return i->estimate_droppable_tombstone_ratio(gc_before1) < j->estimate_droppable_tombstone_ratio(gc_before2);
        });
-        return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());
+        return sstables::compaction_descriptor({ sst }, service::get_local_compaction_priority(), sst->get_sstable_level());
    }
    return {};
 }
@@ -146,7 +145,7 @@ int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state&
 }

 compaction_descriptor
-leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
    std::array<std::vector<shared_sstable>, leveled_manifest::MAX_LEVELS> level_info;

    auto is_disjoint = [schema] (const std::vector<shared_sstable>& sstables, unsigned tolerance) -> std::tuple<bool, unsigned> {
@@ -156,8 +155,6 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;

-    clogger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
-
    for (auto& sst : input) {
        auto sst_level = sst->get_sstable_level();
        if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -165,7 +162,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

            // This is really unexpected, so we'll just compact it all to fix it
            auto ideal_level = ideal_level_for_input(input, max_sstable_size_in_bytes);
-            compaction_descriptor desc(std::move(input), ideal_level, max_sstable_size_in_bytes);
+            compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -196,14 +193,14 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        unsigned ideal_level = ideal_level_for_input(level_info[0], max_sstable_size_in_bytes);

        leveled_manifest::logger.info("Reshaping {} disjoint sstables in level 0 into level {}", level_info[0].size(), ideal_level);
-        compaction_descriptor desc(std::move(input), ideal_level, max_sstable_size_in_bytes);
+        compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
        desc.options = compaction_type_options::make_reshape();
        return desc;
    }

    if (level_info[0].size() > offstrategy_threshold) {
        size_tiered_compaction_strategy stcs(_stcs_options);
-        return stcs.get_reshaping_job(std::move(level_info[0]), schema, mode);
+        return stcs.get_reshaping_job(std::move(level_info[0]), schema, iop, mode);
    }

    for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {
@@ -214,7 +211,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        auto [disjoint, overlapping_sstables] = is_disjoint(level_info[level], tolerance(level));
        if (!disjoint) {
            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
-            compaction_descriptor desc(std::move(level_info[level]), level, max_sstable_size_in_bytes);
+            compaction_descriptor desc(std::move(level_info[level]), iop, level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -234,15 +231,12 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
        if (levels[level].empty()) {
            continue;
        }
-        ret.push_back(compaction_descriptor(std::move(levels[level]), level, _max_sstable_size_in_mb * 1024 * 1024));
+        ret.push_back(compaction_descriptor(std::move(levels[level]), service::get_local_compaction_priority(), level, _max_sstable_size_in_mb * 1024 * 1024));
    }
    return ret;
 }

 unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
-    if (!max_sstable_size) {
-        return 1;
-    }
    auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
        double inv_log_fanout = 1.0f / std::log(fanout);
        return log(x) * inv_log_fanout;
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -36,7 +36,7 @@ struct leveled_compaction_strategy_state {

 class leveled_compaction_strategy : public compaction_strategy_impl {
    static constexpr int32_t DEFAULT_MAX_SSTABLE_SIZE_IN_MB = 160;
-    static constexpr auto SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
+    const sstring SSTABLE_SIZE_OPTION = "sstable_size_in_mb";

    int32_t _max_sstable_size_in_mb = DEFAULT_MAX_SSTABLE_SIZE_IN_MB;
    size_tiered_compaction_strategy_options _stcs_options;
@@ -46,10 +46,9 @@ private:
    leveled_compaction_strategy_state& get_state(table_state& table_s) const;
 public:
    static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

    leveled_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;

    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;

@@ -74,7 +73,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const override;
 };

 }
--- a/compaction/leveled_manifest.hh
+++ b/compaction/leveled_manifest.hh
@@ -11,11 +11,13 @@
 #pragma once

 #include "sstables/sstables.hh"
+#include "compaction.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include "range.hh"
 #include "log.hh"
 #include <boost/range/algorithm/sort.hpp>
 #include <boost/range/algorithm/partial_sort.hpp>
+#include "service/priority_manager.hh"

 class leveled_manifest {
    table_state& _table_s;
@@ -147,7 +149,8 @@ public:
            if (info.can_promote) {
                info.candidates = get_overlapping_starved_sstables(next_level, std::move(info.candidates), compaction_counter);
            }
-            return sstables::compaction_descriptor(std::move(info.candidates), next_level, _max_sstable_size_in_bytes);
+            return sstables::compaction_descriptor(std::move(info.candidates),
+                                                   service::get_local_compaction_priority(), next_level, _max_sstable_size_in_bytes);
        } else {
            logger.debug("No compaction candidates for L{}", level);
            return sstables::compaction_descriptor();
@@ -211,7 +214,8 @@ public:
                    _table_s.min_compaction_threshold(), _schema->max_compaction_threshold(), _stcs_options);
                if (!most_interesting.empty()) {
                    logger.debug("L0 is too far behind, performing size-tiering there first");
-                    return sstables::compaction_descriptor(std::move(most_interesting));
+                    return sstables::compaction_descriptor(std::move(most_interesting),
+                                                           service::get_local_compaction_priority());
                }
            }
            auto descriptor = get_descriptor_for_level(i, last_compacted_keys, compaction_counter);
@@ -225,7 +229,8 @@ public:
            auto info = get_candidates_for(0, last_compacted_keys);
            if (!info.candidates.empty()) {
                auto next_level = get_next_level(info.candidates, info.can_promote);
-                return sstables::compaction_descriptor(std::move(info.candidates), next_level, _max_sstable_size_in_bytes);
+                return sstables::compaction_descriptor(std::move(info.candidates),
+                                                       service::get_local_compaction_priority(), next_level, _max_sstable_size_in_bytes);
            }
        }

--- a/compaction/size_tiered_backlog_tracker.hh
+++ b/compaction/size_tiered_backlog_tracker.hh
@@ -9,6 +9,7 @@
 #include "compaction_backlog_manager.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include <cmath>
+#include <ctgmath>

 // Backlog for one SSTable under STCS:
 //
@@ -63,14 +64,10 @@
 // certain point in time, whose size is the amount of bytes currently written. So all we need
 // to do is keep track of them too, and add the current estimate to the static part of (4).
 class size_tiered_backlog_tracker final : public compaction_backlog_tracker::impl {
-    struct sstables_backlog_contribution {
-        double value = 0.0f;
-        std::unordered_set<sstables::shared_sstable> sstables;
-    };
-
    sstables::size_tiered_compaction_strategy_options _stcs_options;
    int64_t _total_bytes = 0;
-    sstables_backlog_contribution _contrib;
+    double _sstables_backlog_contribution = 0.0f;
+    std::unordered_set<sstables::shared_sstable> _sstables_contributing_backlog;
    std::unordered_set<sstables::shared_sstable> _all;

    struct inflight_component {
@@ -80,12 +77,12 @@ class size_tiered_backlog_tracker final : public compaction_backlog_tracker::imp

    inflight_component compacted_backlog(const compaction_backlog_tracker::ongoing_compactions& ongoing_compactions) const;

-    static double log4(double x) {
+    double log4(double x) const {
        double inv_log_4 = 1.0f / std::log(4);
        return log(x) * inv_log_4;
    }

-    static sstables_backlog_contribution calculate_sstables_backlog_contribution(const std::vector<sstables::shared_sstable>& all, const sstables::size_tiered_compaction_strategy_options& stcs_options);
+    void refresh_sstables_backlog_contribution();
 public:
    size_tiered_backlog_tracker(sstables::size_tiered_compaction_strategy_options stcs_options) : _stcs_options(stcs_options) {}

@@ -93,8 +90,7 @@ public:

    // Removing could be the result of a failure of an in progress write, successful finish of a
    // compaction, or some one-off operation, like drop
-    // Provides strong exception safety guarantees.
-    virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) override;
+    virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override;

    int64_t total_bytes() const {
        return _total_bytes;
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -15,73 +15,20 @@

 namespace sstables {

-static long validate_sstable_size(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY);
-    auto min_sstables_size = cql3::statements::property_definitions::to_long(size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_MIN_SSTABLE_SIZE);
-    if (min_sstables_size < 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be non negative", size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY, min_sstables_size));
-    }
-    return min_sstables_size;
-}
-
-static long validate_sstable_size(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto min_sstables_size = validate_sstable_size(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::MIN_SSTABLE_SIZE_KEY);
-    return min_sstables_size;
-}
-
-static double validate_bucket_low(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::BUCKET_LOW_KEY);
-    auto bucket_low = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::BUCKET_LOW_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_BUCKET_LOW);
-    if (bucket_low <= 0.0 || bucket_low >= 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", size_tiered_compaction_strategy_options::BUCKET_LOW_KEY, bucket_low));
-    }
-    return bucket_low;
-}
-
-static double validate_bucket_low(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto bucket_low = validate_bucket_low(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::BUCKET_LOW_KEY);
-    return bucket_low;
-}
-
-static double validate_bucket_high(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY);
-    auto bucket_high = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_BUCKET_HIGH);
-    if (bucket_high <= 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1.0", size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY, bucket_high));
-    }
-    return bucket_high;
-}
-
-static double validate_bucket_high(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto bucket_high = validate_bucket_high(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::BUCKET_HIGH_KEY);
-    return bucket_high;
-}
-
-static double validate_cold_reads_to_omit(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY);
-    auto cold_reads_to_omit = cql3::statements::property_definitions::to_double(size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY, tmp_value, size_tiered_compaction_strategy_options::DEFAULT_COLD_READS_TO_OMIT);
-    if (cold_reads_to_omit < 0.0 || cold_reads_to_omit > 1.0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be between 0.0 and 1.0", size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY, cold_reads_to_omit));
-    }
-    return cold_reads_to_omit;
-}
-
-static double validate_cold_reads_to_omit(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto cold_reads_to_omit = validate_cold_reads_to_omit(options);
-    unchecked_options.erase(size_tiered_compaction_strategy_options::COLD_READS_TO_OMIT_KEY);
-    return cold_reads_to_omit;
-}
-
 size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
    using namespace cql3::statements;

-    min_sstable_size = validate_sstable_size(options);
-    bucket_low = validate_bucket_low(options);
-    bucket_high = validate_bucket_high(options);
-    cold_reads_to_omit = validate_cold_reads_to_omit(options);
+    auto tmp_value = compaction_strategy_impl::get_value(options, MIN_SSTABLE_SIZE_KEY);
+    min_sstable_size = property_definitions::to_long(MIN_SSTABLE_SIZE_KEY, tmp_value, DEFAULT_MIN_SSTABLE_SIZE);
+
+    tmp_value = compaction_strategy_impl::get_value(options, BUCKET_LOW_KEY);
+    bucket_low = property_definitions::to_double(BUCKET_LOW_KEY, tmp_value, DEFAULT_BUCKET_LOW);
+
+    tmp_value = compaction_strategy_impl::get_value(options, BUCKET_HIGH_KEY);
+    bucket_high = property_definitions::to_double(BUCKET_HIGH_KEY, tmp_value, DEFAULT_BUCKET_HIGH);
+
+    tmp_value = compaction_strategy_impl::get_value(options, COLD_READS_TO_OMIT_KEY);
+    cold_reads_to_omit = property_definitions::to_double(COLD_READS_TO_OMIT_KEY, tmp_value, DEFAULT_COLD_READS_TO_OMIT);
 }

 size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options() {
@@ -91,20 +38,6 @@ size_tiered_compaction_strategy_options::size_tiered_compaction_strategy_options
    cold_reads_to_omit = DEFAULT_COLD_READS_TO_OMIT;
 }

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void size_tiered_compaction_strategy_options::validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    validate_sstable_size(options, unchecked_options);
-    auto bucket_low = validate_bucket_low(options, unchecked_options);
-    auto bucket_high = validate_bucket_high(options, unchecked_options);
-    if (bucket_high <= bucket_low) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) is less than or equal to the {} value ({})", BUCKET_HIGH_KEY, bucket_high, BUCKET_LOW_KEY, bucket_low));
-    }
-    validate_cold_reads_to_omit(options, unchecked_options);
-    compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
-}
-
 std::vector<std::pair<sstables::shared_sstable, uint64_t>>
 size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) {

@@ -210,12 +143,11 @@ size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector
 }

 compaction_descriptor
-size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) {
    // make local copies so they can't be changed out from under us mid-method
    int min_threshold = table_s.min_compaction_threshold();
    int max_threshold = table_s.schema()->max_compaction_threshold();
    auto compaction_time = gc_clock::now();
-    auto candidates = control.candidates(table_s);

    // TODO: Add support to filter cold sstables (for reference: SizeTieredCompactionStrategy::filterColdSSTables).

@@ -223,13 +155,13 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_

    if (is_any_bucket_interesting(buckets, min_threshold)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), min_threshold, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        return sstables::compaction_descriptor(std::move(most_interesting), service::get_local_compaction_priority());
    }

    // If we are not enforcing min_threshold explicitly, try any pair of SStables in the same tier.
    if (!table_s.compaction_enforce_min_threshold() && is_any_bucket_interesting(buckets, 2)) {
        std::vector<sstables::shared_sstable> most_interesting = most_interesting_bucket(std::move(buckets), 2, max_threshold);
-        return sstables::compaction_descriptor(std::move(most_interesting));
+        return sstables::compaction_descriptor(std::move(most_interesting), service::get_local_compaction_priority());
    }

    if (!table_s.tombstone_gc_enabled()) {
@@ -243,7 +175,7 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
    for (auto&& sstables : buckets | boost::adaptors::reversed) {
        // filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
        auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
-            return !worth_dropping_tombstones(sst, compaction_time, table_s);
+            return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
        });
        sstables.erase(e, sstables.end());
        if (sstables.empty()) {
@@ -253,7 +185,7 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
        auto it = std::min_element(sstables.begin(), sstables.end(), [] (auto& i, auto& j) {
            return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
        });
-        return sstables::compaction_descriptor({ *it });
+        return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
    }
    return sstables::compaction_descriptor();
 }
@@ -297,7 +229,7 @@ size_tiered_compaction_strategy::most_interesting_bucket(const std::vector<sstab
 }

 compaction_descriptor
-size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const
+size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const
 {
    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
@@ -313,7 +245,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
        // All sstables can be reshaped at once if the amount of overlapping will not cause memory usage to be high,
        // which is possible because partitioned set is able to incrementally open sstables during compaction
        if (sstable_set_overlapping_count(schema, input) <= max_sstables) {
-            compaction_descriptor desc(std::move(input));
+            compaction_descriptor desc(std::move(input), iop);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -329,7 +261,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
                });
                bucket.resize(max_sstables);
            }
-            compaction_descriptor desc(std::move(bucket));
+            compaction_descriptor desc(std::move(bucket), iop);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -357,7 +289,7 @@ size_tiered_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_
            unsigned needed = std::min(remaining, max_threshold);
            std::vector<shared_sstable> sstables;
            std::move(it, it + needed, std::back_inserter(sstables));
-            ret.push_back(compaction_descriptor(std::move(sstables)));
+            ret.push_back(compaction_descriptor(std::move(sstables), service::get_local_compaction_priority()));
            std::advance(it, needed);
        }
    }
--- a/compaction/size_tiered_compaction_strategy.hh
+++ b/compaction/size_tiered_compaction_strategy.hh
@@ -18,16 +18,15 @@ class size_tiered_backlog_tracker;
 namespace sstables {

 class size_tiered_compaction_strategy_options {
-public:
    static constexpr uint64_t DEFAULT_MIN_SSTABLE_SIZE = 50L * 1024L * 1024L;
    static constexpr double DEFAULT_BUCKET_LOW = 0.5;
    static constexpr double DEFAULT_BUCKET_HIGH = 1.5;
    static constexpr double DEFAULT_COLD_READS_TO_OMIT = 0.05;
-    static constexpr auto MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
-    static constexpr auto BUCKET_LOW_KEY = "bucket_low";
-    static constexpr auto BUCKET_HIGH_KEY = "bucket_high";
-    static constexpr auto COLD_READS_TO_OMIT_KEY = "cold_reads_to_omit";
-private:
+    const sstring MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
+    const sstring BUCKET_LOW_KEY = "bucket_low";
+    const sstring BUCKET_HIGH_KEY = "bucket_high";
+    const sstring COLD_READS_TO_OMIT_KEY = "cold_reads_to_omit";
+
    uint64_t min_sstable_size = DEFAULT_MIN_SSTABLE_SIZE;
    double bucket_low = DEFAULT_BUCKET_LOW;
    double bucket_high = DEFAULT_BUCKET_HIGH;
@@ -36,13 +35,48 @@ public:
    size_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options);

    size_tiered_compaction_strategy_options();
-    size_tiered_compaction_strategy_options(const size_tiered_compaction_strategy_options&) = default;
-    size_tiered_compaction_strategy_options(size_tiered_compaction_strategy_options&&) = default;
-    size_tiered_compaction_strategy_options& operator=(const size_tiered_compaction_strategy_options&) = default;
-    size_tiered_compaction_strategy_options& operator=(size_tiered_compaction_strategy_options&&) = default;

-    static void validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
+    // FIXME: convert the Java code below to C++.
+#if 0
+    public static Map<String, String> validateOptions(Map<String, String> options, Map<String, String> uncheckedOptions) throws ConfigurationException
+    {
+        String optionValue = options.get(MIN_SSTABLE_SIZE_KEY);
+        try
+        {
+            long minSSTableSize = optionValue == null ? DEFAULT_MIN_SSTABLE_SIZE : Long.parseLong(optionValue);
+            if (minSSTableSize < 0)
+            {
+                throw new ConfigurationException(String.format("%s must be non negative: %d", MIN_SSTABLE_SIZE_KEY, minSSTableSize));
+            }
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", optionValue, MIN_SSTABLE_SIZE_KEY), e);
+        }

+        double bucketLow = parseDouble(options, BUCKET_LOW_KEY, DEFAULT_BUCKET_LOW);
+        double bucketHigh = parseDouble(options, BUCKET_HIGH_KEY, DEFAULT_BUCKET_HIGH);
+        if (bucketHigh <= bucketLow)
+        {
+            throw new ConfigurationException(String.format("%s value (%s) is less than or equal to the %s value (%s)",
+                                                           BUCKET_HIGH_KEY, bucketHigh, BUCKET_LOW_KEY, bucketLow));
+        }
+
+        double maxColdReadsRatio = parseDouble(options, COLD_READS_TO_OMIT_KEY, DEFAULT_COLD_READS_TO_OMIT);
+        if (maxColdReadsRatio < 0.0 || maxColdReadsRatio > 1.0)
+        {
+            throw new ConfigurationException(String.format("%s value (%s) should be between between 0.0 and 1.0",
+                                                           COLD_READS_TO_OMIT_KEY, optionValue));
+        }
+
+        uncheckedOptions.remove(MIN_SSTABLE_SIZE_KEY);
+        uncheckedOptions.remove(BUCKET_LOW_KEY);
+        uncheckedOptions.remove(BUCKET_HIGH_KEY);
+        uncheckedOptions.remove(COLD_READS_TO_OMIT_KEY);
+
+        return uncheckedOptions;
+    }
+#endif
    friend class size_tiered_compaction_strategy;
 };

@@ -75,9 +109,8 @@ public:

    size_tiered_compaction_strategy(const std::map<sstring, sstring>& options);
    explicit size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options);
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);

-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;

    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;

@@ -96,7 +129,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const override;

    friend class ::size_tiered_backlog_tracker;
 };
--- a/compaction/strategy_control.hh
+++ b/compaction/strategy_control.hh
@@ -10,7 +10,6 @@
 #pragma once

 #include "compaction/compaction_fwd.hh"
-#include "sstables/sstable_set.hh"

 namespace compaction {

@@ -19,8 +18,6 @@ class strategy_control {
 public:
    virtual ~strategy_control() {}
    virtual bool has_ongoing_compaction(table_state& table_s) const noexcept = 0;
-    virtual std::vector<sstables::shared_sstable> candidates(table_state&) const = 0;
-    virtual std::vector<sstables::frozen_sstable_run> candidates_as_runs(table_state&) const = 0;
 };

 }
--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -53,7 +53,7 @@ public:
    virtual bool tombstone_gc_enabled() const noexcept = 0;
    virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
    virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
-    virtual const std::string get_group_id() const noexcept = 0;
+    virtual const std::string& get_group_id() const noexcept = 0;
    virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
 };

--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -6,263 +6,29 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

-#include <boost/range/algorithm/min_element.hpp>
-
 #include "compaction/task_manager_module.hh"
 #include "compaction/compaction_manager.hh"
 #include "replica/database.hh"
-#include "sstables/sstables.hh"
-#include "sstables/sstable_directory.hh"
-#include "utils/pretty_printers.hh"
-
-namespace replica {
-
-// Helper structure for resharding.
-//
-// Describes the sstables (represented by their foreign_sstable_open_info) that are shared and
-// need to be resharded. Each shard will keep one such descriptor, that contains the list of
-// SSTables assigned to it, and their total size. The total size is used to make sure we are
-// fairly balancing SSTables among shards.
-struct reshard_shard_descriptor {
-    sstables::sstable_directory::sstable_open_info_vector info_vec;
-    uint64_t uncompressed_data_size = 0;
-
-    bool total_size_smaller(const reshard_shard_descriptor& rhs) const {
-        return uncompressed_data_size < rhs.uncompressed_data_size;
-    }
-
-    uint64_t size() const {
-        return uncompressed_data_size;
-    }
-};
-
-} // namespace replica
-
-// Collects shared SSTables from all shards and sstables that require cleanup and returns a vector containing them all.
-// This function assumes that the list of SSTables can be fairly big so it is careful to
-// manipulate it in a do_for_each loop (which yields) instead of using standard accumulators.
-future<sstables::sstable_directory::sstable_open_info_vector>
-collect_all_shared_sstables(sharded<sstables::sstable_directory>& dir, sharded<replica::database>& db, sstring ks_name, sstring table_name, compaction::owned_ranges_ptr owned_ranges_ptr) {
-    auto info_vec = sstables::sstable_directory::sstable_open_info_vector();
-
-    // We want to make sure that each distributed object reshards about the same amount of data.
-    // Each sharded object has its own shared SSTables. We can use a clever algorithm in which they
-    // all distributely figure out which SSTables to exchange, but we'll keep it simple and move all
-    // their foreign_sstable_open_info to a coordinator (the shard who called this function). We can
-    // move in bulk and that's efficient. That shard can then distribute the work among all the
-    // others who will reshard.
-    auto coordinator = this_shard_id();
-    // We will first move all of the foreign open info to temporary storage so that we can sort
-    // them. We want to distribute bigger sstables first.
-    const auto* sorted_owned_ranges_ptr = owned_ranges_ptr.get();
-    co_await dir.invoke_on_all([&] (sstables::sstable_directory& d) -> future<> {
-        auto shared_sstables = d.retrieve_shared_sstables();
-        sstables::sstable_directory::sstable_open_info_vector need_cleanup;
-        if (sorted_owned_ranges_ptr) {
-            co_await d.filter_sstables([&] (sstables::shared_sstable sst) -> future<bool> {
-                if (needs_cleanup(sst, *sorted_owned_ranges_ptr)) {
-                    need_cleanup.push_back(co_await sst->get_open_info());
-                    co_return false;
-                }
-                co_return true;
-            });
-        }
-        if (shared_sstables.empty() && need_cleanup.empty()) {
-            co_return;
-        }
-        co_await smp::submit_to(coordinator, [&] () -> future<> {
-            info_vec.reserve(info_vec.size() + shared_sstables.size() + need_cleanup.size());
-            for (auto& info : shared_sstables) {
-                info_vec.emplace_back(std::move(info));
-                co_await coroutine::maybe_yield();
-            }
-            for (auto& info : need_cleanup) {
-                info_vec.emplace_back(std::move(info));
-                co_await coroutine::maybe_yield();
-            }
-        });
-    });
-
-    co_return info_vec;
-}
-
-// Given a vector of shared sstables to be resharded, distribute it among all shards.
-// The vector is first sorted to make sure that we are moving the biggest SSTables first.
-//
-// Returns a reshard_shard_descriptor per shard indicating the work that each shard has to do.
-future<std::vector<replica::reshard_shard_descriptor>>
-distribute_reshard_jobs(sstables::sstable_directory::sstable_open_info_vector source) {
-    auto destinations = std::vector<replica::reshard_shard_descriptor>(smp::count);
-
-    std::sort(source.begin(), source.end(), [] (const sstables::foreign_sstable_open_info& a, const sstables::foreign_sstable_open_info& b) {
-        // Sort on descending SSTable sizes.
-        return a.uncompressed_data_size > b.uncompressed_data_size;
-    });
-
-    for (auto& info : source) {
-        // Choose the stable shard owner with the smallest amount of accumulated work.
-        // Note that for sstables that need cleanup via resharding, owners may contain
-        // a single shard.
-        auto shard_it = boost::min_element(info.owners, [&] (const shard_id& lhs, const shard_id& rhs) {
-            return destinations[lhs].total_size_smaller(destinations[rhs]);
-        });
-        auto& dest = destinations[*shard_it];
-        dest.uncompressed_data_size += info.uncompressed_data_size;
-        dest.info_vec.push_back(std::move(info));
-        co_await coroutine::maybe_yield();
-    }
-
-    co_return destinations;
-}
-
-// reshards a collection of SSTables.
-//
-// A reference to the compaction manager must be passed so we can register with it. Knowing
-// which table is being processed is a requirement of the compaction manager, so this must be
-// passed too.
-//
-// We will reshard max_sstables_per_job at once.
-//
-// A creator function must be passed that will create an SSTable object in the correct shard,
-// and an I/O priority must be specified.
-future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::sstable_open_info_vector shared_info, replica::table& table,
-                           sstables::compaction_sstable_creator_fn creator, compaction::owned_ranges_ptr owned_ranges_ptr, std::optional<tasks::task_info> parent_info)
-{
-    // Resharding doesn't like empty sstable sets, so bail early. There is nothing
-    // to reshard in this shard.
-    if (shared_info.empty()) {
-        co_return;
-    }
-
-    // We want to reshard many SSTables at a time for efficiency. However if we have too many we may
-    // be risking OOM.
-    auto max_sstables_per_job = table.schema()->max_compaction_threshold();
-    auto num_jobs = (shared_info.size() + max_sstables_per_job - 1) / max_sstables_per_job;
-    auto sstables_per_job = shared_info.size() / num_jobs;
-
-    std::vector<std::vector<sstables::shared_sstable>> buckets;
-    buckets.reserve(num_jobs);
-    buckets.emplace_back();
-    co_await coroutine::parallel_for_each(shared_info, [&] (sstables::foreign_sstable_open_info& info) -> future<> {
-        auto sst = co_await dir.load_foreign_sstable(info);
-        // Last bucket gets leftover SSTables
-        if ((buckets.back().size() >= sstables_per_job) && (buckets.size() < num_jobs)) {
-            buckets.emplace_back();
-        }
-        buckets.back().push_back(std::move(sst));
-    });
-    // There is a semaphore inside the compaction manager in run_resharding_jobs. So we
-    // parallel_for_each so the statistics about pending jobs are updated to reflect all
-    // jobs. But only one will run in parallel at a time
-    auto& t = table.as_table_state();
-    co_await coroutine::parallel_for_each(buckets, [&] (std::vector<sstables::shared_sstable>& sstlist) mutable {
-        return table.get_compaction_manager().run_custom_job(table.as_table_state(), sstables::compaction_type::Reshard, "Reshard compaction", [&] (sstables::compaction_data& info, sstables::compaction_progress_monitor& progress_monitor) -> future<> {
-            auto erm = table.get_effective_replication_map(); // keep alive around compaction.
-
-            sstables::compaction_descriptor desc(sstlist);
-            desc.options = sstables::compaction_type_options::make_reshard();
-            desc.creator = creator;
-            desc.sharder = &erm->get_sharder(*table.schema());
-            desc.owned_ranges = owned_ranges_ptr;
-
-            auto result = co_await sstables::compact_sstables(std::move(desc), info, t, progress_monitor);
-            // input sstables are moved, to guarantee their resources are released once we're done
-            // resharding them.
-            co_await when_all_succeed(dir.collect_output_unshared_sstables(std::move(result.new_sstables), sstables::sstable_directory::can_be_remote::yes), dir.remove_sstables(std::move(sstlist))).discard_result();
-        }, parent_info, throw_if_stopping::no);
-    });
-}

 namespace compaction {

-struct table_tasks_info {
-    tasks::task_manager::task_ptr task;
-    table_info ti;
-
-    table_tasks_info(tasks::task_manager::task_ptr t, table_info info)
-        : task(t)
-        , ti(info)
-    {}
-};
-
-future<> run_on_table(sstring op, replica::database& db, std::string keyspace, table_info ti, std::function<future<> (replica::table&)> func) {
-    std::exception_ptr ex;
-    tasks::tmlogger.debug("Starting {} on {}.{}", op, keyspace, ti.name);
-    try {
-        co_await func(db.find_column_family(ti.id));
-    } catch (const replica::no_such_column_family& e) {
-        tasks::tmlogger.warn("Skipping {} of {}.{}: {}", op, keyspace, ti.name, e.what());
-    } catch (...) {
-        ex = std::current_exception();
-        tasks::tmlogger.error("Failed {} of {}.{}: {}", op, keyspace, ti.name, ex);
-    }
-    if (ex) {
-        co_await coroutine::return_exception_ptr(std::move(ex));
-    }
-}
-
 // Run on all tables, skipping dropped tables
-future<> run_on_existing_tables(sstring op, replica::database& db, std::string keyspace, const std::vector<table_info> local_tables, std::function<future<> (replica::table&)> func) {
-    for (const auto& ti : local_tables) {
-        co_await run_on_table(op, db, keyspace, ti, func);
-    }
-}
-
-future<> wait_for_your_turn(seastar::condition_variable& cv, tasks::task_manager::task_ptr& current_task, tasks::task_id id) {
-    co_await cv.wait([&] {
-        return current_task && current_task->id() == id;
-    });
-}
-
-future<> run_table_tasks(replica::database& db, std::vector<table_tasks_info> table_tasks, seastar::condition_variable& cv, tasks::task_manager::task_ptr& current_task, bool sort) {
+future<> run_on_existing_tables(sstring op, replica::database& db, std::string_view keyspace, const std::vector<table_id> local_tables, std::function<future<> (replica::table&)> func) {
    std::exception_ptr ex;
-
-    // While compaction is run on one table, the size of tables may significantly change.
-    // Thus, they are sorted before each invidual compaction and the smallest table is chosen.
-    while (!table_tasks.empty()) {
+    for (const auto& ti : local_tables) {
+        tasks::tmlogger.debug("Starting {} on {}.{}", op, keyspace, ti);
        try {
-            if (sort) {
-                // Major compact smaller tables first, to increase chances of success if low on space.
-                // Tables will be kept in descending order.
-                std::ranges::sort(table_tasks, std::greater<>(), [&] (const table_tasks_info& tti) {
-                    try {
-                        return db.find_column_family(tti.ti.id).get_stats().live_disk_space_used;
-                    } catch (const replica::no_such_column_family& e) {
-                        return int64_t(-1);
-                    }
-                });
-            }
-            // Task responsible for the smallest table.
-            current_task = table_tasks.back().task;
-            table_tasks.pop_back();
-            cv.broadcast();
-            co_await current_task->done();
+            co_await func(db.find_column_family(ti));
+        } catch (const replica::no_such_column_family& e) {
+            tasks::tmlogger.warn("Skipping {} of {}.{}: {}", op, keyspace, ti, e.what());
        } catch (...) {
            ex = std::current_exception();
-            current_task = nullptr;
-            cv.broken(ex);
-            break;
+            tasks::tmlogger.error("Failed {} of {}.{}: {}", op, keyspace, ti, ex);
+        }
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
        }
    }
-
-    if (ex) {
-        // Wait for all tasks even on failure.
-        for (auto& tti: table_tasks) {
-            co_await tti.task->done();
-        }
-        co_await coroutine::return_exception_ptr(std::move(ex));
-    }
-}
-
-future<tasks::task_manager::task::progress> compaction_task_impl::get_progress(const sstables::compaction_data& cdata, const sstables::compaction_progress_monitor& progress_monitor) const {
-    if (cdata.compaction_size == 0) {
-        co_return get_binary_progress();
-    }
-
-    co_return tasks::task_manager::task::progress{
-        .completed = is_done() ? cdata.compaction_size : progress_monitor.get_progress(),   // Consider tasks which skip all files.
-        .total = cdata.compaction_size
-    };
 }

 future<> major_keyspace_compaction_task_impl::run() {
@@ -274,51 +40,48 @@ future<> major_keyspace_compaction_task_impl::run() {
    });
 }

-future<> shard_major_keyspace_compaction_task_impl::run() {
-    seastar::condition_variable cv;
-    tasks::task_manager::task_ptr current_task;
-    tasks::task_info parent_info{_status.id, _status.shard};
-    std::vector<table_tasks_info> table_tasks;
-    for (auto& ti : _local_tables) {
-        table_tasks.emplace_back(co_await _module->make_and_start_task<table_major_keyspace_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task), ti);
-    }
-
-    co_await run_table_tasks(_db, std::move(table_tasks), cv, current_task, true);
+tasks::is_internal shard_major_keyspace_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
 }

-future<> table_major_keyspace_compaction_task_impl::run() {
-    co_await wait_for_your_turn(_cv, _current_task, _status.id);
-    tasks::task_info info{_status.id, _status.shard};
-    co_await run_on_table("force_keyspace_compaction", _db, _status.keyspace, _ti, [info] (replica::table& t) {
-        return t.compact_all_sstables(info);
+future<> shard_major_keyspace_compaction_task_impl::run() {
+    // Major compact smaller tables first, to increase chances of success if low on space.
+    std::ranges::sort(_local_tables, std::less<>(), [&] (const table_id& ti) {
+        try {
+            return _db.find_column_family(ti).get_stats().live_disk_space_used;
+        } catch (const replica::no_such_column_family& e) {
+            return int64_t(-1);
+        }
+    });
+    co_await run_on_existing_tables("force_keyspace_compaction", _db, _status.keyspace, _local_tables, [] (replica::table& t) {
+        return t.compact_all_sstables();
    });
 }

 future<> cleanup_keyspace_compaction_task_impl::run() {
    co_await _db.invoke_on_all([&] (replica::database& db) -> future<> {
        auto& module = db.get_compaction_manager().get_task_manager_module();
-        auto task = co_await module.make_and_start_task<shard_cleanup_keyspace_compaction_task_impl>({_status.id, _status.shard}, _status.keyspace, _status.id, db, _table_infos);
+        auto task = co_await module.make_and_start_task<shard_cleanup_keyspace_compaction_task_impl>({_status.id, _status.shard}, _status.keyspace, _status.id, db, _table_ids);
        co_await task->done();
    });
 }

-future<> shard_cleanup_keyspace_compaction_task_impl::run() {
-    seastar::condition_variable cv;
-    tasks::task_manager::task_ptr current_task;
-    tasks::task_info parent_info{_status.id, _status.shard};
-    std::vector<table_tasks_info> table_tasks;
-    for (auto& ti : _local_tables) {
-        table_tasks.emplace_back(co_await _module->make_and_start_task<table_cleanup_keyspace_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task), ti);
-    }
-
-    co_await run_table_tasks(_db, std::move(table_tasks), cv, current_task, true);
+tasks::is_internal shard_cleanup_keyspace_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
 }

-future<> table_cleanup_keyspace_compaction_task_impl::run() {
-    co_await wait_for_your_turn(_cv, _current_task, _status.id);
+future<> shard_cleanup_keyspace_compaction_task_impl::run() {
+    // Cleanup smaller tables first, to increase chances of success if low on space.
+    std::ranges::sort(_local_tables, std::less<>(), [&] (const table_id& ti) {
+        try {
+            return _db.find_column_family(ti).get_stats().live_disk_space_used;
+        } catch (const replica::no_such_column_family& e) {
+            return int64_t(-1);
+        }
+    });
    auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(_db.get_keyspace_local_ranges(_status.keyspace));
-    co_await run_on_table("force_keyspace_cleanup", _db, _status.keyspace, _ti, [&] (replica::table& t) {
-        return t.perform_cleanup_compaction(owned_ranges_ptr, tasks::task_info{_status.id, _status.shard});
+    co_await run_on_existing_tables("force_keyspace_cleanup", _db, _status.keyspace, _local_tables, [&] (replica::table& t) {
+        return t.perform_cleanup_compaction(owned_ranges_ptr);
    });
 }

@@ -333,23 +96,13 @@ future<> offstrategy_keyspace_compaction_task_impl::run() {
    }, false, std::plus<bool>());
 }

-future<> shard_offstrategy_keyspace_compaction_task_impl::run() {
-    seastar::condition_variable cv;
-    tasks::task_manager::task_ptr current_task;
-    tasks::task_info parent_info{_status.id, _status.shard};
-    std::vector<table_tasks_info> table_tasks;
-    for (auto& ti : _table_infos) {
-        table_tasks.emplace_back(co_await _module->make_and_start_task<table_offstrategy_keyspace_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task, _needed), ti);
-    }
-
-    co_await run_table_tasks(_db, std::move(table_tasks), cv, current_task, false);
+tasks::is_internal shard_offstrategy_keyspace_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
 }

-future<> table_offstrategy_keyspace_compaction_task_impl::run() {
-    co_await wait_for_your_turn(_cv, _current_task, _status.id);
-    tasks::task_info info{_status.id, _status.shard};
-    co_await run_on_table("perform_keyspace_offstrategy_compaction", _db, _status.keyspace, _ti, [this, info] (replica::table& t) -> future<> {
-        _needed |= co_await t.perform_offstrategy_compaction(info);
+future<> shard_offstrategy_keyspace_compaction_task_impl::run() {
+    co_await run_on_existing_tables("perform_keyspace_offstrategy_compaction", _db, _status.keyspace, _table_infos, [this] (replica::table& t) -> future<> {
+        _needed |= co_await t.perform_offstrategy_compaction();
    });
 }

@@ -362,25 +115,15 @@ future<> upgrade_sstables_compaction_task_impl::run() {
    });
 }

-future<> shard_upgrade_sstables_compaction_task_impl::run() {
-    seastar::condition_variable cv;
-    tasks::task_manager::task_ptr current_task;
-    tasks::task_info parent_info{_status.id, _status.shard};
-    std::vector<table_tasks_info> table_tasks;
-    for (auto& ti : _table_infos) {
-        table_tasks.emplace_back(co_await _module->make_and_start_task<table_upgrade_sstables_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task, _exclude_current_version), ti);
-    }
-
-    co_await run_table_tasks(_db, std::move(table_tasks), cv, current_task, false);
+tasks::is_internal shard_upgrade_sstables_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
 }

-future<> table_upgrade_sstables_compaction_task_impl::run() {
-    co_await wait_for_your_turn(_cv, _current_task, _status.id);
+future<> shard_upgrade_sstables_compaction_task_impl::run() {
    auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(_db.get_keyspace_local_ranges(_status.keyspace));
-    tasks::task_info info{_status.id, _status.shard};
-    co_await run_on_table("upgrade_sstables", _db, _status.keyspace, _ti, [&] (replica::table& t) -> future<> {
+    co_await run_on_existing_tables("upgrade_sstables", _db, _status.keyspace, _table_infos, [&] (replica::table& t) -> future<> {
        return t.parallel_foreach_table_state([&] (compaction::table_state& ts) -> future<> {
-            return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version, info);
+            return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, _exclude_current_version);
        });
    });
 }
@@ -396,6 +139,10 @@ future<> scrub_sstables_compaction_task_impl::run() {
    }, sstables::compaction_stats{}, std::plus<sstables::compaction_stats>());
 }

+tasks::is_internal shard_scrub_sstables_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
+}
+
 future<> shard_scrub_sstables_compaction_task_impl::run() {
    _stats = co_await map_reduce(_column_families, [&] (sstring cfname) -> future<sstables::compaction_stats> {
        sstables::compaction_stats stats{};
@@ -407,123 +154,18 @@ future<> shard_scrub_sstables_compaction_task_impl::run() {
    }, sstables::compaction_stats{}, std::plus<sstables::compaction_stats>());
 }

+tasks::is_internal table_scrub_sstables_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
+}
+
 future<> table_scrub_sstables_compaction_task_impl::run() {
    auto& cm = _db.get_compaction_manager();
    auto& cf = _db.find_column_family(_status.keyspace, _status.table);
-    tasks::task_info info{_status.id, _status.shard};
    co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
-        auto r = co_await cm.perform_sstable_scrub(ts, _opts, info);
+        auto r = co_await cm.perform_sstable_scrub(ts, _opts);
        _stats += r.value_or(sstables::compaction_stats{});
    });
 }

-future<> table_reshaping_compaction_task_impl::run() {
-    auto start = std::chrono::steady_clock::now();
-    auto total_size = co_await _dir.map_reduce0([&] (sstables::sstable_directory& d) -> future<uint64_t> {
-        uint64_t total_shard_size;
-        tasks::task_info parent_info{_status.id, _status.shard};
-        auto& compaction_module = _db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<shard_reshaping_compaction_task_impl>(parent_info, _status.keyspace, _status.table, _status.id, d, _db, _mode, _creator, _filter, total_shard_size);
-        co_await task->done();
-        co_return total_shard_size;
-    }, uint64_t(0), std::plus<uint64_t>());
-
-    if (total_size > 0) {
-        auto duration = std::chrono::duration_cast<std::chrono::duration<float>>(std::chrono::steady_clock::now() - start);
-        dblog.info("Reshaped {} in {:.2f} seconds, {}", utils::pretty_printed_data_size(total_size), duration.count(), utils::pretty_printed_throughput(total_size, duration));
-    }
-}
-
-future<> shard_reshaping_compaction_task_impl::run() {
-    auto& table = _db.local().find_column_family(_status.keyspace, _status.table);
-    uint64_t reshaped_size = 0;
-    tasks::task_info info{_status.id, _status.shard};
-
-    while (true) {
-        auto reshape_candidates = boost::copy_range<std::vector<sstables::shared_sstable>>(_dir.get_unshared_local_sstables()
-                | boost::adaptors::filtered([&filter = _filter] (const auto& sst) {
-            return filter(sst);
-        }));
-        auto desc = table.get_compaction_strategy().get_reshaping_job(std::move(reshape_candidates), table.schema(), _mode);
-        if (desc.sstables.empty()) {
-            break;
-        }
-
-        if (!reshaped_size) {
-            dblog.info("Table {}.{} with compaction strategy {} found SSTables that need reshape. Starting reshape process", table.schema()->ks_name(), table.schema()->cf_name(), table.get_compaction_strategy().name());
-        }
-
-        std::vector<sstables::shared_sstable> sstlist;
-        for (auto& sst : desc.sstables) {
-            reshaped_size += sst->data_size();
-            sstlist.push_back(sst);
-        }
-
-        desc.creator = _creator;
-
-        std::exception_ptr ex;
-        try {
-            co_await table.get_compaction_manager().run_custom_job(table.as_table_state(), sstables::compaction_type::Reshape, "Reshape compaction", [&dir = _dir, &table, sstlist = std::move(sstlist), desc = std::move(desc)] (sstables::compaction_data& info, sstables::compaction_progress_monitor& progress_monitor) mutable -> future<> {
-                sstables::compaction_result result = co_await sstables::compact_sstables(std::move(desc), info, table.as_table_state(), progress_monitor);
-                co_await dir.remove_unshared_sstables(std::move(sstlist));
-                co_await dir.collect_output_unshared_sstables(std::move(result.new_sstables), sstables::sstable_directory::can_be_remote::no);
-            }, info, throw_if_stopping::yes);
-        } catch (...) {
-            ex = std::current_exception();
-        }
-
-        if (ex != nullptr) {
-              try {
-                std::rethrow_exception(std::move(ex));
-              } catch (sstables::compaction_stopped_exception& e) {
-                  dblog.info("Table {}.{} with compaction strategy {} had reshape successfully aborted.", table.schema()->ks_name(), table.schema()->cf_name(), table.get_compaction_strategy().name());
-                  break;
-              } catch (...) {
-                  dblog.info("Reshape failed for Table {}.{} with compaction strategy {} due to {}", table.schema()->ks_name(), table.schema()->cf_name(), table.get_compaction_strategy().name(), std::current_exception());
-                  break;
-              }
-        }
-
-        co_await coroutine::maybe_yield();
-    }
-
-    _total_shard_size = reshaped_size;
-}
-
-future<> table_resharding_compaction_task_impl::run() {
-    auto all_jobs = co_await collect_all_shared_sstables(_dir, _db, _status.keyspace, _status.table, _owned_ranges_ptr);
-    auto destinations = co_await distribute_reshard_jobs(std::move(all_jobs));
-
-    uint64_t total_size = boost::accumulate(destinations | boost::adaptors::transformed(std::mem_fn(&replica::reshard_shard_descriptor::size)), uint64_t(0));
-    if (total_size == 0) {
-        co_return;
-    }
-
-    auto start = std::chrono::steady_clock::now();
-    dblog.info("Resharding {} for {}.{}", utils::pretty_printed_data_size(total_size), _status.keyspace, _status.table);
-
-    co_await _db.invoke_on_all(coroutine::lambda([&] (replica::database& db) -> future<> {
-        tasks::task_info parent_info{_status.id, _status.shard};
-        auto& compaction_module = _db.local().get_compaction_manager().get_task_manager_module();
-        // make shard-local copy of owned_ranges
-        compaction::owned_ranges_ptr local_owned_ranges_ptr;
-        if (_owned_ranges_ptr) {
-            local_owned_ranges_ptr = make_lw_shared<const dht::token_range_vector>(*_owned_ranges_ptr);
-        }
-        auto task = co_await compaction_module.make_and_start_task<shard_resharding_compaction_task_impl>(parent_info, _status.keyspace, _status.table, _status.id, _dir, db, _creator, std::move(local_owned_ranges_ptr), destinations);
-        co_await task->done();
-    }));
-
-    auto duration = std::chrono::duration_cast<std::chrono::duration<float>>(std::chrono::steady_clock::now() - start);
-    dblog.info("Resharded {} for {}.{} in {:.2f} seconds, {}", utils::pretty_printed_data_size(total_size), _status.keyspace, _status.table, duration.count(), utils::pretty_printed_throughput(total_size, duration));
-}
-
-future<> shard_resharding_compaction_task_impl::run() {
-    auto& table = _db.find_column_family(_status.keyspace, _status.table);
-    auto info_vec = std::move(_destinations[this_shard_id()].info_vec);
-    tasks::task_info info{_status.id, _status.shard};
-    co_await reshard(_dir.local(), std::move(info_vec), table, _creator, std::move(_local_owned_ranges_ptr), info);
-    co_await _dir.local().move_foreign_sstables(_dir);
-}

 }
--- a/compaction/task_manager_module.hh
+++ b/compaction/task_manager_module.hh
@@ -13,14 +13,6 @@
 #include "schema/schema_fwd.hh"
 #include "tasks/task_manager.hh"

-namespace sstables {
-class sstable_directory;
-}
-
-namespace replica {
-class reshard_shard_descriptor;
-}
-
 namespace compaction {

 class compaction_task_impl : public tasks::task_manager::task::impl {
@@ -28,12 +20,11 @@ public:
    compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
-            std::string scope,
            std::string keyspace,
            std::string table,
            std::string entity,
            tasks::task_id parent_id) noexcept
-        : tasks::task_manager::task::impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
+        : tasks::task_manager::task::impl(module, id, sequence_number, std::move(keyspace), std::move(table), std::move(entity), parent_id)
    {
        // FIXME: add progress units
    }
@@ -41,8 +32,6 @@ public:
    virtual std::string type() const override = 0;
 protected:
    virtual future<> run() override = 0;
-
-    future<tasks::task_manager::task::progress> get_progress(const sstables::compaction_data& cdata, const sstables::compaction_progress_monitor& progress_monitor) const;
 };

 class major_compaction_task_impl : public compaction_task_impl {
@@ -50,12 +39,11 @@ public:
    major_compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
-            std::string scope,
            std::string keyspace,
            std::string table,
            std::string entity,
            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
+        : compaction_task_impl(module, id, sequence_number, std::move(keyspace), std::move(table), std::move(entity), parent_id)
    {
        // FIXME: add progress units
    }
@@ -70,13 +58,13 @@ protected:
 class major_keyspace_compaction_task_impl : public major_compaction_task_impl {
 private:
    sharded<replica::database>& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_infos;
 public:
    major_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            sharded<replica::database>& db,
-            std::vector<table_info> table_infos) noexcept
-        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "keyspace", std::move(keyspace), "", "", tasks::task_id::create_null_id())
+            std::vector<table_id> table_infos) noexcept
+        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
        , _table_infos(std::move(table_infos))
    {}
@@ -87,58 +75,33 @@ protected:
 class shard_major_keyspace_compaction_task_impl : public major_compaction_task_impl {
 private:
    replica::database& _db;
-    std::vector<table_info> _local_tables;
+    std::vector<table_id> _local_tables;
 public:
    shard_major_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            tasks::task_id parent_id,
            replica::database& db,
-            std::vector<table_info> local_tables) noexcept
-        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id)
+            std::vector<table_id> local_tables) noexcept
+        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _local_tables(std::move(local_tables))
    {}
+
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };

-class table_major_keyspace_compaction_task_impl : public major_compaction_task_impl {
-private:
-    replica::database& _db;
-    table_info _ti;
-    seastar::condition_variable& _cv;
-    tasks::task_manager::task_ptr& _current_task;
-public:
-    table_major_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            replica::database& db,
-            table_info ti,
-            seastar::condition_variable& cv,
-            tasks::task_manager::task_ptr& current_task) noexcept
-        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id)
-        , _db(db)
-        , _ti(std::move(ti))
-        , _cv(cv)
-        , _current_task(current_task)
-    {}
-protected:
-    virtual future<> run() override;
-};
-
-
 class cleanup_compaction_task_impl : public compaction_task_impl {
 public:
    cleanup_compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
-            std::string scope,
            std::string keyspace,
            std::string table,
            std::string entity,
            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
+        : compaction_task_impl(module, id, sequence_number, std::move(keyspace), std::move(table), std::move(entity), parent_id)
    {
        // FIXME: add progress units
    }
@@ -153,15 +116,15 @@ protected:
 class cleanup_keyspace_compaction_task_impl : public cleanup_compaction_task_impl {
 private:
    sharded<replica::database>& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_ids;
 public:
    cleanup_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            sharded<replica::database>& db,
-            std::vector<table_info> table_infos) noexcept
-        : cleanup_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "keyspace", std::move(keyspace), "", "", tasks::task_id::create_null_id())
+            std::vector<table_id> table_ids) noexcept
+        : cleanup_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
-        , _table_infos(std::move(table_infos))
+        , _table_ids(std::move(table_ids))
    {}
 protected:
    virtual future<> run() override;
@@ -170,42 +133,19 @@ protected:
 class shard_cleanup_keyspace_compaction_task_impl : public cleanup_compaction_task_impl {
 private:
    replica::database& _db;
-    std::vector<table_info> _local_tables;
+    std::vector<table_id> _local_tables;
 public:
    shard_cleanup_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            tasks::task_id parent_id,
            replica::database& db,
-            std::vector<table_info> local_tables) noexcept
-        : cleanup_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id)
+            std::vector<table_id> local_tables) noexcept
+        : cleanup_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _local_tables(std::move(local_tables))
    {}
-protected:
-    virtual future<> run() override;
-};

-class table_cleanup_keyspace_compaction_task_impl : public cleanup_compaction_task_impl {
-private:
-    replica::database& _db;
-    table_info _ti;
-    seastar::condition_variable& _cv;
-    tasks::task_manager::task_ptr& _current_task;
-public:
-    table_cleanup_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            replica::database& db,
-            table_info ti,
-            seastar::condition_variable& cv,
-            tasks::task_manager::task_ptr& current_task) noexcept
-        : cleanup_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id)
-        , _db(db)
-        , _ti(std::move(ti))
-        , _cv(cv)
-        , _current_task(current_task)
-    {}
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };
@@ -215,12 +155,11 @@ public:
    offstrategy_compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
-            std::string scope,
            std::string keyspace,
            std::string table,
            std::string entity,
            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
+        : compaction_task_impl(module, id, sequence_number, std::move(keyspace), std::move(table), std::move(entity), parent_id)
    {
        // FIXME: add progress units
    }
@@ -235,15 +174,15 @@ protected:
 class offstrategy_keyspace_compaction_task_impl : public offstrategy_compaction_task_impl {
 private:
    sharded<replica::database>& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_infos;
    bool& _needed;
 public:
    offstrategy_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            sharded<replica::database>& db,
-            std::vector<table_info> table_infos,
+            std::vector<table_id> table_infos,
            bool& needed) noexcept
-        : offstrategy_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "keyspace", std::move(keyspace), "", "", tasks::task_id::create_null_id())
+        : offstrategy_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _needed(needed)
@@ -255,48 +194,22 @@ protected:
 class shard_offstrategy_keyspace_compaction_task_impl : public offstrategy_compaction_task_impl {
 private:
    replica::database& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_infos;
    bool& _needed;
 public:
    shard_offstrategy_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            tasks::task_id parent_id,
            replica::database& db,
-            std::vector<table_info> table_infos,
+            std::vector<table_id> table_infos,
            bool& needed) noexcept
-        : offstrategy_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id)
+        : offstrategy_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _needed(needed)
    {}
-protected:
-    virtual future<> run() override;
-};

-class table_offstrategy_keyspace_compaction_task_impl : public offstrategy_compaction_task_impl {
-private:
-    replica::database& _db;
-    table_info _ti;
-    seastar::condition_variable& _cv;
-    tasks::task_manager::task_ptr& _current_task;
-    bool& _needed;
-public:
-    table_offstrategy_keyspace_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            replica::database& db,
-            table_info ti,
-            seastar::condition_variable& cv,
-            tasks::task_manager::task_ptr& current_task,
-            bool& needed) noexcept
-        : offstrategy_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id)
-        , _db(db)
-        , _ti(std::move(ti))
-        , _cv(cv)
-        , _current_task(current_task)
-        , _needed(needed)
-    {}
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };
@@ -306,18 +219,17 @@ public:
    sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
-            std::string scope,
            std::string keyspace,
            std::string table,
            std::string entity,
            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
+        : compaction_task_impl(module, id, sequence_number, std::move(keyspace), std::move(table), std::move(entity), parent_id)
    {
        // FIXME: add progress units
    }

    virtual std::string type() const override {
-        return "sstables compaction";
+        return "rewrite sstables compaction";
    }
 protected:
    virtual future<> run() override = 0;
@@ -326,23 +238,19 @@ protected:
 class upgrade_sstables_compaction_task_impl : public sstables_compaction_task_impl {
 private:
    sharded<replica::database>& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_infos;
    bool _exclude_current_version;
 public:
    upgrade_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            sharded<replica::database>& db,
-            std::vector<table_info> table_infos,
+            std::vector<table_id> table_infos,
            bool exclude_current_version) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "keyspace", std::move(keyspace), "", "", tasks::task_id::create_null_id())
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _exclude_current_version(exclude_current_version)
    {}
-
-    virtual std::string type() const override {
-        return "upgrade " + sstables_compaction_task_impl::type();
-    }
 protected:
    virtual future<> run() override;
 };
@@ -350,56 +258,22 @@ protected:
 class shard_upgrade_sstables_compaction_task_impl : public sstables_compaction_task_impl {
 private:
    replica::database& _db;
-    std::vector<table_info> _table_infos;
+    std::vector<table_id> _table_infos;
    bool _exclude_current_version;
 public:
    shard_upgrade_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
            tasks::task_id parent_id,
            replica::database& db,
-            std::vector<table_info> table_infos,
+            std::vector<table_id> table_infos,
            bool exclude_current_version) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id)
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _exclude_current_version(exclude_current_version)
    {}

-    virtual std::string type() const override {
-        return "upgrade " + sstables_compaction_task_impl::type();
-    }
-protected:
-    virtual future<> run() override;
-};
-
-class table_upgrade_sstables_compaction_task_impl : public sstables_compaction_task_impl {
-private:
-    replica::database& _db;
-    table_info _ti;
-    seastar::condition_variable& _cv;
-    tasks::task_manager::task_ptr& _current_task;
-    bool _exclude_current_version;
-public:
-    table_upgrade_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            replica::database& db,
-            table_info ti,
-            seastar::condition_variable& cv,
-            tasks::task_manager::task_ptr& current_task,
-            bool exclude_current_version) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id)
-        , _db(db)
-        , _ti(std::move(ti))
-        , _cv(cv)
-        , _current_task(current_task)
-        , _exclude_current_version(exclude_current_version)
-    {}
-
-    virtual std::string type() const override {
-        return "upgrade " + sstables_compaction_task_impl::type();
-    }
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };
@@ -417,16 +291,12 @@ public:
            std::vector<sstring> column_families,
            sstables::compaction_type_options::scrub opts,
            sstables::compaction_stats& stats) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "keyspace", std::move(keyspace), "", "", tasks::task_id::create_null_id())
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
        , _column_families(std::move(column_families))
        , _opts(opts)
        , _stats(stats)
    {}
-
-    virtual std::string type() const override {
-        return "scrub " + sstables_compaction_task_impl::type();
-    }
 protected:
    virtual future<> run() override;
 };
@@ -445,16 +315,14 @@ public:
            std::vector<sstring> column_families,
            sstables::compaction_type_options::scrub opts,
            sstables::compaction_stats& stats) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id)
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _column_families(std::move(column_families))
        , _opts(opts)
        , _stats(stats)
    {}

-    virtual std::string type() const override {
-        return "scrub " + sstables_compaction_task_impl::type();
-    }
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };
@@ -472,170 +340,13 @@ public:
            replica::database& db,
            sstables::compaction_type_options::scrub opts,
            sstables::compaction_stats& stats) noexcept
-        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id)
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), std::move(table), "", parent_id)
        , _db(db)
        , _opts(opts)
        , _stats(stats)
    {}

-    virtual std::string type() const override {
-        return "scrub " + sstables_compaction_task_impl::type();
-    }
-protected:
-    virtual future<> run() override;
-};
-
-class reshaping_compaction_task_impl : public compaction_task_impl {
-public:
-    reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
-            tasks::task_id id,
-            unsigned sequence_number,
-            std::string scope,
-            std::string keyspace,
-            std::string table,
-            std::string entity,
-            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
-    {
-        // FIXME: add progress units
-    }
-
-    virtual std::string type() const override {
-        return "reshaping compaction";
-    }
-protected:
-    virtual future<> run() override = 0;
-};
-
-class table_reshaping_compaction_task_impl : public reshaping_compaction_task_impl {
-private:
-    sharded<sstables::sstable_directory>& _dir;
-    sharded<replica::database>& _db;
-    sstables::reshape_mode _mode;
-    sstables::compaction_sstable_creator_fn _creator;
-    std::function<bool (const sstables::shared_sstable&)> _filter;
-public:
-    table_reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            sharded<sstables::sstable_directory>& dir,
-            sharded<replica::database>& db,
-            sstables::reshape_mode mode,
-            sstables::compaction_sstable_creator_fn creator,
-            std::function<bool (const sstables::shared_sstable&)> filter) noexcept
-        : reshaping_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "table", std::move(keyspace), std::move(table), "", tasks::task_id::create_null_id())
-        , _dir(dir)
-        , _db(db)
-        , _mode(mode)
-        , _creator(std::move(creator))
-        , _filter(std::move(filter))
-    {}
-protected:
-    virtual future<> run() override;
-};
-
-class shard_reshaping_compaction_task_impl : public reshaping_compaction_task_impl {
-private:
-    sstables::sstable_directory& _dir;
-    sharded<replica::database>& _db;
-    sstables::reshape_mode _mode;
-    sstables::compaction_sstable_creator_fn _creator;
-    std::function<bool (const sstables::shared_sstable&)> _filter;
-    uint64_t& _total_shard_size;
-public:
-    shard_reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            sstables::sstable_directory& dir,
-            sharded<replica::database>& db,
-            sstables::reshape_mode mode,
-            sstables::compaction_sstable_creator_fn creator,
-            std::function<bool (const sstables::shared_sstable&)> filter,
-            uint64_t& total_shard_size) noexcept
-        : reshaping_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), std::move(table), "", parent_id)
-        , _dir(dir)
-        , _db(db)
-        , _mode(mode)
-        , _creator(std::move(creator))
-        , _filter(std::move(filter))
-        , _total_shard_size(total_shard_size)
-    {}
-protected:
-    virtual future<> run() override;
-};
-
-
-class resharding_compaction_task_impl : public compaction_task_impl {
-public:
-    resharding_compaction_task_impl(tasks::task_manager::module_ptr module,
-            tasks::task_id id,
-            unsigned sequence_number,
-            std::string scope,
-            std::string keyspace,
-            std::string table,
-            std::string entity,
-            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
-    {
-        // FIXME: add progress units
-    }
-
-    virtual std::string type() const override {
-        return "resharding compaction";
-    }
-protected:
-    virtual future<> run() override = 0;
-};
-
-class table_resharding_compaction_task_impl : public resharding_compaction_task_impl {
-private:
-    sharded<sstables::sstable_directory>& _dir;
-    sharded<replica::database>& _db;
-    sstables::compaction_sstable_creator_fn _creator;
-    compaction::owned_ranges_ptr _owned_ranges_ptr;
-public:
-    table_resharding_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            sharded<sstables::sstable_directory>& dir,
-            sharded<replica::database>& db,
-            sstables::compaction_sstable_creator_fn creator,
-            compaction::owned_ranges_ptr owned_ranges_ptr) noexcept
-        : resharding_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "table", std::move(keyspace), std::move(table), "", tasks::task_id::create_null_id())
-        , _dir(dir)
-        , _db(db)
-        , _creator(std::move(creator))
-        , _owned_ranges_ptr(std::move(owned_ranges_ptr))
-    {}
-protected:
-    virtual future<> run() override;
-};
-
-class shard_resharding_compaction_task_impl : public resharding_compaction_task_impl {
-private:
-    sharded<sstables::sstable_directory>& _dir;
-    replica::database& _db;
-    sstables::compaction_sstable_creator_fn _creator;
-    compaction::owned_ranges_ptr _local_owned_ranges_ptr;
-    std::vector<replica::reshard_shard_descriptor>& _destinations;
-public:
-    shard_resharding_compaction_task_impl(tasks::task_manager::module_ptr module,
-            std::string keyspace,
-            std::string table,
-            tasks::task_id parent_id,
-            sharded<sstables::sstable_directory>& dir,
-            replica::database& db,
-            sstables::compaction_sstable_creator_fn creator,
-            compaction::owned_ranges_ptr local_owned_ranges_ptr,
-            std::vector<replica::reshard_shard_descriptor>& destinations) noexcept
-        : resharding_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), std::move(table), "", parent_id)
-        , _dir(dir)
-        , _db(db)
-        , _creator(std::move(creator))
-        , _local_owned_ranges_ptr(std::move(local_owned_ranges_ptr))
-        , _destinations(destinations)
-    {}
+    virtual tasks::is_internal is_internal() const noexcept override;
 protected:
    virtual future<> run() override;
 };
@@ -645,25 +356,4 @@ public:
    task_manager_module(tasks::task_manager& tm) noexcept : tasks::task_manager::module(tm, "compaction") {}
 };

-class regular_compaction_task_impl : public compaction_task_impl {
-public:
-    regular_compaction_task_impl(tasks::task_manager::module_ptr module,
-            tasks::task_id id,
-            unsigned sequence_number,
-            std::string keyspace,
-            std::string table,
-            std::string entity,
-            tasks::task_id parent_id) noexcept
-        : compaction_task_impl(module, id, sequence_number, "compaction group", std::move(keyspace), std::move(table), std::move(entity), parent_id)
-    {
-        // FIXME: add progress units
-    }
-
-    virtual std::string type() const override {
-        return "regular compaction";
-    }
-protected:
-    virtual future<> run() override = 0;
-};
-
 }
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -26,104 +26,53 @@ time_window_compaction_strategy_state& time_window_compaction_strategy::get_stat
    return table_s.get_compaction_strategy_state().get<time_window_compaction_strategy_state>();
 }

-const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
-    { "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s }
-};
+time_window_compaction_strategy_options::time_window_compaction_strategy_options(const std::map<sstring, sstring>& options) {
+    std::chrono::seconds window_unit = DEFAULT_COMPACTION_WINDOW_UNIT;
+    int window_size = DEFAULT_COMPACTION_WINDOW_SIZE;

-const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions> time_window_compaction_strategy_options::valid_timestamp_resolutions = {
-    { "MICROSECONDS", timestamp_resolutions::microsecond },
-    { "MILLISECONDS", timestamp_resolutions::millisecond },
-};
-
-static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options) {
-    std::chrono::seconds window_unit = time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_UNIT;
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY);
-    if (tmp_value) {
-        auto valid_window_units_it = time_window_compaction_strategy_options::valid_window_units.find(tmp_value.value());
-        if (valid_window_units_it == time_window_compaction_strategy_options::valid_window_units.end()) {
-            throw exceptions::configuration_exception(fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
+    auto it = options.find(COMPACTION_WINDOW_UNIT_KEY);
+    if (it != options.end()) {
+        auto valid_window_units_it = valid_window_units.find(it->second);
+        if (valid_window_units_it == valid_window_units.end()) {
+            throw exceptions::syntax_exception(sstring("Invalid window unit ") + it->second + " for " + COMPACTION_WINDOW_UNIT_KEY);
        }
        window_unit = valid_window_units_it->second;
    }

-    return window_unit;
-}
-
-static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    auto window_unit = validate_compaction_window_unit(options);
-    unchecked_options.erase(time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY);
-    return window_unit;
-}
-
-static int validate_compaction_window_size(const std::map<sstring, sstring>& options) {
-    auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
-    int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value, time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
+    it = options.find(COMPACTION_WINDOW_SIZE_KEY);
+    if (it != options.end()) {
+        try {
+            window_size = std::stoi(it->second);
+        } catch (const std::exception& e) {
+            throw exceptions::syntax_exception(sstring("Invalid integer value ") + it->second + " for " + COMPACTION_WINDOW_SIZE_KEY);
+        }
+    }

    if (window_size <= 0) {
-        throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
+        throw exceptions::configuration_exception(fmt::format("{} must be greater than 1 for compaction_window_size", window_size));
    }

-    return window_size;
-}
-
-static int validate_compaction_window_size(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    int window_size = validate_compaction_window_size(options);
-    unchecked_options.erase(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
-    return window_size;
-}
-
-static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options) {
-    db_clock::duration expired_sstable_check_frequency = time_window_compaction_strategy_options::DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS();
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
-    if (tmp_value) {
-        try {
-            expired_sstable_check_frequency = std::chrono::seconds(std::stol(tmp_value.value()));
-        } catch (const std::exception& e) {
-            throw exceptions::syntax_exception(fmt::format("Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
-        }
-    }
-
-    return expired_sstable_check_frequency;
-}
-
-static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    db_clock::duration expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
-    unchecked_options.erase(time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
-    return expired_sstable_check_frequency;
-}
-
-static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options) {
-    time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
-
-    auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
-    if (tmp_value) {
-        if (!time_window_compaction_strategy_options::valid_timestamp_resolutions.contains(tmp_value.value())) {
-            throw exceptions::configuration_exception(fmt::format("Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
-        } else {
-            timestamp_resolution = time_window_compaction_strategy_options::valid_timestamp_resolutions.at(tmp_value.value());
-        }
-    }
-
-    return timestamp_resolution;
-}
-
-static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = validate_timestamp_resolution(options);
-    unchecked_options.erase(time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
-    return timestamp_resolution;
-}
-
-time_window_compaction_strategy_options::time_window_compaction_strategy_options(const std::map<sstring, sstring>& options) {
-    auto window_unit = validate_compaction_window_unit(options);
-    int window_size = validate_compaction_window_size(options);
-
    sstable_window_size = window_size * window_unit;
-    expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
-    timestamp_resolution = validate_timestamp_resolution(options);

-    auto it = options.find("enable_optimized_twcs_queries");
+    it = options.find(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY); // optional; the member keeps its default when the key is absent
+    if (it != options.end()) {
+        try {
+            expired_sstable_check_frequency = std::chrono::seconds(std::stol(it->second)); // std::stol throws on non-numeric or out-of-range text
+        } catch (const std::exception& e) {
+            throw exceptions::syntax_exception(sstring("Invalid long value ") + it->second + " for " + EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY); // " for " with a leading space so the value and the key name stay separated
+        }
+    }
+
+    it = options.find(TIMESTAMP_RESOLUTION_KEY); // optional; the member keeps its default when the key is absent
+    if (it != options.end()) {
+        if (!valid_timestamp_resolutions.contains(it->second)) { // only keys present in valid_timestamp_resolutions are accepted
+            throw exceptions::syntax_exception(sstring("Invalid timestamp resolution ") + it->second + " for " + TIMESTAMP_RESOLUTION_KEY); // " for " with a leading space so the value and the key name stay separated
+        } else {
+            timestamp_resolution = valid_timestamp_resolutions.at(it->second);
+        }
+    }
+
+    it = options.find("enable_optimized_twcs_queries");
    if (it != options.end() && it->second == "false") {
        enable_optimized_twcs_queries = false;
    }
@@ -133,29 +82,6 @@ time_window_compaction_strategy_options::time_window_compaction_strategy_options

 time_window_compaction_strategy_options::time_window_compaction_strategy_options(const time_window_compaction_strategy_options&) = default;

-// options is a map of compaction strategy options and their values.
-// unchecked_options is an analogical map from which already checked options are deleted.
-// This helps making sure that only allowed options are being set.
-void time_window_compaction_strategy_options::validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
-    validate_compaction_window_unit(options, unchecked_options);
-    validate_compaction_window_size(options, unchecked_options);
-    validate_expired_sstable_check_frequency_seconds(options, unchecked_options);
-    validate_timestamp_resolution(options, unchecked_options);
-    compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
-
-    auto it = options.find("enable_optimized_twcs_queries");
-    if (it != options.end() && it->second != "true"  && it->second != "false") {
-        throw exceptions::configuration_exception(fmt::format("enable_optimized_twcs_queries value ({}) must be \"true\" or \"false\"", it->second));
-    }
-    unchecked_options.erase("enable_optimized_twcs_queries");
-
-    it = unchecked_options.find("unsafe_aggressive_sstable_expiration");
-    if (it != unchecked_options.end()) {
-        clogger.warn("unsafe_aggressive_sstable_expiration option is not supported for time window compaction strategy");
-        unchecked_options.erase(it);
-    }
-}
-
 class classify_by_timestamp {
    time_window_compaction_strategy_options _options;
    std::vector<int64_t> _known_windows;
@@ -212,7 +138,7 @@ reader_consumer_v2 time_window_compaction_strategy::make_interposer_consumer(con
 }

 compaction_descriptor
-time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const {
    std::vector<shared_sstable> single_window;
    std::vector<shared_sstable> multi_window;

@@ -266,7 +192,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
            });
            multi_window.resize(max_sstables);
        }
-        compaction_descriptor desc(std::move(multi_window));
+        compaction_descriptor desc(std::move(multi_window), iop);
        desc.options = compaction_type_options::make_reshape();
        return desc;
    }
@@ -285,14 +211,14 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
            }
            // reuse STCS reshape logic which will only compact similar-sized files, to increase overall efficiency
            // when reshaping time buckets containing a huge amount of files
-            auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, mode);
+            auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, iop, mode);
            if (!desc.sstables.empty()) {
                return desc;
            }
        }
    }
    if (!single_window.empty()) {
-        compaction_descriptor desc(std::move(single_window));
+        compaction_descriptor desc(std::move(single_window), iop);
        desc.options = compaction_type_options::make_reshape();
        return desc;
    }
@@ -301,10 +227,9 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
 }

 compaction_descriptor
-time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control) {
+time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidates) {
    auto& state = get_state(table_s);
    auto compaction_time = gc_clock::now();
-    auto candidates = control.candidates(table_s);

    if (candidates.empty()) {
        state.estimated_remaining_tasks = 0;
@@ -319,7 +244,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
        auto expired = table_s.fully_expired_sstables(candidates, compaction_time);
        if (!expired.empty()) {
            clogger.debug("[{}] Going to compact {} expired sstables", fmt::ptr(this), expired.size());
-            return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()));
+            return compaction_descriptor(has_only_fully_expired::yes, std::vector<shared_sstable>(expired.begin(), expired.end()), service::get_local_compaction_priority());
        }
        // Keep checking for fully_expired_sstables until we don't find
        // any among the candidates, meaning they are either already compacted
@@ -331,7 +256,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_

    auto compaction_candidates = get_next_non_expired_sstables(table_s, control, std::move(candidates), compaction_time);
    clogger.debug("[{}] Going to compact {} non-expired sstables", fmt::ptr(this), compaction_candidates.size());
-    return compaction_descriptor(std::move(compaction_candidates));
+    return compaction_descriptor(std::move(compaction_candidates), service::get_local_compaction_priority());
 }

 time_window_compaction_strategy::bucket_compaction_mode
@@ -366,7 +291,7 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
    // ratio is greater than threshold.
    auto e = boost::range::remove_if(non_expiring_sstables, [this, compaction_time, &table_s] (const shared_sstable& sst) -> bool {
-        return !worth_dropping_tombstones(sst, compaction_time, table_s);
+        return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
    });
    non_expiring_sstables.erase(e, non_expiring_sstables.end());
    if (non_expiring_sstables.empty()) {
@@ -503,7 +428,6 @@ void time_window_compaction_strategy::update_estimated_compaction_by_tasks(time_
            break;
        case bucket_compaction_mode::major:
            n++;
-            break;
        default:
            break;
        }
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -11,9 +11,12 @@
 #pragma once

 #include "compaction_strategy_impl.hh"
+#include "compaction.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include "timestamp.hh"
+#include "exceptions/exceptions.hh"
 #include "sstables/shared_sstable.hh"
+#include "service/priority_manager.hh"

 namespace sstables {

@@ -33,15 +36,18 @@ public:
    static constexpr auto COMPACTION_WINDOW_UNIT_KEY = "compaction_window_unit";
    static constexpr auto COMPACTION_WINDOW_SIZE_KEY = "compaction_window_size";
    static constexpr auto EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY = "expired_sstable_check_frequency_seconds";
-
-    static const std::unordered_map<sstring, std::chrono::seconds> valid_window_units;
+private:
+    static inline const std::unordered_map<sstring, std::chrono::seconds> valid_window_units = { { "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s } }; // unit name -> length in seconds; shared by all instances instead of being rebuilt per object

    enum class timestamp_resolutions {
        microsecond,
        millisecond,
    };
-    static const std::unordered_map<sstring, timestamp_resolutions> valid_timestamp_resolutions;
-private:
+    static inline const std::unordered_map<sstring, timestamp_resolutions> valid_timestamp_resolutions = { // option string -> enum value; shared by all instances instead of being rebuilt per object
+        { "MICROSECONDS", timestamp_resolutions::microsecond },
+        { "MILLISECONDS", timestamp_resolutions::millisecond },
+    };
+
    std::chrono::seconds sstable_window_size = DEFAULT_COMPACTION_WINDOW_UNIT * DEFAULT_COMPACTION_WINDOW_SIZE;
    db_clock::duration expired_sstable_check_frequency = DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS();
    timestamp_resolutions timestamp_resolution = timestamp_resolutions::microsecond;
@@ -51,8 +57,6 @@ public:
    time_window_compaction_strategy_options(time_window_compaction_strategy_options&&);
    time_window_compaction_strategy_options(const std::map<sstring, sstring>& options);

-    static void validate(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
-public:
    std::chrono::seconds get_sstable_window_size() const { return sstable_window_size; }

    friend class time_window_compaction_strategy;
@@ -83,11 +87,9 @@ public:
    enum class bucket_compaction_mode { none, size_tiered, major };
 public:
    time_window_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control) override;
+    virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidates) override;

    virtual std::vector<compaction_descriptor> get_cleanup_compaction_jobs(table_state& table_s, std::vector<shared_sstable> candidates) const override;
-
-    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
 private:
    time_window_compaction_strategy_state& get_state(table_state& table_s) const;

@@ -170,7 +172,7 @@ public:
        return true;
    }

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) const override;
 };

 }
--- a/Show More
+++ b/Show More