Compare commits

..

1 Commits

Author SHA1 Message Date
Kefu Chai
2583a025fc s3/test: collect log on exit
The temporary directory holding the log file that collects the scylla
subprocess's output is specified by the test itself, and it is
`test_tempdir`. Unfortunately, cql-pytest/run.py is not aware
of this, so `cleanup_all()` is not able to print out the logging
messages at exit. Please note that cql-pytest/run.py always
collects the "log" file under the directory created using `pid_to_dir()`,
where pid is that of the spawned subprocess, but `object_store/run` uses
the main process's pid for its reusable tempdir.

So, with this change, we also register a cleanup function to print out
the logging messages when the test exits.

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #13647
2023-04-24 13:53:25 +03:00
1476 changed files with 51235 additions and 70748 deletions

2
.gitignore vendored
View File

@@ -26,6 +26,8 @@ tags
testlog
test/*/*.reject
.vscode
docs/_build
docs/poetry.lock
compile_commands.json
.ccls-cache/
.mypy_cache

2
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../scylla-seastar
url = ../seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.27)
cmake_minimum_required(VERSION 3.18)
project(scylla)
@@ -8,19 +8,11 @@ list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake
${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)
set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE
STRING "Choose the type of build." FORCE)
# Set the possible values of build type for cmake-gui
set(scylla_build_types
"Debug" "Release" "Dev" "Sanitize" "Coverage")
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
${scylla_build_types})
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE
STRING "Choose the type of build." FORCE)
message(WARNING "CMAKE_BUILD_TYPE not specified, Using 'Release'")
elseif(NOT CMAKE_BUILD_TYPE IN_LIST scylla_build_types)
message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}. "
"Following types are supported: ${scylla_build_types}")
endif()
"Debug" "Release" "Dev" "Sanitize")
string(TOUPPER "${CMAKE_BUILD_TYPE}" build_mode)
include(mode.${build_mode})
include(mode.common)
@@ -34,9 +26,6 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(Seastar_TESTING ON CACHE BOOL "" FORCE)
set(Seastar_API_LEVEL 7 CACHE STRING "" FORCE)
set(Seastar_APPS ON CACHE BOOL "" FORCE)
set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
add_subdirectory(seastar)
# System libraries dependencies
@@ -56,8 +45,6 @@ find_package(xxHash REQUIRED)
set(scylla_gen_build_dir "${CMAKE_BINARY_DIR}/gen")
file(MAKE_DIRECTORY "${scylla_gen_build_dir}")
include(add_version_library)
generate_scylla_version()
add_library(scylla-main STATIC)
target_sources(scylla-main
@@ -126,7 +113,6 @@ add_subdirectory(lang)
add_subdirectory(locator)
add_subdirectory(mutation)
add_subdirectory(mutation_writer)
add_subdirectory(node_ops)
add_subdirectory(readers)
add_subdirectory(redis)
add_subdirectory(replica)
@@ -144,6 +130,7 @@ add_subdirectory(tracing)
add_subdirectory(transport)
add_subdirectory(types)
add_subdirectory(utils)
include(add_version_library)
add_version_library(scylla_version
release.cc)
@@ -196,25 +183,12 @@ target_link_libraries(scylla PRIVATE
# Force SHA1 build-id generation
set(default_linker_flags "-Wl,--build-id=sha1")
include(CheckLinkerFlag)
set(Scylla_USE_LINKER
""
CACHE
STRING
"Use specified linker instead of the default one")
if(Scylla_USE_LINKER)
set(linkers "${Scylla_USE_LINKER}")
else()
set(linkers "lld" "gold")
endif()
foreach(linker ${linkers})
foreach(linker "lld" "gold")
set(linker_flag "-fuse-ld=${linker}")
check_linker_flag(CXX ${linker_flag} "CXX_LINKER_HAVE_${linker}")
if(CXX_LINKER_HAVE_${linker})
string(APPEND default_linker_flags " ${linker_flag}")
break()
elseif(Scylla_USE_LINKER)
message(FATAL_ERROR "${Scylla_USE_LINKER} is not supported.")
endif()
endforeach()
@@ -225,5 +199,3 @@ set(CMAKE_EXE_LINKER_FLAGS "${default_linker_flags}" CACHE INTERNAL "")
target_include_directories(scylla PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}"
"${scylla_gen_build_dir}")
add_subdirectory(dist)

View File

@@ -7,7 +7,6 @@ Options:
-h|--help show this help message.
-o|--output-dir PATH specify destination path at which the version files are to be created.
-d|--date-stamp DATE manually set date for release parameter
-v|--verbose also print out the version number
By default, the script will attempt to parse 'version' file
in the current directory, which should contain a string of
@@ -34,7 +33,6 @@ END
)
DATE=""
PRINT_VERSION=false
while [ $# -gt 0 ]; do
opt="$1"
@@ -53,10 +51,6 @@ while [ $# -gt 0 ]; do
shift
shift
;;
-v|--verbose)
PRINT_VERSION=true
shift
;;
*)
echo "Unexpected argument found: $1"
echo
@@ -78,7 +72,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.4.2
VERSION=5.3.0-dev
if test -f version
then
@@ -108,9 +102,7 @@ if [ -f "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" ]; then
fi
fi
if $PRINT_VERSION; then
echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
fi
echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
mkdir -p "$OUTPUT_DIR"
echo "$SCYLLA_VERSION" > "$OUTPUT_DIR/SCYLLA-VERSION-FILE"
echo "$SCYLLA_RELEASE" > "$OUTPUT_DIR/SCYLLA-RELEASE-FILE"

View File

@@ -53,7 +53,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
if (result_set->empty()) {
co_await coroutine::return_exception(api_error::unrecognized_client(format("User not found: {}", username)));
}
const managed_bytes_opt& salted_hash = result_set->rows().front().front(); // We only asked for 1 row and 1 column
const bytes_opt& salted_hash = result_set->rows().front().front(); // We only asked for 1 row and 1 column
if (!salted_hash) {
co_await coroutine::return_exception(api_error::unrecognized_client(format("No password found for user: {}", username)));
}

View File

@@ -76,16 +76,13 @@ future<> controller::start_server() {
_ssg = create_smp_service_group(c).get0();
rmw_operation::set_default_write_isolation(_config.alternator_write_isolation());
executor::set_default_timeout(std::chrono::milliseconds(_config.alternator_timeout_in_ms()));
net::inet_address addr = utils::resolve(_config.alternator_address, family).get0();
auto get_cdc_metadata = [] (cdc::generation_service& svc) { return std::ref(svc.get_cdc_metadata()); };
auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
return cfg.alternator_timeout_in_ms;
};
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
// Note: from this point on, if start_server() throws for any reason,
// it must first call stop_server() to stop the executor and server

View File

@@ -6,6 +6,8 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <regex>
#include "utils/base64.hh"
#include <seastar/core/sleep.hh>
@@ -38,6 +40,7 @@
#include <seastar/json/json_elements.hh>
#include <boost/algorithm/cxx11/any_of.hpp>
#include "collection_mutation.hh"
#include "db/query_context.hh"
#include "schema/schema.hh"
#include "db/tags/extension.hh"
#include "db/tags/utils.hh"
@@ -59,28 +62,7 @@ logging::logger elogger("alternator-executor");
namespace alternator {
// Lifecycle states a DynamoDB table can be in, as reported in the
// "TableStatus" field of DescribeTable and related responses.
enum class table_status {
    active = 0,
    creating,
    updating,
    deleting
};

// Convert a table_status to the exact string DynamoDB uses on the wire
// (e.g. "ACTIVE"). The trailing return is a defensive fallback for an
// out-of-range enum value; it also fixes the original's misspelled
// "UKNOWN".
static sstring_view table_status_to_sstring(table_status tbl_status) {
    switch(tbl_status) {
    case table_status::active:
        return "ACTIVE";
    case table_status::creating:
        return "CREATING";
    case table_status::updating:
        return "UPDATING";
    case table_status::deleting:
        return "DELETING";
    }
    return "UNKNOWN";
}
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type);
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type);
static map_type attrs_type() {
static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
@@ -108,20 +90,17 @@ json::json_return_type make_streamed(rjson::value&& value) {
// move objects to coroutine frame.
auto los = std::move(os);
auto lrs = std::move(rs);
std::exception_ptr ex;
try {
co_await rjson::print(*lrs, los);
co_await los.flush();
co_await los.close();
} catch (...) {
// at this point, we cannot really do anything. HTTP headers and return code are
// already written, and quite potentially a portion of the content data.
// just log + rethrow. It is probably better the HTTP server closes connection
// abruptly or something...
ex = std::current_exception();
elogger.error("Exception during streaming HTTP response: {}", ex);
}
co_await los.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
throw;
}
co_return;
};
@@ -211,8 +190,9 @@ static std::string lsi_name(const std::string& table_name, std::string_view inde
/** Extract table name from a request.
* Most requests expect the table's name to be listed in a "TableName" field.
* This convenience function returns the name or api_error in case the
* table name is missing or not a string.
* This convenience function returns the name, with appropriate validation
* and api_error in case the table name is missing or not a string, or
* doesn't pass validate_table_name().
*/
static std::optional<std::string> find_table_name(const rjson::value& request) {
const rjson::value* table_name_value = rjson::find(request, "TableName");
@@ -223,6 +203,7 @@ static std::optional<std::string> find_table_name(const rjson::value& request) {
throw api_error::validation("Non-string TableName field in request");
}
std::string table_name = table_name_value->GetString();
validate_table_name(table_name);
return table_name;
}
@@ -249,10 +230,6 @@ schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::valu
try {
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(*table_name), *table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name.value());
throw api_error::resource_not_found(
format("Requested resource not found: Table: {} not found", *table_name));
}
@@ -303,10 +280,6 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
try {
return { proxy.data_dictionary().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
} catch (data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(
format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
}
@@ -442,8 +415,22 @@ static rjson::value generate_arn_for_index(const schema& schema, std::string_vie
schema.ks_name(), schema.cf_name(), index_name));
}
static rjson::value fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy const& proxy)
{
bool is_alternator_keyspace(const sstring& ks_name) {
return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
}
sstring executor::table_name(const schema& s) {
return s.cf_name();
}
future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.describe_table++;
elogger.trace("Describing table {}", request);
schema_ptr schema = get_table(_proxy, request);
tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
rjson::value table_description = rjson::empty_object();
rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
// FIXME: take the tables creation time, not the current time!
@@ -454,8 +441,9 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
// We don't currently do this in Alternator - instead CreateTable waits
// until the table is really available. So/ DescribeTable returns either
// ACTIVE or doesn't exist at all (and DescribeTable returns an error).
// The states CREATING and UPDATING are not currently returned.
rjson::add(table_description, "TableStatus", rjson::from_string(table_status_to_sstring(tbl_status)));
// The other states (CREATING, UPDATING, DELETING) are not currently
// returned.
rjson::add(table_description, "TableStatus", "ACTIVE");
rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
// FIXME: Instead of hardcoding, we should take into account which mode was chosen
@@ -472,9 +460,9 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
std::unordered_map<std::string,std::string> key_attribute_types;
// Add base table's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(table_description, *schema, key_attribute_types);
describe_key_schema(table_description, *schema, key_attribute_types);
data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
if (!t.views().empty()) {
rjson::value gsi_array = rjson::empty_array();
rjson::value lsi_array = rjson::empty_array();
@@ -490,7 +478,7 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
// Add indexes's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(view_entry, *vptr, key_attribute_types);
describe_key_schema(view_entry, *vptr, key_attribute_types);
// Add projection type
rjson::value projection = rjson::empty_object();
rjson::add(projection, "ProjectionType", "ALL");
@@ -518,29 +506,10 @@ static rjson::value fill_table_description(schema_ptr schema, table_status tbl_s
}
rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
executor::supplement_table_stream_info(table_description, *schema, proxy);
supplement_table_stream_info(table_description, *schema, _proxy);
// FIXME: still missing some response fields (issue #5026)
return table_description;
}
// Return true when `ks_name` names a keyspace managed by Alternator,
// i.e., the name starts with executor::KEYSPACE_NAME_PREFIX.
bool is_alternator_keyspace(const sstring& ks_name) {
return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
}
// The DynamoDB table name exposed to clients is simply the schema's
// internal column-family name (the KEYSPACE_NAME_PREFIX lives on the
// keyspace, not the table).
sstring executor::table_name(const schema& s) {
return s.cf_name();
}
future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.describe_table++;
elogger.trace("Describing table {}", request);
schema_ptr schema = get_table(_proxy, request);
tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
rjson::value table_description = fill_table_description(schema, table_status::active, _proxy);
rjson::value response = rjson::empty_object();
rjson::add(response, "Table", std::move(table_description));
elogger.trace("returning {}", response);
@@ -552,17 +521,10 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
elogger.trace("Deleting table {}", request);
std::string table_name = get_table_name(request);
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
tracing::add_table_name(trace_state, keyspace_name, table_name);
auto& p = _proxy.container();
schema_ptr schema = get_table(_proxy, request);
rjson::value table_description = fill_table_description(schema, table_status::deleting, _proxy);
co_await _mm.container().invoke_on(0, [&] (service::migration_manager& mm) -> future<> {
// FIXME: the following needs to be in a loop. If mm.announce() below
// fails, we need to retry the whole thing.
@@ -572,14 +534,18 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
}
auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy.local_db(), keyspace_name, group0_guard.write_timestamp());
auto m = co_await mm.prepare_column_family_drop_announcement(keyspace_name, table_name, group0_guard.write_timestamp(), service::migration_manager::drop_views::yes);
auto m2 = mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());
std::move(m2.begin(), m2.end(), std::back_inserter(m));
co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: delete {} table", table_name));
co_await mm.announce(std::move(m), std::move(group0_guard));
});
// FIXME: need more attributes?
rjson::value table_description = rjson::empty_object();
rjson::add(table_description, "TableName", rjson::from_string(table_name));
rjson::add(table_description, "TableStatus", "DELETING");
rjson::value response = rjson::empty_object();
rjson::add(response, "TableDescription", std::move(table_description));
elogger.trace("returning {}", response);
@@ -864,6 +830,17 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
}
// Poll the migration manager until it reports cluster-wide schema
// agreement, sleeping 500ms between checks. Throws std::runtime_error
// (surfacing as a failed future) once `deadline` has passed without
// agreement.
// NOTE(review): the deadline is checked before each poll, so the wait
// may overshoot the deadline by up to one sleep interval.
static future<> wait_for_schema_agreement(service::migration_manager& mm, db::timeout_clock::time_point deadline) {
return do_until([&mm, deadline] {
// Stop condition: give up past the deadline, otherwise stop when
// schema agreement has been reached.
if (db::timeout_clock::now() > deadline) {
throw std::runtime_error("Unable to reach schema agreement");
}
return mm.have_schema_agreement();
}, [] {
// Poll interval between schema-agreement checks.
return seastar::sleep(500ms);
});
}
static void verify_billing_mode(const rjson::value& request) {
// Alternator does not yet support billing or throughput limitations, but
// let's verify that BillingMode is at least legal.
@@ -881,38 +858,6 @@ static void verify_billing_mode(const rjson::value& request) {
}
}
// Validate the "AttributeDefinitions" parameter of CreateTable, throwing
// a user-facing api_error::validation when it is malformed. In
// particular, reject the same AttributeName appearing more than once
// (Issue #13870).
static void validate_attribute_definitions(const rjson::value& attribute_definitions){
    if (!attribute_definitions.IsArray()) {
        throw api_error::validation("AttributeDefinitions must be an array");
    }
    // Names encountered so far, used to detect duplicates.
    std::unordered_set<std::string> names_seen;
    for (auto def = attribute_definitions.Begin(); def != attribute_definitions.End(); ++def) {
        const rjson::value* name = rjson::find(*def, "AttributeName");
        if (!name) {
            throw api_error::validation("AttributeName missing in AttributeDefinitions");
        }
        if (!name->IsString()) {
            throw api_error::validation("AttributeName in AttributeDefinitions must be a string");
        }
        // emplace() returns {iterator, inserted}; a failed insert means
        // this name was already defined.
        if (!names_seen.emplace(rjson::to_string_view(*name)).second) {
            throw api_error::validation(format("Duplicate AttributeName={} in AttributeDefinitions",
                rjson::to_string_view(*name)));
        }
        const rjson::value* type = rjson::find(*def, "AttributeType");
        if (!type) {
            throw api_error::validation("AttributeType missing in AttributeDefinitions");
        }
        if (!type->IsString()) {
            throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
        }
    }
}
static future<executor::request_return_type> create_table_on_shard0(tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
assert(this_shard_id() == 0);
@@ -921,14 +866,11 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
// (e.g., verify that this table doesn't already exist) - we can only
// do this further down - after taking group0_guard.
std::string table_name = get_table_name(request);
validate_table_name(table_name);
if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
co_return api_error::validation(format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
const rjson::value& attribute_definitions = request["AttributeDefinitions"];
validate_attribute_definitions(attribute_definitions);
tracing::add_table_name(trace_state, keyspace_name, table_name);
@@ -1119,7 +1061,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
auto ts = group0_guard.write_timestamp();
std::vector<mutation> schema_mutations;
try {
schema_mutations = co_await create_keyspace(keyspace_name, sp, gossiper, ts);
schema_mutations = co_await create_keyspace(keyspace_name, sp, mm, gossiper, ts);
} catch (exceptions::already_exists_exception&) {
if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
co_return api_error::resource_in_use(format("Table {} already exists", table_name));
@@ -1142,9 +1084,9 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
db::schema_tables::add_table_or_view_to_schema_mutation(
view_ptr(view_builder.build()), ts, true, schema_mutations);
}
co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), format("alternator-executor: create {} table", table_name));
co_await mm.announce(std::move(schema_mutations), std::move(group0_guard));
co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);
rjson::value status = rjson::empty_object();
executor::supplement_table_info(request, *schema, sp);
rjson::add(status, "TableDescription", std::move(request));
@@ -1207,11 +1149,11 @@ future<executor::request_return_type> executor::update_table(client_state& clien
auto schema = builder.build();
auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, false, std::vector<view_ptr>(), group0_guard.write_timestamp());
auto m = co_await mm.prepare_column_family_update_announcement(schema, false, std::vector<view_ptr>(), group0_guard.write_timestamp());
co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
co_await mm.announce(std::move(m), std::move(group0_guard));
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);
rjson::value status = rjson::empty_object();
supplement_table_info(request, *schema, p.local());
@@ -1423,11 +1365,14 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) co
// The DynamoDB API doesn't let the client control the server's timeout, so
// we have a global default_timeout() for Alternator requests. The value of
// s_default_timeout_ms is overwritten in alternator::controller::start_server()
// s_default_timeout is overwritten in alternator::controller::start_server()
// based on the "alternator_timeout_in_ms" configuration parameter.
thread_local utils::updateable_value<uint32_t> executor::s_default_timeout_in_ms{10'000};
db::timeout_clock::duration executor::s_default_timeout = 10s;
void executor::set_default_timeout(db::timeout_clock::duration timeout) {
s_default_timeout = timeout;
}
db::timeout_clock::time_point executor::default_timeout() {
return db::timeout_clock::now() + std::chrono::milliseconds(s_default_timeout_in_ms);
return db::timeout_clock::now() + s_default_timeout;
}
static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -1647,7 +1592,7 @@ static parsed::condition_expression get_parsed_condition_expression(rjson::value
throw api_error::validation("ConditionExpression must not be empty");
}
try {
return parse_condition_expression(rjson::to_string_view(*condition_expression), "ConditionExpression");
return parse_condition_expression(rjson::to_string_view(*condition_expression));
} catch(expressions_syntax_error& e) {
throw api_error::validation(e.what());
}
@@ -1662,16 +1607,17 @@ static bool check_needs_read_before_write(const parsed::condition_expression& co
// Fail the expression if it has unused attribute names or values. This is
// how DynamoDB behaves, so we do too.
static void verify_all_are_used(const rjson::value* field,
const std::unordered_set<std::string>& used, const char* field_name, const char* operation) {
if (!field) {
static void verify_all_are_used(const rjson::value& req, const char* field,
const std::unordered_set<std::string>& used, const char* operation) {
const rjson::value* attribute_names = rjson::find(req, field);
if (!attribute_names) {
return;
}
for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
for (auto it = attribute_names->MemberBegin(); it != attribute_names->MemberEnd(); ++it) {
if (!used.contains(it->name.GetString())) {
throw api_error::validation(
format("{} has spurious '{}', not used in {}",
field_name, it->name.GetString(), operation));
field, it->name.GetString(), operation));
}
}
}
@@ -1698,8 +1644,8 @@ public:
resolve_condition_expression(_condition_expression,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "PutItem");
verify_all_are_used(expression_attribute_values, used_attribute_values,"ExpressionAttributeValues", "PutItem");
verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "PutItem");
verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "PutItem");
} else {
if (expression_attribute_names) {
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -1783,8 +1729,8 @@ public:
resolve_condition_expression(_condition_expression,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "DeleteItem");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "DeleteItem");
verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "DeleteItem");
verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "DeleteItem");
} else {
if (expression_attribute_names) {
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
@@ -2354,14 +2300,14 @@ static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& re
* as before.
*/
void executor::describe_single_item(const cql3::selection::selection& selection,
const std::vector<managed_bytes_opt>& result_row,
const std::vector<bytes_opt>& result_row,
const std::optional<attrs_to_get>& attrs_to_get,
rjson::value& item,
bool include_all_embedded_attributes)
{
const auto& columns = selection.get_columns();
auto column_it = columns.begin();
for (const managed_bytes_opt& cell : result_row) {
for (const bytes_opt& cell : result_row) {
std::string column_name = (*column_it)->name_as_text();
if (cell && column_name != executor::ATTRS_COLUMN_NAME) {
if (!attrs_to_get || attrs_to_get->contains(column_name)) {
@@ -2369,9 +2315,7 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
// so add() makes sense
rjson::add_with_string_name(item, column_name, rjson::empty_object());
rjson::value& field = item[column_name.c_str()];
cell->with_linearized([&] (bytes_view linearized_cell) {
rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it));
});
rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(*cell, **column_it));
}
} else if (cell) {
auto deserialized = attrs_type()->deserialize(*cell);
@@ -2427,22 +2371,21 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
return item;
}
future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get) {
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
const query::partition_slice& slice,
const cql3::selection::selection& selection,
const query::result& query_result,
const std::optional<attrs_to_get>& attrs_to_get) {
cql3::selection::result_set_builder builder(selection, gc_clock::now());
query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
auto result_set = builder.build();
std::vector<rjson::value> ret;
for (auto& result_row : result_set->rows()) {
rjson::value item = rjson::empty_object();
describe_single_item(*selection, result_row, *attrs_to_get, item);
describe_single_item(selection, result_row, attrs_to_get, item);
ret.push_back(std::move(item));
co_await coroutine::maybe_yield();
}
co_return ret;
return ret;
}
static bool check_needs_read_before_write(const parsed::value& v) {
@@ -2557,8 +2500,8 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "UpdateItem");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "UpdateItem");
verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "UpdateItem");
verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "UpdateItem");
// DynamoDB forbids having both old-style AttributeUpdates or Expected
// and new-style UpdateExpression or ConditionExpression in the same request
@@ -3167,8 +3110,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
std::unordered_set<std::string> used_attribute_names;
auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "GetItem");
return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
@@ -3279,8 +3221,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
rs.cl = get_read_consistency(it->value);
std::unordered_set<std::string> used_attribute_names;
rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, used_attribute_names));
const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames");
verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem");
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "GetItem");
auto& keys = (it->value)["Keys"];
for (rjson::value& key : keys.GetArray()) {
rs.add(key);
@@ -3316,7 +3257,8 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
[schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get));
std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
});
response_futures.push_back(std::move(f));
}
@@ -3449,7 +3391,7 @@ filter::filter(const rjson::value& request, request_type rt,
throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet");
}
try {
auto parsed = parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression");
auto parsed = parse_condition_expression(rjson::to_string_view(*expression));
const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
resolve_condition_expression(parsed,
@@ -3556,7 +3498,7 @@ public:
_column_it = _columns.begin();
}
void accept_value(managed_bytes_view_opt result_bytes_view) {
void accept_value(const std::optional<query::result_bytes_view>& result_bytes_view) {
if (!result_bytes_view) {
++_column_it;
return;
@@ -3853,10 +3795,8 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
// optimized the filtering by modifying partition_ranges and/or
// ck_bounds. We haven't done this optimization yet.
const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan");
verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan");
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
@@ -4077,7 +4017,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
// sort-key range.
parsed::condition_expression p;
try {
p = parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression");
p = parse_condition_expression(rjson::to_string_view(expression));
} catch(expressions_syntax_error& e) {
throw api_error::validation(e.what());
}
@@ -4297,17 +4237,13 @@ future<executor::request_return_type> executor::query(client_state& client_state
throw api_error::validation("Query must have one of "
"KeyConditions or KeyConditionExpression");
}
const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
// exactly one of key_conditions or key_condition_expression
auto [partition_ranges, ck_bounds] = key_conditions
? calculate_bounds_conditions(schema, *key_conditions)
: calculate_bounds_condition_expression(schema, *key_condition_expression,
expression_attribute_values,
rjson::find(request, "ExpressionAttributeValues"),
used_attribute_values,
expression_attribute_names,
rjson::find(request, "ExpressionAttributeNames"),
used_attribute_names);
filter filter(request, filter::request_type::QUERY,
@@ -4334,8 +4270,8 @@ future<executor::request_return_type> executor::query(client_state& client_state
select_type select = parse_select(request, table_type);
auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names, select);
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Query");
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query");
query::partition_slice::option_set opts;
opts.set_if<query::partition_slice::option::reversed>(!forward);
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
@@ -4396,17 +4332,6 @@ future<executor::request_return_type> executor::list_tables(client_state& client
future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
_stats.api_operations.describe_endpoints++;
// The alternator_describe_endpoints configuration can be used to disable
// the DescribeEndpoints operation, or set it to return a fixed string
std::string override = _proxy.data_dictionary().get_config().alternator_describe_endpoints();
if (!override.empty()) {
if (override == "disabled") {
_stats.unsupported_operations++;
return make_ready_future<request_return_type>(api_error::unknown_operation(
"DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)"));
}
host_header = std::move(override);
}
rjson::value response = rjson::empty_object();
// Without having any configuration parameter to say otherwise, we tell
// the user to return to the same endpoint they used to reach us. The only
@@ -4444,10 +4369,6 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
try {
schema = _proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::table_not_found(
format("Table {} not found", table_name));
}
@@ -4467,9 +4388,9 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
// of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
// A smaller cluster (presumably, a test only), gets RF=1. The user may
// manually create the keyspace to override this predefined behavior.
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts) {
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type ts) {
sstring keyspace_name_str(keyspace_name);
int endpoint_count = gossiper.num_endpoints();
int endpoint_count = gossiper.get_endpoint_states().size();
int rf = 3;
if (endpoint_count < rf) {
rf = 1;
@@ -4479,7 +4400,7 @@ static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_n
auto opts = get_network_topology_options(sp, gossiper, rf);
auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
co_return service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
co_return mm.prepare_new_keyspace_announcement(ksm, ts);
}
future<> executor::start() {

View File

@@ -22,7 +22,6 @@
#include "alternator/error.hh"
#include "stats.hh"
#include "utils/rjson.hh"
#include "utils/updateable_value.hh"
namespace db {
class system_distributed_keyspace;
@@ -171,16 +170,8 @@ public:
static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";
executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
smp_service_group ssg,
utils::updateable_value<uint32_t> default_timeout_in_ms)
: _gossiper(gossiper), _proxy(proxy), _mm(mm), _sdks(sdks), _cdc_metadata(cdc_metadata), _ssg(ssg) {
s_default_timeout_in_ms = std::move(default_timeout_in_ms);
}
executor(gms::gossiper& gossiper, service::storage_proxy& proxy, service::migration_manager& mm, db::system_distributed_keyspace& sdks, cdc::metadata& cdc_metadata, smp_service_group ssg)
: _gossiper(gossiper), _proxy(proxy), _mm(mm), _sdks(sdks), _cdc_metadata(cdc_metadata), _ssg(ssg) {}
future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
@@ -208,16 +199,13 @@ public:
future<request_return_type> describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request);
future<> start();
future<> stop() {
// disconnect from the value source, but keep the value unchanged.
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
return make_ready_future<>();
}
future<> stop() { return make_ready_future<>(); }
static sstring table_name(const schema&);
static db::timeout_clock::time_point default_timeout();
static void set_default_timeout(db::timeout_clock::duration timeout);
private:
static thread_local utils::updateable_value<uint32_t> s_default_timeout_in_ms;
static db::timeout_clock::duration s_default_timeout;
public:
static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);
@@ -225,31 +213,30 @@ private:
friend class rmw_operation;
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
public:
static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
static std::optional<rjson::value> describe_single_item(schema_ptr,
const query::partition_slice&,
const cql3::selection::selection&,
const query::result&,
const std::optional<attrs_to_get>&);
static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);
static std::vector<rjson::value> describe_multi_item(schema_ptr schema,
const query::partition_slice& slice,
const cql3::selection::selection& selection,
const query::result& query_result,
const std::optional<attrs_to_get>& attrs_to_get);
static void describe_single_item(const cql3::selection::selection&,
const std::vector<managed_bytes_opt>&,
const std::vector<bytes_opt>&,
const std::optional<attrs_to_get>&,
rjson::value&,
bool = false);
static void add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
};
// is_big() checks approximately if the given JSON value is "bigger" than

View File

@@ -29,7 +29,7 @@
namespace alternator {
template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
static Result do_with_parser(std::string_view input, Func&& f) {
Result do_with_parser(std::string_view input, Func&& f) {
expressionsLexer::InputStreamType input_stream{
reinterpret_cast<const ANTLR_UINT8*>(input.data()),
ANTLR_ENC_UTF8,
@@ -43,41 +43,31 @@ static Result do_with_parser(std::string_view input, Func&& f) {
return result;
}
template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
static Result parse(const char* input_name, std::string_view input, Func&& f) {
if (input.length() > 4096) {
throw expressions_syntax_error(format("{} expression size {} exceeds allowed maximum 4096.",
input_name, input.length()));
}
try {
return do_with_parser(input, f);
} catch (expressions_syntax_error& e) {
// If already an expressions_syntax_error, don't print the type's
// name (it's just ugly), just the message.
// TODO: displayRecognitionError could set a position inside the
// expressions_syntax_error in throws, and we could use it here to
// mark the broken position in 'input'.
throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
input_name, input, e.what()));
} catch (...) {
throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
input_name, input, std::current_exception()));
}
}
parsed::update_expression
parse_update_expression(std::string_view query) {
return parse("UpdateExpression", query, std::mem_fn(&expressionsParser::update_expression));
try {
return do_with_parser(query, std::mem_fn(&expressionsParser::update_expression));
} catch (...) {
throw expressions_syntax_error(format("Failed parsing UpdateExpression '{}': {}", query, std::current_exception()));
}
}
std::vector<parsed::path>
parse_projection_expression(std::string_view query) {
return parse ("ProjectionExpression", query, std::mem_fn(&expressionsParser::projection_expression));
try {
return do_with_parser(query, std::mem_fn(&expressionsParser::projection_expression));
} catch (...) {
throw expressions_syntax_error(format("Failed parsing ProjectionExpression '{}': {}", query, std::current_exception()));
}
}
parsed::condition_expression
parse_condition_expression(std::string_view query, const char* caller) {
return parse(caller, query, std::mem_fn(&expressionsParser::condition_expression));
parse_condition_expression(std::string_view query) {
try {
return do_with_parser(query, std::mem_fn(&expressionsParser::condition_expression));
} catch (...) {
throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
}
}
namespace parsed {
@@ -428,14 +418,9 @@ void for_condition_expression_on(const parsed::condition_expression& ce, const n
// calculate_size() is ConditionExpression's size() function, i.e., it takes
// a JSON-encoded value and returns its "size" as defined differently for the
// different types - also as a JSON-encoded number.
// If the value's type (e.g. number) has no size defined, there are two cases:
// 1. If from_data (the value came directly from an attribute of the data),
// It returns a JSON-encoded "null" value. Comparisons against this
// non-numeric value will later fail, so eventually the application will
// get a ConditionalCheckFailedException.
// 2. Otherwise (the value came from a constant in the query or some other
// calculation), throw a ValidationException.
static rjson::value calculate_size(const rjson::value& v, bool from_data) {
// It return a JSON-encoded "null" value if this value's type has no size
// defined. Comparisons against this non-numeric value will later fail.
static rjson::value calculate_size(const rjson::value& v) {
// NOTE: If v is improperly formatted for our JSON value encoding, it
// must come from the request itself, not from the database, so it makes
// sense to throw a ValidationException if we see such a problem.
@@ -464,12 +449,10 @@ static rjson::value calculate_size(const rjson::value& v, bool from_data) {
throw api_error::validation(format("invalid byte string: {}", v));
}
ret = base64_decoded_len(rjson::to_string_view(it->value));
} else if (from_data) {
} else {
rjson::value json_ret = rjson::empty_object();
rjson::add(json_ret, "null", rjson::value(true));
return json_ret;
} else {
throw api_error::validation(format("Unsupported operand type {} for function size()", it->name));
}
rjson::value json_ret = rjson::empty_object();
rjson::add(json_ret, "N", rjson::from_string(std::to_string(ret)));
@@ -551,7 +534,7 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
}
rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
return calculate_size(v, f._parameters[0].is_path());
return calculate_size(v);
}
},
{"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -679,7 +662,7 @@ static rjson::value extract_path(const rjson::value* item,
// objects. But today Alternator does not validate the structure
// of nested documents before storing them, so this can happen on
// read.
throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
throw api_error::validation(format("{}: malformed item read: {}", *item));
}
const char* type = v->MemberBegin()->name.GetString();
v = &(v->MemberBegin()->value);

View File

@@ -74,22 +74,7 @@ options {
*/
@parser::context {
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
const char* err;
switch (ex->getType()) {
case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
err = "expression nested too deeply";
break;
default:
err = "syntax error";
break;
}
// Alternator expressions are always single line so ex->get_line()
// is always 1, no sense to print it.
// TODO: return the position as part of the exception, so the
// caller in expressions.cc that knows the expression string can
// mark the error position in the final error message.
throw expressions_syntax_error(format("{} at char {}", err,
ex->get_charPositionInLine()));
throw expressions_syntax_error("syntax error");
}
}
@lexer::context {
@@ -98,23 +83,6 @@ options {
}
}
/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
* expressions. To make things even worse, ANTLR has no way to limit the
* depth of this recursion (unlike Yacc which has YYMAXDEPTH). So deeply-
* nested expression like "(((((((((((((..." can easily crash Scylla on a
* stack overflow (see issue #14477).
*
* We are lucky that in the grammar for DynamoDB expressions (below),
* only a few specific rules can recurse, so it was fairly easy to add a
* "depth" counter to a few specific rules, and then use a predicate
* "{depth<MAX_DEPTH}?" to avoid parsing if the depth exceeds this limit,
* and throw a FAILED_PREDICATE_EXCEPTION in that case, which we will
* report to the user as a "expression nested too deeply" error.
*/
@parser::members {
static constexpr int MAX_DEPTH = 400;
}
/*
* Lexical analysis phase, i.e., splitting the input up to tokens.
* Lexical analyzer rules have names starting in capital letters.
@@ -187,20 +155,19 @@ path returns [parsed::path p]:
| '[' INTEGER ']' { $p.add_index(std::stoi($INTEGER.text)); }
)*;
/* See comment above why the "depth" counter was needed here */
value[int depth] returns [parsed::value v]:
value returns [parsed::value v]:
VALREF { $v.set_valref($VALREF.text); }
| path { $v.set_path($path.p); }
| {depth<MAX_DEPTH}? NAME { $v.set_func_name($NAME.text); }
'(' x=value[depth+1] { $v.add_func_parameter($x.v); }
(',' x=value[depth+1] { $v.add_func_parameter($x.v); })*
| NAME { $v.set_func_name($NAME.text); }
'(' x=value { $v.add_func_parameter($x.v); }
(',' x=value { $v.add_func_parameter($x.v); })*
')'
;
update_expression_set_rhs returns [parsed::set_rhs rhs]:
v=value[0] { $rhs.set_value(std::move($v.v)); }
( '+' v=value[0] { $rhs.set_plus(std::move($v.v)); }
| '-' v=value[0] { $rhs.set_minus(std::move($v.v)); }
v=value { $rhs.set_value(std::move($v.v)); }
( '+' v=value { $rhs.set_plus(std::move($v.v)); }
| '-' v=value { $rhs.set_minus(std::move($v.v)); }
)?
;
@@ -238,7 +205,7 @@ projection_expression returns [std::vector<parsed::path> v]:
primitive_condition returns [parsed::primitive_condition c]:
v=value[0] { $c.add_value(std::move($v.v));
v=value { $c.add_value(std::move($v.v));
$c.set_operator(parsed::primitive_condition::type::VALUE); }
( ( '=' { $c.set_operator(parsed::primitive_condition::type::EQ); }
| '<' '>' { $c.set_operator(parsed::primitive_condition::type::NE); }
@@ -247,14 +214,14 @@ primitive_condition returns [parsed::primitive_condition c]:
| '>' { $c.set_operator(parsed::primitive_condition::type::GT); }
| '>' '=' { $c.set_operator(parsed::primitive_condition::type::GE); }
)
v=value[0] { $c.add_value(std::move($v.v)); }
v=value { $c.add_value(std::move($v.v)); }
| BETWEEN { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
v=value[0] { $c.add_value(std::move($v.v)); }
v=value { $c.add_value(std::move($v.v)); }
AND
v=value[0] { $c.add_value(std::move($v.v)); }
v=value { $c.add_value(std::move($v.v)); }
| IN '(' { $c.set_operator(parsed::primitive_condition::type::IN); }
v=value[0] { $c.add_value(std::move($v.v)); }
(',' v=value[0] { $c.add_value(std::move($v.v)); })*
v=value { $c.add_value(std::move($v.v)); }
(',' v=value { $c.add_value(std::move($v.v)); })*
')'
)?
;
@@ -264,20 +231,19 @@ primitive_condition returns [parsed::primitive_condition c]:
// common rule prefixes, and (lack of) support for operator precedence.
// These rules could have been written more clearly using a more powerful
// parser generator - such as Yacc.
// See comment above why the "depth" counter was needed here.
boolean_expression[int depth] returns [parsed::condition_expression e]:
b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); }
(OR b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); } )*
boolean_expression returns [parsed::condition_expression e]:
b=boolean_expression_1 { $e.append(std::move($b.e), '|'); }
(OR b=boolean_expression_1 { $e.append(std::move($b.e), '|'); } )*
;
boolean_expression_1[int depth] returns [parsed::condition_expression e]:
b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); }
(AND b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); } )*
boolean_expression_1 returns [parsed::condition_expression e]:
b=boolean_expression_2 { $e.append(std::move($b.e), '&'); }
(AND b=boolean_expression_2 { $e.append(std::move($b.e), '&'); } )*
;
boolean_expression_2[int depth] returns [parsed::condition_expression e]:
boolean_expression_2 returns [parsed::condition_expression e]:
p=primitive_condition { $e.set_primitive(std::move($p.c)); }
| {depth<MAX_DEPTH}? NOT b=boolean_expression_2[depth+1] { $e = std::move($b.e); $e.apply_not(); }
| {depth<MAX_DEPTH}? '(' b=boolean_expression[depth+1] ')' { $e = std::move($b.e); }
| NOT b=boolean_expression_2 { $e = std::move($b.e); $e.apply_not(); }
| '(' b=boolean_expression ')' { $e = std::move($b.e); }
;
condition_expression returns [parsed::condition_expression e]:
boolean_expression[0] { e=std::move($boolean_expression.e); } EOF;
boolean_expression { e=std::move($boolean_expression.e); } EOF;

View File

@@ -28,7 +28,7 @@ public:
parsed::update_expression parse_update_expression(std::string_view query);
std::vector<parsed::path> parse_projection_expression(std::string_view query);
parsed::condition_expression parse_condition_expression(std::string_view query, const char* caller);
parsed::condition_expression parse_condition_expression(std::string_view query);
void resolve_update_expression(parsed::update_expression& ue,
const rjson::value* expression_attribute_names,

View File

@@ -50,115 +50,6 @@ type_representation represent_type(alternator_type atype) {
return it->second;
}
// Get the magnitude and precision of a big_decimal - as these concepts are
// defined by DynamoDB - to allow us to enforce limits on those as explained
// in ssue #6794. The "magnitude" of 9e123 is 123 and of -9e-123 is -123,
// the "precision" of 12.34e56 is the number of significant digits - 4.
//
// Unfortunately it turned out to be quite difficult to take a big_decimal and
// calculate its magnitude and precision from its scale() and unscaled_value().
// So in the following ugly implementation we calculate them from the string
// representation instead. We assume the number was already parsed
// sucessfully to a big_decimal to it follows its syntax rules.
//
// FIXME: rewrite this function to take a big_decimal, not a string.
// Maybe a snippet like this can help:
// boost::multiprecision::cpp_int digits = boost::multiprecision::log10(num.unscaled_value().convert_to<boost::multiprecision::mpf_float_50>()).convert_to<boost::multiprecision::cpp_int>() + 1;
internal::magnitude_and_precision internal::get_magnitude_and_precision(std::string_view s) {
size_t e_or_end = s.find_first_of("eE");
std::string_view base = s.substr(0, e_or_end);
if (s[0]=='-' || s[0]=='+') {
base = base.substr(1);
}
int magnitude = 0;
int precision = 0;
size_t dot_or_end = base.find_first_of(".");
size_t nonzero = base.find_first_not_of("0");
if (dot_or_end != std::string_view::npos) {
if (nonzero == dot_or_end) {
// 0.000031 => magnitude = -5 (like 3.1e-5), precision = 2.
std::string_view fraction = base.substr(dot_or_end + 1);
size_t nonzero2 = fraction.find_first_not_of("0");
if (nonzero2 != std::string_view::npos) {
magnitude = -nonzero2 - 1;
precision = fraction.size() - nonzero2;
}
} else {
// 000123.45678 => magnitude = 2, precision = 8.
magnitude = dot_or_end - nonzero - 1;
precision = base.size() - nonzero - 1;
}
// trailing zeros don't count to precision, e.g., precision
// of 1000.0, 1.0 or 1.0000 are just 1.
size_t last_significant = base.find_last_not_of(".0");
if (last_significant == std::string_view::npos) {
precision = 0;
} else if (last_significant < dot_or_end) {
// e.g., 1000.00 reduce 5 = 7 - (0+1) - 1 from precision
precision -= base.size() - last_significant - 2;
} else {
// e.g., 1235.60 reduce 5 = 7 - (5+1) from precision
precision -= base.size() - last_significant - 1;
}
} else if (nonzero == std::string_view::npos) {
// all-zero integer 000000
magnitude = 0;
precision = 0;
} else {
magnitude = base.size() - 1 - nonzero;
precision = base.size() - nonzero;
// trailing zeros don't count to precision, e.g., precision
// of 1000 is just 1.
size_t last_significant = base.find_last_not_of("0");
if (last_significant == std::string_view::npos) {
precision = 0;
} else {
// e.g., 1000 reduce 3 = 4 - (0+1)
precision -= base.size() - last_significant - 1;
}
}
if (precision && e_or_end != std::string_view::npos) {
std::string_view exponent = s.substr(e_or_end + 1);
if (exponent.size() > 4) {
// don't even bother atoi(), exponent is too large
magnitude = exponent[0]=='-' ? -9999 : 9999;
} else {
try {
magnitude += boost::lexical_cast<int32_t>(exponent);
} catch (...) {
magnitude = 9999;
}
}
}
return magnitude_and_precision {magnitude, precision};
}
// Parse a number read from user input, validating that it has a valid
// numeric format and also in the allowed magnitude and precision ranges
// (see issue #6794). Throws an api_error::validation if the validation
// failed.
static big_decimal parse_and_validate_number(std::string_view s) {
try {
big_decimal ret(s);
auto [magnitude, precision] = internal::get_magnitude_and_precision(s);
if (magnitude > 125) {
throw api_error::validation(format("Number overflow: {}. Attempting to store a number with magnitude larger than supported range.", s));
}
if (magnitude < -130) {
throw api_error::validation(format("Number underflow: {}. Attempting to store a number with magnitude lower than supported range.", s));
}
if (precision > 38) {
throw api_error::validation(format("Number too precise: {}. Attempting to store a number with more significant digits than supported.", s));
}
return ret;
} catch (const marshal_exception& e) {
throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", s));
}
}
struct from_json_visitor {
const rjson::value& v;
bytes_ostream& bo;
@@ -176,7 +67,11 @@ struct from_json_visitor {
bo.write(boolean_type->decompose(v.GetBool()));
}
void operator()(const decimal_type_impl& t) const {
bo.write(decimal_type->decompose(parse_and_validate_number(rjson::to_string_view(v))));
try {
bo.write(t.from_string(rjson::to_string_view(v)));
} catch (const marshal_exception& e) {
throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", v));
}
}
// default
void operator()(const abstract_type& t) const {
@@ -308,8 +203,6 @@ bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column
// FIXME: it's difficult at this point to get information if value was provided
// in request or comes from the storage, for now we assume it's user's fault.
return *unwrap_bytes(value, true);
} else if (column.type == decimal_type) {
return decimal_type->decompose(parse_and_validate_number(rjson::to_string_view(value)));
} else {
return column.type->from_string(value_view);
}
@@ -402,13 +295,16 @@ big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
if (it->name != "N") {
throw api_error::validation(format("{}: expected number, found type '{}'", diagnostic, it->name));
}
if (!it->value.IsString()) {
// We shouldn't reach here. Callers normally validate their input
// earlier with validate_value().
throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
try {
if (!it->value.IsString()) {
// We shouldn't reach here. Callers normally validate their input
// earlier with validate_value().
throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
}
return big_decimal(rjson::to_string_view(it->value));
} catch (const marshal_exception& e) {
throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", it->value));
}
big_decimal ret = parse_and_validate_number(rjson::to_string_view(it->value));
return ret;
}
std::optional<big_decimal> try_unwrap_number(const rjson::value& v) {
@@ -420,8 +316,8 @@ std::optional<big_decimal> try_unwrap_number(const rjson::value& v) {
return std::nullopt;
}
try {
return parse_and_validate_number(rjson::to_string_view(it->value));
} catch (api_error&) {
return big_decimal(rjson::to_string_view(it->value));
} catch (const marshal_exception& e) {
return std::nullopt;
}
}

View File

@@ -94,12 +94,5 @@ std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value&
// Returns a null value if one of the arguments is not actually a list.
rjson::value list_concatenate(const rjson::value& v1, const rjson::value& v2);
namespace internal {
struct magnitude_and_precision {
int magnitude;
int precision;
};
magnitude_and_precision get_magnitude_and_precision(std::string_view);
}
}

View File

@@ -424,7 +424,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
co_await client_state.maybe_update_per_service_level_params();
tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
tracing::trace(trace_state, "{}", op);
tracing::trace(trace_state, op);
rjson::value json_request = co_await _json_parser.parse(std::move(content));
co_return co_await callback_it->second(_executor, client_state, trace_state,
make_service_permit(std::move(units)), std::move(json_request), std::move(req));

View File

@@ -1096,7 +1096,7 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
}
}
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
auto& opts = schema.cdc_options();
if (opts.enabled()) {
auto db = sp.data_dictionary();

View File

@@ -241,7 +241,7 @@ static bool is_expired(const rjson::value& expiration_time, gc_clock::time_point
// understands it is an expiration event - not a user-initiated deletion.
static future<> expire_item(service::storage_proxy& proxy,
const service::query_state& qs,
const std::vector<managed_bytes_opt>& row,
const std::vector<bytes_opt>& row,
schema_ptr schema,
api::timestamp_type ts) {
// Prepare the row key to delete
@@ -260,7 +260,7 @@ static future<> expire_item(service::storage_proxy& proxy,
// FIXME: log or increment a metric if this happens.
return make_ready_future<>();
}
exploded_pk.push_back(to_bytes(*row_c));
exploded_pk.push_back(*row_c);
}
auto pk = partition_key::from_exploded(exploded_pk);
mutation m(schema, pk);
@@ -280,7 +280,7 @@ static future<> expire_item(service::storage_proxy& proxy,
// FIXME: log or increment a metric if this happens.
return make_ready_future<>();
}
exploded_ck.push_back(to_bytes(*row_c));
exploded_ck.push_back(*row_c);
}
auto ck = clustering_key::from_exploded(exploded_ck);
m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
@@ -387,7 +387,7 @@ class token_ranges_owned_by_this_shard {
class ranges_holder_primary {
const dht::token_range_vector _token_ranges;
public:
ranges_holder_primary(const locator::vnode_effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
ranges_holder_primary(const locator::effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
: _token_ranges(erm->get_primary_ranges(ep)) {}
std::size_t size() const { return _token_ranges.size(); }
const dht::token_range& operator[](std::size_t i) const {
@@ -430,7 +430,6 @@ class token_ranges_owned_by_this_shard {
size_t _range_idx;
size_t _end_idx;
std::optional<dht::selective_token_range_sharder> _intersecter;
locator::effective_replication_map_ptr _erm;
public:
token_ranges_owned_by_this_shard(replica::database& db, gms::gossiper& g, schema_ptr s)
: _s(s)
@@ -438,7 +437,6 @@ public:
g, utils::fb_utilities::get_broadcast_address())
, _range_idx(random_offset(0, _token_ranges.size() - 1))
, _end_idx(_range_idx + _token_ranges.size())
, _erm(s->table().get_effective_replication_map())
{
tlogger.debug("Generating token ranges starting from base range {} of {}", _range_idx, _token_ranges.size());
}
@@ -471,7 +469,7 @@ public:
return std::nullopt;
}
}
_intersecter.emplace(_erm->get_sharder(*_s), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
_intersecter.emplace(_s->get_sharder(), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
}
}
@@ -595,7 +593,7 @@ static future<> scan_table_ranges(
continue;
}
for (const auto& row : rows) {
const managed_bytes_opt& cell = row[*expiration_column];
const bytes_opt& cell = row[*expiration_column];
if (!cell) {
continue;
}

View File

@@ -14,7 +14,6 @@ set(swagger_files
api-doc/hinted_handoff.json
api-doc/lsa.json
api-doc/messaging_service.json
api-doc/metrics.json
api-doc/storage_proxy.json
api-doc/storage_service.json
api-doc/stream_manager.json

View File

@@ -84,14 +84,6 @@
"type":"string",
"paramType":"path"
},
{
"name":"flush_memtables",
"description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when the table is flushed explicitly before invoking the compaction api.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"split_output",
"description":"true if the output of the major compaction should be split in several sstables",
@@ -445,68 +437,6 @@
}
]
},
{
"path":"/column_family/tombstone_gc/{name}",
"operations":[
{
"method":"GET",
"summary":"Check if tombstone GC is enabled for a given table",
"type":"boolean",
"nickname":"get_tombstone_gc",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
},
{
"method":"POST",
"summary":"Enable tombstone GC for a given table",
"type":"void",
"nickname":"enable_tombstone_gc",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
},
{
"method":"DELETE",
"summary":"Disable tombstone GC for a given table",
"type":"void",
"nickname":"disable_tombstone_gc",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/column_family/estimate_keys/{name}",
"operations":[

View File

@@ -34,14 +34,6 @@
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"parameters",
"description":"dict of parameters to pass to the injection (json format)",
"required":false,
"allowMultiple":false,
"type":"dict",
"paramType":"body"
}
]
},
@@ -66,30 +58,6 @@
}
]
},
{
"path":"/v2/error_injection/injection/{injection}/message",
"operations":[
{
"method":"POST",
"summary":"Send message to trigger an event in injection's code",
"type":"void",
"nickname":"message_injection",
"produces":[
"application/json"
],
"parameters":[
{
"name":"injection",
"description":"injection name, should correspond to an injection added in code",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/v2/error_injection/injection",
"operations":[
@@ -118,15 +86,5 @@
}
]
}
],
"components":{
"schemas": {
"dict": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
}
]
}

View File

@@ -245,7 +245,7 @@
"GOSSIP_SHUTDOWN",
"DEFINITIONS_UPDATE",
"TRUNCATE",
"UNUSED__REPLICATION_FINISHED",
"REPLICATION_FINISHED",
"MIGRATION_REQUEST",
"PREPARE_MESSAGE",
"PREPARE_DONE_MESSAGE",

View File

@@ -1,34 +0,0 @@
"metrics_config": {
"id": "metrics_config",
"summary": "An entry in the metrics configuration",
"properties": {
"source_labels": {
"type": "array",
"items": {
"type": "string"
},
"description": "The source labels, a match is based on concatination of the labels"
},
"action": {
"type": "string",
"description": "The action to perfrom on match",
"enum": ["skip_when_empty", "report_when_empty", "replace", "keep", "drop", "drop_label"]
},
"target_label": {
"type": "string",
"description": "The application state version"
},
"replacement": {
"type": "string",
"description": "The replacement string to use when replacing a value"
},
"regex": {
"type": "string",
"description": "The regex string to use when replacing a value"
},
"separator": {
"type": "string",
"description": "The separator string to use when concatinating the labels"
}
}
}

View File

@@ -1,66 +0,0 @@
"/v2/metrics-config/":{
"get":{
"description":"Return the metrics layer configuration",
"operationId":"get_metrics_config",
"produces":[
"application/json"
],
"tags":[
"metrics"
],
"parameters":[
],
"responses":{
"200":{
"schema": {
"type":"array",
"items":{
"$ref":"#/definitions/metrics_config",
"description":"metrics Config value"
}
}
},
"default":{
"description":"unexpected error",
"schema":{
"$ref":"#/definitions/ErrorModel"
}
}
}
},
"post": {
"description":"Set the metrics layer relabel configuration",
"operationId":"set_metrics_config",
"produces":[
"application/json"
],
"tags":[
"metrics"
],
"parameters":[
{
"in":"body",
"name":"conf",
"description":"An array of relabel_config objects",
"schema": {
"type":"array",
"items":{
"$ref":"#/definitions/metrics_config",
"description":"metrics Config value"
}
}
}
],
"responses":{
"200":{
"description": "OK"
},
"default":{
"description":"unexpected error",
"schema":{
"$ref":"#/definitions/ErrorModel"
}
}
}
}
}

View File

@@ -465,7 +465,7 @@
"operations":[
{
"method":"GET",
"summary":"Retrieve the mapping of endpoint to host ID of all nodes that own tokens",
"summary":"Retrieve the mapping of endpoint to host ID",
"type":"array",
"items":{
"type":"mapper"
@@ -701,30 +701,6 @@
}
]
},
{
"path":"/storage_service/compact",
"operations":[
{
"method":"POST",
"summary":"Forces major compaction in all keyspaces",
"type":"void",
"nickname":"force_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"flush_memtables",
"description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when tables were flushed explicitly before invoking the compaction api.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/keyspace_compaction/{keyspace}",
"operations":[
@@ -752,14 +728,6 @@
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"flush_memtables",
"description":"Controls flushing of memtables before compaction (true by default). Set to \"false\" to skip automatic flushing of memtables before compaction, e.g. when tables were flushed explicitly before invoking the compaction api.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
@@ -944,21 +912,6 @@
}
]
},
{
"path":"/storage_service/flush",
"operations":[
{
"method":"POST",
"summary":"Flush all memtables in all keyspaces.",
"type":"void",
"nickname":"force_flush",
"produces":[
"application/json"
],
"parameters":[]
}
]
},
{
"path":"/storage_service/keyspace_flush/{keyspace}",
"operations":[
@@ -1161,14 +1114,6 @@
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"ranges_parallelism",
"description":"An integer specifying the number of ranges to repair in parallel by user request. If this number is bigger than the max_repair_ranges_in_parallel calculated by Scylla core, the smaller one will be used.",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
},
@@ -2001,7 +1946,7 @@
"operations":[
{
"method":"POST",
"summary":"Forces this node to recalculate versions of schema objects.",
"summary":"Reset local schema",
"type":"void",
"nickname":"reset_local_schema",
"produces":[
@@ -2165,65 +2110,6 @@
}
]
},
{
"path":"/storage_service/tombstone_gc/{keyspace}",
"operations":[
{
"method":"POST",
"summary":"Enable tombstone GC",
"type":"void",
"nickname":"enable_tombstone_gc",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"cf",
"description":"Comma-separated column family names",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
},
{
"method":"DELETE",
"summary":"Disable tombstone GC",
"type":"void",
"nickname":"disable_tombstone_gc",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"cf",
"description":"Comma-separated column family names",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/deliver_hints",
"operations":[
@@ -2542,23 +2428,7 @@
]
}
]
},
{
"path":"/storage_service/raft_topology/reload",
"operations":[
{
"method":"POST",
"summary":"Reload Raft topology state from disk.",
"type":"void",
"nickname":"reload_raft_topology_state",
"produces":[
"application/json"
],
"parameters":[
]
}
]
}
}
],
"models":{
"mapper":{
@@ -2761,7 +2631,7 @@
"description":"File creation time"
},
"generation":{
"type":"string",
"type":"long",
"description":"SSTable generation"
},
"level":{

View File

@@ -16,7 +16,7 @@
}
},
"host": "{{Host}}",
"basePath": "/",
"basePath": "/v2",
"schemes": [
"http"
],

View File

@@ -1,182 +1,182 @@
{
"apiVersion":"0.0.1",
"swaggerVersion":"1.2",
"basePath":"{{Protocol}}://{{Host}}",
"resourcePath":"/task_manager",
"produces":[
"application/json"
],
"apis":[
{
"path":"/task_manager/list_modules",
"operations":[
{
"method":"GET",
"summary":"Get all modules names",
"type":"array",
"items":{
"type":"string"
},
"nickname":"get_modules",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/task_manager/list_module_tasks/{module}",
"operations":[
{
"method":"GET",
"summary":"Get a list of tasks",
"type":"array",
"items":{
"type":"task_stats"
},
"nickname":"get_tasks",
"produces":[
"application/json"
],
"parameters":[
{
"name":"module",
"description":"The module to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"internal",
"description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"keyspace",
"description":"The keyspace to query about",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"The table to query about",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/task_manager/task_status/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Get task status",
"type":"task_status",
"nickname":"get_task_status",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/abort_task/{task_id}",
"operations":[
{
"method":"POST",
"summary":"Abort running task and its descendants",
"type":"void",
"nickname":"abort_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to abort",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/wait_task/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Wait for a task to complete",
"type":"task_status",
"nickname":"wait_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to wait for",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/task_status_recursive/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Get statuses of the task and all its descendants",
"type":"array",
"items":{
"type":"task_status"
},
"nickname":"get_task_status_recursively",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"apiVersion":"0.0.1",
"swaggerVersion":"1.2",
"basePath":"{{Protocol}}://{{Host}}",
"resourcePath":"/task_manager",
"produces":[
"application/json"
],
"apis":[
{
"path":"/task_manager/list_modules",
"operations":[
{
"method":"GET",
"summary":"Get all modules names",
"type":"array",
"items":{
"type":"string"
},
"nickname":"get_modules",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/task_manager/list_module_tasks/{module}",
"operations":[
{
"method":"GET",
"summary":"Get a list of tasks",
"type":"array",
"items":{
"type":"task_stats"
},
"nickname":"get_tasks",
"produces":[
"application/json"
],
"parameters":[
{
"name":"module",
"description":"The module to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"internal",
"description":"Boolean flag indicating whether internal tasks should be shown (false by default)",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"keyspace",
"description":"The keyspace to query about",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"The table to query about",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/task_manager/task_status/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Get task status",
"type":"task_status",
"nickname":"get_task_status",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/abort_task/{task_id}",
"operations":[
{
"method":"POST",
"summary":"Abort running task and its descendants",
"type":"void",
"nickname":"abort_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to abort",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/wait_task/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Wait for a task to complete",
"type":"task_status",
"nickname":"wait_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to wait for",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/task_status_recursive/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Get statuses of the task and all its descendants",
"type":"array",
"items":{
"type":"task_status"
},
"nickname":"get_task_status_recursively",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/task_manager/ttl",
"operations":[
{
@@ -199,96 +199,88 @@
]
}
]
}
],
"models":{
"task_stats" :{
"id": "task_stats",
"description":"A task statistics object",
"properties":{
"task_id":{
"type":"string",
"description":"The uuid of a task"
},
"state":{
"type":"string",
"enum":[
}
],
"models":{
"task_stats" :{
"id": "task_stats",
"description":"A task statistics object",
"properties":{
"task_id":{
"type":"string",
"description":"The uuid of a task"
},
"state":{
"type":"string",
"enum":[
"created",
"running",
"done",
"failed"
],
"description":"The state of a task"
},
"type":{
"type":"string",
"description":"The description of the task"
},
"scope":{
"type":"string",
"description":"The scope of the task"
},
"keyspace":{
"type":"string",
"description":"The keyspace the task is working on (if applicable)"
},
"table":{
"type":"string",
"description":"The table the task is working on (if applicable)"
},
"entity":{
"type":"string",
"description":"Task-specific entity description"
},
"sequence_number":{
"type":"long",
"description":"The running sequence number of the task"
}
}
},
"task_status":{
"id":"task_status",
"description":"A task status object",
"properties":{
"id":{
"type":"string",
"description":"The uuid of the task"
},
"type":{
"type":"string",
"description":"The description of the task"
},
"scope":{
"type":"string",
"description":"The scope of the task"
},
"state":{
],
"description":"The state of a task"
},
"type":{
"type":"string",
"description":"The description of the task"
},
"keyspace":{
"type":"string",
"description":"The keyspace the task is working on (if applicable)"
},
"table":{
"type":"string",
"description":"The table the task is working on (if applicable)"
},
"entity":{
"type":"string",
"description":"Task-specific entity description"
},
"sequence_number":{
"type":"long",
"description":"The running sequence number of the task"
}
}
},
"task_status":{
"id":"task_status",
"description":"A task status object",
"properties":{
"id":{
"type":"string",
"description":"The uuid of the task"
},
"type":{
"type":"string",
"description":"The description of the task"
},
"state":{
"type":"string",
"enum":[
"created",
"running",
"done",
"failed"
"created",
"running",
"done",
"failed"
],
"description":"The state of the task"
},
"is_abortable":{
"type":"boolean",
"description":"Boolean flag indicating whether the task can be aborted"
},
"start_time":{
"type":"datetime",
"description":"The start time of the task"
},
"end_time":{
"type":"datetime",
"description":"The end time of the task (unspecified when the task is not completed)"
},
"error":{
"type":"string",
"description":"Error string, if the task failed"
},
"parent_id":{
"description":"The state of the task"
},
"is_abortable":{
"type":"boolean",
"description":"Boolean flag indicating whether the task can be aborted"
},
"start_time":{
"type":"datetime",
"description":"The start time of the task"
},
"end_time":{
"type":"datetime",
"description":"The end time of the task (unspecified when the task is not completed)"
},
"error":{
"type":"string",
"description":"Error string, if the task failed"
},
"parent_id":{
"type":"string",
"description":"The uuid of the parent task"
},
@@ -326,12 +318,12 @@
},
"children_ids":{
"type":"array",
"items":{
"type":"string"
},
"items":{
"type":"string"
},
"description":"Task IDs of children of this task"
}
}
}
}
}
}
}
}
}

View File

@@ -1,153 +1,153 @@
{
"apiVersion":"0.0.1",
"swaggerVersion":"1.2",
"basePath":"{{Protocol}}://{{Host}}",
"resourcePath":"/task_manager_test",
"produces":[
"application/json"
],
"apis":[
{
"path":"/task_manager_test/test_module",
"operations":[
{
"method":"POST",
"summary":"Register test module in task manager",
"type":"void",
"nickname":"register_test_module",
"produces":[
"application/json"
],
"parameters":[
]
},
{
"method":"DELETE",
"summary":"Unregister test module in task manager",
"type":"void",
"nickname":"unregister_test_module",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/task_manager_test/test_task",
"operations":[
{
"method":"POST",
"summary":"Register test task",
"type":"string",
"nickname":"register_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to register",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"shard",
"description":"The shard of the task",
"required":false,
"allowMultiple":false,
"type":"long",
"paramType":"query"
},
{
"name":"parent_id",
"description":"The uuid of a parent task",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"keyspace",
"description":"The keyspace the task is working on",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"The table the task is working on",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"entity",
"description":"Task-specific entity description",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
},
{
"method":"DELETE",
"summary":"Unregister test task",
"type":"void",
"nickname":"unregister_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to register",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/task_manager_test/finish_test_task/{task_id}",
"operations":[
{
"method":"POST",
"summary":"Finish test task",
"type":"void",
"nickname":"finish_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to finish",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"error",
"description":"The error with which task fails (if it does)",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
}
]
}
"apiVersion":"0.0.1",
"swaggerVersion":"1.2",
"basePath":"{{Protocol}}://{{Host}}",
"resourcePath":"/task_manager_test",
"produces":[
"application/json"
],
"apis":[
{
"path":"/task_manager_test/test_module",
"operations":[
{
"method":"POST",
"summary":"Register test module in task manager",
"type":"void",
"nickname":"register_test_module",
"produces":[
"application/json"
],
"parameters":[
]
},
{
"method":"DELETE",
"summary":"Unregister test module in task manager",
"type":"void",
"nickname":"unregister_test_module",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/task_manager_test/test_task",
"operations":[
{
"method":"POST",
"summary":"Register test task",
"type":"string",
"nickname":"register_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to register",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"shard",
"description":"The shard of the task",
"required":false,
"allowMultiple":false,
"type":"long",
"paramType":"query"
},
{
"name":"parent_id",
"description":"The uuid of a parent task",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"keyspace",
"description":"The keyspace the task is working on",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"The table the task is working on",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"entity",
"description":"Task-specific entity description",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
},
{
"method":"DELETE",
"summary":"Unregister test task",
"type":"void",
"nickname":"unregister_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to register",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/task_manager_test/finish_test_task/{task_id}",
"operations":[
{
"method":"POST",
"summary":"Finish test task",
"type":"void",
"nickname":"finish_test_task",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to finish",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"error",
"description":"The error with which task fails (if it does)",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
}
]
}

View File

@@ -60,10 +60,8 @@ future<> set_server_init(http_context& ctx) {
rb->set_api_doc(r);
rb02->set_api_doc(r);
rb02->register_api_file(r, "swagger20_header");
rb02->register_api_file(r, "metrics");
rb->register_function(r, "system",
"The system related API");
rb02->add_definitions_file(r, "metrics");
set_system(ctx, r);
});
}
@@ -71,7 +69,7 @@ future<> set_server_init(http_context& ctx) {
future<> set_server_config(http_context& ctx, const db::config& cfg) {
auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
return ctx.http_server.set_routes([&ctx, &cfg, rb02](routes& r) {
set_config(rb02, ctx, r, cfg, false);
set_config(rb02, ctx, r, cfg);
});
}
@@ -102,16 +100,12 @@ future<> unset_rpc_controller(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_rpc_controller(ctx, r); });
}
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
return register_api(ctx, "storage_service", "The storage service API", [&ss, &group0_client] (http_context& ctx, routes& r) {
set_storage_service(ctx, r, ss, group0_client);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
return register_api(ctx, "storage_service", "The storage service API", [&ss, &g, &cdc_gs, &sys_ks] (http_context& ctx, routes& r) {
set_storage_service(ctx, r, ss, g.local(), cdc_gs, sys_ks);
});
}
future<> unset_server_storage_service(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
}
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader) {
return ctx.http_server.set_routes([&ctx, &sst_loader] (routes& r) { set_sstables_loader(ctx, r, sst_loader); });
}
@@ -193,10 +187,10 @@ future<> unset_server_messaging_service(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_messaging_service(ctx, r); });
}
future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy) {
future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss) {
return register_api(ctx, "storage_proxy",
"The storage proxy API", [&proxy] (http_context& ctx, routes& r) {
set_storage_proxy(ctx, r, proxy);
"The storage proxy API", [&ss] (http_context& ctx, routes& r) {
set_storage_proxy(ctx, r, ss);
});
}
@@ -220,10 +214,10 @@ future<> set_server_cache(http_context& ctx) {
"The cache service API", set_cache_service);
}
future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& proxy) {
future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g) {
return register_api(ctx, "hinted_handoff",
"The hinted handoff API", [&proxy] (http_context& ctx, routes& r) {
set_hinted_handoff(ctx, r, proxy);
"The hinted handoff API", [&g] (http_context& ctx, routes& r) {
set_hinted_handoff(ctx, r, g.local());
});
}

View File

@@ -22,7 +22,6 @@ namespace service {
class load_meter;
class storage_proxy;
class storage_service;
class raft_group0_client;
} // namespace service
@@ -52,6 +51,7 @@ class system_keyspace;
}
namespace netw { class messaging_service; }
class repair_service;
namespace cdc { class generation_service; }
namespace gms {
@@ -68,13 +68,15 @@ struct http_context {
sstring api_doc;
httpd::http_server_control http_server;
distributed<replica::database>& db;
distributed<service::storage_proxy>& sp;
service::load_meter& lmeter;
const sharded<locator::shared_token_metadata>& shared_token_metadata;
sharded<tasks::task_manager>& tm;
http_context(distributed<replica::database>& _db,
distributed<service::storage_proxy>& _sp,
service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm, sharded<tasks::task_manager>& _tm)
: db(_db), lmeter(_lm), shared_token_metadata(_stm), tm(_tm) {
: db(_db), sp(_sp), lmeter(_lm), shared_token_metadata(_stm), tm(_tm) {
}
const locator::token_metadata& get_token_metadata();
@@ -84,8 +86,7 @@ future<> set_server_init(http_context& ctx);
future<> set_server_config(http_context& ctx, const db::config& cfg);
future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
future<> unset_server_snitch(http_context& ctx);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
future<> unset_server_storage_service(http_context& ctx);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<gms::gossiper>& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks);
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
future<> unset_server_sstables_loader(http_context& ctx);
future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb);
@@ -105,11 +106,11 @@ future<> set_server_load_sstable(http_context& ctx, sharded<db::system_keyspace>
future<> unset_server_load_sstable(http_context& ctx);
future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
future<> unset_server_messaging_service(http_context& ctx);
future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_proxy>& proxy);
future<> set_server_storage_proxy(http_context& ctx, sharded<service::storage_service>& ss);
future<> unset_server_storage_proxy(http_context& ctx);
future<> set_server_stream_manager(http_context& ctx, sharded<streaming::stream_manager>& sm);
future<> unset_server_stream_manager(http_context& ctx);
future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& p);
future<> set_hinted_handoff(http_context& ctx, sharded<gms::gossiper>& g);
future<> unset_hinted_handoff(http_context& ctx);
future<> set_server_gossip_settle(http_context& ctx, sharded<gms::gossiper>& g);
future<> set_server_cache(http_context& ctx);

View File

@@ -11,7 +11,6 @@
#include "api/authorization_cache.hh"
#include "api/api.hh"
#include "auth/common.hh"
#include "auth/service.hh"
namespace api {
using namespace json;

View File

@@ -43,7 +43,7 @@ std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name) {
return std::make_tuple(name.substr(0, pos), name.substr(end));
}
table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
const table_id& get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
try {
return db.find_uuid(ks, cf);
} catch (replica::no_such_column_family& e) {
@@ -51,7 +51,7 @@ table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database&
}
}
table_id get_uuid(const sstring& name, const replica::database& db) {
const table_id& get_uuid(const sstring& name, const replica::database& db) {
auto [ks, cf] = parse_fully_qualified_cf_name(name);
return get_uuid(ks, cf, db);
}
@@ -135,9 +135,9 @@ static future<json::json_return_type> get_cf_histogram(http_context& ctx, const
static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
std::function<utils::ihistogram(const replica::database&)> fun = [f] (const replica::database& db) {
utils::ihistogram res;
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) mutable {
res += (table->get_stats().*f).hist;
});
for (auto i : db.get_column_families()) {
res += (i.second->get_stats().*f).hist;
}
return res;
};
return ctx.db.map(fun).then([](const std::vector<utils::ihistogram> &res) {
@@ -162,9 +162,9 @@ static future<json::json_return_type> get_cf_rate_and_histogram(http_context& c
static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
std::function<utils::rate_moving_average_and_histogram(const replica::database&)> fun = [f] (const replica::database& db) {
utils::rate_moving_average_and_histogram res;
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
res += (table->get_stats().*f).rate();
});
for (auto i : db.get_column_families()) {
res += (i.second->get_stats().*f).rate();
}
return res;
};
return ctx.db.map(fun).then([](const std::vector<utils::rate_moving_average_and_histogram> &res) {
@@ -306,21 +306,21 @@ ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared
void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace>& sys_ks) {
cf::get_column_family_name.set(r, [&ctx] (const_req req){
std::vector<sstring> res;
ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
res.push_back(kscf.first + ":" + kscf.second);
});
for (auto i: ctx.db.local().get_column_families_mapping()) {
res.push_back(i.first.first + ":" + i.first.second);
}
return res;
});
cf::get_column_family.set(r, [&ctx] (std::unique_ptr<http::request> req){
std::list<cf::column_family_info> res;
ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
std::list<cf::column_family_info> res;
for (auto i: ctx.db.local().get_column_families_mapping()) {
cf::column_family_info info;
info.ks = kscf.first;
info.cf = kscf.second;
info.ks = i.first.first;
info.cf = i.first.second;
info.type = "ColumnFamilies";
res.push_back(info);
});
}
return make_ready_future<json::json_return_type>(json::stream_range_as_array(std::move(res), std::identity()));
});
@@ -871,7 +871,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/enable_auto_compaction: name={}", req->param["name"]);
return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
@@ -883,7 +882,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/disable_auto_compaction: name={}", req->param["name"]);
return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
@@ -894,30 +892,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
});
cf::get_tombstone_gc.set(r, [&ctx] (const_req req) {
auto uuid = get_uuid(req.param["name"], ctx.db.local());
replica::table& t = ctx.db.local().find_column_family(uuid);
return t.tombstone_gc_enabled();
});
cf::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/enable_tombstone_gc: name={}", req->param["name"]);
return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
t.set_tombstone_gc_enabled(true);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
cf::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/disable_tombstone_gc: name={}", req->param["name"]);
return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
t.set_tombstone_gc_enabled(false);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
cf::get_built_indexes.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
auto&& ks = std::get<0>(ks_cf);
@@ -981,7 +955,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<http::request> req) {
sstring strategy = req->get_query_param("class_name");
apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->param["name"], strategy);
return foreach_column_family(ctx, req->param["name"], [strategy](replica::column_family& cf) {
cf.set_compaction_strategy(sstables::compaction_strategy::type(strategy));
}).then([] {
@@ -1017,12 +990,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
auto key = req->get_query_param("key");
auto uuid = get_uuid(req->param["name"], ctx.db.local());
return ctx.db.map_reduce0([key, uuid] (replica::database& db) -> future<std::unordered_set<sstring>> {
auto sstables = co_await db.find_column_family(uuid).get_sstables_by_partition_key(key);
co_return boost::copy_range<std::unordered_set<sstring>>(sstables | boost::adaptors::transformed([] (auto s) { return s->get_filename(); }));
return ctx.db.map_reduce0([key, uuid] (replica::database& db) {
return db.find_column_family(uuid).get_sstables_by_partition_key(key);
}, std::unordered_set<sstring>(),
[](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
a.merge(b);
[](std::unordered_set<sstring> a, std::unordered_set<sstring>&& b) mutable {
a.insert(b.begin(),b.end());
return a;
}).then([](const std::unordered_set<sstring>& res) {
return make_ready_future<json::json_return_type>(container_to_vec(res));
@@ -1047,31 +1019,16 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto params = req_params({
std::pair("name", mandatory::yes),
std::pair("flush_memtables", mandatory::no),
std::pair("split_output", mandatory::no),
});
params.process(*req);
if (params.get("split_output")) {
if (req->get_query_param("split_output") != "") {
fail(unimplemented::cause::API);
}
auto [ks, cf] = parse_fully_qualified_cf_name(*params.get("name"));
auto flush = params.get_as<bool>("flush_memtables").value_or(true);
apilog.info("column_family/force_major_compaction: name={} flush={}", req->param["name"], flush);
auto [ks, cf] = parse_fully_qualified_cf_name(req->param["name"]);
auto keyspace = validate_keyspace(ctx, ks);
std::vector<table_info> table_infos = {table_info{
.name = cf,
.id = ctx.db.local().find_uuid(ks, cf)
}};
std::vector<table_id> table_infos = {ctx.db.local().find_uuid(ks, cf)};
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
std::optional<major_compaction_task_impl::flush_mode> fmopt;
if (!flush) {
fmopt = major_compaction_task_impl::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), ctx.db, std::move(table_infos), fmopt);
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, std::move(table_infos));
co_await task->done();
co_return json_void();
});

View File

@@ -23,7 +23,7 @@ namespace api {
void set_column_family(http_context& ctx, httpd::routes& r, sharded<db::system_keyspace>& sys_ks);
void unset_column_family(http_context& ctx, httpd::routes& r);
table_id get_uuid(const sstring& name, const replica::database& db);
const table_id& get_uuid(const sstring& name, const replica::database& db);
future<> foreach_column_family(http_context& ctx, const sstring& name, std::function<void(replica::column_family&)> f);
@@ -68,10 +68,9 @@ struct map_reduce_column_families_locally {
std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
*res = reducer(std::move(*res), mapper(*table.get()));
return make_ready_future();
}).then([res] () {
return do_for_each(db.get_column_families(), [res, this](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
*res = reducer(std::move(*res), mapper(*i.second.get()));
}).then([res] {
return std::move(*res);
});
}

View File

@@ -68,8 +68,8 @@ void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return ctx.db.map_reduce0([](replica::database& db) {
return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
replica::table& cf = *table.get();
return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
replica::table& cf = *i.second.get();
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
return make_ready_future<>();
}).then([&tasks] {

View File

@@ -45,7 +45,7 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc
} else {
ss <<',';
};
ss << "\"/v2/config/" << name <<"\": {"
ss << "\"/config/" << name <<"\": {"
"\"get\": {"
"\"description\": \"" << boost::replace_all_copy(boost::replace_all_copy(boost::replace_all_copy(description,"\n","\\n"),"\"", "''"), "\t", " ") <<"\","
"\"operationId\": \"find_config_"<< name <<"\","
@@ -76,9 +76,9 @@ future<> get_config_swagger_entry(std::string_view name, const std::string& desc
namespace cs = httpd::config_json;
void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg, bool first) {
rb->register_function(r, [&cfg, first] (output_stream<char>& os) {
return do_with(first, [&os, &cfg] (bool& first) {
void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx, routes& r, const db::config& cfg) {
rb->register_function(r, [&cfg] (output_stream<char>& os) {
return do_with(true, [&os, &cfg] (bool& first) {
auto f = make_ready_future();
for (auto&& cfg_ref : cfg.values()) {
auto&& cfg = cfg_ref.get();

View File

@@ -13,5 +13,5 @@
namespace api {
void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg, bool first = false);
void set_config(std::shared_ptr<httpd::api_registry_builder20> rb, http_context& ctx, httpd::routes& r, const db::config& cfg);
}

View File

@@ -12,9 +12,7 @@
#include <seastar/http/exception.hh>
#include "log.hh"
#include "utils/error_injection.hh"
#include "utils/rjson.hh"
#include <seastar/core/future-util.hh>
#include <seastar/util/short_streams.hh>
namespace api {
using namespace seastar::httpd;
@@ -26,27 +24,10 @@ void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->param["injection"];
bool one_shot = req->get_query_param("one_shot") == "True";
auto params = req->content;
const size_t max_params_size = 1024 * 1024;
if (params.size() > max_params_size) {
// This is a hard limit, because we don't want to allocate
// too much memory or block the thread for too long.
throw httpd::bad_param_exception(format("Injection parameters are too long, max length is {}", max_params_size));
}
try {
auto parameters = params.empty()
? utils::error_injection_parameters{}
: rjson::parse_to_map<utils::error_injection_parameters>(params);
auto& errinj = utils::get_local_injector();
return errinj.enable_on_all(injection, one_shot, std::move(parameters)).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
} catch (const rjson::error& e) {
throw httpd::bad_param_exception(format("Failed to parse injections parameters: {}", e.what()));
}
auto& errinj = utils::get_local_injector();
return errinj.enable_on_all(injection, one_shot).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
@@ -71,13 +52,6 @@ void set_error_injection(http_context& ctx, routes& r) {
});
});
hf::message_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->param["injection"];
auto& errinj = utils::get_local_injector();
return errinj.receive_message_on_all(injection).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
}
} // namespace api

View File

@@ -18,43 +18,36 @@ namespace fd = httpd::failure_detector_json;
void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
return g.container().invoke_on(0, [] (gms::gossiper& g) {
std::vector<fd::endpoint_state> res;
res.reserve(g.num_endpoints());
g.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& eps) {
fd::endpoint_state val;
val.addrs = fmt::to_string(addr);
val.is_alive = g.is_alive(addr);
val.generation = eps.get_heart_beat_state().get_generation().value();
val.version = eps.get_heart_beat_state().get_heart_beat_version().value();
val.update_time = eps.get_update_timestamp().time_since_epoch().count();
for (const auto& [as_type, app_state] : eps.get_application_state_map()) {
fd::version_value version_val;
// We return the enum index and not it's name to stay compatible to origin
// method that the state index are static but the name can be changed.
version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(as_type);
version_val.value = app_state.value();
version_val.version = app_state.version().value();
val.application_state.push(version_val);
}
res.emplace_back(std::move(val));
});
return make_ready_future<json::json_return_type>(res);
});
std::vector<fd::endpoint_state> res;
for (auto i : g.get_endpoint_states()) {
fd::endpoint_state val;
val.addrs = fmt::to_string(i.first);
val.is_alive = i.second.is_alive();
val.generation = i.second.get_heart_beat_state().get_generation().value();
val.version = i.second.get_heart_beat_state().get_heart_beat_version().value();
val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
for (auto a : i.second.get_application_state_map()) {
fd::version_value version_val;
// We return the enum index and not it's name to stay compatible to origin
// method that the state index are static but the name can be changed.
version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
version_val.value = a.second.value();
version_val.version = a.second.version().value();
val.application_state.push(version_val);
}
res.push_back(val);
}
return make_ready_future<json::json_return_type>(res);
});
fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
return g.container().invoke_on(0, [] (gms::gossiper& g) {
int res = g.get_up_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
int res = g.get_up_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
return g.container().invoke_on(0, [] (gms::gossiper& g) {
int res = g.get_down_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
int res = g.get_down_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -62,13 +55,11 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
});
fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
return g.container().invoke_on(0, [] (gms::gossiper& g) {
std::map<sstring, sstring> nodes_status;
g.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state&) {
nodes_status.emplace(node.to_sstring(), g.is_alive(node) ? "UP" : "DOWN");
});
return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
});
std::map<sstring, sstring> nodes_status;
for (auto& entry : g.get_endpoint_states()) {
nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
}
return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
});
fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -79,15 +70,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
});
fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
auto state = g.get_endpoint_state_ptr(gms::inet_address(req->param["addr"]));
if (!state) {
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
}
std::stringstream ss;
g.append_endpoint_state(ss, *state);
return make_ready_future<json::json_return_type>(sstring(ss.str()));
});
auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
if (!state) {
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
}
std::stringstream ss;
g.append_endpoint_state(ss, *state);
return make_ready_future<json::json_return_type>(sstring(ss.str()));
});
fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {

View File

@@ -6,11 +6,8 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include "gossiper.hh"
#include "api/api-doc/gossiper.json.hh"
#include "gms/endpoint_state.hh"
#include "gms/gossiper.hh"
namespace api {
@@ -18,9 +15,9 @@ using namespace seastar::httpd;
using namespace json;
void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
auto res = co_await g.get_unreachable_members_synchronized();
co_return json::json_return_type(container_to_vec(res));
httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
auto res = g.get_unreachable_members();
return container_to_vec(res);
});
@@ -30,11 +27,9 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
});
});
httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
gms::inet_address ep(req->param["addr"]);
// synchronize unreachable_members on all shards
co_await g.get_unreachable_members_synchronized();
co_return g.get_endpoint_downtime(ep);
httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
gms::inet_address ep(req.param["addr"]);
return g.get_endpoint_downtime(ep);
});
httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<http::request> req) {
@@ -64,7 +59,7 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
gms::inet_address ep(req->param["addr"]);
return g.force_remove_endpoint(ep, gms::null_permit_id).then([] {
return g.force_remove_endpoint(ep).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});

View File

@@ -13,6 +13,7 @@
#include "api/api-doc/hinted_handoff.json.hh"
#include "gms/inet_address.hh"
#include "gms/gossiper.hh"
#include "service/storage_proxy.hh"
namespace api {
@@ -21,33 +22,38 @@ using namespace json;
using namespace seastar::httpd;
namespace hh = httpd::hinted_handoff_json;
void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
hh::create_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto parse_hosts_list = [] (sstring arg) {
void set_hinted_handoff(http_context& ctx, routes& r, gms::gossiper& g) {
hh::create_hints_sync_point.set(r, [&ctx, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto parse_hosts_list = [&g] (sstring arg) {
std::vector<sstring> hosts_str = split(arg, ",");
std::vector<gms::inet_address> hosts;
hosts.reserve(hosts_str.size());
for (const auto& host_str : hosts_str) {
try {
gms::inet_address host;
host = gms::inet_address(host_str);
hosts.push_back(host);
} catch (std::exception& e) {
throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
if (hosts_str.empty()) {
// No target_hosts specified means that we should wait for hints for all nodes to be sent
const auto members_set = g.get_live_members();
std::copy(members_set.begin(), members_set.end(), std::back_inserter(hosts));
} else {
for (const auto& host_str : hosts_str) {
try {
gms::inet_address host;
host = gms::inet_address(host_str);
hosts.push_back(host);
} catch (std::exception& e) {
throw httpd::bad_param_exception(format("Failed to parse host address {}: {}", host_str, e.what()));
}
}
}
return hosts;
};
std::vector<gms::inet_address> target_hosts = parse_hosts_list(req->get_query_param("target_hosts"));
return proxy.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
return ctx.sp.local().create_hint_sync_point(std::move(target_hosts)).then([] (db::hints::sync_point sync_point) {
return json::json_return_type(sync_point.encode());
});
});
hh::get_hints_sync_point.set(r, [&proxy] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
hh::get_hints_sync_point.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
db::hints::sync_point sync_point;
const sstring encoded = req->get_query_param("id");
try {
@@ -81,7 +87,7 @@ void set_hinted_handoff(http_context& ctx, routes& r, sharded<service::storage_p
using return_type = hh::ns_get_hints_sync_point::get_hints_sync_point_return_type;
using return_type_wrapper = hh::ns_get_hints_sync_point::return_type_wrapper;
return proxy.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
return ctx.sp.local().wait_for_hint_sync_point(std::move(sync_point), deadline).then([] {
return json::json_return_type(return_type_wrapper(return_type::DONE));
}).handle_exception_type([] (const timed_out_error&) {
return json::json_return_type(return_type_wrapper(return_type::IN_PROGRESS));

View File

@@ -8,14 +8,17 @@
#pragma once
#include <seastar/core/sharded.hh>
#include "api.hh"
namespace service { class storage_proxy; }
namespace gms {
class gossiper;
}
namespace api {
void set_hinted_handoff(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& p);
void set_hinted_handoff(http_context& ctx, httpd::routes& r, gms::gossiper& g);
void unset_hinted_handoff(http_context& ctx, httpd::routes& r);
}

View File

@@ -10,6 +10,7 @@
#include "service/storage_proxy.hh"
#include "api/api-doc/storage_proxy.json.hh"
#include "api/api-doc/utils.json.hh"
#include "service/storage_service.hh"
#include "db/config.hh"
#include "utils/histogram.hh"
#include "replica/database.hh"
@@ -115,17 +116,17 @@ utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimat
return res;
}
static future<json::json_return_type> sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
static future<json::json_return_type> sum_estimated_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
return (stats.*f).histogram();
}, utils::time_estimated_histogram_merge, utils::time_estimated_histogram()).then([](const utils::time_estimated_histogram& val) {
return make_ready_future<json::json_return_type>(time_to_json_histogram(val));
});
}
static future<json::json_return_type> sum_estimated_histogram(sharded<service::storage_proxy>& proxy, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
static future<json::json_return_type> sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(proxy, f, utils::estimated_histogram_merge,
return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
utils_json::estimated_histogram res;
res = val;
@@ -133,8 +134,8 @@ static future<json::json_return_type> sum_estimated_histogram(sharded<service::
});
}
static future<json::json_return_type> total_latency(sharded<service::storage_proxy>& proxy, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(proxy, [f] (service::storage_proxy_stats::stats& stats) {
static future<json::json_return_type> total_latency(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
return (stats.*f).hist.mean * (stats.*f).hist.count;
}, std::plus<double>(), 0.0).then([](double val) {
int64_t res = val;
@@ -183,43 +184,43 @@ sum_timer_stats_storage_proxy(distributed<proxy>& d,
});
}
void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_proxy>& proxy) {
void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_service>& ss) {
sp::get_total_hints.set(r, [](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>(0);
});
sp::get_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req) {
const auto& filter = proxy.local().get_hints_host_filter();
sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req) {
const auto& filter = ctx.sp.local().get_hints_host_filter();
return make_ready_future<json::json_return_type>(!filter.is_disabled_for_all());
});
sp::set_hinted_handoff_enabled.set(r, [&proxy](std::unique_ptr<http::request> req) {
sp::set_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto enable = req->get_query_param("enable");
auto filter = (enable == "true" || enable == "1")
? db::hints::host_filter(db::hints::host_filter::enabled_for_all_tag {})
: db::hints::host_filter(db::hints::host_filter::disabled_for_all_tag {});
return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
return sp.change_hints_host_filter(filter);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
sp::get_hinted_handoff_enabled_by_dc.set(r, [&proxy](std::unique_ptr<http::request> req) {
sp::get_hinted_handoff_enabled_by_dc.set(r, [&ctx](std::unique_ptr<http::request> req) {
std::vector<sstring> res;
const auto& filter = proxy.local().get_hints_host_filter();
const auto& filter = ctx.sp.local().get_hints_host_filter();
const auto& dcs = filter.get_dcs();
res.reserve(res.size());
std::copy(dcs.begin(), dcs.end(), std::back_inserter(res));
return make_ready_future<json::json_return_type>(res);
});
sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&proxy](std::unique_ptr<http::request> req) {
sp::set_hinted_handoff_enabled_by_dc_list.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto dcs = req->get_query_param("dcs");
auto filter = db::hints::host_filter::parse_from_dc_list(std::move(dcs));
return proxy.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
return ctx.sp.invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
return sp.change_hints_host_filter(filter);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
@@ -341,131 +342,144 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
return make_ready_future<json::json_return_type>(json_void());
});
sp::get_read_repair_attempted.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_attempts);
sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
});
sp::get_read_repair_repaired_blocking.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
});
sp::get_read_repair_repaired_background.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read_repair_repaired_background);
sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
});
sp::get_cas_read_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_timeouts);
sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req) {
return ss.local().describe_schema_versions().then([] (auto result) {
std::vector<sp::mapper_list> res;
for (auto e : result) {
sp::mapper_list entry;
entry.key = std::move(e.first);
entry.value = std::move(e.second);
res.emplace_back(std::move(entry));
}
return make_ready_future<json::json_return_type>(std::move(res));
});
});
sp::get_cas_read_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &proxy::stats::cas_read_unavailables);
sp::get_cas_read_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_timeouts);
});
sp::get_cas_write_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_timeouts);
sp::get_cas_read_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_unavailables);
});
sp::get_cas_write_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &proxy::stats::cas_write_unavailables);
sp::get_cas_write_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_timeouts);
});
sp::get_cas_write_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats(proxy, &proxy::stats::cas_write_unfinished_commit);
sp::get_cas_write_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_unavailables);
});
sp::get_cas_write_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(proxy, &proxy::stats::cas_write_contention);
sp::get_cas_write_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats(ctx.sp, &proxy::stats::cas_write_unfinished_commit);
});
sp::get_cas_write_metrics_condition_not_met.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats(proxy, &proxy::stats::cas_write_condition_not_met);
sp::get_cas_write_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(ctx, &proxy::stats::cas_write_contention);
});
sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats(proxy, &proxy::stats::cas_failed_read_round_optimization);
sp::get_cas_write_metrics_condition_not_met.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
});
sp::get_cas_read_metrics_unfinished_commit.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_stats(proxy, &proxy::stats::cas_read_unfinished_commit);
sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
});
sp::get_cas_read_metrics_contention.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(proxy, &proxy::stats::cas_read_contention);
sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
});
sp::get_read_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_timeouts);
sp::get_cas_read_metrics_contention.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(ctx, &proxy::stats::cas_read_contention);
});
sp::get_read_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::read_unavailables);
sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
});
sp::get_range_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
});
sp::get_range_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
});
sp::get_write_metrics_timeouts.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_timeouts);
sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
});
sp::get_write_metrics_unavailables.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(proxy, &service::storage_proxy_stats::stats::write_unavailables);
sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
});
sp::get_read_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_timeouts);
sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
});
sp::get_read_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::read_unavailables);
sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
});
sp::get_range_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_timeouts);
sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
});
sp::get_range_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::range_slice_unavailables);
sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
});
sp::get_write_metrics_timeouts_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_timeouts);
sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
});
sp::get_write_metrics_unavailables_rates.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(proxy, &service::storage_proxy_stats::stats::write_unavailables);
sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
});
sp::get_range_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
});
sp::get_write_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
});
sp::get_read_metrics_latency_histogram_depricated.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
});
sp::get_range_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
});
sp::get_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::write);
});
sp::get_cas_write_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats(proxy, &proxy::stats::cas_write);
sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
});
sp::get_cas_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats(proxy, &proxy::stats::cas_read);
sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
});
sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
});
sp::get_cas_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats(ctx.sp, &proxy::stats::cas_read);
});
sp::get_view_write_metrics_latency_histogram.set(r, [](std::unique_ptr<http::request> req) {
@@ -476,31 +490,31 @@ void set_storage_proxy(http_context& ctx, routes& r, sharded<service::storage_pr
return make_ready_future<json::json_return_type>(get_empty_moving_average());
});
sp::get_read_metrics_latency_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::read);
sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
});
sp::get_read_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::read);
sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::read);
});
sp::get_read_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
return total_latency(proxy, &service::storage_proxy_stats::stats::read);
sp::get_read_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
return total_latency(ctx, &service::storage_proxy_stats::stats::read);
});
sp::get_write_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(proxy, &service::storage_proxy_stats::stats::write);
sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::write);
});
sp::get_write_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
return total_latency(proxy, &service::storage_proxy_stats::stats::write);
sp::get_write_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
return total_latency(ctx, &service::storage_proxy_stats::stats::write);
});
sp::get_range_estimated_histogram.set(r, [&proxy](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(proxy, &service::storage_proxy_stats::stats::range);
sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
});
sp::get_range_latency.set(r, [&proxy](std::unique_ptr<http::request> req) {
return total_latency(proxy, &service::storage_proxy_stats::stats::range);
sp::get_range_latency.set(r, [&ctx](std::unique_ptr<http::request> req) {
return total_latency(ctx, &service::storage_proxy_stats::stats::range);
});
}
@@ -533,6 +547,7 @@ void unset_storage_proxy(http_context& ctx, routes& r) {
sp::get_read_repair_attempted.unset(r);
sp::get_read_repair_repaired_blocking.unset(r);
sp::get_read_repair_repaired_background.unset(r);
sp::get_schema_versions.unset(r);
sp::get_cas_read_timeouts.unset(r);
sp::get_cas_read_unavailables.unset(r);
sp::get_cas_write_timeouts.unset(r);

View File

@@ -11,11 +11,11 @@
#include <seastar/core/sharded.hh>
#include "api.hh"
namespace service { class storage_proxy; }
namespace service { class storage_service; }
namespace api {
void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_proxy>& proxy);
void set_storage_proxy(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss);
void unset_storage_proxy(http_context& ctx, httpd::routes& r);
}

View File

@@ -8,7 +8,6 @@
#include "storage_service.hh"
#include "api/api-doc/storage_service.json.hh"
#include "api/api-doc/storage_proxy.json.hh"
#include "db/config.hh"
#include "db/schema_tables.hh"
#include "utils/hash.hh"
@@ -43,6 +42,7 @@
#include "thrift/controller.hh"
#include "locator/token_metadata.hh"
#include "cdc/generation_service.hh"
#include "service/storage_proxy.hh"
#include "locator/abstract_replication_strategy.hh"
#include "sstables_loader.hh"
#include "db/view/view_builder.hh"
@@ -52,10 +52,22 @@ using namespace std::chrono_literals;
extern logging::logger apilog;
namespace std {
std::ostream& operator<<(std::ostream& os, const api::table_info& ti) {
fmt::print(os, "table{{name={}, id={}}}", ti.name, ti.id);
return os;
}
} // namespace std
namespace api {
const locator::token_metadata& http_context::get_token_metadata() {
return *shared_token_metadata.local().get();
}
namespace ss = httpd::storage_service_json;
namespace sp = httpd::storage_proxy_json;
using namespace json;
sstring validate_keyspace(http_context& ctx, sstring ks_name) {
@@ -208,63 +220,44 @@ seastar::future<json::json_return_type> run_toppartitions_query(db::toppartition
});
}
static future<json::json_return_type> set_tables(http_context& ctx, const sstring& keyspace, std::vector<sstring> tables, std::function<future<>(replica::table&)> set) {
future<json::json_return_type> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
if (tables.empty()) {
tables = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
return do_with(keyspace, std::move(tables), [&ctx, set] (const sstring& keyspace, const std::vector<sstring>& tables) {
return ctx.db.invoke_on_all([&keyspace, &tables, set] (replica::database& db) {
return parallel_for_each(tables, [&db, &keyspace, set] (const sstring& table) {
replica::table& t = db.find_column_family(keyspace, table);
return set(t);
});
apilog.info("set_tables_autocompaction: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
return do_with(keyspace, std::move(tables), [&ctx, enabled] (const sstring &keyspace, const std::vector<sstring>& tables) {
return ctx.db.invoke_on(0, [&ctx, &keyspace, &tables, enabled] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return ctx.db.invoke_on_all([&keyspace, &tables, enabled] (replica::database& db) {
return parallel_for_each(tables, [&db, &keyspace, enabled] (const sstring& table) {
replica::column_family& cf = db.find_column_family(keyspace, table);
if (enabled) {
cf.enable_auto_compaction();
} else {
return cf.disable_auto_compaction();
}
return make_ready_future<>();
});
}).finally([g = std::move(g)] {});
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
}
future<json::json_return_type> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
apilog.info("set_tables_autocompaction: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
return ctx.db.invoke_on(0, [&ctx, keyspace, tables = std::move(tables), enabled] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return set_tables(ctx, keyspace, tables, [enabled] (replica::table& cf) {
if (enabled) {
cf.enable_auto_compaction();
} else {
return cf.disable_auto_compaction();
}
return make_ready_future<>();
}).finally([g = std::move(g)] {});
});
}
future<json::json_return_type> set_tables_tombstone_gc(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
apilog.info("set_tables_tombstone_gc: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
return set_tables(ctx, keyspace, std::move(tables), [enabled] (replica::table& t) {
t.set_tombstone_gc_enabled(enabled);
return make_ready_future<>();
});
}
void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
ss::start_native_transport.set(r, [&ctl](std::unique_ptr<http::request> req) {
return smp::submit_to(0, [&] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.start_server();
});
return ctl.start_server();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<http::request> req) {
return smp::submit_to(0, [&] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.request_stop_server();
});
return ctl.request_stop_server();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -286,21 +279,17 @@ void unset_transport_controller(http_context& ctx, routes& r) {
}
void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
ss::stop_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<http::request> req) {
return smp::submit_to(0, [&] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.request_stop_server();
});
return ctl.request_stop_server();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<http::request> req) {
ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<http::request> req) {
return smp::submit_to(0, [&] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.start_server();
});
return ctl.start_server();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -325,7 +314,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair) {
ss::repair_async.set(r, [&ctx, &repair](std::unique_ptr<http::request> req) {
static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
"jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "ignore_nodes", "trace",
"startToken", "endToken", "ranges_parallelism"};
"startToken", "endToken" };
std::unordered_map<sstring, sstring> options_map;
for (auto o : options) {
auto s = req->get_query_param(o);
@@ -470,21 +459,29 @@ static future<json::json_return_type> describe_ring_as_json(sharded<service::sto
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring(keyspace), token_range_endpoints_to_json));
}
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
ss::local_hostid.set(r, [&ss](std::unique_ptr<http::request> req) {
auto id = ss.local().get_token_metadata().get_my_id();
static std::vector<table_id> get_table_ids(const std::vector<table_info>& table_infos) {
std::vector<table_id> table_ids{table_infos.size()};
boost::transform(table_infos, table_ids.begin(), [] (const auto& ti) {
return ti.id;
});
return table_ids;
}
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ks) {
ss::local_hostid.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto id = ctx.db.local().get_config().host_id;
return make_ready_future<json::json_return_type>(id.to_sstring());
});
ss::get_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
ss::get_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().sorted_tokens(), [](const dht::token& i) {
return fmt::to_string(i);
}));
});
ss::get_node_tokens.set(r, [&ss] (std::unique_ptr<http::request> req) {
ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<http::request> req) {
gms::inet_address addr(req->param["endpoint"]);
return make_ready_future<json::json_return_type>(stream_range_as_array(ss.local().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().get_tokens(addr), [](const dht::token& i) {
return fmt::to_string(i);
}));
});
@@ -552,8 +549,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
});
ss::get_leaving_nodes.set(r, [&ss](const_req req) {
return container_to_vec(ss.local().get_token_metadata().get_leaving_endpoints());
ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
return container_to_vec(ctx.get_token_metadata().get_leaving_endpoints());
});
ss::get_moving_nodes.set(r, [](const_req req) {
@@ -561,8 +558,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return container_to_vec(addr);
});
ss::get_joining_nodes.set(r, [&ss](const_req req) {
auto points = ss.local().get_token_metadata().get_bootstrap_tokens();
ss::get_joining_nodes.set(r, [&ctx](const_req req) {
auto points = ctx.get_token_metadata().get_bootstrap_tokens();
std::unordered_set<sstring> addr;
for (auto i: points) {
addr.insert(fmt::to_string(i.second));
@@ -622,7 +619,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::describe_any_ring.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) {
// Find an arbitrary non-system keyspace.
auto keyspaces = ctx.db.local().get_non_local_vnode_based_strategy_keyspaces();
auto keyspaces = ctx.db.local().get_non_local_strategy_keyspaces();
if (keyspaces.empty()) {
throw std::runtime_error("No keyspace provided and no non system kespace exist");
}
@@ -634,9 +631,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return describe_ring_as_json(ss, validate_keyspace(ctx, req->param));
});
ss::get_host_id_map.set(r, [&ss](const_req req) {
ss::get_host_id_map.set(r, [&ctx](const_req req) {
std::vector<ss::mapper> res;
return map_to_key_value(ss.local().get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
return map_to_key_value(ctx.get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
});
ss::get_load.set(r, [&ctx](std::unique_ptr<http::request> req) {
@@ -656,9 +653,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
});
ss::get_current_generation_number.set(r, [&ss](std::unique_ptr<http::request> req) {
ss::get_current_generation_number.set(r, [&g](std::unique_ptr<http::request> req) {
gms::inet_address ep(utils::fb_utilities::get_broadcast_address());
return ss.local().gossiper().get_current_generation_number(ep).then([](gms::generation_type res) {
return g.get_current_generation_number(ep).then([](gms::generation_type res) {
return make_ready_future<json::json_return_type>(res.value());
});
});
@@ -669,58 +666,23 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
req.get_query_param("key")));
});
ss::cdc_streams_check_and_repair.set(r, [&ss] (std::unique_ptr<http::request> req) {
return ss.invoke_on(0, [] (service::storage_service& ss) {
return ss.check_and_repair_cdc_streams();
}).then([] {
ss::cdc_streams_check_and_repair.set(r, [&cdc_gs] (std::unique_ptr<http::request> req) {
if (!cdc_gs.local_is_initialized()) {
throw std::runtime_error("get_cdc_generation_service: not initialized yet");
}
return cdc_gs.local().check_and_repair_cdc_streams().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::force_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto params = req_params({
std::pair("flush_memtables", mandatory::no),
});
params.process(*req);
auto flush = params.get_as<bool>("flush_memtables").value_or(true);
apilog.info("force_compaction: flush={}", flush);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<major_compaction_task_impl::flush_mode> fmopt;
if (!flush) {
fmopt = major_compaction_task_impl::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt);
try {
co_await task->done();
} catch (...) {
apilog.error("force_compaction failed: {}", std::current_exception());
throw;
}
co_return json_void();
});
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto params = req_params({
std::pair("keyspace", mandatory::yes),
std::pair("cf", mandatory::no),
std::pair("flush_memtables", mandatory::no),
});
params.process(*req);
auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
auto flush = params.get_as<bool>("flush_memtables").value_or(true);
apilog.debug("force_keyspace_compaction: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
auto keyspace = validate_keyspace(ctx, req->param);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, table_infos);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<major_compaction_task_impl::flush_mode> fmopt;
if (!flush) {
fmopt = major_compaction_task_impl::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
try {
co_await task->done();
} catch (...) {
@@ -743,7 +705,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos);
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos));
try {
co_await task->done();
} catch (...) {
@@ -758,7 +720,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
bool res = false;
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, res);
auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, get_table_ids(table_infos), res);
try {
co_await task->done();
} catch (...) {
@@ -776,7 +738,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, get_table_ids(table_infos), exclude_current_version);
try {
co_await task->done();
} catch (...) {
@@ -787,14 +749,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
co_return json::json_return_type(0);
}));
ss::force_flush.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
apilog.info("flush all tables");
co_await ctx.db.invoke_on_all([] (replica::database& db) {
return db.flush_all_tables();
});
co_return json_void();
});
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
@@ -825,16 +779,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::remove_node.set(r, [&ss](std::unique_ptr<http::request> req) {
auto host_id = validate_host_id(req->get_query_param("host_id"));
std::vector<sstring> ignore_nodes_strs = utils::split_comma_separated_list(req->get_query_param("ignore_nodes"));
std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
for (const sstring& n : ignore_nodes_strs) {
for (std::string n : ignore_nodes_strs) {
try {
auto hoep = locator::host_id_or_endpoint(n);
if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
std::replace(n.begin(), n.end(), '\"', ' ');
std::replace(n.begin(), n.end(), '\'', ' ');
boost::trim_all(n);
if (!n.empty()) {
auto hoep = locator::host_id_or_endpoint(n);
if (!ignore_nodes.empty() && hoep.has_host_id() != ignore_nodes.front().has_host_id()) {
throw std::runtime_error("All nodes should be identified using the same method: either Host IDs or ip addresses.");
}
ignore_nodes.push_back(std::move(hoep));
}
ignore_nodes.push_back(std::move(hoep));
} catch (...) {
throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}: {}", ignore_nodes_strs, n, std::current_exception()));
}
@@ -947,11 +906,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return make_ready_future<json::json_return_type>(json_void());
});
ss::is_initialized.set(r, [&ss](std::unique_ptr<http::request> req) {
return ss.local().get_operation_mode().then([&ss] (auto mode) {
ss::is_initialized.set(r, [&ss, &g](std::unique_ptr<http::request> req) {
return ss.local().get_operation_mode().then([&g] (auto mode) {
bool is_initialized = mode >= service::storage_service::mode::STARTING;
if (mode == service::storage_service::mode::NORMAL) {
is_initialized = ss.local().gossiper().is_enabled();
is_initialized = g.is_enabled();
}
return make_ready_future<json::json_return_type>(is_initialized);
});
@@ -1020,9 +979,10 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ks.set_incremental_backups(value);
}
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
table->set_incremental_backups(value);
});
for (auto& pair: db.get_column_families()) {
auto cf_ptr = pair.second;
cf_ptr->set_incremental_backups(value);
}
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1063,11 +1023,13 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return make_ready_future<json::json_return_type>(res);
});
ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
ss::reset_local_schema.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
// FIXME: We should truncate schema tables if more than one node in the cluster.
auto& fs = ctx.sp.local().features();
apilog.info("reset_local_schema");
co_await ss.local().reload_schema();
co_return json_void();
return db::schema_tables::recalculate_schema_version(sys_ks, ctx.sp, fs).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::set_trace_probability.set(r, [](std::unique_ptr<http::request> req) {
@@ -1149,22 +1111,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return set_tables_autocompaction(ctx, keyspace, tables, false);
});
ss::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
return set_tables_tombstone_gc(ctx, keyspace, tables, true);
});
ss::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
return set_tables_tombstone_gc(ctx, keyspace, tables, false);
});
ss::deliver_hints.set(r, [](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
@@ -1172,12 +1118,12 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return make_ready_future<json::json_return_type>(json_void());
});
ss::get_cluster_name.set(r, [&ss](const_req req) {
return ss.local().gossiper().get_cluster_name();
ss::get_cluster_name.set(r, [&g](const_req req) {
return g.get_cluster_name();
});
ss::get_partitioner_name.set(r, [&ss](const_req req) {
return ss.local().gossiper().get_partitioner_name();
ss::get_partitioner_name.set(r, [&g](const_req req) {
return g.get_partitioner_name();
});
ss::get_tombstone_warn_threshold.set(r, [](std::unique_ptr<http::request> req) {
@@ -1295,7 +1241,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto& ext = db.get_config().extensions();
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
for (auto& t : db.get_column_families() | boost::adaptors::map_values) {
auto& schema = t->schema();
if ((ks.empty() || ks == schema->ks_name()) && (cf.empty() || cf == schema->cf_name())) {
// at most Nsstables long
@@ -1311,7 +1257,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::sstable info;
info.timestamp = t;
info.generation = fmt::to_string(sstable->generation());
info.generation = sstables::generation_value(sstable->generation());
info.level = sstable->get_sstable_level();
info.size = sstable->bytes_on_disk();
info.data_size = sstable->ondisk_data_size();
@@ -1376,7 +1322,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
}
res.emplace_back(std::move(tst));
}
});
}
std::sort(res.begin(), res.end(), [](const ss::table_sstables& t1, const ss::table_sstables& t2) {
return t1.keyspace() < t2.keyspace() || (t1.keyspace() == t2.keyspace() && t1.table() < t2.table());
});
@@ -1386,125 +1332,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
});
});
ss::reload_raft_topology_state.set(r,
[&ss, &group0_client] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
co_await ss.invoke_on(0, [&group0_client] (service::storage_service& ss) -> future<> {
apilog.info("Waiting for group 0 read/apply mutex before reloading Raft topology state...");
auto holder = co_await group0_client.hold_read_apply_mutex();
apilog.info("Reloading Raft topology state");
// Using topology_transition() instead of topology_state_load(), because the former notifies listeners
co_await ss.topology_transition();
apilog.info("Reloaded Raft topology state");
});
co_return json_void();
});
sp::get_schema_versions.set(r, [&ss](std::unique_ptr<http::request> req) {
return ss.local().describe_schema_versions().then([] (auto result) {
std::vector<sp::mapper_list> res;
for (auto e : result) {
sp::mapper_list entry;
entry.key = std::move(e.first);
entry.value = std::move(e.second);
res.emplace_back(std::move(entry));
}
return make_ready_future<json::json_return_type>(std::move(res));
});
});
}
// Unregisters every storage_service (ss::) and storage_proxy (sp::) REST
// endpoint previously installed by set_storage_service(), removing the
// handlers from the routing table `r`.
// Note: `ctx` is accepted for signature symmetry with set_storage_service()
// but is not used here.
void unset_storage_service(http_context& ctx, routes& r) {
ss::local_hostid.unset(r);
ss::get_tokens.unset(r);
ss::get_node_tokens.unset(r);
ss::get_commitlog.unset(r);
ss::get_token_endpoint.unset(r);
ss::toppartitions_generic.unset(r);
ss::get_leaving_nodes.unset(r);
ss::get_moving_nodes.unset(r);
ss::get_joining_nodes.unset(r);
ss::get_release_version.unset(r);
ss::get_scylla_release_version.unset(r);
ss::get_schema_version.unset(r);
ss::get_all_data_file_locations.unset(r);
ss::get_saved_caches_location.unset(r);
ss::get_range_to_endpoint_map.unset(r);
ss::get_pending_range_to_endpoint_map.unset(r);
ss::describe_any_ring.unset(r);
ss::describe_ring.unset(r);
ss::get_host_id_map.unset(r);
ss::get_load.unset(r);
ss::get_load_map.unset(r);
ss::get_current_generation_number.unset(r);
ss::get_natural_endpoints.unset(r);
ss::cdc_streams_check_and_repair.unset(r);
ss::force_compaction.unset(r);
ss::force_keyspace_compaction.unset(r);
ss::force_keyspace_cleanup.unset(r);
ss::perform_keyspace_offstrategy_compaction.unset(r);
ss::upgrade_sstables.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
ss::get_removal_status.unset(r);
ss::force_remove_completion.unset(r);
ss::set_logging_level.unset(r);
ss::get_logging_levels.unset(r);
ss::get_operation_mode.unset(r);
ss::is_starting.unset(r);
ss::get_drain_progress.unset(r);
ss::drain.unset(r);
ss::truncate.unset(r);
ss::get_keyspaces.unset(r);
ss::stop_gossiping.unset(r);
ss::start_gossiping.unset(r);
ss::is_gossip_running.unset(r);
ss::stop_daemon.unset(r);
ss::is_initialized.unset(r);
ss::join_ring.unset(r);
ss::is_joined.unset(r);
ss::set_stream_throughput_mb_per_sec.unset(r);
ss::get_stream_throughput_mb_per_sec.unset(r);
ss::get_compaction_throughput_mb_per_sec.unset(r);
ss::set_compaction_throughput_mb_per_sec.unset(r);
ss::is_incremental_backups_enabled.unset(r);
ss::set_incremental_backups_enabled.unset(r);
ss::rebuild.unset(r);
ss::bulk_load.unset(r);
ss::bulk_load_async.unset(r);
ss::reschedule_failed_deletions.unset(r);
ss::sample_key_range.unset(r);
ss::reset_local_schema.unset(r);
ss::set_trace_probability.unset(r);
ss::get_trace_probability.unset(r);
ss::get_slow_query_info.unset(r);
ss::set_slow_query.unset(r);
ss::enable_auto_compaction.unset(r);
ss::disable_auto_compaction.unset(r);
ss::enable_tombstone_gc.unset(r);
ss::disable_tombstone_gc.unset(r);
ss::deliver_hints.unset(r);
ss::get_cluster_name.unset(r);
ss::get_partitioner_name.unset(r);
ss::get_tombstone_warn_threshold.unset(r);
ss::set_tombstone_warn_threshold.unset(r);
ss::get_tombstone_failure_threshold.unset(r);
ss::set_tombstone_failure_threshold.unset(r);
ss::get_batch_size_failure_threshold.unset(r);
ss::set_batch_size_failure_threshold.unset(r);
ss::set_hinted_handoff_throttle_in_kb.unset(r);
ss::get_metrics_load.unset(r);
ss::get_exceptions.unset(r);
ss::get_total_hints_in_progress.unset(r);
ss::get_total_hints.unset(r);
ss::get_ownership.unset(r);
ss::get_effective_ownership.unset(r);
ss::sstable_info.unset(r);
ss::reload_raft_topology_state.unset(r);
sp::get_schema_versions.unset(r);
}
enum class scrub_status {
@@ -1667,12 +1494,27 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
throw httpd::bad_param_exception(fmt::format("Unknown argument for 'quarantine_mode' parameter: {}", quarantine_mode_str));
}
sstables::compaction_stats stats;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<scrub_sstables_compaction_task_impl>({}, std::move(keyspace), db, column_families, opts, stats);
const auto& reduce_compaction_stats = [] (const compaction_manager::compaction_stats_opt& lhs, const compaction_manager::compaction_stats_opt& rhs) {
sstables::compaction_stats stats{};
stats += lhs.value();
stats += rhs.value();
return stats;
};
try {
co_await task->done();
if (stats.validation_errors) {
auto opt_stats = co_await db.map_reduce0([&] (replica::database& db) {
return map_reduce(column_families, [&] (sstring cfname) -> future<std::optional<sstables::compaction_stats>> {
auto& cm = db.get_compaction_manager();
auto& cf = db.find_column_family(keyspace, cfname);
sstables::compaction_stats stats{};
co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
auto r = co_await cm.perform_sstable_scrub(ts, opts);
stats += r.value_or(sstables::compaction_stats{});
});
co_return stats;
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
if (opt_stats && opt_stats->validation_errors) {
co_return json::json_return_type(static_cast<int>(scrub_status::validation_errors));
}
} catch (const sstables::compaction_aborted_exception&) {

View File

@@ -25,6 +25,7 @@ class system_keyspace;
}
namespace netw { class messaging_service; }
class repair_service;
namespace cdc { class generation_service; }
class sstables_loader;
namespace gms {
@@ -50,6 +51,11 @@ sstring validate_keyspace(http_context& ctx, const httpd::parameters& param);
// If the parameter is found and empty, returns a list of all table names in the keyspace.
std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
// Identifies a single table by both its name and its unique id.
// Produced by parse_table_infos() declared below; printable via the
// std::ostream operator<< declared at the end of this header.
struct table_info {
// table (column family) name
sstring name;
// unique table id
table_id id;
};
// splits a request parameter assumed to hold a comma-separated list of table names
// verify that the tables are found, otherwise a bad_param_exception exception is thrown
// containing the description of the respective no_such_column_family error.
@@ -57,8 +63,7 @@ std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, con
// if the parameter is not found or is empty, returns a list of all table infos in the keyspace.
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
void unset_storage_service(http_context& ctx, httpd::routes& r);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ls);
void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
void unset_sstables_loader(http_context& ctx, httpd::routes& r);
void set_view_builder(http_context& ctx, httpd::routes& r, sharded<db::view::view_builder>& vb);
@@ -74,3 +79,9 @@ void unset_snapshot(http_context& ctx, httpd::routes& r);
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);
} // namespace api
namespace std {
std::ostream& operator<<(std::ostream& os, const api::table_info& ti);
} // namespace std

View File

@@ -7,18 +7,10 @@
*/
#include "api/api-doc/system.json.hh"
#include "api/api-doc/metrics.json.hh"
#include "api/api.hh"
#include <seastar/core/reactor.hh>
#include <seastar/core/metrics_api.hh>
#include <seastar/core/relabel_config.hh>
#include <seastar/http/exception.hh>
#include <seastar/util/short_streams.hh>
#include <seastar/http/short_streams.hh>
#include "utils/rjson.hh"
#include "log.hh"
#include "replica/database.hh"
@@ -28,77 +20,8 @@ namespace api {
using namespace seastar::httpd;
namespace hs = httpd::system_json;
namespace hm = httpd::metrics_json;
void set_system(http_context& ctx, routes& r) {
hm::get_metrics_config.set(r, [](const_req req) {
std::vector<hm::metrics_config> res;
res.resize(seastar::metrics::get_relabel_configs().size());
size_t i = 0;
for (auto&& r : seastar::metrics::get_relabel_configs()) {
res[i].action = r.action;
res[i].target_label = r.target_label;
res[i].replacement = r.replacement;
res[i].separator = r.separator;
res[i].source_labels = r.source_labels;
res[i].regex = r.expr.str();
i++;
}
return res;
});
hm::set_metrics_config.set(r, [](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
rapidjson::Document doc;
doc.Parse(req->content.c_str());
if (!doc.IsArray()) {
throw bad_param_exception("Expected a json array");
}
std::vector<seastar::metrics::relabel_config> relabels;
relabels.resize(doc.Size());
for (rapidjson::SizeType i = 0; i < doc.Size(); i++) {
const auto& element = doc[i];
if (element.HasMember("source_labels")) {
std::vector<std::string> source_labels;
source_labels.resize(element["source_labels"].Size());
for (size_t j = 0; j < element["source_labels"].Size(); j++) {
source_labels[j] = element["source_labels"][j].GetString();
}
relabels[i].source_labels = source_labels;
}
if (element.HasMember("action")) {
relabels[i].action = seastar::metrics::relabel_config_action(element["action"].GetString());
}
if (element.HasMember("replacement")) {
relabels[i].replacement = element["replacement"].GetString();
}
if (element.HasMember("separator")) {
relabels[i].separator = element["separator"].GetString();
}
if (element.HasMember("target_label")) {
relabels[i].target_label = element["target_label"].GetString();
}
if (element.HasMember("regex")) {
relabels[i].expr = element["regex"].GetString();
}
}
return do_with(std::move(relabels), false, [](const std::vector<seastar::metrics::relabel_config>& relabels, bool& failed) {
return smp::invoke_on_all([&relabels, &failed] {
return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
if (result.metrics_relabeled_due_to_collision > 0) {
failed = true;
}
return;
});
}).then([&failed](){
if (failed) {
throw bad_param_exception("conflicts found during relabeling");
}
return make_ready_future<json::json_return_type>(seastar::json::json_void());
});
});
});
hs::get_system_uptime.set(r, [](const_req req) {
return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
});

View File

@@ -44,7 +44,6 @@ struct task_stats {
: task_id(task->id().to_sstring())
, state(task->get_status().state)
, type(task->type())
, scope(task->get_status().scope)
, keyspace(task->get_status().keyspace)
, table(task->get_status().table)
, entity(task->get_status().entity)
@@ -54,7 +53,6 @@ struct task_stats {
sstring task_id;
tasks::task_manager::task_state state;
std::string type;
std::string scope;
std::string keyspace;
std::string table;
std::string entity;
@@ -71,7 +69,6 @@ tm::task_status make_status(full_task_status status) {
tm::task_status res{};
res.id = status.task_status.id.to_sstring();
res.type = status.type;
res.scope = status.task_status.scope;
res.state = status.task_status.state;
res.is_abortable = bool(status.abortable);
res.start_time = st;
@@ -122,12 +119,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
auto internal = tasks::is_internal{req_param<bool>(*req, "internal", false)};
std::vector<chunked_stats> res = co_await ctx.tm.map([&req, internal] (tasks::task_manager& tm) {
chunked_stats local_res;
tasks::task_manager::module_ptr module;
try {
module = tm.find_module(req->param["module"]);
} catch (...) {
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}
auto module = tm.find_module(req->param["module"]);
const auto& filtered_tasks = module->get_tasks() | boost::adaptors::filtered([&params = req->query_parameters, internal] (const auto& task) {
return (internal || !task.second->is_internal()) && filter_tasks(task.second, params);
});
@@ -158,52 +150,37 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
tm::get_task_status.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
tasks::task_manager::foreign_task_ptr task;
try {
task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
if (task->is_complete()) {
task->unregister_task();
}
co_return std::move(task);
}));
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
auto state = task->get_status().state;
if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
task->unregister_task();
}
co_return std::move(task);
}));
auto s = co_await retrieve_status(task);
co_return make_status(s);
});
tm::abort_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
try {
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
if (!task->is_abortable()) {
co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
}
co_await task->abort();
});
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
if (!task->is_abortable()) {
co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
}
co_await task->abort();
});
co_return json_void();
});
tm::wait_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
tasks::task_manager::foreign_task_ptr task;
try {
task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
return task->done().then_wrapped([task] (auto f) {
task->unregister_task();
// done() is called only because we want the task to be complete before getting its status.
// The future should be ignored here as the result does not matter.
f.ignore_ready_future();
return make_foreign(task);
});
}));
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
return task->done().then_wrapped([task] (auto f) {
task->unregister_task();
f.get();
return make_foreign(task);
});
}));
auto s = co_await retrieve_status(task);
co_return make_status(s);
});
@@ -214,26 +191,22 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
std::queue<tasks::task_manager::foreign_task_ptr> q;
utils::chunked_vector<full_task_status> res;
tasks::task_manager::foreign_task_ptr task;
try {
// Get requested task.
task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
if (task->is_complete()) {
task->unregister_task();
}
co_return task;
}));
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
// Get requested task.
auto task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
auto state = task->get_status().state;
if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
task->unregister_task();
}
co_return task;
}));
// Push children's statuses in BFS order.
q.push(co_await task.copy()); // Task cannot be moved since we need it to be alive during whole loop execution.
while (!q.empty()) {
auto& current = q.front();
res.push_back(co_await retrieve_status(current));
for (auto& child: current->get_children()) {
q.push(co_await child.copy());
for (size_t i = 0; i < current->get_children().size(); ++i) {
q.push(co_await current->get_children()[i].copy());
}
q.pop();
}
@@ -255,11 +228,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
tm::get_and_update_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
uint32_t ttl = cfg.task_ttl_seconds();
try {
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
} catch (...) {
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
co_return json::json_return_type(ttl);
});
}

View File

@@ -71,14 +71,10 @@ void set_task_manager_test(http_context& ctx, routes& r) {
tmt::unregister_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->query_parameters["task_id"]}};
try {
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
tasks::test_task test_task{task};
co_await test_task.unregister_task();
});
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
tasks::test_task test_task{task};
co_await test_task.unregister_task();
});
co_return json_void();
});
@@ -88,19 +84,15 @@ void set_task_manager_test(http_context& ctx, routes& r) {
bool fail = it != req->query_parameters.end();
std::string error = fail ? it->second : "";
try {
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
tasks::test_task test_task{task};
if (fail) {
test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
} else {
test_task.finish();
}
return make_ready_future<>();
});
} catch (tasks::task_manager::task_not_found& e) {
throw bad_param_exception(e.what());
}
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
tasks::test_task test_task{task};
if (fail) {
test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
} else {
test_task.finish();
}
return make_ready_future<>();
});
co_return json_void();
});
}

View File

@@ -7,7 +7,6 @@ target_sources(scylla_auth
allow_all_authorizer.cc
authenticated_user.cc
authenticator.cc
certificate_authenticator.cc
common.cc
default_authorizer.cc
password_authenticator.cc
@@ -31,7 +30,6 @@ target_link_libraries(scylla_auth
PRIVATE
cql3
idl
wasmtime_bindings
libxcrypt::libxcrypt)
wasmtime_bindings)
add_whole_archive(auth scylla_auth)

View File

@@ -35,9 +35,16 @@ public:
///
authenticated_user() = default;
explicit authenticated_user(std::string_view name);
friend bool operator==(const authenticated_user&, const authenticated_user&) noexcept = default;
};
/// Two authenticated users compare equal iff their names compare equal.
inline bool operator==(const authenticated_user& lhs, const authenticated_user& rhs) noexcept {
    return lhs.name == rhs.name;
}

/// Inequality is defined as the negation of equality.
inline bool operator!=(const authenticated_user& lhs, const authenticated_user& rhs) noexcept {
    return !(lhs == rhs);
}
const authenticated_user& anonymous_user() noexcept;
inline bool is_anonymous(const authenticated_user& u) noexcept {

View File

@@ -18,7 +18,3 @@
const sstring auth::authenticator::USERNAME_KEY("username");
const sstring auth::authenticator::PASSWORD_KEY("password");
future<std::optional<auth::authenticated_user>> auth::authenticator::authenticate(session_dn_func) const {
return make_ready_future<std::optional<auth::authenticated_user>>(std::nullopt);
}

View File

@@ -15,8 +15,6 @@
#include <set>
#include <stdexcept>
#include <unordered_map>
#include <optional>
#include <functional>
#include <seastar/core/enum.hh>
#include <seastar/core/future.hh>
@@ -38,16 +36,6 @@ namespace auth {
class authenticated_user;
// Query alt name info as a single (subject style) string
using alt_name_func = std::function<future<std::string>()>;
struct certificate_info {
std::string subject;
alt_name_func get_alt_names;
};
using session_dn_func = std::function<future<std::optional<certificate_info>>()>;
///
/// Abstract client for authenticating role identity.
///
@@ -99,13 +87,6 @@ public:
///
virtual future<authenticated_user> authenticate(const credentials_map& credentials) const = 0;
///
/// Authenticate (early) using transport info
///
/// \returns nullopt if not supported/required. exceptional future if failed
///
virtual future<std::optional<authenticated_user>> authenticate(session_dn_func) const;
///
/// Create an authentication record for a new user. This is required before the user can log-in.
///

View File

@@ -39,6 +39,10 @@ inline bool operator==(const permission_details& pd1, const permission_details&
== std::forward_as_tuple(pd2.role_name, pd2.resource, pd2.permissions.mask());
}
inline bool operator!=(const permission_details& pd1, const permission_details& pd2) {
return !(pd1 == pd2);
}
inline bool operator<(const permission_details& pd1, const permission_details& pd2) {
return std::forward_as_tuple(pd1.role_name, pd1.resource, pd1.permissions)
< std::forward_as_tuple(pd2.role_name, pd2.resource, pd2.permissions);

View File

@@ -1,181 +0,0 @@
/*
* Copyright (C) 2022-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include "auth/certificate_authenticator.hh"
#include <regex>
#include "utils/class_registrator.hh"
#include "data_dictionary/data_dictionary.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
// Fully-qualified name under which this authenticator is registered and
// selected from the configuration.
static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);
static logging::logger clogger("certificate_authenticator");
// Keys and recognized "source" values of each entry in the
// auth_certificate_role_queries configuration option.
static const std::string cfg_source_attr = "source";
static const std::string cfg_query_attr = "query";
static const std::string cfg_source_subject = "SUBJECT";
static const std::string cfg_source_altname = "ALTNAME";
// Self-registration with the authenticator class registry so this
// implementation can be instantiated by name.
static const class_registrator<auth::authenticator
, auth::certificate_authenticator
, cql3::query_processor&
, ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
// Which certificate field a role-extraction regex is applied to.
enum class auth::certificate_authenticator::query_source {
subject, altname
};
// Builds the role-extraction query list from the
// auth_certificate_role_queries configuration option.
//
// Each configured map must hold exactly two keys:
//   "source" - SUBJECT or ALTNAME (matched case-insensitively), and
//   "query"  - a regular expression with exactly one mark (capture)
//              group; the group's match becomes the role name.
// Any malformed entry aborts construction with std::invalid_argument,
// so misconfiguration is reported at startup.
auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::migration_manager&)
: _queries([&] {
auto& conf = qp.db().get_config();
auto queries = conf.auth_certificate_role_queries();
if (queries.empty()) {
throw std::invalid_argument("No role extraction queries specified.");
}
std::vector<std::pair<query_source, boost::regex>> res;
for (auto& map : queries) {
// first, check for any invalid config keys
if (map.size() == 2) {
try {
auto& source = map.at(cfg_source_attr);
std::string query = map.at(cfg_query_attr);
// normalize "source" for the case-insensitive comparisons below
std::transform(source.begin(), source.end(), source.begin(), ::toupper);
boost::regex ex(query);
if (ex.mark_count() != 1) {
throw std::invalid_argument("Role query must have exactly one mark expression");
}
clogger.debug("Append role query: {} : {}", source, query);
if (source == cfg_source_subject) {
res.emplace_back(query_source::subject, std::move(ex));
} else if (source == cfg_source_altname) {
res.emplace_back(query_source::altname, std::move(ex));
} else {
throw std::invalid_argument(fmt::format("Invalid source: {}", map.at(cfg_source_attr)));
}
continue;
} catch (std::out_of_range&) {
// just fallthrough: a missing key is reported as an invalid query below
} catch (std::regex_error&) {
std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
}
}
throw std::invalid_argument(fmt::format("Invalid query: {}", map));
}
return res;
}())
{}
auth::certificate_authenticator::~certificate_authenticator() = default;
// Nothing to initialize or tear down: all state is built in the constructor.
future<> auth::certificate_authenticator::start() {
co_return;
}
future<> auth::certificate_authenticator::stop() {
co_return;
}
// Name used to select this authenticator in the configuration.
std::string_view auth::certificate_authenticator::qualified_java_name() const {
return certificate_authenticator_name;
}
// Clients must always authenticate when this authenticator is active.
bool auth::certificate_authenticator::require_authentication() const {
return true;
}
// No per-role authentication options are supported or alterable.
auth::authentication_option_set auth::certificate_authenticator::supported_options() const {
return {};
}
auth::authentication_option_set auth::certificate_authenticator::alterable_options() const {
return {};
}
// Early (transport-level) authentication: derive the user name from the
// client certificate rather than from SASL credentials.
//
// Each configured (source, regex) pair is tried in order against the
// chosen certificate field - the subject DN, or the subject alternative
// names (fetched lazily, at most once). The first expression that matches
// wins, and its single capture group becomes the user name.
//
// Throws exceptions::authentication_exception if no certificate info is
// available, or if no configured expression matches.
future<std::optional<auth::authenticated_user>> auth::certificate_authenticator::authenticate(session_dn_func f) const {
    if (!f) {
        // Transport provided no certificate query hook; nothing to do here.
        co_return std::nullopt;
    }
    auto dninfo = co_await f();
    if (!dninfo) {
        throw exceptions::authentication_exception("No valid certificate found");
    }
    auto& subject = dninfo->subject;
    // Resolved lazily, only if some query targets the alt names.
    std::optional<std::string> altname;
    for (auto& [source, expr] : _queries) {
        const std::string* candidate = &subject;
        if (source == query_source::altname) {
            if (!altname) {
                altname = dninfo->get_alt_names ? co_await dninfo->get_alt_names() : std::string{};
            }
            candidate = &*altname;
        }
        clogger.debug("Checking {}: {}", int(source), *candidate);
        boost::smatch m;
        if (boost::regex_search(*candidate, m, expr)) {
            // The constructor guarantees exactly one mark expression.
            auto username = m[1].str();
            clogger.debug("Return username: {}", username);
            co_return username;
        }
    }
    throw exceptions::authentication_exception(format("Subject '{}'/'{}' does not match any query expression", subject, altname));
}
// Credential-map (e.g. username/password) authentication is not supported;
// only certificate-based authentication is. Note the exception is thrown
// synchronously, before any future is produced.
future<auth::authenticated_user> auth::certificate_authenticator::authenticate(const credentials_map&) const {
    throw exceptions::authentication_exception("Cannot authenticate using attribute map");
}
// Role creation is a no-op here: this authenticator stores no per-role data.
future<> auth::certificate_authenticator::create(std::string_view role_name, const authentication_options& options) const {
    // TODO: should we keep track of roles/enforce existence? Role manager should deal with this...
    co_return;
}
// Nothing to alter: no per-role authentication data is stored.
future<> auth::certificate_authenticator::alter(std::string_view role_name, const authentication_options& options) const {
    co_return;
}
// Nothing to drop: no per-role authentication data is stored.
future<> auth::certificate_authenticator::drop(std::string_view role_name) const {
    co_return;
}
// No custom options are stored per role; always yields an empty set.
future<auth::custom_options> auth::certificate_authenticator::query_custom_options(std::string_view) const {
    co_return auth::custom_options{};
}
// This authenticator maintains no auth metadata tables of its own, so there
// are no resources that need protection from user modification.
const auth::resource_set& auth::certificate_authenticator::protected_resources() const {
    static const resource_set empty;
    return empty;
}
// SASL (login/password style) challenges are not supported; authentication
// happens through the TLS handshake instead.
::shared_ptr<auth::sasl_challenge> auth::certificate_authenticator::new_sasl_challenge() const {
    throw exceptions::authentication_exception("Login authentication not supported");
}

View File

@@ -1,62 +0,0 @@
/*
* Copyright (C) 2022-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include <boost/regex.hpp>
#include "auth/authenticator.hh"
namespace cql3 {
class query_processor;
} // namespace cql3
namespace service {
class migration_manager;
}
namespace auth {

// Fully qualified (Cassandra-style) name under which this authenticator is
// registered.
extern const std::string_view certificate_authenticator_name;

// Authenticator that derives the role name from the client's TLS
// certificate rather than from a password. It is configured with
// (source, regex) query pairs; each query is matched against the
// certificate's subject or subject-alternative-name, and the regex's
// capture group yields the role name.
class certificate_authenticator : public authenticator {
    // Which certificate field a role-extraction query applies to
    // (subject DN or alt name); defined in the .cc file.
    enum class query_source;
    // Ordered role-extraction queries; first match wins.
    std::vector<std::pair<query_source, boost::regex>> _queries;
public:
    certificate_authenticator(cql3::query_processor&, ::service::migration_manager&);
    ~certificate_authenticator();

    future<> start() override;
    future<> stop() override;

    std::string_view qualified_java_name() const override;
    bool require_authentication() const override;

    authentication_option_set supported_options() const override;
    authentication_option_set alterable_options() const override;

    // Map-based (password style) authentication; unsupported here.
    future<authenticated_user> authenticate(const credentials_map& credentials) const override;
    // Certificate/DN based authentication.
    future<std::optional<authenticated_user>> authenticate(session_dn_func) const override;

    future<> create(std::string_view role_name, const authentication_options& options) const override;
    future<> alter(std::string_view role_name, const authentication_options& options) const override;
    future<> drop(std::string_view role_name) const override;

    future<custom_options> query_custom_options(std::string_view role_name) const override;

    const resource_set& protected_resources() const override;

    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override;
private:
};

}

View File

@@ -71,8 +71,7 @@ static future<> create_metadata_table_if_missing_impl(
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
try {
co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
co_return co_await mm.announce(co_await mm.prepare_new_column_family_announcement(table, ts), std::move(group0_guard));
} catch (exceptions::already_exists_exception&) {}
}
}
@@ -85,6 +84,20 @@ future<> create_metadata_table_if_missing(
return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
}
// Polls (with 500 ms pauses) until the local database has loaded a non-empty
// schema version and the migration manager then reports cluster-wide schema
// agreement. The abort source is checked before each poll, so cancellation
// propagates via as.check().
future<> wait_for_schema_agreement(::service::migration_manager& mm, const replica::database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
    return do_until([&db, &as] {
        as.check();
        // First wait for the local node to have any schema version at all.
        return db.get_version() != replica::database::empty_version;
    }, pause).then([&mm, &as] {
        return do_until([&mm, &as] {
            as.check();
            return mm.have_schema_agreement();
        }, pause);
    });
}
::service::query_state& internal_distributed_query_state() noexcept {
#ifdef DEBUG
// Give the much slower debug tests more headroom for completing auth queries.

View File

@@ -22,6 +22,7 @@
#include "log.hh"
#include "seastarx.hh"
#include "utils/exponential_backoff_retry.hh"
#include "service/query_state.hh"
using namespace std::chrono_literals;
@@ -31,7 +32,6 @@ class database;
namespace service {
class migration_manager;
class query_state;
}
namespace cql3 {
@@ -67,6 +67,8 @@ future<> create_metadata_table_if_missing(
std::string_view cql,
::service::migration_manager&) noexcept;
future<> wait_for_schema_agreement(::service::migration_manager&, const replica::database&, seastar::abort_source&);
///
/// Time-outs for internal, non-local CQL queries.
///

View File

@@ -129,7 +129,7 @@ future<> default_authorizer::start() {
_migration_manager).then([this] {
_finished = do_after_system_ready(_as, [this] {
return async([this] {
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();
if (legacy_metadata_exists()) {
if (!any_granted().get0()) {

View File

@@ -29,7 +29,6 @@
#include "utils/class_registrator.hh"
#include "replica/database.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
namespace auth {
@@ -51,23 +50,14 @@ static const class_registrator<
static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
static std::string_view get_config_value(std::string_view value, std::string_view def) {
return value.empty() ? def : value;
}
std::string password_authenticator::default_superuser(const db::config& cfg) {
return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
}
password_authenticator::~password_authenticator() {
}
password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::migration_manager& mm)
: _qp(qp)
, _migration_manager(mm)
, _stopped(make_ready_future<>())
, _superuser(default_superuser(qp.db().get_config()))
{}
, _stopped(make_ready_future<>()) {
}
static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
return !row.get_or<sstring>(SALTED_HASH, "").empty();
@@ -116,17 +106,13 @@ future<> password_authenticator::migrate_legacy_metadata() const {
}
future<> password_authenticator::create_default_if_missing() const {
return default_role_row_satisfies(_qp, &has_salted_hash, _superuser).then([this](bool exists) {
return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
if (!exists) {
std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
if (salted_pwd.empty()) {
salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
}
return _qp.execute_internal(
update_row_query(),
db::consistency_level::QUORUM,
internal_distributed_query_state(),
{salted_pwd, _superuser},
{passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME},
cql3::query_processor::cache_internal::no).then([](auto&&) {
plogger.info("Created default superuser authentication record.");
});
@@ -146,9 +132,9 @@ future<> password_authenticator::start() {
_stopped = do_after_system_ready(_as, [this] {
return async([this] {
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get0()) {
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
if (legacy_metadata_exists()) {
plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
}
@@ -175,8 +161,6 @@ future<> password_authenticator::stop() {
}
db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
// TODO: this is plain dung. Why treat hardcoded default special, but for example a user-created
// super user uses plain LOCAL_ONE?
if (role_name == DEFAULT_USER_NAME) {
return db::consistency_level::QUORUM;
}
@@ -245,8 +229,6 @@ future<authenticated_user> password_authenticator::authenticate(
std::throw_with_nested(exceptions::authentication_exception(e.what()));
} catch (exceptions::authentication_exception& e) {
std::throw_with_nested(e);
} catch (exceptions::unavailable_exception& e) {
std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
} catch (...) {
std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
}

View File

@@ -14,10 +14,6 @@
#include "auth/authenticator.hh"
namespace db {
class config;
}
namespace cql3 {
class query_processor;
@@ -37,11 +33,9 @@ class password_authenticator : public authenticator {
::service::migration_manager& _migration_manager;
future<> _stopped;
seastar::abort_source _as;
std::string _superuser;
public:
static db::consistency_level consistency_for_user(std::string_view role_name);
static std::string default_superuser(const db::config&);
password_authenticator(cql3::query_processor&, ::service::migration_manager&);

View File

@@ -79,13 +79,6 @@ static permission_set applicable_permissions(const service_level_resource_view &
}
static permission_set applicable_permissions(const functions_resource_view& fv) {
if (fv.function_name() || fv.function_signature()) {
return permission_set::of<
permission::ALTER,
permission::DROP,
permission::AUTHORIZE,
permission::EXECUTE>();
}
return permission_set::of<
permission::CREATE,
permission::ALTER,
@@ -299,7 +292,7 @@ std::optional<std::vector<std::string_view>> functions_resource_view::function_a
std::vector<std::string_view> parts;
if (_resource._parts[3] == "") {
return parts;
return {};
}
for (size_t i = 3; i < _resource._parts.size(); i++) {
parts.push_back(_resource._parts[i]);

View File

@@ -117,12 +117,20 @@ private:
friend class functions_resource_view;
friend bool operator<(const resource&, const resource&);
friend bool operator==(const resource&, const resource&) = default;
friend bool operator==(const resource&, const resource&);
friend resource parse_resource(std::string_view);
};
bool operator<(const resource&, const resource&);
inline bool operator==(const resource& r1, const resource& r2) {
return (r1._kind == r2._kind) && (r1._parts == r2._parts);
}
inline bool operator!=(const resource& r1, const resource& r2) {
return !(r1 == r2);
}
std::ostream& operator<<(std::ostream&, const resource&);
class resource_kind_mismatch : public std::invalid_argument {

View File

@@ -17,6 +17,10 @@ std::ostream& operator<<(std::ostream& os, const role_or_anonymous& mr) {
return os;
}
bool operator==(const role_or_anonymous& mr1, const role_or_anonymous& mr2) noexcept {
return mr1.name == mr2.name;
}
bool is_anonymous(const role_or_anonymous& mr) noexcept {
return !mr.name.has_value();
}

View File

@@ -26,11 +26,16 @@ public:
role_or_anonymous() = default;
role_or_anonymous(std::string_view name) : name(name) {
}
friend bool operator==(const role_or_anonymous&, const role_or_anonymous&) noexcept = default;
};
std::ostream& operator<<(std::ostream&, const role_or_anonymous&);
bool operator==(const role_or_anonymous&, const role_or_anonymous&) noexcept;
inline bool operator!=(const role_or_anonymous& mr1, const role_or_anonymous& mr2) noexcept {
return !(mr1 == mr2);
}
bool is_anonymous(const role_or_anonymous&) noexcept;
}

View File

@@ -46,43 +46,59 @@ constexpr std::string_view qualified_name("system_auth.roles");
future<bool> default_role_row_satisfies(
cql3::query_processor& qp,
std::function<bool(const cql3::untyped_result_set_row&)> p,
std::optional<std::string> rolename) {
std::function<bool(const cql3::untyped_result_set_row&)> p) {
static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
meta::roles_table::qualified_name,
meta::roles_table::role_col_name);
for (auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
auto results = co_await qp.execute_internal(query, cl
, internal_distributed_query_state()
, {rolename.value_or(std::string(meta::DEFAULT_SUPERUSER_NAME))}
, cql3::query_processor::cache_internal::yes
);
if (!results->empty()) {
co_return p(results->one());
}
}
co_return false;
return do_with(std::move(p), [&qp](const auto& p) {
return qp.execute_internal(
query,
db::consistency_level::ONE,
{meta::DEFAULT_SUPERUSER_NAME},
cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
return qp.execute_internal(
query,
db::consistency_level::QUORUM,
internal_distributed_query_state(),
{meta::DEFAULT_SUPERUSER_NAME},
cql3::query_processor::cache_internal::yes).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
return make_ready_future<bool>(false);
}
return make_ready_future<bool>(p(results->one()));
});
}
return make_ready_future<bool>(p(results->one()));
});
});
}
future<bool> any_nondefault_role_row_satisfies(
cql3::query_processor& qp,
std::function<bool(const cql3::untyped_result_set_row&)> p,
std::optional<std::string> rolename) {
std::function<bool(const cql3::untyped_result_set_row&)> p) {
static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name);
auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
, internal_distributed_query_state(), cql3::query_processor::cache_internal::no
);
if (results->empty()) {
co_return false;
}
static const sstring col_name = sstring(meta::roles_table::role_col_name);
return do_with(std::move(p), [&qp](const auto& p) {
return qp.execute_internal(
query,
db::consistency_level::QUORUM,
internal_distributed_query_state(),
cql3::query_processor::cache_internal::no).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
return false;
}
co_return boost::algorithm::any_of(*results, [&](const cql3::untyped_result_set_row& row) {
auto superuser = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;
const bool is_nondefault = row.get_as<sstring>(col_name) != superuser;
return is_nondefault && p(row);
static const sstring col_name = sstring(meta::roles_table::role_col_name);
return boost::algorithm::any_of(*results, [&p](const cql3::untyped_result_set_row& row) {
const bool is_nondefault = row.get_as<sstring>(col_name) != meta::DEFAULT_SUPERUSER_NAME;
return is_nondefault && p(row);
});
});
});
}

View File

@@ -43,17 +43,13 @@ constexpr std::string_view role_col_name{"role", 4};
///
future<bool> default_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
std::function<bool(const cql3::untyped_result_set_row&)>);
///
/// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
///
future<bool> any_nondefault_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
std::function<bool(const cql3::untyped_result_set_row&)>);
}

View File

@@ -7,7 +7,6 @@
*/
#include <seastar/core/coroutine.hh>
#include "auth/resource.hh"
#include "auth/service.hh"
#include <algorithm>
@@ -21,7 +20,6 @@
#include "auth/allow_all_authorizer.hh"
#include "auth/common.hh"
#include "auth/role_or_anonymous.hh"
#include "cql3/functions/function_name.hh"
#include "cql3/functions/functions.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
@@ -68,7 +66,6 @@ private:
void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
void on_update_tablet_metadata() override {}
void on_drop_keyspace(const sstring& ks_name) override {
// Do it in the background.
@@ -78,12 +75,6 @@ private:
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
});
(void)_authorizer.revoke_all(
auth::make_functions_resource(ks_name)).handle_exception_type([](const unsupported_authorization_operation&) {
// Nothing.
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
});
}
void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
@@ -98,22 +89,8 @@ private:
}
void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
void on_drop_function(const sstring& ks_name, const sstring& function_name) override {
(void)_authorizer.revoke_all(
auth::make_functions_resource(ks_name, function_name)).handle_exception_type([](const unsupported_authorization_operation&) {
// Nothing.
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
});
}
void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {
(void)_authorizer.revoke_all(
auth::make_functions_resource(ks_name, aggregate_name)).handle_exception_type([](const unsupported_authorization_operation&) {
// Nothing.
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
});
}
void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
};
@@ -178,8 +155,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c
opts,
true);
co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
std::move(group0_guard), format("auth_service: create {} keyspace", meta::AUTH_KS));
co_return co_await mm.announce(mm.prepare_new_keyspace_announcement(ksm, ts), std::move(group0_guard));
}
}
}

View File

@@ -28,8 +28,6 @@
#include "log.hh"
#include "utils/class_registrator.hh"
#include "replica/database.hh"
#include "service/migration_manager.hh"
#include "password_authenticator.hh"
namespace auth {
@@ -129,13 +127,6 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
}
standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm)
: _qp(qp)
, _migration_manager(mm)
, _stopped(make_ready_future<>())
, _superuser(password_authenticator::default_superuser(qp.db().get_config()))
{}
std::string_view standard_role_manager::qualified_java_name() const noexcept {
return "org.apache.cassandra.auth.CassandraRoleManager";
}
@@ -177,7 +168,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
}
future<> standard_role_manager::create_default_role_if_missing() const {
return default_role_row_satisfies(_qp, &has_can_login, _superuser).then([this](bool exists) {
return default_role_row_satisfies(_qp, &has_can_login).then([this](bool exists) {
if (!exists) {
static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, true, true)",
meta::roles_table::qualified_name,
@@ -187,9 +178,9 @@ future<> standard_role_manager::create_default_role_if_missing() const {
query,
db::consistency_level::QUORUM,
internal_distributed_query_state(),
{_superuser},
cql3::query_processor::cache_internal::no).then([this](auto&&) {
log.info("Created default superuser role '{}'.", _superuser);
{meta::DEFAULT_SUPERUSER_NAME},
cql3::query_processor::cache_internal::no).then([](auto&&) {
log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
return make_ready_future<>();
});
}
@@ -241,7 +232,7 @@ future<> standard_role_manager::start() {
return this->create_metadata_tables_if_missing().then([this] {
_stopped = auth::do_after_system_ready(_as, [this] {
return seastar::async([this] {
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get0();
wait_for_schema_agreement(_migration_manager, _qp.db().real_database(), _as).get0();
if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
if (this->legacy_metadata_exists()) {

View File

@@ -34,10 +34,13 @@ class standard_role_manager final : public role_manager {
::service::migration_manager& _migration_manager;
future<> _stopped;
seastar::abort_source _as;
std::string _superuser;
public:
standard_role_manager(cql3::query_processor&, ::service::migration_manager&);
standard_role_manager(cql3::query_processor& qp, ::service::migration_manager& mm)
: _qp(qp)
, _migration_manager(mm)
, _stopped(make_ready_future<>()) {
}
virtual std::string_view qualified_java_name() const noexcept override;

View File

@@ -37,8 +37,10 @@
// The constants q1 and q2 are used to determine the proportional factor at each stage.
class backlog_controller {
public:
using scheduling_group = seastar::scheduling_group;
struct scheduling_group {
seastar::scheduling_group cpu = default_scheduling_group();
seastar::io_priority_class io = default_priority_class();
};
future<> shutdown() {
_update_timer.cancel();
return std::move(_inflight_update);
@@ -56,11 +58,11 @@ protected:
};
scheduling_group _scheduling_group;
timer<> _update_timer;
std::vector<control_point> _control_points;
std::function<float()> _current_backlog;
timer<> _update_timer;
// updating shares for an I/O class may contact another shard and returns a future.
future<> _inflight_update;
@@ -80,9 +82,9 @@ protected:
std::vector<control_point> control_points, std::function<float()> backlog,
float static_shares = 0)
: _scheduling_group(std::move(sg))
, _update_timer([this] { adjust(); })
, _control_points()
, _current_backlog(std::move(backlog))
, _update_timer([this] { adjust(); })
, _inflight_update(make_ready_future<>())
, _static_shares(static_shares)
{

View File

@@ -1,8 +0,0 @@
#!/bin/bash
# Copyright (C) 2023-present ScyllaDB
# SPDX-License-Identifier: AGPL-3.0-or-later

# Resolve the directory containing this wrapper script.
here=$(dirname "$0")
# Delegate to the bundled cqlsh, forwarding all command-line arguments.
exec "$here/../tools/cqlsh/bin/cqlsh" "$@"

View File

@@ -17,7 +17,7 @@
#include <functional>
#include <compare>
#include "utils/mutable_view.hh"
#include "utils/simple_hashers.hh"
#include <xxhash.h>
using bytes = basic_sstring<int8_t, uint32_t, 31, false>;
using bytes_view = std::basic_string_view<int8_t>;
@@ -160,7 +160,18 @@ struct appending_hash<bytes_view> {
}
};
using bytes_view_hasher = simple_xx_hasher;
// Hasher implementation backed by xxHash's streaming XXH64 API: bytes fed
// through update() are accumulated into _state and finalize() produces the
// 64-bit digest.
struct bytes_view_hasher : public hasher {
    XXH64_state_t _state;
    // Initializes the streaming state; distinct seeds yield independent hash
    // functions over the same input.
    bytes_view_hasher(uint64_t seed = 0) noexcept {
        XXH64_reset(&_state, seed);
    }
    // Feeds `length` bytes starting at `ptr` into the running hash.
    void update(const char* ptr, size_t length) noexcept {
        XXH64_update(&_state, ptr, length);
    }
    // Returns the digest of everything fed so far (does not reset the state).
    size_t finalize() {
        return static_cast<size_t>(XXH64_digest(&_state));
    }
};
namespace std {
template <>

View File

@@ -53,10 +53,6 @@ public:
using difference_type = std::ptrdiff_t;
using pointer = bytes_view*;
using reference = bytes_view&;
struct implementation {
blob_storage* current_chunk;
};
private:
chunk* _current = nullptr;
public:
@@ -79,11 +75,11 @@ public:
++(*this);
return tmp;
}
bool operator==(const fragment_iterator&) const = default;
implementation extract_implementation() const {
return implementation {
.current_chunk = _current,
};
bool operator==(const fragment_iterator& other) const {
return _current == other._current;
}
bool operator!=(const fragment_iterator& other) const {
return _current != other._current;
}
};
using const_iterator = fragment_iterator;
@@ -436,6 +432,10 @@ public:
return true;
}
bool operator!=(const bytes_ostream& other) const {
return !(*this == other);
}
// Makes this instance empty.
//
// The first buffer is not deallocated, so callers may rely on the

View File

@@ -98,16 +98,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
bool _next_row_in_range = false;
bool _has_rt = false;
// True iff current population interval starts at before_all_clustered_rows
// and _last_row is unset. (And the read isn't reverse).
//
// Rationale: in the "most general" step of cache population,
// we mark the `(_last_row, ...] `range as continuous, which can involve doing something to `_last_row`.
// But when populating the range `(before_all_clustered_rows, ...)`,
// a rows_entry at `before_all_clustered_rows` needn't exist.
// Thus this case needs a special treatment which doesn't involve `_last_row`.
// And for that, this case it has to be recognized (via this flag).
//
// True iff current population interval, since the previous clustering row, starts before all clustered rows.
// We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
// because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
// us from marking the interval as continuous.
@@ -119,9 +110,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
flat_mutation_reader_v2* _underlying = nullptr;
flat_mutation_reader_v2_opt _underlying_holder;
gc_clock::time_point _read_time;
gc_clock::time_point _gc_before;
future<> do_fill_buffer();
future<> ensure_underlying();
void copy_from_cache_to_buffer();
@@ -156,8 +144,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
bool maybe_add_to_cache(const range_tombstone_change& rtc);
void maybe_add_to_cache(const static_row& sr);
void maybe_set_static_row_continuous();
void set_rows_entry_continuous(rows_entry& e);
void restore_continuity_after_insertion(const mutation_partition::rows_type::iterator&);
void finish_reader() {
push_mutation_fragment(*_schema, _permit, partition_end());
_end_of_stream = true;
@@ -192,20 +178,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader_v2::impl {
const schema& table_schema() {
return *_snp->schema();
}
gc_clock::time_point get_read_time() {
return _read_context.tombstone_gc_state() ? gc_clock::now() : gc_clock::time_point::min();
}
gc_clock::time_point get_gc_before(const schema& schema, dht::decorated_key dk, const gc_clock::time_point query_time) {
auto gc_state = _read_context.tombstone_gc_state();
if (gc_state) {
return gc_state->get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
}
return gc_clock::time_point::min();
}
public:
cache_flat_mutation_reader(schema_ptr s,
dht::decorated_key dk,
@@ -224,8 +196,6 @@ public:
, _read_context_holder()
, _read_context(ctx) // ctx is owned by the caller, who's responsible for closing it.
, _next_row(*_schema, *_snp, false, _read_context.is_reversed())
, _read_time(get_read_time())
, _gc_before(get_gc_before(*_schema, dk, _read_time))
{
clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
fmt::ptr(&*_snp));
@@ -352,7 +322,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer() {
});
}
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema) && !_read_context.is_reversed() && !_last_row;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema) && !_read_context.is_reversed();
_underlying_upper_bound = _next_row_in_range ? position_in_partition::before_key(_next_row.position())
: position_in_partition(_upper_bound);
if (!_read_context.partition_exists()) {
@@ -474,15 +444,14 @@ future<> cache_flat_mutation_reader::read_from_underlying() {
if (insert_result.second) {
clogger.trace("csm {}: L{}: inserted dummy at {}", fmt::ptr(this), __LINE__, _upper_bound);
_snp->tracker()->insert(*insert_result.first);
restore_continuity_after_insertion(insert_result.first);
}
if (_read_context.is_reversed()) [[unlikely]] {
clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), _last_row.position(), insert_result.first->position(), _current_tombstone);
set_rows_entry_continuous(*_last_row);
_last_row->set_continuous(true);
_last_row->set_range_tombstone(_current_tombstone);
} else {
clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(), _last_row.position(), _current_tombstone);
set_rows_entry_continuous(*insert_result.first);
insert_result.first->set_continuous(true);
insert_result.first->set_range_tombstone(_current_tombstone);
}
maybe_drop_last_entry(_current_tombstone);
@@ -517,11 +486,11 @@ bool cache_flat_mutation_reader::ensure_population_lower_bound() {
rows_entry::tri_compare cmp(*_schema);
partition_snapshot_row_cursor cur(*_schema, *_snp, false, _read_context.is_reversed());
if (!cur.advance_to(to_query_domain(_last_row.position()))) {
if (!cur.advance_to(_last_row.position())) {
return false;
}
if (cmp(cur.table_position(), _last_row.position()) != 0) {
if (cmp(cur.position(), _last_row.position()) != 0) {
return false;
}
@@ -543,7 +512,7 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
position_in_partition::equal_compare eq(*_schema);
if (can_populate()
&& ensure_population_lower_bound()
&& !eq(_last_row.position(), _next_row.table_position())) {
&& !eq(_last_row.position(), _next_row.position())) {
with_allocator(_snp->region().allocator(), [&] {
rows_entry& e = _next_row.ensure_entry_in_latest().row;
auto& rows = _snp->version()->partition().mutable_clustered_rows();
@@ -565,14 +534,14 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
}
clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
_last_row.position(), _current_tombstone);
set_rows_entry_continuous(*insert_result.first);
insert_result.first->set_continuous(true);
insert_result.first->set_range_tombstone(_current_tombstone);
clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), _last_row.position());
set_rows_entry_continuous(*_last_row);
_last_row->set_continuous(true);
});
} else {
clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), _last_row.position(), _current_tombstone);
set_rows_entry_continuous(*_last_row);
_last_row->set_continuous(true);
_last_row->set_range_tombstone(_current_tombstone);
}
} else {
@@ -590,18 +559,18 @@ void cache_flat_mutation_reader::maybe_update_continuity() {
if (insert_result.second) {
clogger.trace("csm {}: L{}: inserted dummy at {}", fmt::ptr(this), __LINE__, insert_result.first->position());
_snp->tracker()->insert(*insert_result.first);
clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
_last_row.position(), _current_tombstone);
set_rows_entry_continuous(*insert_result.first);
insert_result.first->set_range_tombstone(_current_tombstone);
}
clogger.trace("csm {}: set_continuous({}), prev={}, rt={}", fmt::ptr(this), insert_result.first->position(),
_last_row.position(), _current_tombstone);
insert_result.first->set_continuous(true);
insert_result.first->set_range_tombstone(_current_tombstone);
clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
set_rows_entry_continuous(e);
e.set_continuous(true);
});
} else {
clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), e.position(), _current_tombstone);
e.set_range_tombstone(_current_tombstone);
set_rows_entry_continuous(e);
e.set_continuous(true);
}
}
maybe_drop_last_entry(_current_tombstone);
@@ -637,21 +606,20 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
it = insert_result.first;
if (insert_result.second) {
_snp->tracker()->insert(*it);
restore_continuity_after_insertion(it);
}
rows_entry& e = *it;
if (ensure_population_lower_bound()) {
if (_read_context.is_reversed()) [[unlikely]] {
clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), _last_row.position());
set_rows_entry_continuous(*_last_row);
_last_row->set_continuous(true);
// _current_tombstone must also apply to _last_row itself (if it's non-dummy)
// because otherwise there would be a rtc after it, either creating a different entry,
// or clearing _last_row if population did not happen.
_last_row->set_range_tombstone(_current_tombstone);
} else {
clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
set_rows_entry_continuous(e);
e.set_continuous(true);
e.set_range_tombstone(_current_tombstone);
}
} else {
@@ -702,25 +670,20 @@ bool cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone_change
it = insert_result.first;
if (insert_result.second) {
_snp->tracker()->insert(*it);
restore_continuity_after_insertion(it);
}
rows_entry& e = *it;
if (ensure_population_lower_bound()) {
// underlying may emit range_tombstone_change fragments with the same position.
// In such case, the range to which the tombstone from the first fragment applies is empty and should be ignored.
//
// Note: we are using a query schema comparator to compare table schema positions here,
// but this is okay because we are only checking for equality,
// which is preserved by schema reversals.
if (q_cmp(_last_row.position(), it->position()) != 0) {
if (q_cmp(_last_row.position(), it->position()) < 0) {
if (_read_context.is_reversed()) [[unlikely]] {
clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), _last_row.position(), prev);
set_rows_entry_continuous(*_last_row);
_last_row->set_continuous(true);
_last_row->set_range_tombstone(prev);
} else {
clogger.trace("csm {}: set_continuous({}), rt={}", fmt::ptr(this), e.position(), prev);
set_rows_entry_continuous(e);
e.set_continuous(true);
e.set_range_tombstone(prev);
}
}
@@ -767,51 +730,9 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
}
}
// We add the row to the buffer even when it's full.
// This simplifies the code. For more info see #3139.
if (_next_row_in_range) {
bool remove_row = false;
if (_read_context.tombstone_gc_state() // do not compact rows when tombstone_gc_state is not set (used in some unit tests)
&& !_next_row.dummy()
&& _snp->at_latest_version()
&& _snp->at_oldest_version()) {
deletable_row& row = _next_row.latest_row();
tombstone range_tomb = _next_row.range_tombstone_for_row();
auto t = row.deleted_at();
t.apply(range_tomb);
auto row_tomb_expired = [&](row_tombstone tomb) {
return (tomb && tomb.max_deletion_time() < _gc_before);
};
auto is_row_dead = [&](const deletable_row& row) {
auto& m = row.marker();
return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
};
if (row_tomb_expired(t) || is_row_dead(row)) {
can_gc_fn always_gc = [&](tombstone) { return true; };
const schema& row_schema = _next_row.latest_row_schema();
_read_context.cache()._tracker.on_row_compacted();
with_allocator(_snp->region().allocator(), [&] {
deletable_row row_copy(row_schema, row);
row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
std::swap(row, row_copy);
});
remove_row = row.empty();
auto tomb_expired = [&](tombstone tomb) {
return (tomb && tomb.deletion_time < _gc_before);
};
auto latests_range_tomb = _next_row.get_iterator_in_latest_version()->range_tombstone();
if (tomb_expired(latests_range_tomb)) {
_next_row.get_iterator_in_latest_version()->set_range_tombstone({});
}
}
}
if (_next_row.range_tombstone_for_row() != _current_tombstone) [[unlikely]] {
auto tomb = _next_row.range_tombstone_for_row();
auto new_lower_bound = position_in_partition::before_key(_next_row.position());
@@ -821,31 +742,8 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
_current_tombstone = tomb;
_read_context.cache()._tracker.on_range_tombstone_read();
}
if (remove_row) {
_read_context.cache()._tracker.on_row_compacted_away();
_lower_bound = position_in_partition::after_key(*_schema, _next_row.position());
partition_snapshot_row_weakref row_ref(_next_row);
move_to_next_entry();
with_allocator(_snp->region().allocator(), [&] {
cache_tracker& tracker = _read_context.cache()._tracker;
if (row_ref->is_linked()) {
tracker.get_lru().remove(*row_ref);
}
row_ref->on_evicted(tracker);
});
_snp->region().allocator().invalidate_references();
_next_row.force_valid();
} else {
// We add the row to the buffer even when it's full.
// This simplifies the code. For more info see #3139.
add_to_buffer(_next_row);
move_to_next_entry();
}
add_to_buffer(_next_row);
move_to_next_entry();
} else {
move_to_next_range();
}
@@ -996,7 +894,7 @@ void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_curs
if (!row.dummy()) {
_read_context.cache().on_row_hit();
if (_read_context.digest_requested()) {
row.latest_row_prepare_hash();
row.latest_row().cells().prepare_hash(table_schema(), column_kind::regular_column);
}
add_clustering_row_to_buffer(mutation_fragment_v2(*_schema, _permit, row.row()));
} else {
@@ -1059,28 +957,6 @@ void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
}
}
// Last dummies can exist in a quasi-evicted state, where they are unlinked from LRU,
// but still alive.
// But while in this state, they mustn't carry any information (i.e. continuity),
// due to the "older versions are evicted first" rule of MVCC.
// Thus, when we make an entry continuous, we must ensure that it isn't an
// unlinked last dummy.
inline
void cache_flat_mutation_reader::set_rows_entry_continuous(rows_entry& e) {
    e.set_continuous(true);
    // If the entry is in the quasi-evicted (unlinked) state described above,
    // re-link it into the LRU via the snapshot's tracker so it may legally
    // carry continuity again.
    if (!e.is_linked()) [[unlikely]] {
        _snp->tracker()->touch(e);
    }
}
// Called after inserting a new entry at `it` into an existing interval of the
// rows tree. If the successor entry is marked continuous, the interval the new
// entry landed in was continuous, so the new entry inherits both the
// continuity flag and the successor's range tombstone to keep the partition's
// continuity information consistent.
inline
void cache_flat_mutation_reader::restore_continuity_after_insertion(const mutation_partition::rows_type::iterator& it) {
    if (auto x = std::next(it); x->continuous()) {
        it->set_continuous(true);
        it->set_range_tombstone(x->range_tombstone());
    }
}
inline
bool cache_flat_mutation_reader::can_populate() const {
return _snp->at_latest_version() && _read_context.cache().phase_of(_read_context.key()) == _read_context.phase();

View File

@@ -68,6 +68,7 @@ public:
_pos = -1;
}
bool operator==(const iterator& o) const { return _pos == o._pos; }
bool operator!=(const iterator& o) const { return _pos != o._pos; }
};
public:
cartesian_product(const std::vector<std::vector<T>>& vec_of_vecs) : _vec_of_vecs(vec_of_vecs) {}

View File

@@ -65,6 +65,7 @@ public:
void ttl(int v) { _ttl = v; }
bool operator==(const options& o) const;
bool operator!=(const options& o) const;
};
} // namespace cdc

View File

@@ -13,7 +13,6 @@
#include <seastar/core/sleep.hh>
#include <seastar/core/coroutine.hh>
#include "gms/endpoint_state.hh"
#include "keys.hh"
#include "schema/schema_builder.hh"
#include "replica/database.hh"
@@ -26,7 +25,6 @@
#include "gms/inet_address.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "utils/error_injection.hh"
#include "utils/UUID_gen.hh"
#include "cdc/generation.hh"
@@ -68,10 +66,10 @@ static constexpr auto stream_id_index_shift = stream_id_version_shift + stream_i
static constexpr auto stream_id_random_shift = stream_id_index_shift + stream_id_index_bits;
/**
* Responsibility for encoding stream_id moved from the create_stream_ids
* function to this constructor, to keep knowledge of composition in a
* single place. Note the make_new_generation_description function
* defines the "order" in which we view vnodes etc.
* Responsibility for encoding stream_id moved from factory method to
* this constructor, to keep knowledge of composition in a single place.
* Note this is private and friended to topology_description_generator,
* because he is the one who defined the "order" we view vnodes etc.
*/
stream_id::stream_id(dht::token token, size_t vnode_index)
: _value(bytes::initialized_later(), 2 * sizeof(int64_t))
@@ -155,18 +153,18 @@ bool token_range_description::operator==(const token_range_description& o) const
&& sharding_ignore_msb == o.sharding_ignore_msb;
}
topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
topology_description::topology_description(std::vector<token_range_description> entries)
: _entries(std::move(entries)) {}
bool topology_description::operator==(const topology_description& o) const {
return _entries == o._entries;
}
const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
const std::vector<token_range_description>& topology_description::entries() const& {
return _entries;
}
utils::chunked_vector<token_range_description>&& topology_description::entries() && {
std::vector<token_range_description>&& topology_description::entries() && {
return std::move(_entries);
}
@@ -185,48 +183,98 @@ static std::vector<stream_id> create_stream_ids(
return result;
}
// Builds a cdc::topology_description for the current token ring plus any
// tokens of a bootstrapping node. Stream ids are generated per (vnode, shard)
// pair using sharding information supplied by the caller.
class topology_description_generator final {
    // Tokens of the joining node; merged with the ring's sorted tokens.
    const std::unordered_set<dht::token>& _bootstrap_tokens;
    // Snapshot of token metadata describing the current ring.
    const locator::token_metadata_ptr _tmptr;
    // Maps a vnode's end token to the owning node's <shard_count, ignore_msb>.
    const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& _get_sharding_info;

    // Compute a set of tokens that split the token ring into vnodes
    auto get_tokens() const {
        auto tokens = _tmptr->sorted_tokens();
        // Append bootstrap tokens, sort just the appended tail, then merge the
        // two sorted runs and drop duplicates. `it` points at the first
        // appended element (insert-at-end returns an iterator to it).
        auto it = tokens.insert(
                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
        std::sort(it, tokens.end());
        std::inplace_merge(tokens.begin(), it, tokens.end());
        tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
        return tokens;
    }

    // Build the description of the vnode (start, end]: record its end token,
    // the owner's sharding parameters, and one stream id per owning shard.
    token_range_description create_description(size_t index, dht::token start, dht::token end) const {
        token_range_description desc;
        desc.token_range_end = end;
        auto [shard_count, ignore_msb] = _get_sharding_info(end);
        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;
        return desc;
    }
public:
    topology_description_generator(
            const std::unordered_set<dht::token>& bootstrap_tokens,
            const locator::token_metadata_ptr tmptr,
            // This function must return sharding parameters for a node that owns the vnode ending with
            // the given token. Returns <shard_count, ignore_msb> pair.
            const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info)
        : _bootstrap_tokens(bootstrap_tokens)
        , _tmptr(std::move(tmptr))
        , _get_sharding_info(get_sharding_info)
    {}

    /*
     * Generate a set of CDC stream identifiers such that for each shard
     * and vnode pair there exists a stream whose token falls into this vnode
     * and is owned by this shard. It is sometimes not possible to generate
     * a CDC stream identifier for some (vnode, shard) pair because not all
     * shards have to own tokens in a vnode. Small vnode can be totally owned
     * by a single shard. In such case, a stream identifier that maps to
     * end of the vnode is generated.
     *
     * Then build a cdc::topology_description which maps tokens to generated
     * stream identifiers, such that if token T is owned by shard S in vnode V,
     * it gets mapped to the stream identifier generated for (S, V).
     */
    // Run in seastar::async context.
    topology_description generate() const {
        const auto tokens = get_tokens();
        std::vector<token_range_description> vnode_descriptions;
        vnode_descriptions.reserve(tokens.size());
        // The ring wraps around: the first vnode spans (last token, first token].
        vnode_descriptions.push_back(
                create_description(0, tokens.back(), tokens.front()));
        for (size_t idx = 1; idx < tokens.size(); ++idx) {
            vnode_descriptions.push_back(
                    create_description(idx, tokens[idx - 1], tokens[idx]));
        }
        return {std::move(vnode_descriptions)};
    }
};
bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
auto my_host_id = g.get_host_id(me);
return g.for_each_endpoint_state_until([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
return stop_iteration(my_host_id < g.get_host_id(node));
}) == stop_iteration::no;
auto& eps = g.get_endpoint_states();
return std::none_of(eps.begin(), eps.end(),
[&] (const std::pair<gms::inet_address, gms::endpoint_state>& ep) {
return my_host_id < g.get_host_id(ep.first);
});
}
// A generation is "optimal" when its topology_description matches the token
// ring exactly: same number of entries, and every ring token appears as some
// entry's range end.
bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
    const auto& ring_tokens = tm.sorted_tokens();
    if (ring_tokens.size() != gen.entries().size()) {
        // We probably have garbage streams from old generations
        cdc_log.info("Generation size does not match the token ring");
        return false;
    }
    // Collect the generation's range-end tokens for O(1) membership checks.
    std::unordered_set<dht::token> range_ends;
    for (const auto& range_desc : gen.entries()) {
        range_ends.insert(range_desc.token_range_end);
    }
    // Every token in the ring must be covered by the generation.
    for (const auto& ring_token : ring_tokens) {
        if (!range_ends.contains(ring_token)) {
            cdc_log.warn("CDC generation missing token {}", ring_token);
            return false;
        }
    }
    return true;
}
static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutations(
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
schema_ptr s,
const partition_key& pkey,
noncopyable_function<clustering_key (dht::token)>&& get_ckey_from_range_end,
utils::UUID id,
const cdc::topology_description& desc,
size_t mutation_size_threshold,
api::timestamp_type ts) {
utils::chunked_vector<mutation> res;
res.emplace_back(s, pkey);
res.emplace_back(s, partition_key::from_singular(*s, id));
res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
size_t size_estimate = 0;
size_t total_size_estimate = 0;
for (auto& e : desc.entries()) {
if (size_estimate >= mutation_size_threshold) {
total_size_estimate += size_estimate;
res.emplace_back(s, pkey);
res.emplace_back(s, partition_key::from_singular(*s, id));
size_estimate = 0;
}
@@ -237,60 +285,16 @@ static future<utils::chunked_vector<mutation>> get_common_cdc_generation_mutatio
}
size_estimate += e.streams.size() * 20;
auto ckey = get_ckey_from_range_end(e.token_range_end);
auto ckey = clustering_key::from_singular(*s, dht::token::to_int64(e.token_range_end));
res.back().set_cell(ckey, to_bytes("streams"), make_set_value(db::cdc_streams_set_type, std::move(streams)), ts);
res.back().set_cell(ckey, to_bytes("ignore_msb"), int8_t(e.sharding_ignore_msb), ts);
co_await coroutine::maybe_yield();
}
total_size_estimate += size_estimate;
// Copy mutations n times, where n is picked so that the memory size of all mutations together exceeds `max_command_size`.
utils::get_local_injector().inject("cdc_generation_mutations_replication", [&res, total_size_estimate, mutation_size_threshold] {
utils::chunked_vector<mutation> new_res;
size_t number_of_copies = (mutation_size_threshold / total_size_estimate + 1) * 2;
for (size_t i = 0; i < number_of_copies; ++i) {
std::copy(res.begin(), res.end(), std::back_inserter(new_res));
}
res = std::move(new_res);
});
co_return res;
}
// Translates `desc` into mutations for the CDC_GENERATIONS_V2 schema
// (system_distributed keyspace). The generation id `id` is the partition key
// and each vnode's end token (as int64) becomes a clustering key.
// `mutation_size_threshold` and `ts` are forwarded to the common helper.
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
        schema_ptr s,
        utils::UUID id,
        const cdc::topology_description& desc,
        size_t mutation_size_threshold,
        api::timestamp_type ts) {
    auto pkey = partition_key::from_singular(*s, id);
    auto get_ckey = [s] (dht::token range_end) {
        return clustering_key::from_singular(*s, dht::token::to_int64(range_end));
    };
    auto res = co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
    // V2 additionally records the total number of vnode ranges as a static
    // cell; it is set once, on the last generated mutation.
    res.back().set_static_cell(to_bytes("num_ranges"), int32_t(desc.entries().size()), ts);
    co_return res;
}
// Translates `desc` into mutations for the single-partition CDC_GENERATIONS_V3
// schema (system keyspace). All rows share the fixed partition key
// CDC_GENERATIONS_V3_KEY; the clustering key is (generation id, range-end
// token as int64), so multiple generations coexist in one partition.
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
        schema_ptr s,
        utils::UUID id,
        const cdc::topology_description& desc,
        size_t mutation_size_threshold,
        api::timestamp_type ts) {
    auto pkey = partition_key::from_singular(*s, CDC_GENERATIONS_V3_KEY);
    auto get_ckey = [&] (dht::token range_end) {
        return clustering_key::from_exploded(*s, {timeuuid_type->decompose(id), long_type->decompose(dht::token::to_int64(range_end))});
    };
    co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
}
// non-static for testing
size_t limit_of_streams_in_topology_description() {
// Each stream takes 16B and we don't want to exceed 4MB so we can have
@@ -323,47 +327,13 @@ topology_description limit_number_of_streams_if_needed(topology_description&& de
return topology_description(std::move(entries));
}
// Compute the sorted, de-duplicated set of tokens that split the token ring
// into vnodes: the ring's current tokens plus the bootstrapping node's tokens.
static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
    auto tokens = tmptr->sorted_tokens();
    // Remember where the already-sorted ring tokens end before appending.
    const auto ring_count = tokens.size();
    tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
    const auto appended = tokens.begin() + ring_count;
    // Sort only the appended tail, merge the two sorted runs in place,
    // then drop duplicates.
    std::sort(appended, tokens.end());
    std::inplace_merge(tokens.begin(), appended, tokens.end());
    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
    return tokens;
}
// Build the description of a single vnode (start, end]: record the range's end
// token, look up the owning node's sharding parameters, and generate one CDC
// stream id per shard of the owner via create_stream_ids.
static token_range_description create_token_range_description(
        size_t index,
        dht::token start,
        dht::token end,
        const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info) {
    token_range_description desc;
    desc.token_range_end = end;
    // <shard_count, ignore_msb> of the node owning the vnode ending with `end`.
    auto [shard_count, ignore_msb] = get_sharding_info(end);
    desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
    desc.sharding_ignore_msb = ignore_msb;
    return desc;
}
cdc::topology_description make_new_generation_description(
std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
const std::unordered_set<dht::token>& bootstrap_tokens,
const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
const locator::token_metadata_ptr tmptr) {
const auto tokens = get_tokens(bootstrap_tokens, tmptr);
utils::chunked_vector<token_range_description> vnode_descriptions;
vnode_descriptions.reserve(tokens.size());
vnode_descriptions.push_back(create_token_range_description(0, tokens.back(), tokens.front(), get_sharding_info));
for (size_t idx = 1; idx < tokens.size(); ++idx) {
vnode_descriptions.push_back(create_token_range_description(idx, tokens[idx - 1], tokens[idx], get_sharding_info));
}
return {std::move(vnode_descriptions)};
auto gen = topology_description_generator(bootstrap_tokens, tmptr, get_sharding_info).generate();
auto uuid = utils::make_random_uuid();
return {uuid, std::move(gen)};
}
db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milliseconds ring_delay) {
@@ -395,9 +365,7 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
}
};
auto uuid = utils::make_random_uuid();
auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
auto [uuid, gen] = make_new_generation_data(bootstrap_tokens, get_sharding_info, tmptr);
// Our caller should ensure that there are normal tokens in the token ring.
auto normal_token_owners = tmptr->count_normal_token_owners();
@@ -451,12 +419,8 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const
* but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
* which means it will gossip the generation's timestamp.
*/
static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::endpoint_state& eps) {
const auto* gen_id_ptr = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
if (!gen_id_ptr) {
return std::nullopt;
}
auto gen_id_string = gen_id_ptr->value();
static std::optional<cdc::generation_id> get_generation_id_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
auto gen_id_string = g.get_application_state_value(endpoint, gms::application_state::CDC_GENERATION_ID);
cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, gen_id_string);
return gms::versioned_value::cdc_generation_id_from_string(gen_id_string);
}
@@ -660,21 +624,21 @@ future<> generation_service::maybe_rewrite_streams_descriptions() {
// For each CDC log table get the TTL setting (from CDC options) and the table's creation time
std::vector<time_and_ttl> times_and_ttls;
_db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
auto& s = *t->schema();
for (auto& [_, cf] : _db.get_column_families()) {
auto& s = *cf->schema();
auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
if (!base) {
// Not a CDC log table.
return;
continue;
}
auto& cdc_opts = base->cdc_options();
if (!cdc_opts.enabled()) {
// This table is named like a CDC log table but it's not one.
return;
continue;
}
times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
});
}
if (times_and_ttls.empty()) {
// There's no point in rewriting old generations' streams (they don't contain any data).
@@ -762,8 +726,8 @@ future<> generation_service::stop() {
cdc_log.error("CDC stream rewrite failed: ", std::current_exception());
}
if (_joined && (this_shard_id() == 0)) {
co_await leave_ring();
if (this_shard_id() == 0) {
co_await _gossiper.unregister_(shared_from_this());
}
_stopped = true;
@@ -775,6 +739,7 @@ generation_service::~generation_service() {
future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
assert(_sys_ks.local().bootstrap_complete());
_gen_id = std::move(startup_gen_id);
_gossiper.register_(shared_from_this());
@@ -792,24 +757,18 @@ future<> generation_service::after_join(std::optional<cdc::generation_id>&& star
_cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
}
// Marks the service as no longer joined and unsubscribes it from gossip
// notifications. Must run on shard 0 (asserted), where generation management
// happens.
future<> generation_service::leave_ring() {
    assert_shard_zero(__PRETTY_FUNCTION__);
    _joined = false;
    co_await _gossiper.unregister_(shared_from_this());
}
future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
future<> generation_service::on_join(gms::inet_address ep, gms::endpoint_state ep_state) {
assert_shard_zero(__PRETTY_FUNCTION__);
auto val = ep_state->get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
auto val = ep_state.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
if (!val) {
return make_ready_future();
}
return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val, pid);
return on_change(ep, gms::application_state::CDC_GENERATION_ID, *val);
}
future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v, gms::permit_id) {
future<> generation_service::on_change(gms::inet_address ep, gms::application_state app_state, const gms::versioned_value& v) {
assert_shard_zero(__PRETTY_FUNCTION__);
if (app_state != gms::application_state::CDC_GENERATION_ID) {
@@ -829,21 +788,22 @@ future<> generation_service::check_and_repair_cdc_streams() {
}
std::optional<cdc::generation_id> latest = _gen_id;
_gossiper.for_each_endpoint_state([&] (const gms::inet_address& addr, const gms::endpoint_state& state) {
const auto& endpoint_states = _gossiper.get_endpoint_states();
for (const auto& [addr, state] : endpoint_states) {
if (_gossiper.is_left(addr)) {
cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
return;
continue;
}
if (!_gossiper.is_normal(addr)) {
throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
" ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
}
const auto gen_id = get_generation_id_for(addr, state);
const auto gen_id = get_generation_id_for(addr, _gossiper);
if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
latest = gen_id;
}
});
}
auto tmptr = _token_metadata.get();
auto sys_dist_ks = get_sys_dist_ks();
@@ -898,9 +858,24 @@ future<> generation_service::check_and_repair_cdc_streams() {
" even though some node gossiped about it.",
latest, db_clock::now());
should_regenerate = true;
} else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
should_regenerate = true;
cdc_log.info("CDC generation {} needs repair, regenerating", latest);
} else {
if (tmptr->sorted_tokens().size() != gen->entries().size()) {
// We probably have garbage streams from old generations
cdc_log.info("Generation size does not match the token ring, regenerating");
should_regenerate = true;
} else {
std::unordered_set<dht::token> gen_ends;
for (const auto& entry : gen->entries()) {
gen_ends.insert(entry.token_range_end);
}
for (const auto& metadata_token : tmptr->sorted_tokens()) {
if (!gen_ends.contains(metadata_token)) {
cdc_log.warn("CDC generation {} missing token {}. Regenerating.", latest, metadata_token);
should_regenerate = true;
break;
}
}
}
}
}
@@ -960,13 +935,17 @@ future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::gen
co_return;
}
if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
if (!_sys_ks.local().bootstrap_complete() || !_sys_dist_ks.local_is_initialized()
|| !_sys_dist_ks.local().started()) {
// The service should not be listening for generation changes until after the node
// is bootstrapped. Therefore we would previously assume that this condition
// can never become true and call on_internal_error here, but it turns out that
// it may become true on decommission: the node enters NEEDS_BOOTSTRAP
// state before leaving the token ring, so bootstrap_complete() becomes false.
// In that case we can simply return.
co_return;
}
// The service should not be listening for generation changes until after the node
// is bootstrapped and since the node leaves the ring on decommission
if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
return !svc._cdc_metadata.prepare(ts);
})) {
@@ -1029,12 +1008,12 @@ future<> generation_service::legacy_scan_cdc_generations() {
assert_shard_zero(__PRETTY_FUNCTION__);
std::optional<cdc::generation_id> latest;
_gossiper.for_each_endpoint_state([&] (const gms::inet_address& node, const gms::endpoint_state& eps) {
auto gen_id = get_generation_id_for(node, eps);
for (const auto& ep: _gossiper.get_endpoint_states()) {
auto gen_id = get_generation_id_for(ep.first, _gossiper);
if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
latest = gen_id;
}
});
}
if (latest) {
cdc_log.info("Latest generation seen during startup: {}", *latest);
@@ -1111,8 +1090,19 @@ shared_ptr<db::system_distributed_keyspace> generation_service::get_sys_dist_ks(
return _sys_dist_ks.local_shared();
}
std::ostream& operator<<(std::ostream& os, const generation_id& gen_id) {
std::visit(make_visitor(
[&os] (const generation_id_v1& id) { os << id.ts; },
[&os] (const generation_id_v2& id) { os << "(" << id.ts << ", " << id.id << ")"; }
), gen_id);
return os;
}
db_clock::time_point get_ts(const generation_id& gen_id) {
return std::visit([] (auto& id) { return id.ts; }, gen_id);
return std::visit(make_visitor(
[] (const generation_id_v1& id) { return id.ts; },
[] (const generation_id_v2& id) { return id.ts; }
), gen_id);
}
} // namespace cdc

View File

@@ -92,13 +92,13 @@ struct token_range_description {
* in the `_entries` vector. See the comment above `token_range_description` for explanation.
*/
class topology_description {
utils::chunked_vector<token_range_description> _entries;
std::vector<token_range_description> _entries;
public:
topology_description(utils::chunked_vector<token_range_description> entries);
topology_description(std::vector<token_range_description> entries);
bool operator==(const topology_description&) const;
const utils::chunked_vector<token_range_description>& entries() const&;
utils::chunked_vector<token_range_description>&& entries() &&;
const std::vector<token_range_description>& entries() const&;
std::vector<token_range_description>&& entries() &&;
};
/**
@@ -133,28 +133,7 @@ public:
*/
bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
/*
* Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
* with `token_metadata`.
*/
bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm);
/*
* Generate a set of CDC stream identifiers such that for each shard
* and vnode pair there exists a stream whose token falls into this vnode
* and is owned by this shard. It is sometimes not possible to generate
* a CDC stream identifier for some (vnode, shard) pair because not all
* shards have to own tokens in a vnode. Small vnode can be totally owned
* by a single shard. In such case, a stream identifier that maps to
* end of the vnode is generated.
*
* Then build a cdc::topology_description which maps tokens to generated
* stream identifiers, such that if token T is owned by shard S in vnode V,
* it gets mapped to the stream identifier generated for (S, V).
*
* Run in seastar::async context.
*/
cdc::topology_description make_new_generation_description(
std::pair<utils::UUID, cdc::topology_description> make_new_generation_data(
const std::unordered_set<dht::token>& bootstrap_tokens,
const noncopyable_function<std::pair<size_t, uint8_t> (dht::token)>& get_sharding_info,
const locator::token_metadata_ptr);
@@ -165,20 +144,9 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
// using `mutation_size_threshold` to decide on the mutation sizes. The partition key of each mutation
// is given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
//
// Works only for the CDC_GENERATIONS_V2 schema (in system_distributed keyspace).
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v2(
schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);
// The partition key of all rows in the single-partition CDC_GENERATIONS_V3 schema (in system keyspace).
static constexpr auto CDC_GENERATIONS_V3_KEY = "cdc_generations";
// Translates the CDC generation data given by a `cdc::topology_description` into a vector of mutations,
// using `mutation_size_threshold` to decide on the mutation sizes. The first clustering key column is
// given by `gen_uuid`. The timestamp of each cell in each mutation is given by `mutation_timestamp`.
//
// Works only for the CDC_GENERATIONS_V3 schema (in system keyspace).
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
// Works for only specific schemas: CDC_GENERATIONS_V2 (in system_distributed_keyspace)
// and CDC_GENERATIONS_V3 (in system_keyspace).
future<utils::chunked_vector<mutation>> get_cdc_generation_mutations(
schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);

View File

@@ -28,35 +28,7 @@ struct generation_id_v2 {
using generation_id = std::variant<generation_id_v1, generation_id_v2>;
std::ostream& operator<<(std::ostream&, const generation_id&);
db_clock::time_point get_ts(const generation_id&);
} // namespace cdc
// fmt formatter for v1 generation ids: prints just the timestamp,
// mirroring the ostream operator<< for generation_id.
template <>
struct fmt::formatter<cdc::generation_id_v1> {
    // No format-spec options are accepted; parsing consumes nothing.
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const cdc::generation_id_v1& gen_id, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{}", gen_id.ts);
    }
};
// fmt formatter for v2 generation ids: prints "(timestamp, uuid)",
// mirroring the ostream operator<< for generation_id.
template <>
struct fmt::formatter<cdc::generation_id_v2> {
    // No format-spec options are accepted; parsing consumes nothing.
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const cdc::generation_id_v2& gen_id, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "({}, {})", gen_id.ts, gen_id.id);
    }
};
// fmt formatter for the generation_id variant: dispatches to the
// alternative's own formatter via std::visit.
template <>
struct fmt::formatter<cdc::generation_id> {
    // No format-spec options are accepted; parsing consumes nothing.
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const cdc::generation_id& gen_id, FormatContext& ctx) const {
        return std::visit([&ctx] (auto& id) {
            return fmt::format_to(ctx.out(), "{}", id);
        }, gen_id);
    }
};

View File

@@ -98,20 +98,19 @@ public:
* Must be called on shard 0 - that's where the generation management happens.
*/
future<> after_join(std::optional<cdc::generation_id>&& startup_gen_id);
future<> leave_ring();
cdc::metadata& get_cdc_metadata() {
return _cdc_metadata;
}
virtual future<> before_change(gms::inet_address, gms::endpoint_state_ptr, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
virtual future<> on_alive(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
virtual future<> on_dead(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
virtual future<> on_remove(gms::inet_address, gms::permit_id) override { return make_ready_future(); }
virtual future<> on_restart(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override { return make_ready_future(); }
virtual future<> before_change(gms::inet_address, gms::endpoint_state, gms::application_state, const gms::versioned_value&) override { return make_ready_future(); }
virtual future<> on_alive(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
virtual future<> on_dead(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
virtual future<> on_remove(gms::inet_address) override { return make_ready_future(); }
virtual future<> on_restart(gms::inet_address, gms::endpoint_state) override { return make_ready_future(); }
virtual future<> on_join(gms::inet_address, gms::endpoint_state_ptr, gms::permit_id) override;
virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&, gms::permit_id) override;
virtual future<> on_join(gms::inet_address, gms::endpoint_state) override;
virtual future<> on_change(gms::inet_address, gms::application_state, const gms::versioned_value&) override;
future<> check_and_repair_cdc_streams();

View File

@@ -395,6 +395,9 @@ bool cdc::options::operator==(const options& o) const {
return enabled() == o.enabled() && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl
&& _delta_mode == o._delta_mode;
}
bool cdc::options::operator!=(const options& o) const {
return !(*this == o);
}
namespace cdc {
@@ -632,6 +635,9 @@ public:
bool operator==(const collection_iterator& x) const {
return _v == x._v;
}
bool operator!=(const collection_iterator& x) const {
return !(*this == x);
}
private:
void next() {
--_rem;

View File

@@ -40,7 +40,7 @@ static cdc::stream_id get_stream(
// non-static for testing
cdc::stream_id get_stream(
const utils::chunked_vector<cdc::token_range_description>& entries,
const std::vector<cdc::token_range_description>& entries,
dht::token tok) {
if (entries.empty()) {
on_internal_error(cdc_log, "get_stream: entries empty");

View File

@@ -389,7 +389,7 @@ struct extract_changes_visitor {
}
void partition_delete(const tombstone& t) {
_result[t.timestamp].partition_deletions = partition_deletion{t};
_result[t.timestamp].partition_deletions = {t};
}
constexpr bool finished() const { return false; }

View File

@@ -93,6 +93,9 @@ public:
bool operator==(const iterator& other) const {
return _position == other._position;
}
bool operator!=(const iterator& other) const {
return !(*this == other);
}
};
public:
explicit partition_cells_range(const mutation_partition& mp) : _mp(mp) { }

View File

@@ -21,27 +21,27 @@ public:
: file_impl(*get_file_impl(f)), _error_handler(error_handler), _file(f) {
}
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, io_intent* intent) override {
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->write_dma(pos, buffer, len, intent);
return get_file_impl(_file)->write_dma(pos, buffer, len, pc);
});
}
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->write_dma(pos, iov, intent);
return get_file_impl(_file)->write_dma(pos, iov, pc);
});
}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, io_intent* intent) override {
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->read_dma(pos, buffer, len, intent);
return get_file_impl(_file)->read_dma(pos, buffer, len, pc);
});
}
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->read_dma(pos, iov, intent);
return get_file_impl(_file)->read_dma(pos, iov, pc);
});
}
@@ -99,9 +99,9 @@ public:
});
}
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent) override {
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->dma_read_bulk(offset, range_size, intent);
return get_file_impl(_file)->dma_read_bulk(offset, range_size, pc);
});
}
private:

View File

@@ -15,6 +15,12 @@
std::atomic<int64_t> clocks_offset;
std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
auto t = db_clock::to_time_t(tp);
::tm t_buf;
return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
}
std::string format_timestamp(api::timestamp_type ts) {
auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
::tm t_buf;

View File

@@ -75,7 +75,8 @@ public:
const interval::interval_type& iv = *_i;
return position_range{iv.lower().position(), iv.upper().position()};
}
bool operator==(const position_range_iterator& other) const = default;
bool operator==(const position_range_iterator& other) const { return _i == other._i; }
bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
position_range_iterator& operator++() {
++_i;
return *this;

View File

@@ -1,31 +1,20 @@
###
### Generate version file and supply appropriate compile definitions for release.cc
###
function(generate_scylla_version)
function(add_version_library name source)
set(version_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-VERSION-FILE)
set(release_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-RELEASE-FILE)
set(product_file ${CMAKE_CURRENT_BINARY_DIR}/SCYLLA-PRODUCT-FILE)
execute_process(
COMMAND ${CMAKE_SOURCE_DIR}/SCYLLA-VERSION-GEN --output-dir "${CMAKE_CURRENT_BINARY_DIR}"
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
file(STRINGS ${version_file} scylla_version)
file(STRINGS ${release_file} scylla_release)
file(STRINGS ${product_file} scylla_product)
string(REPLACE "-" "~" scylla_version_tilde ${scylla_version})
set(Scylla_VERSION "${scylla_version_tilde}" CACHE INTERNAL "")
set(Scylla_RELEASE "${scylla_release}" CACHE INTERNAL "")
set(Scylla_PRODUCT "${scylla_product}" CACHE INTERNAL "")
endfunction(generate_scylla_version)
function(add_version_library name source)
add_library(${name} OBJECT ${source})
target_compile_definitions(${name}
PRIVATE
SCYLLA_VERSION=\"${Scylla_VERSION}\"
SCYLLA_RELEASE=\"${Scylla_RELEASE}\")
SCYLLA_VERSION=\"${scylla_version}\"
SCYLLA_RELEASE=\"${scylla_release}\")
target_link_libraries(${name}
PRIVATE
Seastar::seastar)

View File

@@ -5,6 +5,15 @@
# actually compiling a sample program.
function(add_whole_archive name library)
add_library(${name} INTERFACE)
target_link_libraries(${name} INTERFACE
"$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
target_link_libraries(${name} INTERFACE
"$<LINK_LIBRARY:WHOLE_ARCHIVE,${library}>")
else()
add_dependencies(${name} ${library})
target_include_directories(${name} INTERFACE
${CMAKE_SOURCE_DIR})
target_link_options(auth INTERFACE
"$<$<CXX_COMPILER_ID:Clang>:SHELL:LINKER:-force_load $<TARGET_LINKER_FILE:${library}>>"
"$<$<CXX_COMPILER_ID:GNU>:SHELL:LINKER:--whole-archive $<TARGET_LINKER_FILE:${library}> LINKER:--no-whole-archive>")
endif()
endfunction()

View File

@@ -1,50 +0,0 @@
function(build_submodule name dir)
cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
set(version_release "${Scylla_VERSION}-${Scylla_RELEASE}")
set(product_version_release
"${Scylla_PRODUCT}-${Scylla_VERSION}-${Scylla_RELEASE}")
set(working_dir ${CMAKE_CURRENT_SOURCE_DIR}/${dir})
if(parsed_args_NOARCH)
set(arch "noarch")
else()
set(arch "${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(reloc_args ${parsed_args_UNPARSED_ARGUMENTS})
set(reloc_pkg "${working_dir}/build/${Scylla_PRODUCT}-${name}-${version_release}.${arch}.tar.gz")
add_custom_command(
OUTPUT ${reloc_pkg}
COMMAND reloc/build_reloc.sh --version ${product_version_release} --nodeps ${reloc_args}
WORKING_DIRECTORY "${working_dir}"
JOB_POOL submodule_pool)
add_custom_target(dist-${name}-tar
DEPENDS ${reloc_pkg})
add_custom_target(dist-${name}-rpm
COMMAND reloc/build_rpm.sh --reloc-pkg ${reloc_pkg}
DEPENDS ${reloc_pkg}
WORKING_DIRECTORY "${working_dir}")
add_custom_target(dist-${name}-deb
COMMAND reloc/build_deb.sh --reloc-pkg ${reloc_pkg}
DEPENDS ${reloc_pkg}
WORKING_DIRECTORY "${working_dir}")
add_custom_target(dist-${name}
DEPENDS dist-${name}-tar dist-${name}-rpm dist-${name}-deb)
endfunction()
macro(dist_submodule name dir pkgs)
# defined as a macro, so that we can append the path to the dist tarball to
# specfied "pkgs"
cmake_parse_arguments(parsed_args "NOARCH" "" "" ${ARGN})
if(parsed_args_NOARCH)
set(arch "noarch")
else()
set(arch "${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(pkg_name "${Scylla_PRODUCT}-${name}-${Scylla_VERSION}-${Scylla_RELEASE}.${arch}.tar.gz")
set(reloc_pkg "${CMAKE_SOURCE_DIR}/tools/${dir}/build/${pkg_name}")
set(dist_pkg "${CMAKE_CURRENT_BINARY_DIR}/${pkg_name}")
add_custom_command(
OUTPUT ${dist_pkg}
COMMAND ${CMAKE_COMMAND} -E copy ${reloc_pkg} ${dist_pkg}
DEPENDS dist-${name}-tar)
list(APPEND ${pkgs} "${dist_pkg}")
endmacro()

View File

@@ -1,5 +1,7 @@
find_program (ANTLR3 antlr3
REQUIRED)
find_program (ANTLR3 antlr3)
if(NOT ANTLR3)
message(FATAL "antlr3 is required")
endif()
# Parse antlr3 grammar files and generate C++ sources
function(generate_cql_grammar)

View File

@@ -1,23 +0,0 @@
set(Seastar_OptimizationLevel_COVERAGE "g")
set(CMAKE_CXX_FLAGS_COVERAGE
""
CACHE
INTERNAL
"")
string(APPEND CMAKE_CXX_FLAGS_COVERAGE
" -O${Seastar_OptimizationLevel_SANITIZE}")
set(Seastar_DEFINITIONS_COVERAGE
SCYLLA_BUILD_MODE=debug
DEBUG
SANITIZE
DEBUG_LSA_SANITIZER
SCYLLA_ENABLE_ERROR_INJECTION)
set(CMAKE_CXX_FLAGS_COVERAGE
" -O${Seastar_OptimizationLevel_COVERAGE} -fprofile-instr-generate -fcoverage-mapping -g -gz")
set(CMAKE_STATIC_LINKER_FLAGS_COVERAGE
"-fprofile-instr-generate -fcoverage-mapping")
set(stack_usage_threshold_in_KB 40)

View File

@@ -12,16 +12,16 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
else()
set(clang_inline_threshold 2500)
endif()
add_compile_options(
"$<$<CXX_COMPILER_ID:GNU>:--param;inline-unit-growth=300>"
"$<$<CXX_COMPILER_ID:Clang>:-mllvm;-inline-threshold=${clang_inline_threshold}>"
string(APPEND CMAKE_CXX_FLAGS_RELEASE
" $<$<CXX_COMPILER_ID:GNU>:--param inline-unit-growth=300"
" $<$<CXX_COMPILER_ID:Clang>:-mllvm -inline-threshold=${clang_inline_threshold}>"
# clang generates 16-byte loads that break store-to-load forwarding
# gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
"-fno-slp-vectorize")
set(Seastar_DEFINITIONS_RELEASE
" -fno-slp-vectorize")
set(Seastar_DEFINITIONS_DEBUG
SCYLLA_BUILD_MODE=release)
set(CMAKE_EXE_LINKER_FLAGS_RELEASE
set(CMAKE_STATIC_LINKER_FLAGS_RELEASE
"-Wl,--gc-sections")
set(stack_usage_threshold_in_KB 13)

View File

@@ -1,17 +0,0 @@
set(Seastar_OptimizationLevel_SANITIZE "s")
set(CMAKE_CXX_FLAGS_SANITIZE
""
CACHE
INTERNAL
"")
string(APPEND CMAKE_CXX_FLAGS_SANITIZE
" -O${Seastar_OptimizationLevel_SANITIZE}")
set(Seastar_DEFINITIONS_SANITIZE
SCYLLA_BUILD_MODE=sanitize
DEBUG
SANITIZE
DEBUG_LSA_SANITIZER
SCYLLA_ENABLE_ERROR_INJECTION)
set(stack_usage_threshold_in_KB 50)

View File

@@ -1,7 +1,9 @@
set(disabled_warnings
c++11-narrowing
mismatched-tags
missing-braces
overloaded-virtual
parentheses-equality
unsupported-friend)
include(CheckCXXCompilerFlag)
foreach(warning ${disabled_warnings})
@@ -11,23 +13,13 @@ foreach(warning ${disabled_warnings})
endif()
endforeach()
list(TRANSFORM _supported_warnings PREPEND "-Wno-")
string(JOIN " " CMAKE_CXX_FLAGS
"-Wall"
"-Werror"
"-Wno-error=deprecated-declarations"
"-Wimplicit-fallthrough"
${_supported_warnings})
string(JOIN " " CMAKE_CXX_FLAGS "-Wall" "-Werror" ${_supported_warnings})
function(default_target_arch arch)
set(x86_instruction_sets i386 i686 x86_64)
if(CMAKE_SYSTEM_PROCESSOR IN_LIST x86_instruction_sets)
set(${arch} "westmere" PARENT_SCOPE)
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
# we always use intrinsics like vmull.p64 for speeding up crc32 calculations
# on the aarch64 architectures, and they require the crypto extension, so
# we have to add "+crypto" in the architecture flags passed to -march. the
# same applies to crc32 instructions, which need the ARMv8-A CRC32 extension
# please note, Seastar also sets -march when compiled with DPDK enabled.
elseif(CMAKE_SYSTEM_PROCESSOR EQUAL "aarch64")
set(${arch} "armv8-a+crc+crypto" PARENT_SCOPE)
else()
set(${arch} "" PARENT_SCOPE)

View File

@@ -29,27 +29,32 @@
#include <seastar/core/shared_ptr.hh>
#include "dht/i_partitioner.hh"
#include "sstables/exceptions.hh"
#include "sstables/sstables.hh"
#include "sstables/sstable_writer.hh"
#include "sstables/progress_monitor.hh"
#include "sstables/sstables_manager.hh"
#include "compaction.hh"
#include "compaction_manager.hh"
#include "schema/schema.hh"
#include "db/system_keyspace.hh"
#include "service/priority_manager.hh"
#include "db_clock.hh"
#include "mutation/mutation_compactor.hh"
#include "leveled_manifest.hh"
#include "dht/token.hh"
#include "dht/partition_filter.hh"
#include "mutation_writer/shard_based_splitting_writer.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "mutation/mutation_source_metadata.hh"
#include "mutation/mutation_fragment_stream_validator.hh"
#include "utils/UUID_gen.hh"
#include "utils/utf8.hh"
#include "utils/fmt-compat.hh"
#include "utils/error_injection.hh"
#include "readers/multi_range.hh"
#include "readers/filtering.hh"
#include "readers/compacting.hh"
#include "tombstone_gc.hh"
#include "replica/database.hh"
#include "keys.hh"
namespace sstables {
@@ -143,12 +148,27 @@ std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::quara
return os << to_string(quarantine_mode);
}
static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
return api::min_timestamp;
std::ostream& operator<<(std::ostream& os, pretty_printed_data_size data) {
static constexpr const char* suffixes[] = { " bytes", "kB", "MB", "GB", "TB", "PB" };
unsigned exp = 0;
while ((data._size >= 1000) && (exp < sizeof(suffixes))) {
exp++;
data._size /= 1000;
}
os << data._size << suffixes[exp];
return os;
}
std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
uint64_t throughput = tp._duration.count() > 0 ? tp._size / tp._duration.count() : 0;
os << pretty_printed_data_size(throughput) << "/s";
return os;
}
static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
auto timestamp = table_s.min_memtable_timestamp();
std::optional<utils::hashed_key> hk;
for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -159,7 +179,6 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
}
if (sst->filter_has_key(*hk)) {
bloom_filter_checks++;
timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
}
}
@@ -395,12 +414,9 @@ private:
class formatted_sstables_list {
bool _include_origin = true;
std::vector<std::string> _ssts;
std::vector<sstring> _ssts;
public:
formatted_sstables_list() = default;
void reserve(size_t n) {
_ssts.reserve(n);
}
explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
_ssts.reserve(ssts.size());
for (const auto& sst : ssts) {
@@ -415,7 +431,9 @@ public:
};
std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
fmt::print(os, "[{}]", fmt::join(lst._ssts, ","));
os << "[";
os << boost::algorithm::join(lst._ssts, ",");
os << "]";
return os;
}
@@ -423,9 +441,9 @@ class compaction {
protected:
compaction_data& _cdata;
table_state& _table_s;
const compaction_sstable_creator_fn _sstable_creator;
const schema_ptr _schema;
const reader_permit _permit;
compaction_sstable_creator_fn _sstable_creator;
schema_ptr _schema;
reader_permit _permit;
std::vector<shared_sstable> _sstables;
std::vector<generation_type> _input_sstable_generations;
// Unused sstables are tracked because if compaction is interrupted we can only delete them.
@@ -434,32 +452,28 @@ protected:
std::vector<shared_sstable> _new_unused_sstables;
std::vector<shared_sstable> _all_new_sstables;
lw_shared_ptr<sstable_set> _compacting;
const sstables::compaction_type _type;
const uint64_t _max_sstable_size;
const uint32_t _sstable_level;
sstables::compaction_type _type;
uint64_t _max_sstable_size;
uint32_t _sstable_level;
uint64_t _start_size = 0;
uint64_t _end_size = 0;
// fully expired files, which are skipped, aren't taken into account.
uint64_t _compacting_data_file_size = 0;
uint64_t _estimated_partitions = 0;
uint64_t _bloom_filter_checks = 0;
db::replay_position _rp;
encoding_stats_collector _stats_collector;
const bool _can_split_large_partition = false;
bool _can_split_large_partition = false;
bool _contains_multi_fragment_runs = false;
mutation_source_metadata _ms_metadata = {};
const compaction_sstable_replacer_fn _replacer;
const run_id _run_identifier;
compaction_sstable_replacer_fn _replacer;
run_id _run_identifier;
::io_priority_class _io_priority;
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
std::optional<sstable_set> _sstable_set;
// used to incrementally calculate max purgeable timestamp, as we iterate through decorated keys.
std::optional<sstable_set::incremental_selector> _selector;
std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
// optional owned_ranges vector for cleanup;
const owned_ranges_ptr _owned_ranges = {};
// required for reshard compaction.
const dht::sharder* _sharder = nullptr;
const std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
owned_ranges_ptr _owned_ranges = {};
std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
// Garbage collected sstables that are sealed but were not added to SSTable set yet.
std::vector<shared_sstable> _unused_garbage_collected_sstables;
// Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
@@ -484,11 +498,11 @@ protected:
, _can_split_large_partition(descriptor.can_split_large_partition)
, _replacer(std::move(descriptor.replacer))
, _run_identifier(descriptor.run_identifier)
, _io_priority(descriptor.io_priority)
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
, _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstable_set::incremental_selector>{})
, _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
, _owned_ranges(std::move(descriptor.owned_ranges))
, _sharder(descriptor.sharder)
, _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
{
for (auto& sst : _sstables) {
@@ -503,9 +517,9 @@ protected:
virtual uint64_t partitions_per_sstable() const {
// some tests use _max_sstable_size == 0 for force many one partition per sstable
auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_compacting_data_file_size) / max_sstable_size)));
uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
}
void setup_new_sstable(shared_sstable& sst) {
@@ -557,17 +571,18 @@ protected:
// Tombstone expiration is enabled based on the presence of sstable set.
// If it's not present, we cannot purge tombstones without the risk of resurrecting data.
bool tombstone_expiration_enabled() const {
return bool(_sstable_set) && _table_s.tombstone_gc_enabled();
return bool(_sstable_set);
}
compaction_writer create_gc_compaction_writer(run_id gc_run) const {
compaction_writer create_gc_compaction_writer() const {
auto sst = _sstable_creator(this_shard_id());
auto&& priority = _io_priority;
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
cfg.run_identifier = gc_run;
cfg.run_identifier = _run_identifier;
cfg.monitor = monitor.get();
auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats());
auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
}
@@ -586,14 +601,8 @@ protected:
// When compaction finishes, all the temporary sstables generated here will be deleted and removed
// from table's sstable set.
compacted_fragments_writer get_gc_compacted_fragments_writer() {
// because the temporary sstable run can overlap with the non-gc sstables run created by
// get_compacted_fragments_writer(), we have to use a different run_id. the gc_run_id is
// created here as:
// 1. it can be shared across all sstables created by this writer
// 2. it is optional, as gc writer is not always used
auto gc_run = run_id::create_random_id();
return compacted_fragments_writer(*this,
[this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
[this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
[this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
_stop_request_observable);
}
@@ -610,8 +619,23 @@ protected:
return _used_garbage_collected_sstables;
}
virtual bool enable_garbage_collected_sstable_writer() const noexcept {
return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
bool enable_garbage_collected_sstable_writer() const noexcept {
return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
}
flat_mutation_reader_v2::filter make_partition_filter() const {
return [this] (const dht::decorated_key& dk) {
#ifdef SEASTAR_DEBUG
// sstables should never be shared with other shards at this point.
assert(dht::shard_of(*_schema, dk.token()) == this_shard_id());
#endif
if (!_owned_ranges_checker->belongs_to_current_node(dk.token())) {
log_trace("Token {} does not belong to this node, skipping", dk.token());
return false;
}
return true;
};
}
public:
compaction& operator=(const compaction&) = delete;
@@ -624,55 +648,17 @@ public:
}
private:
// Default range sstable reader that will only return mutation that belongs to current shard.
virtual flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding) const = 0;
virtual flat_mutation_reader_v2 make_sstable_reader() const = 0;
// Make a filtering reader if needed
// FIXME: the sstable reader itself should be pass the owned ranges
// so it can skip over the disowned ranges efficiently using the index.
// Ref https://github.com/scylladb/scylladb/issues/12998
flat_mutation_reader_v2 setup_sstable_reader() const {
if (!_owned_ranges_checker) {
return make_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
return make_sstable_reader();
}
auto source = mutation_source([this] (schema_ptr s,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
log_trace("Creating sstable set reader with range {}", range);
return make_sstable_reader(std::move(s),
std::move(permit),
range,
slice,
std::move(trace_state),
fwd,
fwd_mr);
});
auto owned_range_generator = [this] () -> std::optional<dht::partition_range> {
auto r = _owned_ranges_checker->next_owned_range();
if (r == nullptr) {
return std::nullopt;
}
log_trace("Skipping to the next owned range {}", *r);
return dht::to_partition_range(*r);
};
return make_flat_multi_range_reader(_schema, _permit, std::move(source),
std::move(owned_range_generator),
_schema->full_slice(),
tracing::trace_state_ptr());
return make_filtering_reader(make_sstable_reader(), make_partition_filter());
}
virtual sstables::sstable_set make_sstable_set_for_input() const {
@@ -682,7 +668,6 @@ private:
future<> setup() {
auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
formatted_sstables_list formatted_msg;
formatted_msg.reserve(_sstables.size());
auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
min_max_tracker<api::timestamp_type> timestamp_tracker;
@@ -712,7 +697,6 @@ private:
// for a better estimate for the number of partitions in the merged
// sstable than just adding up the lengths of individual sstables.
_estimated_partitions += sst->get_estimated_key_count();
_compacting_data_file_size += sst->ondisk_data_size();
// TODO:
// Note that this is not fully correct. Since we might be merging sstables that originated on
// another shard (#cpu changed), we might be comparing RP:s with differing shard ids,
@@ -741,7 +725,7 @@ private:
auto consumer = make_interposer_consumer([this] (flat_mutation_reader_v2 reader) mutable {
return seastar::async([this, reader = std::move(reader)] () mutable {
auto close_reader = deferred_close(reader);
auto cfc = get_compacted_fragments_writer();
auto cfc = compacted_fragments_writer(get_compacted_fragments_writer());
reader.consume_in_thread(std::move(cfc));
});
});
@@ -800,7 +784,6 @@ protected:
.ended_at = ended_at,
.start_size = _start_size,
.end_size = _end_size,
.bloom_filter_checks = _bloom_filter_checks,
},
};
@@ -819,8 +802,8 @@ protected:
// By the time being, using estimated key count.
log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
report_finish_desc(),
_input_sstable_generations.size(), new_sstables_msg, utils::pretty_printed_data_size(_start_size), utils::pretty_printed_data_size(_end_size), int(ratio * 100),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), utils::pretty_printed_throughput(_start_size, duration),
_input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
_cdata.total_partitions, _cdata.total_keys_written);
return ret;
@@ -841,7 +824,7 @@ private:
};
}
return [this] (const dht::decorated_key& dk) {
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
};
}
@@ -957,7 +940,7 @@ void compacted_fragments_writer::split_large_partition() {
_c.log_debug("Closing active tombstone {} with {} for partition {}", _current_partition.current_emitted_tombstone, rtc, *_current_partition.dk);
_compaction_writer->writer.consume(std::move(rtc));
}
_c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, utils::pretty_printed_data_size(_c._max_sstable_size));
_c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, pretty_printed_data_size(_c._max_sstable_size));
// Close partition in current writer, and open it again in a new writer.
do_consume_end_of_partition();
stop_current_writer();
@@ -1041,6 +1024,51 @@ void compacted_fragments_writer::consume_end_of_stream() {
}
}
class reshape_compaction : public compaction {
public:
reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: compaction(table_s, std::move(descriptor), cdata) {
}
virtual sstables::sstable_set make_sstable_set_for_input() const override {
return sstables::make_partitioned_sstable_set(_schema, false);
}
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
default_read_monitor_generator());
}
std::string_view report_start_desc() const override {
return "Reshaping";
}
std::string_view report_finish_desc() const override {
return "Reshaped";
}
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (writer) {
finish_new_sstable(writer);
}
}
};
class regular_compaction : public compaction {
// keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
mutable compaction_read_monitor_generator _monitor_generator;
@@ -1052,20 +1080,15 @@ public:
{
}
flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
tracing::trace_state_ptr trace,
streamed_mutation::forwarding sm_fwd,
mutation_reader::forwarding mr_fwd) const override {
return _compacting->make_local_shard_sstable_reader(std::move(s),
std::move(permit),
range,
slice,
std::move(trace),
sm_fwd,
mr_fwd,
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
_monitor_generator);
}
@@ -1084,7 +1107,7 @@ public:
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstable_writer_config cfg = make_sstable_writer_config(_type);
cfg.monitor = monitor.get();
return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
@@ -1155,13 +1178,12 @@ private:
}
void update_pending_ranges() {
auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
return;
}
// Releases reference to sstables compacted by this compaction or another, both of which belongs
// to the same column family
for (auto& pending_replacement : pending_replacements) {
for (auto& pending_replacement : _cdata.pending_replacements) {
for (auto& sst : pending_replacement.removed) {
// Set may not contain sstable to be removed because this compaction may have started
// before the creation of that sstable.
@@ -1175,75 +1197,7 @@ private:
}
}
_selector.emplace(_sstable_set->make_incremental_selector());
}
};
// Reshape compaction: rewrites a set of sstables into a shape that the
// compaction strategy prefers. Behaves like a regular compaction when an
// sstable replacer is provided (incremental replacement of inputs), and
// otherwise just seals its outputs without touching the input set.
class reshape_compaction : public regular_compaction {
private:
    // True when the descriptor supplied a replacer callback; gates the
    // regular-compaction replacement/finish paths below.
    bool has_sstable_replacer() const noexcept {
        return bool(_replacer);
    }
public:
    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
        : regular_compaction(table_s, std::move(descriptor), cdata) {
    }
    // Input set for reshape is a partitioned sstable set built over the
    // table's schema (second argument `false` — see
    // make_partitioned_sstable_set for its meaning).
    virtual sstables::sstable_set make_sstable_set_for_input() const override {
        return sstables::make_partitioned_sstable_set(_schema, false);
    }
    // Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
    }
    // Reads the inputs through a local-shard reader with the default (no-op)
    // read monitor generator, i.e. without backlog tracking.
    flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
            reader_permit permit,
            const dht::partition_range& range,
            const query::partition_slice& slice,
            tracing::trace_state_ptr trace,
            streamed_mutation::forwarding sm_fwd,
            mutation_reader::forwarding mr_fwd) const override {
        return _compacting->make_local_shard_sstable_reader(std::move(s),
                std::move(permit),
                range,
                slice,
                std::move(trace),
                sm_fwd,
                mr_fwd,
                default_read_monitor_generator());
    }
    // Human-readable label logged when this compaction starts.
    std::string_view report_start_desc() const override {
        return "Reshaping";
    }
    // Human-readable label logged when this compaction finishes.
    std::string_view report_finish_desc() const override {
        return "Reshaped";
    }
    // Creates a writer for a new output sstable on the current shard; the key
    // argument is unused by reshape.
    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
        auto sst = _sstable_creator(this_shard_id());
        setup_new_sstable(sst);
        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
    }
    // With a replacer, defer to regular compaction (which participates in
    // incremental replacement); without one, just seal the output sstable.
    virtual void stop_sstable_writer(compaction_writer* writer) override {
        if (writer) {
            if (has_sstable_replacer()) {
                regular_compaction::stop_sstable_writer(writer);
            } else {
                finish_new_sstable(writer);
            }
        }
    }
    // Finish the regular-compaction replacement protocol only if a replacer
    // exists; always drop any replacements still pending in the shared
    // compaction data.
    virtual void on_end_of_compaction() override {
        if (has_sstable_replacer()) {
            regular_compaction::on_end_of_compaction();
        }
        _cdata.pending_replacements.clear();
    }
};
@@ -1287,8 +1241,62 @@ public:
class scrub_compaction final : public regular_compaction {
public:
static void report_validation_error(compaction_type type, const ::schema& schema, sstring what, std::string_view action = "") {
clogger.error("[{} compaction {}.{}] {}{}{}", type, schema.ks_name(), schema.cf_name(), what, action.empty() ? "" : "; ", action);
// Logs an error for a partition key that is out-of-order relative to the
// previous partition seen by `validator`. `action` (optional) describes the
// corrective step taken by the caller and is appended to the message.
static void report_invalid_partition(compaction_type type, mutation_fragment_stream_validator& validator, const dht::decorated_key& new_key,
        std::string_view action = "") {
    const auto& schema = validator.schema();
    const auto& current_key = validator.previous_partition_key();
    // Keys are printed twice: once resolved against the schema (readable) and
    // once raw (the decorated key itself).
    clogger.error("[{} compaction {}.{}] Invalid partition {} ({}), partition is out-of-order compared to previous partition {} ({}){}{}",
            type,
            schema.ks_name(),
            schema.cf_name(),
            new_key.key().with_schema(schema),
            new_key,
            current_key.key().with_schema(schema),
            current_key,
            action.empty() ? "" : "; ",
            action);
}
// Logs an error for a partition-start fragment that arrived while the
// previous partition (as tracked by `validator`) was never closed with a
// partition-end fragment. `action` (optional) describes the corrective step
// taken by the caller.
static void report_invalid_partition_start(compaction_type type, mutation_fragment_stream_validator& validator, const dht::decorated_key& new_key,
        std::string_view action = "") {
    const auto& schema = validator.schema();
    const auto& current_key = validator.previous_partition_key();
    clogger.error("[{} compaction {}.{}] Invalid partition start for partition {} ({}), previous partition {} ({}) didn't end with a partition-end fragment{}{}",
            type,
            schema.ks_name(),
            schema.cf_name(),
            new_key.key().with_schema(schema),
            new_key,
            current_key.key().with_schema(schema),
            current_key,
            action.empty() ? "" : "; ",
            action);
}
// Logs an error for a mutation fragment `mf` that is out-of-order within its
// partition compared to the last fragment position tracked by `validator`.
// Fragment keys are only printed when present (not all fragment kinds carry
// a clustering key). `action` (optional) describes the corrective step taken.
static void report_invalid_mutation_fragment(compaction_type type, mutation_fragment_stream_validator& validator, const mutation_fragment_v2& mf,
        std::string_view action = "") {
    const auto& schema = validator.schema();
    const auto& key = validator.previous_partition_key();
    const auto prev_pos = validator.previous_position();
    clogger.error("[{} compaction {}.{}] Invalid {} fragment{} ({}) in partition {} ({}),"
            " fragment is out-of-order compared to previous {} fragment{} ({}){}{}",
            type,
            schema.ks_name(),
            schema.cf_name(),
            mf.mutation_fragment_kind(),
            mf.has_key() ? format(" with key {}", mf.key().with_schema(schema)) : "",
            mf.position(),
            key.key().with_schema(schema),
            key,
            prev_pos.region(),
            prev_pos.has_key() ? format(" with key {}", prev_pos.key().with_schema(schema)) : "",
            prev_pos,
            action.empty() ? "" : "; ",
            action);
}
// Logs an error for a fragment stream that ended while its last partition
// (as tracked by `validator`) was never closed with a partition-end
// fragment. `action` (optional) describes the corrective step taken.
static void report_invalid_end_of_stream(compaction_type type, mutation_fragment_stream_validator& validator, std::string_view action = "") {
    const auto& schema = validator.schema();
    const auto& key = validator.previous_partition_key();
    clogger.error("[{} compaction {}.{}] Invalid end-of-stream, last partition {} ({}) didn't end with a partition-end fragment{}{}",
            type, schema.ks_name(), schema.cf_name(), key.key().with_schema(schema), key, action.empty() ? "" : "; ", action);
}
private:
@@ -1311,9 +1319,9 @@ private:
++_validation_errors;
}
void on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
auto report_fn = [this, error] (std::string_view action = "") {
report_validation_error(compaction_type::Scrub, *_schema, error, action);
void on_unexpected_partition_start(const mutation_fragment_v2& ps) {
auto report_fn = [this, &ps] (std::string_view action = "") {
report_invalid_partition_start(compaction_type::Scrub, _validator, ps.as_partition_start().key(), action);
};
maybe_abort_scrub(report_fn);
report_fn("Rectifying by adding assumed missing partition-end");
@@ -1335,9 +1343,9 @@ private:
}
}
skip on_invalid_partition(const dht::decorated_key& new_key, sstring error) {
auto report_fn = [this, error] (std::string_view action = "") {
report_validation_error(compaction_type::Scrub, *_schema, error, action);
skip on_invalid_partition(const dht::decorated_key& new_key) {
auto report_fn = [this, &new_key] (std::string_view action = "") {
report_invalid_partition(compaction_type::Scrub, _validator, new_key, action);
};
maybe_abort_scrub(report_fn);
if (_scrub_mode == compaction_type_options::scrub::mode::segregate) {
@@ -1351,9 +1359,9 @@ private:
return skip::yes;
}
skip on_invalid_mutation_fragment(const mutation_fragment_v2& mf, sstring error) {
auto report_fn = [this, error] (std::string_view action = "") {
report_validation_error(compaction_type::Scrub, *_schema, error, action);
skip on_invalid_mutation_fragment(const mutation_fragment_v2& mf) {
auto report_fn = [this, &mf] (std::string_view action = "") {
report_invalid_mutation_fragment(compaction_type::Scrub, _validator, mf, "");
};
maybe_abort_scrub(report_fn);
@@ -1388,9 +1396,9 @@ private:
return skip::yes;
}
void on_invalid_end_of_stream(sstring error) {
auto report_fn = [this, error] (std::string_view action = "") {
report_validation_error(compaction_type::Scrub, *_schema, error, action);
void on_invalid_end_of_stream() {
auto report_fn = [this] (std::string_view action = "") {
report_invalid_end_of_stream(compaction_type::Scrub, _validator, action);
};
maybe_abort_scrub(report_fn);
// Handle missing partition_end
@@ -1409,27 +1417,21 @@ private:
// and shouldn't be verified. We know the last fragment the
// validator saw is a partition-start, passing it another one
// will confuse it.
if (!_skip_to_next_partition) {
if (auto res = _validator(mf); !res) {
on_unexpected_partition_start(mf, res.what());
}
if (!_skip_to_next_partition && !_validator(mf)) {
on_unexpected_partition_start(mf);
// Continue processing this partition start.
}
_skip_to_next_partition = false;
// Then check that the partition monotonicity stands.
const auto& dk = mf.as_partition_start().key();
if (auto res = _validator(dk); !res) {
if (on_invalid_partition(dk, res.what()) == skip::yes) {
continue;
}
if (!_validator(dk) && on_invalid_partition(dk) == skip::yes) {
continue;
}
} else if (_skip_to_next_partition) {
continue;
} else {
if (auto res = _validator(mf); !res) {
if (on_invalid_mutation_fragment(mf, res.what()) == skip::yes) {
continue;
}
if (!_validator(mf) && on_invalid_mutation_fragment(mf) == skip::yes) {
continue;
}
}
push_mutation_fragment(std::move(mf));
@@ -1438,8 +1440,8 @@ private:
_end_of_stream = _reader.is_end_of_stream() && _reader.is_buffer_empty();
if (_end_of_stream) {
if (auto res = _validator.on_end_of_stream(); !res) {
on_invalid_end_of_stream(res.what());
if (!_validator.on_end_of_stream()) {
on_invalid_end_of_stream();
}
}
}
@@ -1519,17 +1521,8 @@ public:
return _scrub_finish_description;
}
flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
tracing::trace_state_ptr trace,
streamed_mutation::forwarding sm_fwd,
mutation_reader::forwarding mr_fwd) const override {
if (!range.is_full()) {
on_internal_error(clogger, fmt::format("Scrub compaction in mode {} expected full partition range, but got {} instead", _options.operation_mode, range));
}
auto crawling_reader = _compacting->make_crawling_reader(std::move(s), std::move(permit), nullptr);
// Scrub reads its input through a "crawling" reader that visits every
// fragment of the input sstables sequentially, then wraps it in the scrub
// `reader` adaptor which validates the fragment stream and, depending on the
// operation mode, rectifies or skips invalid data while counting problems
// in _validation_errors.
flat_mutation_reader_v2 make_sstable_reader() const override {
    auto crawling_reader = _compacting->make_crawling_reader(_schema, _permit, _io_priority, nullptr);
    return make_flat_mutation_reader_v2<reader>(std::move(crawling_reader), _options.operation_mode, _validation_errors);
}
@@ -1548,7 +1541,7 @@ public:
return end_consumer;
}
return [this, end_consumer = std::move(end_consumer)] (flat_mutation_reader_v2 reader) mutable -> future<> {
auto cfg = mutation_writer::segregate_config{memory::stats().total_memory() / 10};
auto cfg = mutation_writer::segregate_config{_io_priority, memory::stats().total_memory() / 10};
return mutation_writer::segregate_by_partition(std::move(reader), cfg,
[consumer = std::move(end_consumer), this] (flat_mutation_reader_v2 rd) {
++_bucket_count;
@@ -1595,7 +1588,7 @@ private:
uint64_t partitions_per_sstable(shard_id s) const {
uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
}
public:
resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
@@ -1620,20 +1613,15 @@ public:
~resharding_compaction() { }
// Use reader that makes sure no non-local mutation will not be filtered out.
flat_mutation_reader_v2 make_sstable_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
tracing::trace_state_ptr trace,
streamed_mutation::forwarding sm_fwd,
mutation_reader::forwarding mr_fwd) const override {
return _compacting->make_range_sstable_reader(std::move(s),
std::move(permit),
range,
slice,
// Resharding reads its input with a range reader (make_range_sstable_reader)
// rather than a local-shard reader, so mutations owned by other shards are
// not filtered out — the whole point of resharding is to route each
// mutation to its correct shard on output.
flat_mutation_reader_v2 make_sstable_reader() const override {
    return _compacting->make_range_sstable_reader(_schema,
            _permit,
            query::full_partition_range,    // all partitions...
            _schema->full_slice(),          // ...and all columns
            _io_priority,
            nullptr,                        // no trace state
            ::streamed_mutation::forwarding::no,
            ::mutation_reader::forwarding::no);
}
@@ -1656,14 +1644,14 @@ public:
}
compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto shard = _sharder->shard_of(dk.token());
auto shard = dht::shard_of(*_schema, dk.token());
auto sst = _sstable_creator(shard);
setup_new_sstable(sst);
auto cfg = make_sstable_writer_config(compaction_type::Reshard);
// sstables generated for a given shard will share the same run identifier.
cfg.run_identifier = _run_identifiers.at(shard);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), shard), sst};
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), _io_priority, shard), sst};
}
void stop_sstable_writer(compaction_writer* writer) override {
@@ -1734,31 +1722,82 @@ static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstable
return descriptor.options.visit(visitor_factory);
}
static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s) {
auto schema = table_s.schema();
auto permit = table_s.make_compaction_reader_permit();
future<uint64_t> scrub_validate_mode_validate_reader(flat_mutation_reader_v2 reader, const compaction_data& cdata) {
auto schema = reader.schema();
uint64_t validation_errors = 0;
uint64_t errors = 0;
std::exception_ptr ex;
for (const auto& sst : descriptor.sstables) {
clogger.info("Scrubbing in validate mode {}", sst->get_filename());
try {
auto validator = mutation_fragment_stream_validator(*schema);
validation_errors += co_await sst->validate(permit, cdata.abort, [&schema] (sstring what) {
scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
});
// Did validation actually finish because aborted?
if (cdata.is_stop_requested()) {
// Compaction manager will catch this exception and re-schedule the compaction.
throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
while (auto mf_opt = co_await reader()) {
if (cdata.is_stop_requested()) [[unlikely]] {
// Compaction manager will catch this exception and re-schedule the compaction.
throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
}
const auto& mf = *mf_opt;
if (mf.is_partition_start()) {
const auto& ps = mf.as_partition_start();
if (!validator(mf)) {
scrub_compaction::report_invalid_partition_start(compaction_type::Scrub, validator, ps.key());
validator.reset(mf);
++errors;
}
if (!validator(ps.key())) {
scrub_compaction::report_invalid_partition(compaction_type::Scrub, validator, ps.key());
validator.reset(ps.key());
++errors;
}
} else {
if (!validator(mf)) {
scrub_compaction::report_invalid_mutation_fragment(compaction_type::Scrub, validator, mf);
validator.reset(mf);
++errors;
}
}
}
clogger.info("Finished scrubbing in validate mode {} - sstable is {}", sst->get_filename(), validation_errors == 0 ? "valid" : "invalid");
if (!validator.on_end_of_stream()) {
scrub_compaction::report_invalid_end_of_stream(compaction_type::Scrub, validator);
++errors;
}
} catch (...) {
ex = std::current_exception();
}
using scrub = sstables::compaction_type_options::scrub;
if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
for (auto& sst : descriptor.sstables) {
co_await sst->change_state(sstables::sstable_state::quarantine);
co_await reader.close();
if (ex) {
co_return coroutine::exception(std::move(ex));
}
co_return errors;
}
static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s) {
auto schema = table_s.schema();
formatted_sstables_list sstables_list_msg;
auto sstables = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(schema, false));
for (const auto& sst : descriptor.sstables) {
sstables_list_msg += sst;
sstables->insert(sst);
}
clogger.info("Scrubbing in validate mode {}", sstables_list_msg);
auto permit = table_s.make_compaction_reader_permit();
auto reader = sstables->make_crawling_reader(schema, permit, descriptor.io_priority, nullptr);
const auto validation_errors = co_await scrub_validate_mode_validate_reader(std::move(reader), cdata);
clogger.info("Finished scrubbing in validate mode {} - sstable(s) are {}", sstables_list_msg, validation_errors == 0 ? "valid" : "invalid");
if (validation_errors != 0) {
for (auto& sst : *sstables->all()) {
co_await sst->change_state(sstables::quarantine_dir);
}
}
@@ -1800,7 +1839,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
int64_t min_timestamp = std::numeric_limits<int64_t>::max();
for (auto& sstable : overlapping) {
auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
if (sstable->get_max_local_deletion_time() >= gc_before) {
min_timestamp = std::min(min_timestamp, sstable->get_stats_metadata().min_timestamp);
}
@@ -1819,7 +1858,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
// SStables that do not contain live data is added to list of possibly expired sstables.
for (auto& candidate : compacting) {
auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
clogger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
// A fully expired sstable which has an ancestor undeleted shouldn't be compacted because

View File

@@ -13,8 +13,8 @@
#include "compaction/compaction_descriptor.hh"
#include "gc_clock.hh"
#include "compaction_weight_registration.hh"
#include "service/priority_manager.hh"
#include "utils/UUID.hh"
#include "utils/pretty_printers.hh"
#include "table_state.hh"
#include <seastar/core/thread.hh>
#include <seastar/core/abort_source.hh>
@@ -25,6 +25,21 @@ namespace sstables {
bool is_eligible_for_compaction(const sstables::shared_sstable& sst) noexcept;
// Formatting wrapper for a byte count: stream a pretty_printed_data_size to
// get a formatted size instead of a raw integer. The actual formatting lives
// in the friend operator<< (defined elsewhere, not visible here).
class pretty_printed_data_size {
    uint64_t _size;
public:
    pretty_printed_data_size(uint64_t size) : _size(size) {}
    friend std::ostream& operator<<(std::ostream&, pretty_printed_data_size);
};
// Formatting wrapper for a throughput figure: a byte count together with the
// duration over which it was processed. Streaming it prints a formatted rate
// via the friend operator<< (defined elsewhere, not visible here).
class pretty_printed_throughput {
    uint64_t _size;
    std::chrono::duration<float> _duration;
public:
    pretty_printed_throughput(uint64_t size, std::chrono::duration<float> dur) : _size(size), _duration(std::move(dur)) {}
    friend std::ostream& operator<<(std::ostream&, pretty_printed_throughput);
};
// Return the name of the compaction type
// as used over the REST api, e.g. "COMPACTION" or "CLEANUP".
sstring compaction_name(compaction_type type);
@@ -77,15 +92,12 @@ struct compaction_stats {
uint64_t start_size = 0;
uint64_t end_size = 0;
uint64_t validation_errors = 0;
// Bloom filter checks during max purgeable calculation
uint64_t bloom_filter_checks = 0;
compaction_stats& operator+=(const compaction_stats& r) {
ended_at = std::max(ended_at, r.ended_at);
start_size += r.start_size;
end_size += r.end_size;
validation_errors += r.validation_errors;
bloom_filter_checks += r.bloom_filter_checks;
return *this;
}
friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
@@ -118,4 +130,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
// For tests, can drop after we virtualize sstables.
flat_mutation_reader_v2 make_scrubbing_reader(flat_mutation_reader_v2 rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
// For tests, can drop after we virtualize sstables.
future<uint64_t> scrub_validate_mode_validate_reader(flat_mutation_reader_v2 rd, const compaction_data& info);
}

View File

@@ -12,6 +12,7 @@
#include <memory>
#include <seastar/core/shared_ptr.hh>
#include "sstables/shared_sstable.hh"
#include "sstables/progress_monitor.hh"
#include "timestamp.hh"
class compaction_backlog_manager;
@@ -59,20 +60,18 @@ public:
using ongoing_compactions = std::unordered_map<sstables::shared_sstable, backlog_read_progress_manager*>;
struct impl {
// FIXME: Should provide strong exception safety guarantees
virtual void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts) = 0;
virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) = 0;
virtual double backlog(const ongoing_writes& ow, const ongoing_compactions& oc) const = 0;
virtual ~impl() { }
};
compaction_backlog_tracker(std::unique_ptr<impl> impl) : _impl(std::move(impl)) {}
compaction_backlog_tracker(compaction_backlog_tracker&&);
compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) = delete;
compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) noexcept;
compaction_backlog_tracker(const compaction_backlog_tracker&) = delete;
~compaction_backlog_tracker();
double backlog() const;
// FIXME: Should provide strong exception safety guarantees
void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts);
void register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp);
void register_compacting_sstable(sstables::shared_sstable sst, backlog_read_progress_manager& rp);

View File

@@ -18,6 +18,7 @@
#include "sstables/sstable_set.hh"
#include "utils/UUID.hh"
#include "dht/i_partitioner.hh"
#include "compaction_weight_registration.hh"
#include "compaction_fwd.hh"
namespace sstables {
@@ -72,12 +73,6 @@ public:
only, // scrub only quarantined sstables
};
quarantine_mode quarantine_operation_mode = quarantine_mode::include;
using quarantine_invalid_sstables = bool_class<class quarantine_invalid_sstables_tag>;
// Should invalid sstables be moved into quarantine.
// Only applies to validate-mode.
quarantine_invalid_sstables quarantine_sstables = quarantine_invalid_sstables::yes;
};
struct reshard {
};
@@ -114,8 +109,8 @@ public:
return compaction_type_options(upgrade{});
}
static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes) {
return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables});
static compaction_type_options make_scrub(scrub::mode mode) {
return compaction_type_options(scrub{mode});
}
template <typename... Visitor>
@@ -123,11 +118,6 @@ public:
return std::visit(std::forward<Visitor>(visitor)..., _options);
}
template <typename OptionType>
const auto& as() const {
return std::get<OptionType>(_options);
}
const options_variant& options() const { return _options; }
compaction_type type() const;
@@ -161,12 +151,12 @@ struct compaction_descriptor {
compaction_type_options options = compaction_type_options::make_regular();
// If engaged, compaction will cleanup the input sstables by skipping non-owned ranges.
compaction::owned_ranges_ptr owned_ranges;
// Required for reshard compaction.
const dht::sharder* sharder;
compaction_sstable_creator_fn creator;
compaction_sstable_replacer_fn replacer;
::io_priority_class io_priority = default_priority_class();
// Denotes if this compaction task is comprised solely of completely expired SSTables
sstables::has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;
@@ -176,6 +166,7 @@ struct compaction_descriptor {
static constexpr uint64_t default_max_sstable_bytes = std::numeric_limits<uint64_t>::max();
explicit compaction_descriptor(std::vector<sstables::shared_sstable> sstables,
::io_priority_class io_priority,
int level = default_level,
uint64_t max_sstable_bytes = default_max_sstable_bytes,
run_id run_identifier = run_id::create_random_id(),
@@ -187,15 +178,18 @@ struct compaction_descriptor {
, run_identifier(run_identifier)
, options(options)
, owned_ranges(std::move(owned_ranges_))
, io_priority(io_priority)
{}
explicit compaction_descriptor(sstables::has_only_fully_expired has_only_fully_expired,
std::vector<sstables::shared_sstable> sstables)
std::vector<sstables::shared_sstable> sstables,
::io_priority_class io_priority)
: sstables(std::move(sstables))
, level(default_level)
, max_sstable_bytes(default_max_sstable_bytes)
, run_identifier(run_id::create_random_id())
, options(compaction_type_options::make_regular())
, io_priority(io_priority)
, has_only_fully_expired(has_only_fully_expired)
{}

File diff suppressed because it is too large Load Diff

View File

@@ -31,8 +31,8 @@
#include <functional>
#include <algorithm>
#include "compaction.hh"
#include "compaction_weight_registration.hh"
#include "compaction_backlog_manager.hh"
#include "compaction/compaction_descriptor.hh"
#include "compaction/task_manager_module.hh"
#include "compaction_state.hh"
#include "strategy_control.hh"
@@ -46,14 +46,14 @@ class system_keyspace;
class compaction_history_entry;
}
class compacting_sstable_registration;
// Interval map from token ranges to gc_clock time points; `inplace_max`
// keeps the maximum (latest) time per interval when ranges overlap.
// NOTE(review): the name suggests this records when each token range was
// last repaired — confirm against the code that populates it.
class repair_history_map {
public:
    boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
};
namespace compaction {
using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
class compaction_task_executor;
class sstables_task_executor;
class major_compaction_task_executor;
@@ -64,6 +64,8 @@ class rewrite_sstables_compaction_task_executor;
class cleanup_sstables_compaction_task_executor;
class validate_sstables_compaction_task_executor;
}
class compaction_manager_test_task_executor;
// Compaction manager provides facilities to submit and track compaction jobs on
// behalf of existing tables.
class compaction_manager {
@@ -161,21 +163,7 @@ private:
per_table_history_maps _repair_history_maps;
tombstone_gc_state _tombstone_gc_state;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
// Return nullopt if compaction cannot be started
std::optional<gate::holder> start_compaction(table_state& t);
// parent_info set to std::nullopt means that task manager should not register this task executor.
// To create a task manager task with no parent, parent_info argument should contain empty task_info.
template<typename TaskExecutor, typename... Args>
requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
{TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
}
future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor>);
future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason);
future<> update_throughput(uint32_t value_mbs);
@@ -194,20 +182,17 @@ private:
// Get candidates for compaction strategy, which are all sstables but the ones being compacted.
std::vector<sstables::shared_sstable> get_candidates(compaction::table_state& t) const;
bool eligible_for_compaction(const sstables::shared_sstable& sstable) const;
bool eligible_for_compaction(const sstables::frozen_sstable_run& sstable_run) const;
template <std::ranges::range Range>
requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable> || std::convertible_to<std::ranges::range_value_t<Range>, sstables::frozen_sstable_run>
std::vector<std::ranges::range_value_t<Range>> get_candidates(table_state& t, const Range& sstables) const;
requires std::convertible_to<std::ranges::range_value_t<Range>, sstables::shared_sstable>
std::vector<sstables::shared_sstable> get_candidates(table_state& t, const Range& sstables) const;
template <std::ranges::range Range>
requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
void register_compacting_sstables(const Range& range);
template <typename Iterator, typename Sentinel>
requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
void register_compacting_sstables(Iterator first, Sentinel last);
template <std::ranges::range Range>
requires std::same_as<std::ranges::range_value_t<Range>, sstables::shared_sstable>
void deregister_compacting_sstables(const Range& range);
template <typename Iterator, typename Sentinel>
requires std::same_as<Sentinel, Iterator> || std::sentinel_for<Sentinel, Iterator>
void deregister_compacting_sstables(Iterator first, Sentinel last);
// gets the table's compaction state
// throws std::out_of_range exception if not found.
@@ -226,7 +211,7 @@ private:
// similar-sized compaction.
void postpone_compaction_for_table(compaction::table_state* t);
future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, std::optional<tasks::task_info> info);
future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t);
future<> update_static_shares(float shares);
using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -234,11 +219,10 @@ private:
// Guarantees that a maintenance task, e.g. cleanup, will be performed on all files available at the time
// by retrieving set of candidates only after all compactions for table T were stopped, if any.
template<typename TaskType, typename... Args>
requires std::derived_from<TaskType, compaction_task_executor> &&
std::derived_from<TaskType, compaction_task_impl>
future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(std::optional<tasks::task_info> info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
requires std::derived_from<TaskType, compaction::compaction_task_executor>
future<compaction_stats_opt> perform_task_on_all_files(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, Args... args);
future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, std::optional<tasks::task_info> info, can_purge_tombstones can_purge = can_purge_tombstones::yes);
future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, can_purge_tombstones can_purge = can_purge_tombstones::yes);
// Stop all fibers, without waiting. Safe to be called multiple times.
void do_stop() noexcept;
@@ -310,7 +294,7 @@ public:
// Submit a table to be off-strategy compacted.
// Returns true iff off-strategy compaction was required and performed.
future<bool> perform_offstrategy(compaction::table_state& t, std::optional<tasks::task_info> info);
future<bool> perform_offstrategy(compaction::table_state& t);
// Submit a table to be cleaned up and wait for its termination.
//
@@ -319,23 +303,16 @@ public:
// Cleanup is about discarding keys that are no longer relevant for a
// given sstable, e.g. after node loses part of its token range because
// of a newly added node.
future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
private:
future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);
// Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
public:
// Submit a table to be upgraded and wait for its termination.
future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, std::optional<tasks::task_info> info = std::nullopt);
future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version);
// Submit a table to be scrubbed and wait for its termination.
future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info = std::nullopt);
future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts);
// Submit a table for major compaction.
future<> perform_major_compaction(compaction::table_state& t, std::optional<tasks::task_info> info = std::nullopt);
future<> perform_major_compaction(compaction::table_state& t);
// Run a custom job for a given table, defined by a function
@@ -345,7 +322,7 @@ public:
// parameter type is the compaction type the operation can most closely be
// associated with, use compaction_type::Compaction, if none apply.
// parameter job is a function that will carry the operation
future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, std::optional<tasks::task_info> info, throw_if_stopping do_throw_if_stopping);
future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);
class compaction_reenabler {
compaction_manager& _cm;
@@ -427,13 +404,11 @@ public:
return _tombstone_gc_state;
};
// Uncoditionally erase sst from `sstables_requiring_cleanup`
// Returns true iff sst was found and erased.
bool erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst);
// Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, owned_ranges_ptr owned_ranges_ptr);
// checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
const std::unordered_set<sstables::shared_sstable>& sstables_requiring_cleanup(table_state& t) const;
friend class compacting_sstable_registration;
friend class compaction_weight_registration;
@@ -448,11 +423,12 @@ public:
friend class compaction::rewrite_sstables_compaction_task_executor;
friend class compaction::cleanup_sstables_compaction_task_executor;
friend class compaction::validate_sstables_compaction_task_executor;
friend class compaction_manager_test_task_executor;
};
namespace compaction {
class compaction_task_executor : public enable_shared_from_this<compaction_task_executor> {
class compaction_task_executor {
public:
enum class state {
none, // initial and final state
@@ -460,54 +436,42 @@ public:
// counted in compaction_manager::stats::pending_tasks
active, // task initiated active compaction, may alternate with pending
// counted in compaction_manager::stats::active_tasks
done, // task completed successfully (may transition only to state::none, or
// state::pending for regular compaction)
done, // task completed successfully (may transition only to state::none)
// counted in compaction_manager::stats::completed_tasks
postponed, // task was postponed (may transition only to state::none)
// represented by the postponed_compactions metric
failed, // task failed (may transition only to state::none)
// counted in compaction_manager::stats::errors
};
static std::string_view to_string(state);
protected:
compaction_manager& _cm;
::compaction::table_state* _compacting_table = nullptr;
compaction::compaction_state& _compaction_state;
sstables::compaction_data _compaction_data;
state _state = state::none;
throw_if_stopping _do_throw_if_stopping;
private:
shared_future<compaction_manager::compaction_stats_opt> _compaction_done = make_ready_future<compaction_manager::compaction_stats_opt>();
exponential_backoff_retry _compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
sstables::compaction_type _type;
sstables::run_id _output_run_identifier;
gate::holder _gate_holder;
sstring _description;
compaction_manager::compaction_stats_opt _stats = std::nullopt;
public:
explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
explicit compaction_task_executor(compaction_manager& mgr, ::compaction::table_state* t, sstables::compaction_type type, sstring desc);
compaction_task_executor(compaction_task_executor&&) = delete;
compaction_task_executor(const compaction_task_executor&) = delete;
virtual ~compaction_task_executor() = default;
// called when a compaction replaces the exhausted sstables with the new set
struct on_replacement {
virtual ~on_replacement() {}
// called after the replacement completes
// @param sstables the old sstable which are replaced in this replacement
virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
// called before the replacement happens
// @param sstables the new sstables to be added to the table's sstable set
virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
};
virtual ~compaction_task_executor();
protected:
future<> perform();
virtual future<compaction_manager::compaction_stats_opt> do_run() = 0;
using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
state switch_state(state new_state);
future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -524,27 +488,24 @@ protected:
// otherwise, returns stop_iteration::no after sleep for exponential retry.
future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);
future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
// Compacts set of SSTables according to the descriptor.
using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> update_history(::compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
bool should_update_history(sstables::compaction_type ct) {
return ct == sstables::compaction_type::Compaction;
}
public:
compaction_manager::compaction_stats_opt get_stats() const noexcept {
return _stats;
}
future<compaction_manager::compaction_stats_opt> run_compaction() noexcept;
future<compaction_manager::compaction_stats_opt> run() noexcept;
const ::compaction::table_state* compacting_table() const noexcept {
return _compacting_table;
}
sstables::compaction_type compaction_type() const noexcept {
sstables::compaction_type type() const noexcept {
return _type;
}
@@ -570,46 +531,27 @@ public:
const sstring& description() const noexcept {
return _description;
}
private:
// Before _compaction_done is set in compaction_task_executor::run_compaction(), compaction_done() returns ready future.
future<compaction_manager::compaction_stats_opt> compaction_done() noexcept {
return _compaction_done.get_future();
}
public:
bool stopping() const noexcept {
return _compaction_data.abort.abort_requested();
}
void stop_compaction(sstring reason) noexcept;
void stop(sstring reason) noexcept;
sstables::compaction_stopped_exception make_compaction_stopped_exception() const;
template<typename TaskExecutor, typename... Args>
requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
{TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
}
friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
friend fmt::formatter<compaction_task_executor>;
friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason);
std::string describe() const;
};
std::ostream& operator<<(std::ostream& os, compaction::compaction_task_executor::state s);
std::ostream& operator<<(std::ostream& os, const compaction::compaction_task_executor& task);
}
template <>
struct fmt::formatter<compaction::compaction_task_executor::state> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(compaction::compaction_task_executor::state c, fmt::format_context& ctx) const -> decltype(ctx.out());
};
template <>
struct fmt::formatter<compaction::compaction_task_executor> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(const compaction::compaction_task_executor& ex, fmt::format_context& ctx) const -> decltype(ctx.out());
};
bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);
// Return all sstables but those that are off-strategy like the ones in maintenance set and staging dir.

View File

@@ -32,10 +32,10 @@ struct compaction_state {
// Signaled whenever a compaction task completes.
condition_variable compaction_done;
std::optional<compaction_backlog_tracker> backlog_tracker;
compaction_backlog_tracker backlog_tracker;
std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
compaction::owned_ranges_ptr owned_ranges_ptr;
owned_ranges_ptr owned_ranges_ptr;
explicit compaction_state(table_state& t);
compaction_state(compaction_state&&) = delete;

Some files were not shown because too many files have changed in this diff Show More