Increased the default value of the components_memory_reclaim_threshold
config option to 0.2, as the previous value was too strict and caused
unnecessary eviction in otherwise healthy clusters.
Fixes #18607
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 3d7d1fa72a)
Closes #19011
PR https://github.com/scylladb/scylladb/pull/17771 introduced a threshold for the total memory used by all bloom filters across SSTables. When the total usage surpasses the threshold, the largest bloom filter will be removed from memory, bringing the total usage back under the threshold. This PR adds support for reloading such reclaimed bloom filters back into memory when memory becomes available (i.e., within the 10% of available memory earmarked for the reclaimable components).
The SSTables manager now maintains a list of all SSTables whose bloom filter was removed from memory and attempts to reload them when an SSTable, whose bloom filter is still in memory, gets deleted. The manager reloads from the smallest to the largest bloom filter to maximize the number of filters being reloaded into memory.
Backported from https://github.com/scylladb/scylladb/pull/18186 to 5.2.
Closes #18666
* github.com:scylladb/scylladb:
sstable_datafile_test: add testcase to test reclaim during reload
sstable_datafile_test: add test to verify auto reload of reclaimed components
sstables_manager: reload previously reclaimed components when memory is available
sstables_manager: start a fiber to reload components
sstable_directory_test: fix generation in sstable_directory_test_table_scan_incomplete_sstables
sstable_datafile_test: add test to verify reclaimed components reload
sstables: support reloading reclaimed components
sstables_manager: add new intrusive set to track the reclaimed sstables
sstable: add link and comparator class to support new intrusive set
sstable: renamed intrusive list link type
sstable: track memory reclaimed from components per sstable
sstable: rename local variable in sstable::total_reclaimable_memory_size
Even when configured to not do any validation at all, the validator still did some. This small series fixes this, and adds a test to check that validation levels in general are respected, and the validator doesn't validate more than it is asked to.
Fixes: #18662
(cherry picked from commit f6511ca1b0)
(cherry picked from commit e7b07692b6)
(cherry picked from commit 78afb3644c)
Refs #18667
Closes #18723
* github.com:scylladb/scylladb:
test/boost/mutation_fragment_test.cc: add test for validator validation levels
mutation: mutation_fragment_stream_validating_filter: fix validation_level::none
mutation: mutation_fragment_stream_validating_filter: add raises_error ctor parameter
When we convert a timestamp into a string, it must look like
'2017-12-27T11:57:42.500Z'. This applies to any conversion except the
JSON timestamp format: a JSON string uses a space as the time separator
and must look like '2017-12-27 11:57:42.500Z'.
Both formats always contain milliseconds and the timezone specification.
Fixes #14518
Fixes #7997
Closes #14726
Fixes #16575
(cherry picked from commit ff721ec3e3)
Closes #18852
Despite its name, this validation level still did some validation. Fix
this by short-circuiting the catch-all operator(), preventing any
validation when the user asked for none.
(cherry picked from commit e7b07692b6)
When set to false, no exceptions will be raised from the validator on
validation error. Instead, it will just return false from the respective
validator methods. This makes testing simpler, asserting exceptions is
clunky.
When true (default), the previous behaviour will remain: any validation
error will invoke on_internal_error(), resulting in either std::abort()
or an exception.
Backporting notes:
* Added a const mutation_fragment_stream_validating_filter&
param to on_validation_error()
* Made full_name() public
(cherry picked from commit f6511ca1b0)
Currently, if the fill ctor throws an exception,
the destructor won't be called, as the object is not fully constructed yet.
Call the default ctor first (which doesn't throw)
to make sure the destructor will be called on exception.
Fixes scylladb/scylladb#18635
- [x] Although the fix is for a rare bug, it has very low risk, so it's worth backporting to all live versions
(cherry picked from commit 64c51cf32c)
(cherry picked from commit 88b3173d03)
(cherry picked from commit 4bbb66f805)
Refs #18636
Closes #18680
* github.com:scylladb/scylladb:
chunked_vector_test: add more exception safety tests
chunked_vector_test: exception_safe_class: count also moved objects
utils: chunked_vector: fill ctor: make exception safe
We have to account for moved objects as well
as copied objects so they will be balanced with
the respective `del_live_object` calls called
by the destructor.
However, since chunked_vector requires the
value_type to be nothrow_move_constructible,
just count the additional live object, but
do not modify _countdown or, respectively, throw
an exception, as this should be considered only
for the default and copy constructors.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently, if the fill ctor throws an exception,
the destructor won't be called, as the object is not
fully constructed yet.
Call the default ctor first (which doesn't throw)
to make sure the destructor will be called on exception.
Fixes scylladb/scylladb#18635
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When an SSTable is dropped, the associated bloom filter gets discarded
from memory, bringing down the total memory consumption of bloom
filters. Any bloom filter that was previously reclaimed from memory due
to the total usage crossing the threshold, can now be reloaded back into
memory if the total usage can still stay below the threshold. Added
support to reload such reclaimed filters back into memory when memory
becomes available.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 0b061194a7)
Start a fiber that gets notified whenever an sstable gets deleted. The
fiber doesn't do anything yet but the following patch will add support
to reload reclaimed components if there is sufficient memory.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit f758d7b114)
The testcase uses an sstable whose mutation key and generation are
owned by different shards. Due to this, when process_sstable_dir is
called, the sstable gets loaded into a different shard than the one that
was intended. This also means that the sstable and the sstables manager
end up on different shards.
The following patch will introduce a condition variable in sstables
manager which will be signalled from the sstables. If the sstable and
the sstable manager are in different shards, the signalling will cause
the testcase to fail in debug mode with this error: "Promise task was
set on shard x but made ready on shard y". So, fix it by supplying an
appropriate generation number owned by the same shard that owns the
mutation key.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 24064064e9)
Added support to reload components from which memory was previously
reclaimed as the total memory of reclaimable components crossed a
threshold. The implementation is kept simple as only the bloom filters
are considered reclaimable for now.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 54bb03cff8)
When a compaction strategy uses garbage collected sstables to track
expired tombstones, do not use complete partition estimates for them,
instead, use a fraction of it based on the droppable tombstone ratio
estimate.
Fixes #18283
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
Closes scylladb/scylladb#18465
(cherry picked from commit d39adf6438)
Closes #18659
The new set holds the sstables from where the memory has been reclaimed
and is sorted in ascending order of the total memory reclaimed.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 2340ab63c6)
Renamed the intrusive list link type to differentiate it from the set
link type that will be added in an upcoming patch.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 3ef2f79d14)
Added a member variable _total_memory_reclaimed to the sstable class
that tracks the total memory reclaimed from an sstable.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 02d272fdb3)
Renamed local variable in sstable::total_reclaimable_memory_size in
preparation for the next patch which adds a new member variable
_total_memory_reclaimed to the sstable class.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit a53af1f878)
The direct failure detector design is simplistic. It sends pings
sequentially and times out listeners that reached the threshold (i.e.
didn't hear from a given endpoint for too long) in-between pings.
Given the sequential nature, the previous ping must finish so the next
ping can start. We timeout pings that take too long. The timeout was
hardcoded and set to 300ms. This is too low for wide-area setups --
latencies across the Earth can indeed go up to 300ms. 3 subsequent timed
out pings to a given node were sufficient for the Raft listener to "mark
server as down" (the listener used a threshold of 1s).
Increase the ping timeout to 600ms which should be enough even for
pinging the opposite side of Earth, and make it tunable.
Increase the Raft listener threshold from 1s to 2s. Without the
increased threshold, one timed out ping would be enough to mark the
server as down. Increasing it to 2s requires 3 timed out pings which
makes it more robust in the presence of transient network hiccups.
In the future we'll most likely want to decrease the Raft listener
threshold again, if we use Raft for data path -- so leader elections
start quickly after leader failures. (Faster than 2s). To do that we'll
have to improve the design of the direct failure detector.
Ref: scylladb/scylladb#16410
Fixes: scylladb/scylladb#16607
---
I tested the change manually using `tc qdisc ... netem delay`, setting
network delay on local setup to ~300ms with jitter. Without the change,
the result is as observed in scylladb/scylladb#16410: interleaving
```
raft_group_registry - marking Raft server ... as dead for Raft groups
raft_group_registry - marking Raft server ... as alive for Raft groups
```
happening once every few seconds. The "marking as dead" happens whenever
we get 3 subsequent failed pings, which happens with a certain (high)
probability depending on the latency jitter. Then, as soon as we get a
successful ping, we mark the server back as alive.
With the change, the phenomenon no longer appears.
(cherry picked from commit 8df6d10e88)
Closes #18558
The event is used in a loop.
Found by clang-tidy:
```
streaming/stream_result_future.cc:80:49: warning: 'event' used after it was moved [bugprone-use-after-move]
listener->handle_stream_event(std::move(event));
^
streaming/stream_result_future.cc:80:39: note: move occurred here
listener->handle_stream_event(std::move(event));
^
streaming/stream_result_future.cc:80:49: note: the use happens in a later loop iteration than the move
listener->handle_stream_event(std::move(event));
^
```
Fixes #18332
(cherry picked from commit 4fd4e6acf3)
Closes #18430
When reclaiming memory from bloom filters, do not remove them from
_recognised_components, as that leads to the on-disk filter component
being left back on disk when the SSTable is deleted.
Fixes #18398
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
Closes scylladb/scylladb#18400
(cherry picked from commit 6af2659b57)
Closes #18437
In handler.cc, `make_non_overlapping_ranges()` references a moved
instance of `ColumnSlice` when formatting the error message for an
exception thrown when something unexpected happens. The move constructor
of `ColumnSlice` is default-generated, so the members' move constructors
are used to construct the new instance. This could lead to undefined
behavior when dereferencing the moved-from instance.
In this change, to avoid the use-after-free, we keep a copy of the
referenced member variables and reference the copies when formatting
the error message in the exception.
This use-after-move issue was introduced in 822a315dfa, which implemented
the `get_multi_slice` verb and this piece of code in the first place.
Since both 5.2 and 5.4 include that commit, we should backport this
change to them.
Refs 822a315dfa
Fixes #18356
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 1ad3744edc)
Closes #18373
Currently, we use the sum of the estimated_partitions from each
participant node as the estimated_partitions for the sstable produced by
repair. This way, the estimated_partitions is the biggest possible
number of partitions repair could write.
Since repair will write only the difference between repair participant
nodes, using the biggest possible estimation will overestimate the
partitions written by repair most of the time.
The problem is that overestimated partitions make the bloom filter
consume more memory; this has been observed to cause OOM in the field.
This patch changes the estimation to use a fraction of the average
partitions per node instead of the sum. It is still not a perfect
estimation, but it already improves memory usage significantly.
Fixes #18140
Closes scylladb/scylladb#18141
(cherry picked from commit 642f9a1966)
Added support to track and limit the memory usage by sstable components. A reclaimable component of an SSTable is one from which memory can be reclaimed. SSTables and their managers now track such reclaimable memory and limit the component memory usage accordingly. A new configuration variable defines the memory reclaim threshold. If the total memory of the reclaimable components exceeds this limit, memory will be reclaimed to keep the usage under the limit. This PR considers only the bloom filters as reclaimable and adds support to track and limit them as required.
The feature can be manually verified by doing the following:
1. run a single-node single-shard 1GB cluster
2. create a table with bloom-filter-false-positive-chance of 0.001 (to intentionally cause large bloom filter)
3. populate with tiny partitions
4. watch the bloom filter metrics get capped at 100MB
The default value of the `components_memory_reclaim_threshold` config variable, which controls the reclamation process, is `0.1`. It can be reduced further during manual tests to hit the threshold easily and verify the feature.
Fixes https://github.com/scylladb/scylladb/issues/17747
Backported from #17771 to 5.2.
Closes #18247
* github.com:scylladb/scylladb:
test_bloom_filter.py: disable reclaiming memory from components
sstable_datafile_test: add tests to verify auto reclamation of components
test/lib: allow overriding available memory via test_env_config
sstables_manager: support reclaiming memory from components
sstables_manager: store available memory size
sstables_manager: add variable to track component memory usage
db/config: add a new variable to limit memory used by table components
sstable_datafile_test: add testcase to verify reclamation from sstables
sstables: support reclaiming memory from components
Disabled reclaiming memory from sstable components in the testcase as it
interferes with the false positive calculation.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit d86505e399)
Reclaim memory from the SSTable that has the most reclaimable memory if
the total reclaimable memory has crossed the threshold. Only the bloom
filter memory is considered reclaimable for now.
Fixes #17747
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit a36965c474)
The available memory size is required to calculate the reclaim memory
threshold, so store that within the sstables manager.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 2ca4b0a7a2)
sstables_manager::_total_reclaimable_memory variable tracks the total
memory that is reclaimable from all the SSTables managed by it.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit f05bb4ba36)
A new configuration variable, components_memory_reclaim_threshold, has
been added to configure the maximum allowed percentage of available
memory for all SSTable components in a shard. If the total memory usage
exceeds this threshold, it will be reclaimed from the components to
bring it back under the limit. Currently, only the memory used by the
bloom filters will be restricted.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit e8026197d2)
Added support to track total memory from components that are reclaimable
and to reclaim memory from them if and when required. Right now only the
bloom filters are considered as reclaimable components but this can be
extended to any component in the future.
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 4f0aee62d1)
The repair memory limit includes only the size of the frozen mutation
fragments in a repair row. The size of the other members of a repair
row may grow uncontrollably and cause out-of-memory.
Modify what is counted toward the repair memory limit.
Fixes: https://github.com/scylladb/scylladb/issues/16710.
(cherry picked from commit a4dc6553ab)
(cherry picked from commit 51c09a84cc)
Refs https://github.com/scylladb/scylladb/pull/17785
Closes #18237
* github.com:scylladb/scylladb:
test: add test for repair_row::size()
repair: fix memory accounting in repair_row
In repair, only the size of the frozen mutation fragments of a repair
row is counted toward the memory limit, so huge keys of repair rows may
lead to OOM.
Include the memory size of repair_row's other members in the repair
memory limit.
(cherry picked from commit a4dc6553ab)
Currently, Scylla logs a warning when it writes a cell, row or partition larger than certain configured sizes. These warnings contain the partition key and, in the case of rows and cells, also the clustering key, which allows the large row or partition to be identified. However, these keys can contain private, sensitive user information. The information identifying the partition/row/cell is also inserted into the tables system.large_partitions, system.large_rows and system.large_cells, respectively.
This change removes the partition and cluster keys from the log messages, but still inserts them into the system tables.
The logged data will look like this:
Large cells:
WARN 2024-04-02 16:49:48,602 [shard 3: mt] large_data - Writing large cell ks_name/tbl_name: cell_name (SIZE bytes) to sstable.db
Large rows:
WARN 2024-04-02 16:49:48,602 [shard 3: mt] large_data - Writing large row ks_name/tbl_name: (SIZE bytes) to sstable.db
Large partitions:
WARN 2024-04-02 16:49:48,602 [shard 3: mt] large_data - Writing large partition ks_name/tbl_name: (SIZE bytes) to sstable.db
Fixes #18041
Closes scylladb/scylladb#18166
(cherry picked from commit f1cc6252fd)
Before this change, `reclaim_timer::report()` calls
```c++
fmt::format(", at {}", current_backtrace())
```
which allocates a `std::string` on the heap, so it can fail and throw;
in that case, `std::terminate()` is called. But `reclaim_timer::report()`
is called precisely because we failed to reclaim memory for the caller,
so we are all the more likely to run into this issue. In any case, we
should not allocate memory on this path.
In this change, a dedicated printer is created so that we don't format
to a temporary `std::string` and instead write directly to the logger's
buffer. This avoids the memory allocation.
Fixes #18099
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes scylladb/scylladb#18100
(cherry picked from commit fcf7ca5675)
When a view update has both a local and remote target endpoint,
it extends the lifetime of its memory tracking semaphore units
only until the end of the local update, while the resources are
actually used until the remote update finishes.
This patch changes the semaphore transferring so that in case
of both local and remote endpoints, both view updates share the
units, causing them to be released only after the update that
takes longer finishes.
Fixes #17890
(cherry picked from commit 9789a3dc7c)
Closes #18104
Currently, when dividing memory tracked for a batch of updates
we do not take into account the overhead that we have for processing
every update. This patch adds the overhead for single updates
and joins the memory calculation path for batches and their parts
so that both use the same overhead.
Fixes #17854
(cherry picked from commit efcb718)
Closes #17999
Before this change, we always cast the wait duration to milliseconds,
even if it could use a higher resolution. `std::chrono::steady_clock`
actually uses nanoseconds for its duration, so if we inject a deadline
using `steady_clock`, we could be woken earlier due to the narrowing of
the duration type caused by the duration_cast.
In this change, we just use the duration as it is. This allows the
caller to use the resolution provided by Seastar without losing
precision. The tests are updated to print the time duration instead of
the count, to provide information with a higher resolution.
Fixes #15902
(cherry picked from commit 8a5689e7a7)
(cherry picked from commit 1d33a68dd7)
Closes #17911
* github.com:scylladb/scylladb:
tests: utils: error injection: print time duration instead of count
error_injection: do not cast to milliseconds when injecting timeout
In order to avoid running out of memory, we can't
underestimate the memory used when processing a view
update. Particularly, we need to handle the remote
view updates well, because we may create many of them
at the same time in contrast to local updates which
are processed synchronously.
After investigating a coredump generated in a crash
caused by running out of memory due to these remote
view updates, we found that the current estimation
is much lower than what we observed in practice; we
identified overhead of up to 2288 bytes for each
remote view update. The overhead consists of:
- 512 bytes - a write_response_handler
- less than 512 bytes - excessive memory allocation
for the mutation in bytes_ostream
- 448 bytes - the apply_to_remote_endpoints coroutine
started in mutate_MV()
- 192 bytes - a continuation to the coroutine above
- 320 bytes - the coroutine in result_parallel_for_each
started in mutate_begin()
- 112 bytes - a continuation to the coroutine above
- 192 bytes - 5 unspecified allocations of 32, 32, 32,
48 and 48 bytes
This patch changes the previous overhead estimate
of 256 bytes to 2288 bytes, which should take into
account all allocations in the current version of the
code. It's worth noting that changes in the related
pieces of code may result in a different overhead.
The allocations seem to be mostly captures for the
background tasks. Coroutines seem to allocate extra,
however testing shows that replacing a coroutine with
continuations may result in generating a few smaller
futures/continuations with a larger total size.
Besides that, considering that we're waiting for
a response for each remote view update, we need the
relatively large write_response_handler, which also
includes the mutation in case we needed to reuse it.
The change should not majorly affect workloads with many
local updates because we don't keep many of them at
the same time anyway, and an added benefit of correct
memory utilization estimation is avoiding evictions
of other memory that would be otherwise necessary
to handle the excessive memory used by view updates.
Fixes #17364
(cherry picked from commit 5ab3586135)
Closes #17858
Instead of casting/comparing the count of the duration unit, let's just
compare the durations, so that Boost.Test is able to print the duration
in a more informative and user-friendly way (line wrapped):
test/boost/error_injection_test.cc(167): fatal error:
in "test_inject_future_disabled":
critical check wait_time > sleep_msec has failed [23839ns <= 10ms]
Refs #15902
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 1d33a68dd7)
Before this change, we always cast the wait duration to milliseconds,
even if it could use a higher resolution. `std::chrono::steady_clock`
actually uses nanoseconds for its duration, so if we inject a deadline
using `steady_clock`, we could be woken earlier due to the narrowing of
the duration type caused by the duration_cast.
In this change, we just use the duration as it is. This allows the
caller to use the resolution provided by Seastar without losing
precision.
Fixes #15902
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 8a5689e7a7)
The semantics of major compaction are that all data of a table will be
compacted together, so the user can expect, e.g., a recently introduced
tombstone to be compacted with the data it shadows.
Today, it can happen that all data in maintenance set won't be included
for major, until they're promoted into main set by off-strategy.
So user might be left wondering why major is not having the expected
effect.
To fix this, let's perform off-strategy first, so data in maintenance
set will be made available by major. A similar approach is done for
data in memtable, so flush is performed before major starts.
The only exception will be data in staging, which cannot be compacted
until view building is done with it, to avoid inconsistency in view
replicas.
The serialization of reshape jobs in the compaction manager guarantees
correctness if there's an ongoing off-strategy compaction on behalf of
the table.
Fixes #11915.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes scylladb/scylladb#15792
(cherry picked from commit ea6c281b9f)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #17901
This commit updates the Upgrade ScyllaDB Image page.
- It removes the incorrect information that updating underlying OS packages is mandatory.
- It adds information about the extended procedure for non-official images.
(cherry picked from commit fc90112b97)
Closes #17885
The test in its original form relies on the
`error_injections_at_startup` feature, which 5.2 doesn't have, so I
adapted the test to enable error injections after bootstrapping nodes in
the backport (9c44bbce67). That is however
incorrect, it's important for the injection to be enabled while the
nodes are booting, otherwise the test will be flaky, as we observed.
Details in scylladb/scylladb#17749.
Remove the test from 5.2 branch.
Fixes scylladb/scylladb#17749
Closes #17750
This is a backport of 0c376043eb and follow-up fix 57b14580f0 to 5.2.
We haven't identified any specific issues in test or field in 5.2/2023.1 releases, but the bug should be fixed either way, it might bite us in unexpected ways.
Closes #17640
* github.com:scylladb/scylladb:
migration_manager: only jump to shard 0 in migration_request during group 0 snapshot transfer
raft_group0_client: assert that hold_read_apply_mutex is called on shard 0
migration_manager: fix indentation after the previous patch.
messaging_service: process migration_request rpc on shard 0
migration_manager: take group0 lock during raft snapshot taking
A materialized view in CQL allows AT MOST ONE view key column that
wasn't a key column in the base table. This is because if there were
two or more of those, the "liveness" (timestamp, ttl) of these different
columns can change at every update, and it's not possible to pick what
liveness to use for the view row we create.
We made an exception to this rule for Alternator: DynamoDB's API allows
creating a GSI whose partition key and range key are both regular columns
in the base table, and we must support this. We claim that the fact that
Alternator allows neither TTL (Alternator's "TTL" is a different feature)
nor user-defined timestamps does allow picking the liveness for the view
row we create. But we did it wrong!
We claimed in a comment - and implemented in the code before this patch -
that in Alternator we can assume that both GSI key columns will have the
*same* liveness, and in particular timestamp. But this is only true if
one modifies both columns together! In fact, in general it is not true:
We can have two non-key attributes 'a' and 'b' which are the GSI's key
columns, and we can modify *only* b, without modifying a, in which case
the timestamp of the view modification should be b's newer timestamp,
not a's older one. The existing code took a's timestamp, assuming it
will be the same as b's, which is incorrect. The result was that if
we repeatedly modify only b, all view updates will receive the same
timestamp (a's old timestamp), and a deletion will always win over
all the modifications. This patch includes a reproducing test written by
a user (@Zak-Kent) that demonstrates how after a view row is deleted
it doesn't get recreated - because all the modifications use the same
timestamp.
The fix is, as suggested above, to use the *higher* of the two
timestamps of both base-regular-column GSI key columns as the timestamp
for the new view rows or view row deletions. The reproducer that
failed before this patch passes with it. As usual, the reproducer
passes on AWS DynamoDB as well, proving that the test is correct and
should really work.
Fixes #17119
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#17172
(cherry picked from commit 21e7deafeb)
The test is booting nodes, and then immediately starts shutting down
nodes and removing them from the cluster. The shutting down and
removing may happen before driver manages to connect to all nodes in the
cluster. In particular, the driver didn't yet connect to the last
bootstrapped node. Or it can even happen that the driver has connected,
but the control connection is established to the first node, and the
driver fetched topology from the first node when the first node didn't
yet consider the last node to be normal. So the driver decides to close
connection to the last node like this:
```
22:34:03.159 DEBUG> [control connection] Removing host not found in
peers metadata: <Host: 127.42.90.14:9042 datacenter1>
```
Eventually, at the end of the test, only the last node remains, all
other nodes have been removed or stopped. But the driver does not have a
connection to that last node.
Fix this problem by ensuring that:
- all nodes see each other as NORMAL,
- the driver has connected to all nodes
at the beginning of the test, before we start shutting down and removing
nodes.
Fixes scylladb/scylladb#16373
(cherry picked from commit a68701ed4f)
Closes #17703
Jumping to shard 0 during group 0 snapshot transfer is required because
we take the group 0 lock, which is only available on shard 0. But outside
of Raft mode it only pessimizes performance unnecessarily, so don't do it.
Repairs have to obtain a permit to the reader concurrency semaphore on
each shard they have a presence on. This is prone to deadlocks:
node1                             node2
repair1_master (takes permit)     repair1_follower (waits on permit)
repair2_master (waits for permit) repair2_follower (takes permit)
In lieu of strong central coordination, we solved this by making permits
evictable: repair2 can evict repair1's permit so it can obtain one and
make progress. This is not efficient, as evicting a permit usually
means discarding already-done work, but it prevents the deadlocks.
We recently discovered that there is a window when deadlocks can still
happen. The permit is made evictable when the disk reader is created.
This reader is an evictable one, which effectively makes the permit
evictable. But the permit is obtained when the repair constrol
structrure -- repair meta -- is create. Between creating the repair meta
and reading the first row from disk, the deadlock is still possible. And
we know that what is possible, will happen (and did happen). Fix by
making the permit evictable as soon as the repair meta is created. This
is very clunky and we should have a better API for this (refs #17644),
but for now we go with this simple patch, to make it easy to backport.
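The eviction mechanism described above can be sketched as follows -- a conceptual Python model with illustrative names, not Scylla's actual reader concurrency semaphore API. The point of the fix is that a permit becomes evictable as soon as the repair meta is created, so a competing repair can always reclaim it instead of deadlocking:

```python
# Conceptual model of evictable permits; names are illustrative.
class Permit:
    def __init__(self):
        self.evicted = False

class ConcurrencySemaphore:
    def __init__(self, permits):
        self.free = permits
        self.evictable = []            # permits that may be reclaimed

    def make_evictable(self, permit):
        # The fix: called as soon as the repair meta is created, not only
        # once the disk reader exists.
        self.evictable.append(permit)

    def obtain(self):
        if self.free == 0 and self.evictable:
            victim = self.evictable.pop(0)
            victim.evicted = True      # its already-done work is discarded
            self.free += 1
        if self.free == 0:
            return None                # caller would have to wait (deadlock risk)
        self.free -= 1
        return Permit()

sem = ConcurrencySemaphore(permits=1)
p1 = sem.obtain()                      # repair1 takes the only permit
sem.make_evictable(p1)                 # immediately evictable after the fix
p2 = sem.obtain()                      # repair2 evicts repair1 and proceeds
print(p2 is not None, p1.evicted)      # True True
```

In the buggy window, `make_evictable` would only run once the disk reader existed, so the second `obtain()` would return None and both repairs would wait forever.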
Refs: #17644
Fixes: #17591
Closes #17646
(cherry picked from commit c6e108a)
Backport notes:
The fix above does not apply to 5.2, because on 5.2 the reader is
created immediately when the repair-meta is created. So we don't need
the game with a fake inactive read, we can just pause the already
created reader in the repair-reader constructor.
Closes #17730
key_view::explode() contains a blatant use-after-free:
unless the input is already linearized, it returns a view to a local temporary buffer.
This is rare, because partition keys are usually not large enough to be fragmented.
But for a sufficiently large key, this bug causes a corrupted partition_key down
the line.
Fixes #17625
(cherry picked from commit 7a7b8972e5)
Closes #17725
Store schema_ptr in reader permit instead of storing a const pointer to
schema to ensure that the schema doesn't get changed elsewhere when the
permit is holding on to it. Also update the constructors and all the
relevant callers to pass down schema_ptr instead of a raw pointer.
Fixes #16180
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
Closes scylladb/scylladb#16658
(cherry picked from commit 76f0d5e35b)
Closes #17694
Commit 0c376043eb added access to the group0
semaphore, which can be done on shard0 only. Unlike all other group0 rpcs
(which are already always forwarded to shard0), migration_request does not
do so, since it is an rpc that was reused from pre-Raft days. The patch adds
the missing jump to shard0 before executing the rpc.
(cherry picked from commit 4a3c79625f)
Group0 state machine access atomicity is guaranteed by a mutex in the group0
client. Code that reads or writes the state needs to hold the lock. To
transfer schema part of the snapshot we used existing "migration
request" verb which did not follow the rule. Fix the code to take group0
lock before accessing schema in case the verb is called as part of
group0 snapshot transfer.
Fixes scylladb/scylladb#16821
(cherry picked from commit 0c376043eb)
Backport note: introduced missing
`raft_group0_client::hold_read_apply_mutex`
RPC calls lose information about the type of returned exception.
Thus, if a table is dropped on receiver node, but it still exists
on a sender node and sender node streams the table's data, then
the whole operation fails.
To prevent that, add a method which synchronizes schema and then
checks if the exception was caused by a table drop. If so,
the exception is swallowed.
Use the method in streaming and repair to continue them when
the table is dropped in the meantime.
Fixes: https://github.com/scylladb/scylladb/issues/17028.
Fixes: https://github.com/scylladb/scylladb/issues/15370.
Fixes: https://github.com/scylladb/scylladb/issues/15598.
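The synchronize-then-check pattern can be sketched like this (a Python analog with illustrative names; the real helper lives in the C++ repair/streaming code):

```python
def with_table_drop_silenced(op, sync_schema, table_exists):
    """Run op(); if it fails, sync schema and check whether the table was
    dropped in the meantime. If so, swallow the opaque failure."""
    try:
        return op()
    except Exception:
        sync_schema()                    # wait for schema changes to apply
        if not table_exists():
            return None                  # failure explained by the drop
        raise                            # a real error: propagate it

tables = {"ks.t"}

def sync_schema():
    tables.discard("ks.t")               # simulate: the drop arrives with the sync

def failing_stream():
    # the remote node dropped the table; over RPC the exception type is lost
    raise RuntimeError("opaque remote failure")

result = with_table_drop_silenced(failing_stream, sync_schema,
                                  lambda: "ks.t" in tables)
print(result)                            # None: silenced, operation continues
```

If the table still exists after the schema sync, the original exception is re-raised, since the failure then cannot be explained by a concurrent drop.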
Closes #17528
* github.com:scylladb/scylladb:
repair: handle no_such_column_family from remote node gracefully
test: test drop table on receiver side during streaming
streaming: fix indentation
streaming: handle no_such_column_family from remote node gracefully
repair: add methods to skip dropped table
The function `gms::version_generator::get_next_version()` can only be called from shard 0 as it uses a global, unsynchronized counter to issue versions. Notably, the function is used as a default argument for the constructor of `gms::versioned_value` which is used from shorthand constructors such as `versioned_value::cache_hitrates`, `versioned_value::schema` etc.
The `cache_hitrate_calculator` service runs a periodic job which updates the `CACHE_HITRATES` application state in the local gossiper state. Each time the job is scheduled, it runs on the next shard (it goes through shards in a round-robin fashion). The job uses the `versioned_value::cache_hitrates` shorthand to create a `versioned_value`, therefore risking a data race if it is not currently executing on shard 0.
The PR fixes the race by moving the call to `versioned_value::cache_hitrates` to shard 0. Additionally, in order to help detect similar issues in the future, a check is introduced to `get_next_version` which aborts the process if the function is called on a shard other than 0.
There is a possibility that it is a fix for #17493. Because `get_next_version` uses a simple incrementation to advance the global counter, a data race can occur if two shards call it concurrently and it may result in shard 0 returning the same or smaller value when called two times in a row. The following sequence of events is suspected to occur on node A:
1. Shard 1 calls `get_next_version()`, loads version `v - 1` from the global counter and stores in a register; the thread then is preempted,
2. Shard 0 executes `add_local_application_state()` which internally calls `get_next_version()`, loads `v - 1` then stores `v` and uses version `v` to update the application state,
3. Shard 0 executes `add_local_application_state()` again, increments version to `v + 1` and uses it to update the application state,
4. Gossip message handler runs, exchanging application states with node B. It sends its application state to B. Note that the max version of any of the local application states is `v + 1`,
5. Shard 1 resumes and stores version `v` in the global counter,
6. Shard 0 executes `add_local_application_state()` and updates the application state - again - with version `v + 1`.
7. After that, node B will never learn about the application state introduced in point 6. as gossip exchange only sends endpoint states with version larger than the previous observed max version, which was `v + 1` in point 4.
Note that the above scenario was _not_ reproduced. However, I managed to observe a race condition by:
1. modifying Scylla to run update of `CACHE_HITRATES` much more frequently than usual,
2. putting an assertion in `add_local_application_state` which fails if the version returned by `get_next_version` was not larger than the previous returned value,
3. running a test which performs schema changes in a loop.
The assertion from the second point was triggered. While it's hard to tell how likely it is to occur without making updates of cache hitrates more frequent - not to mention the full theorized scenario - for now this is the best lead that we have, and the data race being fixed here is a real bug anyway.
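The core of the race can be reproduced with a toy model (illustrative Python, not the Scylla implementation): an unsynchronized read-increment-write lets a preempted shard roll the counter back, after which shard 0 hands out the same version twice:

```python
# Toy model of the unsynchronized version counter race described above.
class VersionGenerator:
    def __init__(self):
        self.version = 0
    def load(self):
        return self.version
    def store(self, v):
        self.version = v
    def next_on_shard0(self):          # the fixed rule: only shard 0 calls this
        self.version += 1
        return self.version

g = VersionGenerator()
stale = g.load() + 1                   # shard 1 computes version 1, is preempted
g.next_on_shard0()                     # shard 0 issues version 1
v2 = g.next_on_shard0()                # shard 0 issues version 2
g.store(stale)                         # shard 1 resumes: counter jumps back to 1
v3 = g.next_on_shard0()                # shard 0 re-issues version 2 (duplicate!)
print(v2, v3)                          # 2 2 -- versions are no longer monotonic
```

With the fix, only shard 0 may call the generator, so the stale `store` from another shard can never happen.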
Refs: #17493
Closes scylladb/scylladb#17499
* github.com:scylladb/scylladb:
version_generator: check that get_next_version is called on shard 0
misc_services: fix data race from bad usage of get_next_version
(cherry picked from commit fd32e2ee10)
If no_such_column_family is thrown on remote node, then repair
operation fails as the type of exception cannot be determined.
Use repair::with_table_drop_silenced in repair to continue operation
if a table was dropped.
(cherry picked from commit cf36015591)
If no_such_column_family is thrown on remote node, then streaming
operation fails as the type of exception cannot be determined.
Use repair::with_table_drop_silenced in streaming to continue
operation if a table was dropped.
(cherry picked from commit 219e1eda09)
Schema propagation is async so one node can see the table while on
the other node it is already dropped. So, if the nodes stream
the table data, the latter node throws no_such_column_family.
The exception is propagated to the other node, but its type is lost,
so the operation fails on the other node.
Add method which waits until all raft changes are applied and then
checks whether given table exists.
Add a function which uses the above to determine whether the operation
failed because of a dropped table (e.g. on the remote node, where the exact
exception type is unknown). If so, the exception isn't rethrown.
(cherry picked from commit 5202bb9d3c)
If index_reader isn't closed before it is destroyed, then ongoing
sstables reads won't be awaited and assertion will be triggered.
Close index_reader in has_partition_key before destroying it.
Fixes: https://github.com/scylladb/scylladb/issues/17232.
Closes #17532
* github.com:scylladb/scylladb:
test: add test to check if reader is closed
sstables: close index_reader in has_partition_key
If index_reader isn't closed before it is destroyed, then ongoing
sstables reads won't be awaited and assertion will be triggered.
Close index_reader in has_partition_key before destroying it.
(cherry picked from commit 5227336a32)
Before this PR, writes to the previous CDC generations would
always be rejected. After this PR, they will be accepted if the
write's timestamp is greater than `now - generation_leeway`.
This change was proposed around 3 years ago. The motivation was
to improve user experience. If a client generates timestamps by
itself and its clock is desynchronized with the clock of the node
the client is connected to, there could be a period during
generation switching when writes fail. We didn't consider this
problem critical because the client could simply retry a failed
write with a higher timestamp. Eventually, it would succeed. This
approach is safe because these failed writes cannot have any side
effects. However, it can be inconvenient. Writing to previous
generations was proposed to improve it.
The idea was rejected 3 years ago. Recently, it turned out that
there is a case when the client cannot retry a write with the
increased timestamp. It happens when a table uses CDC and LWT,
which makes timestamps permanent. Once Paxos commits an entry
with a given timestamp, Scylla will keep trying to apply that entry
until it succeeds, with the same timestamp. Applying the entry
involves writing to the CDC log table. If it fails, we get stuck.
It's a major bug with an unknown perfect solution.
Allowing writes to previous generations for `generation_leeway` is
a probabilistic fix that should solve the problem in practice.
Apart from this change, this PR adds tests for it and updates
the documentation.
This PR is sufficient to enable writes to the previous generations
only in the gossiper-based topology. The Raft-based topology
needs some adjustments in loading and cleaning CDC generations.
These changes won't interfere with the changes introduced in this
PR, so they are left for a follow-up.
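The acceptance rule boils down to a single timestamp comparison. A sketch with illustrative names and an illustrative leeway value (not Scylla's default):

```python
GENERATION_LEEWAY = 5                  # seconds; illustrative, not the default

def accepts_write(write_ts, current_gen_start, now, leeway=GENERATION_LEEWAY):
    """A write routed to a previous CDC generation is accepted as long as
    its timestamp is greater than now - leeway."""
    if write_ts >= current_gen_start:
        return True                    # targets the current generation
    return write_ts > now - leeway     # previous generation: leeway window

now, gen_start = 100, 98               # the generation switched 2s ago
print(accepts_write(97, gen_start, now))   # True: within the leeway window
print(accepts_write(94, gen_start, now))   # False: too old, still rejected
```

Before this change the second branch was simply `return False`, which is what made permanent Paxos timestamps able to get stuck.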
Fixes scylladb/scylladb#7251
Fixes scylladb/scylladb#15260
Closes scylladb/scylladb#17134
* github.com:scylladb/scylladb:
docs: using-scylla: cdc: remove info about failing writes to old generations
docs: dev: cdc: document writing to previous CDC generations
test: add test_writes_to_previous_cdc_generations
cdc: generation: allow increasing generation_leeway through error injection
cdc: metadata: allow sending writes to the previous generations
(cherry picked from commit 9bb4482ad0)
Backport note: replaced `servers_add` with `server_add` loop in tests
replaced `error_injections_at_startup` (not implemented in 5.2) with
`enable_injection` post-boot
For efficiency, if a base-table update generates many view updates that
go to the same partition, they are collected as one mutation. If this
mutation grows too big it can lead to memory exhaustion, so since
commit 7d214800d0 we split the output
mutation into mutations no longer than 100 rows (max_rows_for_view_updates)
each.
This patch fixes a bug where this split was done incorrectly when
the update involved range tombstones, a bug which was discovered by
a user in a real use case (#17117).
Range tombstones are read in two parts, a beginning and an end, and the
code could split the processing between these two parts, with the result
that some of the range tombstones in the update could be missed - and the
view could miss some deletions that happened in the base table.
This patch fixes the code in two places to avoid breaking up the
processing between range tombstones:
1. The counter "_op_count" that decides where to break the output mutation
should only be incremented when adding rows to this output mutation.
The existing code strangely incremented it on every read (!?) which
resulted in the counter being incremented on every *input* fragment,
and in particular could reach the limit 100 between two range
tombstone pieces.
2. Moreover, the length of output was checked in the wrong place...
The existing code could get to 100 rows, not check at that point,
read the next input - half a range tombstone - and only *then*
check that we reached 100 rows and stop. The fix is to calculate
the number of rows in the right place - exactly when it's needed,
not before the step.
The first change needs more justification: The old code, which incremented
_op_count on every input fragment and not just output fragments, did not
fit the stated goal of its introduction - to avoid large allocations.
In one test it resulted in breaking up the output mutation to chunks of
25 rows instead of the intended 100 rows. But, maybe there was another
goal, to stop the iteration after 100 *input* rows and avoid the possibility
of stalls if there are no output rows? It turns out the answer is no -
we don't need this _op_count increment to avoid stalls: The function
build_some() uses `co_await on_results()` to run one step of processing
one input fragment - and `co_await` always checks for preemption.
I verified that indeed no stalls happen by using the existing test
test_long_skipped_view_update_delete_with_timestamp. It generates a
very long base update where all the view updates go to the same partition,
but all but the last few updates don't generate any view updates.
I confirmed that the fixed code loops over all these input rows without
increasing _op_count and without generating any view update yet, but it
does NOT stall.
This patch also includes two tests reproducing this bug and confirming
it's fixed, and also two additional tests for breaking up long deletions
that I wanted to make sure don't fail after this patch (they don't).
By the way, this fix would have also fixed issue #12297 - which we
fixed a year ago in a different way. That issue happened when the code
went through 100 input rows without generating *any* output rows,
and incorrectly concluding that there's no view update to send.
With this fix, the code no longer stops generating the view
update just because it saw 100 input rows - it would have waited
until it generated 100 output rows in the view update (or the
input is really done).
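The corrected counting logic can be sketched as follows -- a simplified Python model of the build loop; `max_rows_for_view_updates` is from the patch, the rest is illustrative. Only output rows advance the counter, and the limit is checked before consuming the next input fragment, so two adjacent tombstone pieces that generate no rows can never trip the limit between them:

```python
MAX_ROWS = 3                          # stands in for max_rows_for_view_updates

def split_view_updates(fragments):
    """fragments: ('row', k) or ('rt_begin'/'rt_end', k) input pieces.
    Yields output mutations of at most MAX_ROWS generated rows each."""
    chunk, op_count = [], 0
    for frag in fragments:
        if op_count >= MAX_ROWS:      # check *before* reading the next input
            yield chunk
            chunk, op_count = [], 0
        chunk.append(frag)
        if frag[0] == "row":          # only *output* rows bump the counter
            op_count += 1
    if chunk:
        yield chunk

frags = [("row", i) for i in range(3)] + [("rt_begin", 0), ("rt_end", 0)]
chunks = list(split_view_updates(frags))
print(len(chunks))                    # 2
print(chunks[1])                      # the tombstone pieces stay together
```

With the buggy counting (incrementing on every input fragment), the limit could be hit between `rt_begin` and `rt_end`, splitting the tombstone across output mutations.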
Fixes #17117
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#17164
(cherry picked from commit 14315fcbc3)
For gnutls 3.8.3.
Since Fedora 37 is end-of-life, pick the package from Fedora 38. libunistring
needs to be updated to satisfy the dependency solver.
Fixes #17285.
Closes scylladb/scylladb#17287
Signed-off-by: Avi Kivity <avi@scylladb.com>
Closes #17411
The currently used version of the "rustix" dependency had a minor
security vulnerability. This patch updates the corresponding
crate. The update was performed by running "cargo update" on the
"rustix" package, bumping it to version "0.36.17".
Refs #15772
Closes #17408
While describing materialized view, print `synchronous_updates` option
only if the tag is present in schema's extensions map. Previously if the
key wasn't present, the default (false) value was printed.
Fixes: #14924
Closes #14928
(cherry picked from commit b92d47362f)
The reason we introduced the tombstone-limit
(query_tombstone_page_limit), was to allow paged queries to return
incomplete/empty pages in the face of large tombstone spans. This works
by cutting the page after the tombstone-limit amount of tombstones were
processed. If the read is unpaged, it is killed instead. This was a
mistake. First, it doesn't really make sense: the reason we introduced
the tombstone limit was to allow paged queries to process large
tombstone spans without timing out. It does not help unpaged queries.
Furthermore, the tombstone-limit can kill internal queries done on
behalf of user queries, because all our internal queries are unpaged.
This can cause denial of service.
So in this patch we disable the tombstone-limit for unpaged queries
altogether, they are allowed to continue even after having processed the
configured limit of tombstones.
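A sketch of the resulting policy (illustrative constant and names, not the actual reader code):

```python
TOMBSTONE_PAGE_LIMIT = 1000      # stands in for query_tombstone_page_limit

def on_tombstone(tombstones_processed, is_paged):
    """Paged reads cut the page early once the limit is hit; unpaged reads
    (which include all internal queries) continue past it."""
    if is_paged and tombstones_processed >= TOMBSTONE_PAGE_LIMIT:
        return "cut-page"        # return a short/empty page, resume later
    return "continue"

print(on_tombstone(1500, is_paged=True))    # cut-page
print(on_tombstone(1500, is_paged=False))   # continue -- never killed
```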
Fixes: #17241
Closes scylladb/scylladb#17242
(cherry picked from commit f068d1a6fa)
* seastar 29badd99...ad0f2d5d (1):
> Merge "Slowdown IO scheduler based on dispatched/completed ratio" into branch-5.2
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This PR fixes a bug where certain calls to the `mintimeuuid()` CQL function with large negative timestamps could crash Scylla. It turns out we already had protections in place against very positive timestamps, but very negative timestamps could still cause bugs.
The actual fix in this series is just a few lines, but the bigger effort was improving the test coverage in this area. I added tests for the "date" type (the original reproducer for this bug used totimestamp() which takes a date parameter), and also reproducers for this bug directly, without totimestamp() function, and one with that function.
Finally, this PR also replaces the assert(), which turned this molehill of a bug into a mountain, with a throw.
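The shape of the fix, as a sketch (the epoch constant is the standard UUID version-1 Gregorian offset; the function name is illustrative): validate that the timestamp maps into the 60-bit 100-ns tick field and throw instead of asserting, for both very positive and very negative inputs:

```python
# Illustrative validation; timeuuid timestamps are 100-ns ticks since
# 1582-10-15 stored in a 60-bit field.
UUID_EPOCH_OFFSET_MS = 12219292800000   # ms from 1582-10-15 to 1970-01-01

def timeuuid_ticks(timestamp_ms):
    ticks = (timestamp_ms + UUID_EPOCH_OFFSET_MS) * 10000  # 100-ns units
    if not (0 <= ticks < 2 ** 60):
        raise ValueError(f"timestamp {timestamp_ms} out of timeuuid range")
    return ticks

print(timeuuid_ticks(0) == UUID_EPOCH_OFFSET_MS * 10000)   # True
try:
    timeuuid_ticks(-(2 ** 60))       # very negative: throws, no crash
except ValueError as e:
    print("rejected:", e)
```

Throwing (on_internal_error-style) turns an out-of-range input into a failed query rather than an aborted process.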
Fixes #17035
Closes scylladb/scylladb#17073
* github.com:scylladb/scylladb:
utils: replace assert() by on_internal_error()
utils: add on_internal_error with common logger
utils: add a timeuuid minimum, like we had maximum
test/cql-pytest: tests for "date" type
(cherry picked from commit 2a4b991772)
Backports required to fix scylladb/scylladb#16683 in 5.2:
- when creating first group 0 server, create a snapshot with non-empty ID, and start it at index 1 instead of 0 to force snapshot transfer to servers that join group 0
- add an API to trigger Raft snapshot
- use the API when we restart and see that the existing snapshot is at index 0, to trigger a new one --- in order to fix broken deployments that already bootstrapped with index-0 snapshot.
Closes #17087
* github.com:scylladb/scylladb:
test_raft_snapshot_request: fix flakiness (again)
test_raft_snapshot_request: fix flakiness
Merge 'raft_group0: trigger snapshot if existing snapshot index is 0' from Kamil Braun
Merge 'Add an API to trigger snapshot in Raft servers' from Kamil Braun
raft: server: add workaround for scylladb/scylladb#12972
raft: Store snapshot update and truncate log atomically
service: raft: force initial snapshot transfer in new cluster
raft_sys_table_storage: give initial snapshot a non zero value
Commit e81fc1f095 accidentally broke the control
flow of row_cache::do_update().
Before that commit, the body of the loop was wrapped in a lambda.
Thus, to break out of the loop, `return` was used.
The bad commit removed the lambda, but didn't update the `return` accordingly.
Thus, since the commit, the statement doesn't just break out of the loop as
intended, but also skips the code after the loop, which updates `_prev_snapshot_pos`
to reflect the work done by the loop.
As a result, whenever `apply_to_incomplete()` (the `updater`) is preempted,
`do_update()` fails to update `_prev_snapshot_pos`. It remains in a
stale state, until `do_update()` runs again and either finishes or
is preempted outside of `updater`.
If we read a partition processed by `do_update()` but not covered by
`_prev_snapshot_pos`, we will read stale data (from the previous snapshot),
which will be remembered in the cache as the current data.
This results in outdated data being returned by the replica.
(And perhaps in something worse if range tombstones are involved.
I didn't investigate this possibility in depth).
Note: for queries with CL>1, occurrences of this bug are likely to be hidden
by reconciliation, because the reconciled query will only see stale data if
the queried partition is affected by the bug on *all* queried replicas
at the time of the query.
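The control-flow bug in miniature (illustrative Python, not the row_cache code): a `return` that used to exit a lambda, left in place after the lambda was inlined into the loop, now also skips the bookkeeping that must run after the loop:

```python
def do_update_buggy(items):
    prev_snapshot_pos = None
    for item in items:
        if item == "preempted":
            return prev_snapshot_pos   # bug: skips the update below the loop
    prev_snapshot_pos = len(items)
    return prev_snapshot_pos

def do_update_fixed(items):
    prev_snapshot_pos = None
    done = 0
    for i, item in enumerate(items):
        if item == "preempted":
            break                      # leave the loop only
        done = i + 1
    prev_snapshot_pos = done           # always reflects the work done
    return prev_snapshot_pos

work = ["a", "b", "preempted", "c"]
print(do_update_buggy(work))           # None -- position left stale
print(do_update_fixed(work))           # 2 -- position reflects work done
```

In the real code the stale `_prev_snapshot_pos` is what allowed reads to return data from the previous snapshot.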
Fixes #16759
Closes scylladb/scylladb#17138
(cherry picked from commit ed98102c45)
At the end of the test, we wait until a restarted node receives a
snapshot from the leader, and then verify that the log has been
truncated.
To check the snapshot, the test used the `system.raft_snapshots` table,
while the log is stored in `system.raft`.
Unfortunately, the two tables are not updated atomically when Raft
persists a snapshot (scylladb/scylladb#9603). We first update
`system.raft_snapshots`, then `system.raft` (see
`raft_sys_table_storage::store_snapshot_descriptor`). So after the wait
finishes, there's no guarantee the log has been truncated yet -- there's
a race between the test's last check and Scylla doing that last delete.
But we can check the snapshot using `system.raft` instead of
`system.raft_snapshots`, as `system.raft` has the latest ID. And since
1640f83fdc, storing that ID and truncating
the log in `system.raft` happens atomically.
Closes scylladb/scylladb#17106
(cherry picked from commit c911bf1a33)
Add workaround for scylladb/python-driver#295.
Also, an assertion at the end of the test was incorrect; it is fixed, with
an appropriate comment added.
(cherry picked from commit 74bf60a8ca)
The persisted snapshot index may be 0 if the snapshot was created in an
older version of Scylla, which means snapshot transfer won't be
triggered to a bootstrapping node. Commands present in the log may not
cover all schema changes --- group 0 might have been created through the
upgrade procedure, on a cluster with existing schema. So a
deployment with an index-0 snapshot is broken and we need to fix it. We can
use the new `raft::server::trigger_snapshot` API for that.
Also add a test.
Fixes scylladb/scylladb#16683
Closes scylladb/scylladb#17072
* github.com:scylladb/scylladb:
test: add test for fixing a broken group 0 snapshot
raft_group0: trigger snapshot if existing snapshot index is 0
(cherry picked from commit 181f68f248)
Backport note: test_raft_fix_broken_snapshot had to be removed because
the "error injections enabled at startup" feature does not yet exist in
5.2.
This allows the user of `raft::server` to cause it to create a snapshot
and truncate the Raft log (leaving no trailing entries; in the future we
may extend the API to specify number of trailing entries left if
needed). In a later commit we'll add a REST endpoint to Scylla to
trigger group 0 snapshots.
One use case for this API is to create group 0 snapshots in Scylla
deployments which upgraded to Raft in version 5.2 and started with an
empty Raft log with no snapshot at the beginning. This causes problems,
e.g. when a new node bootstraps to the cluster, it will not receive a
snapshot that would contain both schema and group 0 history, which would
then lead to inconsistent schema state and trigger assertion failures as
observed in scylladb/scylladb#16683.
In 5.4 the logic of initial group 0 setup was changed to start the Raft
log with a snapshot at index 1 (ff386e7a44)
but a problem remains with these existing deployments coming from 5.2,
we need a way to trigger a snapshot in them (other than performing 1000
arbitrary schema changes).
Another potential use case in the future would be to trigger snapshots
based on external memory pressure in tablet Raft groups (for strongly
consistent tables).
The PR adds the API to `raft::server` and a HTTP endpoint that uses it.
In a follow-up PR, we plan to modify group 0 server startup logic to automatically
call this API if it sees that no snapshot is present yet (to automatically
fix the aforementioned 5.2 deployments once they upgrade).
Closes scylladb/scylladb#16816
* github.com:scylladb/scylladb:
raft: remove `empty()` from `fsm_output`
test: add test for manual triggering of Raft snapshots
api: add HTTP endpoint to trigger Raft snapshots
raft: server: add `trigger_snapshot` API
raft: server: track last persisted snapshot descriptor index
raft: server: framework for handling server requests
raft: server: inline `poll_fsm_output`
raft: server: fix indentation
raft: server: move `io_fiber`'s processing of `batch` to a separate function
raft: move `poll_output()` from `fsm` to `server`
raft: move `_sm_events` from `fsm` to `server`
raft: fsm: remove constructor used only in tests
raft: fsm: move trace message from `poll_output` to `has_output`
raft: fsm: extract `has_output()`
raft: pass `max_trailing_entries` through `fsm_output` to `store_snapshot_descriptor`
raft: server: pass `*_aborted` to `set_exception` call
(cherry picked from commit d202d32f81)
Backport notes:
- `has_output()` has a smaller condition in the backported version
(because the condition was smaller in `poll_output()`)
- `process_fsm_output` has a smaller body (because `io_fiber` had a
smaller body) in the backported version
- the HTTP API is only started if `raft_group_registry` is started
When a node joins the cluster, it closes connections after learning
topology information from other nodes, in order to reopen them with
correct encryption, compression etc.
In ScyllaDB 5.2, this mechanism may interrupt an ongoing Raft snapshot
transfer. This was fixed in later versions by putting some order into
the bootstrap process with 50e8ec77c6 but
the fix was not backported due to many prerequisites and complexity.
Raft automatically recovers from interrupted snapshot transfer by
retrying it eventually, and everything works. However an ERROR is
reported due to that one failed snapshot transfer, and dtests don't like
ERRORs -- they report the test case as failed if an ERROR happened in
any node's logs even if the test passed otherwise.
Here we apply a simple workaround to please dtests -- in this particular
scenario, turn the ERROR into a WARN.
When we upgrade a cluster to use Raft, or perform manual Raft recovery
procedure (which also creates a fresh group 0 cluster, using the same
algorithm as during upgrade), we start with a non-empty group 0 state
machine; in particular, the schema tables are non-empty.
In this case we need to ensure that nodes which join group 0 receive the
group 0 state. Right now this is not the case. In previous releases,
where group 0 consisted only of schema, and schema pulls were also done
outside Raft, those nodes received schema through this outside
mechanism. In 91f609d065 we disabled
schema pulls outside Raft; we're also extending group 0 with other
things, like topology-specific state.
To solve this, we force snapshot transfers by setting the initial
snapshot index on the first group 0 server to `1` instead of `0`. During
replication, Raft will see that the joining servers are behind,
triggering snapshot transfer and forcing them to pull group 0 state.
It's unnecessary to do this for clusters which bootstrap with Raft
enabled right away, but it also doesn't hurt, so we keep the logic simple
and don't introduce branches based on that.
Extend Raft upgrade tests with a node bootstrap step at the end to
prevent regressions (without this patch, the step would hang - node
would never join, waiting for schema).
Fixes: #14066
Closes #14336
(cherry picked from commit ff386e7a44)
Backport note: contrary to the claims above, it turns out that it is
actually necessary to create snapshots in clusters which bootstrap with
Raft, because tombstones in the current schema state expire, hence
applying schema mutations from old Raft log entries is not really
idempotent. Snapshot transfer, which transfers group 0 history and
state_ids, prevents old entries from applying schema mutations over
latest schema state.
Ref: scylladb/scylladb#16683
We create a snapshot (config only, but still), but do not assign it any
id. Because of that it is not loaded on start. We do want it to be
loaded though since the state of group0 will not be re-created from the
log on restart because the entries will have outdated id and will be
skipped. As a result, the in-memory state machine state will not be restored.
This is not a problem now since schema state is restored outside of the raft
code.
Message-Id: <20230316112801.1004602-5-gleb@scylladb.com>
(cherry picked from commit a690070722)
Before returning the task status, wait_task waits for the task to finish
with the done() method and calls get() on the resulting future.
If the requested task failed, an exception was thrown and the user
got an internal server error instead of the failed task status.
Now the result of the done() method is ignored.
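The intent of the fix can be sketched with an asyncio analog (hypothetical names; the real code is Seastar C++): wait for the task, but consume a failure instead of letting it escape, because the task's failed status is itself a valid response:

```python
import asyncio

async def wait_task(done_future, get_status):
    """Wait for the task; a failure is reported through its status,
    not as an internal server error."""
    try:
        await done_future              # result of done() is otherwise ignored
    except Exception:
        pass
    return get_status()

async def main():
    failed = asyncio.get_running_loop().create_future()
    failed.set_exception(RuntimeError("task failed"))
    return await wait_task(failed, lambda: {"state": "failed"})

status = asyncio.run(main())
print(status)                          # {'state': 'failed'} instead of a 500
```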
Fixes: #14914.
(cherry picked from commit ae67f5d47e)
Closes #16438
discard_result ignores only successful futures. Thus, if the
perform_compaction<regular_compaction_task_executor> call fails,
the failed future is considered abandoned, causing tests to fail.
Explicitly ignore the failed future.
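The same "abandoned failed future" problem exists in asyncio, which makes for a convenient sketch (illustrative names, not the Seastar API): a helper that only discards the value leaves a failure unobserved, so the fix is to observe the exception explicitly:

```python
import asyncio

def ignore_result_and_failure(task):
    """Unlike a discard_result-style helper, also observe a failure so the
    failed future is not considered abandoned."""
    def consume(t):
        if not t.cancelled():
            t.exception()              # explicitly observe the exception
    task.add_done_callback(consume)

async def main():
    async def failing_compaction():
        raise RuntimeError("compaction failed")
    t = asyncio.ensure_future(failing_compaction())
    ignore_result_and_failure(t)       # no "exception was never retrieved"
    await asyncio.wait([t])            # let it finish; wait() never re-raises
    return t.exception() is not None

observed = asyncio.run(main())
print(observed)                        # True: failure observed, not abandoned
```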
Fixes: #14971.
Closes #15000
(cherry picked from commit 7a28cc60ec)
Closes #16441
`system.raft` was using the "user memory pool", i.e. the
`dirty_memory_manager` for this table was set to
`database::_dirty_memory_manager` (instead of
`database::_system_dirty_memory_manager`).
This meant that if a write workload caused memory pressure on the user
memory pool, internal `system.raft` writes would have to wait for
memtables of user tables to get flushed before the write would proceed.
This was observed in SCT longevity tests which ran a heavy workload on
the cluster and concurrently, schema changes (which underneath use the
`system.raft` table). Raft would often get stuck waiting many seconds
for user memtables to get flushed. More details in issue #15622.
Experiments showed that moving Raft to system memory fixed this
particular issue, bringing the waits to reasonable levels.
Currently `system.raft` stores only one group, group 0, which is
internally used for cluster metadata operations (schema and topology
changes) -- so it makes sense to keep using system memory.
In the future we'd like to have other groups, for strongly consistent
tables. These groups should use the user memory pool. It means we won't
be able to use `system.raft` for them -- we'll just have to use a
separate table.
Fixes: scylladb/scylladb#15622
Closes scylladb/scylladb#15972
(cherry picked from commit f094e23d84)
When a base table is altered, so are the views that might
refer to the added column, which includes "SELECT *" views and also
views that might need to use this column for row lifetimes (virtual
columns).
However the query processor implementation for views change notification
was an empty function.
Since views are tables, the query processor needs to at least treat them
as such (and maybe in the future, do also some MV specific stuff).
This commit adds a call to `on_update_column_family` from within
`on_update_view`.
The practical side effect, as of today, is that prepared statements for views
which changed due to a base table change will be invalidated.
Fixes https://github.com/scylladb/scylladb/issues/16392
This series also adds a test which fails without this fix and passes when the fix is applied.
Closes scylladb/scylladb#16897
* github.com:scylladb/scylladb:
Add test for mv prepared statements invalidation on base alter
query processor: treat view changes at least as table changes
(cherry picked from commit 5810396ba1)
On some environments, such as VMware instances, /dev/disk/by-uuid/<UUID> is
not available, and scylla_raid_setup will fail while mounting the volume.
To avoid failing to mount /dev/disk/by-uuid/<UUID>, fetch all available
paths to mount the disk and fall back to other paths like by-partuuid,
by-id, by-path, or just use the real device path like /dev/md0.
To get the device path, and also to dump device status when the UUID is not
available, this introduces a UdevInfo class which communicates with udev
using pyudev.
Related #11359
Closes scylladb/scylladb#13803
(cherry picked from commit 58d94a54a3)
[syuu: regenerate tools/toolchain/image for new python3-pyudev package]
Closes #16938
When the reader is currently paused, it is resumed, fast-forwarded, then
paused again. The fast forwarding part can throw and this will lead to
destroying the reader without it being closed first.
Add a try-catch surrounding this part in the code. Also mark
`maybe_pause()` and `do_pause()` as noexcept, to make it clear why
that part doesn't need to be in the try-catch.
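The shape of the fix, sketched in Python (illustrative names; the real reader is a Seastar C++ object): if fast-forwarding a resumed reader throws, close the reader before propagating, so it is never destroyed with a read still in flight:

```python
class Reader:
    def __init__(self):
        self.closed = False
    def fast_forward_to(self, pos):
        raise RuntimeError("fast forward failed")   # simulate the throw
    def close(self):                 # noexcept-style: must not throw
        self.closed = True

def resume_and_fast_forward(reader, pos):
    try:
        reader.fast_forward_to(pos)
    except Exception:
        reader.close()               # don't leak an unclosed reader
        raise

r = Reader()
try:
    resume_and_fast_forward(r, 42)
except RuntimeError:
    pass
print(r.closed)                      # True: closed despite the exception
```

Marking the pause path noexcept (as the patch does) is what lets everything outside this one try-catch stay exception-free.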
Fixes: #16606
Closes scylladb/scylladb#16630
(cherry picked from commit 204d3284fa)
If a std::vector is resized, its iterators and references may
get invalidated. While task_manager::task::impl::_children's
iterators are avoided throughout the code, references to its
elements are being used.
Since the children vector does not need random access to its
elements, change its type to std::list<foreign_task_ptr>, whose
iterators and references aren't invalidated on element insertion.
Fixes: #16380.
Closesscylladb/scylladb#16381
(cherry picked from commit 9b9ea1193c)
Closes#16777
Table properties validation is performed on statement execution.
Thus, when one attempts to create a table with invalid options,
an incorrect command gets committed in Raft. But then its
application fails, leading to a raft machine being stopped.
Check table properties when create and alter statements are prepared.
Fixes: https://github.com/scylladb/scylladb/issues/14710.
Closes #16750
* github.com:scylladb/scylladb:
cql3: statements: delete execute override
cql3: statements: call check_restricted_table_properties in prepare
cql3: statements: pass data_dictionary::database to check_restricted_table_properties
Delete the overridden create_table_statement::execute, as it only calls
its direct parent's (schema_altering_statement) execute method anyway.
(cherry picked from commit 6c7eb7096e)
Table properties validation is performed on statement execution.
Thus, when one attempts to create a table with invalid options,
an incorrect command gets committed in Raft. But then its
application fails, leading to a raft machine being stopped.
Check table properties when create and alter statements are prepared.
The error is no longer returned as an exceptional future, but it
is thrown. Adjust the tests accordingly.
(cherry picked from commit 60fdc44bce)
Pass data_dictionary::database to check_restricted_table_properties
as an argument instead of query_processor, as the method will be called
from a context which does not have access to the query processor.
(cherry picked from commit ec98b182c8)
TWCS tables require partition-estimation adjustment, as incoming streaming data can be segregated into time windows.
It turns out we had two problems in this area that led to suboptimal bloom filters:
1) With off-strategy enabled, data segregation is postponed, but the partition estimation was adjusted as if segregation wasn't postponed. Solved by not adjusting the estimation if segregation is postponed.
2) With off-strategy disabled, data segregation is not postponed, but streaming didn't feed any metadata into the partition-estimation procedure, meaning it had to assume the maximum number of windows input data can be segregated into (100). Solved by using the schema's default TTL for a precise estimation of the window count.
For the future, we want to dynamically size filters (see https://github.com/scylladb/scylladb/issues/2024), especially for TWCS, which might have SSTables that are left uncompacted until they're fully expired, meaning the system won't heal itself in a timely manner through compaction of an SSTable whose partition estimation was really wrong.
Fixes https://github.com/scylladb/scylladb/issues/15704.
Closes scylladb/scylladb#15938
* github.com:scylladb/scylladb:
streaming: Improve partition estimation with TWCS
streaming: Don't adjust partition estimate if segregation is postponed
(cherry picked from commit 64d1d5cf62)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #16672
Fixes #15269
If a segment being replayed is corrupted/truncated, we can attempt to skip
completely bogus byte amounts, which can trigger an assert (i.e. crash) in
file_data_source_impl. This is not a crash-level error, so ensure we
range-check the distance in the reader.
v2: Add to corrupt_size if trying to skip more than available. The amount added is "wrong", but at least it will
ensure we log the fact that things are broken.
Closes scylladb/scylladb#15270
(cherry picked from commit 6ffb482bf3)
The reader used to read the sstables was not closed. This could
sometimes trigger an abort(), because the reader was destroyed, without
it being closed first.
Why only sometimes? This is due to two factors:
* read_mutation_from_flat_mutation_reader() - the method used to extract
  a mutation from the reader - uses consume(), which does not trigger
  `set_close_is_required()` (#16520). Due to this, the top-level
  combined reader did not complain when destroyed without close.
* The combined reader closes underlying readers that have no more data
  for the current range. If the circumstances are just right, all
  underlying readers are closed before the combined reader is
  destroyed. It looks like this is what happens most of the time.
This bug was discovered in SCT testing. After fixing #16520, all
invocations of `scylla-sstable` which use this code would trigger the
abort without this patch, so no further testing is required.
Fixes: #16519
Closes scylladb/scylladb#16521
(cherry picked from commit da033343b7)
The schema version is updated by group0, so if group0 starts before the
schema-version observer is registered, some updates may be missed. Since
the observer is used to update the node's gossiper state, the gossiper
may contain a wrong schema version.
Fix by registering the observer before starting group0, and even before
starting the gossiper, to avoid a theoretical case where something may
pull the schema after gossiping starts and before the observer is
registered.
Fixes: #15078
Message-Id: <ZOYZWhEh6Zyb+FaN@scylladb.com>
(cherry picked from commit d1654ccdda)
Fixes #9405
The `sync_point` API, when provided with an incorrect sync point id, might
allocate a crazy amount of memory and fail with `std::bad_alloc`.
To fix this, we can check whether the encoded sync point has been modified
before decoding. We can achieve this by calculating a checksum before
encoding, appending it to the encoded sync point, and comparing it with
a checksum calculated in `db::hints::decode` before decoding.
Closes #14534
* github.com:scylladb/scylladb:
db: hints: add checksum to sync point encoding
db: hints: add the version_size constant
(cherry picked from commit eb6202ef9c)
The only difference from the original merge commit is the include
path of `xx_hasher.hh`. On branch 5.2, this file is in the root
directory, not `utils`.
Closes #16458
* tools/java e2aad6e3a0...d7ec9bf45f (1):
> Merge "build: take care of old libthrift" from Piotr Grabowski
Fixes: scylladb/scylla-tools-java#352
Closes #16464
Lists can grow very big. Let's use a chunked vector to prevent large contiguous
allocations.
Fixes: #15302.
Closes scylladb/scylladb#15428
(cherry picked from commit 62a8a31be7)
The default reconnection policy in Python Driver is an exponential
backoff (with jitter) policy, which starts at 1 second reconnection
interval and ramps up to 600 seconds.
This is a problem in tests (refs #15104), especially in tests that restart
or replace nodes. In such a scenario, a node can be unavailable for an
extended period of time and the driver will try to reconnect to it
multiple times, eventually reaching very long reconnection interval
values, exceeding the timeout of a test.
Fix the issue by using an exponential reconnection policy with a maximum
interval of 4 seconds. A smaller value was not chosen, as each retry
clutters the logs with a reconnection-exception stack trace.
Fixes #15104
Closes #15112
(cherry picked from commit 17e3e367ca)
server_impl::apply_snapshot() assumes that it cannot receive a snapshot
from the same host until the previous one is handled, and usually this is
true, since a leader will not send another snapshot until it gets a
response to the previous one. But it may happen that the snapshot-sending
RPC fails after the snapshot was sent, but before the reply is received,
because of a connection disconnect. In this case the leader may send
another snapshot, and there is no guarantee that the previous one was
already handled, so the assumption may break.
Drop the assert that verifies the assumption and return an error in this
case instead.
Fixes: #15222
Message-ID: <ZO9JoEiHg+nIdavS@scylladb.com>
(cherry picked from commit 55f047f33f)
Having values of the duration type is not allowed for clustering
columns, because duration values can't be ordered. This is correctly
validated when creating a table, but not when altering a column's type.
Fixes #12913
Closes scylladb/scylladb#16022
(cherry picked from commit bd73536b33)
The systemd man page says:
systemd-fstab-generator(3) automatically adds dependencies of type Before= to
all mount units that refer to local mount points for this target unit.
So "Before=local-fs.target" is the correct dependency for local mount
points, but we currently specify "After=local-fs.target"; this should be
fixed.
Also replaced "WantedBy=multi-user.target" with "WantedBy=local-fs.target",
since .mount units are not related to multi-user but depend on local
filesystems.
Fixes #8761
Closes scylladb/scylladb#15647
(cherry picked from commit a23278308f)
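A sketch of the resulting unit ordering, for a hypothetical data-directory mount unit (the unit name, device, path, and filesystem below are illustrative, not the actual shipped unit):

```ini
# var-lib-scylla.mount (illustrative)
[Unit]
Description=Scylla data directory
Before=local-fs.target

[Mount]
What=/dev/md0
Where=/var/lib/scylla
Type=xfs

[Install]
WantedBy=local-fs.target
```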
The shard id is logged twice in repair (once explicitly, once added by the logger).
The redundant occurrence is deleted.
shard_repair_task_impl::id (which contains the global repair id)
is renamed to avoid further confusion.
Fixes: https://github.com/scylladb/scylladb/issues/12955
Closes #16439
* github.com:scylladb/scylladb:
repair: rename shard_repair_task_impl::id
repair: delete redundant shard id from logs
Alternator's implementation of TagResource, UntagResource and UpdateTimeToLive (the latter uses tags to store the TTL configuration) was unsafe for concurrent modifications - some of these modifications may be lost. This short series fixes the bug, and also adds (in the last patch) a test that reproduces the bug and verifies that it's fixed.
The cause of the incorrect isolation was that we separately read the old tags and wrote the modified tags. In this series we introduce a new function, `modify_tags()` which can do both under one lock, so concurrent tag operations are serialized and therefore isolated as expected.
Fixes #6389.
Closes #13150
* github.com:scylladb/scylladb:
test/alternator: test concurrent TagResource / UntagResource
db/tags: drop unsafe update_tags() utility function
alternator: isolate concurrent modification to tags
db/tags: add safe modify_tags() utility functions
migration_manager: expose access to storage_proxy
(cherry picked from commit dba1d36aa6)
Closes #16453
Backport the following patches to 5.2:
- gossiper: mark_alive: enter background_msg gate (#14791)
- gossiper: mark_alive: use deferred_action to unmark pending (#14839)
Closes #16452
* github.com:scylladb/scylladb:
gossiper: mark_alive: use deferred_action to unmark pending
gossiper: mark_alive: enter background_msg gate
The implementation of "SELECT TOJSON(t)" or "SELECT JSON t" for a column
of type "time" forgot to put the time string in quotes. The result was
invalid JSON. This patch is a one-liner fixing this bug.
This patch also removes the "xfail" marker from one xfailing test
for this issue which now starts to pass. We also add a second test for
this issue - the existing test was for "SELECT TOJSON(t)", and the second
test shows that "SELECT JSON t" had exactly the same bug - and both are
fixed by the same patch.
We also had a test translated from Cassandra which exposed this bug,
but that test continues to fail because of other bugs, so we just
need to update the xfail string.
The patch also fixes one C++ test, test/boost/json_cql_query_test.cc,
which enshrined the *wrong* behavior - JSON output that isn't even
valid JSON - and had to be fixed. Unlike the Python tests, the C++ test
can't be run against Cassandra, and doesn't even run a JSON parser
on the output, which explains how it came to enshrine wrong output
instead of helping to discover the bug.
Fixes #7988
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#16121
(cherry picked from commit 8d040325ab)
Make sure _pending_mark_alive_endpoints is unmarked in
any case, including exceptions.
Fixes #14839
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #14840
(cherry picked from commit 1e7e2eeaee)
We compare the symbol list of the stripped ELF file ($orig.stripped) with
that of the one including debugging symbols ($orig.debug) to get
an ELF file which includes only the necessary bits as the debuginfo
($orig.minidebug).
But we generate the symbol list of the stripped ELF file using the
sysv format, while generating the one from the unstripped file using the
posix format. The former always pads the symbol names with spaces
so that their length is at least that of the section name, after
we split the fields with "|".
That's why the diff includes stuff we don't expect, and hence
we have tons of warnings like:
```
objcopy: build/node_exporter/node_exporter.keep_symbols:4910: Ignoring rubbish found on this line
```
when using objcopy to filter the ELF file to keep only the
symbols we are interested in.
So, in this change:
* use the same format when dumping the symbols from the unstripped ELF
  file
* include the symbols in the text area -- the code -- by checking for
  "T" and "t" in the dumped symbols. This was achieved by matching
  the lines with "FUNC" before this change.
* include the symbols in the .init data section -- the global
  variables which are initialized at compile time. They could
  also be interesting when debugging an application.
Fixes #15513
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes scylladb/scylladb#15514
(cherry picked from commit 50c937439b)
If the constructor of row_cache throws, `_partitions` is cleared in the
wrong allocator, possibly causing allocator corruption.
Fix that.
Fixes #15632
Closes scylladb/scylladb#15633
(cherry picked from commit 330d221deb)
For JSON objects represented as map<ascii, int>, don't treat ASCII keys
as a nested JSON string. We were doing that prior to the patch, which
led to parsing errors.
Included the error offset where JSON parsing failed for
rjson::parse related functions to help identify parsing errors
better.
Fixes: #7949
Signed-off-by: Michael Huang <michaelhly@gmail.com>
Closes scylladb/scylladb#15499
(cherry picked from commit 75109e9519)
Before this change, when running into a zero chunk_len, Scylla
crashes with `assert(chunk_size != 0)`. But we can do better than
printing a backtrace like:
```
scylla: sstables/compress.cc:158: void
sstables::compression::segmented_offsets::init(uint32_t): Assertion `chunk_size != 0' failed.
```
So, in this change, a `malformed_sstable_exception` is thrown in place
of the `assert()`; the latter is supposed to verify programming
invariants, not to identify a corrupted data file.
Fixes #15265
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #15264
(cherry picked from commit 1ed894170c)
When a table is dropped, we delete its sstables, and finally try to delete
the table's top-level directory with the rmdir system call. When the
auto-snapshot feature is enabled (this is still Scylla's default),
the snapshot will remain in that directory, so it won't be empty and
cannot be removed. Today, this results in a long, ugly and scary warning
in the log:
```
WARN 2023-07-06 20:48:04,995 [shard 0] sstable - Could not remove table directory "/tmp/scylla-test-198265/data/alternator_alternator_Test_1688665684546/alternator_Test_1688665684546-4238f2201c2511eeb15859c589d9be4d/snapshots": std::filesystem::__cxx11::filesystem_error (error system:39, filesystem error: remove failed: Directory not empty [/tmp/scylla-test-198265/data/alternator_alternator_Test_1688665684546/alternator_Test_1688665684546-4238f2201c2511eeb15859c589d9be4d/snapshots]). Ignored.
```
It is bad to log as a warning something which is completely normal - it
happens every time a table is dropped with the perfectly valid (and even
default) auto-snapshot mode. We should only log a warning if the deletion
failed because of some unexpected reason.
And in fact, this is exactly what the code **tried** to do - it does
not log a warning if the rmdir failed with EEXIST. It even had a comment
saying why it was doing this. But the problem is that in Linux, deleting
a non-empty directory does not return EEXIST, it returns ENOTEMPTY...
Posix actually allows both. So we need to check both, and this is the
only change in this patch.
To confirm that this patch works, edit test/cql-pytest/run.py and
change auto-snapshot from 0 to 1, run test/alternator/run (for example),
and see many "Directory not empty" warnings as above. With this patch,
none of these warnings appear.
Fixes #13538
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #14557
(cherry picked from commit edfb89ef65)
Before this change, we formatted a `long` using `{:f}`; fmtlib would
throw an exception when actually formatting it.
So, let's make the percentage a float before formatting it.
Fixes #14587
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #14588
(cherry picked from commit 1eb76d93b7)
shard_repair_task_impl::id stores global repair id. To avoid confusion
with the task id, the field is renamed to global_repair_id.
(cherry picked from commit d889a599e8)
The problem was that the holder in the with_gate
call was released too early. This happened
before the possible call to on_hint_send_failure
in then_wrapped. As a result, the effects of
on_hint_send_failure (the segment_replay_failed flag)
were not visible in send_one_file after
ctx_ptr->file_send_gate.close(), so we could decide
that the segment was sent in full and delete
it, even if sending some hints led to errors.
Fixes #15110
(cherry picked from commit 9fd3df13a2)
Before this change, we formatted an sstring with "{:d}"; fmtlib would throw
`fmt::format_error` at runtime when formatting it. This is not expected.
So, in this change, we just print the int8_t using `seastar::format()`
in a single pass, and with the format specifier `#02x` instead of
adding the "0x" prefix manually.
Fixes #14577
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #14578
(cherry picked from commit 27d6ff36df)
Before this change, `scylla sstable dump-statistics` prints
"regular_columns" as a flat list of strings, like:
```
"regular_columns": [
"name",
"clustering_order",
"type_name",
"org.apache.cassandra.db.marshal.UTF8Type",
"name",
"column_name_bytes",
"type_name",
"org.apache.cassandra.db.marshal.BytesType",
"name",
"kind",
"type_name",
"org.apache.cassandra.db.marshal.UTF8Type",
"name",
"position",
"type_name",
"org.apache.cassandra.db.marshal.Int32Type",
"name",
"type",
"type_name",
"org.apache.cassandra.db.marshal.UTF8Type"
]
```
but according to
https://opensource.docs.scylladb.com/stable/operating-scylla/admin-tools/scylla-sstable.html#dump-statistics,
> $SERIALIZATION_HEADER_METADATA := {
> "min_timestamp_base": Uint64,
> "min_local_deletion_time_base": Uint64,
> "min_ttl_base": Uint64",
> "pk_type_name": String,
> "clustering_key_types_names": [String, ...],
> "static_columns": [$COLUMN_DESC, ...],
> "regular_columns": [$COLUMN_DESC, ...],
> }
>
> $COLUMN_DESC := {
> "name": String,
> "type_name": String
> }
"regular_columns" is supposed to be a list of "$COLUMN_DESC".
the same applies to "static_columnes". this schema makes sense,
as each column should be considered as a single object which
is composed of two properties. but we dump them like a list.
so, in this change, we guard each visit() call of `json_dumper()`
with `StartObject()` and `EndObject()` pair, so that each column
is printed as an object.
After the change, "regular_columns" is printed like:
```
"regular_columns": [
{
"name": "clustering_order",
"type_name": "org.apache.cassandra.db.marshal.UTF8Type"
},
{
"name": "column_name_bytes",
"type_name": "org.apache.cassandra.db.marshal.BytesType"
},
{
"name": "kind",
"type_name": "org.apache.cassandra.db.marshal.UTF8Type"
},
{
"name": "position",
"type_name": "org.apache.cassandra.db.marshal.Int32Type"
},
{
"name": "type",
"type_name": "org.apache.cassandra.db.marshal.UTF8Type"
}
]
```
Fixes #15036
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #15037
(cherry picked from commit c82f1d2f57)
This commit introduces a new boolean flag, `shutdown`, to the
forward_service, along with a corresponding shutdown method. It also
adds checks throughout the forward_service to verify the value of the
shutdown flag before retrying or invoking functions that might use the
messaging service under the hood.
The flag is set before messaging service shutdown, by invoking
forward_service::shutdown in main. By checking the flag before each call
that potentially involves the messaging service, we can ensure that the
messaging service is still operational. If the flag is false, indicating
that the messaging service is still active, we can proceed with the
call. In the event that the messaging service is shutdown during the
call, appropriate exceptions should be thrown somewhere down in called
functions, avoiding potential hangs.
This fix should resolve the issue where forward_service retries could
block the shutdown.
Fixes #12604
Closes #13922
(cherry picked from commit e0855b1de2)
We had a redundant copy in the receive_mutation_handler
forward_fn callback. This frozen_mutation is
dynamically allocated and can be arbitrarily large.
Fixes: #12504
(cherry picked from commit 5adbb6cde2)
This is used for the readiness API, /storage_service/rpc_server, and the fix prevents it from returning 'true' prematurely.
Some improvement for readiness was added in a51529dd15, but the thrift implementation wasn't fully done.
Fixes https://github.com/scylladb/scylladb/issues/12376
Closes #13319
* github.com:scylladb/scylladb:
thrift: return address in listen_addresses() only after server is ready
thrift: simplify do_start_server() with seastar:async
(cherry picked from commit 9a024f72c4)
Fix a few cases where instead of printing column names in error messages, we printed weird stuff like ASCII codes or the address of the name.
Fixes #13657
Closes #13658
* github.com:scylladb/scylladb:
cql3: fix printing of column_specification::name in some error messages
cql3: fix printing of column_definition::name in some error messages
(cherry picked from commit a29b8cd02b)
This series introduces a new gossiper method, get_endpoints, that returns a vector of endpoints (by value) based on the endpoint state map.
get_endpoints is used here by the gossiper and storage_service for iterations that may preempt,
instead of iterating directly over the endpoint state map (`_endpoint_state_map` in gossiper, or via `get_endpoint_states()`), to prevent a use-after-free that could happen if the map is rehashed while the function yields, invalidating the loop iterators.
Fixes #13899
Closes #13900
* github.com:scylladb/scylladb:
storage_service: do not preempt while traversing endpoint_state_map
gossiper: do not preempt while traversing endpoint_state_map
(cherry picked from commit d2d53fc1db)
Closes #16431
`clustering_key_columns()` returns a range view, and `front()` returns
a reference to its first element, so we cannot assume that this reference
remains valid after the full expression is evaluated. To address this
issue, let's capture the returned range by value, and keep the first
element by reference.
This also silences a warning from GCC-13:
```
/home/kefu/dev/scylladb/db/schema_tables.cc:3654:30: error: possibly dangling reference to a temporary [-Werror=dangling-reference]
3654 | const column_definition& first_view_ck = v->clustering_key_columns().front();
| ^~~~~~~~~~~~~
/home/kefu/dev/scylladb/db/schema_tables.cc:3654:79: note: the temporary was destroyed at the end of the full expression ‘(& v)->view_ptr::operator->()->schema::clustering_key_columns().boost::iterator_range<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> > >::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::random_access_traversal_tag>::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::bidirectional_traversal_tag>::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::incrementable_traversal_tag>::front()’
3654 | const column_definition& first_view_ck = v->clustering_key_columns().front();
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~
```
Fixes #13720
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #13721
(cherry picked from commit 135b4fd434)
Currently, the reader might dereference a null pointer
if the input stream reaches EOF prematurely
and read_exactly returns an empty temporary_buffer.
Detect this condition before dereferencing the buffer
and throw sstables::malformed_sstable_exception.
Fixes #13599
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #13600
(cherry picked from commit 77b70dbdb7)
The previous version of wasmtime had a vulnerability that possibly
allowed causing undefined behavior when calling UDFs.
We're directly updating to wasmtime 8.0.1, because the update only
requires a slight code modification and the Wasm UDF feature is
still experimental. As a result, we'll benefit from a number of
new optimizations.
Fixes #13807
Closes #13804
(cherry picked from commit 6bc16047ba)
The metric shows the opposite of what its name suggests.
It shows available memory rather than consumed memory.
Fix that.
Fixes #13810
Closes #13811
(cherry picked from commit 0813fa1da0)
The USE statement execution code can throw if the keyspace
doesn't exist. This can be a problem for code that uses
execute in a fiber, since the exception will break the fiber even
if `then_wrapped` is used.
Fixes #14449
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Closes scylladb/scylladb#14394
(cherry picked from commit c5956957f3)
reconcilable_result_builder passes range tombstone changes to _rt_assembler
using table schema, not query schema.
This means that a tombstone with bounds (a; b), where a < b in query schema
but a > b in table schema, will not be emitted from mutation_query.
This is a very serious bug, because it means that such tombstones in reverse
queries are not reconciled with data from other replicas.
If *any* queried replica has a row, but not the range tombstone which deleted
the row, the reconciled result will contain the deleted row.
In particular, range deletes performed while a replica is down will not
later be visible to reverse queries which select this replica, regardless of the
consistency level.
As far as I can see, this doesn't result in any persistent data loss.
Only in that some data might appear resurrected to reverse queries,
until the relevant range tombstone is fully repaired.
This series fixes the bug and adds a minimal reproducer test.
Fixes #10598
Closes scylladb/scylladb#16003
* github.com:scylladb/scylladb:
mutation_query_test: test that range tombstones are sent in reverse queries
mutation_query: properly send range tombstones in reverse queries
(cherry picked from commit 65e42e4166)
When scanning our latest docker image using `trivy` (command: `trivy
image docker.io/scylladb/scylla-nightly:latest`), it shows we have OS
packages which are out of date.
Also remove `openssh-server` and `openssh-client`, since we don't use
them in our docker images.
Fixes: https://github.com/scylladb/scylladb/issues/16222
Closes scylladb/scylladb#16224
(cherry picked from commit 7ce6962141)
Closes #16360
The execution loop consumes permits from the _ready_list and executes
them. The _ready_list usually contains a single permit. When the
_ready_list is not empty, new permits are queued until it becomes empty.
The execution loop relies on admission checks, triggered by a read
releasing resources, to bring any queued read into the _ready_list
while it is executing the current read. But in some cases the current
read might not free any resources and thus fail to trigger an admission
check, and the currently queued permits will sit in the queue until
another source triggers an admission check.
I don't yet know how this situation can occur, if at all, but it is
reproducible with a simple unit test, so it is best to cover this
corner case in the off chance it happens in the wild.
Add an explicit admission check to the execution loop, after the
_ready_list is exhausted, to make sure any waiters that can be admitted
with an empty _ready_list are admitted immediately and execution
continues.
Fixes: #13540
Closes #13541
(cherry picked from commit b790f14456)
Currently the cache updaters aren't exception-safe,
yet they are intended to be.
Instead of allowing exceptions from
`external_updater::execute` to escape `row_cache::update`,
abort using `on_fatal_internal_error`.
Future changes should harden all `execute` implementations
to effectively make them `noexcept`, then the pure virtual
definition can be made `noexcept` to cement that.
Fixes scylladb/scylladb#15576
Closes scylladb/scylladb#15577
* github.com:scylladb/scylladb:
row_cache: abort on exteral_updater::execute errors
row_cache: do_update: simplify _prev_snapshot_pos setup
(cherry picked from commit 4a0f16474f)
Closes scylladb/scylladb#16256
Fixes #16153
* java e716e1bd1d...80701efa8d (1):
> NodeProbe: allow addressing table name with colon in it
/home/nyh/scylla/tools$ git submodule summary jmx | cat
* jmx bc4f8ea...f21550e (3):
> ColumnFamilyStore: only quote table names if necessary
> APIBuilder: allow quoted scope names
> ColumnFamilyStore: don't fail if there is a table with ":" in its name
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #16296
This commit fixes the rollback procedure in
the 4.6-to-5.0 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
is fixed.
- The "Gracefully shutdown ScyllaDB" command
is fixed.
In addition, there are the following updates
to be in sync with the tests:
- The "Backup the configuration file" step is
extended to include a command to backup
the packages.
- The Rollback procedure is extended to restore
the backup packages.
- The Reinstallation section is fixed for RHEL.
Refs https://github.com/scylladb/scylladb/issues/11907
This commit must be backported to branch-5.4, branch-5.2, and branch-5.1
Closes scylladb/scylladb#16155
(cherry picked from commit 1e80bdb440)
This commit fixes the rollback procedure in
the 5.0-to-5.1 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
is fixed.
- The "Gracefully shutdown ScyllaDB" command
is fixed.
In addition, there are the following updates
to be in sync with the tests:
- The "Backup the configuration file" step is
extended to include a command to backup
the packages.
- The Rollback procedure is extended to restore
the backup packages.
- The Reinstallation section is fixed for RHEL.
Also, I've removed the rollback section for images,
as it's not correct or relevant.
Refs https://github.com/scylladb/scylladb/issues/11907
This commit must be backported to branch-5.4, branch-5.2, and branch-5.1
Closes scylladb/scylladb#16154
(cherry picked from commit 7ad0b92559)
This commit fixes the rollback procedure in
the 5.1-to-5.2 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
is fixed.
- The "Gracefully shutdown ScyllaDB" command
is fixed.
In addition, there are the following updates
to be in sync with the tests:
- The "Backup the configuration file" step is
extended to include a command to backup
the packages.
- The Rollback procedure is extended to restore
the backup packages.
- The Reinstallation section is fixed for RHEL.
Also, I've removed the rollback section for images,
as it's not correct or relevant.
Refs https://github.com/scylladb/scylladb/issues/11907
This commit must be backported to branch-5.4 and branch-5.2.
Closes scylladb/scylladb#16152
(cherry picked from commit 91cddb606f)
database_test was failing sporadically, and the cause was traced back
to commit e3e7c3c7e5.
The commit forces a subset of tests in database_test to run once
for each of the predefined x_log2_compaction_group settings.
That causes two problems:
1) the test becomes 240% slower in dev mode.
2) queries on system.auth time out, and the reason is a small
   table being spread across hundreds of compaction groups in each
   shard. So to satisfy a range scan, there will be multiple hops,
   making the overhead huge. Additionally, the compaction-group-aware
   sstable set is not merged yet, so even point queries will
   unnecessarily scan through all the groups.
Fixes #13660.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #13851
(cherry picked from commit a7ceb987f5)
Passing the gate_closed_exception to the task promise in start()
ends up as an abandoned exception, since no one is waiting
for it.
Instead, enter the gate when the task is made,
so that make_task fails if the gate is already closed.
Fixes scylladb/scylladb#15211
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
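The pattern above can be sketched with a minimal, synchronous stand-in for the gate (not seastar's actual gate or task API): entering the gate in the task constructor makes creation fail synchronously when the gate is already closed, instead of producing an abandoned exception later in start().

```cpp
#include <cassert>
#include <functional>
#include <stdexcept>
#include <utility>

struct gate_closed_exception : std::runtime_error {
    gate_closed_exception() : std::runtime_error("gate closed") {}
};

// Minimal gate: counts entrants and rejects new ones once closed.
class gate {
    bool _closed = false;
    int _count = 0;
public:
    void enter() {
        if (_closed) throw gate_closed_exception();
        ++_count;
    }
    void leave() { --_count; }
    void close() { _closed = true; }
    int count() const { return _count; }
};

struct task {
    gate& g;
    std::function<void()> fn;
    // Enter the gate when the task is made; construction throws
    // synchronously if the gate is already closed.
    task(gate& g_, std::function<void()> fn_) : g(g_), fn(std::move(fn_)) {
        g.enter();
    }
    ~task() { g.leave(); }
    void start() { fn(); }  // no gate check needed here anymore
};
```

With this shape, a caller that races with shutdown gets the failure at creation time, where someone is still around to observe it.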
(cherry picked from commit f9a7635390)
The copy assignment operator of _ck can throw
after _type and _bound_weight have already been changed.
This leaves position_in_partition in an inconsistent state,
potentially leading to various weird symptoms.
The problem was witnessed by test_exception_safety_of_reads.
Specifically: in cache_flat_mutation_reader::add_to_buffer,
which requires the assignment to _lower_bound to be exception-safe.
The easy fix is to perform the only potentially-throwing step first.
Fixes#15822
Closes scylladb/scylladb#15864
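The general technique can be illustrated with hypothetical simplified types (not Scylla's actual position_in_partition): the member whose assignment may throw is assigned first, so the no-throw members are only touched once nothing can throw anymore.

```cpp
#include <cassert>
#include <new>

// Stand-in for a member whose copy assignment may throw (e.g. an
// allocating clustering key). The throw is controllable for testing.
struct throwing_key {
    int v = 0;
    static inline bool should_throw = false;
    throwing_key& operator=(const throwing_key& o) {
        if (should_throw) throw std::bad_alloc();
        v = o.v;
        return *this;
    }
};

struct position {
    int type = 0;
    int bound_weight = 0;
    throwing_key ck;

    position& operator=(const position& other) {
        // Exception-safe order: the potentially-throwing step runs first,
        // so if it throws, type and bound_weight are still untouched and
        // the object stays consistent.
        ck = other.ck;
        type = other.type;                 // no-throw
        bound_weight = other.bound_weight; // no-throw
        return *this;
    }
};
```

The unsafe variant would assign `type` and `bound_weight` before `ck`; a throw from `ck = other.ck` would then leave the object half-updated.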
(cherry picked from commit 93ea3d41d8)
Update node_exporter to 1.7.0.
The previous version (1.6.1) was flagged by security scanners (such as
Trivy) with HIGH-severity CVE-2023-39325. 1.7.0 release fixed that
problem.
[Botond: regenerate frozen toolchain]
Fixes#16085
Closes scylladb/scylladb#16086
Closes scylladb/scylladb#16090
(cherry picked from commit 321459ec51)
[avi: regenerate frozen toolchain]
* tools/jmx 88d9bdc...bc4f8ea (1):
> Merge "scylla-apiclient: update several Java dependencies" from Piotr Grabowski
* tools/java f8f556d802...e716e1bd1d (1):
> Merge 'build: update several dependencies' from Piotr Grabowski
Update build dependencies which were flagged by security scanners.
Refs: scylladb/scylla-jmx#220
Refs: scylladb/scylla-tools-java#351
Closes#16150
Currently, the API call recalculates only per-node schema version. To
workaround issues like #4485 we want to recalculate per-table
digests. One way to do that is to restart the node, but that's slow
and has impact on availability.
Use like this:
curl -X POST http://127.0.0.1:10000/storage_service/relocal_schema
Fixes#15380
Closes#15381
(cherry picked from commit c27d212f4b)
Currently, when said feature is enabled, we recalculate the schema
digest. But this feature also influences how table versions are
calculated, so it has to trigger a recalculation of all table versions,
so that we can guarantee correct versions.
Before, this used to happen by happy accident. Another feature --
table_digest_insensitive_to_expiry -- used to take care of this, by
triggering a table version recalculation. However, this feature only takes
effect if digest_insensitive_to_expiry is also enabled. This used to be
the case incidentally: by the time the reload triggered by
table_digest_insensitive_to_expiry ran, digest_insensitive_to_expiry was
already enabled. But this was not guaranteed whatsoever, and as we've
recently seen, any change to the feature list which changes the order
in which features are enabled can cause this intricate balance to
break.
This patch makes digest_insensitive_to_expiry also kick off a schema
reload, to eliminate our dependence on (unguaranteed) feature order, and
to guarantee that table schemas have a correct version after all features
are enabled. In fact, all schema feature notification handlers now kick
off a full schema reload, to ensure bugs like this don't creep in, in
the future.
Fixes: #16004
Closes scylladb/scylladb#16013
(cherry picked from commit 22381441b0)
In 0c86abab4d `merge_schema` obtained a new flag, `reload`.
Unfortunately, the flag was assigned a default value, which I think is
almost always a bad idea, and indeed it was in this case. When
`merge_schema` is called on a shard other than 0, it recursively calls
itself on shard 0. That recursive call forgot to pass the `reload` flag.
Fix this.
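The pitfall can be sketched with a hypothetical simplified signature (not the real merge_schema): a recursive cross-shard call that does not forward a defaulted flag silently resets it to the default.

```cpp
#include <cassert>

// Illustrative only: shard 0 does the actual work; other shards delegate
// to shard 0. The counter records how many reloads actually happened.
static int reloads_performed = 0;

void merge_schema(int this_shard, bool reload = false) {
    if (this_shard != 0) {
        // The bug was calling merge_schema(0) here, which silently
        // reset 'reload' to its default (false). The fix forwards it:
        merge_schema(0, reload);
        return;
    }
    if (reload) {
        ++reloads_performed;
    }
}
```

This is why defaulted flags on internally-recursive functions are risky: the compiler cannot tell a forgotten forward from an intentional default.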
(cherry picked from commit 48164e1d09)
Schema digest is calculated by querying for mutations of all schema
tables, then compacting them so that all tombstones in them are
dropped. However, even if the mutation becomes empty after compaction,
we still feed its partition key. If the same mutations were compacted
prior to the query, because the tombstones expire, we won't get any
mutation at all and won't feed the partition key. So schema digest
will change once an empty partition of some schema table is compacted
away.
Tombstones expire 7 days after schema change which introduces them. If
one of the nodes is restarted after that, it will compute a different
table schema digest on boot. This may cause performance problems. When
sending a request from coordinator to replica, the replica needs
schema_ptr of exact schema version request by the coordinator. If it
doesn't know that version, it will request it from the coordinator and
perform a full schema merge. This adds latency to every such request.
Schema versions which are not referenced are currently kept in cache
for only 1 second, so if request flow has low-enough rate, this
situation results in perpetual schema pulls.
After ae8d2a550d (5.2.0), it is more likely to
run into this situation, because table creation generates tombstones
for all schema tables relevant to the table, even the ones which
will be otherwise empty for the new table (e.g. computed_columns).
This change introduces a cluster feature which, when enabled, will change
digest calculation to be insensitive to expiry by ignoring empty
partitions in digest calculation. When the feature is enabled,
schema_ptrs are reloaded so that the window of discrepancy during
transition is short and no rolling restart is required.
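A toy sketch of the feature's effect (string concatenation stands in for the real hash-based digest, and all names below are illustrative): with empty partitions ignored, compacting away an effectively-empty schema-table partition no longer changes the digest.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Illustrative model only: a partition whose content compacts away to
// nothing (all tombstones expired, no live data) should not feed its key
// into the digest when the feature is enabled.
struct partition {
    std::string key;
    bool empty_after_compaction;
};

// The returned string stands in for the digest input; real code would
// feed this into a cryptographic hash.
std::string digest_input(const std::vector<partition>& partitions,
                         bool ignore_empty_partitions) {
    std::string input;
    for (const auto& p : partitions) {
        if (ignore_empty_partitions && p.empty_after_compaction) {
            continue;  // skip the key of an effectively-empty partition
        }
        input += p.key;
        input += ';';
    }
    return input;
}
```

Without the feature, the empty partition's key is fed before compaction but absent afterwards, so the digest changes; with the feature it is skipped in both cases.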
A similar problem was fixed for per-node digest calculation in
c2ba94dc39e4add9db213751295fb17b95e6b962. Per-table digest calculation
was not fixed at that time because we didn't persist enabled features
and they were not enabled early-enough on boot for us to depend on
them in digest calculation. Now they are enabled before non-system
tables are loaded so digest calculation can rely on cluster features.
Fixes#4485.
Manually tested using ccm on cluster upgrade scenarios and node restarts.
Closes#14441
* github.com:scylladb/scylladb:
test: schema_change_test: Verify digests also with TABLE_DIGEST_INSENSITIVE_TO_EXPIRY enabled
schema_mutations, migration_manager: Ignore empty partitions in per-table digest
migration_manager, schema_tables: Implement migration_manager::reload_schema()
schema_tables: Avoid crashing when table selector has only one kind of tables
(cherry picked from commit cf81eef370)
Currently the code will assert because the cl pointer will be null, and it
will be null because there are no mutations to initialize it from.
Message-Id: <20230212144837.2276080-3-gleb@scylladb.com>
(cherry picked from commit 941407b905)
Backport needed by #4485.
Currently, it is started/stopped in the streaming/maintenance sg, which
is what the API itself runs in.
Starting the native transport in the streaming sg will lead to severely
degraded performance, as the streaming sg has significantly less
CPU/disk shares and reader concurrency semaphore resources.
Furthermore, it will lead to multi-paged reads possibly switching
between scheduling groups mid-way, triggering an internal error.
To fix, use `with_scheduling_group()` for both starting and stopping
native transport. Technically, it is only strictly necessary for
starting, but I added it for stop as well for consistency.
Also apply the same treatment to RPC (Thrift). Although no one uses it,
best to fix it, just to be on the safe side.
I think we need a more systematic approach for solving this once and for
all, like passing the scheduling group to the protocol server and having
it switch to it internally. This would allow the server to always run on the
correct scheduling group, without depending on the caller to remember to use
it. However, I think this is best done in a follow-up, to keep this
critical patch small and easily backportable.
Fixes: #15485
Closes scylladb/scylladb#16019
(cherry picked from commit dfd7981fa7)
$ID_LIKE = "rhel" matches only RHEL-compatible OSes, not RHEL
itself.
To detect RHEL correctly, we also need to check $ID = "rhel".
Fixes#16040
Closes scylladb/scylladb#16041
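The corrected check can be sketched as follows (C++ for illustration only; the real check is in a shell script reading /etc/os-release, and the function name here is hypothetical):

```cpp
#include <cassert>
#include <string>

// On RHEL itself, /etc/os-release sets ID="rhel" but ID_LIKE may be absent,
// so matching ID_LIKE alone misses it. Compatible distros (CentOS, Rocky,
// Alma, ...) set ID_LIKE to a list containing "rhel".
bool is_rhel_family(const std::string& id, const std::string& id_like) {
    // Before the fix: only ID_LIKE was consulted.
    // After the fix: ID itself is checked too.
    return id == "rhel" || id_like.find("rhel") != std::string::npos;
}
```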
(cherry picked from commit 338a9492c9)
When a base write triggers an MV write that needs to be sent to another
shard, it used the same service group, and we could end up with a
deadlock.
This fix affects also alternator's secondary indexes.
Testing was done using a not-yet-committed framework for easy alternator
performance testing: https://github.com/scylladb/scylladb/pull/13121.
I've changed hardcoded max_nonlocal_requests config in scylla from 5000 to 500 and
then ran:
./build/release/scylla perf-alternator-workloads --workdir /tmp/scylla-workdir/ --smp 2 \
--developer-mode 1 --alternator-port 8000 --alternator-write-isolation forbid --workload write_gsi \
--duration 60 --ring-delay-ms 0 --skip-wait-for-gossip-to-settle 0 --continue-after-error true --concurrency 2000
Without the patch, when scylla is overloaded (i.e., the number of scheduled futures is close to max_nonlocal_requests), scylla hangs after a couple of seconds:
CPU usage drops to zero and no progress is made. We can confirm we're hitting this issue by seeing under gdb:
p seastar::get_smp_service_groups_semaphore(2,0)._count
$1 = 0
With the patch I wasn't able to observe the problem, even with 2x
concurrency. I was able to make the process hang with 10x concurrency
but I think it's hitting different limit as there wasn't any depleted
smp service group semaphore and it was happening also on non mv loads.
Fixes https://github.com/scylladb/scylladb/issues/15844
Closes scylladb/scylladb#15845
(cherry picked from commit 020a9c931b)
We have observed do_repair_ranges() receiving tens of thousands of
ranges to repair on occasion. do_repair_ranges() repairs all ranges in
parallel, with parallel_for_each(). This is normally fine, as the lambda
inside parallel_for_each() takes a semaphore and this will result in
limited concurrency.
However, in some instances, it is possible that most of these ranges are
skipped. In this case the lambda becomes synchronous, only logging a
message. This can cause stalls because there are no opportunities to
yield. Solve this by adding an explicit yield.
Fixes: #14330
Closes scylladb/scylladb#15879
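A simplified, synchronous illustration of the fix (seastar's real code is future-based, and the reactor stub below is hypothetical): when a range is skipped, the loop body does no asynchronous work, so an explicit yield point is inserted on that path.

```cpp
#include <cassert>
#include <vector>

// Stands in for the reactor; counts how often the loop gave up the CPU.
struct reactor_stub {
    int yields = 0;
    void yield() { ++yields; }  // stand-in for an explicit seastar yield
};

void do_repair_ranges(reactor_stub& r,
                      const std::vector<bool>& range_needs_repair) {
    for (bool needs_repair : range_needs_repair) {
        if (!needs_repair) {
            // Skipped range: purely synchronous path, so yield explicitly
            // to avoid a long stall when most ranges are skipped.
            r.yield();
            continue;
        }
        // Real repair work would naturally yield while doing I/O.
    }
}
```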
(cherry picked from commit 90a8489809)
When looking for a specific UDF/UDA, the result of
`functions::functions::find()` needs to be filtered based on
the function's type.
Fixes: #14360
(cherry picked from commit d498451cdf)
These APIs may return stale or simply incorrect data on shards
other than 0. Newer versions of Scylla are better at maintaining
cross-shard consistency, but we need a simple fix that can be easily and
safely backported to older versions; this is that fix.
Add a simple test to check that the `failure_detector/endpoints`
API returns nonzero generation.
Fixes: scylladb/scylladb#15816
Closes scylladb/scylladb#15970
* github.com:scylladb/scylladb:
test: rest_api: test that generation is nonzero in `failure_detector/endpoints`
api: failure_detector: fix indentation
api: failure_detector: invoke on shard 0
(cherry picked from commit 9443253f3d)
This `with` context is supposed to disable, then re-enable
autocompaction for the given keyspaces, but it used the wrong API:
the column_family/autocompaction API, which operates on
column families, not keyspaces. This oversight led to a silent failure
because the code didn't check the result of the request.
Both are fixed in this patch:
* switch to use `storage_service/auto_compaction/{keyspace}` endpoint
* check the result of the API calls and report errors as exceptions
Fixes: #13553
Closes#13568
(cherry picked from commit 66ee73641e)
Before integration with task manager the state of one shard repair
was kept in repair_info. repair_info object was destroyed immediately
after shard repair was finished.
During the integration, repair_info's fields were moved to
shard_repair_task_impl, as the two served similar purposes.
However, shard_repair_task_impl isn't destroyed immediately, but is
kept in the task manager for task_ttl seconds after it completes.
Thus, some of repair_info's fields have their lifetime prolonged,
which delays the repair state change.
Release shard_repair_task_impl resources immediately after shard
repair is finished.
Fixes: #15505.
(cherry picked from commit 0474e150a9)
Closes#15875
scylla-sstable currently has two ways to obtain the schema:
* via a `schema.cql` file.
* load schema definition from memory (only works for system tables).
This meant that for most cases it was necessary to export the schema into a CQL format and write it to a file. This is very flexible: the sstable can be inspected anywhere, it doesn't have to be on the same host where it originates from. Yet in many cases the sstable is inspected on the same host where it originates from. In these cases, the schema is readily available in the schema tables on disk, and it is plain annoying to have to export it into a file just to quickly inspect an sstable file.
This series solves this annoyance by providing a mechanism to load schemas from the on-disk schema tables. Furthermore, an auto-detect mechanism is provided to detect the location of these schema tables based on the path of the sstable, but if that fails, the tool checks the usual locations of the scylla data dir, the scylla configuration file, and even looks for environment variables that tell the location of these. The old methods are still supported. In fact, if a schema.cql is present in the working directory of the tool, it is preferred over any other method, allowing for an easy force-override.
If the auto-detection magic fails, an error is printed to the console, advising the user to turn on debug level logging to see what went wrong.
A comprehensive test is added which checks all the different schema loading mechanisms. The documentation is also updated to reflect the changes.
This change breaks the backward-compatibility of the command-line API of the tool, as `--system-schema` is now just a flag; the keyspace and table names are supplied separately via the new `--keyspace` and `--table` options. I don't think this will break anybody's workflow, as this tool is still lightly used, exactly because of the annoying way the schema has to be provided. Hopefully, after this series, this will change.
Example:
```
$ ./build/dev/scylla sstable dump-data /var/lib/scylla/data/ks/tbl2-d55ba230b9a811ed9ae8495671e9e4f8/quarantine/me-1-big-Data.db
{"sstables":{"/var/lib/scylla/data/ks/tbl2-d55ba230b9a811ed9ae8495671e9e4f8/quarantine//me-1-big-Data.db":[{"key":{"token":"-3485513579396041028","raw":"000400000000","value":"0"},"clustering_elements":[{"type":"clustering-row","key":{"raw":"","value":""},"marker":{"timestamp":1677837047297728},"columns":{"v":{"is_live":true,"type":"regular","timestamp":1677837047297728,"value":"0"}}}]}]}}
```
As seen above, subdirectories like quarantine, staging, etc. are also supported.
Fixes: https://github.com/scylladb/scylladb/issues/10126
Closes#13448
* github.com:scylladb/scylladb:
test/cql-pytest: test_tools.py: add tests for schema loading
test/cql-pytest: add no_autocompaction_context
docs: scylla-sstable.rst: remove accidentally added copy-pasta
docs: scylla-sstable.rst: remove paragraph with schema limitations
docs: scylla-sstable.rst: update schema section
test/cql-pytest: nodetool.py: add flush_keyspace()
tools/scylla-sstable: reform schema loading mechanism
tools/schema_loader: add load_schema_from_schema_tables()
db/schema_tables: expose types schema
(cherry picked from commit 952b455310)
Closes#15386
This is a backport of https://github.com/scylladb/scylladb/pull/14158 to branch 5.2
Closes#15872
* github.com:scylladb/scylladb:
migration_notifier: get schema_ptr by value
migration_manager: propagate listener notification exceptions
storage_service: keyspace_changed: execute only on shard 0
database: modify_keyspace_on_all_shards: execute func first on shard 0
database: modify_keyspace_on_all_shards: call notifiers only after applying func on all shards
database: add modify_keyspace_on_all_shards
schema_tables: merge_keyspaces: extract_scylla_specific_keyspace_info for update_keyspace
database: create_keyspace_on_all_shards
database: update_keyspace_on_all_shards
database: drop_keyspace_on_all_shards
Off-strategy suffers from a 100% space overhead, as it adopted
a sort of all-or-nothing approach: all input sstables,
living in the maintenance set, are kept alive until they're all
reshaped according to the strategy criteria.
Input sstables in off-strategy are very likely to be mostly disjoint,
so it can greatly benefit from incremental compaction.
The incremental compaction approach is not only good for
decreasing disk usage, but also memory usage (as metadata of
input and output live in memory) and file descriptor count, which
takes memory away from the OS.
Turns out that this approach also greatly simplifies the
off-strategy impl in compaction manager, as it no longer has
to maintain new unused sstables and mark them for
deletion on failure, or unlink intermediary sstables
used between reshape rounds.
Fixes https://github.com/scylladb/scylladb/issues/14992.
Backport notes: relatively easy to backport, had to include
**replica: Make compaction_group responsible for deleting off-strategy compaction input**
and
**compaction/leveled_compaction_strategy: ideal_level_for_input: special case max_sstable_size==0**
Closes#15793
* github.com:scylladb/scylladb:
test: Verify that off-strategy can do incremental compaction
compaction/leveled_compaction_strategy: ideal_level_for_input: special case max_sstable_size==0
compaction: Clear pending_replacement list when tombstone GC is disabled
compaction: Enable incremental compaction on off-strategy
compaction: Extend reshape type to allow for incremental compaction
compaction: Move reshape_compaction in the source
compaction: Enable incremental compaction only if replacer callback is engaged
replica: Make compaction_group responsible for deleting off-strategy compaction input
To prevent use-after-free as seen in
https://github.com/scylladb/scylladb/issues/15097
where a temp schema_ptr retrieved from a global_schema_ptr
gets destroyed when the notification function yields.
Capturing the schema_ptr on the coroutine frame
is inexpensive since it's a shared pointer, and it makes sure
that the schema remains valid throughout the coroutine
lifetime.
Fixes scylladb/scylladb#15097
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #15098
(cherry picked from commit 0f54e24519)
1e29b07e40 claimed
to make event notification exception safe,
but swallowing the exceptions isn't safe at all,
as this might leave the node in an inconsistent state
if e.g. storage_service::keyspace_changed fails on any of the
shards. Propagating the exception here will cause abort,
but it is better than leaving the node up, but in an
inconsistent state.
We keep notifying other listeners even if any of them failed
Based on 1e29b07e40:
```
If one of the listeners throws an exception, we must ensure that other
listeners are still notified.
```
The decision about swallowing exceptions can't be
made in such a generic layer.
Specific notification listeners that may ignore exceptions,
like in transport/event_notifier, may decide to swallow their
local exceptions on their own (as done in this patch).
Refs #3389
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 825d617a53)
Previously all shards called `update_topology_change_info`
which in turn calls `mutate_token_metadata`, ending up
in quadratic complexity.
Now that the notifications are called after
all database shards are updated, we can apply
the changes on token metadata / effective replication map
only on shard 0 and count on replicate_to_all_cores to
propagate those changes to all other shards.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit a690f0e81f)
When creating or altering a keyspace, we create a new
effective_replication_map instance.
It is more efficient to do that first on shard 0
and then on all other shards, otherwise multiple
shards might need to calculate the new e_r_m (and reach
the same result). When the new e_r_m is "seeded" on
shard 0, other shards will find it there and clone
a local copy of it - which is more efficient.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 13dd92e618)
When creating, updating, or dropping keyspaces,
first execute the database internal function to
modify the database state, and only when all shards
are updated, run the listener notifications,
to make sure they would operate when the database
shards are consistent with each other.
Fixes #13137
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit ba15786059)
Run all keyspace create/update/drop ops
via `modify_keyspace_on_all_shards` that
will standardize the execution on all shards
in the coming patches.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 3b8c913e61)
Similar to create_keyspace_on_all_shards,
`extract_scylla_specific_keyspace_info` and
`create_keyspace_from_schema_partition` can be called
once in the upper layer, passing keyspace_metadata&
down to database::update_keyspace_on_all_shards
which now would only make the per-shard
keyspace_metadata from the reference it gets
from the schema_tables layer.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit dc9b0812e9)
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 3520c786bd)
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 53a6ea8616)
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 9d40305ef6)
Before this change, `checksummed_file_data_sink_impl` just inherited
`data_sink_impl::flush()` from its parent class. But as a wrapper around
the underlying `_out` data_sink, this is not only an unusual design
decision in a layered I/O system, but could also be
problematic. To be more specific, the typical user of `data_sink_impl`
is a `data_sink`, whose `flush()` member function is called when
the user of `data_sink` wants to ensure that the data sent to the sink
is pushed to the underlying storage / channel.
This in general works, as the typical user of `data_sink` is in turn
`output_stream`, which calls `data_sink.flush()` before closing the
`data_sink` with `data_sink.close()`, and the operating system will
eventually flush the data after the application closes the corresponding
fd. To be more specific, almost none of the popular local filesystems
implement the file_operations.op, hence it's safe even if the
`output_stream` does not flush the underlying data_sink after writing
to it. This is the use case when we write to sstables stored on a local
filesystem. But as explained above, if the data_sink is backed by a
network filesystem, a layered filesystem, or storage connected via
a buffered network device, then it is crucial to flush in a timely
manner; otherwise we risk data loss if the application / machine /
network breaks when the data is considered persisted but it is
_not_!
But the `data_sink` returned by `client::make_upload_jumbo_sink` is
a little bit different. Multipart upload is used under the hood, and
we have to finalize the upload once all the parts are uploaded by
calling `close()`. But if the caller fails / chooses to close the
sink before flushing it, the upload is aborted, and the partially
uploaded parts are deleted.
The default-implemented `checksummed_file_data_sink_impl::flush()`
breaks `upload_jumbo_sink`, which is the `_out` data_sink being
wrapped by `checksummed_file_data_sink_impl`. As the `flush()`
calls are short-circuited by the wrapper, the `close()` call
always aborts the upload. That's why the data and index components
just fail to upload with the S3 backend.
In this change, we just delegate the `flush()` call to the
wrapped class.
Fixes#15079
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#15134
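The bug and its fix can be sketched with simplified synchronous types (the real code uses seastar's data_sink_impl and futures; the names below are illustrative): the checksumming wrapper must forward flush() to the sink it wraps, otherwise an upload sink whose close() aborts unflushed uploads never sees the flush.

```cpp
#include <cassert>

struct data_sink {
    virtual ~data_sink() = default;
    virtual void put(char c) = 0;
    virtual void flush() {}  // default no-op, like the inherited base impl
    virtual void close() = 0;
};

// Stand-in for the multipart-upload sink: close() without a prior flush()
// aborts the upload and deletes the parts.
struct upload_sink : data_sink {
    bool flushed = false;
    bool aborted = false;
    void put(char) override {}
    void flush() override { flushed = true; }  // finalizes the upload
    void close() override { if (!flushed) aborted = true; }
};

struct checksummed_sink : data_sink {
    data_sink& out;
    unsigned checksum = 0;
    explicit checksummed_sink(data_sink& o) : out(o) {}
    void put(char c) override {
        checksum += static_cast<unsigned char>(c);
        out.put(c);
    }
    // The fix: delegate flush() instead of inheriting the no-op default,
    // so the wrapped sink is not short-circuited.
    void flush() override { out.flush(); }
    void close() override { out.close(); }
};
```

With the inherited no-op flush(), `up.flushed` would stay false and close() would abort the upload, which is exactly the observed S3 failure.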
(cherry picked from commit d2d1141188)
The grammar mistakenly allows nothing to be parsed as an
intValue (itself accepted in LIMIT and similar clauses).
Easily fixed by removing the empty alternative. A unit test is
added.
Fixes#14705.
Closes#14707
(cherry picked from commit e00811caac)
Prevent div-by-zero by returning a constant level 1
if max_sstable_size is zero, as configured by
cleanup_incremental_compaction_test, before it's
extended to also cover off-strategy compaction.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
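A hedged sketch of the special case (the level-sizing math below is illustrative, not LCS's exact formula; the function name mirrors the commit title):

```cpp
#include <cassert>
#include <cstdint>

// With max_sstable_size configured to 0 (as in the cleanup test), the
// level computation would divide by zero, so a constant level 1 is
// returned instead.
int ideal_level_for_input(uint64_t input_size, uint64_t max_sstable_size) {
    if (max_sstable_size == 0) {
        return 1;  // special case: avoid div-by-zero
    }
    int level = 1;
    uint64_t level_capacity = 10 * max_sstable_size;  // L1 ~ 10 sstables
    while (input_size > level_capacity) {
        ++level;
        level_capacity *= 10;  // each level is ~10x the previous
    }
    return level;
}
```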
(cherry picked from commit b1e164a241)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
pending_replacement list is used by incremental compaction to
communicate to other ongoing compactions about exhausted sstables
that must be replaced in the sstable set they keep for tombstone
GC purposes.
Reshape doesn't enable tombstone GC, so that list will not
be cleared, which prevents incremental compaction from releasing
sstables referenced by that list. It wasn't a problem until now,
when we want reshape to do incremental compaction.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Off-strategy suffers from a 100% space overhead, as it adopted
a sort of all-or-nothing approach: all input sstables,
living in the maintenance set, are kept alive until they're all
reshaped according to the strategy criteria.
Input sstables in off-strategy are very likely to be mostly disjoint,
so it can greatly benefit from incremental compaction.
The incremental compaction approach is not only good for
decreasing disk usage, but also memory usage (as metadata of
input and output live in memory) and file descriptor count, which
takes memory away from the OS.
Turns out that this approach also greatly simplifies the
off-strategy impl in compaction manager, as it no longer has
to maintain new unused sstables and mark them for
deletion on failure, or unlink intermediary sstables
used between reshape rounds.
Fixes#14992.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 42050f13a0)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
That's done by inheriting regular_compaction, which implements
incremental compaction. But reshape still implements its own
methods for creating writer and reader. One reason is that
reshape is not driven by controller, as input sstables to it
live in maintenance set. Another reason is customization
of things like sstable origin, etc.
stop_sstable_writer() is extended because that's used by
regular_compaction to check for possibility of removing
exhausted sstables earlier whenever an output sstable
is sealed.
Also, incremental compaction will be unconditionally
enabled for ICS/LCS during off-strategy.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit db9ce9f35a)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
That's in preparation to next change that will make reshape
inherit from regular compaction.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
That's needed for enabling incremental compaction to operate, and
needed for subsequent work that enables incremental compaction
for off-strategy, which in turn uses reshape compaction type.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Compaction group is responsible for deleting SSTables of "in-strategy"
compactions, i.e. regular, major, cleanup, etc.
Both in-strategy and off-strategy compaction have their completion
handled using the same compaction group interface, which is
compaction_group::table_state::on_compaction_completion(...,
sstables::offstrategy offstrategy)
So it's important to bring symmetry there, by moving the responsibility
of deleting off-strategy input from the manager to the group.
Another important advantage is that off-strategy deletion is now throttled
and gated, allowing for better control, e.g. table waiting for deletion
on shutdown.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#13432
(cherry picked from commit 457c772c9c)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Commit 8c4b5e4 introduced an optimization which only
calculates the max purgeable timestamp when a tombstone satisfies the
grace period.
Commit 'repair: Get rid of the gc_grace_seconds' inverted the order,
probably under the assumption that getting grace period can be
more expensive than calculating max purgeable, as repair-mode GC
will look up into history data in order to calculate gc_before.
This caused a significant regression on tombstone heavy compactions,
where most of tombstones are still newer than grace period.
A compaction which used to take 5s now takes 35s, i.e., 7x slower.
The reason is simple, now calculation of max purgeable happens
for every single tombstone (once for each key), even the ones that
cannot be GC'ed yet. And each calculation has to iterate through
(i.e. check the bloom filter of) every single sstable that doesn't
participate in compaction.
Flame graph makes it very clear that bloom filter is a heavy path
without the optimization:
45.64% 45.64% sstable_compact sstable_compaction_test_g
[.] utils::filter::bloom_filter::is_present
With its resurrection, the problem is gone.
This scenario can easily happen, e.g. after a deletion burst, and
tombstones becoming only GC'able after they reach upper tiers in
the LSM tree.
Before this patch, a compaction can be estimated to have this # of
filter checks:
(# of keys containing *any* tombstone) * (# of uncompacting sstable
runs[1])
[1] It's # of *runs*, as each key tend to overlap with only one
fragment of each run.
After this patch, the estimation becomes:
(# of keys containing a GC'able tombstone) * (# of uncompacting
runs).
With repair mode for tombstone GC, the assumption, that retrieval
of gc_before is more expensive than calculating max purgeable,
is kept. We can revisit it later. But in the default mode, which
is the "timeout" (i.e. gc_grace_seconds) one, we still benefit
from the optimization of deferring the calculation until
needed.
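The restored ordering can be sketched as follows (heavily simplified from the real compaction code; the types and the lookup counter are illustrative): the cheap grace-period check runs first, and the expensive max-purgeable lookup, which probes the bloom filters of all non-compacting sstables, only runs for tombstones already past gc_before.

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

struct gc_state {
    int64_t gc_before;
    int expensive_lookups = 0;  // counts max-purgeable computations

    int64_t max_purgeable(/* partition key elided */) {
        ++expensive_lookups;    // stands in for the bloom-filter probes
        return std::numeric_limits<int64_t>::max();
    }

    bool can_gc(int64_t tombstone_deletion_time, int64_t tombstone_timestamp) {
        if (tombstone_deletion_time >= gc_before) {
            return false;       // still within grace period: no lookup at all
        }
        return tombstone_timestamp < max_purgeable();
    }
};
```

With the inverted order, every tombstone would pay for the lookup even when the grace-period check alone would have rejected it, which is the reported 7x regression.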
Cherry picked from commit 38b226f997
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Fixes#14091.
Closes#13908
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#15744
This is a backport of PR https://github.com/scylladb/scylladb/pull/15740.
This commit removes the information about the recommended way of upgrading ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade procedure is not supported (it was implemented, but then reverted).
The scope of this commit:
- Remove the information from the 5.0-to-5.1 upgrade guide and replace it with general info.
- Remove the information from the 4.6-to-5.0 upgrade guide and replace it with general info.
- Remove the information from the 5.x.y-to-5.x.z upgrade guide and replace it with general info.
- Remove the following files as no longer necessary (they were only created to incorporate the (invalid) information about image upgrade into the upgrade guides):
/upgrade/_common/upgrade-image-opensource.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst
Closes #15768
* github.com:scylladb/scylladb:
doc: remove wrong image upgrade info (5.x.y-to-5.x.y)
doc: remove wrong image upgrade info (4.6-to-5.0)
doc: remove wrong image upgrade info (5.0-to-5.1)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 5.x.y-to-5.x.y upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).
Refs https://github.com/scylladb/scylladb/issues/15733
In addition, the following files are removed as no longer
necessary (they were only created to incorporate the (invalid)
information about image upgrade into the upgrade guides):
/upgrade/_common/upgrade-image-opensource.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst
(cherry picked from commit dd1207cabb)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 4.6-to-5.0 upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).
Refs https://github.com/scylladb/scylladb/issues/15733
(cherry picked from commit 526d543b95)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 5.0-to-5.1 upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).
Refs https://github.com/scylladb/scylladb/issues/15733
(cherry picked from commit 9852130c5b)
The estimated_partitions value is computed after the repair_meta is created.
Currently, the default estimated_partitions was used to create the
writer, which is not correct.
To fix, use the updated estimated_partitions.
Reported by Petr Gusev
Closes #14179
Fixes #15748
(cherry picked from commit 4592bbe182)
This commit removes the information about
the recommended way of upgrading ScyllaDB
images - by updating ScyllaDB and OS packages
in one step.
This upgrade procedure is not supported
(it was implemented, but then reverted).
The scope of this commit:
- Remove the information from the 5.1-to-5.2
upgrade guide and replace it with general info.
- Remove the information from the Image Upgrade
page.
- Remove outdated info (about previous releases)
from the Image Upgrade page.
- Rename "AMI Upgrade" as "Image Upgrade"
in the page tree.
Refs: https://github.com/scylladb/scylladb/issues/15733
(cherry picked from commit f6767f6d6e)
Closes #15754
Scylla can crash due to a complicated interaction between service level drop,
evictable readers, and the inactive read registration path:
1) a service level drop invokes stop on the reader concurrency semaphore,
which will wait for in-flight requests.
2) it turns out it first stops the gate used for closing readers that
become inactive.
3) it then proceeds to wait for in-flight reads by closing the reader permit gate.
4) one of the evictable reads takes the inactive read registration path, and
finds the gate for closing readers already closed.
5) the flat mutation reader is destroyed, but finds that the underlying reader
was not closed gracefully, and triggers the abort.
By closing the permit gate first, evictable readers becoming inactive are
able to properly close the underlying reader, thereby avoiding the
crash.
Fixes #15534.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes scylladb/scylladb#15535
(cherry picked from commit 914cbc11cf)
Modeled after get_live_members_synchronized,
get_unreachable_members_synchronized calls
replicate_live_endpoints_on_change to synchronize
the state of unreachable_members on all shards.
Fixes #12261
Fixes #15088
Also, add rest_api unit test for those apis
Closes #15093
* github.com:scylladb/scylladb:
test: rest_api: add test_gossiper
gossiper: add get_unreachable_members_synchronized
(cherry picked from commit 57deeb5d39)
Backport note: `gossiper::lock_endpoint_update_semaphore` helper
function was missing, replaced with
`get_units(g._endpoint_update_semaphore, 1)`
It is possible that a gossip message from an old node is delivered
out of order during a slow boot and the raft address map overwrites
a new IP address with an obsolete one, from the previous incarnation
of this node. Take into account the node restart counter when updating
the address map.
A test case requires a parameterized error injection, which
we don't support yet. Will be added as a separate commit.
Fixes #14257
Refs #14357
Closes #14329
(cherry picked from commit b9c2b326bc)
Backport note: replaced `gms::generation_type` with `int64_t` because
the branch is missing the refactor which introduced `generation_type`
(7f04d8231d)
Currently, when creating a table, permissions may be mistakenly
granted to the user even if the table already exists. This
can happen in two cases:
The query has an IF NOT EXISTS clause - as a result, no exception
is thrown after encountering the existing table, and the permission
granting is not prevented.
The query is handled by a non-zero shard - as a result, we accept
the query with a bounce_to_shard result_message, again without
preventing the granting of permissions.
These two cases are now avoided by checking the result_message
generated when handling the query - now we only grant permissions
when the query resulted in a schema_change message.
Additionally, a test is added that reproduces both of the mentioned
cases.
CVE-2023-33972
Fixes #15467.
* 'no-grant-on-no-create' of github.com:scylladb/scylladb-ghsa-ww5v-p45p-3vhq:
auth: do not grant permissions to creator without actually creating
transport: add is_schema_change() method to result_message
(cherry picked from commit ab6988c52f)
Fixes https://github.com/scylladb/scylladb/issues/14490
This commit fixes multiple links that were broken
after the documentation was published (but not in
the preview) due to incorrect syntax.
I've fixed the syntax to use the :docs: and :ref:
directives for pages and sections, respectively.
Closes #14664
(cherry picked from commit a93fd2b162)
This commit adds the information that ScyllaDB Enterprise
supports FIPS-compliant systems in versions
2023.1.1 and later.
The information is excluded from OSS docs with
the "only" directive, because the support was not
added in OSS.
This commit must be backported to branch-5.2 so that
it appears on version 2023.1 in the Enterprise docs.
Closes #15415
(cherry picked from commit fb635dccaa)
Today, we base compaction throughput on the amount of data written,
but it should be based on the amount of input data compacted
instead, to show the amount of data the compaction had to process
during its execution.
A good example is a compaction which expires 99% of its data;
today the throughput would be calculated on the 1% written, which
misleads the reader into thinking that the compaction was terribly
slow.
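The distortion is easy to quantify. A minimal, language-neutral sketch (Python for illustration; Scylla's accounting is in C++, and all names here are made up):

```python
# Illustrative: throughput of a compaction that expires 99% of its input.
def throughput(num_bytes, seconds):
    return num_bytes / seconds

input_bytes = 100 * 1024**3          # 100 GiB read by the compaction
written_bytes = input_bytes // 100   # only 1% survives expiration
duration_s = 1000

misleading = throughput(written_bytes, duration_s)  # based on data written
accurate = throughput(input_bytes, duration_s)      # based on input processed
# the written-based figure under-reports the work done by 100x
```

With these assumed numbers the written-based throughput is 100x lower than the input-based one, even though the compaction processed the full 100 GiB.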
Fixes #14533.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #14615
(cherry picked from commit 3b1829f0d8)
We allow inserting column values using a JSON value, e.g.:
```cql
INSERT INTO mytable JSON '{ "\"myKey\"": 0, "value": 0}';
```
When no JSON value is specified, the query should be rejected.
Scylla used to crash in such cases. A recent change fixed the crash
(https://github.com/scylladb/scylladb/pull/14706), it now fails
on unwrapping an uninitialized value, but really it should
be rejected at the parsing stage, so let's fix the grammar so that
it doesn't allow JSON queries without JSON values.
A unit test is added to prevent regressions.
Refs: https://github.com/scylladb/scylladb/pull/14707
Fixes: https://github.com/scylladb/scylladb/issues/14709
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Closes #14785
(cherry picked from commit cbc97b41d4)
The Alternator test test_ttl.py::test_ttl_expiration_gsi_lsi was flaky.
The test incorrectly assumes that when we write an already-expired item,
it will be visible for a short time until being deleted by the TTL thread.
But this doesn't need to be true - if the test is slow enough, it may go
look for the item after it was already expired!
So we fix this test by splitting it into two parts - in the first part
we write a non-expiring item, and notice it eventually appears in the
GSI, LSI, and base-table. Then we write the same item again, with an
expiration time - and now it should eventually disappear from the GSI,
LSI and base-table.
This patch also fixes a small bug which prevented this test from running
on DynamoDB.
Fixes #14495
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #14496
(cherry picked from commit 599636b307)
Permits added to `_ready_list` remain there until
executed by `execution_loop()`.
But `execution_loop()` exits when `_stopped == true`,
even though nothing prevents new permits from being added
to `_ready_list` after `stop()` sets `_stopped = true`.
Thus, if there are reads concurrent with `stop()`,
it's possible for a permit to be added to `_ready_list`
after `execution_loop()` has already quit. Such a permit will
never be destroyed, and `stop()` will forever block on
`_permit_gate.close()`.
A natural solution is to dismiss `execution_loop()` only after
it's certain that `_ready_list` won't receive any new permits.
This is guaranteed by `_permit_gate.close()`. After this call completes,
it is certain that no permits *exist*.
After this patch, `execution_loop()` no longer looks at `_stopped`.
It only exits when `_ready_list_cv` breaks, and this is triggered
by `stop()` right after `_permit_gate.close()`.
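The fixed shutdown ordering can be simulated sequentially. This is a minimal Python sketch of the ordering argument, not Scylla's C++ code; the class and member names mirror the message but are otherwise made up:

```python
# Single-threaded sketch of the fixed shutdown order.
class Executor:
    def __init__(self):
        self.ready_list = []
        self.gate_closed = False
        self.cv_broken = False
        self.executed = []

    def add_permit(self, permit):
        # after the gate is closed, no new permit can reach ready_list
        assert not self.gate_closed, "gate closed: no new permits"
        self.ready_list.append(permit)

    def execution_loop_step(self):
        # drain everything queued; the loop exits only once the cv is broken,
        # never because of a separate _stopped flag
        while self.ready_list:
            self.executed.append(self.ready_list.pop(0))
        return self.cv_broken  # True => loop exits

    def stop(self):
        # 1) close the gate: after this, no new permits can be added
        self.gate_closed = True
        # 2) only now break the cv, so the loop drains and exits
        self.cv_broken = True

ex = Executor()
ex.add_permit("read-1")
ex.stop()
assert ex.execution_loop_step()   # loop drains and exits
assert ex.executed == ["read-1"]  # no permit is leaked on shutdown
```

Because the cv is broken only after the gate close completes, every permit that ever entered `ready_list` is drained before the loop exits.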
Fixes #15198
Closes #15199
(cherry picked from commit 2000a09859)
Call replicate_live_endpoints on shard 0 to copy from 0 to the rest of
the shards. And get the list of live members from shard 0.
Move lock to the callers.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes #13240
(cherry picked from commit da00052ad8)
Add an API call to wait for all shards to reach the current shard 0
gossiper version. Throws when timeout is reached.
Closes #12540
* github.com:scylladb/scylladb:
api: gossiper: fix alive nodes
gms, service: lock live endpoint copy
gms, service: live endpoint copy method
(cherry picked from commit b919373cce)
when the local_deletion_time is too large and beyond the
epoch time of INT32_MAX, we cap it to INT32_MAX - 1.
this is a signal of bad configuration or a bug in scylla,
so let's add more information to the logging message to
help track down the source of the problem.
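The capping amounts to a clamp plus a descriptive log line. A minimal Python sketch (Scylla's implementation is C++; the function name and message text here are made up):

```python
INT32_MAX = 2**31 - 1

def cap_local_deletion_time(ldt, log=print):
    # clamp an out-of-range deletion time and log enough context
    # to trace the bad configuration or bug that produced it
    if ldt >= INT32_MAX:
        log(f"local_deletion_time {ldt} exceeds INT32_MAX; "
            f"capping to {INT32_MAX - 1}")
        return INT32_MAX - 1
    return ldt

assert cap_local_deletion_time(2**31 + 5, log=lambda m: None) == INT32_MAX - 1
assert cap_local_deletion_time(1_700_000_000) == 1_700_000_000
```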
Fixes #15015
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 9c24be05c3)
Closes #15150
Secondary index creation is asynchronous, meaning it
takes time for existing data to be reflected within
the index. However, new data added after the
index is created should appear in it immediately.
The test consisted of two parts. The first created
a series of indexes for one table, added
test data to the table, and then ran a series of checks.
In the second part, several new indexes were added to
the same table, and checks were made to make sure that
already existing data would appear in them. This
last part was flaky.
The patch just moves the index creation statements
from the second part to the first.
Fixes: #14076
Closes #14090
(cherry picked from commit 0415ac3d5f)
Closes #15101
This mini-series backports the fix for #12010 along with low-risk patches it depends on.
Fixes: #12010
Closes #15137
* github.com:scylladb/scylladb:
distributed_loader: process_sstable_dir: do not verify snapshots
utils/directories: verify_owner_and_mode: add recursive flag
utils: Restore indentation after previous patch
utils: Coroutinize verify_owner_and_mode()
Skip over verification of owner and mode of the snapshots
sub-directory as this might race with scylla-manager
trying to delete old snapshots concurrently.
Fixes #12010
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 845b6f901b)
Allow the caller to verify only the top level directories
so that sub-directories can be verified selectively
(in particular, skip validation of snapshots).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 60862c63dd)
There's a helper verification_error() that prints a warning and returns
an exceptional future. It is converted into a void one that throws.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 4ebb812df0)
The loop in shard_reshaping_compaction_task_impl::run relies on whether
sstables::compaction_stopped_exception is thrown from run_custom_job.
The exception is swallowed for each type of compaction
in compaction_manager::perform_task.
Rethrow the exception in perform_task for reshape compaction.
Fixes: #15058.
(cherry picked from commit e0ce711e4f)
Closes #15122
This argument has been dead since its introduction, and 'discard' was
always configured regardless of its value.
This patch allows actually configuring things using this argument.
Fixes #14963
Closes #14964
(cherry picked from commit e13a2b687d)
While a user-requested repair is performed, some tables
may be dropped. When the repair proceeds to these tables,
it should skip them and continue with the others.
When no_such_column_family is thrown during a user-requested
repair, it is logged and swallowed. Then the repair continues with
the remaining tables.
Fixes: #13045
Closes #13068
* github.com:scylladb/scylladb:
repair: fix indentation
repair: continue user requested repair if no_such_column_family is thrown
repair: add find_column_family_if_exists function
(cherry picked from commit 9859bae54f)
We have had support for COUNTER columns for quite some time now, but some functionality was left unimplemented - various internal and CQL functions resulted in "unimplemented" messages when used, and the goal of this series is to fix those issues. The primary goal was to add the missing support for CASTing counters to other types in CQL (issue #14501), but we also add the missing CQL `counterasblob()` and `blobascounter()` functions (issue #14742).
As usual, the series includes extensive functional tests for these features, and one pre-existing test for CAST that used to fail now begins to pass.
Fixes #14501
Fixes #14742
Closes #14745
* github.com:scylladb/scylladb:
test/cql-pytest: test confirming that casting to counter doesn't work
cql: support casting of counter to other types
cql: implement missing counterasblob() and blobascounter() functions
cql: implement missing type functions for "counters" type
(cherry picked from commit a637ddd09c)
This is a translation of Cassandra's CQL unit test source file
validation/operations/CompactStorageTest.java into our cql-pytest
framework.
This very large test file includes 86 tests for various types of
operations and corner cases of WITH COMPACT STORAGE tables.
All 86 tests pass on Cassandra (except one using a deprecated feature
that needs to be specially enabled). 30 of the tests fail on Scylla
reproducing 7 already-known Scylla issues and 7 previously-unknown issues:
Already known issues:
Refs #3882: Support "ALTER TABLE DROP COMPACT STORAGE"
Refs #4244: Add support for mixing token, multi- and single-column
restrictions
Refs #5361: LIMIT doesn't work when using GROUP BY
Refs #5362: LIMIT is not doing it right when using GROUP BY
Refs #5363: PER PARTITION LIMIT doesn't work right when using GROUP BY
Refs #7735: CQL parser missing support for Cassandra 3.10's new "+=" syntax
Refs #8627: Cleanly reject updates with indexed values where value > 64k
New issues:
Refs #12471: Range deletions on COMPACT STORAGE is not supported
Refs #12474: DELETE prints misleading error message suggesting
ALLOW FILTERING would work
Refs #12477: Combination of COUNT with GROUP BY is different from
Cassandra in case of no matches
Refs #12479: SELECT DISTINCT should refuse GROUP BY with clustering column
Refs #12526: Support filtering on COMPACT tables
Refs #12749: Unsupported empty clustering key in COMPACT table
Refs #12815: Hidden column "value" in compact table isn't completely hidden
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #12816
(cherry picked from commit 328cdb2124)
This is a translation of Cassandra's CQL unit test source file
functions/CastFctsTest.java into our cql-pytest framework.
There are 13 tests, 9 of them currently xfail.
The failures are caused by one recently-discovered issue:
Refs #14501: Cannot Cast Counter To Double
and by three previously unknown or undocumented issues:
Refs #14508: SELECT CAST column names should match Cassandra's
Refs #14518: CAST from timestamp to string not same as Cassandra on zero
milliseconds
Refs #14522: Support CAST function not only in SELECT
Curiously, the careful translation of this test also caused me to
find a bug in Cassandra https://issues.apache.org/jira/browse/CASSANDRA-18647
which the test in Java missed because it made the same mistake as the
implementation.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #14528
(cherry picked from commit f08bc83cb2)
This patch adds tests to reproduce issue #13551. The issue, discovered
by a dtest (cql_cast_test.py), claimed that either cast() or sum(cast())
from varint type broke. So we add two tests in cql-pytest:
1. A new test file, test_cast_data.py, for testing data casts (a
CAST (...) as ... in a SELECT), starting with testing casts from
varint to other types.
The test uncovers a lot of interesting cases (it is heavily
commented to explain these cases) but nothing there is wrong
and all tests pass on Scylla.
2. An xfailing test for sum() aggregate of +Inf and -Inf. It turns out
that this caused #13551. In Cassandra and older Scylla, the sum
returned a NaN. In Scylla today, it generates a misleading
error message.
As usual, the tests were run on both Cassandra (4.1.1) and Scylla.
Refs #13551.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 78555ba7f1)
The eps reference was reused to manipulate
the racks dictionary. This resulted in
assigning a set of nodes from the racks
dictionary to an element of the _dc_endpoints dictionary.
This is a backport of bcb1d7c to branch-5.2.
Refs: #14184
Closes #14893
If a semaphore mismatch occurs, check whether both semaphores belong
to the user. If so, log a warning, bump the `querier_cache_scheduling_group_mismatches` stat, and drop the cached reader instead of throwing an error.
Until now, the semaphore mismatch was only checked in multi-partition queries. The PR pushes the check down to `querier_cache` and performs it in all `lookup_*_querier` methods.
The mismatch can happen if the user's scheduling group changed during
a query. We don't want to throw an error then, but drop and reset the
cached reader.
This patch doesn't solve the problem of mismatched semaphores caused by changes in service levels/scheduling groups; it only mitigates it.
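The lookup-time policy can be sketched like this. A minimal Python illustration (Scylla's querier cache is C++; the data shapes and names here are made up, not the real API):

```python
# Sketch: drop the cached reader on a user/user semaphore mismatch,
# instead of throwing.
class SemaphoreMismatch(Exception):
    pass

def lookup_querier(cache, key, current_sem, stats):
    entry = cache.pop(key, None)   # a cache hit consumes the entry
    if entry is None:
        return None
    if entry["sem"] is not current_sem:
        if entry["sem"]["user"] and current_sem["user"]:
            # both semaphores belong to a user: warn, bump the stat,
            # and drop the cached reader so the query restarts cleanly
            stats["scheduling_group_mismatches"] += 1
            return None
        raise SemaphoreMismatch(key)
    return entry["querier"]

user_sem_a = {"user": True}
user_sem_b = {"user": True}
cache = {"q1": {"sem": user_sem_a, "querier": "cached-reader"}}
stats = {"scheduling_group_mismatches": 0}

# scheduling group changed mid-query: cached reader is dropped, no error
assert lookup_querier(cache, "q1", user_sem_b, stats) is None
assert stats["scheduling_group_mismatches"] == 1
```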
Refers: https://github.com/scylladb/scylla-enterprise/issues/3182
Refers: https://github.com/scylladb/scylla-enterprise/issues/3050
Closes: #14770
Closes #14736
* github.com:scylladb/scylladb:
querier_cache: add stats of scheduling group mismatches
querier_cache: check semaphore mismatch during querier lookup
querier_cache: add reference to `replica::database::is_user_semaphore()`
replica:database: add method to determine if semaphore is user one
(cherry picked from commit a8feb7428d)
before this change, there are chances that the temporary sstables
created for collecting the GC-able data created by a certain
compaction can be picked up by another compaction job. this
wastes CPU cycles, adds write amplification, and causes
inefficiency.
in general, these GC-only sstables are created with the same run id
as the non-GC sstables, but when a new sstable exhausts its input
sstable(s), we proactively replace the old main set with a new one
so that we can free up space as soon as possible. so the
GC-only sstables are added to the new main set along with
the non-GC sstables, but since the former have a good chance to
overlap with the latter, these GC-only sstables are assigned
different run ids. but we fail to register them with the
`compaction_manager` when replacing the main sstable set.
that's why future compactions pick them up when performing compaction,
while the compaction which created them is not yet completed.
so, in this change,
* to prevent sstables in the transient stage from being picked
up by regular compactions, a new interface class is introduced
so that the sstable is always added to the registration before
it is added to the sstable set, and removed from the registration
after it is removed from the sstable set. the struct helps to
consolidate the registration-related logic in a single place,
and makes it more obvious that the timespan of an sstable in
the registration should cover its timespan in the sstable set.
* use a different run_id for the gc sstable run, as it can
overlap with the output sstable run. the run_id for the
gc sstable run is created only when the gc sstable writer
is created, because gc sstables are not created
for all compactions.
please note, all (indirect) callers of
`compaction_task_executor::compact_sstables()` pass a non-empty
`std::function` to this function, so there is no need to check for
emptiness before calling it. so in this change, the check is dropped.
Fixes #14560
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes #14725
(cherry picked from commit fdf61d2f7c)
Closes #14827
In #14668, we have decided to introduce a new scylla.yaml variable
for the schema commitlog segment size. The segment size puts a limit
on the mutation size that can be written at once, and some schema
mutation writes are much larger than average, as shown in #13864.
Therefore, increasing the schema commitlog segment size is sometimes
necessary.
(cherry picked from commit 5b167a4ad7)
The view updating consumer uses `_buffer_size` to decide when to flush the accumulated mutations, passing them to the actual view building code. This `_buffer_size` is incremented every time a mutation fragment is consumed. This is not exact, as e.g. range tombstones are represented differently in the mutation object than in the fragment, but it is good enough.
There is one flaw however: `_buffer_size` is not incremented when consuming a partition-start fragment. This is when the mutation object is created in the mutation rebuilder. This is not a big problem when partitions have many rows, but if the partitions are tiny, the error in accounting quickly becomes significant. If the partitions are empty, `_buffer_size` is not bumped at all, and any number of them can accumulate in the buffer. We have recently seen this causing stalls and OOM as the buffer grew to an immense size while only containing empty and tiny partitions.
This PR fixes this by accounting the size of the freshly created `mutation` object in `_buffer_size`, after the partition-start fragment is consumed.
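The accounting fix can be sketched as follows. This is an illustrative Python model of the buffering logic (the real consumer is C++; the class shape, sizes, and threshold are assumptions):

```python
# Sketch: account partition-start size so empty partitions still fill
# the buffer and trigger flushes.
class ViewUpdatingConsumer:
    def __init__(self, max_buffer=1024):
        self.buffer_size = 0
        self.max_buffer = max_buffer
        self.flushes = 0

    def consume_partition_start(self, mutation_memory_usage):
        # the fix: count the freshly created mutation object's footprint
        self.buffer_size += mutation_memory_usage
        self.maybe_flush()

    def consume_fragment(self, fragment_size):
        self.buffer_size += fragment_size
        self.maybe_flush()

    def maybe_flush(self):
        if self.buffer_size >= self.max_buffer:
            self.flushes += 1
            self.buffer_size = 0

c = ViewUpdatingConsumer(max_buffer=1024)
for _ in range(64):            # 64 empty partitions, ~32 bytes each
    c.consume_partition_start(32)
assert c.flushes == 2          # without the fix, this would never flush
```

Without the partition-start accounting, `buffer_size` would stay at 0 for empty partitions, so any number of them could pile up before a flush.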
Fixes: #14819
Closes #14821
* github.com:scylladb/scylladb:
test/boost/view_build_test: add test_view_update_generator_buffering_with_empty_mutations
db/view/view_updating_consumer: account for the size of mutations
mutation/mutation_rebuilder*: return const mutation& from consume_new_partition()
mutation/mutation: add memory_usage()
(cherry picked from commit 056d04954c)
LWT queries with empty clustering range used to cause a crash.
For example in:
```cql
UPDATE tab SET r = 9000 WHERE p = 1 AND c = 2 AND c = 2000 IF r = 3
```
The range of `c` is empty - there are no valid values.
This caused a segfault when accessing the `first` range:
```c++
op.ranges.front()
```
Cassandra rejects such queries at the preparation stage. It doesn't allow two `EQ` restrictions on the same clustering column when an IF is involved.
We reject them during runtime, which is a worse solution. The user can prepare a query with `c = ? AND c = ?`, and then run it, but unexpectedly it will throw an `invalid_request_exception` when the two bound variables are different.
We could ban such queries as well, we already ban the usage of `IN` in conditional statements. The problem is that this would be a breaking change.
A better solution would be to allow empty ranges in `LWT` statements. When an empty range is detected we just wouldn't apply the change. This would be a larger change, for now let's just fix the crash.
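The crash fix boils down to checking for an empty range set before dereferencing the first range. A minimal Python sketch of that guard (the real check is in Scylla's C++ `cas_request`; names and the range representation here are made up):

```python
# Sketch: reject contradictory EQ restrictions instead of dereferencing
# the front of an empty range vector.
class InvalidRequest(Exception):
    pass

def clustering_ranges(eq_values):
    # contradictory EQ restrictions on one column yield an empty range set
    distinct = set(eq_values)
    return [] if len(distinct) != 1 else [distinct.pop()]

def apply_lwt_condition(eq_values):
    ranges = clustering_ranges(eq_values)
    if not ranges:
        # previously: ranges.front() on an empty vector -> segfault
        raise InvalidRequest("contradictory clustering restrictions in IF")
    return ranges[0]

assert apply_lwt_condition([2, 2]) == 2    # consistent: c = 2 AND c = 2
try:
    apply_lwt_condition([2, 2000])         # c = 2 AND c = 2000: empty range
    assert False, "should have been rejected"
except InvalidRequest:
    pass
```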
Fixes: https://github.com/scylladb/scylladb/issues/13129
Closes #14429
* github.com:scylladb/scylladb:
modification_statement: reject conditional statements with empty clustering key
statements/cas_request: fix crash on empty clustering range in LWT
(cherry picked from commit 49c8c06b1b)
It was found that the cached_file dtor can hit the following assert
after OOM:
cached_file_test: utils/cached_file.hh:379: cached_file::~cached_file(): Assertion `_cache.empty()' failed.
cached_file's dtor iterates through all entries and evict those
that are linked to LRU, under the assumption that all unused
entries were linked to LRU.
That's partially correct. get_page_ptr() may fetch more than 1
page due to read ahead, but it will only call cached_page::share()
on the first page, the one that will be consumed now.
share() is responsible for automatically placing the page into
LRU once refcount drops to zero.
If the read is aborted midway, before cached_file has a chance
to hit the 2nd page (read-ahead) in cache, that page will remain there
with refcount 0 and unlinked from the LRU, in the hope that a subsequent
read will bring it out of that state.
Our main user of cached_file is per-sstable index caching.
If the scenario above happens, and the sstable and its associated
cached_file is destroyed, before the 2nd page is hit, cached_file
will not be able to clear all the cache because some of the
pages are unused and not linked.
Now a page brought in by read-ahead is linked into the LRU, so it
doesn't sit in memory indefinitely. This also allows the
cached_file dtor to clear the whole cache even if some of the pages
brought in advance are never fetched.
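The invariant can be modeled in a few lines. This is an illustrative Python sketch of the refcount/LRU bookkeeping, not Scylla's `cached_file` (which is C++); page numbers and method names are made up:

```python
# Sketch: link read-ahead pages into the LRU so the destructor can
# always evict every unused page.
class CachedFile:
    def __init__(self):
        self.pages = {}     # page number -> refcount
        self.lru = set()    # pages with refcount 0, eligible for eviction

    def get_page(self, n, read_ahead=1):
        for p in range(n, n + 1 + read_ahead):
            self.pages.setdefault(p, 0)
            # the fix: every fetched page starts linked to the LRU,
            # not only the one that is shared immediately
            self.lru.add(p)
        return self.share(n)

    def share(self, n):
        self.pages[n] += 1
        self.lru.discard(n)          # in use -> unlink from the LRU
        return n

    def unshare(self, n):
        self.pages[n] -= 1
        if self.pages[n] == 0:
            self.lru.add(n)          # unused again -> back on the LRU

    def close(self):
        # dtor can now evict everything: all unused pages are LRU-linked
        for p in list(self.lru):
            del self.pages[p]
            self.lru.discard(p)
        assert not self.pages, "leaked pages not linked to LRU"

f = CachedFile()
p = f.get_page(0)   # fetches page 0 plus one read-ahead page
f.unshare(p)        # read aborted before the read-ahead page is touched
f.close()           # no assertion failure: page 1 was LRU-linked too
```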
A reproducer was added.
Fixes #14814.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #14818
(cherry picked from commit 050ce9ef1d)
In 10c1f1dc80 I fixed
`make_group0_history_state_id_mutation` to use correct timestamp
resolution (microseconds instead of milliseconds) which was supposed to
fix the flakiness of `test_group0_history_clearing_old_entries`.
Unfortunately, the test is still flaky, although now it's failing at a
later step -- this is because I was sloppy and I didn't adjust this
second part of the test to also use microsecond resolution. The test is
counting the number of entries in the `system.group0_history` table that
are older than a certain timestamp, but it's doing the counting using
millisecond resolution, causing it to give results that are off by one
sometimes.
Fix it by using microseconds everywhere.
Fixes #14653
Closes #14670
(cherry picked from commit 9d4b3c6036)
The new test detected a stack-use-after-return when using the table's
as_mutation_source_excluding_staging() for range reads.
This doesn't really affect view updates, which generate single-key
reads only. So the problem was only stressed by the recently
added test; otherwise, we'd have seen it when running dtests
(in debug mode) that stress the view update path from staging.
The problem happens because the closure was fed into
a noncopyable_function that was taken by reference. For range
reads, we defer before subsequent usage of the predicate.
For single-key reads, we only defer after we've finished using
the predicate.
The fix introduces the sstable_predicate type, so there is no need
to construct a temporary object on the stack.
Fixes #14812.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #14813
(cherry picked from commit 0ac43ea877)
View building from staging creates a reader from scratch (memtable
+ sstables - staging) for every partition, in order to calculate
the diff between new staging data and data in base sstable set,
and then pushes the result into the view replicas.
perf shows that the reader creation is very expensive:
+ 12.15% 10.75% reactor-3 scylla [.] lexicographical_tri_compare<compound_type<(allow_prefixes)0>::iterator, compound_type<(allow_prefixes)0>::iterator, legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()(managed_bytes_basic_view<(mutable_view)0>, managed_bytes
+ 10.01% 9.99% reactor-3 scylla [.] boost::icl::is_empty<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 8.95% 8.94% reactor-3 scylla [.] legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()
+ 7.29% 7.28% reactor-3 scylla [.] dht::ring_position_tri_compare
+ 6.28% 6.27% reactor-3 scylla [.] dht::tri_compare
+ 4.11% 3.52% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 4.09% 4.07% reactor-3 scylla [.] sstables::index_consume_entry_context<sstables::index_consumer>::process_state
+ 3.46% 0.93% reactor-3 scylla [.] sstables::sstable_run::will_introduce_overlapping
+ 2.53% 2.53% reactor-3 libstdc++.so.6 [.] std::_Rb_tree_increment
+ 2.45% 2.45% reactor-3 scylla [.] boost::icl::non_empty::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 2.14% 2.13% reactor-3 scylla [.] boost::icl::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 2.07% 2.07% reactor-3 scylla [.] logalloc::region_impl::free
+ 2.06% 1.91% reactor-3 scylla [.] sstables::index_consumer::consume_entry(sstables::parsed_partition_index_entry&&)::{lambda()#1}::operator()() const::{lambda()#1}::operator()
+ 2.04% 2.04% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 1.87% 0.00% reactor-3 [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe
+ 1.86% 0.00% reactor-3 [kernel.kallsyms] [k] do_syscall_64
+ 1.39% 1.38% reactor-3 libc.so.6 [.] __memcmp_avx2_movbe
+ 1.37% 0.92% reactor-3 scylla [.] boost::icl::segmental::join_left<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::
+ 1.34% 1.33% reactor-3 scylla [.] logalloc::region_impl::alloc_small
+ 1.33% 1.33% reactor-3 scylla [.] seastar::memory::small_pool::add_more_objects
+ 1.30% 0.35% reactor-3 scylla [.] seastar::reactor::do_run
+ 1.29% 1.29% reactor-3 scylla [.] seastar::memory::allocate
+ 1.19% 0.05% reactor-3 libc.so.6 [.] syscall
+ 1.16% 1.04% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 1.07% 0.79% reactor-3 scylla [.] sstables::partitioned_sstable_set::insert
That shows some significant amount of work for inserting sstables
into the interval map and maintaining the sstable run (which sorts
fragments by first key and checks for overlapping).
The interval map is known for having issues with L0 sstables, as
it will have to be replicated almost to every single interval
stored by the map, causing terrible space and time complexity.
With enough L0 sstables, it can fall into quadratic behavior.
This overhead is fixed by not building a new fresh sstable set
when recreating the reader, but rather supplying a predicate
to sstable set that will filter out staging sstables when
creating either a single-key or range scan reader.
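The core of the fix, reusing the existing set with a filtering predicate instead of rebuilding it, can be sketched briefly. Illustrative Python only (the real sstable set and reader factory are C++; names here are assumptions):

```python
# Sketch: filter staging sstables via a predicate instead of building a
# fresh sstable set (and its interval map) for every partition.
class SSTable:
    def __init__(self, name, staging):
        self.name, self.staging = name, staging

def make_reader(sstable_set, predicate=lambda sst: True):
    # the set is reused as-is; the predicate excludes unwanted sstables
    # per read, so no interval map is rebuilt per partition
    return [sst.name for sst in sstable_set if predicate(sst)]

sstables = [SSTable("a", staging=False), SSTable("b", staging=True)]
excluding_staging = make_reader(sstables, lambda sst: not sst.staging)
assert excluding_staging == ["a"]
```

This avoids the quadratic interval-map behavior with many L0 sstables, since the expensive set construction happens once instead of once per partition.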
This could have another benefit over today's approach, which
may incorrectly consider a staging sstable as non-staging if
the staging sstable wasn't included in the current batch for
view building.
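The predicate idea can be sketched in miniature (hypothetical names; Scylla's real sstable set is a C++ interval-map structure, so this is only a model of the approach, not the implementation):

```python
class SSTableSet:
    """Stand-in for the sstable set; the real one is interval-map-backed
    and expensive to rebuild."""
    def __init__(self, sstables):
        self._sstables = list(sstables)

    def make_reader(self, predicate=lambda sst: True):
        # Filter at reader-creation time; the set is never copied or rebuilt.
        return [sst for sst in self._sstables if predicate(sst)]

sstables = [
    {"name": "sst-1", "staging": True},
    {"name": "sst-2", "staging": False},
]
sst_set = SSTableSet(sstables)
# View building reads everything *except* staging sstables:
non_staging = sst_set.make_reader(lambda sst: not sst["staging"])
```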
With this improvement, view building was measured to be 3x faster.
from
INFO 2023-06-16 12:36:40,014 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 963957ms = 50kB/s
to
INFO 2023-06-16 14:47:12,129 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 319899ms = 150kB/s
Refs #14089.
Fixes#14244.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 1d8cb32a5d)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#14764
Currently, scylla_fstrim_setup does not start scylla-fstrim.timer and
just enables it, so the timer starts only after a reboot.
This is incorrect behavior; we should start it during setup.
Also, unmask is unnecessary for enabling the timer.
Fixes #14249
Closes #14252
(cherry picked from commit c70a9cbffe)
Closes#14421
do_refresh_state() keeps iterators to rows_entry in a vector.
This vector might be resized during the procedure, triggering
memory reclaim and invalidating the iterators, which can cause
arbitrarily long loops and/or a segmentation fault during make_heap().
To fix this, do_refresh_state has to always be called from the allocating
section.
Additionally, it turns out that the first do_refresh_state is useless,
because reset_state() doesn't set _change_mark. This causes do_refresh_state
to be needlessly repeated during a next_row() or next_range_tombstone() which
happens immediately after it. Therefore this patch moves the _change_mark
assignment from maybe_refresh_state to do_refresh_state, so that the change mark
is properly set even after the first refresh.
Fixes #14696
Closes #14697
Consider
- 10 repair instances take all the 10 _streaming_concurrency_sem
- repair readers are done but the permits are not released since they
are waiting for view update _registration_sem
- view updates trying to take the _streaming_concurrency_sem to make
progress of view update so it could release _registration_sem, but it
could not take _streaming_concurrency_sem since the 10 repair
instances have taken them
- deadlock happens
Note, when the readers are done, i.e., reaching EOS, the repair reader
replaces the underlying (evictable) reader with an empty reader. The
empty reader is not evictable, so the resources cannot be forcibly
released.
To fix, release the permits manually as soon as the repair readers are
done even if the repair job is waiting for _registration_sem.
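The deadlock-avoidance shape can be modeled with plain Python semaphores (a simplified stand-in for the reader-permit machinery, not the actual code):

```python
import threading

# Stand-ins for the two semaphores involved in the deadlock.
stream_sem = threading.Semaphore(1)        # _streaming_concurrency_sem (here: 1 permit)
registration_sem = threading.Semaphore(0)  # _registration_sem, granted by view update

def repair_job():
    stream_sem.acquire()
    # Reader reached end-of-stream. The fix: release the streaming permit
    # *now*, before blocking on the registration semaphore...
    stream_sem.release()
    # ...so waiting here can no longer starve view updates.
    registration_sem.acquire()

def view_update():
    stream_sem.acquire()        # succeeds because repair released early
    registration_sem.release()  # unblocks the repair job
    stream_sem.release()

t1 = threading.Thread(target=repair_job)
t2 = threading.Thread(target=view_update)
t1.start(); t2.start()
t1.join(timeout=5); t2.join(timeout=5)
```

With the pre-fix ordering (holding `stream_sem` while waiting on `registration_sem`), the two threads would block each other forever.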
Fixes #14676
Closes #14677
(cherry picked from commit 1b577e0414)
Adds preemption points used in Alternator when:
- sending bigger json response
- building results for BatchGetItem
I've tested manually by inserting code similar to the following in preemptible sections (e.g. before `os.write`):
```cpp
auto start = std::chrono::steady_clock::now();
do { } while ((std::chrono::steady_clock::now() - start) < 100ms);
```
and seeing reactor stall times. After the patch they
were not increasing while before they kept building up due to no preemption.
Refs #7926
Fixes #13689
Closes #12351
* github.com:scylladb/scylladb:
alternator: remove redundant flush call in make_streamed
utils: yield when streaming json in print()
alternator: yield during BatchGetItem operation
(cherry picked from commit d2e089777b)
On connection setup, the isolation cookie of the connection is matched to the appropriate scheduling group. This is achieved by iterating over the known statement tenant connection types as well as the system connections and choosing the one with a matching name.
If a match is not found, it is assumed that the cluster is upgraded and the remote node has a scheduling group the local one doesn't have. To avoid demoting a scheduling group of unknown importance, in this case the default scheduling group is chosen.
This is problematic when upgrading an OSS cluster to an enterprise version, as the scheduling groups of the enterprise service-levels will match none of the statement tenants and will hence fall-back to the default scheduling group. As a consequence, while the cluster is mixed, user workload on old (OSS) nodes, will be executed under the system scheduling group and concurrency semaphore. Not only does this mean that user workloads are directly competing for resources with system ones, but the two workloads are now sharing the semaphore too, reducing the available throughput. This usually manifests in queries timing out on the old (OSS) nodes in the cluster.
This PR proposes to fix this, by recognizing that the unknown scheduling group is in fact a tenant this node doesn't know yet, and matching it with the default statement tenant. With this, order should be restored, with service-level connections being recognized as user connections and being executed in the statement scheduling group and the statement (user) concurrency semaphore.
I tested this manually, by creating a cluster of 2 OSS nodes, then upgrading one of the nodes to enterprise and verifying (with extra logging) that service level connections are matched to the default statement tenant after the PR and they indeed match to the default scheduling group before.
Fixes: #13841
Fixes: #12552
Closes #13843
* github.com:scylladb/scylladb:
message: match unknown tenants to the default tenant
message: generalize per-tenant connection types
(cherry picked from commit a7c2c9f92b)
Currently, when two cells have the same write timestamp
and both are alive or expiring, we compare their value first,
before checking if either of them is expiring
and if both are expiring, comparing their expiration time
and ttl value to determine which of them will expire
later or was written later.
This was based on an early version of Cassandra.
However, the Cassandra implementation rightfully changed in
e225c88a65 ([CASSANDRA-14592](https://issues.apache.org/jira/browse/CASSANDRA-14592)),
where the cell expiration is considered before the cell value.
To summarize, the motivation for this change is three fold:
1. Cassandra compatibility
2. Prevent an edge case where a null value is returned by select query when an expired cell has a larger value than a cell with later expiration.
3. A generalization of the above: value-based reconciliation may cause a select query to return a mixture of upserts if multiple upserts use the same timestamp but have different expiration times. If the cell value is considered before expiration, the select result may contain cells from different inserts, while reconciling based on the expiration times chooses cells consistently from one of the upserts, as all cells in the respective upsert carry the same expiration time.
Fixes scylladb/scylladb#14182
Also, this series:
- updates dml documentation
- updates internal documentation
- updates and adds unit tests and cql pytest reproducing #14182
Closes scylladb/scylladb#14183
* github.com:scylladb/scylladb:
docs: dml: add update ordering section
cql-pytest: test_using_timestamp: add tests for rewrites using same timestamp
mutation_partition: compare_row_marker_for_merge: consider ttl in case expiry is the same
atomic_cell: compare_atomic_cell_for_merge: update and add documentation
compare_atomic_cell_for_merge: compare value last for live cells
mutation_test: test_cell_ordering: improve debuggability
(cherry picked from commit 87b4606cd6)
Closes#14649
Fixes#11017
When doing writes, storage proxy creates types deriving from abstract_write_response_handler.
These are created in the various scheduling groups executing the write inducing code. They
pick up a group-local reference to the various metrics used by SP. Normally all code
using (and esp. modifying) these metrics are executed in the same scheduling group.
However, if gossip sees a node go down, it will notify listeners, which eventually
calls get_ep_stat and register_metrics.
This code (before this patch) uses _active_ scheduling group to eventually add
metrics, using a local dict as guard against double regs. If, as described above,
we're called in a different sched group than the original one however, this
can cause double registrations.
Fixed here by keeping a reference to the creating scheduling group and using
it, not the active one, when/if creating new metrics.
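A minimal sketch of the fix's shape, assuming a hypothetical handler and registry (not the real storage_proxy types):

```python
class WriteHandler:
    """Capture the creating scheduling group at construction and use it for
    metric registration, instead of whichever group is currently active."""
    def __init__(self, current_group):
        self._group = current_group   # the fix: remember the creator's group

    def register_metrics(self, registry):
        # Always register under the remembered group, even if a gossip
        # notification invokes us while a different group is active.
        registry[self._group] = registry.get(self._group, 0) + 1

registry = {}
handler = WriteHandler("statement")   # created in the statement group
# Later we may run under e.g. a "gossip" group, but metrics still go to
# the creating group, so no double registration under a second group:
handler.register_metrics(registry)
```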
Closes#14636
View update routines accept mutation objects.
But what comes out of staging sstable readers is a stream of mutation_fragment_v2 objects.
To build view updates after a repair/streaming, we have to convert the fragment stream into mutations. This is done by piping the stream to mutation_rebuilder_v2.
To keep memory usage limited, the stream for a single partition might have to be split into multiple partial mutation objects. view_update_consumer does that, but in an improper way -- when the split/flush happens inside an active range tombstone, the range tombstone isn't closed properly. This is illegal, and triggers an internal error.
This patch fixes the problem by closing the active range tombstone (and reopening in the same position in the next mutation object).
The tombstone is closed just after the last seen clustered position. This is not necessary for correctness -- for example we could delay all processing of the range tombstone until we see its end bound -- but it seems like the most natural semantic.
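A toy model of the split/flush rule, using hypothetical fragment tuples instead of the real mutation_fragment_v2 types:

```python
def split_stream(fragments, max_per_part):
    """Split one partition's fragment stream into bounded parts, closing an
    open range tombstone at the last seen clustered position on each flush
    and reopening it at the same position in the next part."""
    parts, cur, open_tomb, last_pos = [], [], None, None
    for frag in fragments:
        if frag[0] == "rt_open":
            open_tomb = frag[1]
        elif frag[0] == "rt_close":
            open_tomb = None
        else:                                  # a clustering row
            last_pos = frag[1]
        cur.append(frag)
        if len(cur) >= max_per_part:           # flush: memory limit reached
            if open_tomb is not None:
                cur.append(("rt_close", last_pos))  # close before flushing
            parts.append(cur)
            # reopen in the same position in the next part
            cur = [("rt_open", last_pos)] if open_tomb is not None else []
    if cur:
        parts.append(cur)
    return parts

parts = split_stream(
    [("rt_open", 0), ("row", 1), ("row", 2), ("row", 3), ("rt_close", 4)],
    max_per_part=3)
```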
Backported from c25201c1a3. `view_build_test.cc` needed some tiny adjustments for the backport.
Closes #14619
Fixes #14503
* github.com:scylladb/scylladb:
test: view_build_test: add range tombstones to test_view_update_generator_buffering
test: view_build_test: add test_view_udate_generator_buffering_with_random_mutations
view_updating_consumer: make buffer limit a variable
view: fix range tombstone handling on flushes in view_updating_consumer
The discussion on the thread says that when we reformat a volume with another
filesystem, the kernel and libblkid may skip populating /dev/disk/by-*
because they detect two filesystem signatures, since mkfs.xxx did not clear
the previous filesystem signature.
To avoid this, we need to run wipefs before running mkfs.
Note that this runs wipefs twice, for target disks and also for RAID device.
wipefs for RAID device is needed since wipefs on disks doesn't clear filesystem signatures on /dev/mdX (we may see previous filesystem signature on /dev/mdX when we construct RAID volume multiple time on same disks).
Also dropped the -f option from mkfs.xfs; this way mkfs will verify that
wipefs worked as expected.
Fixes#13737
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Closes#13738
(cherry picked from commit fdceda20cc)
In mutation_reader_merger and clustering_order_reader_merger, the
operator()() is responsible for producing mutation fragments that will
be merged and pushed to the combined reader's buffer. Sometimes, it
might have to advance existing readers, open new and / or close some
existing ones, which requires calling a helper method and then calling
operator()() recursively.
In some unlucky circumstances, a stack overflow can occur:
- Readers have to be opened incrementally,
- Most or all readers must not produce any fragments and need to report
end of stream without preemption,
- There has to be enough readers opened within the lifetime of the
combined reader (~500),
- All of the above needs to happen within a single task quota.
In order to prevent such a situation, the code of both reader merger
classes was modified to not perform recursion at all. Most of the code
of operator()() was moved to maybe_produce_batch, which does not
recurse; if it cannot produce a fragment, it returns std::nullopt, and
operator()() calls this method in a loop via
seastar::repeat_until_value.
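The recursion-to-loop transformation can be illustrated abstractly (hypothetical state, not the actual merger classes):

```python
def maybe_produce_batch(state):
    """Advance one step; return a batch, or None when no fragment can be
    produced yet (e.g. a newly opened reader reported end-of-stream)."""
    if state["empty_readers"] > 0:
        state["empty_readers"] -= 1
        return None            # not a recursive call -- just "not ready yet"
    return ["fragment"]

def produce(state):
    # seastar::repeat_until_value analogue: loop instead of recursing, so
    # stack depth stays constant no matter how many readers are empty.
    while True:
        batch = maybe_produce_batch(state)
        if batch is not None:
            return batch

result = produce({"empty_readers": 500})   # ~500 empty readers, no overflow
```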
A regression test is added.
Fixes: scylladb/scylladb#14415
Closes#14452
(cherry picked from commit ee9bfb583c)
Closes#14605
This patch adds a full-range tombstone to the compacted mutation.
This raises the coverage of the test. In particular, it reproduces
issue #14503, which should have been caught by this test, but wasn't.
View update routines accept `mutation` objects.
But what comes out of staging sstable readers is a stream of
mutation_fragment_v2 objects.
To build view updates after a repair/streaming, we have to
convert the fragment stream into `mutation`s. This is done by piping
the stream to mutation_rebuilder_v2.
To keep memory usage limited, the stream for a single partition might
have to be split into multiple partial `mutation` objects.
view_update_consumer does that, but in an improper way -- when the
split/flush happens inside an active range tombstone, the range
tombstone isn't closed properly. This is illegal, and triggers an
internal error.
This patch fixes the problem by closing the active range tombstone
(and reopening in the same position in the next `mutation` object).
The tombstone is closed just after the last seen clustered position.
This is not necessary for correctness -- for example we could delay
all processing of the range tombstone until we see its end
bound -- but it seems like the most natural semantic.
Fixes#14503
There was a bug that caused aggregates to fail when
used on case-sensitive columns.
For example:
```
SELECT SUM("SomeColumn") FROM ks.table;
```
would fail, with a message saying that there
is no column "somecolumn".
This is because the case-sensitivity got lost on the way.
For non case-sensitive column names we convert them to lowercase,
but for case sensitive names we have to preserve the name
as originally written.
The problem was in `forward_service` - we took a column name
and created a non case-sensitive `column_identifier` out of it.
This converted the name to lowercase, and later such column
couldn't be found.
To fix it, let's make the `column_identifier` case-sensitive.
It will preserve the name, without converting it to lowercase.
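A toy model of the identifier handling (not Scylla's actual column_identifier class):

```python
class ColumnIdentifier:
    """Sketch of CQL identifier handling: unquoted names fold to lowercase,
    quoted (case-sensitive) names keep their original spelling."""
    def __init__(self, text, keep_case):
        self.name = text if keep_case else text.lower()

# The bug: forward_service built the identifier as non-case-sensitive,
# so "SomeColumn" became "somecolumn" and the column lookup failed.
broken = ColumnIdentifier("SomeColumn", keep_case=False)
# The fix: preserve the name as originally written.
fixed = ColumnIdentifier("SomeColumn", keep_case=True)
```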
Fixes: https://github.com/scylladb/scylladb/issues/14307
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 7fca350075)
This PR fixes the Restore System Tables section of the upgrade guides by adding a command to clean upgraded SSTables during rollback, or by adding the entire section on restoring system tables (which was missing from the older documents).
This PR fixes a bug and must be backported to branch-5.3, branch-5.2, and branch-5.1.
Refs: https://github.com/scylladb/scylla-enterprise/issues/3046
- [x] 5.1-to-2022.2 - update command (backport to branch-5.3, branch-5.2, and branch-5.1)
- [x] 5.0-to-2022.1 - add "Restore system tables" to rollback (backport to branch-5.3, branch-5.2, and branch-5.1)
- [x] 4.3-to-2021.1 - add "Restore system tables" to rollback (backport to branch-5.3, branch-5.2, and branch-5.1)
(see https://github.com/scylladb/scylla-enterprise/issues/3046#issuecomment-1604232864)
Closes#14444
* github.com:scylladb/scylladb:
doc: fix rollback in 4.3-to-2021.1 upgrade guide
doc: fix rollback in 5.0-to-2022.1 upgrade guide
doc: fix rollback in 5.1-to-2022.2 upgrade guide
(cherry picked from commit 8a7261fd70)
With off-strategy compaction, the input list size can be close to 1k,
which leads to unneeded reallocations when formatting the list for
logging.
in the past, we faced stalls in this area, and excessive reallocation
(log2 ~1k = ~10) may have contributed to that.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#13907
(cherry picked from commit 5544d12f18)
Fixes scylladb/scylladb#14071
Information was duplicated before and the version on this page was outdated - RBNO is enabled for replace operation already.
Closes#12984
(cherry picked from commit bd7caefccf)
`query_partition_range_concurrent` implements an optimization when
querying a token range that intersects multiple vnodes. Instead of
sending a query for each vnode separately, it sometimes sends a single
query to cover multiple vnodes - if the intersection of replica sets for
those vnodes is large enough to satisfy the CL and good enough in terms
of the heat metric. To check the latter condition, the code would take
the smallest heat metric of the intersected replica set and compare them
to smallest heat metrics of replica sets calculated separately for each
vnode.
Unfortunately, there was an edge case that the code didn't handle: the
intersected replica set might be empty and the code would access an
empty range.
This was caught by an assertion added in
8db1d75c6c by the dtest
`test_query_dc_with_rf_0_does_not_crash_db`.
The fix is simple: check if the intersected set is empty - if so, don't
calculate the heat metrics because we can decide early that the
optimization doesn't apply.
Also change the `assert` to `on_internal_error`.
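The empty-intersection guard can be sketched as follows (hypothetical function; the real heat-metric comparison is more involved):

```python
def can_merge_queries(intersection_heats, per_vnode_min_heats, cl):
    """Decide whether a single query may cover several vnodes.
    intersection_heats maps each replica in the intersection of the
    vnodes' replica sets to its heat metric."""
    if not intersection_heats:
        # The fix: an empty intersection means the optimization cannot
        # apply -- decide early, never take min() of an empty range.
        return False
    if len(intersection_heats) < cl:
        return False  # not enough common replicas to satisfy the CL
    # Good enough in terms of heat: compare the smallest heat of the
    # intersected set against the smallest per-vnode heats (sketch).
    return min(intersection_heats.values()) <= min(per_vnode_min_heats)

ok = can_merge_queries({"n1": 0.1, "n2": 0.2}, [0.1, 0.3], cl=2)
empty = can_merge_queries({}, [0.1, 0.3], cl=2)
```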
Fixes #14284
Closes #14300
(cherry picked from commit 732feca115)
Backport note: the original `assert` was never added to branch-5.2, but
the fix is still applicable, so I backported the fix and the
`on_internal_error` check.
Another node can stop after it joined the group0 but before it advertised itself
in gossip. `get_inet_addrs` will try to resolve all IPs and
`wait_for_peers_to_enter_synchronize_state` will loop indefinitely.
But `wait_for_peers_to_enter_synchronize_state` can return early if one of
the nodes confirms that the upgrade procedure has finished. For that, it doesn't
need the IPs of all group 0 members - only the IP of some nodes which can do
the confirmation.
This commit restructures the code so that IPs of nodes are resolved inside the
`max_concurrent_for_each` that `wait_for_peers_to_enter_synchronize_state` performs.
Then, even if some IPs won't be resolved, but one of the nodes confirms a
successful upgrade, we can continue.
Fixes#13543
(cherry picked from commit a45e0765e4)
This commit fixes the Restore System Tables section
in the 5.2-to-2023.1 upgrade guide by adding a command
to clean upgraded SSTables during rollback.
This is a bug (an incomplete command) and must be
backported to branch-5.3 and branch-5.2.
Refs: https://github.com/scylladb/scylla-enterprise/issues/3046
Closes #14373
(cherry picked from commit f4ae2c095b)
The evictable reader must ensure that each buffer fill makes forward progress, i.e. the last fragment in the buffer has a position larger than the last fragment from the previous buffer-fill. Otherwise, the reader could get stuck in an infinite loop between buffer fills, if the reader is evicted in-between.
The code guaranteeing this forward progress had a bug: the comparison between the position after the last buffer-fill and the current last fragment position was done in the wrong direction.
So if the condition that we wanted to achieve was already true, we would continue filling the buffer until partition end which may lead to OOMs such as in #13491.
There was already a fix in this area to handle `partition_start` fragments correctly - #13563 - but it missed that the position comparison was done in the wrong order.
Fix the comparison and adjust one of the tests (added in #13563) to detect this case.
After the fix, the evictable reader starts generating some redundant (but expected) range tombstone change fragments since it's now being paused and resumed. For this we need to adjust mutation source tests which were a bit too specific. We modify `flat_mutation_reader_assertions` to squash the redundant `r_t_c`s.
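A rough model of the forward-progress check, with positions as plain integers (the real code compares clustering positions):

```python
def fill_buffer(positions, prev_last_pos, min_batch=2):
    """Fill until the buffer both has enough fragments and has made forward
    progress past the previous fill's last position, then stop."""
    buf = []
    for pos in positions:
        buf.append(pos)
        # Correct direction: stop once the *new* last position exceeds the
        # previous fill's last position. The bug compared the other way
        # around, so filling continued to the end of the partition.
        if len(buf) >= min_batch and buf[-1] > prev_last_pos:
            break
    return buf

buf = fill_buffer([1, 2, 3, 4, 5], prev_last_pos=2)
```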
Fixes #13491
Closes #14375
* github.com:scylladb/scylladb:
readers: evictable_reader: don't accidentally consume the entire partition
test: flat_mutation_reader_assertions: squash `r_t_c`s with the same position
(cherry picked from commit 586102b42e)
Fixes https://github.com/scylladb/scylla-enterprise/issues/3036
This commit adds support for Ubuntu 22.04 to the list
of OSes supported by ScyllaDB Enterprise 2021.1.
This commit fixes a bug and must be backported to
branch-5.3 and branch-5.2.
Closes#14372
(cherry picked from commit 74fc69c825)
Fixes https://github.com/scylladb/scylladb/issues/14333
This commit replaces the documentation landing page with
the Open Source-only documentation landing page.
This change is required as now there is a separate landing
page for the ScyllaDB documentation, so the page is duplicated,
creating bad user experience.
(cherry picked from commit f60f89df17)
Closes#14370
The RPC server now has a lighter .shutdown() method that does just what
messaging_service's shutdown() needs, so call it. On stop, call the regular
stop to finalize the stopping process.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Make it do_with_servers() and make it accept the method to call and the
message to print. This gives the ability to reuse this helper in the next patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Fixes https://github.com/scylladb/scylladb/issues/14097
This commit removes support for Ubuntu 18 from
platform support for ScyllaDB Enterprise 2023.1.
The update is in sync with the change made for
ScyllaDB 5.2.
This commit must be backported to branch-5.2 and
branch-5.3.
Closes#14118
(cherry picked from commit b7022cd74e)
After c7826aa910, sstable runs are cleaned up together.
The procedure which executes cleanup was holding reference to all
input sstables, such that it could later retry the same cleanup
job on failure.
Turns out it was not taking into account that incremental compaction
will exhaust the input set incrementally.
Therefore cleanup is affected by the 100% space overhead.
To fix it, cleanup will now have the input set updated, by removing
the sstables that were already cleaned up. On failure, cleanup
will retry the same job with the remaining sstables that weren't
exhausted by incremental compaction.
New unit test reproduces the failure, and passes with the fix.
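The retry-with-shrinking-input idea, in miniature (hypothetical names, not the actual compaction code):

```python
def cleanup_with_retry(input_sstables, clean_one, max_attempts=3):
    """Retry cleanup while removing already-cleaned sstables from the input
    set, so a failed attempt only redoes the remaining work."""
    remaining = set(input_sstables)
    for _ in range(max_attempts):
        try:
            for sst in list(remaining):
                clean_one(sst)
                remaining.discard(sst)  # input set exhausted incrementally
            return True
        except RuntimeError:
            continue                    # retry only what's left
    return not remaining

calls = []
def flaky(sst):
    # Simulate one transient failure while cleaning "b".
    calls.append(sst)
    if sst == "b" and calls.count("b") == 1:
        raise RuntimeError("transient failure")

done = cleanup_with_retry(["a", "b", "c"], flaky)
```

Sstables cleaned before the failure are never re-cleaned, which is the property the 100%-space-overhead fix relies on.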
Fixes#14035.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#14038
(cherry picked from commit 23443e0574)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#14193
The previous implementation didn't actually do a read barrier, because
the statement failed on an early prepare/validate step which happened
before read barrier was even performed.
Change it to a statement which does not fail and doesn't perform any
schema change but requires a read barrier.
This breaks one test which uses `RandomTables.verify_schema()` when only
one node is alive, but `verify_schema` performs a read barrier. Unbreak
it by skipping the read barrier in this case (it makes sense in this
particular test).
Closes#13933
(cherry picked from commit 64dc76db55)
Backport note: skipped the test_snapshot.py change, as the test doesn't
exist on this branch.
`RandomTables.verify_schema` is often called in topology tests after
performing a schema change. It compares the schema tables fetched from
some node to the expected latest schema stored by the `RandomTables`
object.
However there's no guarantee that the latest schema change has already
propagated to the node which we query. We could have performed the
schema change on a different node and the change may not have been
applied yet on all nodes.
To fix that, pick a specific node and perform a read barrier on it, then
use that node to fetch the schema tables.
Fixes #13788
Closes #13789
(cherry picked from commit 3f3dcf451b)
Raft replication doesn't guarantee that all replicas see
identical Raft state at all times, it only guarantees the
same order of events on all replicas.
When comparing raft state with gossip state on a node, first
issue a read barrier to ensure the node has the latest raft state.
To issue a read barrier it is sufficient to alter a non-existing
state: in order to validate the DDL the node needs to sync with the
leader and fetch its latest group0 state.
Fixes#13518 (flaky topology test).
Closes#13756
(cherry picked from commit e7c9ca560b)
This is a follow-up to #13399, the patch
addresses the issues mentioned there:
* linesep can be split between blocks;
* linesep can be part of UTF-8 sequence;
* avoid excessively long lines, limit to 256 chars;
* the logic of the function made simpler and more maintainable.
Closes#13427
* github.com:scylladb/scylladb:
pylib_test: add tests for read_last_line
pytest: add pylib_test directory
scylla_cluster.py: fix read_last_line
scylla_cluster.py: move read_last_line to util.py
(cherry picked from commit 70f2b09397)
server to see other servers after start/restart
When starting/restarting a server, provide a way to wait for the server
to see at least n other servers.
Also leave the implementation methods available for manual use and
update previous tests, one to wait for a specific server to be seen, and
one to wait for a specific server to not be seen (down).
Fixes#13147
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#13438
(cherry picked from commit 11561a73cb)
Backport note: skipped the test_mutation_schema_change.py fix as the
test doesn't exist on this branch.
Helper to get list of gossiper alive endpoints from REST API.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 62a945ccd5)
For most tests there will be nodes down, increase replication factor to
3 to avoid having problems for partitions belonging to down nodes.
Use replication factor 1 for raft upgrade tests.
(cherry picked from commit 08d754e13f)
Make replication factor configurable for the RandomTables helper.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 3508a4e41e)
There are two occasions in scylla_cluster
where we read the node logs, and in both of
them we read the entire file in memory.
This is not efficient and may cause an OOM.
In the first case we need the last line of the
log file, so we seek at the end and move backwards
looking for a new line symbol.
In the second case we look through the
log file to find the expected_error.
The readlines() method returns a Python
list object, which means it reads the entire
file in memory. It's sufficient to just remove
it since iterating over the file instance
already yields lines lazily one by one.
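The backwards-scan approach described above can be sketched as follows (a hypothetical helper, not the exact scylla_cluster.py/util.py code); it also copes with a separator split across block boundaries and with a seek landing mid-UTF-8 sequence:

```python
import os
import tempfile

def read_last_line(path, block=4096, limit=256):
    """Seek to the end and scan backwards for a newline instead of loading
    the whole file into memory."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        pos = f.tell()
        data = b""
        while pos > 0:
            step = min(block, pos)
            pos -= step
            f.seek(pos)
            data = f.read(step) + data
            # The separator may be split between blocks, so search the
            # accumulated buffer, ignoring the file's trailing newline.
            nl = data.rfind(b"\n", 0, len(data) - 1)
            if nl != -1:
                data = data[nl + 1:]
                break
        # Decode leniently: a seek may land mid-UTF-8 sequence.
        return data.decode("utf-8", errors="replace").rstrip("\n")[:limit]

with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as f:
    f.write("first line\nsecond line\nlast line\n")
    path = f.name
last = read_last_line(path)
os.unlink(path)
```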
This is a follow-up for #13134.
Closes#13399
(cherry picked from commit 09636b20f3)
When adding extra columns in a test, make them value columns. Name them
with the "v_" prefix and use the value column number counter.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#13271
(cherry picked from commit 81b40c10de)
Sometimes when creating a node it's useful
to just install it and not start. For example,
we may want to try to start it later with
expected error.
The ScyllaServer.install method has been made
exception safe: if an exception occurs, it
reverts to the original state. This avoids
duplicating the try/except logic
in two of its call sites.
(cherry picked from commit e407956e9f)
We are going to allow the
ScyllaCluster.add_server function not to
start the server if the caller has requested
that with a special parameter. The host_id
can only be obtained from a running node, so
add_server won't be able to return it in
this case. I've grepped the tests for host_id
and there doesn't seem to be any
reference to it in the code.
(cherry picked from commit 794d0e4000)
Sometimes it's useful to check that the node has failed
to start for a particular reason. If server_start can't
find expected_error in the node's log or if the
node has started without errors, it throws an exception.
(cherry picked from commit c1d0ee2bce)
Extract the function that encapsulates all the error
reporting logic. We are going to use it in several
other places to implement expected_error feature.
(cherry picked from commit a4411e9ec4)
The ScyllaServer expects cmd to be None if the
Scylla process is not running. Otherwise, if start failed
and the test called update_config, the latter will
try to send a signal to a non-existent process via cmd.
(cherry picked from commit 21b505e67c)
Instead of decommission of initial cluster, use custom cluster.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#13589
(cherry picked from commit ce87aedd30)
To allow tests with custom clusters, allow configuration of initial
cluster size of 0.
Add a proof-of-concept test to be removed later.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#13342
(cherry picked from commit e3b462507d)
Move long running topology tests out of `test_topology.py` and into their own files, so they can be run in parallel.
While there, merge simple schema tests.
Closes#12804
* github.com:scylladb/scylladb:
test/topology: rename topology test file
test/topology: lint and type for topology tests
test/topology: move topology ip tests to own file
test/topology: move topology test remove garbaje...
test/topology: move topology rejoin test to own file
test/topology: merge topology schema tests and...
test/topology: isolate topology smp params test
test/topology: move topology helpers to common file
(cherry picked from commit a24600a662)
Recently we enabled RBNO by default in all topology operations. This
made the operations a bit slower (repair-based topology ops are a bit
slower than classic streaming - they do more work), and in debug mode
with large number of concurrent tests running, they might timeout.
The timeout for bootstrap was already increased before, do the same for
decommission/removenode. The previously used timeout was 300 seconds
(this is the default used by aiohttp library when it makes HTTP
requests), now use the TOPOLOGY_TIMEOUT constant from ScyllaServer which
is 1000 seconds.
Closes#12765
* github.com:scylladb/scylladb:
test/pylib: use larger timeout for decommission/removenode
test/pylib: scylla_cluster: rename START_TIMEOUT to TOPOLOGY_TIMEOUT
(cherry picked from commit e55f475db1)
Existing helper with async context manager only worked for non one-shot
error injections. Fix it and add another helper for one-shot without a
context manager.
Fix tests using the previous helper.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 9ceb6aba81)
There was a check for immediate consistency after a decommission
operation has finished in one of the tests, but it turns out that also
after decommission it might take some time for token ring to be updated
on other nodes. Replace the check with a wait.
Also do the wait in another test that performs a sequence of
decommissions. We won't attempt to start another decommission until
every node learns that the previously decommissioned node has left.
Closes#12686
(cherry picked from commit 40142a51d0)
After topology changes like removing a node, verify that the set of
group 0 members and token ring members is the same.
Modify `get_token_ring_host_ids` to only return NORMAL members. The
previous version which used the `/storage_service/host_id` endpoint
might have returned non-NORMAL members as well.
Fixes: #12153
Closes #12619
(cherry picked from commit fa9cf81af2)
If a server is stopped suddenly (i.e., not gracefully), schema tables might
be left in an inconsistent state. Add a test case and enable the Scylla
configuration option (force_schema_commit_log) to handle this.
Fixes #12218
Closes #12630
* github.com:scylladb/scylladb:
pytest: test start after ungraceful stop
test.py: enable force_schema_commit_log
(cherry picked from commit 5eadea301e)
Improve logging by printing the cluster at the end of each test.
Stop performing operations like attempting queries or dropping keyspaces on dirty clusters. Dirty clusters might be completely dead and these operations would only cause more "errors" to happen after a failed test, making it harder to find the real cause of failure.
Mark cluster as dirty when a test that uses it fails - after a failed test, we shouldn't assume that the cluster is in a usable state, so we shouldn't reuse it for another test.
Rely on the `is_dirty` flag in `PythonTest`s and `CQLApprovalTest`s, similarly to what `TopologyTest`s do.
Closes#12652
* github.com:scylladb/scylladb:
test.py: rely on ScyllaCluster.is_dirty flag for recycling clusters
test/topology: don't drop random_tables keyspace after a failed test
test/pylib: mark cluster as dirty after a failed test
test: pylib, topology: don't perform operations after test on a dirty cluster
test/pylib: print cluster at the end of test
(cherry picked from commit 2653865b34)
With regards to closing the looked-up querier if an exception is thrown. In particular, this requires closing the querier if a semaphore mismatch is detected. Move the table lookup above the line where the querier is looked up, to avoid having to handle the exception from it. As a consequence of closing the querier on the error path, the lookup lambda has to be made a coroutine. This is sad, but this is executed once per page, so its cost should be insignificant when spread over an
entire page worth of work.
Also add a unit test checking that the mismatch is detected in the first place and that readers are closed.
Fixes: #13784
Closes#13790
* github.com:scylladb/scylladb:
test/boost/database_test: add unit test for semaphore mismatch on range scans
partition_slice_builder: add set_specific_ranges()
multishard_mutation_query: make reader_context::lookup_readers() exception safe
multishard_mutation_query: lookup_readers(): make inner lambda a coroutine
(cherry picked from commit 1c0e8c25ca)
Due to a simple programming oversight, one of keyspace_metadata
constructors is using empty user_types_metadata instead of the
passed one. Fix that.
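An illustrative sketch of the bug pattern, with made-up class and member names (not Scylla's actual `keyspace_metadata`): a constructor that value-initializes a member instead of forwarding the parameter it was handed.

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <utility>

// Hypothetical stand-in types for illustration only.
using user_types_map = std::unordered_map<std::string, int>;

struct keyspace_metadata_sketch {
    user_types_map _user_types;

    // Buggy variant: `: _user_types{}` would value-initialize the member and
    // silently drop the caller's metadata. The fixed variant forwards it.
    explicit keyspace_metadata_sketch(user_types_map user_types)
        : _user_types(std::move(user_types)) {}
};
```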
Fixes#14139
Closes#14143
(cherry picked from commit 1a521172ec)
A long long time ago there was an issue about removing infinite timeouts
from distributed queries: #3603. There was also a fix:
620e950fc8. But apparently some queries
escaped the fix, like the one in `default_role_row_satisfies`.
With the right conditions and timing this query may cause a node to hang
indefinitely on shutdown. A node tries to perform this query after it
starts. If we kill another node which is required to serve this query
right before that moment, the query will hang; when we try to shutdown
the querying node, it will wait for the query to finish (it's a
background task in auth service), which it never does due to infinite
timeout.
Use the same timeout configuration as other queries in this module do.
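A sketch of the principle using standard futures rather than Seastar's API (which the real fix uses): any query awaited from a background task should carry a finite timeout, so that a peer that died at the wrong moment cannot block the querying node's shutdown forever.

```cpp
#include <cassert>
#include <chrono>
#include <future>

// Illustrative helper: wait on a result with a bounded timeout instead of
// blocking indefinitely.
template <typename T>
bool completes_within(std::future<T>& f, std::chrono::milliseconds timeout) {
    return f.wait_for(timeout) == std::future_status::ready;
}
```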
Fixes#13545.
Closes#14134
(cherry picked from commit f51312e580)
Fixes https://github.com/scylladb/scylladb/issues/13915
This commit fixes broken links to the Enterprise docs.
They are links to the enterprise branch, which is not
published. The links to the Enterprise docs should include
"stable" instead of the branch name.
This commit must be backported to branch-5.2, because
the broken links are present in the published 5.2 docs.
Closes#13917
(cherry picked from commit 6f4a68175b)
This branch backports to branch-5.2 several fixes related to node operations:
- ba919aa88a (PR #12980; Fixes: #11011, #12969)
- 53636167ca (part of PR #12970; Fixes: #12764, #12956)
- 5856e69462 (part of PR #12970)
- 2b44631ded (PR #13028; Fixes: #12989)
- 6373452b31 (PR #12799; Fixes#12798)
Closes#13531
* github.com:scylladb/scylladb:
Merge 'Do not mask node operation errors' from Benny Halevy
Merge 'storage_service: Make node operations safer by detecting asymmetric abort' from Tomasz Grabiec
storage_service: Wait for normal state handler to finish in replace
storage_service: Wait for normal state handler to finish in bootstrap
storage_service: Send heartbeat earlier for node ops
Fixes https://github.com/scylladb/scylladb/issues/13857
This commit adds the OS support for ScyllaDB Enterprise 2023.1.
The support is the same as for ScyllaDB Open Source 5.2, on which
2023.1 is based.
After this commit is merged, it must be backported to branch-5.2.
In this way, it will be merged to branch-2023.1 and available in
the docs for Enterprise 2023.1
Closes: #13858
(cherry picked from commit 84ed95f86f)
range_tombstone_change_generator::flush() mishandles the case when two range
tombstones are adjacent and flush(pos, end_of_range=true) is called with pos
equal to the end bound of the lesser-position range tombstone.
In such case, the start change of the greater-position rtc will be accidentally
emitted, and there won't be an end change, which breaks reader assumptions by
ending the stream with an unclosed range tombstone, triggering an assertion.
This is due to a non-strict inequality used in a place where strict inequality
should be used. The modified line was intended to close range tombstones
which end exactly on the flush position, but this is unnecessary because such
range tombstones are handled by the last `if` in the function anyway.
Instead, this line caused range tombstones beginning right after the flush
position to be emitted sometimes.
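A simplified model of the inequality fix, assuming range tombstones are `[start, end)` intervals ordered by start position (the function and struct names are illustrative, not Scylla's): `flush(pos)` may emit the start change of a tombstone only if it begins strictly before `pos`; the non-strict comparison also emitted a tombstone starting exactly at `pos` (the adjacency case), whose end change then never followed.

```cpp
#include <cassert>
#include <vector>

struct rt { int start; int end; };  // simplified range tombstone

std::vector<rt> flush_emitted(const std::vector<rt>& pending, int pos) {
    std::vector<rt> out;
    for (const auto& t : pending) {
        if (t.start < pos) {  // fixed: strict `<` (the bug used `<=`)
            out.push_back(t);
        }
    }
    return out;
}
```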
Fixes https://github.com/scylladb/scylladb/issues/12462
Closes#13894
* github.com:scylladb/scylladb:
tests: row_cache: Add reproducer for reader producing missing closing range tombstone
range_tombstone_change_generator: fix an edge case in flush()
static report:
sstables/mx/reader.cc:1705:58: error: invalid invocation of method 'operator*' on object 'schema' while it is in the 'consumed' state [-Werror,-Wconsumed]
legacy_reverse_slice_to_native_reverse_slice(*schema, slice.get()), pc, std::move(trace_state), fwd, fwd_mr, monitor);
Fixes#13394.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 213eaab246)
use-after-free in ctor, which potentially leads to a failure
when locating table from moved schema object.
static report
In file included from db/system_keyspace.cc:51:
./db/view/build_progress_virtual_reader.hh:202:40: warning: invalid invocation of method 'operator->' on object 's' while it is in the 'consumed' state [-Wconsumed]
_db.find_column_family(s->ks_name(), system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
Fixes#13395.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 1ecba373d6)
static report:
./index/built_indexes_virtual_reader.hh:228:40: warning: invalid invocation of method 'operator->' on object 's' while it is in the 'consumed' state [-Wconsumed]
_db.find_column_family(s->ks_name(), system_keyspace::v3::BUILT_VIEWS),
Fixes#13396.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit f8df3c72d4)
Variant used by
streaming/stream_transfer_task.cc: , reader(cf.make_streaming_reader(cf.schema(), std::move(permit_), prs))
as the full slice is retrieved after schema is moved (clang evaluates
left-to-right), the stream transfer task could potentially be working
on a stale slice for a particular set of partitions.
static report:
In file included from replica/dirty_memory_manager.cc:6:
replica/database.hh:706:83: error: invalid invocation of method 'operator->' on object 'schema' while it is in the 'consumed' state [-Werror,-Wconsumed]
return make_streaming_reader(std::move(schema), std::move(permit), range, schema->full_slice());
Fixes#13397.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 04932a66d3)
Adds a reproducer for #12462.
The bug manifests by reader throwing:
std::logic_error: Stream ends with an active range tombstone: {range_tombstone_change: pos={position: clustered,ckp{},-1}, {tombstone: timestamp=-9223372036854775805, deletion_time=2}}
The reason is that prior to the fix range_tombstone_change_generator::flush()
was used with end_of_range=true to produce the closing range_tombstone_change
and it did not handle correctly the case when there are two adjacent range
tombstones and flush(pos, end_of_range=true) is called such that pos is the
boundary between the two.
Cherry-picked from a717c803c7.
range_tombstone_change_generator::flush() mishandles the case when two range
tombstones are adjacent and flush(pos, end_of_range=true) is called with pos
equal to the end bound of the lesser-position range tombstone.
In such case, the start change of the greater-position rtc will be accidentally
emitted, and there won't be an end change, which breaks reader assumptions by
ending the stream with an unclosed range tombstone, triggering an assertion.
This is due to a non-strict inequality used in a place where strict inequality
should be used. The modified line was intended to close range tombstones
which end exactly on the flush position, but this is unnecessary because such
range tombstones are handled by the last `if` in the function anyway.
Instead, this line caused range tombstones beginning right after the flush
position to be emitted sometimes.
Fixes#12462
The immediate mode is similar to timeout mode with gc_grace_seconds
zero. Thus, the gc_before returned should be the query_time instead of
gc_clock::time_point::max in immediate mode.
Setting gc_before to gc_clock::time_point::max, a row could be dropped
by compaction even if the ttl is not expired yet.
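A minimal sketch of the fixed computation; the enum, function name, and `disabled` branch are illustrative, not Scylla's actual code. Immediate mode behaves like timeout mode with `gc_grace_seconds == 0`, so it must return `query_time` rather than `time_point::max()`; otherwise a row whose TTL has not yet expired could be considered purgeable and dropped.

```cpp
#include <cassert>
#include <chrono>

using time_point = std::chrono::system_clock::time_point;
enum class gc_mode { timeout, immediate, disabled };

time_point get_gc_before(gc_mode mode, time_point query_time,
                         std::chrono::seconds gc_grace) {
    switch (mode) {
    case gc_mode::timeout:   return query_time - gc_grace;
    case gc_mode::immediate: return query_time;  // the bug returned time_point::max()
    case gc_mode::disabled:  return time_point::min();
    }
    return time_point::min();
}
```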
The following procedure reproduces the issue:
- Start 2 nodes
- Insert data
```
CREATE KEYSPACE ks2a WITH REPLICATION = { 'class' : 'SimpleStrategy',
'replication_factor' : 2 };
CREATE TABLE ks2a.tb (pk int, ck int, c0 text, c1 text, c2 text, PRIMARY
KEY(pk, ck)) WITH tombstone_gc = {'mode': 'immediate'};
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (10 ,1, 'x', 'y', 'z')
USING TTL 1000000;
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (20 ,1, 'x', 'y', 'z')
USING TTL 1000000;
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (30 ,1, 'x', 'y', 'z')
USING TTL 1000000;
```
- Run nodetool flush and nodetool compact
- Compaction drops all data
```
~128 total partitions merged to 0.
```
Fixes#13572
Closes#13800
(cherry picked from commit 7fcc403122)
This is not really an error, so print it in debug log_level
rather than error log_level.
Fixes#13374
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#13462
(cherry picked from commit cc42f00232)
Courtesy of clang-tidy:
row_cache.cc:1191:28: warning: 'entry' used after it was moved [bugprone-use-after-move]
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{_schema});
^
row_cache.cc:1191:60: note: move occurred here
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{_schema});
^
row_cache.cc:1191:28: note: the use and move are unsequenced, i.e. there is no guarantee about the order in which they are evaluated
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{*_schema});
The use-after-move is UB; whether it actually happens depends on evaluation order.
We haven't hit it yet because clang evaluates left-to-right.
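A sketch of the pattern clang-tidy flags, with illustrative types (not the real `row_cache` code): in `insert(entry.position()…, std::move(entry), …)` the key expression and the move are unsequenced function arguments, so the key may be read from an already moved-from `entry`. The fix is to evaluate the key into a local first, sequencing the read before the move.

```cpp
#include <cassert>
#include <map>
#include <string>
#include <utility>

struct entry_t { std::string key; };  // stand-in for cache_entry

void insert_fixed(std::map<std::string, entry_t>& m, entry_t entry) {
    auto key = entry.key;  // evaluated strictly before the move below
    m.emplace(std::move(key), std::move(entry));
}
```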
Fixes#13400.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#13401
(cherry picked from commit d2d151ae5b)
Aggregation query on counter column is failing because forward_service is looking for a function with counter as an argument and no such function exists. Instead, the long type should be used.
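A tiny sketch of the lookup fix with an illustrative helper name: for a counter column, the aggregate function is resolved against the underlying long (64-bit) type, since counters are physically 64-bit integers and no counter-typed aggregate function exists.

```cpp
#include <cassert>
#include <string>

// Hypothetical helper: map the column's declared type to the argument type
// used when resolving the aggregate function.
std::string aggregate_arg_type(const std::string& column_type) {
    return column_type == "counter" ? "long" : column_type;
}
```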
Fixes: #12939
Closes#12963
* github.com:scylladb/scylladb:
test:boost: counter column parallelized aggregation test
service:forward_service: use long type when column is counter
(cherry picked from commit 61e67b865a)
Consider
- n1, n2, n3
- n3 is down
- n4 replaces n3 with the same ip address 127.0.0.3
- Inside the storage_service::handle_state_normal callback for 127.0.0.3 on n1/n2
```
auto host_id = _gossiper.get_host_id(endpoint);
auto existing = tmptr->get_endpoint_for_host_id(host_id);
```
host_id = new host id
existing = empty
As a result, del_replacing_endpoint() will not be called.
This means 127.0.0.3 will not be removed as a pending node on n1 and n2 when
replacing is done. This is wrong.
This is a regression since commit 9942c60d93
(storage_service: do not inherit the host_id of a replaced node), where
the replacing node uses a different host id than the node being replaced.
To fix, call del_replacing_endpoint() when a node becomes NORMAL and existing
is empty.
Before:
n1:
storage_service - replace[cd1f187a-0eee-4b04-91a9-905ecc499cfc]: Added replacing_node=127.0.0.3 to replace existing_node=127.0.0.3, coordinator=127.0.0.3
token_metadata - Added node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - replace[cd1f187a-0eee-4b04-91a9-905ecc499cfc]: Marked ops done from coordinator=127.0.0.3
storage_service - Node 127.0.0.3 state jump to normal
storage_service - Set host_id=6f9ba4e8-9457-4c76-8e2a-e2be257fe123 to be owned by node=127.0.0.3
After:
n1:
storage_service - replace[28191ea6-d43b-3168-ab01-c7e7736021aa]: Added replacing_node=127.0.0.3 to replace existing_node=127.0.0.3, coordinator=127.0.0.3
token_metadata - Added node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - replace[28191ea6-d43b-3168-ab01-c7e7736021aa]: Marked ops done from coordinator=127.0.0.3
storage_service - Node 127.0.0.3 state jump to normal
token_metadata - Removed node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - Set host_id=72219180-e3d1-4752-b644-5c896e4c2fed to be owned by node=127.0.0.3
Tests: https://github.com/scylladb/scylla-dtest/pull/3126
Closes#13677
Fixes: https://github.com/scylladb/scylla-enterprise/issues/2852
(cherry picked from commit a8040306bb)
The evictable reader must ensure that each buffer fill makes forward
progress, i.e. the last fragment in the buffer has a position larger
than the last fragment from the last buffer-fill. Otherwise, the reader
could get stuck in an infinite loop between buffer fills, if the reader
is evicted in-between.
The code guaranteeing this forward progress has a bug: when the next
expected position is a partition-start (another partition), the code
would loop forever, effectively reading all there is from the underlying
reader.
To avoid this, add a special case to ignore the progress guarantee loop
altogether when the next expected position is a partition start. In this
case, progress is guaranteed anyway, because there is exactly one
partition-start fragment in each partition.
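A simplified model of the guarantee (the struct and function names are illustrative): within a partition, the refill loop keeps reading until the buffer ends past the previous refill's last position. The fix skips the loop entirely when the next expected fragment is a partition-start, since each partition contains exactly one such fragment and forward progress is therefore implied.

```cpp
#include <cassert>

struct next_fragment {
    bool is_partition_start;  // true for the partition-start fragment
    int position;             // ordering key within the stream
};

bool keep_filling(const next_fragment& next, int last_buffered_position) {
    if (next.is_partition_start) {
        return false;  // special case added by the fix: progress is guaranteed
    }
    return next.position <= last_buffered_position;  // no forward progress yet
}
```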
Fixes: #13491
Closes#13563
(cherry picked from commit 72003dc35c)
I've no idea why the quotes are there at all, it works even without
them. However, with quotes gdb-13 fails to find the _all_threads static
thread-local variable _unless_ it's printed with gdb "p" command
beforehand.
fixes: #13125
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#13132
(cherry picked from commit 537510f7d2)
This series handles errors when aborting node operations and prints them rather than letting them leak and be exposed to the user.
Also, clean up the node_ops logging formats when aborting different node ops
and add more error logging around errors in the "worker" nodes.
Closes#12799
* github.com:scylladb/scylladb:
storage_service: node_ops_signal_abort: print a warning when signaling abort
storage_service: s/node_ops_singal_abort/node_ops_signal_abort/
storage_service: node_ops_abort: add log messages
storage_service: wire node_ops_ctl for node operations
storage_service: add node_ops_ctl class to formalize all node_ops flow
repair: node_ops_cmd_request: add print function
repair: do_decommission_removenode_with_repair: log ignore_nodes
repair: replace_with_repair: get ignore_nodes as unordered_set
gossiper: get_generation_for_nodes: get nodes as unordered_set
storage_service: don't let node_ops abort failures mask the real error
(cherry picked from commit 6373452b31)
This patch fixes a problem which affects decommission and removenode
which may lead to data consistency problems under conditions which
lead one of the nodes to unilaterally decide to abort the node
operation without the coordinator noticing.
If this happens during streaming, the node operation coordinator would
proceed to make a change in the gossiper, and only later detect that
one of the nodes aborted during sending of decommission_done or
removenode_done command. That's too late, because the operation will
be finalized by all the nodes once gossip propagates.
It's unsafe to finalize the operation while another node aborted. The
other node reverted to the old topology, with which they were running
for some time, without considering the pending replica when handling
requests. As a result, we may end up with consistency issues. Writes
made by those coordinators may not be replicated to CL replicas in the
new topology. Streaming may have failed to replicate those writes,
depending on timing.
It's possible that some node aborts but streaming succeeds if the
abort is not due to network problems, or if the network problems are
transient and/or localized and affect only heartbeats.
There is no way to revert after we commit the node operation to the
gossiper, so it's ok to close node_ops sessions before making the
change to the gossiper, and thus detect aborts and prevent later aborts
after the change in the gossiper is made. This is already done during
bootstrap (RBNO enabled) and replacenode. This patch changes removenode
to also take this approach by moving sending of remove_done earlier.
We cannot take this approach with decommission easily, because
decommission_done command includes a wait for the node to leave the
ring, which won't happen before the change to the gossiper is
made. Separating this from decommission_done would require protocol
changes. This patch adds a second-best solution, which is to check if
sessions are still there right before making a change to the gossiper,
leaving decommission_done where it was.
The race can still happen, but the time window is now much smaller.
The PR also lays down infrastructure which enables testing the scenarios. It makes node ops
watchdog periods configurable, and adds error injections.
Fixes#12989
Refs #12969
Closes#13028
* github.com:scylladb/scylladb:
storage_service: node ops: Extract node_ops_insert() to reduce code duplication
storage_service: Make node operations safer by detecting asymmetric abort
storage_service: node ops: Add error injections
service: node_ops: Make watchdog and heartbeat intervals configurable
(cherry picked from commit 2b44631ded)
Similar to "storage_service: Wait for normal state handler to finish in
bootstrap", this patch enables the check on the replace procedure.
(cherry picked from commit 5856e69462)
In storage_service::handle_state_normal, storage_service::notify_joined
will be called which drops the rpc connections to the node becomes
normal. This causes rpc calls with that node fail with
seastar::rpc::closed_error error.
Consider this:
- n1 in the cluster
- n2 is added to join the cluster
- n2 sees n1 is in normal status
- n2 starts bootstrap process
- notify_joined on n2 closes rpc connection to n1 in the middle of
bootstrap
- n2 fails to bootstrap
For example, during bootstrap with RBNO, we saw repair fail in a
test that sets ring_delay to zero and does not wait for gossip to
settle.
repair - repair[9cd0dbf8-4bca-48fc-9b1c-d9e80d0313a2]: sync data for
keyspace=system_distributed_everywhere, status=failed:
std::runtime_error ({shard 0: seastar::rpc::closed_error (connection is
closed)})
This patch fixes the race by waiting for the handle_state_normal handler
to finish before the bootstrap process.
Fixes#12764
Fixes#12956
(cherry picked from commit 53636167ca)
Node ops has the following procedure:
1. for node in sync_nodes:
       send prepare cmd to node
2. for node in sync_nodes:
       send heartbeat cmd to node
If any of the prepare cmd in step 1 takes longer than the heartbeat
watchdog timeout, the heartbeat in step 2 will be too late to update the
watchdog, as a result the watchdog will abort the operation.
To prevent a slow prepare cmd from killing the node operation, we can
start the heartbeat earlier in the procedure.
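A sketch of the reordering with illustrative step names (not the real storage_service code): the heartbeat updater is started before the prepare round, so a prepare command that exceeds the watchdog interval can no longer starve the watchdog and trigger a spurious abort.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical trace of the fixed step ordering for one node operation.
std::vector<std::string> node_op_steps_fixed(const std::vector<std::string>& sync_nodes) {
    std::vector<std::string> steps;
    steps.push_back("start_heartbeat_updater");  // moved ahead of prepare
    for (const auto& node : sync_nodes) {
        steps.push_back("prepare:" + node);      // potentially slow
    }
    return steps;
}
```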
Fixes#11011
Fixes#12969
Closes#12980
(cherry picked from commit ba919aa88a)
Cranelift-codegen 0.92.0 and wasmtime 5.0.0 have security issues
potentially allowing malicious UDFs to read some memory outside
the wasm sandbox. This patch updates them to versions 0.92.1
and 5.0.1 respectively, where the issues are fixed.
Fixes#13157
Closes#13171
(cherry picked from commit aad2afd417)
Wasmtime added some improvements in recent releases - particularly,
two security issues were patched in version 2.0.2. There were no
breaking changes for our use other than the strategy of returning
Traps - all of them are now anyhow::Errors instead, but we can
still downcast to them, and read the corresponding error message.
The cxx, anyhow and futures dependency versions now match the
versions saved in the Cargo.lock.
Closes#12830
(cherry picked from commit 8b756cb73f)
Ref #13157
Undefined behavior because the evaluation order is undefined.
With GCC, where evaluation is right-to-left, schema will be moved
once it's forwarded to make_flat_mutation_reader_from_mutations_v2().
The consequence is that memory tracking of mutation_fragment_v2
(for tracking only permit used by view update), which uses the schema,
can be incorrect. However, it's more likely that Scylla will crash
when estimating memory usage for a row, which accesses schema column
information using schema::column_at(), which in turn asserts that
the requested column does really exist.
Fixes#13093.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#13092
(cherry picked from commit 3fae46203d)
Fixes https://github.com/scylladb/scylladb/issues/13106
This commit removes the information that BYPASS CACHE
is an Enterprise-only feature and replaces that info
with the link to the BYPASS CACHE description.
Closes#13316
(cherry picked from commit 1cfea1f13c)
* tools/python3 279b6c1...cf7030a (1):
> dist: redhat: provide only a single version
s/%{version}/%{version}-%{release}/ in `Requires:` sections.
this enforces the runtime dependencies of exactly the same
releases between scylla packages.
Fixes#13222
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 7165551fd7)
The REST test test_storage_service.py::test_toppartitions_pk_needs_escaping
was flaky. It tests the toppartition request, which unfortunately needs
to choose a sampling duration in advance, and we chose 1 second which we
considered more than enough - and indeed typically even 1ms is enough!
but very rarely (we know of only one occurrence, in issue #13223) one
second is not enough.
Instead of increasing this 1 second and making this test even slower,
this patch takes a retry approach: the test starts with a 0.01-second
duration, and is then retried with increasing durations until it succeeds
or a 5-seconds duration is reached. This retry approach has two benefits:
1. It de-flakes the test (allowing a very slow test to take 5 seconds
instead of 1 second, which wasn't enough), and 2. At the same time it
makes a successful test much faster (it used to always take a full
second, now it takes 0.07 seconds on a dev build on my laptop).
A *failed* test may, in some cases, take 10 seconds after this patch
(although in some other cases, an error will be caught immediately),
but I consider this acceptable - this test should pass, after all,
and a failure indicates a regression and taking 10 seconds will be
the last of our worries in that case.
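The retry scheme above can be sketched as follows; the 0.01 s start and 5 s cap come from the text, while the doubling schedule and the function name are assumptions for illustration.

```cpp
#include <cassert>
#include <functional>

// Retry an attempt with increasing sampling durations (in ms) until it
// succeeds or the cap is exceeded.
bool retry_with_growing_duration(const std::function<bool(int)>& attempt,
                                 int start_ms = 10, int cap_ms = 5000) {
    for (int duration = start_ms; duration <= cap_ms; duration *= 2) {
        if (attempt(duration)) {
            return true;  // fast path: typically succeeds on an early, short try
        }
    }
    return false;  // every duration up to the cap was insufficient
}
```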
Fixes#13223.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#13238
(cherry picked from commit c550e681d7)
This patch increases the connection timeout in the get_cql_cluster()
function in test/cql-pytest/run.py. This function is used to test
that Scylla came up, and also test/alternator/run uses it to set
up the authentication - which can only be done through CQL.
The Python driver has 2-second and 5-second default timeouts that should
have been more than enough for everybody (TM), but in #13239 we saw
that in one case it apparently wasn't enough. So to be extra safe,
let's increase the default connection-related timeouts to 60 seconds.
Note this change only affects the Scylla *boot* in the test/*/run
scripts, and it does not affect the actual tests - those have different
code to connect to Scylla (see cql_session() in test/cql-pytest/util.py),
and we already increased the timeouts there in #11289.
Fixes#13239
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#13291
(cherry picked from commit 4fdcee8415)
sleep_abortable() is aborted on success, which causes sleep_aborted
exception to be thrown. This causes scylla to throw every 100ms for
each pinged node. Throwing may reduce performance if it happens often.
Also, it spams the logs if --logger-log-level exception=trace is enabled.
Avoid by swallowing the exception on cancellation.
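A sketch of the fix using stand-in types rather than Seastar's API: the ping loop aborts its own sleep on success, so the resulting cancellation exception is expected and must be swallowed rather than propagating (and being logged) on every iteration.

```cpp
#include <cassert>
#include <stdexcept>

// Stand-in for the exception thrown by an aborted sleep.
struct sleep_aborted : std::runtime_error {
    sleep_aborted() : std::runtime_error("sleep aborted") {}
};

// Returns true if the full interval elapsed, false if the sleep was
// cancelled; the expected cancellation never escapes as an exception.
bool sleep_swallowing_abort(bool cancelled) {
    try {
        if (cancelled) {
            throw sleep_aborted();  // models the aborted sleep
        }
        return true;
    } catch (const sleep_aborted&) {
        return false;  // expected when the awaited operation succeeded
    }
}
```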
Fixes#13278.
Closes#13279
(cherry picked from commit 99cb948eac)
before this change, we use `round(random.random(), 5)` for
the value of `bloom_filter_fp_chance` config option. there are
chances that this expression could return a number lower than or equal
to 6.71e-05.
but we do have a minimum for this option, which is defined by
`utils::bloom_calculations::probs`, and the minimum false positive
rate is 6.71e-05.
we are observing test failures where we are using 0 for
the option, and scylla rightly rejected it with the error message of
```
bloom_filter_fp_chance must be larger than 6.71e-05 and less than or equal to 1.0 (got 0)
```
so, in this change, to address the test failure, we always use a number
greater than or equal to a value slightly above the minimum, to ensure
that the randomly picked number is in the range of supported false
positive rates.
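A sketch of the clamping idea in C++ (the original fix lives in a Python test; the rounding model and the exact bump value here are assumptions): round the random value as before, then bump anything at or below the 6.71e-05 minimum to a safely accepted value.

```cpp
#include <cassert>

// r in [0, 1), e.g. from random.random() in the original test.
double pick_fp_chance(double r) {
    constexpr double min_supported = 6.71e-05;  // minimum in utils::bloom_calculations::probs
    // round(r, 5) equivalent via scaled round-half-up.
    double rounded = static_cast<double>(static_cast<long long>(r * 1e5 + 0.5)) / 1e5;
    // the option must be strictly larger than the minimum, so bump anything
    // at or below it (the exact bump value is an assumption).
    return rounded <= min_supported ? 1e-4 : rounded;
}
```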
Fixes#13313
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#13314
(cherry picked from commit 33f4012eeb)
Otherwise the null pointer is dereferenced.
Add a unit test reproducing the issue
and testing this fix.
Fixes#13636
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 12877ad026)
The tombstone_gc was documented as experimental in version 5.0.
It is no longer experimental in version 5.2.
This commit updates the information about the option.
Closes#13469
(cherry picked from commit a68b976c91)
in `make_group0_history_state_id_mutation`, when adding a new entry to
the group 0 history table, if the parameter `gc_older_than` is engaged,
we create a range tombstone in the mutation which deletes entries older
than the new one by `gc_older_than`. In particular if
`gc_older_than = 0`, we want to delete all older entries.
There was a subtle bug there: we were using millisecond resolution when
generating the tombstone, while the provided state IDs used microsecond
resolution. On a super fast machine it could happen that we managed to
perform two schema changes in a single millisecond; this happened
sometimes in `group0_test.test_group0_history_clearing_old_entries`
on our new CI/promotion machines, causing the test to fail because the
tombstone didn't clear the entry corresponding to the previous schema
change when performing the next schema change (since they happened in
the same millisecond).
Use microsecond resolution to fix that. The consecutive state IDs used
in group 0 mutations are guaranteed to be strictly monotonic at
microsecond resolution (see `generate_group0_state_id` in
service/raft/raft_group0_client.cc).
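A minimal model of the resolution bug (illustrative names; timestamps are microseconds since epoch): truncating the tombstone's timestamp to milliseconds makes two state IDs from the same millisecond indistinguishable, so the tombstone fails to cover the earlier entry; keeping microsecond resolution preserves their order.

```cpp
#include <cassert>
#include <cstdint>

int64_t tombstone_ts_millis(int64_t state_id_us) {
    return (state_id_us / 1000) * 1000;  // buggy: truncates to milliseconds
}

int64_t tombstone_ts_micros(int64_t state_id_us) {
    return state_id_us;  // fixed: keep full microsecond resolution
}

bool covers(int64_t entry_ts_us, int64_t tombstone_ts_us) {
    return entry_ts_us < tombstone_ts_us;  // tombstone deletes strictly older entries
}
```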
Fixes#13594
Closes#13604
* github.com:scylladb/scylladb:
db: system_keyspace: use microsecond resolution for group0_history range tombstone
utils: UUID_gen: accept decimicroseconds in min_time_UUID
(cherry picked from commit 10c1f1dc80)
This patch backports https://github.com/scylladb/scylladb/pull/12710 to branch-5.2. To resolve the conflicts that it's causing, it also includes
* https://github.com/scylladb/scylladb/pull/12680
* https://github.com/scylladb/scylladb/pull/12681
Closes#13542
* github.com:scylladb/scylladb:
uda: change the UDF used in a UDA if it's replaced
functions: add helper same_signature method
uda: return aggregate functions as shared pointers
udf: also check reducefunc to confirm that a UDF is not used in a UDA
udf: fix dropping UDFs that share names with other UDFs used in UDAs
pytest: add optional argument for new_function argument types
udt: disallow dropping a user type used in a user function
A problem in compaction reevaluation can cause the SSTable set to be left uncompacted for an indefinite amount of time, potentially causing space and read amplification to be suboptimal.
Two reevaluation problems are being fixed, one after off-strategy compaction ended, and another in compaction manager which intends to periodically reevaluate a need for compaction.
Fixes https://github.com/scylladb/scylladb/issues/13429.
Fixes https://github.com/scylladb/scylladb/issues/13430.
Closes#13431
* github.com:scylladb/scylladb:
compaction: Make compaction reevaluation actually periodic
replica: Reevaluate regular compaction on off-strategy completion
(cherry picked from commit 9a02315c6b)
The purpose of `_stop` is to remember whether the consumption of the
last partition was interrupted or it was consumed fully. In the former
case, the compactor allows retrieving the compaction state for the given
partition, so that its compaction can be resumed at a later point in
time.
Currently, `_stop` is set to `stop_iteration::yes` whenever the return
value of any of the `consume()` methods is also `stop_iteration::yes`.
Meaning, if the consuming of the partition is interrupted, this is
remembered in `_stop`.
However, a partition whose consumption was interrupted is not always
continued later. Sometimes consumption of a partition is interrupted
because the partition is not interesting and the downstream consumer
wants to stop it. In these cases the compactor should not return an
engaged optional from `detach_state()`, because there is no state to
detach; the state should be thrown away. This was incorrectly handled so
far and is fixed in this patch, by overwriting `_stop` in
`consume_partition_end()` with whatever the downstream consumer returns.
Meaning, if they want to skip the partition, then `_stop` is reset to
`stop_iteration::no` and `detach_state()` will return a disengaged
optional as it should in this case.
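A simplified sketch of the fix (stand-in types, not the real compactor): `_stop` must end up reflecting the downstream verdict from `consume_partition_end()`, not stay sticky from an earlier `consume()` that merely skipped the partition, so `detach_state()` only returns engaged state when resumption is actually intended.

```cpp
#include <cassert>
#include <optional>

enum class stop_iteration { no, yes };

struct compactor_sketch {
    stop_iteration _stop = stop_iteration::no;

    void consume(stop_iteration consumer_result) {
        if (consumer_result == stop_iteration::yes) {
            _stop = stop_iteration::yes;  // interrupted mid-partition
        }
    }
    void consume_partition_end(stop_iteration downstream) {
        _stop = downstream;  // the fix: overwrite with the downstream verdict
    }
    std::optional<int> detach_state() const {
        if (_stop == stop_iteration::yes) {
            return 1;  // stand-in for real detached compaction state
        }
        return std::nullopt;  // partition skipped: state thrown away
    }
};
```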
Fixes: #12629
Closes#13365
(cherry picked from commit bae62f899d)
Currently, if a UDA uses a UDF that's being replaced,
the UDA will still keep using the old UDF until the
node is restarted.
This patch fixes this behavior by checking all UDAs
when replacing a UDF and updating them if necessary.
Fixes#12709
(cherry picked from commit 02bfac0c66)
When deciding whether two functions have the same
signature, we have to check if they have the same name
and parameter types. Additionally, if they're represented
by pointers, we need to check if any of them is a nullptr.
This logic is used multiple times, so it's extracted to
a separate function.
To use this function, the `used_by_user_aggregate` method
now takes a function instead of a name and a list of types - we
can do it because we always use it with an existing user
function (that we're trying to drop).
The method will also be useful when we're not dropping,
but replacing, a user function.
(cherry picked from commit 58987215dc)
We will want to reuse the functions that we get from an aggregate
without making a deep copy, and it's only possible if we get
pointers from the aggregate instead of actual values.
(cherry picked from commit 20069372e7)
When dropping a UDF we check that it's not being used in any UDAs
and fail otherwise. However, we're only checking its state function
and final function, and it may also be used as its reduce function.
This patch adds the missing checks and a test for them.
(cherry picked from commit ef1dac813b)
Currently, when dropping a function, we only check if there exists
an aggregate that uses a function with the same name as its state
function or final function. This may cause the drop to fail even
when it's just another UDF with the same name that's used in the
aggregate, even when the actual dropped function is not used there.
This patch fixes this by checking not only the names of the
UDA's sfunc and finalfunc, but also their argument types.
(cherry picked from commit 49077dd144)
When multiple functions with the same name but different argument types
are created, the default drop statement for these functions will fail
because it does not include the argument types.
With this change, this problem can be worked around by specifying
argument types when creating the function, as this will cause the drop
statement to include them.
(cherry picked from commit 8791b0faf5)
Currently, nothing prevents us from dropping a user type
used in a user function, even though doing so may make us
unable to use the function correctly.
This patch prevents this behavior by checking all function
argument and return types when executing a drop type statement
and preventing it from completing if the type is referenced
by any of them.
(cherry picked from commit 86c61828e6)
Related: https://github.com/scylladb/scylla-enterprise/issues/2794
This commit adds the information about the metric changes
in version 2023.1 compared to version 5.2.
This commit is part of the 5.2-to-2023.1 upgrade guide and
must be backported to branch-5.2.
Closes#13506
(cherry picked from commit 989a75b2f7)
The patch doesn't apply cleanly, so a targeted backport PR was necessary.
I also needed to cherry-pick two patches from https://github.com/scylladb/scylladb/pull/13255 that the backported patch depends on. Decided against backporting the entire https://github.com/scylladb/scylladb/pull/13255 as it is quite an intrusive change.
Fixes: https://github.com/scylladb/scylladb/issues/11803
Closes#13515
* github.com:scylladb/scylladb:
reader_concurrency_semaphore: don't evict inactive readers needlessly
reader_concurrency_semaphore: add stats to record reason for queueing permits
reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
Our documentation states that writing an item with "USING TTL 0" means it
should never expire. This should be true even if the table has a default
TTL. But Scylla mistakenly handled "USING TTL 0" exactly like having no
USING TTL at all (i.e., it took the default TTL, instead of unlimited).
We had two xfailing tests demonstrating that Scylla's behavior in this
is different from Cassandra. Scylla's behavior in this case was also
undocumented.
By the way, Cassandra used to have the same bug (CASSANDRA-11207) but
it was fixed already in 2016 (Cassandra 3.6).
So in this patch we fix Scylla's "USING TTL 0" behavior to match the
documentation and Cassandra's behavior since 2016. One xfailing test
starts to pass and the second test passes this bug and fails on a
different one. This patch also adds a third test for "USING TTL ?"
with UNSET_VALUE - it behaves, on both Scylla and Cassandra, like a
missing "USING TTL".
The origin of this bug was that after parsing the statement, we saved
the USING TTL in an integer, and used 0 for the case of no USING TTL
given. This meant that we couldn't tell if we have USING TTL 0 or
no USING TTL at all. This patch uses an std::optional so we can tell
the case of a missing USING TTL from the case of USING TTL 0.
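The distinction the patch introduces can be sketched in Python terms (the actual fix lives in Scylla's C++ code; the names here are illustrative):

```python
from typing import Optional

def effective_ttl(using_ttl: Optional[int], default_ttl: int) -> int:
    # None models "no USING TTL given": fall back to the table default.
    # 0 models "USING TTL 0": the item should never expire.
    if using_ttl is None:
        return default_ttl
    return using_ttl

# With a plain int and 0 doubling as "not given", these two cases
# collapse into one -- which is exactly the bug described above.
```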
Fixes#6447
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#13079
(cherry picked from commit a4a318f394)
This patch fixes#12475, where an aggregation (e.g., COUNT(*), MIN(v))
of absolutely no partitions (e.g., "WHERE p = null" or "WHERE p in ()")
resulted in an internal error instead of the "zero" result that each
aggregator expects (e.g., 0 for COUNT, null for MIN).
The problem is that normally our aggregator forwarder picks the nodes
which hold the relevant partition(s), forwards the request to each of
them, and then combines these results. When there are no partitions,
the query is sent to no node, and we end up with an empty result set
instead of the "zero" results. So in this patch we recognize this
case and build those "zero" results (as mentioned above, these aren't
always 0 and depend on the aggregation function!).
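A minimal sketch of the idea, with hypothetical per-function identities and combiners (not Scylla's actual forwarder code):

```python
from functools import reduce

# Hypothetical identity ("zero") and combiner per aggregation function.
AGGREGATES = {
    "count": (0, lambda a, b: a + b),
    "min":   (None, lambda a, b: b if a is None else min(a, b)),
}

def combine(func, partial_results):
    zero, op = AGGREGATES[func]
    # When no node was queried (no matching partitions), partial_results
    # is empty and we must fall back to the function's "zero" result
    # instead of producing an internal error.
    return reduce(op, partial_results, zero)
```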
The patch also adds two tests reproducing this issue in a fairly general
way (e.g., several aggregators, different aggregation functions) and
confirming the patch fixes the bug.
The test also includes two additional tests for COUNT aggregation, which
uncovered an incompatibility with Cassandra which is still not fixed -
so these tests are marked "xfail":
Refs #12477: Combining COUNT with GROUP by results with empty results
in Cassandra, and one result with empty count in Scylla.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12715
(cherry picked from commit 3ba011c2be)
The total disk space used metric incorrectly reports the amount of
disk space ever used. It should instead report the size of
all sstables being used + the ones waiting to be deleted.
Live disk space used, by this definition, shouldn't account for the
ones waiting to be deleted,
and live sstable count shouldn't account for sstables waiting to
be deleted.
Fix all that.
Fixes#12717.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 529a1239a9)
Some callees of update_pending_ranges use the variant of get_address_ranges()
which builds a hashmap of all <endpoint, owned range> pairs. For
everywhere_topology, the size of this map is quadratic in the number of
endpoints, making it big enough to cause contiguous allocations of tens of MiB
for clusters of realistic size, potentially causing trouble for the
allocator (as seen e.g. in #12724). This deserves a correction.
This patch removes the quadratic variant of get_address_ranges() and replaces
its uses with its linear counterpart.
Refs #10337
Refs #10817
Refs #10836
Refs #10837
Fixes#12724
(cherry picked from commit 9e57b21e0c)
There was a missing check in validation of named
bind markers.
Let's say that a user prepares a query like:
```cql
INSERT INTO ks.tab (pk, ck, v) VALUES (:pk, :ck, :v)
```
Then they execute the query, but specify only
values for `:pk` and `:ck`.
We should detect that a value for :v is missing
and throw an invalid_request_exception.
Until now there was no such check; in case of a missing variable,
invalid `query_options` were created and Scylla crashed.
Sadly it's impossible to create a regression test
using `cql-pytest` or `boost`.
`cql-pytest` uses the python driver, which silently
ignores missing named bind variables, deciding
that the user meant to send an UNSET_VALUE for them.
When given values like `{'pk': 1, 'ck': 2}`, it will automatically
extend them to `{'pk': 1, 'ck': 2, 'v': UNSET_VALUE}`.
In `boost` I tried to use `cql_test_env`,
but it only has methods which take valid `query_options`
as a parameter. I could create separate unit tests
for the creation and validation of `query_options`,
but they wouldn't be true end-to-end tests like `cql-pytest`.
The bug was found using the rust driver,
the reproducer is available in the issue description.
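The missing validation amounts to something like the following sketch (names are illustrative, not Scylla's actual code):

```python
def check_bound_values(marker_names, supplied_values):
    # Every named bind marker in the prepared statement must be given
    # a value (or an explicit UNSET_VALUE) at execution time; otherwise
    # reject the request instead of building invalid query options.
    missing = [n for n in marker_names if n not in supplied_values]
    if missing:
        raise ValueError(f"missing values for bind markers: {missing}")

check_bound_values(["pk", "ck", "v"], {"pk": 1, "ck": 2, "v": 3})  # ok
```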
Fixes: #12727
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Closes#12730
(cherry picked from commit 2a5ed115ca)
The test test_scan.py::test_scan_long_partition_tombstone_string
checks that a full-table Scan operation ends a page in the middle of
a very long string of partition tombstones, and does NOT scan the
entire table in one page (if we did that, getting a single page could
take an unbounded amount of time).
The test is currently flaky, having failed in CI runs three times in
the past two months.
The reason for the flakiness is that we don't know exactly how long
we need to make the sequence of partition tombstones in the test before
we can be absolutely sure a single page will not read this entire sequence.
For single-partition scans we have the "query_tombstone_page_limit"
configuration parameter, which tells us exactly how long we need to
make the sequence of row tombstones. But for a full-table scan of
partition tombstones, the situation is more complicated - because the
scan is done on several vnodes in parallel and each of
them needs to read query_tombstone_page_limit before it stops.
In my experiments, using query_tombstone_page_limit * 4 consecutive tombstones
was always enough - I ran this test hundreds of times and it didn't fail
once. But since it did fail on Jenkins very rarely (3 times in the last
two months), maybe the multiplier 4 isn't enough. So this patch doubles
it to 8. Hopefully this would be enough for anyone (TM).
This makes this test even bigger and slower than it was. To make it
faster, I changed this test's write isolation mode from the default
always_use_lwt to forbid_rmw (i.e. not using LWT). This keeps the test's
total run time similar to what it was before this patch - around
0.5 seconds in dev build mode on my laptop.
Fixes#12817
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12819
(cherry picked from commit 14cdd034ee)
Inactive readers should only be evicted to free up resources for waiting
readers. Evicting them when waiters are not admitted for any other
reason than resources is wasteful and leads to extra load later on when
these evicted readers have to be recreated and requeued.
This patch changes the logic on both the registering path and the
admission path to not evict inactive readers unless there are readers
actually waiting on resources.
A unit test is also added, reproducing the overly-aggressive eviction and
checking that it doesn't happen anymore.
Fixes: #11803
Closes#13286
(cherry picked from commit bd57471e54)
When diagnosing problems, knowing why permits were queued is very
valuable. Record the reason in new stats counters, one for each reason
a permit can be queued.
(cherry picked from commit 7b701ac52e)
After a failed topology operation, like bootstrap / decommission /
removenode, the cluster might contain a garbage entry in either token
ring or group 0. This entry can be cleaned-up by executing removenode on
any other node, pointing to the node that failed to bootstrap or leave
the cluster.
Document this procedure, including a method of finding the host ID of a
garbage entry.
Add references in other documents.
Fixes: #13122
Closes#13186
(cherry picked from commit c2a2996c2b)
This commit removes the Enterprise upgrade guides from
the Open Source documentation. The Enterprise upgrade guides
should only be available in the Enterprise documentation,
with the source files stored in scylla-enterprise.git.
In addition, this commit:
- adds the links to the Enterprise user guides in the Enterprise
documentation at https://enterprise.docs.scylladb.com/
- adds the redirections for the removed pages to avoid
breaking any links.
This commit must be reverted in scylla-enterprise.git.
(cherry picked from commit 61bc05ae49)
Closes#13473
Related: https://github.com/scylladb/scylla-enterprise/issues/2770
This commit adds the upgrade guide from ScyllaDB Open Source 5.2
to ScyllaDB Enterprise 2023.1.
This commit does not cover metric updates (the metrics file has no
content, which needs to be added in another PR).
As this is an upgrade guide, this commit must be merged to master and
backported to branch-5.2 and branch-2023.1 in scylla-enterprise.git.
Closes#13294
(cherry picked from commit 595325c11b)
Fixes#12810
We did not update total_size_on_disk in commitlog totals when use of o_dsync was off.
This means we essentially ran with no registered footprint, also causing broken comparisons in delete_segments.
Closes#12950
* github.com:scylladb/scylladb:
commitlog: Fix updating of total_size_on_disk on segment alloc when o_dsync is off
commitlog: change type of stored size
(cherry picked from commit e70be47276)
The `database::stop` method sometimes hangs and it's always hard to spot where exactly it gets stuck. A few more logging messages make this much simpler.
refs: #13100
refs: #10941
Closes#13141
* github.com:scylladb/scylladb:
database: Increase verbosity of database::stop() method
large_data_handler: Increase verbosity on shutdown
large_data_handler: Coroutinize .stop() method
(cherry picked from commit e22b27a107)
This reverts commit c6087cf3a0.
Said commit can cause a deadlock when 2 or more repairs compete for
locks on 2 or more nodes. Consider the following scenario:
Node n1 and n2 in the cluster, 1 shard per node, rf = 2, each shard has
1 available unit for the reader lock
n1 starts repair r1
r1-n1 (instance of r1 on node1) takes the reader lock on node1
n2 starts repair r2
r2-n2 (instance of r2 on node2) takes the reader lock on node2
r1-n2 will fail to take the reader lock on node2
r2-n1 will fail to take the reader lock on node1
As a result, r1 and r2 could not make progress and deadlock happens.
The complexity comes from the fact that a repair job needs locks on more
than one node. It is not guaranteed that all the participant nodes can
take the lock in one shot.
There is no simple solution to this, so we have to revert this locking
mechanism and look for another way to prevent reader thrashing when
repairing nodes with mismatching shard counts.
Fixes: #12693
Closes#13266
(cherry picked from commit 7699904c54)
We currently don't clean up the system_distributed.view_build_status
table after removed nodes. This can cause false-positive check for
whether view update generation is needed for streaming.
The proper fix is to clean up this table, but that will be more
involved, and even when done, it might not be immediate. So until then,
and to be on the safe side, filter out entries belonging to unknown
hosts from said table.
Fixes: #11905
Refs: #11836
Closes#11860
(cherry picked from commit 84a69b6adb)
`paxos_response_handler::learn_decision` was calling
`cdc_service::augment_mutation_call` concurrently with
`storage_proxy::mutate_internal`. `augment_mutation_call` was selecting
rows from the base table in order to create the preimage, while
`mutate_internal` was writing rows to the table. It was therefore
possible for the preimage to observe the update that it accompanied,
which doesn't make any sense, because the preimage is supposed to show
the state before the update.
Fix this by performing the operations sequentially. We can still perform
the CDC mutation write concurrently with the base mutation write.
`cdc_with_lwt_test` was sometimes failing in debug mode due to this bug
and was marked flaky. Unmark it.
Fixes#12098
(cherry picked from commit 1ef113691a)
If request processing ended with an error, it is worth
sending the error to the client through
make_error/write_response. Previously in this case we
just wrote a message to the log and didn't handle the
client connection in any way. As a result, the only
thing the client got in this case was timeout error.
A new test_batch_with_error is added. It is quite
difficult to reproduce error condition in a test,
so we use error injection instead. Passing injection_key
in the body of the request ensures that the exception
will be thrown only for this test request and
will not affect other requests that
the driver may send in the background.
Closes: scylladb#12104
(cherry picked from commit a4cf509c3d)
Fixes https://github.com/scylladb/scylladb/issues/13138
Fixes https://github.com/scylladb/scylladb/issues/13153
This PR:
- Fixes outdated information about the recommended OS. Since version 5.2, the recommended OS should be Ubuntu 22.04 because that OS is used for building the ScyllaDB image.
- Adds the OS support information for version 5.2.
This PR (both commits) needs to be backported to branch-5.2.
Closes#13188
* github.com:scylladb/scylladb:
doc: Add OS support for version 5.2
doc: Updates the recommended OS to be Ubuntu 22.04
(cherry picked from commit f4b5679804)
This PR backports 2f4a793457 to branch-5.2. Said patch depends on some other patches that are not part of any release yet.
This PR should apply to 5.1 and 5.0 too.
Closes#13162
* github.com:scylladb/scylladb:
reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
reader_permit: expose operator<<(reader_permit::state)
reader_permit: add get_state() accessor
The series fixes the `make_nonforwardable` reader, it shouldn't emit `partition_end` for previous partition after `next_partition()` and `fast_forward_to()`
Fixes: #12249
Closes#12978
* github.com:scylladb/scylladb:
flat_mutation_reader_test: cleanup, seastar::async -> SEASTAR_THREAD_TEST_CASE
make_nonforwardable: test through run_mutation_source_tests
make_nonforwardable: next_partition and fast_forward_to when single_partition is true
make_forwardable: fix next_partition
flat_mutation_reader_v2: drop forward_buffer_to
nonforwardable reader: fix indentation
nonforwardable reader: refactor, extract reset_partition
nonforwardable reader: add more tests
nonforwardable reader: no partition_end after fast_forward_to()
nonforwardable reader: no partition_end after next_partition()
nonforwardable reader: no partition_end for empty reader
row_cache: pass partition_start though nonforwardable reader
(cherry picked from commit 46efdfa1a1)
Instead of open-coding the same, in an incomplete way.
clear_inactive_reads() does incomplete eviction in several ways:
* it doesn't decrement _stats.inactive_reads
* it doesn't set the permit to evicted state
* it doesn't cancel the ttl timer (if any)
* it doesn't call the eviction notifier on the permit (if there is one)
The list goes on. We already have an evict() method that does all this
correctly; use that instead of the current badly open-coded alternative.
This patch also enhances the existing test for clear_inactive_reads()
and adds a new one specifically for `stop()` being called while having
inactive reads.
Fixes: #13048
Closes#13049
(cherry picked from commit 2f4a793457)
There was a bug in `expr::search_and_replace`.
It doesn't preserve the `order` field of binary_operator.
`order` field is used to mark relations created
using the SCYLLA_CLUSTERING_BOUND.
It is a CQL feature used for internal queries inside Scylla.
It means that we should handle the restriction as a raw
clustering bound, not as an expression in the CQL language.
Losing the SCYLLA_CLUSTERING_BOUND marker could cause issues,
the database could end up selecting the wrong clustering ranges.
Fixes: #13055
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Closes#13056
(cherry picked from commit aa604bd935)
EOF is only guaranteed to be set if one tried to read past the end of the
file. So when checking for EOF, also try to read some more. This
should force the EOF flag into a correct value. We can then check that
the read yielded 0 bytes.
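The same pattern in Python terms (an illustrative sketch, not the seastar stream API): only a read that returns zero bytes proves the stream is at EOF.

```python
import tempfile

with tempfile.TemporaryFile() as f:
    f.write(b"abc")
    f.seek(0)
    data = f.read(3)   # consumed the whole file, but EOF is not proven yet
    tail = f.read(1)   # reading past the end yields 0 bytes -- now it is
    assert data == b"abc"
    assert tail == b""
```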
This should ensure that `validate_checksums()` will not falsely declare
the validation to have failed.
Fixes: #11190
Closes#12696
(cherry picked from commit 693c22595a)
This commit makes the following changes to the docs landing page:
- Adds the ScyllaDB enterprise docs as one of three tiles.
- Modifies the three tiles to reflect the three flavors of ScyllaDB.
- Moves the "New to ScyllaDB? Start here!" under the page title.
- Renames "Our Products" to "Other Products" to list the products other
than ScyllaDB itself. In addition, the boxes are enlarged to
large-4 to look better.
The major purpose of this commit is to expose the ScyllaDB
documentation.
docs: fix the link
(cherry picked from commit 27bb8c2302)
Closes#13086
This PR adds a note to the Alternator TTL section to specify in which Open Source and Enterprise versions the feature was promoted from experimental to non-experimental.
The challenge here is that OSS and Enterprise are (still) **documented together**, but they're **not in sync** in promoting the TTL feature: it's still experimental in 5.1 (released) but no longer experimental in 2022.2 (to be released soon).
We can take one of the following approaches:
a) Merge this PR with master and ask the 2022.2 users to refer to master.
b) Merge this PR with master and then backport to branch-5.1. If we choose this approach, it is necessary to backport https://github.com/scylladb/scylladb/pull/11997 beforehand to avoid conflicts.
I'd opt for a) because it makes more sense from the OSS perspective and helps us avoid mess and backporting.
Closes#12295
* github.com:scylladb/scylladb:
doc: fix the version in the comment on removing the note
doc: specify the versions where Alternator TTL is no longer experimental
(cherry picked from commit d5dee43be7)
It's known that reading large cells in reverse causes large allocations.
Source: https://github.com/scylladb/scylladb/issues/11642
The loading is preliminary work for splitting large partitions into
fragments composing a run, to later be able to read such a run
efficiently using the position metadata.
The splitting is not turned on yet, anywhere. Therefore, we can
temporarily disable the loading, as a way to avoid regressions in
stable versions. Large allocations can cause stalls due to foreground
memory eviction kicking in.
The default values for position metadata say that first and last
position include all clustering rows, but they aren't used anywhere
other than by sstable_run to determine if a run is disjoint at
clustering level, but given that no splitting is done yet, it
does not really matter.
Unit tests relying on position metadata were adjusted to enable
the loading, such that they can still pass.
Fixes#11642.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12979
(cherry picked from commit d73ffe7220)
Currently all consumed range tombstone changes are unconditionally
forwarded to the validator, even if they are shadowed by a higher level
tombstone and/or purgeable. This can result in a situation where a range
tombstone change was seen by the validator but not passed to the
consumer. The validator expects the range tombstone change to be closed
by end-of-partition but the end fragment won't come as the tombstone was
dropped, resulting in a false-positive validation failure.
Fix by only passing tombstones to the validator that are actually
passed to the consumer too.
Fixes: #12575
Closes#12578
(cherry picked from commit e2c9cdb576)
Check the first fragment before dereferencing it, the fragment might be
empty, in which case move to the next one.
Found by running range scan tests with random schema and random data.
Fixes: #12821
Fixes: #12823
Fixes: #12708
Closes#12824
(cherry picked from commit ef548e654d)
After 5badf20c7a applier fiber does not
stop after it gets abort error from a state machine which may trigger an
assertion because previous batch is not applied. Fix it.
Fixes#12863
(cherry picked from commit 9bdef9158e)
We should never return a reference to a local variable,
so in this change a reference to a static variable is returned
instead. This should address the following warning from Clang 17:
```
/home/kefu/dev/scylladb/tools/schema_loader.cc:146:16: error: returning reference to local temporary object [-Werror,-Wreturn-stack-address]
return {};
^~
```
Fixes#12875
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#12876
(cherry picked from commit 6eab8720c4)
We currently configure only TimeoutStartSec, but probably it's not
enough to prevent coredump timeout, since TimeoutStartSec is maximum
waiting time for service startup, and there is another directive to
specify maximum service running time (RuntimeMaxSec).
To fix the problem, we should specify RuntimeMaxSec and TimeoutSec (it
configures both TimeoutStartSec and TimeoutStopSec).
Fixes#5430
Closes#12757
(cherry picked from commit bf27fdeaa2)
Related https://github.com/scylladb/scylladb/issues/12658.
This PR fixes the bug in the upgrade guides for the released versions.
Closes#12679
* github.com:scylladb/scylladb:
doc: fix the service name in the upgrade guide for patch releases versions 2022
doc: fix the service name in the upgrade guide from 2021.1 to 2022.1
(cherry picked from commit 325246ab2a)
Both patches are important to fix inefficiencies when updating the backlog tracker, which can manifest as a reactor stall, on a special event like schema change.
No conflicts when backporting.
Regression since 1d9f53c881, which is present in branch 5.1 onwards.
Closes#12851
* github.com:scylladb/scylladb:
compaction: Fix inefficiency when updating LCS backlog tracker
table: Fix quadratic behavior when inserting sstables into tracker on schema change
LCS backlog tracker uses STCS tracker for L0. Turns out LCS tracker
is calling STCS tracker's replace_sstables() with empty arguments
even when higher levels (> 0) *only* had sstables replaced.
This unnecessary call to STCS tracker will cause it to recompute
the L0 backlog, yielding the same value as before.
As LCS has a fragment size of 0.16G on higher levels, we may be
updating the tracker multiple times during incremental compaction,
which operates on SSTables on higher levels.
Inefficiency is fixed by only updating the STCS tracker if any
L0 sstable is being added or removed from the table.
This may be fixing a quadratic behavior during boot or refresh,
as new sstables are loaded one by one.
Higher levels have a substantially higher number of sstables,
so updating the STCS tracker only when level 0 changes significantly
reduces the number of times the L0 backlog is recomputed.
Refs #12499.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12676
(cherry picked from commit 1b2140e416)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Each time the backlog tracker is informed about a new or old sstable, it
recomputes the static part of the backlog, whose complexity is
proportional to the total number of sstables.
On schema change, we're calling backlog_tracker::replace_sstables()
for each existing sstable, therefore it produces O(N ^ 2) complexity.
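The cost difference can be illustrated with a toy model (numbers are illustrative only, not Scylla's actual accounting):

```python
# Adding N sstables one by one, with a full static-backlog recompute
# after each add, touches 1 + 2 + ... + N sstables: O(N^2) work overall.
def one_by_one_cost(n):
    return sum(i for i in range(1, n + 1))

# A single recompute over the final set touches each sstable once: O(N).
def batched_cost(n):
    return n

assert one_by_one_cost(1000) == 500500
assert batched_cost(1000) == 1000
```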
Fixes#12499.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12593
(cherry picked from commit 87ee547120)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The "cluster manager" used by the topology test suite uses a UNIX-domain
socket to communicate between the cluster manager and the individual tests.
The socket is currently located in the test directory but there is a
problem: In Linux the length of the path used as a UNIX-domain socket
address is limited to just a little over 100 bytes. In Jenkins run, the
test directory names are very long, and we sometimes go over this length
limit and the result is that test.py fails creating this socket.
In this patch we simply put the socket in /tmp instead of the test
directory. We only need to do this change in one place - the cluster
manager, as it already passes the socket path to the individual tests
(using the "--manager-api" option).
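The limit is easy to demonstrate from Python (on Linux, sun_path in sockaddr_un is limited to roughly 108 bytes):

```python
import socket

# Binding a UNIX-domain socket to a path longer than sun_path allows
# fails immediately with "AF_UNIX path too long".
long_path = "/tmp/" + "x" * 200
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
    s.bind(long_path)
    bound = True
except OSError:
    bound = False
finally:
    s.close()
assert not bound
```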
Tested by cloning Scylla in a very long directory name.
A test like ./test.py --mode=dev test_concurrent_schema fails before
this patch, and passes with it.
Fixes#12622Closes#12678
(cherry picked from commit 681a066923)
`ScyllaClusterManager` is used to run a sequence of test cases from
a single test file. Between two consecutive tests, if the previous test
left the cluster 'dirty', meaning the cluster cannot be reused, it would
free up space in the pool (using `steal`), stop the cluster, then get a
new cluster from the pool.
Between the `steal` and the `get`, a concurrent test run (with its own
instance of `ScyllaClusterManager`) would start, because there was free
space in the pool.
This resulted in undesirable behavior when we ran tests with
`--repeat X` for a large `X`: we would start with e.g. 4 concurrent
runs of a test file, because the pool size was 4. As soon as one of the
runs freed up space in the pool, we would start another concurrent run.
Soon we'd end up with 8 concurrent runs. Then 16 concurrent runs. And so
on. We would have a large number of concurrent runs, even though the
original 4 runs didn't finish yet. All of these concurrent runs would
compete waiting on the pool, and waiting for space in the pool would
take longer and longer (the duration is linear w.r.t number of
concurrent competing runs). Tests would then time out because they would
have to wait too long.
Fix that by using the new `replace_dirty` function introduced to the
pool. This function frees up space by returning a dirty cluster and then
immediately takes it away to be used for a new cluster. Thanks to this,
we will only have at most as many concurrent runs as the pool size. For
example with --repeat 8 and pool size 4, we would run 4 concurrent runs
and start the 5th run only when one of the original 4 runs finishes,
then the 6th run when a second run finishes and so on.
The fix is preceded by a refactor that replaces `steal` with `put(is_dirty=True)`
and a `destroy` function passed to the pool (now the pool is responsible
for stopping the cluster and releasing its IPs).
Fixes#11757
Closes#12549
* github.com:scylladb/scylladb:
test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests
test/pylib: pool: introduce `replace_dirty`
test/pylib: pool: replace `steal` with `put(is_dirty=True)`
(cherry picked from commit 132af20057)
From reviews of https://github.com/scylladb/scylladb/pull/12569, avoid
using `async with` and access the `Pool` of clusters with
`get()`/`put()`.
Closes#12612
* github.com:scylladb/scylladb:
test.py: manual cluster handling for PythonSuite
test.py: stop cluster if PythonSuite fails to start
test.py: minor fix for failed PythonSuite test
(cherry picked from commit 5bc7f0732e)
If the after-test check fails (is_after_test_ok is False), discard the cluster and raise an exception so the context manager (pool) does not recycle it.
Ignore the exception re-raised by the context manager.
Fixes#12360
Closes#12569
* github.com:scylladb/scylladb:
test.py: handle broken clusters for Python suite
test.py: Pool discard method
(cherry picked from commit 54f174a1f4)
`ScyllaCluster.server_stop` had this piece of code:
```
server = self.running.pop(server_id)
if gracefully:
await server.stop_gracefully()
else:
await server.stop()
self.stopped[server_id] = server
```
We observed `stop_gracefully()` failing due to a server hanging during
shutdown. We then ended up in a state where neither `self.running` nor
`self.stopped` had this server. Later, when releasing the cluster and
its IPs, we would release that server's IP - but the server might have
still been running (all servers in `self.running` are killed before
releasing IPs, but this one wasn't in `self.running`).
Fix this by popping the server from `self.running` only after
`stop_gracefully`/`stop` finishes.
Make an analogous fix in `server_start`: put `server` into
`self.running` *before* we actually start it. If the start fails, the
server will be considered "running" even though it isn't necessarily,
but that is OK - if it isn't running, then trying to stop it later will
simply do nothing; if it is actually running, we will kill it (which we
should do) when clearing after the cluster; and we don't leak it.
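A minimal sketch of the fixed ordering (illustrative classes, not the actual test/pylib code):

```python
import asyncio

class DummyServer:
    def __init__(self, fail=False):
        self.fail = fail
    async def stop_gracefully(self):
        if self.fail:
            raise RuntimeError("hung during shutdown")

class ClusterSketch:
    def __init__(self):
        self.running = {}
        self.stopped = {}
    async def server_stop(self, server_id):
        server = self.running[server_id]   # look up, but don't pop yet
        await server.stop_gracefully()     # may raise
        # Only after a successful stop does the server leave `running`,
        # so a failed stop can't leak a possibly-still-running server.
        self.stopped[server_id] = self.running.pop(server_id)

c = ClusterSketch()
c.running[1] = DummyServer(fail=True)
try:
    asyncio.run(c.server_stop(1))
except RuntimeError:
    pass
# The server stayed in `running`, so cleanup will still kill it.
assert 1 in c.running and 1 not in c.stopped
```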
Closes#12613
(cherry picked from commit a0ff33e777)
Don't use a range scan, which is very inefficient, to perform a query for checking CQL availability.
Improve logging when waiting for server startup times out. Provide details about the failure: whether we managed to obtain the Host ID of the server and whether we managed to establish a CQL connection.
Closes#12588
* github.com:scylladb/scylladb:
test/pylib: scylla_cluster: better logging for timeout on server startup
test/pylib: scylla_cluster: use less expensive query to check for CQL availability
(cherry picked from commit ccc2c6b5dd)
If an endpoint handler throws an exception, the details of the exception
are not returned to the client. Normally this is desirable so that
information is not leaked, but in this test framework we do want to
return the details to the client so it can log a useful error message.
Do it by wrapping every handler into a catch clause that returns
the exception message.
Also modify a bit how HTTPErrors are rendered so it's easier to discern
the actual body of the error from other details (such as the params used
to make the request etc.)
Before:
```
E test.pylib.rest_client.HTTPError: HTTP error 500: 500 Internal Server Error
E
E Server got itself in trouble, params None, json None, uri http+unix://api/cluster/before-test/test_stuff
```
After:
```
E test.pylib.rest_client.HTTPError: HTTP error 500, uri: http+unix://api/cluster/before-test/test_stuff, params: None, json: None, body:
E Failed to start server at host 127.155.129.1.
E Check the log files:
E /home/kbraun/dev/scylladb/testlog/test.py.dev.log
E /home/kbraun/dev/scylladb/testlog/dev/scylla-1.log
```
Closes#12563
(cherry picked from commit 2f84e820fd)
When we obtained a new cluster for a test case after the previous test
case left a dirty cluster, we would release the old cluster's used IP
addresses (`_before_test` function). However, we would not release the
last cluster's IP after the last test case. We would run out of IPs with
sufficiently many test files or `--repeat` runs. Fix this.
Also reorder the operations a bit: stop the cluster (and release its
IPs) before freeing up space in the cluster pool (i.e. call
`self.cluster.stop()` before `self.clusters.steal()`). This reduces
concurrency a bit - fewer Scyllas running at the same time, which is
good (the pool size gives a limit on the desired max number of
concurrently running clusters). Killing a cluster is quick so it won't
make a significant difference for the next guy waiting on the pool.
Closes#12564
(cherry picked from commit 3ed3966f13)
If a cluster fails to boot, it saves the exception in
`self.start_exception` variable; the exception will be rethrown when
a test tries to start using this cluster. As explained in `before_test`:
```
def before_test(self, name) -> None:
"""Check that the cluster is ready for a test. If
there was a start error, throw it here - the server is
running when it's added to the pool, which can't be attributed
to any specific test, throwing it here would stop a specific
test."""
```
It's arguable whether we should blame some random test for a failure
that it didn't cause, but nevertheless, there's a problem here: the
`start_exception` will be rethrown and the test will fail, but then the
cluster will be simply returned to the pool and the next test will
attempt to use it... and so on.
Prevent this by marking the cluster as dirty the first time we rethrow
the exception.
Closes#12560
(cherry picked from commit 147dd73996)
Commitlog O_DSYNC is intended to make Raft and schema writes durable
in the face of power loss. To make O_DSYNC performant, we preallocate
the commitlog segments, so that the commitlog writes only change file
data and not file metadata (which would require the filesystem to commit
its own log).
However, in tests, this causes each ScyllaDB instance to write 384MB
of commitlog segments. This overloads the disks and slows everything
down.
Fix this by disabling O_DSYNC (and therefore preallocation) during
the tests. They can't survive power loss, and run with
--unsafe-bypass-fsync anyway.
Closes#12542
(cherry picked from commit 9029b8dead)
There was a small chance that we called `timeout_src.request_abort()`
twice in the `with_timeout` function, first by timeout and then by
shutdown. `abort_source` fails on an assertion in this case. Fix this.
Fixes: #12512
Closes#12514
(cherry picked from commit 54170749b8)
before this change, we returned the total memory managed by Seastar
in the "total" field in system.memory. but this value only reflects
the total memory managed by Seastar's allocator. if
`reserve_additional_memory` is set when starting app_template,
Seastar's memory subsystem just reserves a chunk of memory of this
specified size for the system, and takes the remaining memory. since
f05d612da8, we set this value to 50MB for the wasmtime runtime. hence
the test of `TestRuntimeInfoTable.test_default_content` in dtest
fails. the test expects the size passed via the `--memory` option
to be identical to the value reported by system.memory's
"total" field.
after this change, the "total" field takes the memory reserved
for wasm udf into account. the "total" field should reflect the total
size of memory used by Scylla, no matter how a certain portion
of the allocated memory is used.
Fixes#12522
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#12573
(cherry picked from commit 4a0134a097)
Currently reverse types match the default case (false), even though they
might be wrapping a tuple type. One user-visible effect of this is that
a schema that has a reversed<frozen<UDT>> clustering key component
will have this component incorrectly represented in the schema cql dump:
the UDT will lose the frozen attribute. When attempting to recreate
this schema based on the dump, it will fail, as only frozen UDTs are
allowed in primary key components.
Fixes: #12576
Closes#12579
(cherry picked from commit ebc100f74f)
Fixes#12601 (maybe?)
Sort the set of tables on ID. This should ensure we never
generate duplicates in a paged listing here. It can obviously miss tables
that are added between paged calls and end up with a "smaller" UUID/ARN, but
that is to be expected.
(cherry picked from commit da8adb4d26)
Since we're potentially searching the row_lock in parallel to acquiring
the read_lock on the partition, we're racing with row_locker::unlock
that may erase the _row_locks entry for the same clustering key, since
there is no lock to protect it up until the partition lock has been
acquired and the lock_partition future is resolved.
This change moves the code to search for or allocate the row lock
_after_ the partition lock has been acquired to make sure we're
synchronously starting the read/write lock function on it, without
yielding, to prevent this use-after-free.
This adds an allocation for copying the clustering key in advance
even if a row_lock entry already exists, which wasn't needed before.
It only slows us down (a bit) when there is contention and the lock
already existed when we want to go locking. In the fast path there
is no contention, and the code already had to create the lock
and copy the key. In any case, the penalty of copying the key once
is tiny compared to the rest of the work that view updates are doing.
This is required on top of 5007ded2c1 as
seen in https://github.com/scylladb/scylladb/issues/12632
which is closely related to #12168 but demonstrates a different race
causing use-after-free.
Fixes#12632
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4b5e324ecb)
before this change, we constructed an sstring from a comma expression,
which evaluates to the return value of `name.size()`, but what we
expected was a call to `sstring(const char*, size_t)`.
in this change
* both the string_view's address and its size are passed,
instead of only its size
* `std::string_view` is constructed instead of sstring, for better
performance, as we don't need to perform a deep copy
the issue is reported by GCC-13:
```
In file included from cql3/selection/selectable.cc:11:
cql3/selection/field_selector.hh:83:60: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result]
auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
^~~~~~~~~~
```
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#12666
(cherry picked from commit 186ceea009)
Fixes#12739.
Currently, segment file removal first calls `f.remove_file()` and
does `total_size_on_disk -= f.known_size()` later.
However, `remove_file()` resets `known_size` to 0, so in effect
the freed space is not accounted for.
`total_size_on_disk` is not just a metric. It is also responsible
for deciding whether a segment should be recycled -- it is recycled
only if `total_size_on_disk - known_size < max_disk_size`.
Therefore this bug has dire performance consequences:
if `total_size_on_disk - known_size` ever exceeds `max_disk_size`,
the recycling of commitlog segments will stop permanently, because
`total_size_on_disk - known_size` will never go back below
`max_disk_size` due to the accounting bug. All new segments from this
point will be allocated from scratch.
The bug was uncovered by a QA performance test. It isn't easy to trigger --
it took the test 7 hours of constant high load to step into it.
However, the fact that the effect is permanent, and degrades the
performance of the cluster silently, makes the bug potentially quite severe.
The bug can be easily spotted with Prometheus as infinitely rising
`commitlog_total_size_on_disk` on the affected shards.
Fixes#12645
Closes#12646
(cherry picked from commit fa7e904cd6)
New clusters that use a fresh conf/scylla.yaml will have `consistent_cluster_management: true`, which will enable Raft, unless the user explicitly turns it off before booting the cluster.
People using existing yaml files will continue without Raft, unless consistent_cluster_management is explicitly requested during/after upgrade.
Also update the docs: cluster creation and node addition procedures.
Fixes#12572.
Closes#12585
* github.com:scylladb/scylladb:
docs: mention `consistent_cluster_management` for creating cluster and adding node procedures
conf: enable `consistent_cluster_management` by default
(cherry picked from commit 5c886e59de)
Make it so that failures in `removenode`/`decommission` don't lead to reduced availability, and any leftovers in group 0 can be removed by `removenode`:
- In `removenode`, make the node a non-voter before removing it from the token ring. This removes the possibility of having a group 0 voting member which doesn't correspond to a token ring member. We can still be left with a non-voter, but that doesn't reduce the availability of group 0.
- As above but for `decommission`.
- Make it possible to remove group 0 members that don't correspond to token ring members from group 0 using `removenode`.
- Add an API to query the current group 0 configuration.
Fixes#11723.
Closes#12502
* github.com:scylladb/scylladb:
test: test_topology: test for removing garbage group 0 members
test/pylib: move some utility functions to util.py
db: system_keyspace: add a virtual table with raft configuration
db: system_keyspace: improve system.raft_snapshot_config schema
service: storage_service: better error handling in `decommission`
service: storage_service: fix indentation in removenode
service: storage_service: make `removenode` work for group 0 members which are not token ring members
service/raft: raft_group0: perform read_barrier in wait_for_raft
service: storage_service: make leaving node a non-voter before removing it from group 0 in decommission/removenode
test: test_raft_upgrade: remove test_raft_upgrade_with_node_remove
service/raft: raft_group0: link to Raft docs where appropriate
service/raft: raft_group0: more logging
service/raft: raft_group0: separate function for checking and waiting for Raft
This test is frequently failing due to a timeout when we try to restart
one of the nodes. The shutdown procedure apparently hangs when we try to
stop the `hints_manager` service, e.g.:
```
INFO 2023-01-13 03:18:02,946 [shard 0] hints_manager - Asked to stop
INFO 2023-01-13 03:18:02,946 [shard 0] hints_manager - Stopped
INFO 2023-01-13 03:18:02,946 [shard 0] hints_manager - Asked to stop
INFO 2023-01-13 03:18:02,946 [shard 1] hints_manager - Asked to stop
INFO 2023-01-13 03:18:02,946 [shard 1] hints_manager - Stopped
INFO 2023-01-13 03:18:02,946 [shard 1] hints_manager - Asked to stop
INFO 2023-01-13 03:18:02,946 [shard 1] hints_manager - Stopped
INFO 2023-01-13 03:22:56,997 [shard 0] hints_manager - Stopped
```
Observe the 5-minute delay at the end.
There is a known issue about `hints_manager` stop hanging: #8079.
Now, for some reason, this is the only test case that is hitting this
issue. We don't completely understand why. There is one significant
difference between this test case and others: this is the only test case
which kills 2 (out of 3) servers in the cluster and then tries to
gracefully shutdown the last server. There's a hypothesis that the last
server gets stuck trying to send hints to the killed servers. We weren't
able to prove/falsify it yet. But if it's true, then this patch will:
- unblock next promotions,
- give us some important information when we see that the issue stops
appearing.
In the patch we shutdown all servers gracefully instead of killing them,
like we do in the other test cases.
Closes#12548
The docs mention that method, but it doesn't exist. Instead, the
state_machine interface defines plain .apply() one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12541
Add a new virtual table `system.raft_state` that shows the currently
operating Raft configuration for each present group. The schema is the
same as `system.raft_snapshot_config` (the latter shows the config from
the last snapshot). In the future we plan to add more columns to this
table, showing more information (like the current leader and term),
hence the generic name.
Adding the table requires some plumbing of
`sharded<raft_group_registry>&` through function parameters to make it
accessible from `register_virtual_tables`, but it's mostly
straightforward.
Also added some APIs to `raft_group_registry` to list all groups and
find a given group (returning `nullptr` if one isn't found, not throwing
an exception).
Remove the `ip_addr` column which was not used. IP addresses are not
part of Raft configuration now and they can change dynamically.
Swap the `server_id` and `disposition` columns in the clustering key, so
when querying the configuration, we first obtain all servers with the
current disposition and then all servers with the previous disposition
(note that a server may appear both in current and previous).
Improve the error handling in `decommission` in case `leave_group0`
fails, informing the user what they should do (i.e. call `removenode` to
get rid of the group 0 member), and allowing decommission to finish; it
does not make sense to let the node continue to run after it leaves the
token ring. (And I'm guessing it's also not safe. Or maybe impossible.)
Due to failures we might end up in a situation where we have a group 0
member which is not a token ring member: a decommission/removenode
which failed after leaving/removing a node from the token ring but
before leaving / removing a node from group 0.
There was no way to get rid of such a group 0 member. A node that left
the token ring must not be allowed to run further (or it can cause data
loss, data resurrection and maybe other fun stuff), so we can't run
decommission a second time (even if we tried, it would just say that
"we're not a member of the token ring" and abort). And `removenode`
would also not work, because it proceeds only if the node requested to
be removed is a member of the token ring.
We modify `removenode` so it can run in this situation and remove the
group 0 member. The parts of `removenode` related to token ring
modification are now conditioned on whether the node was a member of the
token ring. The final `remove_from_group0` step is in its own branch. Some
minor refactors were necessary. Some log messages were also modified so
it's easier to understand which messages correspond to the "token movement"
part of the procedure.
The `make_nonvoter` step happens only if token ring removal happens,
otherwise we can skip directly to `remove_from_group0`.
We also move `remove_from_group0` outside the "try...catch",
fixing #11723. The "node ops" part of the procedure is related strictly
to token ring movement, so it makes sense for `remove_from_group0` to
happen outside.
Indentation is broken in this commit for easier reviewability, fixed in
the following commit.
Fixes: #11723
Right now wait_for_raft is called before performing group 0
configuration changes. We want to also call it before checking for
membership, for that it's desirable to have the most recent information,
hence call read_barrier. In the existing use cases it's not strictly
necessary, but it doesn't hurt.
removenode currently works roughly like this:
1. stream/repair data so it ends up on new replica sets (calculated
without the node we want to remove)
2. remove the node from the token ring
3. remove the node from group 0 configuration.
If the procedure fails after step 2 but before step 3 finishes,
we're in trouble: the cluster is left with an additional voting group 0
member, which reduces group 0's availability, and there is no way to
remove this member because `removenode` no longer considers it to be
part of the cluster (it consults the token ring to decide).
Improve this failure scenario by including a new step at the beginning:
make the node a non-voter in group 0 configuration. Then, even if we
fail after removing the node from the token ring but before removing it
from group 0, we'll only be left with a non-voter which doesn't reduce
availability.
We make a similar change for `decommission`: between `unbootstrap()` (which
streams data) and `leave_ring()` (which removes our tokens from the
ring), become a non-voter. The difference here is that we don't become a
non-voter at the beginning, but only after streaming/repair. In
`removenode` it's desirable to make the node a non-voter as soon as
possible because it's already dead. In decommission it may be desirable
for us to remain a voter if we fail during streaming because we're still
alive and functional in that case.
In a later commit we'll also make it possible to retry `removenode` to
remove a node that is only a group 0 member and not a token ring member.
The test would create a scenario where one node was down while the others
started the Raft upgrade procedure. The procedure would get stuck, but
it was possible to `removenode` the downed node using one of the alive
nodes, which would unblock the Raft upgrade procedure.
This worked because:
1. the upgrade procedure starts by ensuring that all peers can be
contacted,
2. `removenode` starts by removing the node from the token ring.
After removing the node from the token ring, the upgrade procedure
becomes able to contact all peers (the peers set no longer contains the
down node). At the end, after removing the node from the token ring,
`removenode` would actually get stuck for a while, waiting for the
upgrade procedure to finish before removing the peer from group 0.
After the upgrade procedure finished, `removenode` would also finish.
(so: first the upgrade procedure waited for removenode, then removenode
waited for the upgrade procedure).
We want to modify the `removenode` procedure and include a new step
before removing the node from the token ring: making the node a
non-voter. The purpose is to improve the possible failure scenarios.
Previously, if the `removenode` procedure failed after removing the node
from the token ring but before removing it from group 0, the cluster
would contain a 'garbage' group 0 member which is a voter - reducing
group 0's availability. If the node is made a non-voter first, then this
failure will not be as big of a problem, because the leftover group 0
member will be a non-voter.
However, to correctly perform group 0 operations including making
someone a nonvoter, we must first wait for the Raft upgrade procedure to
finish (or at least wait until everyone joins group 0). Therefore by
including this 'make the node a non-voter' step at the beginning of
`removenode`, we make it impossible to remove a token ring member in the
middle of the upgrade procedure, on which the test case relied. The test
case would get stuck waiting for the `removenode` operation to finish,
which would never finish because it would wait for the upgrade procedure
to finish, which would not finish because of the dead peer.
We remove the test case; it was "lucky" to pass in the first place. We
have a dedicated mechanism for handling dead peers during Raft upgrade
procedure: the manual Raft group 0 RECOVERY procedure. There are other
test cases in this file which are using that procedure.
leave_group0 and remove_from_group0 functions both start with the
following steps:
- if Raft is disabled or in RECOVERY mode, print a simple log message
and abort
- if Raft cluster feature flag is not yet enabled, print a complex log
message and abort
- wait for Raft upgrade procedure to finish
- then perform the actual group 0 reconfiguration.
Refactor these preparation steps to a separate function,
`wait_for_raft`. This reduces code duplication; the function will also
be used in more operations later (becoming a nonvoter or turning another
server into a nonvoter).
We also change the API so that the preparation function is called from
outside by the caller before they call the reconfiguration function.
This is because in later commits, some of the call sites (mainly
`removenode`) will want to check explicitly whether Raft is enabled and
wait for Raft's availability, then perform a sequence of steps related
to group 0 configuration depending on the result.
Also add a private function `raft_upgrade_complete()` which we use to
assert that Raft is ready to be used.
Currently, we create `forward_aggregates` inside a function that
returns the result of a future lambda that captures these aggregates
by reference. As a result, the aggregates may be destructed before
the lambda finishes, resulting in a heap use-after-free.
To prolong the lifetime of these aggregates, we cannot use a move
capture, because the lambda is wrapped in a with_thread_if_needed()
call on these aggregates. Instead, we fix this by wrapping the
entire return statement in a do_with().
Fixes#12528
Closes#12533
Currently they are upgraded during learn on a replica. There are two
problems with this. First, the column mapping may not exist on a replica
if it missed this particular schema (because it was down, for instance),
and the mapping history is not part of the schema. In this case "Failed
to look up column mapping for schema version" will be thrown. Second, the
LWT request coordinator may not have the schema for the mutation either
(because it was already freed from the registry), and when a replica
tries to retrieve the schema from the coordinator the retrieval will fail,
causing the whole request to fail with "Schema version XXXX not found".
Both of those problems can be fixed by upgrading stored mutations
during prepare on the node they are stored at. To upgrade the mutation its
column mapping is needed, and it is guaranteed to be present at the node
the mutation is stored at, since having the corresponding schema available
is a prerequisite for storing it. After that the mutation
is processed using the latest schema, which will be available on all nodes.
Fixes#10770
Message-Id: <Y7/ifraPJghCWTsq@scylladb.com>
LCS reshape compacts all levels if a single one breaks
disjointness. That's unnecessary work because rewriting that single
level is enough to restore disjointness. If multiple levels break
disjointness, each will be reshaped in its own iteration, reducing
the operation time and disk space requirement of each step,
as input files can be released incrementally.
Incremental compaction is not applied to reshape yet, so we need to
avoid a "major compaction", to avoid the space overhead.
But space overhead is not the only problem: the inefficiency of
reshaping everything when overlap is detected in a single level is
what motivated this patch.
Fixes#12495.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12496
Some functions defined by a template in types.cc are used in other
translation units (via `cql3/untyped_result_set.hh`), but aren't
explicitly instantiated. Therefore their linking can fail, depending
on inlining decisions. (I experienced this when playing with compiler
options).
Fix that.
Closes#12539
As requested by issue #5619, commit 2150c0f7a2
added a sanity check for USING TIMESTAMP - the number specified in the
timestamp must not be more than 3 days into the future (when viewed as
a number of microseconds since the epoch).
This sanity checking helps avoid some annoying client-side bugs and
mis-configurations, but some users genuinely want to use arbitrary
or futuristic-looking timestamps and are hindered by this sanity check
(which Cassandra doesn't have, by the way).
So in this patch we add a new configuration option, restrict_future_timestamp.
If set to "true", futuristic timestamps (more than 3 days into the future)
are forbidden. The "true" setting is the default (as has been the case
since #5619). Setting this option to "false" will allow using any 64-bit
integer as a timestamp, as is allowed in Cassandra (and was allowed in
Scylla prior to #5619).
The error message in the case where a futuristic timestamp is rejected
now mentions the configuration parameter that can be used to disable this
check (this, and the option's name "restrict_*", is similar to other
so-called "safe mode" options).
This patch also includes a test, which works in Scylla and Cassandra,
with either setting of restrict_future_timestamp, checking the right
thing in all these cases (the futuristic timestamp can either be written
and read, or can't be written). I used this test to manually verify that
the new option works, defaults to "true", and when set to "false" Scylla
behaves like Cassandra.
Fixes#12527
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12537
Cassandra refuses a request with more than one relation to the same
clustering column, for example
DELETE FROM tbl WHERE p = ? and c = ? AND c > ?
complains that
c cannot be restricted by more than one relation if it includes an Equal
But it produces different error messages for different operators and
even order.
Currently, Scylla doesn't consider such requests an error. Whether or
not we should be compatible with Cassandra here is discussed in
issue #12472. But as long as we do accept these queries, we should be
sure we do the right thing: "WHERE c = 1 AND c > 2" should match
nothing, "WHERE c = 1 AND c > 0" should match the matches of c = 1,
and so on. This patch adds a test to verify that these requests indeed
yield correct results. The test is scylla_only because, as explained
above, Cassandra doesn't support these requests at all.
Refs #12472
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12498
remove unnecessary bashism, so that this script can be interpreted
by a POSIX shell.
/bin/sh is specified in the shebang line. on debian derivatives,
/bin/sh is dash, which is POSIX compliant. but this script is
written in the bash dialect.
before this change, we could run into the following build failure
when building the tree on Debian:
[7/904] ./SCYLLA-VERSION-GEN
./SCYLLA-VERSION-GEN: 37: [[: not found
after this change, the build is able to proceed.
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
Closes#12530
The CQL binary protocol introduced "unset" values in version 4
of the protocol. Unset values can be bound to variables, which
cause certain CQL fragments to be skipped. For example, the
fragment `SET a = :var` will not change the value of `a` if `:var`
is bound to an unset value.
Unsets, however, are very limited in where they can appear. They
can only appear at the top-level of an expression, and any computation
done with them is invalid. For example, `SET list_column = [3, :var]`
is invalid if `:var` is bound to unset.
This causes the code to be littered with checks for unset, and there
are plenty of tests dedicated to catching unsets. However, a simpler
way is possible - prevent the infiltration of unsets at the point of
entry (when evaluating a bind variable expression), and introduce
guards to check for the few cases where unsets are allowed.
This is what this long patch does. It performs the following:
(general)
1. unset is removed from the possible values of cql3::raw_value and
cql3::raw_value_view.
(external->cql3)
2. query_options is fortified with a vector of booleans,
unset_bind_variable_vector, where each boolean corresponds to a bind
variable index and is true when it is unset.
3. To avoid churn, two compatibility structs are introduced:
cql3::raw_value{,_view}_vector_with_unset, which can be constructed
from a std::vector<raw_value{,_view/}>, which is what most callers
have. They can also be constructed with explicit unset vectors, for
the few cases they are needed.
(cql3->variables)
4. query_options::get_value_at() now throws if the requested bind variable
is unset. This replaces all the throwing checks in expression evaluation
and statement execution, which are removed.
5. A new query_options::is_unset() is added for the users that can tolerate
unset; though it is not used directly.
6. A new cql3::unset_operation_guard class guards against unsets. It accepts
an expression, and can be queried whether an unset is present. Two
conditions are checked: the expression must be a singleton bind
variable, and at runtime it must be bound to an unset value.
7. The modification_statement operations are split into two, via two
new subclasses of cql3::operation. cql3::operation_no_unset_support
ignores unsets completely. cql3::operation_skip_if_unset checks if
an operand is unset (luckily all operations have at most one operand that
tolerates unset) and applies unset_operation_guard to it.
8. The various sites that accept expressions or operations are modified
to check for should_skip_operation(). These are the loops around
operations in update_statement and delete_statement, and the checks
for unset in attributes (LIMIT and PER PARTITION LIMIT)
(tests)
9. Many unset tests are removed. It's now impossible to enter an
unset value into the expression evaluation machinery (there's
just no unset value), so it's impossible to test for it.
10. Other unset tests now have to be invoked via bind variables,
since there's no way to create an unset cql3::expr::constant.
11. Many tests have their exception message match strings relaxed.
Since unsets are now checked very early, we don't know the context
where they happen. It would be possible to reintroduce it (by adding
a format string parameter to cql3::unset_operation_guard), but it
seems not to be worth the effort. Usage of unsets is rare, and it is
explicit (at least with the Python driver, an unset cannot be
introduced by omission).
I tried as an alternative to wrap cql3::raw_value{,_view} (that doesn't
recognize unsets) with cql3::maybe_unset_value (that does), but that
caused huge amounts of churn, so I abandoned that in favor of the
current approach.
Closes#12517
Allow replacing a node given its Host ID rather than its ip address.
This series adds a replace_node_first_boot option to db/config
and makes use of it in storage_service.
The new option takes priority over the legacy replace_address* options.
When the latter are used, a deprecation warning is printed.
Documentation updated respectively.
And a cql unit_test is added.
Ref #12277Closes#12316
* github.com:scylladb/scylladb:
docs: document the new replace_node_first_boot option
dist/docker: support --replace-node-first-boot
db: config: describe replace_address* options as deprecated
test: test_topology: test replace using host_id
test: pylib: ServerInfo: add host_id
storage_service: get rid of get_replace_address
storage_service: is_replacing: rely directly on config options
storage_service: pass replacement_info to run_replace_ops
storage_service: pass replacement_info to booststrap
storage_service: join_token_ring: reuse replacement_info.address
storage_service: replacement_info: add replace address
init: do not allow cfg.replace_node_first_boot of seed node
db: config: add replace_node_first_boot option
`forward_request` verb carried information about timeouts using
`lowres_clock::time_point` (that came from local steady clock
`seastar::lowres_clock`). The time point was produced on one node and
later compared against other node `lowres_clock`. That behavior
was wrong (`lowres_clock::time_point`s produced with different
`lowres_clock`s cannot be compared) and could lead to delayed or
premature timeout.
To fix this issue, `lowres_clock::time_point` was replaced with
`lowres_system_clock::time_point` in `forward_request` verb.
Representation to which both time point types serialize is the same
(64-bit integer denoting the count of elapsed nanoseconds), so it was
possible to do an in-place switch of those types using logic suggested
by @avikivity:
- using steady_clock is just broken, so we aren't taking anything
from users by breaking it further
- once all nodes are upgraded, it magically starts to work
Closes#12529
The PR adds an API call that gets the statuses of a given
task and all its descendants.
The parent-child tree is traversed in BFS order and the list of
statuses is returned to the user.
Closes#12317
* github.com:scylladb/scylladb:
test: add test checking recursive task status
api: get task statuses recursively
api: change retrieve_status signature
The replace_address options are still supported
But mention in their description that they are now deprecated
and the user should use replace_node_first_boot instead.
While at it fix a typo in ignore_dead_nodes_for_replace
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Add test cases exercising the --replace-node-first-boot option
by replacing nodes using their host_id rather
than ip address.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Populate replacement_info.address in prepare_replacement_info
as a first step towards getting rid of get_replace_address().
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
For replacing a node given its (now unique) Host ID.
The existing options for replace_address*
will be deprecated in the following patches
and eventually we will stop supporting them.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rename `system.raft_config` to `system.raft_snapshot_config` to make it clearer
what the table stores.
Remove the `my_server_id` partition key column from
`system.raft_snapshot_config` and a corresponding column from
`system.raft_snapshots` which would store the Raft server ID of the local node.
It's unnecessary: all servers running on a given node in different groups will
use the same ID - the Raft ID of the node, which is equal to its Host ID. There
will never be multiple servers of a single Raft group running on the same node.
Closes#12513
* github.com:scylladb/scylladb:
db: system_keyspace: remove (my_)server_id column from RAFT_SNAPSHOTS and RAFT_SNAPSHOT_CONFIG
db: system_keyspace: rename 'raft_config' to 'raft_snapshot_config'
Leave the guide for manual opening in, though; the script might not work
in all cases.
Also update the version example; we changed what development versions
look like.
Closes#12511
Make it clear that the table stores the snapshot configuration, which is
not necessarily the currently operating configuration (the last one
appended to the log).
In the future we plan to have a separate virtual table for showing the
currently operating configuration, perhaps we will call it
`system.raft_config`.
The planned integration of cross-module optimizations in scylladb/scylladb-enterprise requires several changes to `configure.py`. To minimize the divergence between the `configure.py`s of both repositories, this series upstreams some of these changes to scylladb/scylladb.
The changes mostly remove dead code and fix some traps for the unaware.
Closes#12431
* github.com:scylladb/scylladb:
configure.py: prevent deduplication of seastar compile options
configure.py: rename clang_inline_threshold()
configure.py: rework the seastar_cflags variable
configure.py: hoist the pkg_config() call for seastar-testing.pc
configure.py: unify the libs variable for tests and non-tests
configure.py: fix indentation
configure.py: remove a stale code path for .a artifacts
Currently, we call cargo build every time we build scylla, even
when no rust files have been changed.
This is avoided by adding a depfile to the ninja rule for the rust
library.
The depfile is generated by default during cargo build,
but it uses the full paths of all dependencies that it includes,
while we use relative paths. This is fixed by specifying
CARGO_BUILD_DEP_INFO_BASEDIR='.', which makes it so the current
path is subtracted from all generated paths.
Instead of using 'always' when specifying when to run the cargo
build, a dependency on Cargo.lock is added additionally to the
depfile. As a result, the rust files are recompiled not only
when the source files included in the depfile are modified,
but also when some rust dependency is updated.
Cargo may produce an old cached file as the result of the build even
when the Cargo.lock was recently updated. Because of that, the
build result may be older than the Cargo.lock file even
if the build was just performed. This may cause ninja to rebuild
the file every following time. To avoid this, we 'touch' the
build result so that its last modification time is up to date.
Because the dependency on Cargo.lock was added, the new command
for the build does not modify it. Instead, the developer must
update it when modifying the dependencies - the docs are updated
to reflect that.
Closes#12489
Fixes#12508
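The scheme above can be sketched as a ninja rule (a hypothetical sketch; the paths, rule name, and output name are illustrative, not the actual build.ninja contents):

```ninja
# cargo emits a dep-info file with paths relative to '.', ninja re-runs the
# rule when Cargo.lock or any source listed in the depfile changes, and
# 'touch' keeps the output newer than Cargo.lock so ninja doesn't loop.
rule cargo_build
  command = CARGO_BUILD_DEP_INFO_BASEDIR=. cargo build --locked && touch $out
  depfile = $out.d
  description = CARGO $out

build rust/target/release/librust_combined.a: cargo_build | rust/Cargo.lock
```

With `--locked`, the build fails rather than silently rewriting Cargo.lock, matching the requirement that the developer updates it explicitly.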
In its infinite wisdom, CMake deduplicates the options passed
to `target_compile_options`, making it impossible to pass options which require
duplication, such as -mllvm.
Passing e.g.
`-mllvm;-pgso=false;-mllvm;-inline-threshold=2500` invokes the compiler with
`-mllvm -pgso=false -inline-threshold=2500`, breaking the options.
As a workaround, CMake added the `SHELL:` syntax, which makes it possible to
pass the list of options not as a CMake list, but as a shell-quoted string.
Let's use it, so we can pass multiple -mllvm options.
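For illustration (the target name here is hypothetical), the `SHELL:` prefix keeps the paired tokens together:

```cmake
# Without SHELL:, CMake deduplicates the repeated -mllvm tokens and the
# compiler receives "-mllvm -pgso=false -inline-threshold=2500".
# With SHELL:, each string is shell-tokenized and passed through intact.
target_compile_options(scylla_objects PRIVATE
  "SHELL:-mllvm -pgso=false"
  "SHELL:-mllvm -inline-threshold=2500")
```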
The name of this variable is misleading. What it really does is pass flags to
static libraries compiled by us, not just to seastar.
We will need this capability to implement cross-artifact optimizations in our
build.
We will also need to pass linker flags, and we will need to vary those flags
depending on the build mode.
This patch splits the seastar_cflags variable into per-mode lib_cflags and
lib_ldflags variables. It shouldn't change the resulting build.ninja for now,
but will be needed by later planned patches.
Put the pkg_config() for seastar-testing.pc in the same area as the call
for seastar.pc, outside of the loop.
This is a cosmetic change aimed at making following commits cleaner.
Scylla hasn't had `.a` artifacts for a long time (since the Urchin days,
I believe), and the piece of code responsible for them is stale and untested.
Remove it.
In 424dbf43f ("transport: drop cql protocol versions 1 and 2"),
we dropped support for protocols 1 and 2, but some code remains
that checks for those versions. It is now dead code, so remove it.
Closes#12497
Our handling of NULLs in expressions is different from Cassandra's,
and more uniform. For example, the filter "WHERE x = NULL" is an
error in Cassandra, but supported in Scylla. Let's explain how and why.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12494
Currently, no rule enforces that the cxx.h rust header
is generated before compiling the .cc files generated
from rust. This patch adds this dependency.
Closes#12492
Make it easy to see which clusters are operated on by which tests in which build modes and so on.
Add some additional logs.
These improvements would have saved me a lot of debugging time if I had them last week, and we would have resolved https://github.com/scylladb/scylladb/pull/12482 much faster.
Closes#12483
* github.com:scylladb/scylladb:
test.py: harmonize topology logs with test.py format
test/pylib: additional logging during cluster setup
test/pylib: prefix cluster/manager logs with the current test name
test/pylib: pool: pass *args and **kwargs to the build function from get()
test.py: include mode in ScyllaClusterManager logs
Sometimes to debug some task manager module, we may want to inspect
the whole tree of descendants of some task.
To make it easier, an api call getting a list of statuses of the requested
task and all its descendants in BFS order is added.
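A minimal sketch of the traversal in Python (the data model here is hypothetical; the real implementation lives in the C++ task manager API):

```python
from collections import deque

def get_statuses_recursively(root, children, status_of):
    """Collect the status of `root` and all its descendants in BFS order."""
    statuses, queue = [], deque([root])
    while queue:
        task = queue.popleft()
        statuses.append(status_of[task])
        queue.extend(children.get(task, ()))
    return statuses

# Hypothetical tree: repair -> [shard0, shard1], shard0 -> [range0]
children = {"repair": ["shard0", "shard1"], "shard0": ["range0"]}
status_of = {"repair": "running", "shard0": "running",
             "shard1": "done", "range0": "done"}
# BFS order: the requested task first, then its descendants level by level.
assert get_statuses_recursively("repair", children, status_of) == [
    "running", "running", "done", "done"]
```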
We need millisecond resolution in the log to be able to
correlate test log with test.py log and scylla logs. Harmonize
the log format for tests which actively manage scylla servers.
The log file produced by test.py combines logs coming from multiple
concurrent test runs. Each test has its own log file as well, but this
"global" log file is useful when debugging problems with topology tests,
since many events related to managing clusters are stored there.
Make the logs easier to read by including information about the test case
that's currently performing operations such as adding new servers to
clusters and so on. This includes the mode, test run name and the name
of the test case.
We do this by using custom `Logger` objects (instead of calling
`logging.info` etc. which uses the root logger) with `LoggerAdapter`s
that include the prefixes. A bit of boilerplate 'plumbing' through
function parameters is required but it's mostly straightforward.
This doesn't apply to all events, e.g. boost test cases which don't
set up a "real" Scylla cluster. These events don't have additional
prefixes.
Example:
```
17:41:43.531 INFO> [dev/topology.test_topology.1] Cluster ScyllaCluster(name: 7a414ffc-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(1, 127.40.246.1, 29c4ec73-8912-45ca-ae19-8bfda701a6b5), ScyllaServer(4, 127.40.246.4, 75ae2afe-ff9b-4760-9e19-cd0ed8d052e7), ScyllaServer(7, 127.40.246.7, 67a27df4-be63-4b4c-a70c-aeac0506304f), stopped: ) adding server...
17:41:43.531 INFO> [dev/topology.test_topology.1] installing Scylla server in /home/kbraun/dev/scylladb/testlog/dev/scylla-10...
17:41:43.603 INFO> [dev/topology.test_topology.1] starting server at host 127.40.246.10 in scylla-10...
17:41:43.614 INFO> [dev/topology.test_topology.2] Cluster ScyllaCluster(name: 7a497fce-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(2, 127.40.246.2, f59d3b1d-efbb-4657-b6d5-3fa9e9ef786e), ScyllaServer(5, 127.40.246.5, 9da16633-ce53-4d32-8687-e6b4d27e71eb), ScyllaServer(9, 127.40.246.9, e60c69cd-212d-413b-8678-dfd476d7faf5), stopped: ) adding server...
17:41:43.614 INFO> [dev/topology.test_topology.2] installing Scylla server in /home/kbraun/dev/scylladb/testlog/dev/scylla-11...
17:41:43.670 INFO> [dev/topology.test_topology.2] starting server at host 127.40.246.11 in scylla-11...
```
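The `LoggerAdapter` approach can be sketched as follows (class and prefix names are illustrative, not the actual test/pylib code):

```python
import logging

class PrefixAdapter(logging.LoggerAdapter):
    """Prepend a '[mode/test.run]' prefix to every message, as described above."""
    def process(self, msg, kwargs):
        return f"[{self.extra['prefix']}] {msg}", kwargs

base = logging.getLogger("cluster")
log = PrefixAdapter(base, {"prefix": "dev/topology.test_topology.1"})
# log.info("adding server...") would emit a record whose message reads:
#   [dev/topology.test_topology.1] adding server...
```

Passing such adapters through function parameters (instead of calling `logging.info` on the root logger) is the "plumbing" the message refers to.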
The batch constructor uses an unnecessarily complicated template,
where in fact it only accepts vector<vector<raw_value | raw_value_view>>.
Simplify the constructor to allow exactly that. Delete some confusing
comments around it.
Closes#12488
This will be used to specify a custom logger when building new clusters
before starting tests, making it easy to pinpoint which tests are
waiting for clusters to be built and what's happening to these
particular clusters.
The logs often mention the test run and the current test case in a given
run, such as `test_topology.1` and
`test_topology.1::test_add_server_add_column`. However, if we run
test.py in multiple modes, the different modes might be running the same
test case and the logs become confusing. To disambiguate, prefix the
test run/case names with the mode name.
Example:
```
Leasing Scylla cluster ScyllaCluster(name: 7a414ffc-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(1, 127.40.246.1, 29c4ec73-8912-45ca-ae19-8bfda701a6b5), ScyllaServer(4, 127.40.246.4, 75ae2afe-ff9b-4
760-9e19-cd0ed8d052e7), ScyllaServer(7, 127.40.246.7, 67a27df4-be63-4b4c-a70c-aeac0506304f), stopped: ) for test dev/topology.test_topology.1::test_add_server_add_column
```
Currently, UDAs can't be reused if Scylla has been
restarted since they were created. This is
caused by the missing initialization of saved
UDAs, which should have inserted them into the
cql3::functions::functions::_declared map that
should store all (user-)created functions and
aggregates.
This patch adds the missing implementation in a way
that's analogous to the method of inserting UDFs into
the _declared map.
Fixes#11309
Using lambda coroutines as arguments can lead to a use-after-free.
Currently, the way these lambdas were used in do_parse_schema_tables
did not lead to such a problem, but it's better to be safe and wrap
them in coroutine::lambda(), so that they can't lead to this problem
as long as we ensure that the lambda finishes in the
do_parse_schema_tables() statement (for example using co_await).
Closes#12487
This patch adds a few simple functional tests for the SELECT DISTINCT
feature and how it interacts with other features, especially GROUP BY.
2 of the 5 new tests are marked xfail, and reproduce one old and one
newly-discovered issue:
Refs #5361: LIMIT doesn't work when using GROUP BY (the test here uses
LIMIT and GROUP BY together with SELECT DISTINCT, so the
LIMIT isn't honored).
Refs #12479: SELECT DISTINCT doesn't refuse GROUP BY with clustering
column.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12480
In test_apply_is_atomic, a basic form of exception testing is used.
There is failure_injecting_allocation_strategy, which however is not
used for any allocation, since for some reason,
`with_allocator(r.allocator()` is used instead of
`with_allocator(alloc`. Fix that.
Closes#12354
Regression introduced in 23e4c8315.
The view_and_holder returned by position_in_partition::after_key() triggers
undefined behavior when the key was not full, because the holder is moved,
which invalidates the view.
Fixes#12367
Closes#12447
The function underlying_type() returns a data_type by value,
but the code assigned it to a reference.
At first I was sure this was an error
(assigning a temporary value to a reference), but it turns out
that this is most likely correct due to C++ lifetime
extension rules.
I think it's better to avoid such unintuitive tricks.
Assigning to a value makes it clearer that the code
is correct and there are no dangling references.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Closes#12485
With sufficiently many test cases we would eventually run out of IP
addresses, because IPs (which are leased from a global host registry)
would only be released at the end of an entire test suite.
In fact we already hit this during next promotions, causing much pain
indeed.
Release IPs when a cluster, after being marked dirty, is stopped and
thrown away.
Closes#12482
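The leasing scheme can be sketched like this (a hypothetical minimal registry, not the actual test/pylib code): the point of the fix is that an IP returns to the free pool when a dirty cluster is stopped, not at suite teardown.

```python
class HostRegistry:
    """Minimal sketch of a global IP registry with lease/release semantics."""
    def __init__(self, ips):
        self._free = list(ips)
        self._leased = set()

    def lease(self):
        ip = self._free.pop()
        self._leased.add(ip)
        return ip

    def release(self, ip):
        # Called when a dirty cluster is stopped and thrown away, so the IP
        # can be reused immediately instead of being held until the end of
        # the entire test suite.
        self._leased.discard(ip)
        self._free.append(ip)

registry = HostRegistry([f"127.40.246.{i}" for i in range(1, 4)])
ip = registry.lease()
registry.release(ip)  # the stopped cluster's IP is immediately reusable
assert ip in registry._free
```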
Introduce a new "script" operation, which loads a script from the specified path, then feeds the mutation fragment stream to it. The script can then extract, process and present information from the sstable as it wishes.
For now only Lua scripts are supported for the simple reason that Lua is easy to write bindings for, it is simple and lightweight and more importantly we already have Lua included in the Scylla binary as it is used as the implementation language for UDF/UDA. We might consider WASM support in the future, but for now we don't have any language support in WASM available.
Example:
```lua
function new_stats(key)
return {
partition_key = key,
total = 0,
partition = 0,
static_row = 0,
clustering_row = 0,
range_tombstone_change = 0,
};
end
total_stats = new_stats(nil);
function inc_stat(stats, field)
stats[field] = stats[field] + 1;
stats.total = stats.total + 1;
total_stats[field] = total_stats[field] + 1;
total_stats.total = total_stats.total + 1;
end
function on_new_sstable(sst)
max_partition_stats = new_stats(nil);
if sst then
current_sst_filename = sst.filename;
else
current_sst_filename = nil;
end
end
function consume_partition_start(ps)
current_partition_stats = new_stats(ps.key);
inc_stat(current_partition_stats, "partition");
end
function consume_static_row(sr)
inc_stat(current_partition_stats, "static_row");
end
function consume_clustering_row(cr)
inc_stat(current_partition_stats, "clustering_row");
end
function consume_range_tombstone_change(crt)
inc_stat(current_partition_stats, "range_tombstone_change");
end
function consume_partition_end()
if current_partition_stats.total > max_partition_stats.total then
max_partition_stats = current_partition_stats;
end
end
function on_end_of_sstable()
if current_sst_filename then
print(string.format("Stats for sstable %s:", current_sst_filename));
else
print("Stats for stream:");
end
print(string.format("\t%d fragments in %d partitions - %d static rows, %d clustering rows and %d range tombstone changes",
total_stats.total,
total_stats.partition,
total_stats.static_row,
total_stats.clustering_row,
total_stats.range_tombstone_change));
print(string.format("\tPartition with max number of fragments (%d): %s - %d static rows, %d clustering rows and %d range tombstone changes",
max_partition_stats.total,
max_partition_stats.partition_key,
max_partition_stats.static_row,
max_partition_stats.clustering_row,
max_partition_stats.range_tombstone_change));
end
```
Running this script will yield the following:
```
$ scylla sstable script --script-file fragment-stats.lua --system-schema system_schema.columns /var/lib/scylla/data/system_schema/columns-24101c25a2ae3af787c1b40ee1aca33f/me-1-big-Data.db
Stats for sstable /var/lib/scylla/data/system_schema/columns-24101c25a2ae3af787c1b40ee1aca33f//me-1-big-Data.db:
397 fragments in 7 partitions - 0 static rows, 362 clustering rows and 28 range tombstone changes
Partition with max number of fragments (180): system - 0 static rows, 179 clustering rows and 0 range tombstone changes
```
Fixes: https://github.com/scylladb/scylladb/issues/9679
Closes#11649
* github.com:scylladb/scylladb:
tools/scylla-sstable: consume_reader(): improve pause heuristics
test/cql-pytest/test_tools.py: add test for scylla-sstable script
tools: add scylla-sstable-scripts directory
tools/scylla-sstable: remove custom operation
tools/scylla-sstable: add script operation
tools/sstable: introduce the Lua sstable consumer
dht/i_partitioner.hh: ring_position_ext: add weight() accessor
lang/lua: export Scylla <-> lua type conversion methods
lang/lua: use correct lib name for string lib
lang/lua: fix typo in aligned_used_data (meant to be user_data)
lang/lua: use lua_State* in Scylla type <-> Lua type conversions
tools/sstable_consumer: more consistent method naming
tools/scylla-sstable: extract sstable_consumer interface into own header
tools/json_writer: add accessor to underlying writer
tools/scylla-sstable: fix indentation
tools/scylla-sstable: export mutation_fragment_json_writer declaration
tools/scylla-sstable: mutation_fragment_json_writer un-implement sstable_consumer
tools/scylla-sstable: extract json writing logic from json_dumper
tools/scylla-sstable: extract json_writer into its own header
tools/scylla-sstable: use json_writer::DataKey() to write all keys
tools/scylla-types: fix use-after-free on main lambda captures
Inferring the shard from the generation is long gone. We still use it in
some scripts, but it's no longer needed in Scylla when loading
the SSTables, and it also conflicts with the ongoing work on UUID-based
generations.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12476
The CQL binary protocol version 3 was introduced in 2014. All Scylla
versions support it, as do Cassandra versions 2.1 and newer.
Versions 1 and 2 have 16-bit collection sizes, while protocol 3 and newer
use 32-bit collection sizes.
Unfortunately, we implemented support for multiple serialization formats
very intrusively, by pushing the format everywhere. This avoids the need
to re-serialize (sometimes) but is quite obnoxious. It's also likely to be
broken, since it's almost untested and it's too easy to write
cql_serialization_format::internal() instead of propagating the client
specified value.
Since protocols 1 and 2 have been obsolete for 9 years, just drop them. It's
easy to verify that they are no longer in use on a running system by
examining the `system.clients` table before upgrade.
Fixes#10607
Closes#12432
* github.com:scylladb/scylladb:
treewide: drop cql_serialization_format
cql: modification_statement: drop protocol check for LWT
transport: drop cql protocol versions 1 and 2
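The size-width difference between the dropped and the kept formats can be illustrated like this (a hedged sketch; the function names are made up, not Scylla's serializer API):

```python
import struct

def encode_list_v2(items):
    # Protocol v1/v2: 16-bit big-endian element count and element lengths.
    out = struct.pack(">H", len(items))
    for item in items:
        out += struct.pack(">H", len(item)) + item
    return out

def encode_list_v3(items):
    # Protocol v3+: 32-bit big-endian element count and element lengths,
    # lifting the 65535-element / 65535-byte limits of the old format.
    out = struct.pack(">I", len(items))
    for item in items:
        out += struct.pack(">I", len(item)) + item
    return out

assert encode_list_v2([b"ab"]) == b"\x00\x01\x00\x02ab"
assert encode_list_v3([b"ab"]) == b"\x00\x00\x00\x01\x00\x00\x00\x02ab"
```

Supporting both widths is what forced a serialization-format parameter through the whole codebase; dropping v1/v2 lets it go away.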
The consume loop had some heuristics in place to determine whether, after
pausing, the consumer wishes to skip just the partition or the remaining
content of the sstable. This heuristic was flawed, so replace it with a
non-heuristic method: track the last consumed fragment and look at it
to determine what should be done.
To test the script operation, we use some of the example scripts from
the example directory. Namely, dump.lua and slice.lua. These two scripts
together have very good coverage of the entire script API. Testing
their functionality therefore also provides good coverage of the Lua
bindings. A further advantage is that since both scripts dump output in
identical format to that of the data-dump operation, it is trivial to do
a comparison against this already tested operation.
A targeted test is written for the sstable skip functionality of the
consumer API.
To be the home of example scripts for scylla-sstable. For now only a
README.md is added describing the directory's purpose and with links to
useful resources.
One example script is added in this patch, more will come later.
Loads the script from the specified path, then feeds the mutation
fragment stream to it. For now only Lua scripts are supported for the
simple reason that Lua is easy to write bindings for, it is simple and
lightweight and more importantly we already have Lua included in the
Scylla binary as it is used as the implementation language for UDF/UDA.
We might consider WASM support in the future, but for now we don't have
any language support in WASM available.
The Lua sstable consumer loads a script from the specified path then
feeds the mutation fragment stream to the script via the
sstable_consumer methods, each method of which the script is allowed to
define, effectively overloading the virtual method in Lua.
This allows for very wide and flexible customization opportunities for
what to extract from sstables and how to process and present them,
without the need to recompile the scylla-sstable tool.
Instead of the lua_slice_state which is local to this file. We want to
reuse the Scylla type <-> Lua type conversion functions but for that
they have to use the more generic lua_State*. No functionality or
convenience is lost with the switch, the code didn't make use of the
other fields bundled in lua_slice_state.
So it can be used in code outside scylla-sstable.cc. This source file is
quite large already, and as we have yet another large chunk of code to
add, we want to add it in a separate file.
There is no point in the former implementing said interface. For one, it
is a futurized interface, which is not needed for something writing to
stdout. Rename the methods to follow the naming convention of rjson
writers more closely.
We want to split this class into two parts: one with the actual logic
converting mutation fragments to json, and a wrapper over this one,
which implements the sstable_consumer interface.
As a first step we extract the class as is (no changes) and just forward
all calls from the now-empty wrapper to it.
This method was renamed from its previous name of PartitionKey. Since in
json partition keys and clustering keys look alike, with the only
difference being that the former may also have a token, it makes sense
to have a single method to write them (with an optional token parameter).
This was the case at some point, with json_dumper::write_key() taking this
role. However, at a later point json_writer::PartitionKey() was introduced
and now the code uses both. Standardize on the latter and give it a more
generic name.
The main lambda of scylla-types, the one passed to app_template::run(),
was recently made a coroutine. app_template::run() however doesn't keep
this lambda alive, and hence after the first suspension point, accessing
the lambda's captures triggers a use-after-free.
The simple fix is to convert the coroutine into a continuation chain.
Consider the following MVCC state of a partition:
```
v2: ==== <7> [entry2] ==== <9> ===== <last dummy>
v1: ================================ <last dummy> [entry1]
```
Where === means a continuous range and --- means a discontinuous range.
After two LRU items are evicted (entry1 and entry2), we will end up with:
```
v2: ---------------------- <9> ===== <last dummy>
v1: ================================ <last dummy> [entry1]
```
This will cause readers to incorrectly think there are no rows before
entry <9>, because the range is continuous in v1, and continuity of a
snapshot is a union of continuous intervals in all versions. The
cursor will see the interval before <9> as continuous and the reader
will produce no rows.
This is only temporary, because current MVCC merging rules are such
that the flag on the latest entry wins, so we'll end up with this once
v1 is no longer needed:
```
v2: ---------------------- <9> ===== <last dummy>
```
...and the reader will go to sstables to fetch the evicted rows before
entry <9>, as expected.
The bug is in rows_entry::on_evicted(), which treats the last dummy
entry in a special way, and doesn't evict it, and doesn't clear the
continuity by omission.
The situation is not easy to trigger because it requires certain
eviction pattern concurrent with multiple reads of the same partition
in different versions, so across memtable flushes.
Closes#12452
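The continuity rule described above (snapshot continuity is the union of the continuous intervals of all versions) can be sketched as follows (a hypothetical helper, not the MVCC code itself):

```python
def snapshot_is_continuous(versions, point):
    # A position is continuous in the snapshot if it falls into a continuous
    # interval in ANY version; readers then trust memory and skip sstables.
    return any(lo <= point < hi
               for intervals in versions
               for (lo, hi) in intervals)

# The buggy state from above: v2 lost its rows before key 9 to eviction but
# kept the range marked continuous by omission, while v1 is fully continuous.
v2 = [(9, 100)]   # should be discontinuous before <9> after eviction
v1 = [(0, 100)]   # fully continuous
# Union over both versions: position 5 wrongly reads as continuous, so the
# reader produces no rows instead of going to sstables.
assert snapshot_is_continuous([v2, v1], 5)
# With v1 gone (or continuity correctly cleared), the reader fetches from disk.
assert not snapshot_is_continuous([v2], 5)
```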
Sstable read related metrics have been broken for a long time now. First, the introduction of inactive reads (https://github.com/scylladb/scylladb/issues/1865) diluted this metric, as it then also contained inactive reads (contrary to the metric's name). Then, after moving the semaphore in front of the cache (3d816b7c1), the metric became completely broken, as it now contains all kinds of reads: disk, in-memory and inactive ones too.
This series aims to remedy this:
* `scylla_database_active_reads` is fixed to only include active reads.
* `scylla_database_active_reads_memory_consumption` is renamed to `scylla_database_reads_memory_consumption` and its description is brought up-to-date.
* `scylla_database_disk_reads` is added to track current reads that go to disk.
* `scylla_database_sstables_read` is added to track the number of sstables currently being read.
Fixes: https://github.com/scylladb/scylladb/issues/10065
Closes#12437
* github.com:scylladb/scylladb:
replica/database: add disk_reads and sstables_read metrics
sstables: wire in the reader_permit's sstable read count tracking
reader_concurrency_semaphore: add disk_reads and sstables_read stats
replica/database: fix active_reads_memory_consumption_metric
replica/database: fix active_reads metric
The Azure metadata API may sometimes return an empty zone. If that happens,
shard 0 gets an empty string as its rack, but propagates UNKNOWN_RACK to
other shards.
An empty zone response should be handled the same way regardless.
refs: #12185
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12274
Cassandra refuses a request with more than one equality relation to the
same column, for example
DELETE FROM tbl WHERE partitionKey = ? AND partitionKey = ?
It complains that
partitionkey cannot be restricted by more than one relation if it
includes an Equal
Currently, Scylla doesn't consider such requests an error. Whether or
not we should be compatible with Cassandra here is discussed in
issue #12472. But as long as we do accept this query, we should be
sure we do the right thing: "WHERE p = 1 AND p = 2" should match
nothing (not the first or last value being tested), and "WHERE p = 1
AND p = 1" should match the matches of p = 1. This patch adds a test
to verify that these requests indeed yield correct results. The
test is scylla_only because, as explained above, Cassandra doesn't
support this feature at all.
Refs #12472
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12473
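The intended semantics can be sketched as set intersection (a hypothetical helper for illustration, not Scylla's restrictions code):

```python
def rows_matching_eq_restrictions(rows, column, values):
    # Multiple equality restrictions on one column intersect: a row matches
    # only if its value equals every restricted value. "p = 1 AND p = 2"
    # therefore matches nothing; "p = 1 AND p = 1" behaves like "p = 1".
    return [r for r in rows if all(r[column] == v for v in values)]

rows = [{"p": 1}, {"p": 2}]
assert rows_matching_eq_restrictions(rows, "p", [1, 2]) == []
assert rows_matching_eq_restrictions(rows, "p", [1, 1]) == [{"p": 1}]
```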
* seastar ca586cfb8d...8889cbc198 (14):
> http: request_parser: fix grammar ambiguity in field_content
Fixes#12468
> sstring: use fold expression to simplify copy_str_to()
> sstring: use fold expression to simplify str_len()
> metrics: capture by move in make_function()
> metrics: replace homebrew is_callable<> with is_invocable_v<>
> reactor: use std::move() to avoid copy.
> reactor: remove redundant semicolon.
> reactor: use mutable to make std::move() work.
> build: install liburing explicitly on ArchLinux.
> reactor: use a for loop for submitting ios
> metrics: add spaces around '='
> parallel utils: align concept with implementation
> reactor: s/resize(0)/clear()/
> reactor: fix a typo in comment
Closes#12469
This commit removes consume_in_reverse::legacy_half_reverse, an option
once used to indicate that the given key ranges are sorted descending,
based on the clustering key of the start of the range, and that the
range tombstones inside a partition would be sorted (descending, as all
the mutation fragments would be) according to their end (but range
tombstones would still be stored according to their start bound).
As it turns out, mutation::consume, when called with the legacy_half_reverse
option, produces an invalid fragment stream, one where all the range
tombstone changes come after all the clustering rows. This was not an
issue, since when constructing results from the query, Scylla would not
pass the tombstones to the client, but would instead compact the data beforehand.
In this commit, the consume_in_reverse::legacy_half_reverse is removed,
along with all the uses.
As for the swap out in mutation_partition.cc in query_mutation and
to_data_query_result:
The downstream was not prepared to deal with legacy_half_reverse.
mutation::consume contains
```
if (reverse == consume_in_reverse::yes) {
while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::yes>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
co_await yield();
}
} else {
while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::no>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
co_await yield();
}
}
```
So why did it work at all? to_data_query_result deals with a single slice.
The used consumer (compact_for_query_v2) compacts-away the range tombstone
changes, and thus the only difference between the consume_in_reverse::no
and consume_in_reverse::yes was that one was ordered increasing wrt. ckeys
and the second one was ordered decreasing. This property is maintained if
we swap out for the consume_in_reverse::yes format.
Refs: #12353
Closes#12453
* github.com:scylladb/scylladb:
mutation{,_consumer,_partition}: remove consume_in_reverse::legacy_half_reverse
mutation_partition_view: treat query::partition_slice::option::reversed in to_data_query_result as consume_in_reverse::yes
mutation: move consume_in_reverse def to mutation_consumer.hh
Convert decompressed temporary buffers into tracked buffers just before
returning them to the upper layer. This ensures these buffers are known
to the reader concurrency semaphore and it has an accurate view of the
actual memory consumption of reads.
Fixes: #12448
Closes#12454
The test would use a trick to start a separate Scylla cluster from the
one provided originally by the test framework. This is not supported by
the test framework and may cause unexpected problems.
Change the test to perform regular node operations. Instead of starting
a fresh cluster of 3 nodes, we join the first of these nodes to the
original framework-provided cluster, then decommission the original
nodes, then bootstrap the other 2 fresh nodes.
Also add some logging to the test.
Refs: #12438, #12442Closes#12457
This series adds the implementation and usage of rust wasmtime bindings.
The WASM UDFs introduced by this patch are interruptible and use memory allocated with the seastar allocator.
This series includes #11102 (the first two commits) because #11102 required disabling wasm UDFs completely. This patch disables them in the middle of the series, and enables them again at the end.
After this patch, `libwasmtime.a` can be removed from the toolchain.
This patch also removes the workaround for https://github.com/scylladb/scylladb/issues/9387, but it hasn't been tested with ARM yet - if the ARM test causes issues I'll revert this part of the change.
Closes#11351
* github.com:scylladb/scylladb:
build: remove references to unused c bindings of wasmtime
test: assert that WASM allocations can fail without crashing
wasm: limit memory allocated using mmap
wasm: add configuration options for instance cache and udf execution
test: check that wasmtime functions yield
wasm: use the new rust bindings of wasmtime
rust: add Wasmtime bindings
rust: add build profiles more aligned with ninja modes
rust: adjust build according to cxxbridge's recommendations
tools: toolchain: dbuild: prepare for sharing cargo cache
In issue #3668, a discussion spanning several years theorized that several
things are wrong with the "timestamp" type. This patch begins by adding
several tests that demonstrate that Scylla is in fact behaving correctly,
and mostly identically to Cassandra except one esoteric error handling
case.
However, after eliminating the red herrings, we are left with the real
issue that prompted opening #3668, which is a duplicate of issues #2693
and #2694, and this patch also adds a reproducer for that. The issue is
that Cassandra 4 added support for arithmetic expressions on values,
so durations can be added to and subtracted from timestamps; for example:
'2011-02-03 04:05:12.345+0000' - 1d
So the new test - which passes on Cassandra 4 and fails on Scylla
(or Cassandra 3) is marked xfail.
Refs #2693
Refs #2694
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12436
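The semantics the new test expects can be sketched with Python datetimes (illustrative only; CQL durations also have month and nanosecond components that a plain `timedelta` cannot express):

```python
from datetime import datetime, timedelta, timezone

# '2011-02-03 04:05:12.345+0000' - 1d, evaluated the way Cassandra 4 does:
# subtracting a duration from a timestamp yields another timestamp.
ts = datetime(2011, 2, 3, 4, 5, 12, 345000, tzinfo=timezone.utc)
result = ts - timedelta(days=1)
assert result == datetime(2011, 2, 2, 4, 5, 12, 345000, tzinfo=timezone.utc)
```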
The line modified in this patch was supposed to increase the
optimization levels of parsers in debug mode to 1, because they
were too slow otherwise. But as a side effect, it also reduced the
optimization level in release mode to 1. This is not a problem
for the CQL frontend, because statement preparation is not
performance-sensitive, but it is a serious performance problem
for Alternator, where it lies in the hot path.
Fix this by only applying the -O1 to debug modes.
Fixes#12463
Closes#12460
Before the changes introducing the new wasmtime bindings, we relied
on a downloaded static library, libwasmtime.a. Now that the bindings
are introduced, we no longer rely on it, so all references to
it can be removed.
The main source of big allocations in the WASM UDF implementation
is the WASM Linear Memory. We do not want Scylla to crash even if
a memory allocation for the WASM Memory fails, so we assert that
an exception is thrown instead.
The wasmtime runtime does not actually fail on an allocation failure
(assuming the memory allocator does not abort and returns nullptr
instead - which our seastar allocator does). What happens then
depends on the failed allocation handling of the code that was
compiled to WASM. If the original code threw an exception or aborted,
the resulting WASM code will trap. To make sure that we can handle
the trap, we need to allow wasmtime to handle SIGILL signals, because
that is what is used to carry information about WASM traps.
The new test uses a special WASM Memory allocator that fails after
n allocations, and the allocations include both memory growth
instructions in WASM, as well as growing memory manually using the
wasmtime API.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
The wasmtime runtime allocates memory for the executable code of
the WASM programs using mmap and not the seastar allocator. As
a result, the memory that Scylla actually uses becomes not only
the memory preallocated for the seastar allocator but the sum of
that and the memory allocated for executable codes by the WASM
runtime.
To keep limiting the memory used by Scylla, we measure how much
memory the WASM programs use and, if they use too much, compiled
WASM UDFs (modules) that are currently not in use are evicted to
make room.
To evict a module it is required to evict all instances of this
module (the underlying implementation of modules and instances uses
shared pointers to the executable code). For this reason, we add
reference counts to modules. Each instance using a module is a
reference. When an instance is destroyed, a reference is removed.
If all references to a module are removed, the executable code
for this module is deallocated.
The eviction of a module is actually achieved by eviction of all
its references. When we want to free memory for a new module we
repeatedly evict instances from the wasm_instance_cache using its
LRU strategy until some module loses all its instances. This
process may not succeed if the instances currently in use (so not
in the cache) use too much memory - in this case the query also
fails. Otherwise the new module is added to the tracking system.
This strategy may evict some instances unnecessarily, but evicting
modules should not happen frequently, and any more efficient
solution requires an even bigger intervention into the code.
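The eviction loop described above can be sketched as follows; this is Python pseudocode with hypothetical names, not the actual Seastar/C++ implementation:

```python
from collections import OrderedDict

class ModuleTracker:
    """Sketch of the strategy described above: evict cached instances in
    LRU order until some module loses its last instance, at which point
    that module's executable code can be freed."""
    def __init__(self):
        self.refcount = {}          # module -> number of live instances
        self.cache = OrderedDict()  # instance -> module, oldest first

    def add_instance(self, instance, module):
        self.refcount[module] = self.refcount.get(module, 0) + 1
        self.cache[instance] = module

    def evict_until_module_freed(self):
        # Returns the first module whose last instance was evicted, or
        # None if the cache ran dry (the query would then fail).
        while self.cache:
            instance, module = self.cache.popitem(last=False)  # LRU end
            self.refcount[module] -= 1
            if self.refcount[module] == 0:
                del self.refcount[module]
                return module
        return None
```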
Different users may require different limits for their UDFs. This
patch allows them to configure the size of the WASM instance cache,
the maximum size of individual instances stored in the cache, the
time after which unused instances are evicted, the fuel that WASM
UDFs are allowed to consume before yielding (to control latency),
the fuel that WASM UDFs are allowed to consume in total (to allow
longer computations in a UDF without it being flagged as an
infinite loop), and the hard limit on the size of executed UDFs
(to avoid large allocations).
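For illustration, the knobs described above might appear in scylla.yaml roughly as follows. The option names and values here are hypothetical placeholders chosen to mirror the description, not necessarily the exact keys introduced by this patch:

```yaml
# Hypothetical option names for illustration only; check the actual
# scylla.yaml reference for the keys added by this patch.
wasm_cache_memory_limit: 100MB       # total size of the instance cache
wasm_cache_instance_size_limit: 1MB  # largest instance the cache keeps
wasm_cache_timer_period_in_ms: 5000  # idle time before eviction
wasm_udf_yield_fuel: 100000          # fuel consumed before the UDF yields
wasm_udf_total_fuel: 100000000       # total fuel budget per invocation
wasm_udf_memory_limit: 2MB           # hard cap on a UDF's size
```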
The new implementation for WASM UDFs allows executing the UDFs
in pieces. This commit adds a test asserting that the UDF is in fact
divided and that each of the execution segments takes no longer than
1ms.
This patch replaces all dependencies on the wasmtime
C++ bindings with our new ones.
The wasmtime.hh and wasm_engine.hh files are deleted.
The libwasmtime.a library is no longer required by
configure.py. The SCYLLA_ENABLE_WASMTIME macro is
removed and wasm udfs are now compiled by default
on all architectures.
In terms of implementation, most of code using
wasmtime was moved to the Rust source files. The
remaining code uses names from the new bindings
(which are mostly unchanged). Most of wasmtime objects
are now stored as a rust::Box<>, to make it compatible
with rust lifetime requirements.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
The C++ bindings provided by wasmtime are lacking a crucial
capability: asynchronous execution of the wasm functions.
This forces us to stop the execution of the function after
a short time to prevent increasing the latency. Fortunately,
this feature is implemented in the native language
of Wasmtime - Rust. Support for Rust was recently added to
Scylla, so we can implement the async bindings ourselves,
which is done in this patch.
The bindings expose all the objects necessary for creating
and calling wasm functions. The majority of code implemented
in Rust is a translation of code that was previously present
in C++.
Types exported from Rust are currently required to be defined
by the same crate that contains the bridge using them, so
wasmtime types can't be exported directly. Instead, for each
class that was supposed to be exported, a wrapper type is
created, where its first member is the wasmtime class. Note
that the members are not visible from C++ anyway, the
difference only applies to Rust code.
Aside from wasmtime types and methods, two additional types
are exported with some associated methods.
- The first one is ValVec, which is a wrapper for a rust Vec
of wasmtime Vals. The underlying vector is required by
wasmtime methods for calling wasm functions. By having it
exported we avoid multiple conversions from a Val wrapper
to a wasmtime Val, as would be required if we exported a
rust Vec of Val wrappers (the rust Vec itself does not
require wrappers if the type it contains is already wrapped)
- The second one is Fut. This class represents a computation
that may or may not be ready. We're currently using it
to control the execution of wasm functions from C++. This
class exposes one method: resume(), which returns a bool
that signals whether the computation is finished or not.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
A cargo profile is created for each of the build modes: dev, debug,
sanitize, release and coverage. The names of cargo profiles are
prefixed with "rust-" because cargo does not allow separate "dev"
and "debug" profiles.
The main difference between the profiles is their optimization level,
which correlates with the levels used in configure.py. The debug info
is stripped only in the dev mode, and only this mode uses
"incremental" compilation to speed it up.
Currently, the rust build system in Scylla creates a separate
static library for each included rust package. This could cause
duplicate symbol issues when linking against multiple libraries
compiled from rust.
This issue is fixed in this patch by creating a single static library
to link against, which combines all rust packages implemented in
Scylla.
The Cargo.lock for the combined build is now tracked, so that all
users of the same scylla version also use the same versions of
imported rust modules.
Additionally, the rust package implementation and usage
docs are modified to be compatible with the build changes.
This patch also adds a new header file 'rust/cxx.hh' that contains
definitions of additional rust types available in c++.
Rust's cargo caches downloaded sources in ~/.cargo. However dbuild
won't provide access to this directory since it's outside the source
directory.
Prepare for sharing the cargo cache between the host and the dbuild
environment by:
- Creating the cache if it doesn't already exist. This is likely if
the user only builds in a dbuild environment.
- Propagating the cache directory as a mounted volume.
- Respecting the CARGO_HOME override.
It's been a long while since we built ScyllaDB for s390x, and in
fact the last time I checked it was broken on the ragel parser
generator generating bad source files for the HTTP parser. So just
drop it from the list.
I kept s390x in the architecture mapping table since it's still valid.
Closes#12455
This commit removes consume_in_reverse::legacy_half_reverse, an option
once used to indicate that the given key ranges are sorted descending,
based on the clustering key of the start of the range, and that the
range tombstones inside partition would be sorted (descending, as all
the mutation fragments would) according to their end (but range
tombstones would still be stored according to their start bound).
As it turns out, mutation::consume, when called with the legacy_half_reverse
option, produces an invalid fragment stream, one where all the row
tombstone changes come after all the clustering rows. This was not an
issue, since when constructing results from the query, Scylla would not
pass the tombstones to the client, but instead compact data beforehand.
In this commit, the consume_in_reverse::legacy_half_reverse is removed,
along with all the uses.
As for the swap out in mutation_partition.cc in query_mutation and
to_data_query_result:
The downstream was not prepared to deal with legacy_half_reverse.
mutation::consume contains
```
if (reverse == consume_in_reverse::yes) {
while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::yes>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
co_await yield();
}
} else {
while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::no>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
co_await yield();
}
}
```
So why did it work at all? to_data_query_result deals with a single slice.
The used consumer (compact_for_query_v2) compacts-away the range tombstone
changes, and thus the only difference between the consume_in_reverse::no
and consume_in_reverse::yes was that one was ordered increasing wrt. ckeys
and the second one was ordered decreasing. This property is maintained if
we swap out for the consume_in_reverse::yes format.
Aborting of repair operation is fully managed by task manager.
Repair tasks are aborted:
- on shutdown; top level repair tasks subscribe to global abort source. On shutdown all tasks are aborted recursively
- through node operations (applies to data_sync_repair_task_impls and their descendants only); data_sync_repair_task_impl subscribes to node_ops_info abort source
- with task manager api (top level tasks are abortable)
- with storage_service api and on failure; these cases were modified to be aborted the same way as the ones from above are.
Closes#12085
* github.com:scylladb/scylladb:
repair: make top level repair tasks abortable
repair: unify a way of aborting repair operations
repair: delete sharded abort source from node_ops_info
repair: delete unused node_ops_info from data_sync_repair_task_impl
repair: delete redundant abort subscription from shard_repair_task_impl
repair: add abort subscription to data sync task
tasks: abort tasks on system shutdown
Similar to the way we allow aborting streaming-based
removenode, subscribe to storage_service::_abort_source
to request abort locally and pass a shared_ptr<abort_source>
to `node_ops_info`, used to abort removenode_with_repair
on shutdown.
Fixes#12429
Closes#12430
* github.com:scylladb/scylladb:
storage_service: restore_replica_count: demote status_checker related logging to debug level
storage_service: restore_replica_count: allow aborting removenode_with_repair
storage_service: coroutinize restore_replica_count
storage_service: restore_replica_count: undefer stop_status_checker
storage_service: restore_replica_count: handle exceptions from stream_async and send_replication_notification
storage_service: restore_replica_count: coroutinize status_checker
Unlike other experimental features, we want raft to be opt-in even
after it leaves experimental mode. For that we need a separate
option to enable it. The patch adds the binary option "consistent-cluster-management"
for that.
* 'consistent-cluster-management-flag' of github.com:scylladb/scylla-dev:
raft: replace experimental raft option with dedicated flag
main: move supervisor notification about group registry start where it actually starts
Sometimes we may need task status to be nothrow move constructible.
httpd::task_manager_json::task_status does not satisfy this requirement.
retrieve_status returns future<full_task_status> instead of future<task_status>
to provide an intermediate struct with better properties. An argument
is passed by reference to prevent the necessity to copy foreign_ptr.
Fixes https://github.com/scylladb/scylladb/issues/12314
This PR adds the upgrade guide for ScyllaDB Enterprise - from version
2022.1 to 2022.2. Using this opportunity, I've replaced "Scylla" with
"ScyllaDB" in the upgrade-enterprise index file.
In previous releases, we added several upgrade guides - one per platform
(and version). In this PR, I've merged the information for different
platforms to create one generic upgrade guide. It is similar to what
@kbr- added for the Open Source upgrade guide from 5.0 to 5.1. See
https://docs.scylladb.com/stable/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/.
Closes#12339
* github.com:scylladb/scylladb:
docs: add the info about minor release
docs: add the new upgade guide 2022.1 to 2022.2 to the index and the toctree
docs: add the index file for the new upgrage guide from 2022.1 to 2022.2
docs: add the metrics update file to the upgrade guide 2022.1 to 2022.2
docs: add the upgrade guide for ScyllaDB Enterprise from 2022.1 to 2022.2
The status_checker is not the main line of business
of restore_replica_count; starting and stopping it
does not seem to deserve info-level logging, which
might have been useful in the past to debug issues
surrounding it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Similar to the way we allow aborting streaming-based
removenode, subscribe to storage_service::_abort_source
to request abort locally and pass a shared_ptr<abort_source>
to `node_ops_info`, used to abort removenode_with_repair
on shutdown.
Fixes#12429
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that all exceptions in the rest of the function
are swallowed, just execute the stop_status_checker
deferred action serially before returning, on the
way to coroutinizing restore_replica_count (since
we can't co_await status_checker inside the deferred
action).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
On the way to coroutinizing restore_replica_count,
extract awaiting stream_async and send_replication_notification
into try/catch blocks so we can later undefer stop_status_checker.
The exception is still returned as an exceptional future
which is logged by the caller as warning.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
There is no need to start a thread for the status_checker
and can be implemented using a background coroutine.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Timeouts are benign, especially on a read-ahead that turned out not to be
needed at all. They just introduce noise in the logs, so silence them.
Fixes: #12435
Closes#12441
raft_group0 used to register RPC verbs only on shard 0. This worked on
clusters with the same --smp setting on all nodes, since RPCs in this
case are processed on the same shard as the calling code, and
raft_group0 methods only run on shard 0.
A new test test_nodes_with_different_smp was added to identify the
problem. Since --smp can only be specified via the command line, a
corresponding parameter was added to the ManagerClient.server_add
method. It allows overriding the default parameters
SCYLLA_CMDLINE_OPTIONS variable by changing, adding or deleting
individual items.
Fixes: #12252
Closes#12374
* github.com:scylladb/scylladb:
raft: raft_group0, register RPC verbs on all shards
raft: raft_append_entries, copy entries to the target shard
test.py, allow to specify the node's command line in test
Due to the lack of the NDEBUG macro, inlining was disabled. Inlining
is important for parsing and printing performance.
Testing with perf_simple_query shows that it reduced around
7000 insns/op, thus increasing median tps by 4.2% for the alternator frontend.
Because inlined functions are called for every character
in json this scales with request/response size. When
default write size is increased by around 7x (from ~180 to ~ 1255
bytes) then the median tps increased by 12%.
Running:
./build/release/test/perf/perf_simple_query_g --smp 1 \
--alternator forbid --default-log-level error \
--random-seed=1235000092 --duration=60 --write
Results before the patch:
median 46011.50 tps (197.1 allocs/op, 12.1 tasks/op, 170989 insns/op, 0 errors)
median absolute deviation: 296.05
maximum: 46548.07
minimum: 42955.49
Results after the patch:
median 47974.79 tps (197.1 allocs/op, 12.1 tasks/op, 163723 insns/op, 0 errors)
median absolute deviation: 303.06
maximum: 48517.53
minimum: 44083.74
The change affects both json parsing and printing.
Closes#12440
Run tests for parallelized aggregation with
`enable_parallelized_aggregation` set always to true, so the tests work
even if the default value of the option is false.
Closes#12409
Now that we don't accept cql protocol versions 1 or 2, we can
drop cql_serialization_format everywhere, except in the IDL
(since it's part of the inter-node protocol).
A few functions had duplicate versions, one with and one without
a cql_serialization_format parameter. They are deduplicated.
Care is taken that `partition_slice`, which communicates
the cql_serialization_format across nodes, still presents
a valid cql_serialization_format to other nodes when
transmitting itself and rejects protocol 1 and 2 serialization
format when receiving. The IDL is unchanged.
One test checking the 16-bit serialization format is removed.
Version 3 was introduced in 2014 (Cassandra 2.1) and was supported
in the very first version of Scylla (2a7da21481 "CQL binary protocol").
Cassandra 3.0 (2015) dropped protocols 1 and 2 as well.
It's safe enough to drop it now, 9 years after introduction of v3
and 7 years after Cassandra stopped supporting it.
Dropping it allows dropping cql_serialization_format, which causes
quite a lot of pain, and is probably broken. This will be dropped in the
following patch.
* seastar 3db15b5681...ca586cfb8d (28):
> reactor: trim returned buffer to received number of bytes
> util/process: include used header
> build: drop unused target_include_directories()
> build: use BUILD_IN_SOURCE instead chdir <SOURCE_DIR>
> build: specify CMake policy CMP0135 to new
> tests: only destroy allocated pending connections
> build: silence the output when generating private keys
> tests, httpd: Limit loopback connection factory sharding
> lw_shared_ptr: Add nullptr_t comparing operators
> noncopyable_function: Add concept for (Func func) constructor
> reactor: add process::terminate() and process::kill()
> Merge 'tests, include: include headers without ".." in path' from Kefu Chai
> build: customize toolset for building Boost
> build: use different toolset base on specified compiler
> allocator: add an option to reserve additional memory for the OS
> Merge 'build: pass cflags and ldflags to cooking.sh' from Kefu Chai
> build: build static library of cryptopp
> gate: add gate holders debugging
> build: detect debug build of yaml-cpp also
> build: do not use pkg_search_module(IMPORTED_TARGET) for finding yaml-cpp
> build: bump yaml-cpp to 0.7.0 in cooking_recipe
> build: bump cryptopp to 8.7.0 in cooking_recipe
> build: bump boost to 1.81.0 in cooking_recipe
> build: bump fmtlib to 9.1.0 in cooking_recipe
> shared_ptr: add overloads for fmt::ptr()
> chunked_fifo: const_iterator: use the base class ctor
> build: s/URING_LIBARIES/URING_LIBRARIES/
> build: export the full path of uring with URING_LIBRARIES
Closes#12434
test_ssl: In very slow debug builds the default driver timeouts are too
low and tests might fail. Bump up the values to a more reasonable time.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#12408
The alternator compatibility.md document mentions the missing ACL
(access control) feature, but unlike other missing features we
forgot to link to the open issue about this missing feature.
So let's add that link.
Refs #5047.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12399
Fixes https://github.com/scylladb/scylladb/issues/12318
This PR removes all occurrences of the `auto_bootstrap` option in the docs.
In most cases, I've simply removed the option name and its definition, but sometimes additional changes were necessary:
- In node-joined-without-any-data.rst, I removed the `auto_bootstrap` option as one of the causes of the problem.
- In rebuild-node.rst, I removed the first step in the procedure (enabling the `auto_bootstrap` option).
- In admin.rst, I removed the section about manual bootstrapping - it's based on setting `auto_bootstrap` to false, which is not possible now.
Closes#12419
* github.com:scylladb/scylladb:
docs: remove the auto_bootstrap option from the admin procedures - involves removing the Manual Bootstraping section
docs: remove the auto_bootstrap option from the procedure to replace a dead node
docs: remove the auto_bootstrap option from the Troubleshooting article about a node joining with no data
docs: remove the auto_bootstrap option from the procedure to rebuild a node after losing the data volume
docs: remove the auto_bootstrap option from the procedures to create a cluster or add a DC
And the infrastructure in reader_permit to update them. The
infrastructure is not wired in yet.
These metrics will be used to count the number of reads that went to disk
and the number of sstables currently being read, respectively.
Rename to reads_memory_consumption and drop the "active" from the
description as well. This metric tracks the memory consumption of all
reads: active or inactive. We don't even currently have a way to track
the memory consumption of only active reads.
Drop the part of the description which explains the interaction with
other metrics: this part is outdated and the new interactions are much
more complicated, no way to explain in a metric description.
Also ask the semaphore to calculate the memory amount, instead of doing
it in the metric itself.
raft_group0 used to register RPC verbs only on shard 0.
This worked on clusters with the same --smp setting on
all nodes, since RPCs in this case are (usually)
processed on the same shard as the calling code,
and raft_group0 methods only run on shard 0.
A new test test_nodes_with_different_smp was added
to identify the problem.
Fixes: #12252
This metric has been broken for a long time, since inactive reads were
introduced. As calculated currently, it includes all permits that passed
admission, including inactive reads. On the other hand, it excludes
permits created bypassing admission.
Fix by using the newly introduced (in this patch)
reader_concurrency_semaphore::active_reads() as the basis of this
metric: this now includes all permits (reads) that are currently active,
excluding waiters and inactive reads.
If append_entries RPC was received on a non-zero shard, we may
need to pass it to a zero (or, potentially, some other) shard.
The problem is that raft::append_request contains entries in the form
of raft::log_entry_ptr == lw_shared_ptr<log_entry>, which doesn't
support cross-shard reference counting. In debug mode it contains
a special ref-counting facility debug_shared_ptr_counter_type,
which resorts to on_internal_error if it detects such a case.
To solve this, we just copy log entries to the target shard if it
isn't equal to the current one. In most cases, if --smp setting
is the same on all nodes, RPC will be handled on zero shard,
so there will be no overhead.
An optional parameter cmdline has been added to
the ManagerClient.server_add method.
It allows you to override the default parameters
set by the SCYLLA_CMDLINE_OPTIONS variable
by changing, adding or deleting individual
items. To change or add a parameter just specify
its name and value one after the other.
To remove a parameter, use the special keyword
__remove__ as its value. To set a parameter
without a value (such as --overprovisioned)
use the special keyword __missing__ as the value.
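The override semantics described above can be sketched as follows; the helper name and parsing details are assumptions for illustration, not the actual test.py code:

```python
def merge_cmdline(defaults, overrides):
    """Merge override options into a default command line.
    Both arguments are flat lists like ["--smp", "2", "--overprovisioned"].
    In `overrides`, the value "__remove__" deletes a parameter and
    "__missing__" keeps it as a bare flag with no value."""
    def parse(items):
        # Flat list -> {name: value-or-None}, preserving order.
        opts, i = {}, 0
        while i < len(items):
            name = items[i]
            if i + 1 < len(items) and not items[i + 1].startswith("--"):
                opts[name] = items[i + 1]
                i += 2
            else:
                opts[name] = None
                i += 1
        return opts

    merged = parse(defaults)
    for name, value in parse(overrides).items():
        if value == "__remove__":
            merged.pop(name, None)
        elif value == "__missing__":
            merged[name] = None
        else:
            merged[name] = value

    out = []
    for name, value in merged.items():
        out.append(name)
        if value is not None:
            out.append(value)
    return out
```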
Add to test/cql-pytest/README.md an explanation of the philosophy
of the cql-pytest test suite, and some guidelines on how to write
good tests in that framework.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12400
Unlike other experimental features, we want raft to be optional even
after it leaves experimental mode. For that we need a separate
option to enable it. The patch adds the binary option "consistent-cluster-management"
for that.
The PR introduces changes to task manager api:
- extends tasks' list returned with get_tasks with task type,
keyspace, table, entity, and sequence number
- extends status returned with get_task_status and wait_task
with a list of children's ids
Closes#12338
* github.com:scylladb/scylladb:
api: extend status in task manager api
api: extend get_tasks in task manager api
Fixes https://github.com/scylladb/scylladb/issues/11999.
This PR adds a description of scylla-api-cli.
Closes#12392
* github.com:scylladb/scylladb:
docs: fix the description of the system log POST example
docs: update the curl tool name
docs: describe how to use the scylla-api-client tool
docs: fix the scylla-api-client tool name
docs: document scylla-api-cli
When closing _lower_bound and *_upper_bound
in the final close() call, they are currently left with
an engaged current_list member.
If the index_reader uses a _local_index_cache,
it is evicted with evict_gently which will, rightfully,
see the respective pages as referenced, and they won't be
evicted gently (only later when the index_reader is destroyed).
Reset index_bound.current_list on close(index_bound&)
to free up the reference.
Ref #12271
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12370
Since [repair: Always use run_replace_ops](2ec1f719de), nodes no longer publish HIBERNATE state so we don't need to support handling it.
Replace is now always done using node operations (using repair or streaming),
so nodes are never expected to change status to HIBERNATE.
Therefore storage_service:handle_state_replacing is not needed anymore.
This series gets rid of it and updates documentation related to STATUS:HIBERNATE respectively.
Fixes#12330
Closes#12349
* github.com:scylladb/scylladb:
docs: replace-dead-node: get rid of hibernate status
storage_service: get rid of handle_state_replacing
We already check if the remote node's topology is missing before creating a
connection, but the local node's topology can be missing too once we use
raft to manage it. Raft needs to be able to create connections before the
topology is known.
Message-Id: <20221228144944.3299711-7-gleb@scylladb.com>
database_test is timing out because it has to run the tests calling
do_with_cql_env_and_compaction_groups 3x, once for each compaction group
setting. Reduce it to 2 settings instead of 3 when running in debug mode.
Refs #12396.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12421
This lets us carry fewer things and rely on the distribution
for maintenance.
The frozen toolchain is updated. Incidental updates include clang 15.0.6,
and pytest that doesn't need workarounds.
Closes#12397
This reverts commit ac2e2f8883. It causes
a regression ("std::bad_variant_access in load_view_build_progress").
Commit 2978052113 (a reindent) is also reverted as part of
the process.
Fixes#12395
In commit acfa180766 we added to
test/cql-pytest a mechanism to detect when Scylla crashes in the middle
of a test function - in which case we report the culprit test and exit
immediately to avoid having a hundred more tests report that they failed
as well just because Scylla was down.
However, if Scylla was *never* up - e.g., if the user ran "pytest" without
ever running Scylla - we still report hundreds of tests as having failed,
which is confusing and not helpful.
So with this patch, if a connection cannot be made to Scylla at all,
the test exits immediately, explaining what went wrong, not blaming
any specific test:
$ pytest
...
! _pytest.outcomes.Exit: Cannot connect to Scylla at --host=localhost --port=9042 !
============================ no tests ran in 0.55s =============================
Beyond being a helpful reminder for a developer who runs "pytest" without
having started Scylla first (or using test/cql-pytest/run or test.py to
start Scylla easily), this patch is also important when running tests
through test.py if it reuses an instance of Scylla that crashed during an
earlier pytest file's run.
This patch does not fix test.py - it can still try to run pytest with
a dead Scylla server without checking. But at least with this patch
pytest will notice this problem immediately and won't report hundreds of
test functions having failed. The only report the user will see will be
the last test which crashed Scylla, which will make it easier to find
this failure without being hidden between hundreds of spurious failures.
Fixes#12360
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12401
This patch enables offstrategy compaction for all classic streaming
based node ops. We can use this method because tables are streamed one
after another. As long as there is still streamed data for a given
table, we update the automatic trigger timer. When all the streaming has
finished, the trigger timer will timeout and fire the offstrategy
compaction for the given table.
I checked that with this patch, rebuild is 3X faster. There was no compaction
in the middle of the streaming. The streamed sstables are compacted
together after streaming is done.
Time Before:
INFO 2022-11-25 10:06:08,213 [shard 0] range_streamer - Rebuild
succeeded, took 67 seconds, nr_ranges_remaining=0
Time After:
INFO 2022-11-25 09:42:50,943 [shard 0] range_streamer - Rebuild
succeeded, took 23 seconds, nr_ranges_remaining=0
Compaction Before:
88 sstables were written -> 88 sstables were added into the main set
Compaction After:
88 sstables were written -> after offstrategy, 2 sstables were added into the main set
Closes#11848
invoke_on_task is used in translation units where its definition is not
visible, yet it has no explicit instantiations. If the compiler always
decides to inline the definition, not to instantiate it implicitly,
linking invoke_on_task will fail. (It happened to me when I turned up
inline-threshold). Fix that.
Closes#12387
In very slow debug builds the default driver timeouts are too low and
tests might fail. Bump up the values to more reasonable time.
These timeout values are the same as used in topology tests.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#12405
When the mutation compactor has all the rows it needs for a page, it
saves the decision to stop in a member flag: _stop.
For single partition queries, the mutation compactor is kept alive
across pages and so it has a method, start_new_page() to reset its state
for the next page. This method didn't clear the _stop flag. This meant
that the value set at the end of the previous page could cause the new page
and subsequently the entire query to be stopped prematurely.
This can happen if the new page starts with a row that is covered by a
higher level tombstone and is completely empty after compaction.
Reset the _stop flag in start_new_page() to prevent this.
This commit also adds a unit test which reproduces the bug.
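The bug and its fix can be modeled with a toy stateful compactor; the names below are illustrative, not Scylla's actual classes:

```python
class PageCompactor:
    """Toy model: a compactor reused across pages must reset its stop
    decision in start_new_page(), or the previous page's stop leaks
    into the next page and ends the query prematurely."""
    def __init__(self, rows_per_page):
        self.rows_per_page = rows_per_page
        self._emitted = 0
        self._stop = False

    def start_new_page(self):
        self._emitted = 0
        self._stop = False  # the fix: without this reset, _stop carried
                            # over from the previous page stops the query

    def consume_row(self):
        if self._stop:
            return False    # without the reset, this fires on page start
        self._emitted += 1
        if self._emitted >= self.rows_per_page:
            self._stop = True  # page is full: decide to stop
        return True
```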
Fixes: #12361
Closes#12384
On some docker instance configurations, hostname resolution does not
work, so our script fails on startup because we use hostname -i to
construct cqlshrc.
To prevent the error, we can use --rpc-address or --listen-address
for the address, since it should be the same.
Fixes#12011
Closes#12115
In case a table is dropped, we should ignore it in the repair_updater,
since we cannot update the off-strategy trigger for a dropped table.
Refs #12373
Closes#12388
Every 1 hour, compaction manager will submit all registered table_state
for a regular compaction attempt, all without yielding.
This can potentially cause a reactor stall if there are 1000s of table
states, as compaction strategy heuristics will run on behalf of each,
and processing all buckets and picking the best one is not cheap.
This problem can be magnified with compaction groups, as each group
is represented by a table state.
This might appear in the dashboard as periodic stalls, every 1h, misleading
the investigator into believing that the problem is caused by a
scheduled (cron-like) job.
This is fixed by piggybacking on compaction reevaluation loop which
can yield between each submission attempt if needed.
Fixes#12390.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12391
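The yielding submission loop described above can be sketched with Python's event loop standing in for the Seastar reactor (function names here are ours, not Scylla's):

```python
import asyncio

async def maybe_yield():
    # cooperative scheduling point, analogous to seastar's maybe_yield():
    # gives other tasks a chance to run between iterations
    await asyncio.sleep(0)

async def submit_all_for_compaction(table_states, submit):
    # Sketch of the fix: submit each registered table state for a regular
    # compaction attempt, yielding between submissions so that thousands
    # of table states cannot stall the reactor in one uninterrupted run.
    for ts in table_states:
        submit(ts)
        await maybe_yield()
```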
They are currently missing from the printout
when a table is created, but they are instrumental
to understanding the mode with which tombstones are to
be garbage-collected in the table. gcGraceSeconds alone
is no longer enough since the introduction of
tombstone_gc_option in a8ad385ecd.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12381
In issue #10767, concerns were raised that the CLUSTERING ORDER BY
clause is handled incorrectly in a CREATE MATERIALIZED VIEW definition.
The tests in this patch try to explore the different ways in which
CLUSTERING ORDER BY can be used in CREATE MATERIALIZED VIEW and allow
us to compare Scylla's behavior to Cassandra's, and to common sense.
The tests discover that the CLUSTERING ORDER BY feature in materialized
views generally works as expected, but there are *three* differences
between Scylla and Cassandra in this feature. We consider two differences
to be bugs (and hence the test is marked xfail) and one a Scylla extension:
1. When a base table has a reverse-order clustering column and this
clustering column is used in the materialized view, in Cassandra
the view's clustering order inherits the reversed order. In Scylla,
the view's clustering order reverts to the default order.
Arguably, both behaviors can be justified, but usually when in doubt
we should implement Cassandra's behavior - not pick a different
behavior, even if the different behavior is also reasonable. So
this test (test_mv_inherit_clustering_order()) is marked "xfail",
and a new issue was created about this difference: #12308.
If we want to fix this behavior to match Cassandra's we should also
consider backward compatibility - what happens if we change this
behavior in Scylla now, after we had the opposite behavior in
previous releases? We may choose to enshrine Scylla's Cassandra-
incompatible behavior here - and document this difference.
2. The CLUSTERING ORDER BY should, as its name suggests, only list
clustering columns. In Scylla, specifying other things, like regular
columns, partition-key columns, or non-existent columns, is silently
ignored, whereas it should result in an Invalid Request error (as it
does in Cassandra). So test_mv_override_clustering_order_error()
is marked "xfail".
This is the difference already discovered in #10767.
3. When a materialized view has several clustering columns, Cassandra
requires that a CLUSTERING ORDER BY clause, if present, must specify
the order of *all* clustering columns. Scylla, in contrast,
allows the user to override the order of only *some* of these columns -
and the rest get the default order. I consider this to be a
legitimate Scylla extension, and not a compatibility bug, so marked
the test with "scylla_only", and no issue was opened about it.
Refs #10767
Refs #12308
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12307
This patch adds a scylla_inject_error(), a context manager which tests
can use to temporarily enable some error injection while some test
code is running. It can be used to write tests that artificially
inject certain errors instead of trying to reach the elaborate (and
often requiring precise timing or high amounts of data) situation where
they occur naturally.
The error-injection API is Scylla-specific (it uses the Scylla REST API)
and does not work on "release"-mode builds (all other modes are supported),
so when Cassandra or release-mode build are being tested, the test which
uses scylla_inject_error() gets skipped.
Example usage:
```python
from rest_api import scylla_inject_error
with scylla_inject_error(cql, "injection_name", one_shot=True):
    # do something here
    ...
```
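One way such a helper can be structured is a context manager that enables the injection on entry and always disables it on exit. The `client` object and its `enable_injection`/`disable_injection` methods below are hypothetical stand-ins for the Scylla REST API calls, not the real interface:

```python
from contextlib import contextmanager

@contextmanager
def inject_error(client, name, one_shot=False):
    # Enable the named error injection for the duration of a `with` block.
    client.enable_injection(name, one_shot=one_shot)
    try:
        yield
    finally:
        # always disable, even if the test body raised an exception
        client.disable_injection(name)
```

The `finally` clause is what makes the helper safe to use in tests: a failing test body cannot leave the injection enabled for subsequent tests.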
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12264
Retrieves the configuration item with the given name and prints its
value as well as its metadata.
Example:
(gdb) scylla get-config-value compaction_static_shares
value: 100, type: "float", source: SettingsFile, status: Used, live: MustRestart
Closes#12362
* github.com:scylladb/scylladb:
scylla-gdb.py: add scylla get-config-value gdb command
scylla-gdb.py: extract $downcast_vptr logic to standalone method
test: scylla-gdb/run: improve diagnostics for failed tests
These options have been nonsense since 2017.
--pie and --so are ignored, --static disables (sic!) static linking of
libraries.
Remove them.
Closes#12366
Retrieves the configuration item with the given name and prints its
value as well as its metadata.
Example:
(gdb) scylla get-config-value compaction_static_shares
value: 100, type: "float", source: SettingsFile, status: Used, live: MustRestart
Due to an oversight, the local index cache isn't evicted gently
when _upper_bound exists. This is a source of reactor stalls.
Fix that.
Fixes#12271
Closes#12364
The consume_in_reverse::legacy_half_reverse format is soon to be phased
out. This commit starts treating frozen_mutations from replicas for
reversed queries so that they are consumed with consume_in_reverse::yes.
Currently the scylla tools (`scylla-types` and `scylla-sstable`) have documentation in two places: high level documentation can be found at `docs/operating-scylla/admin-tools/scylla-{types,sstable}.rst`, while low level, more detailed documentation is embedded in the tool itself. This is especially pronounced for `scylla-sstable`, which only has a short description of its operations online, all details being found only in the command-line help.
We want to move away from this model, such that all documentation can be found online, with the command-line help being reserved to documenting how the various switches and flags work, on top of a short description of the operation and a link to the detailed online docs.
Closes#12284
* github.com:scylladb/scylladb:
tool/scylla-sstable: move documentation online
docs: scylla-sstable.rst: add sstable content section
docs: scylla-{sstable,types}.rst: drop Syntax section
Allows static configuration of number of compaction groups per table per shard.
To bootstrap the project, config option x_log2_compaction_groups was added which controls both number of groups and partitioning within a shard.
With a value of 0 (default), it means 1 compaction group, therefore all tokens go there.
With a value of 3, it means 8 compaction groups, and 3 most-significant-bits of tokens being used to decide which group owns the token.
And so on.
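The token-to-group rule described above can be sketched as a small pure function (a model of the described partitioning, not the actual Scylla code):

```python
def compaction_group_of(token: int, x_log2_compaction_groups: int) -> int:
    # Sketch of the partitioning rule: the config value is log2 of the
    # group count, and the top bits of the 64-bit token pick the group.
    assert 0 <= x_log2_compaction_groups < 64
    if x_log2_compaction_groups == 0:
        return 0  # a single compaction group owns all tokens
    unsigned = token & 0xFFFFFFFFFFFFFFFF  # treat the token as unsigned 64-bit
    return unsigned >> (64 - x_log2_compaction_groups)
```

For example, with x_log2_compaction_groups = 3 there are 1 << 3 = 8 groups, and the 3 most-significant bits of the token select one of them.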
It's still missing:
- integration with repair / streaming
- integration with reshard / reshape.
perf/perf_simple_query --smp 1 --memory 1G
BEFORE
-----
median 61358.55 tps ( 71.1 allocs/op, 12.2 tasks/op, 56375 insns/op, 0 errors)
median 61322.80 tps ( 71.1 allocs/op, 12.2 tasks/op, 56391 insns/op, 0 errors)
median 61058.58 tps ( 71.1 allocs/op, 12.2 tasks/op, 56386 insns/op, 0 errors)
median 61040.94 tps ( 71.1 allocs/op, 12.2 tasks/op, 56381 insns/op, 0 errors)
median 61118.40 tps ( 71.1 allocs/op, 12.2 tasks/op, 56379 insns/op, 0 errors)
AFTER
-----
median 61656.12 tps ( 71.1 allocs/op, 12.2 tasks/op, 56486 insns/op, 0 errors)
median 61483.29 tps ( 71.1 allocs/op, 12.2 tasks/op, 56495 insns/op, 0 errors)
median 61638.05 tps ( 71.1 allocs/op, 12.2 tasks/op, 56494 insns/op, 0 errors)
median 61726.09 tps ( 71.1 allocs/op, 12.2 tasks/op, 56509 insns/op, 0 errors)
median 61537.55 tps ( 71.1 allocs/op, 12.2 tasks/op, 56491 insns/op, 0 errors)
Closes#12139
* github.com:scylladb/scylladb:
test: mutation_test: Test multiple compaction groups
test: database_test: Test multiple compaction groups
test: database_test: Adapt it to compaction groups
db: Add config for setting static number of compaction groups
replica: Introduce static compaction groups
test: sstable_test: Stop referencing single compaction group
api: compaction_manager: Stop a compaction type for all groups
api: Estimate pending tasks on all compaction groups
api: storage_service: Run maintenance compactions on all compaction groups
replica: table: Adapt assertion to compaction groups
replica: database: stop and disable compaction on behalf of all groups
replica: Introduce table::parallel_foreach_table_state()
replica: disable auto compaction on behalf of all groups
replica: table: Rework compaction triggers for compaction groups
replica: Adapt table::get_sstables_including_compacted_undeleted() to compaction groups
replica: Adapt table::rebuild_statistics() to compaction groups
replica: table: Perform major compaction on behalf of all groups
replica: table: Perform off-strategy compaction on behalf of all groups
replica: table: Perform cleanup compaction on behalf of all groups
replica: Extend table::discard_sstables() to operate on all compaction groups
replica: table: Create compound sstable set for all groups
replica: table: Set compaction strategy on behalf of all groups
replica: table: Return min memtable timestamp across all groups
replica: Adapt table::stop() to compaction groups
replica: Adapt table::clear() to compaction groups
replica: Adapt table::can_flush() to compaction groups
replica: Adapt table::flush() to compaction groups
replica: Introduce parallel_foreach_compaction_group()
replica: Adapt table::set_schema() to compaction groups
replica: Add memtables from all compaction groups for reads
replica: Add memtable_count() method to compaction_group
replica: table: Reserve reader list capacity through a callback
replica: Extract addition of memtables to reader list into a new function
replica: Adapt table::occupancy() to compaction groups
replica: Adapt table::active_memtable() to compaction groups
replica: Introduce table::compaction_groups()
replica: Preparation for multiple compaction groups
scylla-gdb: Fix backward compatibility of scylla_memtables command
Said mechanism broke tools and tests to some extent: the read it executes at sstable load time means that if the sstable is broken enough to fail this read, it will fail to load, preventing diagnostic tools from loading and examining it, and preventing tests from producing broken sstables for testing purposes.
Closes#12359
* github.com:scylladb/scylladb:
sstables: allow bypassing first/last position metadata loading
sstables: sstable::{load,open_data}(): fix indentation
sstables: coroutinize sstable::open_data()
sstables: sstable::open_data(): use clear_gently() to clear token ranges
sstables: coroutinize sstable::load()
The type of the id of node operations is changed from utils::UUID
to node_ops_id. This way the id of node operations can be easily
distinguished from the ids of other entities.
Closes#11673
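The same strong-typedef idea can be sketched in Python terms (the C++ change uses a wrapper type; `NodeOpsId` and `new_node_ops_id` below are our illustrative names):

```python
from typing import NewType
from uuid import UUID, uuid4

# A distinct type lets static checkers tell node-operation ids apart from
# plain UUIDs, while the runtime value is still a UUID underneath.
NodeOpsId = NewType("NodeOpsId", UUID)

def new_node_ops_id() -> NodeOpsId:
    # mint a fresh node-operation id
    return NodeOpsId(uuid4())
```

A function typed to accept `NodeOpsId` then rejects a bare `UUID` at type-check time, which is exactly the kind of mix-up the commit is guarding against.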
* seastar 3a5db04197...3db15b5681 (27):
> build: get the full path of c-ares
> build: unbreak pkgconfig output
> http: Add 206 Partial Content response code
> http: Carry integer content_length on reply
> tls_test: drop duplicated includes
> tls_test: remove duplicated test case
> reactor: define __NR_pidfd_open if not defined
> sockets: Wait on socket peer closing the connection
> tcp: Close connection when getting RST from server
> Merge 'Enhance rpc tester with delays, timeouts and verbosity' from Pavel Emelyanov
> Merge 'build: use pkg_search_module(.. IMPORTED_TARGET ..) ' from Kefu Chai
> build: define GnuTLS_{LIBRARIES, INCLUDE_DIRS} only if GnuTLS is found
> build: use pkg_search_module(.. IMPORTED_TARGET ..)
> addr2line: extend asan regex
> abort_source: move-assign operator: call base class unlink
> coroutine: correct syntax error in doxygen comment
> demo: Extend http connection demo with https
> test: temporarily disable warning for tests triggering warnings
> tests/unit/coroutine: Include <ranges>
> sstring: Document why sstring exists at all
> test: log error when read/write to pipe fails
> test: use executables in /bin
> tests: spawn_test: use BOOST_CHECK_EQUAL() for checking equality of temporary_buffer
> docker: bump up to clang {14,15} and gcc {11,12}
> shared_ptr: ignore false alarm from GCC-12
> build: check for fix of CWG2631
> circleci: use versioned container image
Closes#12355
In the past we had issue #7933 where very long strings of consecutive
tombstones caused Alternator's paging to take an unbounded amount of
time and/or memory for a single page. This issue was fixed (by commit
e9cbc9ee85) but the two tests we had
reproducing that issue were left with the "xfail" mark.
They were also marked "veryslow" - each taking about 100 seconds - so
they didn't run by default so nobody noticed they started to pass.
In this patch I make these tests much faster (taking less than a second
together), confirm that they pass - and remove the "xfail" mark and
improve their descriptions.
The trick to making these tests faster is to not create a million
tombstones like we used to: we now know that after a string of just 10,000
tombstones ('query_tombstone_page_limit') the page should end, so
we can check specifically this number. The story is more complicated for
partition tombstones, but there too it should be a multiple of
query_tombstone_page_limit. To make the tests even faster, we change
run.py to lower the query_tombstone_page_limit from the default 10,000
to 1000. The tests work correctly even without this change, but they are
ten times faster with it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12350
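The page-count arithmetic the faster tests rely on can be sketched roughly (a simplified model of the row-tombstone case only; as noted above, partition tombstones behave differently):

```python
import math

def pages_for_consecutive_tombstones(n, query_tombstone_page_limit=10_000):
    # A page ends after at most query_tombstone_page_limit consecutive
    # tombstones, so scanning a run of n tombstones takes about this many
    # pages (ignoring live rows and partition-tombstone subtleties).
    return max(1, math.ceil(n / query_tombstone_page_limit))
```

Lowering the limit to 1000 in run.py shrinks the number of tombstones a test must create by the same factor, which is where the ten-fold speedup comes from.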
* Update Nixpkgs base
* Clarify some comments
* Get rid of custom-packaged cxxbridge (it's now present in Nixpkgs as
cxx-rs)
* Add missing libraries (libdeflate, libxcrypt)
* Fix expected hash of the gdb patch
* Fix a couple of small build problems
Fixes#12259
Closes#12346
* github.com:scylladb/scylladb:
build: fix Nix devenv
cql3: mark several private fields as maybe_unused
configure.py: link with more abseil libs
* Update Nixpkgs base
* Clarify some comments
* Get rid of custom-packaged cxxbridge (it's now present in Nixpkgs as
cxx-rs)
* Add missing libraries (libdeflate, libxcrypt)
* Fix expected hash of the gdb patch
* Bump Python driver to 3.25.20-scylla
Fixes#12259
Because they are indeed unused -- they are initialized, passed down
through some layers, but not actually used. No idea why only Clang 12
in debug mode in Nix devenv complains about it, though.
Specifically libabsl_strings{,_internal}.a.
This fixes failure to link tests in the Nix devenv; since presumably
all is good in other setups, it must be something weird having to do
with inlining?
The extra linked libraries shouldn't hurt in any case.
Extends mutation_test to run the tests with more than one
compaction group, in addition to a single one (default).
Piggyback on existing tests. Avoids duplication.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Extends database_test to run the tests with more than one
compaction group, in addition to a single one (default).
Piggyback on existing tests. Avoids duplication.
Caught a bug when snapshotting, in implementation of
table::can_flush(), showing its usefulness.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
data_sync_repair_task_impl subscribes to corresponding node_ops_info
abort source and then, when requested, all its descendants are
aborted recursively. Thus, shard_repair_task_impl does not need
to subscribe to the node_ops_info abort source, since the parent
task will take care of aborting once it is requested.
abort_subscription and connected attributes are deleted from
the shard_repair_task_impl.
When node operation is aborted, same should happen with
the corresponding task manager's repair task.
Subscribe data_sync_repair_task_impl abort() to node_ops_info
abort_source.
This new option allows user to control the number of compaction groups
per table per shard. It's 0 by default which implies a single compaction
group, as is today.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This is the initial support for multiple groups.
_x_log2_compaction_groups controls the number of compaction groups
and the partitioning strategy within a single table.
The value in _x_log2_compaction_groups refers to log base 2 of the
actual number of groups.
0 means 1 compaction group.
1 means 2 groups, with the 1 most-significant bit of the token being
used to pick the target group.
The group partitioner should be later abstracted for making tablet
integration easier in the future.
_x_log2_compaction_groups is still a constant but a config option
will come next.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Estimates # of compaction jobs to be performed on a table.
Adaptation is done by adding up the estimates from all groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
With compaction group model, truncate_table_on_all_shards() needs
to stop and disable compaction for all groups.
replica::table::as_table_state() will be removed once no user
remains, as each table may map to multiple groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This will replace table::as_table_state(). The latter will be
killed once its usage drops to zero.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Allow table-wide compaction trigger, as well as fine-grained trigger
like after flushing a memtable on behalf of a single group.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
discard_sstables() runs on context of truncate, which is a table-wide
operation today, and will remain so with multiple static groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
clear() clears memtable content and cache.
Cache is shared by groups, therefore adaptation happens by clearing
only the memtables of all groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
can_flush() is used externally to determine if a table has an active
memtable that can be flushed. Therefore, adaptation happens by
returning true if any of the groups can be flushed. A subsequent
flush request will flush the memtables of all groups that are ready
for it.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
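The any-group adaptation above can be sketched in a few lines (a model of the described behavior; `Group`, `table_can_flush`, and `table_flush` are our illustrative names):

```python
def table_can_flush(groups):
    # The table can flush if any group has a flushable (non-empty)
    # active memtable.
    return any(g.can_flush() for g in groups)

def table_flush(groups):
    # A flush request asks every group to flush; a group whose memtable
    # is empty simply bails out, mirroring seal_active_memtable().
    for g in groups:
        if g.can_flush():
            g.flush()
```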
Adaptation of flush() happens by triggering a flush on the memtables of all
groups.
table::seal_active_memtable() will bail out if the memtable is empty, so
it's not a problem to call flush on a group whose memtable is empty.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This variant will be useful when iterating through groups
and performing async actions on each. It guarantees that all
groups are alive by the time they're reached in the loop.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
set_schema() is used by the database to apply schema changes to
table components which include memtables.
Adaptation happens by setting schema to memtable(s) of all groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Let's add memtables of all compaction groups. Point queries are
optimized by picking a single group.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
add_memtables_to_reader_list() will be adapted to compaction groups.
For point queries, it will add memtables of a single group.
With the callback, add_memtables_to_reader_list() can tell its
caller the exact number of memtable readers to be added, so it
can reserve the reader capacity precisely.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
table::occupancy() provides accumulated occupancy stats from
memtables.
Adaptation happens by accumulating stats from memtables of
all groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
active_memtable() was fine for a single group, but with multiple groups,
there will be one active memtable per group. Let's change the
interface to reflect that.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Useful for iterating through all groups. This is an intermediary
implementation which requires allocation, as only one group
is supported today.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
ea99750de7 ("test: give tests less-unique identifiers") made
the disambiguating ids unambiguous only within a single test
case. This made all tests named "run" have the same name "run.1".
Fix that by adding the suite name everywhere: in test paths, and
in junit test case names.
Fixes#12310.
Closes#12313
`ranges_to_stream` is a map of `std::unordered_multimap<dht::token_range, inet_address>` per keyspace.
On large clusters with a large number of keyspaces, copying it may cause reactor stalls as seen in #12332.
This series eliminates this copy by using std::move and also
turns `stream_ranges` into a coroutine, adding maybe_yield calls to avoid further stalls down the road.
Fixes#12332
Closes#12343
* github.com:scylladb/scylladb:
storage_service: stream_ranges: unshare streamer
storage_service: stream_ranges: maybe_yield
storage_service: coroutinize stream_ranges
storage_service: unbootstrap: move ranges_to_stream_by_keyspace to stream_ranges
With replace using node operations, the HIBERNATE
gossip status is not used anymore.
This change updates documentation to reflect that.
During replace, the replacing node shows in gossipinfo
in STATUS:NORMAL.
Also, the replaced node shows as DN in `nodetool status`
while being replaced, so remove the paragraph saying it's
not listed in `nodetool status`.
Plus, tidy up the text alignment.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Since 2ec1f719de nodes no longer
publish HIBERNATE state so we don't need to support handling it.
Replace is now always done using node operations (using
repair or streaming).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that stream_ranges is a coroutine
streamer can be an automatic variable on the
coroutine stack frame.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Avoid a potentially large memory copy causing
a reactor stall with a large number of keyspaces.
Fixes#12332
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This is to define the API sstable needs from underlying storage. When implementing object-storage backend it will need to implement those. The API looks like
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs) const;
future<> quarantine(const sstable& sst, delayed_commit_changes* delay);
future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay);
void open(sstable& sst, const io_priority_class& pc); // runs in async context
future<> wipe(const sstable& sst) noexcept;
future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity);
It doesn't have "list" or the like, because it's not a method of an individual sstable, but rather one of sstables_manager. It will come as a separate PR.
Closes#12217
* github.com:scylladb/scylladb:
sstable, storage: Mark dir/temp_dir private
sstable: Remove get_dir() (well, almost)
sstable: Add quarantine() method to storage
sstable: Use absolute/relative path marking for snapshot()
sstable: Remove temp_... stuff from sstable
sstable: Move open_component() on storage
sstable: Mark rename_new_sstable_component_file() const
sstable: Print filename(type) on open-component error
sstable: Reorganize new_sstable_component_file()
sstable: Mark filename() private
sstable: Introduce index_filename()
tests: Disclosure private filename() calls
sstable: Move wipe_storage() on storage
sstable: Remove temp dir in wipe_storage()
sstable: Move unlink parts into wipe_storage
sstable: Remove get_temp_dir()
sstable: Move write_toc() to storage
sstable: Shuffle open_sstable()
sstable: Move touch_temp_dir() to storage
sstable: Move move() to storage
sstable: Move create_links() to storage
sstable: Move seal_sstable() to storage
sstable: Tossing internals of seal_sstable()
sstable: Move remove_temp_dir() to storage
sstable: Move create_links_common() to storage
sstable: Move check_create_links_replay() to storage
sstable: Remove one of create_links() overloads
sstable: Remove create_links_and_mark_for_removal()
sstable: Indentation fix after prevuous patch
sstable: Coroutinize create_links_common()
sstable: Rename create_links_common()'s "dir" argument
sstable: Make mark_for_removal bool_class
sstable, table: Add sstable::snapshot() and use in table::take_snapshot
sstable: Move _dir and _temp_dir on filesystem_storage
sstable: Use sync_directory() method
test, sstable: Use component_basename in test
sstables: Move read_{digest|checksum} on sstable
Fix a bug in failure handling and log level.
Closes#12336
* github.com:scylladb/scylladb:
test.py: convert param to str
test.py: fix error level for CQL tests
The type of an operation is related to a specific implementation
of a task. Thus, it should rather be accessed via a virtual
method in tasks::task_manager::task::impl than be
its attribute.
Closes#12326
* github.com:scylladb/scylladb:
api: delete unused type parameter from task_manager_test api
tasks: repair: api: remove type attribute from task_manager::task::status
tasks: add type() method to task_manager::task::impl
repair: add reason attribute to repair_task
After commit a57724e711, off-strategy no longer races with view
building, therefore deletion code can be simplified and piggyback
on mechanism for deleting all sstables atomically, meaning a crash
midway won't result in some of the files coming back to life,
which leads to unnecessary work on restart.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12245
The format_unidiff() function takes str, not pathlib PosixPath, so
convert it to str.
This prevented diff output of unexpected result to be shown in the log
file.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Recently, the pytest script shipped by Fedora started invoking python
with the `-s` flag, which disables python considering user site
packages. This caused problems for our tests which install the cassandra
driver in the user site packages. This was worked around in e5e7780f32
by providing our own pytest interposer launcher script which does not
pass the above mentioned flag to python. Said patch fixed test.py but
not the run.py in cql-pytest. So if the cql-pytest suite is run via
test.py it works fine, but if it is invoked via the run script, it fails
because it cannot find the cassandra driver. This patch patches run.py
to use our own pytest launcher script, so the suite can be run via the
run script as well.
Since run.py is shared with the alternator pytest suite, this patch also
fixes said test suite too.
Closes#12253
The inline-help of operations will only contain a short summary of the
operation and the link to the online documentation.
The move is not a straightforward copy-paste. First and foremost because
we move from simple markdown to RST. Informal references are also
replaced with proper RST links. Some small edits were also done on the
texts.
The intent is the following:
* the inline help serves as a quick reference for what the operation
does and what flags it has;
* the online documentation serves as the full reference manual,
explaining all details;
Provides a link to the architecture/sstable page for more details on the
sstable format itself. It also describes the mutation-fragment stream,
the parts of it that are relevant to the sstable operations.
The purpose of this section is to provide a target for links that want to
point to a common explanation on the topic. In particular, we will soon
move the detailed documentation of the scylla-sstable operations into
this file and we want to have a common explanation of the mutation
fragment stream that these operations can point to.
In both files, the section hierarchy is as follows:
Usage
Syntax
Sections with actual content
This scheme uses up 3 levels of hierarchy, leaving not much room to
expand the sections with actual content with subsections of their own.
Remove the Syntax level altogether, directly embedding the sections with
content under the Usage section.
This PR fixes several bugs related to handling of non-full
clustering keys.
One is in trim_clustering_row_ranges_to(), which is broken for non-full keys in reverse
mode. It will trim the range to position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.
Fixes#12180
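The difference between the two bounds can be illustrated with a simplified ordering model (our own, not Scylla's actual position_in_partition comparator): a position is a (key, weight) pair where weight -1 sorts just before the key's row (weight 0) and +1 just after it.

```python
def before_key(key):
    return (key, -1)

def after_key(key):
    return (key, +1)

def row_position(key):
    return (key, 0)

def row_remains_in_reverse(trim_pos, key):
    # In reverse mode the rows still to be read sort before the trim
    # position, so the row at `key` survives the trim iff its position
    # sorts strictly before trim_pos.
    return row_position(key) < trim_pos
```

Trimming to after_key(k) wrongly keeps k in the remaining range, while trimming to before_key(k) correctly excludes it, which is the bug described above.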
after_key() was creating a position which is after all keys prefixed
by a non-full key, rather than a position which is right after that
key.
This issue will be caught by cql_query_test::test_compact_storage
in debug mode when mutation_partition_v2 merging starts inserting
sentinels at position after_key() on preemption.
It probably already causes problems for such keys, as after_key() is used
in various parts of the read path.
Refs #1446
Closes#12234
* github.com:scylladb/scylladb:
position_in_partition: Make after_key() work with non-full keys
position_in_partition: Introduce before_key(position_in_partition_view)
db: Fix trim_clustering_row_ranges_to() for non-full keys and reverse order
types: Fix comparison of frozen sets with empty values
Now all storage access via sstable happens with the help of storage
class API so its internals can be finally made private.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable::get_dir() is now gone, no callers know that the sstable lives
at any path on a filesystem. There are only a few callers left.
One is several places in code that need sstable datafile, toc and index
paths to print them in logs. The other one is sstable_directory that is
to be patched separately.
For both there's a storage.prefix() method that prepends component name
with where the sstable is "really" located.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Moving an sstable to quarantine has a specific quirk -- if the sstable is in
the staging/ directory, it's moved into the root/quarantine dir anyway, not into
the quarantine subdir of its current location.
Encapsulate this feature in a storage class method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The snapshotting code uses full paths to files to manipulate snapshotted
sstables. Until this code is patched to use some proper snapshotting API
from sstable/ module, it will continue doing so.
However, to remove the get_dir() method from sstable, the
seal_sstable() needs to pass a relative "backup" directory to the
storage::snapshot() method. This patch adds a temporary bool_class for
this distinction.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a bunch of helpers around the XFS-specific temp-dir sitting in the
public sstable part. Drop it altogether; no code needs it for real.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The file path is going to disappear soon, so print the filename() on
error. For now it's the same, but the meaning of the string returned
by filename() is changing to become a "random label for the log
reader".
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The helper consists of three stages:
1. open a file (probably in a temp dir)
2. decorate it with extensions and checked_file
3. optionally rename a file from temp dir
The latter is done to make XFS allocate this file in a separate block
group if the file was created in the temp dir in step 1.
This patch swaps steps 2 and 3 to keep the filesystem-specific opening
steps next to each other.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently the sstable::filename(Index) is used in several places that
get the filename as a printable or throwable string and don't treat it
as a real location of any file.
For those, add the index_filename() helper symmetrical to toc_filename()
and (in some sense) the get_filename() one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable::filename() is going to become a private method. Lots of tests
call it, but tests call a lot of other sstable private methods anyway, and
that's OK. Make sstable::filename() yet another one of that kind in
advance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now when the filesystem cleaning code is sitting in one method, it can
finally be made the storage class one.
Exception-safe allocation of toc_name (spoiler: it's copied anyway one
step later, so it's "not that safe" actually) is moved into storage as
well. The caller is left with toc_filename() call in its exception
handler.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When unlinking an sstable for whatever reason, it's good to check if the
temp dir is hanging around. In some cases it's not (compaction), but
keeping the whole wiping code together makes it easier to move it onto the
storage class in one go.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This method initiates the sstable creation. Effectively it's the first
step in sstable creation transaction implemented on top of rename()
call. Thus this method is moved onto storage under respective name.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When an sstable is prepared to be written to disk, .write_toc() is
called on it, which creates a temporary TOC file. Prior to this, the writer
code calls generate_toc() to collect the components of the sstable.
This patch adds the .open_sstable() API call that does both. This
prepares the write_toc() part to be moved to storage, because it's not
just "write data into the TOC file" -- it's the first step in a transaction
implemented on top of rename()s.
The tests need care -- there's a rewrite_toc_without_scylla_component()
helper in utils that doesn't want the generate_toc() part to be called.
It's not patched here and continues calling .write_toc().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable can be "moved" in two cases -- to move it out of staging or to
move it to quarantine. Both operations are sstable API ones, but the
implementation is storage-specific. This patch makes the latter a method
of the storage class.
One thing to note is that only quarantine() touched the target directory.
Now the move_to_new_dir() happening on load also does it, but
that's harmless.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This method is currently used in two places: sstable::snapshot() and
sstable::seal_sstable(). The latter additionally touches the target
backup/ subdir.
This patch moves the whole thing onto storage and adds the touch for all
the cases. For snapshots this might be excessive, but harmless.
Tests get their private-disclosure way to access sstable._storage in a
few places to call create_links directly.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now the sstable sealing is split into storage part, internal-state part
and the seal-with-backup kick.
This move makes remove_temp_dir() private.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two of them -- one API call and the other one that just
"seals" it. The latter one also changes the _marked_for_deletion bit on
the sstable.
This patch prepares the latter method to be moved onto storage,
because sealing means committing the TOC file on disk with the help of the
rename system call, which is a purely storage thing.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Same as previous patch. This move makes the previously moved
check_create_links_replay() a private method of the storage class.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It needs to get sstable const reference to get the filename(s) from it.
Other than that it's pure filesystem-accessing method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two -- one that accepts a generation and another one that does
not. The latter is only called by the former, so there's no need to keep both.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's only one user of it, so it can document its "and mark for removal"
intention via a dedicated bool_class argument.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Looks much shorter and easier-to-patch this way.
The dst_dir argument is changed from a const reference to a value; the old
code copied it with do_with() anyway.
Indentation is deliberately left broken until next patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The whole method is going to move onto newly introduced
filesystem_storage that already has field of the same name onboard. To
avoid confusion, rename the argument to dst_dir.
No functional changes, _just_ s/dir/dst_dir/g throughout the method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Its meaning is comment-documented anyway. Also, next patches will remove
the create_links_and_mark_for_removal() so callers need some verbose
meaning of this boolean in advance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The replica/ code now "knows" that snapshotting an sstable means
creating a bunch of hard-links on disk. Abstract that via
sstable::snapshot() method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Those two fields define the way sstable is stored as collection of
on-disk files. First step towards making the storage access abstract is
in moving the paths onto filesystem_storage embedded class.
Both are made public for now, the rest of the code is patched to access
them via _storage.<smth>. The rest of the series moves parts of the
sstable:: methods into filesystem_storage, then marks the paths private.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
One case gets full sstable datafile path to get the basename from it.
There's already the basename helper on the class sstable.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
These methods access sstables as files on disk; in order to hide the
"path on filesystem" meaning of sstables::filename(), the whole method
should be made an sstable:: one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now, with a44ca06906, is_normal_token_owner that replaced is_member
does not rely anymore on the pending status
of endpoints in topology.
With that we can get rid of this state and just keep all endpoints we know about in the topology.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12294
* github.com:scylladb/scylladb:
topology: get rid of pending state
topology: debug log update and remove endpoint
Refactor the existing stats tracking and updating
code into struct latency_stats_tracker and, while at it,
count lock_acquisitions only on success.
Decrement operations_currently_waiting_for_lock in the destructor
so it's always balanced with the unconditional increment
in the ctor.
As for updating estimated_waiting_for_lock, it is always
updated in the dtor, on both success and failure, since
the wait for the lock happened, whether it timed out or not.
Fixes#12190
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12225
The schedule_repair() receives a bunch of endpoint:mutations pairs and tries to create handlers for those. When creating the handlers it re-obtains topology from schema->ks->effective_replication_map chain, but this new topology can be outdated as compared to the list of endpoints at hand.
The fix is to carry the e.r.m. pointer used by read executor reconciliation all the way down to repair handlers creation. This requires some manipulations with mutate_internal() and mutate_prepare() argument lists.
fixes: #12050 (it was the same problem)
Closes#12256
* github.com:scylladb/scylladb:
proxy: Carry replication map with repair mutation(s)
proxy: Wrap read repair entries into read_repair_mutation
proxy: Turn ref to forwardable ref in mutations iterator
This fixes a long standing bug related to handling of non-full
clustering keys, issue #1446.
after_key() was creating a position which is after all keys prefixed
by a non-full key, rather than a position which is right after that
key.
This issue will be caught by cql_query_test::test_compact_storage
in debug mode when mutation_partition_v2 merging starts inserting
sentinels at position after_key() on preemption.
It probably already causes problems for such keys.
A complicated function (in continuation style) that benefits
from this simplification.
Closes#12289
* github.com:scylladb/scylladb:
sstables: update_info_for_opened_data: reindent
sstables: update_info_for_opened_data: coroutinize
Sometimes a single modification to a base partition requires updates to
a large number of view rows. A common example is deletion of a base
partition containing many rows. A large BATCH is also possible.
To avoid large allocations, we split the large amount of work into
batch of 100 (max_rows_for_view_updates) rows each. The existing code
assumed an empty result from one of these batches meant that we are
done. But this assumption was incorrect: There are several cases when
a base-table update may not need a view update to be generated (see
can_skip_view_updates()) so if all 100 rows in a batch were skipped,
the view update stopped prematurely. This patch includes two tests
showing when this bug can happen - one test using a partition deletion
with a USING TIMESTAMP causing the deletion to not affect the first
100 rows, and a second test using a specially-crafed large BATCH.
These use cases are fairly esoteric, but in fact hit a user in the
wild, which led to the discovery of this bug.
The fix is fairly simple: To detect when build_some() is done it is no
longer enough to check if it returned zero view-update rows; Rather,
it explicitly returns whether or not it is done as an std::optional.
The patch includes several tests for this bug, which pass on Cassandra,
failed on Scylla before this patch, and pass with this patch.
Fixes#12297.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12305
The abseil and tools/java submodules were accidentally updated in
71bc12eecc
(merged to master in 51f867339e)
This series reverts those changes.
Closes#12311
* github.com:scylladb/scylladb:
Revert accidental update of tools/java submodule
Revert accidental update of abseil submodule
The create_write_response_handler() for read repair needs the e.r.m.
from the caller, because it effectively accepts list of endpoints from
it.
So this patch equips all read_repair_mutation-s with the e.r.m. pointer
so that the handler creation can use it. It's the same for all
mutations, so it's a waste of space, but it's not that bad -- there are
typically only a few mutations in this range, and the entries passed there
are temporary, so even lots of them won't occupy lots of memory for long.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The schedule_repair() operates on a map of endpoint:mutations pairs.
Next patch will need to extend this entry and it's going to be easier if
the entry is wrapped in a helper structure in advance.
This is where the forwardable reference cursor from the previous patch
gets its user. The schedule_repair() produces a range of rvalue
wrappers, but the create_write_response_handler accepting it is OK, it
copies mutations anyway.
The printing operator is added to facilitate mutations logging from
mutate_internal() method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The mutate_prepare() iterates over a range of mutations with an 'auto&'
cursor, thus accepting only lvalues. This is very restrictive; the caller
of mutate_prepare() may as well provide rvalues if the target
create_write_response_handler() or lambda accepts it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This PR implements two things:
* Getting the value of a conjunction of elements separated by `AND` using `expr::evaluate`
* Preparing conjunctions using `prepare_expression`
---
`NULL` is treated as an "unknown value" - maybe `true`, maybe `false`.
`TRUE AND NULL` evaluates to `NULL` because it might be `true` but also might be `false`.
`FALSE AND NULL` evaluates to `FALSE` because no matter what value `NULL` acts as, the result will still be `FALSE`.
Unset and empty values are not allowed.
Usually in CQL the rule is that when `NULL` occurs in an operation the whole expression becomes `NULL`, but here we decided to deviate from this behavior.
Treating `NULL` as an "unknown value" is the standard SQL way of handling `NULL`s in conjunctions.
It works this way in MySQL and Postgres, so we do it this way as well.
The evaluation short-circuits. Once `FALSE` is encountered the function returns `FALSE` immediately without evaluating any further elements.
It works this way in Postgres as well, for example:
`SELECT true AND NULL AND 1/0 = 0` will throw a division by zero error,
but `SELECT false AND 1/0 = 0` will successfully evaluate to `FALSE`.
Closes#12300
* github.com:scylladb/scylladb:
expr_test: add unit tests for prepare_expression(conjunction)
cql3: expr: make it possible to prepare conjunctions
expr_test: add tests for evaluate(conjunction)
cql3: expr: make it possible to evaluate conjunctions
Fixes https://github.com/scylladb/scylladb/issues/11712
Updates added with this PR:
- Added a new section with the description of AzureSnitch (similar to others + examples and language improvements).
- Fixed the headings so that they render properly.
- Replaced "Scylla" with "ScyllaDB".
Closes#12254
* github.com:scylladb/scylladb:
docs: replace Scylla with ScyllaDB on the Snitches page
docs: fix the headings on the Snitches page
doc: add the description of AzureSnitch to the documentation
prepare_expression used to throw an error
when encountering a conjunction.
Now it's possible to use prepare_expression
to prepare an expression that contains
conjunctions.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Previously it was impossible to use expr::evaluate()
to get the value of a conjunction of elements
separated by ANDs.
Now it has been implemented.
NULL is treated as an "unknown value" - maybe true, maybe false.
`TRUE AND NULL` evaluates to NULL because it might be true but also might be false.
`FALSE AND NULL` evaluates to FALSE because no matter what value NULL acts as, the result will still be FALSE.
Unset and empty values are not allowed.
Usually in CQL the rule is that when NULL occurs in an operation the whole expression
becomes NULL, but here we decided to deviate from this behavior.
Treating NULL as an "unknown value" is the standard SQL way of handling NULLs in conjunctions.
It works this way in MySQL and Postgres, so we do it this way as well.
The evaluation short-circuits. Once FALSE is encountered the function returns FALSE
immediately without evaluating any further elements.
It works this way in Postgres as well, for example:
`SELECT true AND NULL AND 1/0 = 0` will throw a division by zero error
but `SELECT false AND 1/0 = 0` will successfully evaluate to FALSE.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The infinitely high time_point of `db_clock::time_point::max()`
used in ba42852b0e
is too high for some clients, which can't represent
it as a date_time string.
Instead, limit it to 9999-12-31T00:00:00+0000,
that is practically sufficient to ensure truncation of
all sstables and should be within the clients' limits.
Fixes#12239
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12273
Add instructions on how to backport a feature to an older version of Scylla.
It contains detailed step-by-step instructions so that people unfamiliar with the intricacies of Scylla's repository organization can easily get the hang of it.
This is the guide I wish I had when I had to do my first backport.
I put it in backport.md because that looks like the file responsible for this sort of information.
For a moment I thought about `CONTRIBUTING.md`, but this is a really short file with general information, so it doesn't really fit there. Maybe in the future there will be some sort of unification (see #12126)
Closes#12138
* github.com:scylladb/scylladb:
dev/docs: add additional git pull to backport docs
docs/dev: add a note about cherry-picking individual commits
docs/dev: use 'is merged into' instead of 'becomes'
docs/dev: mention that new backport instructions are for the contributor
docs/dev: Add backport instructions for contributors
Several snitch drivers make HTTP requests to get
region/dc/zone/rack/whatever from the cloud provider. They blindly rely
on the response being successful and read the response body to parse the
data they need from it.
That's not nice; add checks that the requests finish with HTTP OK statuses.
refs: #12185
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12287
Now, with a44ca06906,
is_normal_token_owner that replaced is_member
does not rely anymore on the pending status
of endpoints in topology.
With that we can get rid of this state and just keep
all endpoints we know about in the topology.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
update_normal_tokens checks that the endpoint is in topology. Currently we call update_topology on this path only if it's not a normal_token_owner, but there are paths where the endpoint could be a normal token owner but still
be pending in topology, so always update it, just in case.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12080
* github.com:scylladb/scylladb:
storage_service: handle_state_normal: always update_topology before update_normal_tokens
storage_service: handle_state_normal: delete outdated comment regarding update pending ranges race
Thanks to #12250, Host IDs uniquely identify nodes. We can use them as Raft IDs which simplifies the code and makes reasoning about it easier, because Host IDs are always guaranteed to be present (while Raft IDs may be missing during upgrade).
Fixes: https://github.com/scylladb/scylladb/issues/12204
Closes#12275
* github.com:scylladb/scylladb:
service/raft: raft_group0: take `raft::server_id` parameter in `remove_from_group0`
gms, service: stop gossiping and storing RAFT_SERVER_ID
Revert "gms/gossiper: fetch RAFT_SERVER_ID during shadow round"
service: use HOST_ID instead of RAFT_SERVER_ID during replace
service/raft: use gossiped HOST_ID instead of RAFT_SERVER_ID to update Raft address map
main: use Host ID as Raft ID
This series improves the add-node-to-cluster document, in particular around the documentation for the associated cleanup procedure, and the prerequisite steps.
It also removes information about outdated releases.
Closes#12210
* github.com:scylladb/scylladb:
docs: operating-scylla: add-node-to-cluster: deleted instructions for unsupported releases
docs: operating-scylla: add-node-to-cluster: cleanup: move tips to a note
docs: operating-scylla: add-node-to-cluster: improve wording of cleanup instructions
docs: operating-scylla: prerequisites: system_auth is a keyspace, not a table
docs: operating-scylla: prerequisites: no Authentication status is gathered
docs: operating-scylla: prerequisites: simplify grep commands
docs: operating-scylla: add-node-to-cluster: prerequisites: number sub-sections
docs: operating-scylla: add-node-to-cluster: describe other nodes in plural
Revert version change made by PR #11106, which increased it to `4.0.0`
to enable server-side describe on latest cqlsh.
Turns out that our tooling some way depends on it (eg. `sstableloader`)
and it breaks dtests.
Reverting only the version allows to leave the describe code unchanged
and it fixes the dtests.
cqlsh 6.0.0 will return a warning when running `DESC ...` commands.
Closes#12272
We no longer need to translate from IP to Raft ID using the address map,
because Raft ID is now equal to the Host ID - which is always available
at the call site of `remove_from_group0`.
It is equal to (if present) HOST_ID and no longer used for anything.
The application state was only gossiped if `experimental-features`
contained `raft`, so we can free this slot.
Similarly, `raft_server_id`s were only persisted in `system.peers` if
the `SUPPORTS_RAFT` cluster feature was enabled, which happened only
when `experimental-features` contained `raft`. The `raft_server_id`
field in the schema was also introduced recently in `master` and didn't
get to be in a release yet. Given either of these reasons, we can remove
this field safely.
Fixes /scylladb/scylla-enterprise/issues#1262
Changes the somewhat ambiguous "none" into "not set" to clarify that "none" is not an
option to be written out, but an absence of a choice (in which case you have also made
a choice).
Closes#12270
The Host ID now uniquely identifies a node (we no longer steal it during
node replace) and Raft is still experimental. We can reuse the Host ID
of a node as its Raft ID. This will allow us to remove and simplify a
lot of code.
With this we can already remove some dead code in this commit.
Script for "one-click" opening of coredumps.
It extracts the build-id from the coredump, retrieves metadata for that
build, downloads the binary package, the source code and finally
launches the dbuild container, with everything ready to load the
coredump.
The script is idempotent: running it after the preparatory steps will
re-use what is already downloaded.
The script is not trying to provide a debugging environment that caters
to all the different ways and preferences of debugging. Instead, it just
sets up a minimalistic environment for debugging, while providing
opportunities for the user to customize it according to their
preferences.
I'm not entirely sure coredumps from the master branch will work, but we
can address this later if we confirm they don't.
Example:
$ ~/ScyllaDB/scylla/worktree0/scripts/open-coredump.sh ./core.scylla.113.bac3650b616f4f09a4d1ab160574b6a5.4349.1669185225000000000000
Build id: 5009658b834aaf68970135bfc84f964b66ea4dee
Matching build is scylla-5.0.5 0.20221009.5a97a1060 release-x86_64
Downloading relocatable package from http://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-5.0/scylla-x86_64-package-5.0.5.0.20221009.5a97a1060.tar.gz
Extracting package scylla-x86_64-package-5.0.5.0.20221009.5a97a1060.tar.gz
Cloning scylla.git
Downloading scylla-gdb.py
Copying scylla-gdb.py from /home/bdenes/ScyllaDB/storage/11961/open-coredump.sh.dir/scylla.repo
Launching dbuild container.
To examine the coredump with gdb:
$ gdb -x scylla-gdb.py -ex 'set directories /src/scylla' --core ./core.scylla.113.bac3650b616f4f09a4d1ab160574b6a5.4349.1669185225000000000000 /opt/scylladb/libexec/scylla
See https://github.com/scylladb/scylladb/blob/master/docs/dev/debugging.md for more information on how to debug scylla.
Good luck!
[root@fedora workdir]#
Closes#12223
We want to always be able to distinguish between
the replacing node and the replacee by using different,
unique, host identifiers.
This will allow us to use the host_id authoritatively
to identify the node (rather than its endpoint IP address)
for token mapping and node operations.
Also, it will be used in the following patch to never allow the
replaced node to rejoin the cluster, as its host_id should never
be reused.
This change does not affect #5523, the replaced node may still steal back its tokens if restarted.
Refs #9839
Refs #12040
Closes#12250
* github.com:scylladb/scylladb:
docs: replace-dead-node: update host_id of replacing node
docs: replace-dead-node: fix alignment
db: system_keyspace: change set_local_host_id to private set_local_random_host_id
storage_service: do not inherit the host_id of a replaced node
The generic task holds and destroys a task::impl,
but we want the derived class's destructor to be called
when the task is destroyed; otherwise, for example,
members like the abort_source subscription will not be destroyed
(and auto-unlinked).
Fixes#12183
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12266
It is moved into the async thread, so the encapsulating
function should be defined mutable to move the func
rather than copying it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12267
This patch includes a translation of two more test files from
Cassandra's CQL unit test directory cql3/validation/operations.
All tests included here pass on Cassandra. Several tests fail on Scylla
and are marked "xfail". These failures discovered two previously-unknown
bugs:
#12243: Setting USING TTL of "null" should be allowed
#12247: Better error reporting for oversized keys during INSERT
And also added reproducers for two previously-known bugs:
#3882: Support "ALTER TABLE DROP COMPACT STORAGE"
#6447: TTL unexpected behavior when setting to 0 on a table with
default_time_to_live
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12248
We recently (commit 6a5d9ff261) started
to use std::source_location instead of std::experimental::source_location.
However, this does not work on clang 14, because libc++ 12's
<source_location> only works if __builtin_source_location is available,
and that is not available on clang 14.
clang 15 is just three months old, and several relatively-recent
distributions still carry clang 14 so it would be nice to support it
as well.
So this patch adds a trivial compatibility header file which, when
included and compiled with clang 14, aliases the functional
std::experimental::source_location to std::source_location.
It turns out it's enough to include the new header file from three
headers that included <source_location> - I guess all other uses
of source_location depend on those header files directly or indirectly.
We may later need to include the compatibility header file in additional
places, but for now we don't.
Refs #12259
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12265
This PR adds server-side `DESCRIBE` statement, which is required in latest cqlsh version.
The only change from the user perspective is the `DESC ...` statement can be used with cqlsh version >= 6.0. Previously the statement was executed from client side, but starting with Cassandra 4.0 and cqlsh 6.0, execution of describe was moved to server side, so the user was unable to do `DESC ...` with Scylla and cqlsh 6.0.
Implemented describe statements:
- `DESC CLUSTER`
- `DESC [FULL] SCHEMA`
- `DESC [ONLY] KEYSPACE`
- `DESC KEYSPACES/TYPES/FUNCTIONS/AGGREGATES/TABLES`
- `DESC TYPE/FUNCTION/AGGREGATE/MATERIALIZED VIEW/INDEX/TABLE`
- `DESC`
[Cassandra's implementation for reference](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java)
Changes in this patch:
- cql3::util: added `single_quote()` function
- added `data_dictionary::keyspace_element` interface
- implemented `data_dictionary::keyspace_element` for:
- keyspace_metadata,
- UDT, UDF, UDA
- schema
- cql3::functions: added `get_user_functions()` and `get_user_aggregates()` to get all UDFs/UDAs in specified keyspace
- data_dictionary::user_types_metadata: added `has_type()` function
- extracted `describe_ring()` from storage_service to standalone helper function in `locator/util.hh`
- storage_proxy: added `describe_ring()` (implemented using helper function mentioned above)
- extended CQL grammar to handle describe statement
- increased version in `version.hh` to 4.0.0, so cqlsh will use server-side describe statement
Referring: https://github.com/scylladb/scylla/issues/9571, https://github.com/scylladb/scylladb/issues/11475
Closes#11106
* github.com:scylladb/scylladb:
version: Increasing version
cql-pytest: Add tests for server-side describe statement
cql-pytest: creating random elements for describe's tests
cql3: Extend CQL grammar with server-side describe statement
cql3:statements: server-side describe statement
data_dictonary: add `get_all_keyspaces()` and `get_user_keyspaces()`
storage_proxy: add `describe_ring()` method
storage_service, locator: extract describe_ring()
data_dictionary:user_types_metadata: add has_type() function
cql3:functions: `get_user_functions()` and `get_user_aggregates()`
implement `keyspace_element` interface
data_dictionary: add `keyspace_element` interface
cql3: single_quote() util function
view: row_lock: lock_ck: reindent
test/topology: enable replace tests
service/raft: report an error when Raft ID can't be found in `raft_group0::remove_from_group0`
service: handle replace correctly with Raft enabled
gms/gossiper: fetch RAFT_SERVER_ID during shadow round
service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
The `current()` version in version.hh has to be increased to at
least 4.0.0, so server-side describe will be used. Otherwise,
cqlsh returns warning that client-side describe is not supported.
Add helper functions to create random elements (keyspaces, tables, types)
to increase the coverage of the describe statement's tests.
This commit also adds a `random_seed` fixture. The fixture should
always be used when using random functions. In case of a test failure, the
seed will be present in the test's signature and the case can be easily
recreated.
After the test finishes, the fixture restores state of `random` to
before-test state.
Starting from cqlsh 6.0.0, execution of the describe statement was moved
from the client to the server.
This patch implements server-side describe statement. It's done by
simply fetching all needed keyspace elements (keyspace/table/index/view/UDT/UDF/UDA)
and generating the desired description or list of names of all elements.
The description of any element has to respect CQL restrictions (like
name quoting) to allow quickly recreating the schema by simply copy-pasting the description.
In order to execute `DESC CLUSTER`, there has to be a way to describe
ring. `storage_service` is not available at query execution. This patch
adds `describe_ring()` as a method of `storage_proxy()` (using helper
function from `locator/util.hh`).
`describe_ring()` was implemented as a method of `storage_service`. This
patch extracts it from there to a standalone helper function in
`locator/util.hh`.
A common interface for all keyspace elements, which are:
keyspace, UDT, UDF, UDA, tables, views, indexes.
The interface provides a unified way to describe those elements.
`single_quote()` takes a string and transforms it to a string
which can be safely used in CQL commands.
Single quoting involves wrapping the name in single-quotes ('). A single-quote
character itself is quoted by doubling it.
Single quoting is necessary for dates, IP addresses or string literals.
Also simplify the code and improve logging in general.
The previous code did this: search for the ID in the address map. If it
couldn't be found, perform a read barrier and search again. If it again
couldn't be found, return.
This algorithm depended on the fact that IP addresses were stored in
group 0 configuration. The read barrier was used to obtain the most
recent configuration, and if the IP was not a part of address map after
the read barrier, that meant it's simply not a member of group 0.
This logic no longer applies so we can simplify the code.
Furthermore, when I was fixing the replace operation with Raft enabled,
at some point I had a "working" solution with all tests passing. But I
was suspicious and checked if the replaced node got removed from
group 0. It wasn't. So the replace finished "successfully", but we had
an additional (voting!) member of group 0 which didn't correspond to
a token ring member.
The last version of my fixes ensure that the node gets removed by the
replacing node. But the system is fragile and nothing prevents us from
breaking this again. At least log an error for now. Regression tests
will be added later.
We must place the Raft ID obtained during the shadow round in the
address map. It won't be placed by the regular gossiping route if we're
replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it
is not guaranteed that background gossiping manages to update the
address map before we need it, especially in tests where we set
ring_delay to 0 and disable wait_for_gossip_to_settle. The shadow round,
on the other hand, performs a synchronous request (and if it fails
during bootstrap, bootstrap will fail - because we also won't be able to
obtain the tokens and Host ID of the replaced node).
Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.
During the replace operation we need the Raft ID of the replaced node.
The shadow round is used for fetching all necessary information before
the replace operation starts.
Most of the sleeps related to gossiping are based on `ring_delay`,
which is configurable and can be set to a lower value, e.g. during tests.
But for some reason there was one case where we slept for a hardcoded
value, `service::load_broadcaster::BROADCAST_INTERVAL` - 60 seconds.
Use `2 * get_ring_delay()` instead. With the default value of
`ring_delay` (30 seconds) this will give the same behavior.
When we have a table with partition key p and an indexed regular column
v, the test included in this patch checks the query
SELECT p FROM table WHERE v = 1 AND TOKEN(p) > 17
This can work and not require ALLOW FILTERING, because the secondary index
posting-list of "v=1" is ordered in p's token order (to allow SELECT with
and without an index to return the same order - this is explained in
issue #7443). So this test should pass, and indeed it does on both current
Scylla, and Cassandra.
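The token-ordered posting list described above can be sketched in Python (toy data, hypothetical names): because the index entries for `v = 1` are kept in token order, a `TOKEN(p) > 17` restriction selects a contiguous suffix of the list and needs no per-row filtering.

```python
import bisect

# Hypothetical posting list for "v = 1": (token(p), p) pairs,
# kept in token order as the text explains.
posting_list = [(-5, "a"), (3, "b"), (17, "c"), (42, "d"), (99, "e")]

def tokens_greater_than(postings, threshold):
    """TOKEN(p) > threshold is a contiguous suffix of the
    token-ordered posting list, so no filtering is required."""
    i = bisect.bisect_right([t for t, _ in postings], threshold)
    return [p for _, p in postings[i:]]

print(tokens_greater_than(posting_list, 17))  # ['d', 'e']
```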
However, it turns out that this was a bug - issue #7043 - in older
versions of Scylla, and only fixed in Scylla 4.6. In older versions,
the SELECT wasn't accepted, claiming it requires ALLOW FILTERING,
and if ALLOW FILTERING was added, the TOKEN(p) > 17 part was silently
ignored.
The fix for issue #7043 actually included regression tests, C++ tests in
test/boost/secondary_index_test.cc. But in this patch we also add a Python
test in test/cql-pytest.
One of the benefits of cql-pytest is that we can (and I did) run the same
test on Cassandra to verify we're not implementing a wrong feature.
Another benefit is that we can run a new test on an old version, and
not even require re-compilation: You can run this new test on any
existing installation of Scylla to check if it still has issue #7043.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12237
The replacing node no longer assumes the host_id
of the replacee. It will continue to use a random,
unique host_id.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that the local host_id is never changed externally
(by the storage_service upon replace-node),
the method can be made private and be used only for initializing the
local host_id to a random one.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We want to always be able to distinguish between
the replacing node and the replacee by using different,
unique, host identifiers.
This will allow us to use the host_id authoritatively
to identify the node (rather then its endpoint ip address)
for token mapping and node operations.
Also, it will be used in the following patch to never allow the
replaced node to rejoin the cluster, as its host_id should never
be reused.
Refs #9839
Refs #12040
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently the header includes storage_proxy.hh and spreads this over the
code via raft_group0_client.hh -> group0_state_machine.hh -> lang.hh
Forward declaring the proxy class eliminates ~100 indirect dependencies on
storage_proxy.hh via this chain.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12241
The latter is a pretty popular test/lib header that disseminates the
former over a whole lot of unit tests. The former, in turn, naturally
includes sstables.hh, thus making tons of unrelated tests depend on the
sstables class they don't use.
However, simple removal doesn't work, because of the local_shard_only bool
class definition in sstable_utils.hh used in simple_schema.hh. This
thing, in turn, is used in key-making helpers that don't belong to
sstable utils, so these are moved into simple_schema as well.
When done, this affects mutation_source_test.hh, which needs the
local_shard_only bool class (and helps spread sstables.hh
throughout more unrelated tests), and a bunch of .cc test sources that
used sstable_utils.hh to indirectly include various headers they
needed.
After patching, sstables.hh touches 2x fewer tests. As a side
effect, sstables_manager.hh is also depended on by 2x fewer
tests.
Continuation of 9bdea110a6
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12240
trim_clustering_row_ranges_to() is broken for non-full keys in reverse
mode. It will trim the range to
position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.
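A toy model of the trim bug, collapsing the full/non-full key distinction to plain integers (hypothetical helpers, not the real `trim_clustering_row_ranges_to`):

```python
# Positions are integers and a clustering range is a half-open
# interval [lo, hi). In reverse mode, after emitting `key`, the
# remaining range must end *before* the key (exclusive), not after it.

def trim_reverse_buggy(rng, key):
    lo, _ = rng
    return (lo, key + 1)   # after_key: the key stays in the range

def trim_reverse_fixed(rng, key):
    lo, _ = rng
    return (lo, key)       # before_key: the key is excluded

rng = (0, 10)
assert 5 in range(*trim_reverse_buggy(rng, 5))      # re-included: the bug
assert 5 not in range(*trim_reverse_fixed(rng, 5))  # excluded: correct
```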
Fixes#12180
Refs #1446
A frozen set can be part of the clustering key, and with compact
storage, the corresponding key component can have an empty value.
Comparison was not prepared for this, the iterator attempts to
deserialize the item count and will fail if the value is empty.
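A sketch of the failure mode and the fix, using an illustrative serialization format (32-bit count plus length-prefixed items), not Scylla's exact wire format:

```python
import struct

def deserialize_frozen_set(buf: bytes):
    """Toy decoder for a frozen set serialized as a 32-bit item
    count followed by 32-bit length-prefixed items."""
    if not buf:            # the fix: an empty value means an empty set
        return set()
    (count,) = struct.unpack_from(">i", buf, 0)
    off, items = 4, set()
    for _ in range(count):
        (length,) = struct.unpack_from(">i", buf, off)
        off += 4
        items.add(buf[off:off + length])
        off += length
    return items

# Without the empty-value check, unpacking the count from b"" raises,
# which is the failure mode described above.
assert deserialize_frozen_set(b"") == set()
```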
Fixes#12242
This pull request introduces support for global secondary indexes based on static columns.
Local secondary indexes based on static columns are not planned to be supported and are explicitly forbidden. Because there is only one static row per partition and local indexes require the full partition key when querying, such indexes wouldn't be very useful and would only waste resources.
The index table for secondary indexes on static columns, unlike other secondary indexes, does not contain clustering keys from the base table. A static column's value determines a set of full partitions, so the clustering keys would only be redundant.
The already existing logic for querying using secondary indexes works after introducing minimal notifications. The view update generation path now works on a common representation of static and clustering rows, and the new representation allowed most of the logic to be kept intact.
New cql-pytests are added. All but one of the existing tests for secondary indexes on static columns - ported from Cassandra - now work and have their `xfail` marks lifted; the remaining test requires support for collection indexing, so it will start working only after #2962 is fixed.
Materialized views with static rows as a key are __not__ implemented in this PR.
Fixes: #2963
Closes#11166
* github.com:scylladb/scylladb:
test_materialized_view: verify that static columns are not allowed
test_secondary_index: add (currently failing) test for static index paging
test_secondary_index: add more tests for secondary indexes on static columns
cassandra_tests: enable existing tests for static columns
create_index_statement: lift restriction on secondary indexes on static rows
db/view: fetch and process static rows when building indexes
gms/feature_service: introduce SECONDARY_INDEXES_ON_STATIC_COLUMNS cluster feature
create_index_statement: disallow creation of local indexes with static columns
select_statement: prepare paging for indexes on static columns
select_statement: do not attempt to fetch clustering columns from secondary index's table
secondary_index_manager: don't add clustering key columns to index table of static column index
replica/table: adjust the view read-before-write to return static rows when needed
db/view: process static rows in view_update_builder::on_results
db/view: adjust existing view update generation path to use clustering_or_static_row
column_computation: adjust to use clustering_or_static_row
db/view: add clustering_or_static_row
deletable_row: add column_kind parameter to is_live
view_info: adjust view_column to accept column_kind
db/view: base_dependent_view_info: split non-pk columns into regular and static
abseil::hash depends on abseil::city and declares CityHash32
as an external symbol. The city static library, however,
precedes hash in the link list, which apparently makes the linker
simply drop it from the object list, since its symbols are not
used elsewhere.
Fix the linker ordering to help linker see that CityHash32
is used.
Closes#12231
Adds a test which verifies that static columns are not allowed in
materialized views. Although we added support for static columns in
secondary indexes, which share a lot of code with materialized views,
static columns in materialized views are not yet ready to use.
Currently, when executing queries accelerated by an index on a static
column, paging is unable to break base table partitions across pages and
is forced to return them in whole. This will cause problems if such a
query must return a very large base table partition because it will have
to be loaded into memory.
Fixing this issue will require a more sophisticated approach than what
was done in the PR. For the time being, an xfailing pytest is added
which should start passing after paging is improved.
This PR adds the link to the KB article about updating the mode after the upgrade to the 5.1 upgrade guide.
In addition, I have:
- updated the KB article to include the versions affected by that change.
- fixed the broken link to the page about metric updates (it is not related to the KB article, but I fixed it in the same PR to limit the number of PRs that need to be backported).
Related: https://github.com/scylladb/scylladb/pull/11122
Closes#12148
* github.com:scylladb/scylladb:
doc: update the releases in the KB about updating the mode after upgrade
doc: fix the broken link in the 5.1 upgrade guide
doc: add the link to the 5.1-related KB article to the 5.1 upgrade guide
The reason is an alloc-dealloc mismatch of position_in_partition objects
allocated by cursors inside the coroutine object stored in the update
variable in row_cache::do_update().
It is allocated under the cache region, but in case of an exception it will
be destroyed under the standard allocator. If the update is successful, it
will be cleared under the region allocator, so there is no problem in the
normal case.
Fixes#12068
Closes#12233
2.3 and 2018.1 ended their life and are long gone.
No need to have instructions for them in the master version of this
document.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
"use `nodetool cleanup` cleanup command" repeats words, change to
"run the `nodetool cleanup` command".
Also, improve the description of the cleanup action
and how it relates to the bootstrapping process.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Fix the phrase referring to it as a table accordingly.
Also, do some minor phrasing touch-ups in this area.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Authentication status isn't gathered from scylla.yaml,
only the authenticator, so change the caption accordingly.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Writing `cat X | grep Y` is both inefficient and somewhat
unprofessional. The grep command works very well on a file argument
so `grep Y X` will do the job perfectly without the need for a pipe.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Typically data will be streamed from multiple existing nodes
to the new node, not from a single one.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We need to obtain the Raft ID of the replaced node during the shadow round and
place it in the address map. It won't be placed by the regular gossiping route
if we're replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it is not
guaranteed that background gossiping manages to update the address map before we
need it, especially in tests where we set ring_delay to 0 and disable
wait_for_gossip_to_settle. The shadow round, on the other hand, performs a
synchronous request (and if it fails during bootstrap, bootstrap will fail -
because we also won't be able to obtain the tokens and Host ID of the replaced
node).
Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.
Also remove an unconditional 60 seconds sleep from the replace code. Make it
dependent on ring_delay.
Enable the replace tests.
Modify some code related to removing servers from group 0 which depended on
storing IP addresses in the group 0 configuration.
Closes#12172
* github.com:scylladb/scylladb:
test/topology: enable replace tests
service/raft: report an error when Raft ID can't be found in `raft_group0::remove_from_group0`
service: handle replace correctly with Raft enabled
gms/gossiper: fetch RAFT_SERVER_ID during shadow round
service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
This change removes sstables.hh from some other headers, replacing it
with version.hh and shared_sstable.hh. It also drops
sstables_manager.hh from some more headers, because this header
propagates sstables.hh itself. The change is pretty straightforward,
but has a ricochet in database.hh, which needs disk-error-handler.hh.
Without the patch, touching sstables/sstable.hh results in recompiling
409 targets; with the patch -- 299 targets.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12222
Tidy up namespaces, move code to the right file, and
move the whole thing to the replica module where it
belongs.
Closes#12219
* github.com:scylladb/scylladb:
dirty_memory_manager: move implementation from database.cc
dirty_memory_manager: move to replica module
test: dirty_memory_manager_test: disambiguate classes named 'test_region_group'
dirty_memory_manager: stop using using namespace
There are two similarly named classes: ::test_region_group and
dirty_memory_manager_logalloc::test_region_group. Rename the
former to ::raii_region_group (that's what it's for) and the
latter to ::test_region_group, to reduce confusion.
Serializes a value that is an instance of a given type. The opposite of `deserialize` (previously known as `print`).
All other actions operate on serialized values, yet until now we were missing a way to go from human-readable values to serialized ones. This prevented, for example, using `scylla types tokenof $pk` if one only had the human-readable key value.
Example:
```
$ scylla types serialize -t Int32Type -- -1286905132
b34b62d4
$ scylla types serialize --prefix-compound -t TimeUUIDType -t Int32Type -- d0081989-6f6b-11ea-0000-0000001c571b 16
0010d00819896f6b11ea00000000001c571b000400000010
$ scylla types serialize --prefix-compound -t TimeUUIDType -t Int32Type -- d0081989-6f6b-11ea-0000-0000001c571b
0010d00819896f6b11ea00000000001c571b
```
Closes#12029
* github.com:scylladb/scylladb:
docs: scylla-types.rst: add mention of per-operation --help
tools/scylla-types: add serialize operation
tools/scylla-types: prepare for action handlers with string arguments
tools/scylla-types: s/print/deserialize/ operation
docs: scylla-types.rst: document tokenof and shardof
docs: scylla-types.rst: fix typo in compare operation description
Update the CODEOWNERS file with some people who joined different parts
of the project, and one person that left.
Note that despite its name, CODEOWNERS does not list "ownership" in any
strict sense of the word - it is more about who is willing and/or
knowledgeable enough to participate in reviewing changes to particular
files or directories. Github uses this file to automatically suggest
who should review a pull request.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12216
The problematic scenario this patch fixes might happen due to
unfortunate serialization of locks/unlocks between lock_pk and lock_ck,
as follows:
1. lock_pk acquires an exclusive lock on the partition.
2.a lock_ck attempts to acquire shared lock on the partition
and any lock on the row. both cases currently use a fiber
returning a future<rwlock::holder>.
2.b since the partition is locked, the lock_partition times out
returning an exceptional future. lock_row has no such problem
and succeeds, returning a future holding a rwlock::holder,
pointing to the row lock.
3.a the lock_holder previously returned by lock_pk is destroyed,
calling `row_locker::unlock`
3.b row_locker::unlock sees that the partition is not locked
and erases it, including the row locks it contains.
4.a when_all_succeeds continuation in lock_ck runs. Since
the lock_partition future failed, it destroys both futures.
4.b the lock_row future is destroyed with the rwlock::holder value.
4.c ~holder attempts to return the semaphore units to the row rwlock,
but the latter was already destroyed in 3.b above.
Acquiring the partition lock and row lock in parallel
doesn't help anything, but it complicates error handling
as seen above.
This patch serializes acquiring the row lock in lock_ck
after locking the partition to prevent the above race.
This way, erasing the unlocked partition is never expected
to happen while any of its row locks is held.
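The serialized acquisition can be sketched with asyncio, modelling both the partition rwlock and the row lock as plain locks (hypothetical `lock_ck` helper, not the real row_locker code):

```python
import asyncio

async def lock_ck(partition_lock: asyncio.Lock, row_lock: asyncio.Lock,
                  timeout: float):
    """Take the row lock only after the partition lock is held.
    If acquiring the partition lock times out, the row lock was never
    touched, so no holder can outlive an erased row-lock entry."""
    await asyncio.wait_for(partition_lock.acquire(), timeout)
    try:
        await row_lock.acquire()  # reached only with the partition held
    except BaseException:
        partition_lock.release()
        raise
    return partition_lock, row_lock

async def demo():
    p, r = asyncio.Lock(), asyncio.Lock()
    await lock_ck(p, r, timeout=1.0)
    assert p.locked() and r.locked()

asyncio.run(demo())
```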
Fixes#12168
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12208
The diagnostics dumped by the reader concurrency semaphore are a pretty
common sight in logs as soon as a node becomes problematic. The reason
is that the reader concurrency semaphore acts as the canary in the coal
mine: it is the first to start screaming when the node or workload is
unhealthy. This patch adds documentation of the content of the
diagnostics and how to diagnose common problems based on it.
Fixes: #10471
Closes#11970
Takes human readable values and converts them to serialized hex encoded
format. Only regular atomic types are supported for now, no
collection/UDT/tuple support, not even in frozen form.
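For the Int32Type case, the encoding is just the big-endian two's-complement bytes, hex-encoded; a one-line Python sketch (hypothetical helper name) reproduces the example output shown earlier:

```python
import struct

def serialize_int32(value: int) -> str:
    """Int32Type: 32-bit big-endian two's complement, hex-encoded."""
    return struct.pack(">i", value).hex()

print(serialize_int32(-1286905132))  # b34b62d4, as in the example
```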
Currently all action handlers have bytes arguments, parsed from
hexadecimal string representations. We plan on adding a serialize
command which will require raw string arguments. Prepare the
infrastructure for supporting both types of action handlers.
Soon we will have a serialize operation. Rename the current print
operation to deserialize in preparation to that. We want the two
operations (serialize and deserialize) to reflect their relation in
their names too.
Secondary indexes on static columns should work now. This commit lifts
the existing restriction after the cluster is fully upgraded to a
version which supports such indexes.
This commit modifies the view builder and its consumer so that static
rows are always fetched and properly processed during view build.
Currently, the view builder will always fetch both static and clustering
rows, regardless of the type of indexes being built. For indexes on
static columns this is wasteful and could be improved so that only the
types of rows relevant to indexes being built are fetched - however,
doing this sounds a bit complicated and I would rather start with
something simpler which has a better chance of working.
Local indexes on static columns don't make sense because there is only
one static row per partition. It's always better to just run SELECT
DISTINCT on the base table. Allowing for such an index would only make
such queries slower (due to double lookup), would take unnecessary space
and could pose potential consistency problems, so this commit explicitly
forbids them.
When performing a query on a table which is accelerated by a secondary
index, the paging state returned along with the query contains a
partition key and a clustering key of the secondary index table. The
logic wasn't prepared to handle the case of secondary indexes on static
columns - notably, it tried to put base table's clustering key columns
into the paging state which caused problems in other places.
This commit fixes the paging logic so that the PK and CK of a secondary
index table are calculated correctly. However, this solution has a major
drawback: because it is impossible to encode clustering key of the base
table in the paging state, partitions returned by queries accelerated by
secondary indexes on static columns will _not_ be split by paging. This
can be problematic in case there are large partitions in the base table.
The main advantage of this fix is that it is simple. Moreover, the
problem described above is not unique to static column indexes, but also
happens e.g. in case of some indexes on clustering columns (see case 2
of scylladb/scylla#7432). Fixing this issue will require a more
sophisticated solution and may affect more than only secondary indexes
on static columns, so this is left for a followup.
The previous commit made sure that the index table for secondary indexes
on static columns doesn't have columns corresponding to clustering rows in
the base table - therefore, we must make sure that we don't try to fetch
them when querying the index table.
The implementation of secondary indexes on static columns relies on the
fact that the index table only includes partition key columns of the
base table, but not clustering key columns. A static column's value
determines a set of full partitions, so including the clustering key
would only be redundant. It would also generate more work as a single
static column update would require a large portion of the index to be
updated.
This commit makes sure that clustering columns are not included in the
index table for indexes based on a static column.
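The resulting key layout can be sketched with a toy Python model (hypothetical names, not the schema-building code):

```python
# An index entry for a static-column index carries the indexed value
# and the base partition key only -- no base clustering key, since a
# static value identifies whole partitions.

def index_entries_static(base_rows):
    """base_rows: iterable of (pk, ck, static_value) tuples."""
    return {(sv, pk) for pk, ck, sv in base_rows}

rows = [("p1", 1, "x"), ("p1", 2, "x"), ("p2", 1, "y")]
# One entry per partition, no matter how many clustering rows it has,
# so a static column update touches one index entry per partition.
assert index_entries_static(rows) == {("x", "p1"), ("y", "p2")}
```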
Adjusts the read-before-write query issued in
`table::do_push_view_replica_updates` so that, when needed, requests
static columns and makes sure that the static row is present.
The `view_update_builder::on_results()` function is changed to react to
static rows when comparing read-before-write results with the base table
mutation.
Adjusts the column_computation interface so that it is able to accept
both clustering and static rows through the common
db::view::clustering_or_static_row interface.
Adds a `clustering_or_static_row`, which is a common, immutable
representation of either a static or clustering row. It will allow
view update generation based on static or clustering rows to be handled
in a uniform way.
While deletable_row is used to hold regular columns of a clustering row,
neither its name nor its implementation suggests that this is a requirement. In
fact, some of its methods already take a column_kind parameter which is
used to interpret the kind of columns held in the row.
This commit removes the assumption about the column kind from the
`deletable_row::is_live` method.
The `view_info::view_column()` and `view_column` in view.cc allow getting
a view's column definition which corresponds to a given base table
column. They currently assume that the given column id corresponds to a
regular column. In preparation for secondary indexes based on static
columns, this commit adjusts those functions so that they accept other
kinds of columns, including static columns.
Currently, `base_dependent_view_info::_base_non_pk_columns_in_view_pk`
field keeps a list of non-primary-key columns from the base table which
are a part of the view's primary key. Because the current code does not
allow indexes on static columns yet, the columns kept in the
aforementioned field are always assumed to be regular columns of the
base table and are kept as `column_id`s which do not contain information
about the column kind.
This commit splits the `_base_non_pk_columns_in_view_pk` field into two,
one for regular columns and the other for static columns, so that it is
possible to keep both kinds of columns in `base_dependent_view_info` and
the structure can be used for secondary indexes on static columns.
Three lambdas were removed, simplifying the code.
Closes#12207
* github.com:scylladb/scylladb:
compaction_manager: reindent postponed_compactions_reevaluation()
compaction_manager: coroutinize postponed_compactions_reevaluation()
compaction_manager: make postponed_compactions_reevaluation() return a future
The first patch in this small series fixes a hang during shutdown when the expired-item scanning thread can hang in a retry loop instead of quitting. These hangs were seen in some test runs (issue #12145).
The second patch is a failsafe against additional bugs like those solved by the first patch: if any bug causes the same page fetch to repeatedly time out, stop the attempts after 10 retries instead of retrying forever. When we stop the retries, a warning will be printed to the log, and Scylla will wait until the next scan period and start a new scan from scratch - from a random position in the database - instead of hanging potentially forever waiting for the same page.
Closes#12152
* github.com:scylladb/scylladb:
alternator ttl: in scanning thread, don't retry the same page too many times
alternator: fix hang during shutdown of expiration-scanning thread
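The capped-retry failsafe can be sketched as follows (hypothetical names, not the Alternator code):

```python
import logging

MAX_PAGE_RETRIES = 10  # the failsafe described above

def scan(fetch_page, start_pos):
    """If one page times out MAX_PAGE_RETRIES times, warn and abandon
    this scan instead of retrying forever; the next periodic scan then
    restarts from scratch."""
    pages, pos = [], start_pos
    while pos is not None:
        for _ in range(MAX_PAGE_RETRIES):
            try:
                page, pos = fetch_page(pos)
                break
            except TimeoutError:
                continue
        else:  # all retries timed out
            logging.warning("page at %r timed out %d times; abandoning scan",
                            pos, MAX_PAGE_RETRIES)
            return None  # caller starts the next scan later
        pages.append(page)
    return pages
```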
As it has a do_with(), coroutinizing it is an automatic win.
Closes#12195
* github.com:scylladb/scylladb:
cql3: batch_statement: reindent get_mutations()
cql3: batch_statement: coroutinize get_mutations()
postponed_compactions_reevaluation() runs until compaction_manager is
stopped, checking if it needs to launch new compactions.
Make it return a future instead of stashing its completion somewhere.
This makes it easier to convert it to a coroutine.
* abseil 7f3c0d78...4e5ff155 (125):
> Add a compilation test for recursive hash map types
> Add AbslStringify support for enum types in Substitute.
> Use a c++14-style constexpr initialization if c++14 constexpr is available.
> Move the vtable into a function to delay instantiation until the function is called. When the variable is a global the compiler is allowed to instantiate it more aggresively and it might happen before the types involved are complete. When it is inside a function the compiler can't instantiate it until after the functions are called.
> Cosmetic reformatting in a test.
> Reorder base64 unescape methods to be below the escaping methods.
> Fixes many compilation issues that come from having no external CI coverage of the accelerated CRC implementation and some differences bewteen the internal and external implementation.
> Remove static initializer from mutex.h.
> Import of CCTZ from GitHub.
> Remove unused iostream include from crc32c.h
> Fix MSVC builds that reject C-style arrays of size 0
> Remove deprecated use of absl::ToCrc32c()
> CRC: Make crc32c_t as a class for explicit control of operators
> Convert the full parser into constexpr now that Abseil requires C++14, and use this parser for the static checker. This fixes some outstanding bugs where the static checker differed from the dynamic one. Also, fix `%v` to be accepted with POSIX syntax.
> Write (more) directly into the structured buffer from StringifySink, including for (size_t, char) overload.
> Avoid using the non-portable type __m128i_u.
> Reduce flat_hash_{set,map} generated code size.
> Use ABSL_HAVE_BUILTIN to fix -Wundef __has_builtin warning
> Add a TODO for the deprecation of absl::aligned_storage_t
> TSAN: Remove report_atomic_races=0 from CI now that it has been fixed
> absl: fix Mutex TSan annotations
> CMake: Remove trailing commas in `AbseilDll.cmake`
> Fix AMD cpu detection.
> CRC: Get CPU detection and hardware acceleration working on MSVC x86(_64)
> Removing trailing period that can confuse a url in str_format.h.
> Refactor btree iterator generation code into a base class rather than using ifdefs inside btree_iterator.
> container.h: fix incorrect comments about the location of <numeric> algorithms.
> Zero encoded_remaining when a string field doesn't fit, so that we don't leave partial data in the buffer (all decoders should ignore it anyway) and to be sure that we don't try to put any subsequent operands in either (there shouldn't be enough space).
> Improve error messages when comparing btree iterators when generations are enabled.
> Document the WebSafe* and *WithPadding variants more concisely, as deltas from Base64Encode.
> Drop outdated comment about LogEntry copyability.
> Release structured logging.
> Minor formatting changes in preparation for structured logging...
> Update absl::make_unique to reflect the C++14 minimum
> Update Condition to allocate 24 bytes for MSVC platform pointers to methods.
> Add missing include
> Refactor "RAW: " prefix formatting into FormatLogPrefix.
> Minor formatting changes due to internal refactoring
> Fix typos
> Add a new API for `extract_and_get_next()` in b-tree that returns both the extracted node and an iterator to the next element in the container.
> Use AnyInvocable in internal thread_pool
> Remove absl/time/internal/zoneinfo.inc. It was used to guarantee availability of a few timezones for "time_test" and "time_benchmark", but (file-based) zoneinfo is now secured via existing Bazel data/env attributes, or new CMake environment settings.
> Updated documentation on use of %v Also updated documentation around FormatSink and PutPaddedString
> Use the correct Bazel copts in crc targets
> Run the //absl/time timezone tests with a data dependency on, and a matching ${TZDIR} setting for, //absl/time/internal/cctz:zoneinfo.
> Stop unnecessary clearing of fields in ~raw_hash_set.
> Fix throw_delegate_test when using libc++ with shared libraries
> CRC: Ensure SupportsArmCRC32PMULL() is defined
> Improve error messages when comparing btree iterators.
> Refactor the throw_delegate test into separate test cases
> Replace std::atomic_flag with std::atomic<bool> to avoid the C++20 deprecation of ATOMIC_FLAG_INIT.
> Add support for enum types with AbslStringify
> Release the CRC library
> Improve error messages when comparing swisstable iterators.
> Auto increase inlined capacity whenever it does not affect class' size.
> drop an unused dep
> Factor out the internal helper AppendTruncated, which is used and redefined in a couple places, plus several more that have yet to be released.
> Fix some invalid iterator bugs in btree_test.cc for multi{set,map} emplace{_hint} tests.
> Force a conservative allocation for pointers to methods in Condition objects.
> Fix a few lint findings in flags' usage.cc
> Narrow some _MSC_VER checks to not catch clang-cl.
> Small cleanups in logging test helpers
> Import of CCTZ from GitHub.
> Merge pull request abseil/abseil-cpp#1287 from GOGOYAO:patch-1
> Merge pull request abseil/abseil-cpp#1307 from KindDragon:patch-1
> Stop disabling some test warnings that have been fixed
> Support logging of user-defined types that implement `AbslStringify()`
> Eliminate span_internal::Min in favor of std::min, since Min conflicts with a macro in a third-party library.
> Fix -Wimplicit-int-conversion.
> Improve error messages when dereferencing invalid swisstable iterators.
> Cord: Avoid leaking a node if SetExpectedChecksum() is called on an empty cord twice in a row.
> Add a warning about extract invalidating iterators (not just the iterator of the element being extracted).
> CMake: installed artifacts reflect the compiled ABI
> Import of CCTZ from GitHub.
> Import of CCTZ from GitHub.
> Support empty Cords with an expected checksum
> Move internal details from one source file to another more appropriate source file.
> Removes `PutPaddedString()` function
> Return uint8_t from CappedDamerauLevenshteinDistance.
> Remove the unknown CMAKE_SYSTEM_PROCESSOR warning when configuring ABSL_RANDOM_RANDEN_COPTS
> Enforce Visual Studio 2017 (MSVC++ 15.0) minumum
> `absl::InlinedVector::swap` supports non-assignable types.
> Improve b-tree error messages when dereferencing invalid iterators.
> Mutex: Fix stall on single-core systems
> Document Base64Unescape() padding
> Fix sign conversion warnings in memory_test.cc.
> Fix a sign conversion warning.
> Fix a truncation warning on Windows 64-bit.
> Use btree iterator subtraction instead of std::distance in erase_range() and count().
> Eliminate use of internal interfaces and make the test portable and expose it to OSS.
> Fix various warnings for _WIN32.
> Disables StderrKnobsDefault due to order dependency
> Implement btree_iterator::operator-, which is faster than std::distance for btree iterators.
> Merge pull request abseil/abseil-cpp#1298 from rpjohnst:mingw-cmake-build
> Implement function to calculate Damerau-Levenshtein distance between two strings.
> Change per_thread_sem_test from size medium to size large.
> Support stringification of user-defined types in AbslStringify in absl::Substitute.
> Fix "unsafe narrowing" warnings in absl, 12/12.
> Revert change to internal 'Rep', this causes issues for gdb
> Reorganize InlineData into an inner Rep structure.
> Remove internal `VLOG_xxx` macros
> Import of CCTZ from GitHub.
> `absl::InlinedVector` supports move assignment with non-assignable types.
> Change Cord internal layout, which reduces store-load penalties on ARM
> Detects accidental multiple invocations of AnyInvocable<R(...)&&>::operator()&& by producing an error in debug mode, and clarifies that the behavior is undefined in the general case.
> Fix a bug in StrFormat. This issue would have been caught by any compile-time checking but can happen for incorrect formats parsed via ParsedFormat::New. Specifically, if a user were to add length modifiers with 'v', for example the incorrect format string "%hv", the ParsedFormat would incorrectly be allowed.
> Adds documentation for stringification extension
> CMake: Remove check_target calls which can be problematic in case of dependency cycle
> Changes mutex unlock profiling
> Add static_cast<void*> to the sources for trivial relocations to avoid spurious -Wdynamic-class-memaccess errors in the presence of other compilation errors.
> Configure ABSL_CACHE_ALIGNED for clang-like and MSVC toolchains.
> Fix "unsafe narrowing" warnings in absl, 11/n.
> Eliminate use of internal interfaces
> Merge pull request abseil/abseil-cpp#1289 from keith:ks/fix-more-clang-deprecated-builtins
> Merge pull request abseil/abseil-cpp#1285 from jun-sheaf:patch-1
> Delete LogEntry's copy ctor and assignment operator.
> Make sinks provided to `AbslStringify()` usable with `absl::Format()`.
> Cast unused variable to void
> No changes in OSS.
> No changes in OSS
> Replace the kPower10ExponentTable array with a formula.
> CMake: Mark absl::cord_test_helpers and absl::spy_hash_state PUBLIC
> Use trivial relocation for transfers in swisstable and b-tree.
> Merge pull request abseil/abseil-cpp#1284 from t0ny-peng:chore/remove-unused-class-in-variant
> Removes the legacy spellings of the thread annotation macros/functions by default.
Closes#12201
The cql server uses an execution stage to process and execute queries.
However, an execution stage is best utilized by a recurrent flow that
calls it repeatedly, since that makes better use of the instruction
cache.
Up until now, every request was sent through the processing stage, but
most requests are not meant to be executed repeatedly at high volume.
This change processes and executes the data queries asynchronously,
through an execution stage, while all of the rest are processed one by
one, only continuing once the request has been handled end to end.
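The dispatch described above can be sketched as follows. This is an illustrative Python sketch, not the actual ScyllaDB code; the `Request` type, the opcode set, and the callable names are assumptions.

```python
from collections import namedtuple

# Hypothetical request type; the real one lives in the cql transport layer.
Request = namedtuple("Request", ["opcode", "payload"])

# Opcodes treated as recurrent data queries (illustrative set).
DATA_QUERY_OPCODES = {"QUERY", "EXECUTE", "BATCH"}

def handle_request(request, query_stage, process_inline):
    """Send data queries through the execution stage; process everything
    else one by one, end to end."""
    if request.opcode in DATA_QUERY_OPCODES:
        return query_stage(request)
    return process_inline(request)
```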
Tests:
Unit tests in dev and debug.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Closes#12202
This PR hits two goals for "object storage" effort
1. The sstables loader "knows" that sstable components are stored in a Linux directory and uses utils/lister to access it. This is not going to work with sstables over object storage; the loader should be abstracted from the underlying storage.
2. Currently class keyspace and class column_family carry "datadir" and "all_datadirs" on board, which are paths on the local filesystem where sstable files are stored (usually starting with /var/lib/scylla/data). The paths include subdirs like "snapshots", "staging", etc. This is not going to look nice for object storage; the /var/lib/ prefix is excessive and meaningless in this case. Instead, ks and cf should know their "location", and some other component should know the directory in which the files are stored.
That said, this PR prepares distributed_loader and sstable_directory to stop using Linux paths explicitly by making both call sstables_manager to list and open sstable objects. After that it will be possible to teach the manager to list sstables from object storage. This also opens the way to removing paths from the keyspace and column_family classes and replacing them with relative "location"s.
Closes#12128
* github.com:scylladb/scylladb:
sstable_directory: Get components lister from manager
sstable_directory: Extract directory lister
sstable_directory: Remove sstable creation callback
sstable_directory: Call manager to make sstables
sstable_directory: Keep error handler generator
sstable_directory: Keep schema_ptr
sstable_directory: Use directory semaphore from manager
sstable_directory: Keep reference on manager
tests: Use sstables creation helper in some cases
sstables_manager: Keep directory semaphore reference
sstables, code: Wrap directory semaphore with concurrency
Currently, if a node that is outside of the config tries to add an entry
or modify the config, a transient error is returned and this causes the
node to retry. But the error is not transient. If a node tries to do one
of the operations above, it means it was part of the cluster at some
point; and since a node with the same id should not be added back to a
cluster, if it is not in the cluster now it never will be.
Return a new error, not_a_member, to the caller instead.
Message-Id: <Y42mTOx8bNNrHqpd@scylladb.com>
We currently try to detect a replaced node, so as to insert it into
endpoints_to_remove when it has no owned tokens left.
However, for each token we first generate a multimap using
get_endpoint_to_token_map_for_reading().
There are 2 problems with that:
1. unless the replaced node owns a single token, this map will not
be empty after erasing one token out of it, since the
token metadata has not changed yet (this is done later with
update_normal_tokens(owned_tokens, endpoint)).
2. generating this map for each token is inefficient, making this
algorithm's complexity quadratic in the number of tokens.
This change copies the current token_to_endpoint map
to a temporary map and erases the replaced tokens from it,
while maintaining a set of candidates_for_removal.
After traversing all replaced tokens, we check the
`token_to_endpoint_map` again, erasing from `candidates_for_removal`
any endpoint that still owns tokens.
The leftover candidates are endpoints that own no tokens,
so they are added to `hosts_to_remove`.
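The algorithm above can be sketched as follows. A minimal Python sketch, assuming a plain token-to-endpoint mapping; the function name is illustrative, not the actual ScyllaDB API.

```python
def endpoints_to_remove(token_to_endpoint, replaced_tokens):
    """Copy the map once, erase the replaced tokens while collecting
    candidates, then drop any candidate that still owns a token."""
    remaining = dict(token_to_endpoint)  # one copy, not one per token
    candidates_for_removal = set()
    for token in replaced_tokens:
        endpoint = remaining.pop(token, None)
        if endpoint is not None:
            candidates_for_removal.add(endpoint)
    # Any endpoint still present in the remainder still owns tokens.
    for endpoint in remaining.values():
        candidates_for_removal.discard(endpoint)
    return candidates_for_removal  # these go to hosts_to_remove
```

Because the map is copied exactly once and each token is erased once, the pass is linear in the number of tokens instead of quadratic.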
Fixes#12082
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#12141
Also simplify the code and improve logging in general.
The previous code did this: search for the ID in the address map. If it
couldn't be found, perform a read barrier and search again. If it again
couldn't be found, return.
This algorithm depended on the fact that IP addresses were stored in
group 0 configuration. The read barrier was used to obtain the most
recent configuration, and if the IP was not a part of address map after
the read barrier, that meant it's simply not a member of group 0.
This logic no longer applies so we can simplify the code.
Furthermore, when I was fixing the replace operation with Raft enabled,
at some point I had a "working" solution with all tests passing. But I
was suspicious and checked if the replaced node got removed from
group 0. It wasn't. So the replace finished "successfully", but we had
an additional (voting!) member of group 0 which didn't correspond to
a token ring member.
The last version of my fixes ensures that the node gets removed by the
replacing node. But the system is fragile and nothing prevents us from
breaking this again. At least log an error for now. Regression tests
will be added later.
We must place the Raft ID obtained during the shadow round in the
address map. It won't be placed by the regular gossiping route if we're
replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it
is not guaranteed that background gossiping manages to update the
address map before we need it, especially in tests where we set
ring_delay to 0 and disable wait_for_gossip_to_settle. The shadow round,
on the other hand, performs a synchronous request (and if it fails
during bootstrap, bootstrap will fail - because we also won't be able to
obtain the tokens and Host ID of the replaced node).
Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.
During the replace operation we need the Raft ID of the replaced node.
The shadow round is used for fetching all necessary information before
the replace operation starts.
Most of the sleeps related to gossiping are based on `ring_delay`,
which is configurable and can be set to a lower value, e.g. during tests.
But for some reason there was one case where we slept for a hardcoded
value, `service::load_broadcaster::BROADCAST_INTERVAL` - 60 seconds.
Use `2 * get_ring_delay()` instead. With the default value of
`ring_delay` (30 seconds) this will give the same behavior.
For now this is almost a no-op because manager just calls
sstables_directory code back to create the lister.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently the utils/lister.cc code is used to list regular files in a
directory. This patch wraps the lister into a more abstract components
lister class.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now the directory code has everything it needs to create sstable objects
and can stop using the external lambda.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Yet another continuation of the previous patch -- the IO error handlers
generator is also needed to create sstables.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Continuation of the one-before-previous patch. In order to create an
sstable without an external lambda, the directory code needs the schema.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
After the previous patch the sstable_directory code no longer requires a
semaphore argument, because it can get one from the manager. This makes
the directory API shorter and simpler.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable_directory accesses /var/lib/scylla/data in two ways -- it
lists files in it and it opens sstables. The latter is abstracted with
the help of lambdas passed around, but the former (listing) is done by
using directory listers from utils.
Listing sstable components with a directory lister won't work for object
storage; the directory code will need to call some abstraction layer
instead. Opening sstables with the help of a lambda is a bit of
overkill; having the sstables manager at hand could make it much simpler.
That said, this patch makes sstable_directory reference sstables_manager
on start.
This change will also simplify directory semaphore usage (next patch).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Several test cases push an sstables creation lambda into the
with_sstables_directory helper. There's a ready-to-use helper class that
does the same. The next patch will make additional use of that.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently this is a sharded<semaphore> started/stopped in main and
referenced by database in order to be fed into the sstables code. This
semaphore always comes with the "concurrency" parameter that limits the
parallel_for_each parallelism.
This patch wraps both together into a directory_semaphore class. This
makes its usage simpler and will allow extending it in the future.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When repair master and followers have different shard count, the repair
followers need to create multi-shard readers. Each multi-shard reader
will create one local reader on each shard, N (smp::count) local readers
in total.
There is a hard limit on the number of readers that can work in
parallel. When there are more readers than this limit, the readers start
to evict each other, causing buffers already read from disk to be
dropped and readers to be recreated, which is not very efficient.
To optimize and reduce the reader eviction overhead, a global reader
permit is introduced which accounts for the multi-shard reader bloat.
With this patch, at any point in time, the number of readers created by
repair will not exceed the reader limit.
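The accounting idea can be sketched as a counting permit. This is an illustrative Python sketch, not the actual ScyllaDB implementation; the class and method names are assumptions.

```python
class GlobalReaderPermit:
    """A multi-shard reader on a follower with N (smp::count) shards
    consumes N units, so the total number of live local readers across
    all repairs never exceeds the limit."""
    def __init__(self, limit):
        self._limit = limit
        self._used = 0

    def try_acquire(self, units):
        if self._used + units > self._limit:
            return False  # caller waits instead of triggering eviction
        self._used += units
        return True

    def release(self, units):
        assert units <= self._used
        self._used -= units
```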
Test Results:
1) with stream sem 10, repair global sem 10, 5 ranges in parallel, n1=2
shards, n2=8 shards, memory wanted =1
1.1)
[asias@hjpc2 mycluster]$ time nodetool -p 7200 repair ks2 (repair on n2)
[2022-11-23 17:45:24,770] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:45:53,869] Repair session 1
[2022-11-23 17:45:53,869] Repair session 1 finished
real 0m30.212s
user 0m1.680s
sys 0m0.222s
1.2)
[asias@hjpc2 mycluster]$ time nodetool repair ks2 (repair on n1)
[2022-11-23 17:46:07,507] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:46:30,608] Repair session 1
[2022-11-23 17:46:30,608] Repair session 1 finished
real 0m24.241s
user 0m1.731s
sys 0m0.213s
2) with stream sem 10, repair global sem no_limit, 5 ranges in
parallel, n1=2 shards, n2=8 shards, memory wanted =1
2.1)
[asias@hjpc2 mycluster]$ time nodetool -p 7200 repair ks2 (repair on n2)
[2022-11-23 17:49:49,301] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:52:01,414] Repair session 1
[2022-11-23 17:52:01,415] Repair session 1 finished
real 2m13.227s
user 0m1.752s
sys 0m0.218s
2.2)
[asias@hjpc2 mycluster]$ time nodetool repair ks2 (repair on n1)
[2022-11-23 17:52:19,280] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:52:42,387] Repair session 1
[2022-11-23 17:52:42,387] Repair session 1 finished
real 0m24.196s
user 0m1.689s
sys 0m0.184s
Comparing 1.1) and 2.1) shows that eviction played a major role here.
The patch gives a 73s / 30s = 2.5X speed-up in this setup.
Comparing 1.1) and 1.2) shows that even if we limit the readers, starting
on the node with fewer shards is faster, 30s / 24s = 1.25X (the total
number of multishard readers is lower).
Fixes#12157
Closes#12158
Split the simple (and common) case from the complex case,
and coroutinize the latter. Hopefully this generates better
code for the simple case, and it makes the complex case a
little nicer.
Closes#12194
* github.com:scylladb/scylladb:
cql3: select_statement: reindent process_results_complex()
cql3: select_statement: coroutinize process_results_complex()
cql3: select_statement: split process_results() into fast path and complex path
run_snapshot_list_operation() takes a continuation, so passing it
a lambda coroutine without protection is dangerous.
Protect the coroutine with coroutine::lambda so it doesn't lose its
contents.
Fixes#12192.
Closes#12193
Not a huge gain, since it's just a do_with, but still a little better.
Note the inner lambda is not a coroutine, so it isn't susceptible to
the lambda coroutine fiasco.
One of the prerequisites to make sstables reside on object storage is not to let the rest of the code "know" the filesystem path they are located on (because sometimes they will not be on any filesystem path). This patch makes the methods that can reveal this path private again so that later they can be abstracted out.
Closes#12182
* github.com:scylladb/scylladb:
sstable: Mark some methods private
test: Don't get sstable dir when known
test: Use move_to_quarantine() helper
test: Use sstable::filename() overload without dir name
sstables: Reimplement batch directory sync after move
table, tests: Make use of move_to_new_dir() default arg
sstables: Remove fsync_directory() helper
table: Simplify take_snapshot()'s collecting sstables names
The test enables an error injection inside the Raft upgrade procedure
on one of the nodes which will cause the node to throw an exception
before entering `synchronize` state. Then it restarts other nodes with
Raft enabled, waits until they enter `synchronize` state, puts them in
RECOVERY mode, removes the error-injected node and creates a new Raft
group 0.
As soon as the other nodes enter `synchronize`, the test disabled the
error injection (the rest of the test was outside the `async with
inject_error(...)` block). There was a small chance that we disabled the
error injection before the node reached it. In that case the node also
entered `synchronize` and the cluster managed to finish the upgrade
procedure. We encountered this during next promotion.
Eliminate this possibility by extending the scope of the `async with
inject_error(...)` block, so that the RECOVERY mode steps on the other
nodes are performed within that block.
Closes#12162
There are several class sstable methods that reveal internal directory
path to caller. It's not object-storage-friendly. Fortunately, all the
callers of those methods had been patched not to work with full paths,
so these can be marked private.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable_move_test creates sstables in its own temp directories and
then requests these dirs' paths back from the sstables. The test can use
the paths it has at hand; there's no need to call sstables for it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Two places in tests move sstable to quarantine subdir by hand. There's
the class sstable method that does the same, so use it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The dir this place currently uses is the directory where the sstable was
created, so dropping this argument would just render the same path.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a table::move_sstables_from_staging() method that gets a bunch
of sstables and moves them from the staging subdir into the table's root
datadir. To avoid flushing the root dir for every sstable move, it asks
sstable::move_to_new_dir() not to flush, but collects the staging dir
names and flushes them, together with the root dir, at the end altogether.
In order to make it more friendly to object-storage and to remove one
more caller of sstable::get_dir() the delayed_commit_changes struct is
introduced. It collects _all_ the affected dir names in unordered_set,
then allows flushing them. By default the move_to_new_dir() doesn't
receive this object and flushes the directories instantly.
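The batching idea can be sketched as follows. A minimal Python sketch of the delayed_commit_changes concept; `sync_directory` is a stand-in for the real fsync helper and the method names are assumptions.

```python
class DelayedCommitChanges:
    """Collect every affected directory name in a set, then sync each
    unique directory exactly once when commit() runs, instead of once
    per moved sstable."""
    def __init__(self, sync_directory):
        self._sync = sync_directory
        self._dirs = set()

    def add(self, dirname):
        self._dirs.add(dirname)

    def commit(self):
        for d in sorted(self._dirs):  # sorted only to make order stable
            self._sync(d)
        self._dirs.clear()
```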
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method in question accepts a boolean flag for whether or not it
should sync directories at the end. It's always true except in one case,
so there's a default value for it. Make use of it.
Anticipating the suggestion to replace bool with bool_class -- next
patch will replace it with something else.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This one effectively wraps the existing seastar sync_directory() helper
in two io_check-s. It's simpler to just call the latter directly.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method in question "snapshots" all the sstables it can find, then
writes their Data file names into the manifest file. To get the list of
file names it iterates over the sstables list again and does a silly
conversion of each full file path to a file name with the help of the
directory path length.
This can all be made much simpler by just collecting the component names
directly at the time the sstable is hardlinked.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
compaction_state shouldn't be moved once emplaced. Moving it could
theoretically cause the task's gate holder to have a dangling pointer to
compaction_state's gate, but it turns out gate's move ctor will actually
fail under this assertion:
assert(!_count && "gate reassigned with outstanding requests");
This cannot happen today, but let's make it more future-proof.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#12167
We used GOSSIP_ECHO verb to perform failure detection. Now we use
a special verb DIRECT_FD_PING introduced for this purpose.
There are multiple reasons to do so.
One minor reason: we want to use the same connection as other Raft
verbs: if we can't deliver Raft append_entries or vote messages
somewhere, that endpoint should be marked dead; if we can, the
endpoint should be marked alive. So putting pings on the same
connection as the other Raft verbs is important when dealing with
weird situations where some connections are available but others are
not. Observe that in `do_get_rpc_client_idx`, we put the new verb in
the right place.
Another minor reason: we remove the awkward gossiper `echo_pinger`
abstraction which required storing and updating gossiper generation
numbers. This also removes one dependency from Raft service code to
gossiper.
Major reason 1: the gossip echo handler has a weird mechanism where a
replacing node returns errors during the replace operation to some of
the nodes. In Raft however, we want to mark servers as alive when they
are alive, including a server running on a node that's replacing
another node.
Major reason 2, related to the previous one: when server B is
replacing server A with the same IP, the failure detector will try to
ping both servers. Both servers are mapped to the same IP by the
address map, so pings to both servers will reach server B. We want
server B to respond to the pings destined for server B, but not to
pings destined for server A, so the sender can mark B alive but keep A
marked dead.
To do this, we include the destination's Raft ID in our RPCs. The
destination compares the received ID with its own. If it's different,
it returns a `wrong_destination` response, and the failure detector
knows that the ping did not reach the destination (it reached someone
else).
Yet another reason: removes "Not ready to respond gossip echo
message" log spam during replace.
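The destination check described above can be sketched as follows. An illustrative Python sketch of the receiver side; the names and return values are assumptions, not the actual RPC types.

```python
PONG = "pong"
WRONG_DESTINATION = "wrong_destination"

def handle_direct_fd_ping(my_raft_id, dst_raft_id):
    """Node B (replacing A on the same IP) answers only pings addressed
    to its own Raft ID; pings meant for the replaced node A get
    wrong_destination, so the sender keeps A marked dead."""
    if dst_raft_id != my_raft_id:
        return WRONG_DESTINATION
    return PONG
```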
Closes#12107
* github.com:scylladb/scylladb:
service/raft: specialized verb for failure detector pinger
db: system_keyspace: de-staticize `{get,set}_raft_server_id`
service/raft: make this node's Raft ID available early in group registry
According to seastar/doc/lambda-coroutine-fiasco.md, a lambda that
co_awaits loses its capture frame after the first suspension. In the
distributed_loader code there's at least one lambda of that kind.
fixes: #12175
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12170
The method became unused since 70e5252a (table: no longer accept online
loading of SSTable files in the main directory) and the whole concept of
reshuffling sstables was dropped later by 7351db7c (Reshape upload files
and reshard+reshape at boot).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12165
Raft ID was loaded or created late in the boot procedure, in
`storage_service::join_token_ring`.
Create it earlier, as soon as it's possible (when `system_keyspace`
is started), pass it to `raft_group_registry::start` and store it inside
`raft_group_registry`.
We will use this Raft ID stored in group registry in following patches.
Also this reduces the number of disk accesses for this node's Raft ID.
It's now loaded from disk once, stored in `raft_group_registry`, then
obtained from there when needed.
This moves `raft_group_registry::start` a bit later in the startup
procedure - after `system_keyspace` is started - but it doesn't make
a difference.
In a recent commit 757d2a4, we removed the "xfail" mark from the test
test_manual_requests.py::test_too_large_request_content_length
because it started to pass on more modern versions of Python, with a
urllib3 bug fixed.
Unfortunately, the celebration was premature: It turns out that although
the test now *usually* passes, it sometimes fails. This is caused by
a Seastar bug scylladb/seastar#1325, which I opened #12166 to track
in this project. So unfortunately we need to add the "xfail" mark back
to this test.
Note that although the test will now be marked "xfail", it will actually
pass most of the time, so it will appear as "xpass" to people who run it.
I put a note in the xfail reason string as a reminder why this is
happening.
Fixes#12143
Refs #12166
Refs scylladb/seastar#1325
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12169
The field was not used for anything. We can keep a decommissioned server
in the `stopped` field.
In fact it caused us a problem: recently we started using
`ScyllaCluster.uninstall` to clean up servers after a test suite finishes
(previously we were using `ScyllaServer.uninstall` directly). But
`ScyllaCluster.uninstall` didn't look into the `decommissioned` field,
so if a server got decommissioned, we wouldn't uninstall it, and it left
behind some unnecessary artifacts even for successful tests. This is now
fixed.
Closes#12163
Mainly this PR removes the global db::config and feature service that are used by sstables::test_env as dependencies for the embedded sstables_manager. Other than that -- drop unused methods, remove nested test_env-s and relax a few cases that use two temp dirs at a time for no gain.
Closes#12155
* github.com:scylladb/scylladb:
test, utils: Use only one tempdir
sstable_compaction_test: Dont create nested envs
mutation_reader_test: Remove unused create_sstable() helper
tests, lib: Move globals onto sstables::test_env
tests: Use sstables::test_env.db_config() to access config
features: Mark feature_config_from_db_config const
sstable_3_x_test: Use env method to create sst
sstable_3_x_test: Indentation fix after previous patch
sstable_3_x_test: Use sstable::test_env
test: Add config to sstable::test_env creation
config: Add constexpr value for default murmur ignore bits
There's a do_with_cloned_tmp_directory helper that makes two temp dirs
to toss sstables between them. Make it use just one, all the more so it
would resemble the existing manipulations around the staging/ subdir.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The "compact" test case runs in sstables::test_env and additionally
wraps it with another instance provided by the do_with_tmp_directory
helper. It's simpler to create the temp dir by hand and use the outer env.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a bunch of objects that are used by test_env as sstables_manager
dependencies. Now that no other code needs those globals, they are better
placed on the test_env next to the manager.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently some places use the global test config, but it's going to be
removed soon, so switch to using the config from the environment.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It is in fact const. Other than that, the next patch will call it with a
const config at hand and would fail to compile without this fix.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are several cases there that construct sstables_manager by hand
with the help of a bunch of global dependencies. It's nicer to use
existing wrapper.
(indentation left broken until next patch)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
To let callers (tests) construct it with different options. In
particular, one test will soon want to construct it with a custom large
data handler of its own.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
... and use it in some places of sstable_compaction_test. This will
allow getting rid of the global test_db_config thing later.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The PR introduces shard_repair_task_impl, which represents a repair task
that spans a single shard's repair.
repair_info is replaced with shard_repair_task_impl, since both serve a
similar purpose.
Closes#12066
* github.com:scylladb/scylladb:
repair: reindent
repair: replace repair_info with shard_repair_task_impl
repair: move repair_info methods to shard_repair_task_impl
repair: rename methods of repair_module
repair: change type of repair_module::_repairs
repair: keep a reference to shard_repair_task_impl in row_level_repair
repair: move repair_range method to shard_repair_task_impl
repair: make do_repair_ranges a method of shard_repair_task_impl
repair: copy repair_info methods to shard_repair_task_impl
repair: coroutinize shard task creation
repair: define run for shard_repair_task_impl
repair: add shard_repair_task_impl
Since fixing issue #11737, when the expiration scanner times out reading
a page of data, it retries asking for the same page instead of giving up
on the scan and starting anew later. This retry was infinite - which can
cause problems if we have a bug in the code or several nodes down, and
can lead to getting stuck in the same place in the scan for a very long
(potentially infinite) time without making any progress.
An example of such a bug was issue #12145, where we forgot to handle
shutdowns, so on shutdown of the cluster we just hung forever repeating
the same request that will never succeed. It's better in this case to
just give up on the current scan, and start it anew (from a random
position) later.
Refs #12145 (that issue was already fixed, by a different patch which
stops the iteration when shutting down - not waiting for an infinite
number of iterations and not even one more).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The expiration-scanning thread is a long-running thread which can scan
data for hours, but it checks its abort-source before fetching each
page to allow for timely shutdown. Recently, we added the ability to
retry the page fetch in case of timeout, but forgot to check the
abort source in this new retry loop - which led to an infinitely-long
shutdown in some tests while the retry loop retried forever.
In this patch we fix this bug by using sleep_abortable() instead of
sleep(). sleep_abortable() will throw an exception if the abort source
was triggered before or during the sleep - and this exception will
stop the scan immediately.
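The pattern can be sketched with a Python/asyncio analogue (Scylla's actual code uses Seastar's `sleep_abortable()`; the function names and the one-second delay here are illustrative only):

```python
import asyncio

async def scan_with_retries(fetch_page, abort: asyncio.Event):
    """Retry a timed-out page fetch, but give up promptly on shutdown."""
    while True:
        try:
            return await fetch_page()
        except TimeoutError:
            # Analogue of sleep_abortable(): the sleep is cut short
            # (and the scan stops) as soon as the abort event fires.
            aborted = asyncio.create_task(abort.wait())
            sleeper = asyncio.create_task(asyncio.sleep(1))
            _, pending = await asyncio.wait(
                {aborted, sleeper}, return_when=asyncio.FIRST_COMPLETED)
            for t in pending:
                t.cancel()
            if abort.is_set():
                raise RuntimeError("scan aborted during retry sleep")
```

A plain `sleep()` here would let the loop spin forever on shutdown; checking the abort source during the sleep is what makes shutdown timely.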
Fixes#12145
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Some people prefer to cherry-pick individual commits
so that they have less conflicts to resolve at once.
Add a comment about this possibility.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
This is the core of dynamic IP address support in Raft, moving out the
IP address sourcing from Raft Group 0 configuration to gossip. At start
of Raft, the raft id <-> IP address translation map is subscribed to
the gossiper notifications and learns IP addresses of Raft hosts from them.
The series intentionally doesn't contain the part which speeds up the
initial cluster assembly by persisting the translation cache and using
more sources besides gossip (discovery, RPC) to show correctness of the
approach.
Closes#12035
* github.com:scylladb/scylladb:
raft: (rpc) do not throw in case of a missing IP address in RPC
raft: (address map) actively maintain ip <-> raft server id map
The backport instructions said that after passing
the tests next `becomes` master, but it's more
exact to say that next `is merged into` master.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Previously the section was called:
"How to backport a patch", which could be interpreted
as instructions for the maintainer.
The new title clearly states that these instructions
are for the contributor in case the maintainer couldn't
backport the patch by themselves.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Since we switched scylla-machine-image locale to C.UTF-8 because
ubuntu-minimal image does not have en_US.UTF-8 by default, we should
do the same on our docker image to reduce image size.
Verified #9570 does not occur on new image, since it is still UTF-8
locale.
Closes#12122
The site member is created in ScyllaCluster.start(); on startup failure
it might not be initialized, so check it's present before stop()ing
it. And delete it, as it's not running and proper initialization should
call ScyllaCluster.start().
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11939
A question was raised on what fetch_size (the requested page size
in a paged scan) counts when there is a filter: does it count the
rows before filtering (as scanned from disk) or after filter (as
will be returned to the client)?
This patch adds a test which demonstrates that Cassandra and Scylla
behave differently in this respect: Cassandra counts post-filtering -
so fetch_size results are actually returned, while Scylla currently
counts pre-filtering.
It is arguable which behavior is the "correct" one - we discuss this in
issue #12102. But we have already had several users (such as #11340)
who complained about Scylla's behavior and expected Cassandra's behavior,
so if we decide to keep Scylla's behavior we should at least explain and
justify this decision in our documentation. Until then, let's have this
test which reminds us of this incompatibility. This test currently passes
on Cassandra and fails (xfail) on Scylla.
Refs #11340
Refs #12102
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12103
This patch adds a regression test for the old issue #65 which is about
a multi-column (tuple) clustering-column relation in a SELECT when one of
these columns has reversed order. It turns out that we didn't notice,
but this issue was already solved - but we didn't have a regression test
for it. So this patch adds just a regression test. The test confirms that
Scylla now behaves as was desired when that issue was opened. The test
also passes on Cassandra, confirming that Scylla and Cassandra behave
the same for such requests.
Fixes#65
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12130
Better logging, less code, a minor fix.
Closes#12135
* github.com:scylladb/scylladb:
service/raft: raft_group0: less repetitive logging calls
service/raft: raft_group0: fix sleep_with_exponential_backoff
Add instructions on how to backport a feature
to an older version of Scylla.
It contains detailed step-by-step instructions
so that people unfamiliar with the intricacies
of Scylla's repository organization can
easily get the hang of it.
This is the guide I wish I had when I had
to do my first backport.
I put it in backport.md because that
looks like the file responsible
for this sort of information.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Remove raft_address_map::get_inet_address()
While at it, coroutinize some rpc methods.
To propagate the missing-IP-address event up, use coroutine::exception()
with a proper type (raft::transport_error) and a proper error message.
This is a building block for removing
raft_address_map::get_inet_address(), which is too generic, and shifting
the responsibility of handling missing addresses to the address map
clients. E.g. one-way RPC shouldn't throw if an address is missing, but
just drop the message.
P.S. An attempt to use a single template function turned out to be too
complex:
- some functions require a gate, some don't
- some return void, some future<> and some future<raft::data_type>
1) make address map API flexible
Before this patch:
- having a mapping without an actual IP address was an
internal error
- not having a mapping for an IP address was an internal
error
- re-mapping to a new IP address wasn't allowed
After this patch:
- the address map may contain a mapping
without an actual IP address, and the caller must be prepared for it:
find() will return a nullopt. This happens when we first add an entry
to Raft configuration and only later learn its IP address, e.g. via
gossip.
- it is allowed to re-map an existing entry to a new address;
2) subscribe to gossip notifications
Learning IP addresses from gossip allows us to adjust
the address map whenever a node IP address changes.
Gossiper is also the only valid source of re-mapping, other sources
(RPC) should not re-map, since otherwise a packet from a removed
server can remap the id to a wrong address and impact liveness of a Raft
cluster.
3) prime address map state with app state
Initialize the raft address map with initial
gossip application state, specifically IPs of members
of the cluster. With this, we no longer need to store
these IPs in Raft configuration (and update them when they change).
The obvious drawback of this approach is that a node
may join Raft config before it propagates its IP address
to the cluster via gossip - so the boot process has to
wait until it happens.
Gossip also doesn't tell us which IPs are members of Raft configuration,
so we subscribe to Group0 configuration changes to mark the
members of Raft config "non-expiring" in the address translation
map.
Thanks to the changes above, Raft configuration no longer
stores IP addresses.
We still keep the 'server_info' column in the raft_config system table,
in case we change our mind or decide to store something else in there.
Some log messages in retry loops in the Raft upgrade procedure included
a sentence like "sleeping before retrying..."; but not all of them.
With the recently added `sleep_with_exponential_backoff` abstraction we
can put this "sleeping..." message in a single place, and it's also easy
to say how long we're going to sleep.
I also enjoy using this `source_location` thing.
The SELECT JSON statement, just like SELECT, allows the user to rename
selected columns using an "AS" specification. E.g., "SELECT JSON v AS foo".
This specification was not honored: We simply forgot to look at the
alias in SELECT JSON's implementation (we did it correctly in regular
SELECT). So this patch fixes this bug.
We had two tests in cassandra_tests/validation/entities/json_test.py
that reproduced this bug. The checks in those tests now pass, but these
two tests still continue to fail after this patch because of two other
unrelated bugs that were discovered by the same tests. So in this patch
I also add a new test just for this specific issue - to serve as a
regression test.
Fixes#8078
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12123
* seastar 4f4cc00660...3a5db04197 (16):
> tls: add missing include <map>
> Merge 'util/process: use then_unpack to help automatically unpack tuple.' from Jianyong Chen
> HTTP: define formatter for status_type to fix build.
> fsnotifier: move it into namespace experimental and add docs.
> Move fsnotify.hh to the 'include' directory for public use.
> Merge 'reactor: define make_pipe() and use make_pipe() in reactor::spawn()' from Kefu Chai
> Merge 'Fix: error when compiling http_client_demo' from Amossss
> util/process: using `data_sink_impl::put`
> Merge 'dns: serialize UDP sends.' from Calle Wilund
> build: use correct version when finding liburing
> Merge 'Add simple http client' from Pavel Emelyanov
> future: use invoke_result instead of nested requirements
> Merge 'reactor: use separate calls in reactor and reactor_backend for read/write/sendmsg/recvmsg' from Kefu Chai
> util, core: add spawn_process() helper
> parallel utils: add note about shard-local parallelism
> shared_mutex: return typed exceptional future in with_* error handlers
Closes#12131
Some of the tests in test/alternator/test_ttl.py need an expiration scan
pass to complete and expire items. In development builds on developer
machines, this usually takes less than a second (our scanning period is
set to half a second). However, in debug builds on Jenkins each scan
often takes up to 100 (!) seconds (this is the record we've seen so far).
This is why we set the tests' timeout to 120.
But recently we saw another test run failing. I think the problem is
that in some case, we need not one, but *two* scanning passes to
complete before the timeout: It is possible that the test writes an
item right after the current scan passed it, so it doesn't get expired,
and then a second scan starts at a random position, possibly making the
item we mentioned one of the last items to be considered - so in total
we need to wait for two scanning periods, not one, for the item to
expire.
So this patch increases the timeout from 120 seconds to 240 seconds -
more than twice the highest scanning time we ever saw (100 seconds).
Note that this timeout is just a timeout, it's not the typical test
run time: The test can finish much more quickly, as little as one
second, if items expire quickly on a fast build and machine.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12106
Fix some issues found with gcc 12. Note we can't fully compile with gcc yet, due to [1].
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98056
Closes#12121
* github.com:scylladb/scylladb:
utils: observer: qualify seastar::noncopyable_function
sstables: generation_type: forgo constexpr on hash of generation_type
logalloc: disambiguate types and non-type members
task_manager: disambiguate types and non-type members
direct_failure_detector: don't change meaning of endpoint_liveness
schema: abort on illegal per column computation kind
database: abort on illegal per partition rate limit operation
mutation_fragment: abort on illegal fragment type
per_partition_rate_limit_options: abort on illegal operation type
schema: drop unused lambda
mutation_partition: drop unused lambda
cql3: create_index_statement: remove unused lambda
transport: prevent signed and unsigned comparison
database: don't compare signed and unsigned types
raft: don't compare signed and unsigned types
compaction: don't compare signed and unsigned compaction counts
bytes_ostream: don't take reference to packed variable
Test identifiers are globally unique, but this makes them less
useful in Jenkins Test Result Analyzer view. For example,
counter_test can be counter_test.432 in one run and counter_test.442
in another. Jenkins considers them different and so we don't see
a trend.
Limit the id uniqueness within a test case, so that we'll have
counter_test.{1, 2, 3} consistently. Those tests will be grouped
together so we can see pass/fail trends.
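A minimal sketch of per-case id allocation, assuming a counter keyed by the test's short name (class and method names here are illustrative, not the actual test.py code):

```python
from collections import defaultdict

class TestNamer:
    """Assign ids that are unique only within a test case, so a test is
    consistently named counter_test.1, counter_test.2, ... across runs."""
    def __init__(self) -> None:
        self._next_id: defaultdict[str, int] = defaultdict(int)

    def name(self, shortname: str) -> str:
        # Each test case gets its own counter, instead of one global counter
        # shared by all cases (which made ids unstable between runs).
        self._next_id[shortname] += 1
        return f"{shortname}.{self._next_id[shortname]}"
```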
Closes#11946
Fix https://github.com/scylladb/scylla-docs/issues/4126
Closes#11122
* github.com:scylladb/scylladb:
doc: add info about the time-consuming step due to resharding
doc: add the new KB to the toctree
doc: add a KB about updating the mode in perftune.yaml after upgrade
Our `null` expression, after the prepare stage, is redundant with a
`constant` expression containing the value NULL.
Remove it. Its role in the unprepared stage is taken over by
untyped_constant, which gains a new type_class enumeration to
represent it.
Some subtleties:
- Usually, handling of null and untyped_constant, or null and constant
was the same, so they are just folded into each other
- LWT "like" operator now has to discriminate between a literal
string and a literal NULL
- prepare and test_assignment were folded into the corresponding
untyped_constant functions. Some care had to be taken to preserve
error messages.
Closes#12118
Currently, each data sync repair task is started (and hence run) twice.
Thus, when two running operations happen within a time frame long
enough, the following situation may occur:
- the first run finishes
- after some time (ttl) the task is unregistered from the task manager
- the second run finishes and attempts to finish the task which does
not exist anymore
- memory access causes a segfault.
The second call to start is deleted. A check is added
to the start method to ensure that each task is started at most once.
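The start-at-most-once check can be sketched like this (a hypothetical guard, not Scylla's actual task manager code):

```python
class Task:
    """Sketch of a task that may legally be start()ed at most once."""
    def __init__(self) -> None:
        self._started = False

    def start(self) -> None:
        # Guard against a second start(): without it, the second run could
        # outlive the task's registration and touch freed state.
        if self._started:
            raise RuntimeError("task already started")
        self._started = True
        # ... kick off the repair work here ...
```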
Fixes: #12089
Closes#12090
In ad3d2ee47d, we replaced `bool` as an expression element
(representing a boolean constant) with `constant`. But a comment
and a concept continue to mention it.
Remove the comment and the concept fragment.
Closes#12119
Recent changes in topology restricted the get_dc/get_rack calls. Older
code was trying to locate the endpoint in gossiper, then in system
keyspace cache and if the endpoint was not found in both -- returned
"default" location.
New code generates internal error in this case. This approach already
helped to spot several BUGs in code that had been eventually fixed, but
echoes of that change still pop up.
This patch relaxes the "missing endpoint" case by printing a warning in
logs and returning the "default" location, as the old code did.
tests: update_cluster_layout_tests.py::*
hintedhandoff_additional_test.py::TestHintedHandoff::test_hintedhandoff_rebalance
bootstrap_test.py::TestBootstrap::test_decommissioned_wiped_node_can_join
bootstrap_test.py::TestBootstrap::test_failed_bootstap_wiped_node_can_join
materialized_views_test.py::TestMaterializedViews::test_decommission_node_during_mv_insert_4_nodes
refs: #11900
refs: #12054
fixes: #11870
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#12067
std::hash isn't constexpr, so gcc refuses to make hash of generation_type
constexpr. It's pointless anyway since we never have a compile-time
sstable generation.
logalloc::tracker has some members with the same names as types from
namespace scope. gcc (rightfully) complains that this changes
the meaning of the name. Qualify the types to disambiguate.
task_manager has some members with the same names as types from
namespace scope. gcc (rightfully) complains that this changes
the meaning of the name. Qualify the types to disambiguate.
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
bytes_ostream is packed, so its _begin member is packed as well.
gcc (correctly) disallows binding an unaligned variable to an aligned
reference, and complains.
Make it happy by open-coding the exchange operation.
The `add_server` function now takes an optional `ReplaceConfig` struct
(implemented using `NamedTuple`), which specifies the ID of the replaced
server and whether to reuse the IP address.
If we want to reuse the IP address, we don't allocate one using the host
registry. This required certain refactors: moving the code responsible
for allocation of IPs outside `ScyllaServer`, into `ScyllaCluster`.
Add two tests, but they are skipped for now: one of them is failing (the
new node is unable to join group 0) and both suffer from a hardcoded
60-second sleep in Scylla.
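A minimal sketch of the described interface, assuming hypothetical field names for the `NamedTuple` (the real pylib code may differ):

```python
from typing import NamedTuple, Optional

class ReplaceConfig(NamedTuple):
    """Describes a replace operation; the field names here are guesses."""
    replaced_id: str      # ID of the server being replaced
    reuse_ip_addr: bool   # reuse the replaced server's IP address?

def add_server(replace_cfg: Optional[ReplaceConfig] = None) -> str:
    # When reusing the IP, skip the host-registry lease entirely.
    if replace_cfg is not None and replace_cfg.reuse_ip_addr:
        return f"reuse ip of {replace_cfg.replaced_id}"
    return "lease new ip"
```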
Closes#12032
* github.com:scylladb/scylladb:
test/topology: simple node replace tests (currently disabled)
test/pylib: scylla_cluster: support node replace operation
test/pylib: scylla_cluster: move members initialization to constructor
test/pylib: scylla_cluster: (re)lease IP addr outside ScyllaServer
test/pylib: scylla_cluster: refactor create_server parameters to a struct
test.py: stop/uninstall clusters instead of servers when cleaning up
test/pylib: artifact_registry: replace `Awaitable` type with `Coroutine`
test.py: prepare for adding extra config from test when creating servers
test/pylib: manager_client: convert `add_server` to use `put_json`
test/pylib: rest_client: allow returning JSON data from `put_json`
test/pylib: scylla_cluster: don't import from manager_client
This reverts commit e2fe8559ca. I
ran all the release mode tests on aarch64 with it reverted, and
it passes. So it looks like whatever problems we had with it
were fixed.
Closes#12072
As a part of CQL rewrite we want to be able to perform filtering by calling `evaluate()` on an expression and checking if it evaluates to `true`. Currently trying to do that for a binary operator would result in an error.
Right now checking if a binary operation like `col1 = 123` is true is done using `is_satisfied_by`, which is able to check if a binary operation evaluates to true for a small set of predefined cases.
Eventually once the grammar is relaxed we will be able to write expressions like: `(col1 < col2) = (1 > ?)`, which doesn't fit with what `is_satisfied_by` is supposed to do.
Additionally, expressions like `1 = NULL` should evaluate to `NULL`, not `true` or `false`. `is_satisfied_by` is not able to express that properly.
The proper way to go is implementing `evaluate(binary_operator)`, which takes a binary operation and returns what the result of it would be.
Implementing `prepare_expression` for `binary_operator` requires us to be able to evaluate it first. In the next PR I will add support for `prepare_expression`.
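The NULL semantics that `evaluate(binary_operator)` must preserve can be sketched as three-valued equality (illustrative Python, with `None` standing in for CQL's NULL):

```python
from typing import Optional

def eval_eq(lhs: Optional[int], rhs: Optional[int]) -> Optional[bool]:
    """Three-valued equality: comparing anything with NULL yields NULL
    (None), never True or False - which a boolean-only predicate like
    is_satisfied_by cannot express."""
    if lhs is None or rhs is None:
        return None
    return lhs == rhs
```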
Closes#12052
* github.com:scylladb/scylladb:
cql-pytest: enable two unset value tests that pass now
cql-pytest: reduce unset value error message
cql3: expr: change unset value error messages to lowercase
cql_pytest: ensure that where clauses like token(p) = 0 AND p = 0 are rejected
cql3: expr: remove needless braces around switch cases
cql3: move evaluation IS_NOT NULL to a separate function
expr_test: test evaluating LIKE binary_operator
expr_test: test evaluating IS_NOT binary_operator
expr_test: test evaluating CONTAINS_KEY binary_operator
expr_test: test evaluating CONTAINS binary_operator
expr_test: test evaluating IN binary_operator
expr_test: test evaluating GTE binary_operator
expr_test: test evaluating GT binary_operator
expr_test: test evaluating LTE binary_operator
expr_test: test evaluating LT binary_operator
expr_test: test evaluating NEQ binary_operator
expr_test: test evaluating EQ binary_operator
cql3: expr properly handle null in is_one_of()
cql3: expr properly handle null in like()
cql3: expr properly handle null in contains_key()
cql3: expr properly handle null in contains()
cql3: expr: properly handle null in limits()
cql3: expr: remove unneeded overload of limits()
cql3: expr: properly handle null in equality operators
cql3: expr: remove unneeded overload of equal()
cql3: expr: use evaluate(binary_operator) in is_satisfied_by
cql3: expr: handle IS NOT NULL when evaluating binary_operator
cql3: expr: make it possible to evaluate binary_operator
cql3: expr: accept expression as lhs argument to like()
cql3: expr: accept expression as lhs in contains_key
cql3: expr: accept expression as lhs argument to contains()
The Alternator TTL expiration scanner scans an entire table using many
small pages. If any of those pages time out for some reason (e.g., an
overload situation), we currently consider the entire scan to have failed
and wait for the next scan period (which by default is 24 hours) when
we start the scan from scratch (at a random position). There is a risk
that if these timeouts are common enough to occur once or more per
scan, the result is that we double or more the effective expiration lag.
A better solution, done in this patch, is to retry from the same position
if a single page timed out - immediately (or almost immediately, we add
a one-second sleep).
Fixes#11737
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#12092
scylla-driver causes dtests to fail randomly (likely
due to incorrect handling of the USE statement). Revert
it.
* tools/java 73422ee114...1c06006447 (2):
> Revert "Add Scylla Cloud serverless support"
> Revert "Switch cqlsh to use scylla-driver"
update_normal_tokens checks that the endpoint is in topology.
Currently we call update_topology on this path only if it's
not a normal_token_owner, but there are paths when the
endpoint could be a normal token owner but still
be pending in topology, so always update it, just in case.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
asias@scylladb.com said:
> This comment was moved up to the wrong place when tmptr->update_topology was added.
> There is no race now since we use the copy-update-replace method to update token_metadata.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Recently, clang started complaining about std::unexpected_handler being
deprecated:
```
In file included from utils/exceptions.cc:18:
./utils/abi/eh_ia64.hh:26:10: warning: 'unexpected_handler' is deprecated [-Wdeprecated-declarations]
std::unexpected_handler unexpectedHandler;
^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/exception:84:18: note: 'unexpected_handler' has been explicitly marked deprecated here
typedef void (*_GLIBCXX11_DEPRECATED unexpected_handler) ();
^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/x86_64-redhat-linux/bits/c++config.h:2343:32: note: expanded from macro '_GLIBCXX11_DEPRECATED'
^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/x86_64-redhat-linux/bits/c++config.h:2334:46: note: expanded from macro '_GLIBCXX_DEPRECATED'
^
1 warning generated.
```
According to cppreference.com, it was deprecated in C++11 and removed in
C++17 (!).
This commit gets rid of the warning by inlining the
std::unexpected_handler typedef, which is defined as a pointer to a
function taking no arguments and returning void.
Fixes: #12022
Closes#12074
On rare occasions a SELECT on a DROPped table throws
cassandra.ReadFailure instead of cassandra.InvalidRequest. This could
not be reproduced locally.
Catch both exceptions as the table is not present anyway and it's
correctly marked as a failure.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#12027
Global index page caching, as introduced in 4.6
(078a6e422b and 9f957f1cf9) has proven to be misdesigned,
because it poses a risk of catastrophic performance regressions in
common workloads by flooding the cache with useless index entries.
Because of that risk, it should be disabled by default.
Refs #11202
Fixes#11889
Closes#11890
As a preparation to replacing repair_info with shard_repair_task_impl,
type of _repairs in repair module is changed from
std::unordered_map<int, lw_shared_ptr<repair_info>> to
std::unordered_map<int, tasks::task_id>.
As a part of replacing repair_info with shard_repair_task_impl,
instead of a reference to repair_info, row_level_repair keeps
a reference to shard_repair_task_impl.
Function do_repair_ranges is directly connected to shard repair tasks.
Turning it into shard_repair_task_impl method enables an access to tasks'
members with no additional intermediate layers.
Methods of repair_info are copied to shard_repair_task_impl. They are
not used yet, it's a preparation for replacing repair_info with
shard_repair_task_impl.
In system_keyspace::get_repair_history, the value of repair_uuid
was read from the row as tasks::task_id.
tasks::task_id is represented by an abstract_type specific
to utils::UUID. Thus, since their typeids differ, bad_cast
was thrown.
Now repair_uuid is read from the row as utils::UUID and then cast.
Since it is no longer needed, data_type_for<tasks::task_id> is deleted.
Fixes: #11966
Closes#12062
While implementing evaluate(binary_operator),
missing checks for unset values were added
to the comparisons in the filtering code.
Because of that, some tests for unset values
started passing.
There are still other tests for unset value
that are failing because Scylla doesn't
have all the checks that it should.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
When unset value appears in an invalid place
both Cassandra and Scylla throw an error.
The tests were written with Cassandra
and thus the expected error messages were
exactly the same as produced by Cassandra.
Scylla produces different error messages,
but both databases return messages with
the text 'unset value'.
Reduce the expected message text
from the whole message to something
that contains 'unset value'.
It would be hard to mimic Cassandra's
error messages in Scylla. There is no
point in spending time on that.
Instead it's better to modify the tests
so that they are able to work with
both Cassandra and Scylla.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The messages used to contain UNSET_VALUE
in capital letters, but the tests
expect messages with 'unset value'.
Change the message so that it can
match the expected error text in tests.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Add two node replace tests using the freshly added infrastructure.
One test replaces a node while using a different IP. It is disabled
because the replace operation has an unconditional 60-second sleep
(it doesn't depend on the ring_delay setting for some reason). The sleep
needs to be fixed before we can enable this test.
The other test replaces a node while reusing the replaced node's IP.
In addition to the sleep, the test fails because the node cannot join
group 0; it's stuck in an infinite loop of trying to join:
```
INFO 2022-11-18 15:56:19,933 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found no local group 0. Discovering...
INFO 2022-11-18 15:56:19,933 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader b7047f7e-03e6-4797-a723-24054201f91d
INFO 2022-11-18 15:56:19,934 [shard 0] raft_group0 - Server 8de951fd-a528-4a82-ac54-592ea269537f is starting group 0 with id 25d2b050-6751-11ed-b534-c3c40c275dd3
WARN 2022-11-18 15:56:20,935 [shard 0] raft_group0 - failed to modify config at peer b7047f7e-03e6-4797-a723-24054201f91d: seastar::rpc::timeout_error (rpc call timed out). Retrying.
INFO 2022-11-18 15:56:21,937 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader ee0175ea-6159-4d4c-9d7c-95c934f8a408
WARN 2022-11-18 15:56:22,937 [shard 0] raft_group0 - failed to modify config at peer ee0175ea-6159-4d4c-9d7c-95c934f8a408: seastar::rpc::timeout_error (rpc call timed out). Retrying.
INFO 2022-11-18 15:56:23,938 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader ee0175ea-6159-4d4c-9d7c-95c934f8a408
WARN 2022-11-18 15:56:24,939 [shard 0] raft_group0 - failed to modify config at peer ee0175ea-6159-4d4c-9d7c-95c934f8a408: seastar::rpc::timeout_error (rpc call timed out). Retrying.
```
and so on.
The `add_server` function now takes an optional `ReplaceConfig` struct
(implemented using `NamedTuple`), which specifies the ID of the replaced
server and whether to reuse the IP address.
If we want to reuse the IP address, we don't allocate one using the host
registry.
Since now multiple servers can have the same IP, introduce a
`leased_ips` set to `ScyllaCluster` which is used when `uninstall`ing
the cluster - to make sure we don't `release_host` the same host twice.
Previously some members had to be initialized in `install` because
that's when we first knew the IP address.
Now we know the IP address during construction, which allows us to make
the code a bit shorter and simpler, and establish invariants: some
members (such as `self.config`) are now valid for the entire lifetime of
the server object.
`install()` is reduced to performing only side effects (creating
directories, writing config files), all calculation is done inside the
constructor.
`ScyllaServer`s were constructed without IP addresses. They leased an IP
address from `HostRegistry` and released it in `uninstall`.
This responsibility was now moved into `ScyllaCluster`, which leases an
IP address for a server before constructing it, and passes it to the
constructor. It releases the addresses of its servers when uninstalling
itself.
This will allow the cluster to reuse the IP address of an existing
server in that cluster when adding a new server which wants to replace
the existing one. Instead of leasing a new address, it will pass
the existing IP address to the new server's constructor.
The refactor is also nice in that it establishes an invariant for
`ScyllaServer`, simplifying reasoning about the class: now it has
an `ip_addr` field at all times.
`host_registry` was moved from `ScyllaServer` to `ScyllaCluster`.
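The resulting ownership can be sketched as follows (toy classes with illustrative names; the real pylib code is async and does far more):

```python
class HostRegistry:
    """Toy address registry (names illustrative)."""
    def __init__(self) -> None:
        self._next = 1
        self.leased: set[str] = set()

    def lease_host(self) -> str:
        ip = f"127.0.0.{self._next}"
        self._next += 1
        self.leased.add(ip)
        return ip

    def release_host(self, ip: str) -> None:
        self.leased.discard(ip)

class ScyllaServer:
    def __init__(self, ip_addr: str) -> None:
        # Invariant: a server has its address for its entire lifetime.
        self.ip_addr = ip_addr

class ScyllaCluster:
    def __init__(self, registry: HostRegistry) -> None:
        self.registry = registry
        self.servers: list[ScyllaServer] = []

    def add_server(self) -> ScyllaServer:
        # The cluster, not the server, talks to the registry; this is
        # what lets a replacement server inherit an existing IP.
        server = ScyllaServer(self.registry.lease_host())
        self.servers.append(server)
        return server

    def uninstall(self) -> None:
        for s in self.servers:
            self.registry.release_host(s.ip_addr)
        self.servers.clear()
```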
`ScyllaCluster` constructor takes a function `create_server` which
itself takes 3 parameters now. Soon it will take a 4th. The list of
parameters is repeated at the constructor definition and the call site
of the constructor; with many parameters this gets tiresome.
Refactor the list of parameters to a `NamedTuple`.
`self.artifacts` was calling `ScyllaServer.stop` and
`ScyllaServer.uninstall`. Now it calls `ScyllaCluster.stop` and
`ScyllaCluster.uninstall`, which underneath stops/uninstalls
servers in this cluster.
We must be a bit more careful now in case installing/starting a
server inside a cluster fails: there are no server cleanup artifacts,
and a server is added to cluster's `running` map only after
`install_and_start` finishes (until that happens,
`ScyllaCluster.stop/uninstall` won't catch this server).
So handle failures explicitly in `install_and_start`.
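The explicit failure handling can be sketched like this (hypothetical names; the real `install_and_start` is async):

```python
class ServerError(Exception):
    pass

class Cluster:
    """Sketch: a server enters `running` only after a fully successful
    start, so a failed start must be cleaned up on the spot - cluster-wide
    stop/uninstall will never see it."""
    def __init__(self) -> None:
        self.running: dict[str, object] = {}
        self.cleaned_up: list[str] = []

    def install_and_start(self, name: str, fail: bool = False) -> None:
        try:
            if fail:
                raise ServerError(f"{name} failed to start")
            self.running[name] = object()
        except ServerError:
            # Not in `running` yet, so clean up here and re-raise.
            self.cleaned_up.append(name)
            raise
```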
This commit does not logically change how the tests are running - every
started server belongs to some cluster, so it will be cleaned up
- but it's an important refactor.
It will allow us to move IP address (de)allocation code outside
`ScyllaServer`, into `ScyllaCluster`, which in turn will allow us to
implement node replace operation for the case where we want to reuse
the replaced node's IP.
Also, `ScyllaCluster.uninstall` was unused before this change, now it's
used.
Currently, TTL is listed as one of the experimental features: https://docs.scylladb.com/stable/alternator/compatibility.html#experimental-api-features
This PR moves the feature description from the Experimental Features section to a separate section.
I've also added some links and improved the formatting.
@tzach I've relied on your release notes for RC1.
Refs: https://github.com/scylladb/scylladb/issues/5060
Closes#11997
* github.com:scylladb/scylladb:
Update docs/alternator/compatibility.md
doc: update the link to Enabling Experimental Features
doc: remove the note referring to the previous ScyllaDB versions and add the relevant limitation to the paragraph
doc: update the links to the Enabling Experimental Features section
doc: add the link to the Enabling Experimental Features section
doc: move the TTL Alternator feature from the Experimental Features section to the production-ready section
Until now, the Alternator TTL feature was considered "experimental",
and had to be manually enabled on all nodes of the cluster to be usable.
This patch removes this requirement and in essence GAs this feature.
Even after this patch, Alternator TTL is still a "cluster feature",
i.e., for this feature to be usable every node in the cluster needs
to support it. If any of the nodes is old and does not yet support this
feature, the UpdateTimeToLive request will not be accepted, so although
the expiration-scanning threads may exist on the newer nodes, they will
not do anything because none of the tables can be marked as having
expiration enabled.
This patch does not contain documentation fixes - the documentation
still suggests that the Alternator TTL feature is experimental.
The documentation patch will come separately.
Fixes #12037
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #12049
The `cleanup_before_exit` method of `ArtifactRegistry` calls `close()`
on artifacts. mypy complains that `Awaitable` has no such method. In
fact, the `artifact` objects that we pass to `ArtifactRegistry`
(obtained by calling `async def` functions) do have a `close()` method,
and they are a particular case of `Awaitable`s, but in general not
all `Awaitable`s have `close()`.
Replace `Awaitable` with one of its subtypes: `Coroutine`. `Coroutine`s
have a `close()` method, and `async def` functions return objects of
this type. mypy no longer complains.
[PR](https://github.com/scylladb/scylladb/pull/9314) fixed a similar issue with regular insert statements
but missed the LWT code path.
It's expected behaviour of
`modification_statement::create_clustering_ranges` to return an
empty range in this case, since `possible_lhs_values` it
uses explicitly returns `empty_value_set` if it evaluates `rhs`
to null, and it has a comment about it (All NULL
comparisons fail; no column values match.) On the other hand,
all components of the primary key are required to be set,
this is checked at the prepare phase, in
`modification_statement::process_where_clause`. So the only
problem was `modification_statement::execute_with_condition`
was not expecting an empty `clustering_range` in case of
a null clustering key.
Also, this patch contains a fix for a problem with a wrong
column name in Scylla error messages. If an `INSERT` or `DELETE`
statement is missing a non-last element of
the primary key, the generated error message contains
an invalid column name.
The problem occurs if the query contains a column with the list type,
otherwise
`statement_restrictions::process_clustering_columns_restrictions`
checks that all the components of the key are specified.
Closes #12047
* github.com:scylladb/scylladb:
cql: refactor, inline modification_statement::validate_primary_key_restrictions
cql: DELETE with null value for IN parameter should be forbidden
cql: add column name to the error message in case of null primary key component
cql: batch statement, inserting a row with a null key column should be forbidden
cql: wrong column name in error messages
modification_statement: fix LWT insert crash if clustering key is null
Release 5.1 introduced a new CQL extension that applies to the CREATE TABLE and ALTER TABLE statements. The ScyllaDB-specific extensions are described on a separate page, so the CREATE TABLE and ALTER TABLE sections should include links to that page and section.
Note: CQL extensions are described with Markdown, while the Data Definition page is RST. Currently, there's no way to link from an RST page to an MD subsection (using a section heading or anchor), so a URL is used as a temporary solution.
Related: https://github.com/scylladb/scylladb/pull/9810
Closes #12070
* github.com:scylladb/scylladb:
doc: move the info about per-partition rate limit for the ALTER TABLE statement from the paragraph to the list
doc: add the links to the per-partition rate limit extension to the CREATE TABLE and ALTER TABLE sections
Don't call get_datacenter(ep) without first checking
has_endpoint(ep), since the former may abort
on internal error if the endpoint is not listed
in topology.
Refs #11870
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #12054
If a DELETE statement contains an IN operator and the
parameter value for it is NULL, this should also trigger
an error. This is in line with how Cassandra
behaves in this case.
Regular INSERT statements with null values for primary key
components are rejected by Scylla since #9286 and #9314.
Batch statements missed a similar check, this patch
fixes it.
Fixes: #12060
If an INSERT or DELETE statement is missing a non-last element of
the primary key, the generated error message contains
an invalid column name.
The problem occurs if the query contains a column with the list type,
otherwise
statement_restrictions::process_clustering_columns_restrictions
checks that all the components of the key are specified.
Fixes: #12046
Returns an unordered set of datacenter names
to be used by network_topology_replication_strategy
and for ks_prop_defs.
The set is kept in sync with _dc_endpoints.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #12023
Since we moved all IaaS code to scylla-machine-image, we no longer need
the AMI variable in the sysconfig file or the --ami parameter in the setup
scripts, and /etc/scylla/ami_disabled was never used.
So let's drop all of them from the Scylla core.
Related to scylladb/scylla-machine-image#61
Closes #12043
On startup, the sstable_directory is constructed with a bunch of booleans that control the way its process_sstable_dir method works. It's shorter and simpler to pass these booleans into the method directly, all the more so as there's another flag that's already passed like this.
Closes #12005
* github.com:scylladb/scylladb:
sstable_directory: Move all RAII booleans onto flags
sstable_directory: Convert sort-sstables argument to flags struct
sstable_directory: Drop default filter
Scylla doesn't support combining restrictions
on token with other restrictions on partition key columns.
Some pieces of code depend on the assumption
that such combinations are allowed.
In case they were allowed in the future
these functions would silently start
returning wrong results, and we would
return invalid rows.
Add a test that will start failing once
this restriction is removed. It will
warn the developer to change the
functions that used to depend
on the assumption.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The PR introduces top level repair tasks representing repair and node operations
performed with repair. The actions performed as a part of these operations are
moved to corresponding tasks' run methods.
Also, a small change to the repair module is added.
Closes #11869
* github.com:scylladb/scylladb:
repair: define run for data_sync_repair_task_impl
repair: add data_sync_repair_task_impl
tasks: repair: add noexcept to task impl constructor
repair: define run for user_requested_repair_task_impl
repair: add user_requested_repair_task_impl
repair: allow direct access to max_repair_memory_per_range
Originally put braces around the cases because
there were local variables that I didn't want
to be shadowed.
Now there are no variables so the braces
can be removed without any problems.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
When evaluating a binary operation with
operations like EQUAL, LESS_THAN, IN
the logic of the operation is put
in a separate function to keep things clean.
IS NOT NULL is the only exception:
it has its evaluate implementation
right in the evaluate(binary_operator)
function.
It would be cleaner to have it in
a separate dedicated function,
so it's moved to one.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
There is a more general version of limits()
which takes expressions as both the lhs and rhs
arguments.
There is no need for a specialized overload.
This specialized overload takes a tuple_constructor
as lhs, but we call evaluate() on both sides
of a binary operator before checking equality,
so this won't be useful at all.
Having multiple functions increases the risk
that one of them has a bug, while giving
dubious benefit.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Expressions like:
123 = NULL
NULL = 123
NULL = NULL
NULL != 123
should be tolerated, but evaluate to NULL.
The current code assumes that a binary operator
can only evaluate to a boolean - true or false.
Now a binary operator can also evaluate to NULL.
This should happen in cases when one of the
operator's sides is NULL.
A special class is introduced to represent a value
that can be one of three things: true, false or null.
It's better than using std::optional<bool>,
because optional has implicit conversions to bool
that could cause confusion and bugs.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
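The three-valued idea can be sketched like this (a minimal illustration; the class and function names here are hypothetical, not Scylla's actual ones):

```cpp
#include <cassert>
#include <optional>

// Illustrative sketch only: a three-valued boolean (true/false/NULL) that,
// unlike std::optional<bool>, has no implicit conversion to bool.
class bool_or_null {
    std::optional<bool> _value; // std::nullopt represents NULL
    bool_or_null() = default;
public:
    bool_or_null(bool b) : _value(b) {}
    static bool_or_null null() { return {}; }
    bool is_null() const { return !_value.has_value(); }
    bool is_true() const { return _value.has_value() && *_value; }
    bool is_false() const { return _value.has_value() && !*_value; }
};

// A NULL on either side of '=' evaluates to NULL rather than false.
inline bool_or_null null_aware_equal(std::optional<int> lhs, std::optional<int> rhs) {
    if (!lhs.has_value() || !rhs.has_value()) {
        return bool_or_null::null();
    }
    return bool_or_null(*lhs == *rhs);
}
```

The absence of a conversion to bool forces callers to spell out which of the three outcomes they mean, which is exactly the confusion std::optional<bool> would reintroduce.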
When processing a query, we keep a pointer to an effective_replication_map.
In a couple of places we used the latest topology instead of the one held by the effective_replication_map
that the query uses, which might lead to inconsistencies if, for example, a node is removed from the topology after a decommission that happens concurrently with the query.
This change gets the topology& from the e_r_m in those cases.
Fixes #12050
Closes #12051
* github.com:scylladb/scylladb:
storage_proxy: pass topology& to sort_endpoints_by_proximity
storage_proxy: pass topology& to is_worth_merging_for_range_query
Currently the ctor of said class always allocates as it copies the
provided name string and it creates a new name via format().
We want to avoid this, now that the validator is used on the read path.
So defer creating the formatted name to when we actually want to log
something, which is either when log level is debug or when an error is
found. We don't care about performance in either case, but we do care
about it on the happy path.
Further to the above, provide a constructor for string literal names and
when this is used, don't copy the name string, just save a view to it.
Refs: #11174
Closes #12042
Contains fixes requested in the issue (and some tiny extras), together with analysis why they don't affect the users (see commit messages).
Fixes [#11800](https://github.com/scylladb/scylladb/issues/11800)
Closes #11926
* github.com:scylladb/scylladb:
alternator: add maybe_quote to secondary indexes 'where' condition
test/alternator: correct xfail reason for test_gsi_backfill_empty_string
test/alternator: correct indentation in test_lsi_describe
alternator: fix wrong 'where' condition for GSI range key
There's a bunch of booleans that control the behavior of sstable
directory scanning. Currently they are described as verbose
bool_class<>-es and are passed at sstable_directory construction time.
However, they are not used outside of the .process_sstable_dir() method, and
moving them onto the recently added flags struct makes the code much
shorter (29 insertions(+), 121 deletions(-)).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable_directory::process_sstable_dir() accepts a boolean to
control its behavior when collecting sstables. Turn this boolean into a
structure of flags. The intention is to extend this flags set in the
future (next patch).
This boolean is true all the time, but one place sets it in a
"verbose" manner, like this:

    bool sort_sstables_according_to_owner = false;
    process_sstable_dir(directory, sort_sstables_according_to_owner).get();

and the local variable is not used afterwards. Using designated initializers
solves the verbosity in a nicer manner.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
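The designated-initializer approach can be sketched as follows (the struct and field names are illustrative, not the exact sstable_directory flags):

```cpp
// Hypothetical flags struct standing in for sstable_directory's real one.
struct process_flags {
    bool need_mutate_level = false;
    bool throw_on_missing_toc = false;
    bool sort_sstables_according_to_owner = true;
};

// With C++20 designated initializers, a caller names only the flag it
// overrides -- no verbose local boolean needed:
//     process_sstable_dir(directory, process_flags{ .sort_sstables_according_to_owner = false });
inline process_flags default_process_flags() {
    return process_flags{};
}
```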
It's used as default argument for .reshape() method, but callers specify
it explicitly. At the same time the filter is simple enough and is only
used in one place so that the caller can just use explicit lambda.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There is a more general version of equal()
which takes expressions as both the lhs and rhs
arguments.
There is no need for a specialized overload.
This specialized overload takes a tuple_constructor
as lhs, but we call evaluate() on both sides
of a binary operator before checking equality,
so this won't be useful at all.
Having multiple functions increases the risk
that one of them has a bug, while giving
dubious benefit.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
It mustn't use the latest topology that may differ from the
one used by the query as it may be missing nodes
(e.g. after concurrent decommission).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
It mustn't use the latest topology that may differ from the
one used by the query as it may be missing nodes
(e.g. after concurrent decommission).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
PR #9314 fixed a similar issue with regular insert statements
but missed the LWT code path.
It's expected behaviour of
modification_statement::create_clustering_ranges to return an
empty range in this case, since possible_lhs_values it
uses explicitly returns empty_value_set if it evaluates rhs
to null, and it has a comment about it (All NULL
comparisons fail; no column values match.) On the other hand,
all components of the primary key are required to be set,
this is checked at the prepare phase, in
modification_statement::process_where_clause. So the only
problem was modification_statement::execute_with_condition
was not expecting an empty clustering_range in case of
a null clustering key.
Fixes: #11954
This bug doesn't affect anything; the reason is described in the commit:
'alternator: fix wrong 'where' condition for GSI range key'.
But it's theoretically correct to escape those key names and
the difference can be observed via CQL's describe table. Before
the patch 'where' condition is missing one double quote in variable
name making it mismatched with corresponding column name.
Otherwise, I think the assert is not executed in a loop. And I am not sure why the lsi variable can be bound
to anything. As I tested, it was pointing to the last element in lsis...
This bug doesn't manifest in a visible way to the user.
Adding the index to an existing table via GlobalSecondaryIndexUpdates is not supported
so we don't need to consider what could happen for empty values of index range key.
After the index is added the only interesting value user can set is omitting
the value (null or empty are not allowed, see test_gsi_empty_value and
test_gsi_null_value).
In practice, regardless of the 'where' condition, the underlying materialized
view code skips row updates with missing keys, as per this comment:
'If one of the key columns is missing, set has_new_row = false
meaning that after the update there will be no view row'.
That's why the added test passes both before and after the patch.
But it's still useful to include it to exercise those code paths.
Fixes #11800
This patch includes a translation of several additional small test files
from Cassandra's CQL unit test directory cql3/validation/operations.
All tests included here pass on both Cassandra and Scylla, so they did
not discover any new Scylla bugs, but can be useful in the future as
regression tests.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #12045
Take advantage of the facts that both the owned ranges
and the initial non_owned_ranges (derived from the set of sstables)
are deoverlapped and sorted by start token to turn
the calculation of the final non_owned_ranges from
quadratic to linear.
Fixes #11922
Closes #11903
* github.com:scylladb/scylladb:
dht: optimize subtract_ranges
compaction: refactor dht::subtract_ranges out of get_ranges_for_invalidation
compaction_manager: needs_cleanup: get first/last tokens from sstable decorated keys
is_satisfied_by has to check if a binary_operator is satisfied
by some values. It used to be impossible to evaluate
a binary_operator, so is_satisfied_by had code to check
whether it is satisfied for a limited number of cases
occurring when filtering queries.
Now evaluate(binary_operator) has been implemented
and is_satisfied_by can use it to check if a binary_operator
evaluates to true.
This is cleaner and reduces code duplication.
Additionally, cql tests will exercise the new evaluate() implementation.
There is one special case with token().
When is_satisfied_by sees a restriction on token
it assumes that it's satisfied because it's
sure that these token restrictions were used
to generate partition ranges.
I had to leave this special case in because it's impossible
to evaluate(token). Once this is implemented I will remove
the special case because it's risky and prone to cause
bugs.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The code to evaluate binary operators
was copied from is_satisfied_by.
is_satisfied_by wasn't able to evaluate
IS NOT NULL restrictions, so when such a restriction
was encountered it threw an exception.
Implement proper handling for IS NOT NULL binary operators.
The switch ensures that all variants of oper_t are handled,
otherwise there would be a compilation error.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
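The exhaustive-switch pattern can be sketched as follows (the enumerators here are a reduced, illustrative subset of oper_t, and describe() is a hypothetical helper):

```cpp
#include <cstring>

// Illustrative subset; the real oper_t has more variants.
enum class oper_t { EQ, NEQ, LT, GT, IS_NOT };

// No default case: with -Wswitch (or -Werror=switch), adding a new oper_t
// enumerator without handling it here is flagged at compile time.
inline const char* describe(oper_t op) {
    switch (op) {
    case oper_t::EQ:     return "=";
    case oper_t::NEQ:    return "!=";
    case oper_t::LT:     return "<";
    case oper_t::GT:     return ">";
    case oper_t::IS_NOT: return "IS NOT"; // e.g. IS NOT NULL
    }
    return "?"; // unreachable if all cases are handled
}
```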
In order to support different storage kinds for sstable files (e.g., S3), we need to localize all the places that manipulate files on a POSIX filesystem so that custom storage could implement them in its own way. This set moves the deletion log manipulations to sstable_directory.cc, which already "knows" that it works over a directory.
Closes #12020
* github.com:scylladb/scylladb:
sstables: Delete log file in replay_pending_delete_log()
sstables: Move deletion log manipulations to sstable_directory.cc
sstables: Open-code delete_sstables() call
sstables: Use fs::path in replay_pending_delete_log()
sstables: Indentation fix after previous patch
sstables: Coroutinize replay_pending_delete_log
sstables: Read pending delete log with one line helper
sstables: Dont write pending log with file_writer
evaluate() takes an expression and evaluates it
to a constant value. It wasn't possible to evaluate
binary operators before, so support for them is added.
The code is based on is_satisfied_by,
which is currently used to check
whether a binary operator evaluates
to true or false.
It looks like is_satisfied_by and evaluate()
do pretty much the same thing; one could be
implemented using the other.
In the future they might get merged
into a single function.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
like() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
contains_key() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
contains() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Take advantage of the fact that both ranges and
ranges_to_subtract are deoverlapped and sorted by start token
to reduce the calculation complexity from
quadratic to linear.
Fixes #11922
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The algorithm is generic and can be used elsewhere.
Add a unit test for the function before it gets
optimized in the following patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
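Under the stated preconditions (both lists deoverlapped and sorted by start), the linear pass can be sketched with simple half-open integer intervals standing in for token ranges; this is an illustration of the technique, not Scylla's dht::subtract_ranges itself:

```cpp
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

using interval = std::pair<int, int>; // half-open [first, second)

// Subtract `to_subtract` from `ranges` in one merge-like pass. Both inputs
// must be deoverlapped and sorted by start, which lets us avoid comparing
// every range against every range_to_subtract (the quadratic approach).
inline std::vector<interval>
subtract_ranges(const std::vector<interval>& ranges,
                const std::vector<interval>& to_subtract) {
    std::vector<interval> result;
    size_t j = 0;
    for (auto [start, end] : ranges) {
        // Skip subtrahends that end before this range begins.
        while (j < to_subtract.size() && to_subtract[j].second <= start) {
            ++j;
        }
        // Carve out every overlapping subtrahend (one may span several ranges,
        // so scan from j without consuming it permanently).
        size_t k = j;
        while (start < end && k < to_subtract.size() && to_subtract[k].first < end) {
            if (to_subtract[k].first > start) {
                result.emplace_back(start, to_subtract[k].first);
            }
            start = std::max(start, to_subtract[k].second);
            ++k;
        }
        if (start < end) {
            result.emplace_back(start, end);
        }
    }
    return result;
}
```

For example, subtracting {[5,25)} from {[0,10), [20,30)} yields {[0,5), [25,30)} in a single pass over both lists.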
Currently, the function is inefficient in two ways:
1. unnecessary copy of the first/last keys to automatic variables
2. redecorating the partition keys with the schema passed to
needs_cleanup.
We can just use the tokens from the sstable's first/last decorated keys.
The deletion log concept uses the fact that files are on a POSIX
filesystem. Support for another storage type will have to reimplement
this place, so keep the FS-specific code in _directory.cc file.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's not used by any other code, and to be used it requires the caller to
transform TOC file names by prepending the sstable directory to them. Things
get shorter and simpler by merging the helper code into the caller.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's called by code that has fs::path at hand and internally uses
helpers that need fs::path too, so there's no need to convert it back and forth.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's a wrapper over output_stream with offset tracking and the tracking
is not needed to generate a log file. As a bonus of switching back we
get a stream.write(sstring) sugar.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Fix https://github.com/scylladb/scylladb/issues/11598
This PR adds the troubleshooting article submitted by @syuu1228 in the deprecated _scylla-docs_ repo, with https://github.com/scylladb/scylla-docs/pull/4152.
I copied and reorganized the content and rewrote it a little according to the RST guidelines so that the page renders correctly.
@syuu1228 Could you review this PR to make sure that my changes didn't distort the original meaning?
Closes #11626
* github.com:scylladb/scylladb:
doc: apply the feedback to improve clarity
doc: add the link to the new Troubleshooting section and replace Scylla with ScyllaDB
doc: add the new page to the toctree
doc: add a troubleshooting article about the missing configuration files
There's a logical dependency from `manager_client` to `scylla_cluster`
(`ManagerClient` defined in `manager_client` talks to
`ScyllaClusterManager` defined in `scylla_cluster` over RPC). There is
no such dependency in the other way. Do not introduce it accidentally.
We can import these types from the `internal_types` module.
We had an xfailing test that reproduced a case where Alternator tried
to report an error when the request was too long, but the boto library
didn't see this error and threw a "Broken Pipe" error instead. It turns
out that this wasn't a Scylla bug but rather a bug in urllib3, which
overzealously reported a "Broken Pipe" instead of trying to read the
server's response. It turns out this issue was already fixed in
https://github.com/urllib3/urllib3/pull/1524
and now, on modern installations, the test that used to fail now passes
and reports "XPASS".
So in this patch we remove the "xfail" tag, and skip the test if
running an old version of urllib3.
Fixes #8195
Closes #12038
Fragment reordering and fragment dropping bugs have been plaguing us since forever. To fight them we added a validator to the sstable write path to prevent really messed up sstables from being written.
This series adds validation to the mutation compactor. This will cover reads and compaction among others, hopefully ridding us of such bugs on the read path too.
This series fixes some benign-looking issues found by unit tests after the validator was added -- although how benign a producer emitting two partition-ends is depends entirely on how the consumer reacts to it, so no such bug is actually benign.
Fixes: https://github.com/scylladb/scylladb/issues/11174
Closes #11532
* github.com:scylladb/scylladb:
mutation_compactor: add validator
mutation_fragment_stream_validator: add a 'none' validation level
test/boost/mutation_query_test: test_partition_limit: sort input data
querier: consume_page(): use partition_start as the sentinel value
treewide: use ::for_partition_end() instead of ::end_of_partition_tag_t{}
treewide: use ::for_partition_start() instead of ::partition_start_tag_t{}
position_in_partition: add for_partition_{start,end}()
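The boundary checks such a validator performs can be sketched as a small state machine over fragment kinds. This is only a model of the idea: the real mutation_fragment_stream_validator also checks positions and token order, which are omitted here, and the names are illustrative:

```cpp
#include <vector>

// Reduced model of a mutation fragment stream: just the fragment kinds.
enum class fragment_kind { partition_start, row, partition_end };

// Reject out-of-order or duplicated partition boundaries, e.g. a producer
// emitting two partition-ends in a row, or a row outside any partition.
inline bool valid_fragment_stream(const std::vector<fragment_kind>& fragments) {
    bool in_partition = false;
    for (auto f : fragments) {
        switch (f) {
        case fragment_kind::partition_start:
            if (in_partition) return false; // nested partition-start
            in_partition = true;
            break;
        case fragment_kind::row:
            if (!in_partition) return false; // row outside a partition
            break;
        case fragment_kind::partition_end:
            if (!in_partition) return false; // duplicate/stray partition-end
            in_partition = false;
            break;
        }
    }
    return !in_partition; // stream must not end mid-partition
}
```

Hooking a check like this into the compactor means every read and compaction pass validates the stream for free, instead of only the write path.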
Adds unit tests for the function `expr::prepare_expression`.
Three minor bugs were found by these tests, all fixed in this PR.
1. When preparing a map, the type for tuple constructor was taken from an unprepared tuple, which has `nullptr` as its type.
2. Preparing an empty nonfrozen list or set resulted in `null`, but preparing a map didn't. Fixed this inconsistency.
3. Preparing a `bind_variable` with a `nullptr` receiver was allowed. The `bind_variable` ended up with a `nullptr` type, which is incorrect. Changed it to throw an exception.
Closes #11941
* github.com:scylladb/scylladb:
test preparing expr::usertype_constructor
expr_test: test that prepare_expression checks style_type of collection_constructor
expr_test: test preparing expr::collection_constructor for map
prepare_expr: make preparing nonfrozen empty maps return null
prepare_expr: fix a bug in map_prepare_expression
expr_test: test preparing expr::collection_constructor for set
expr_test: test preparing expr::collection_constructor for list
expr_test: test preparing expr::tuple_constructor
expr_test: test preparing expr::untyped_constant
expr_test_utils: add make_bigint_raw/const
expr_test_utils: add make_tinyint_raw/const
expr_test: test preparing expr::bind_variable
cql3: prepare_expr: forbid preparing bind_variable without a receiver
expr_test: test preparing expr::null
expr_test: test preparing expr::cast
expr_test_utils: add make_receiver
expr_test_utils: add make_smallint_raw/const
expr_test: test preparing expr::token
expr_test: test preparing expr::subscript
expr_test: test preparing expr::column_value
expr_test: test preparing expr::unresolved_identifier
expr_test_utils: mock data_dictionary::database
We had a test that used to fail because of issue #8745. But this issue
was already fixed, and we forgot to remove the "xfail" marker. The test
now passes, so let's remove the xfail marker.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #12039
Add new guide for upgrading 5.1 to 5.2.
In this new upgrade doc, include additional steps for enabling
Raft using the `consistent_cluster_management` flag. Note that we don't
have this flag yet but it's planned to replace the experimental flag in
5.2.
In the "Raft in ScyllaDB" document, add sections about:
- enabling Raft in existing clusters in Scylla 5.2,
- verifying that the internal Raft upgrade procedure finishes
successfully,
- recovering from a stuck Raft upgrade procedure or from a majority loss
situation.
Fix some problems in the documentation, e.g. it is not possible to
enable Raft in an existing cluster in 5.0, but the documentation claimed
that it is.
Follow-up items:
- if we decide for a different name for `consistent_cluster_management`,
use that name in the docs instead
- update the warnings in Scylla to link to the Raft doc
- mention Enterprise versions once we know the numbers
- update the appropriate upgrade docs for Enterprise versions
once they exist
Closes #11910
* github.com:scylladb/scylladb:
docs: describe the Raft upgrade and recovery procedures
docs: add upgrade guide 5.1 -> 5.2
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to ensure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for new waiters or inactive readers to be added without the fiber acting on them. Since the fiber still held the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, yet it still had waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released in the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection for the inactive/waiter contradiction.
Fixes: #11923
Refs: #11770
Closes #12026
* github.com:scylladb/scylladb:
reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach
This series contains a mixed bag of improvements to `scylla sstable dump-data`. These improvements are mostly aimed at making the json output clearer, getting rid of any ambiguities.
Closes #12030
* github.com:scylladb/scylladb:
tools/scylla-sstable: traverse sstables in argument order
tools/scylla-sstable: dump-data docs: s/clustering_fragments/clustering_elements
tools/scylla-sstable: dump-data/json: use Null instead of "<unknown>"
tools/scylla-sstable: dump-data/json: use more uniform format for collections
tools/scylla-sstable: dump-data/json: make cells easier to parse
Since recently, the framework uses a separate set of unique IDs to
identify servers, but the log file and workdir are still named using the
last part of the IP address.
This is confusing: the test logs sometimes don't provide the IP addr
(only the ID), and even if they do, the reader of the test log may not
know that they need to look at the last part of the IP to find the
node's log/workdir.
Also using ID will be necessary if we want to reuse IP addresses (e.g.
during node replace, or simply not to run out of IP addresses during
testing).
So use the ID instead to name the workdir and log file.
Also, when starting a test case, print the used cluster. This will make
it easier to map server IDs to their IP addresses when browsing through
the test logs.
Closes #12018
* github.com:scylladb/scylladb:
test/pylib: manager_client: print used cluster when starting test case
test/pylib: scylla_cluster: use server ID to name workdir and log file, not IP address
BOOST_CHECK_EQUAL is a weaker form of assertion: it reports an error
and will cause the test case to fail, but continues. This makes the
test harder to debug because there's no obvious way to catch the
failure in GDB, and the test output is also flooded with things which
happen after the failed assertion.
Message-Id: <20221119171855.2240225-1-tgrabiec@scylladb.com>
When filtering with multi column restriction present all other restrictions were ignored.
So a query like:
`SELECT * FROM WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.
This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)
When multi column restrictions were detected, the code checked if they are satisfied and returned immediately.
This is fixed by returning only when these restrictions are not satisfied. When they are satisfied the other restrictions are checked as well to ensure all of them are satisfied.
This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi column and regular columns and this approach was correct.
Fixes: #6200
Fixes: #12014
Closes #12031
* github.com:scylladb/scylladb:
cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
boost/restrictions-test: uncomment part of the test that passes now
cql-pytest: enable test for filtering combined multi column and regular column restrictions
cql3: don't ignore other restrictions when a multi column restriction is present during filtering
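The control-flow fix can be sketched as follows. This is an illustrative model with restrictions as plain predicates, not Scylla's actual selection.cc code; the function names are hypothetical:

```cpp
#include <functional>
#include <vector>

using row = std::vector<int>;
using restriction = std::function<bool(const row&)>;

// Buggy shape: if multi-column restrictions exist, their result is returned
// immediately, silently skipping the remaining restrictions.
inline bool row_matches_buggy(const row& r,
                              const std::vector<restriction>& multi_column,
                              const std::vector<restriction>& other) {
    if (!multi_column.empty()) {
        for (const auto& res : multi_column) {
            if (!res(r)) return false;
        }
        return true; // BUG: `other` is never consulted
    }
    for (const auto& res : other) {
        if (!res(r)) return false;
    }
    return true;
}

// Fixed shape: only an unsatisfied multi-column restriction short-circuits;
// otherwise the remaining restrictions are checked too.
inline bool row_matches_fixed(const row& r,
                              const std::vector<restriction>& multi_column,
                              const std::vector<restriction>& other) {
    for (const auto& res : multi_column) {
        if (!res(r)) return false;
    }
    for (const auto& res : other) {
        if (!res(r)) return false;
    }
    return true;
}
```

With a row modeled as {ck1, regular_col}, the buggy variant accepts a row that satisfies the multi-column restriction but violates regular_col = 0, while the fixed variant filters it out.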
The table UUIDs are the same on all shards
so we might as well get them on shard 0
(as we already do) and reuse them on other shards.
It is more efficient and accurate to eventually look up the table
on the shard using its uuid rather than
its name. If the table was dropped and recreated
using the same name in the background, the new
table will have a new uuid and so the api function
does not apply to it anymore.
A following change will handle the no_such_column_family
cases.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
In issue #12014 a user has encountered an instance of #6200.
When filtering a WHERE clause which contained
both multi-column and regular restrictions,
the regular restrictions were ignored.
Add a test which reproduces the issue
using a reproducer provided by the user.
This problem is tested in another similar test,
but this one reproduces the issue in the exact
way it was found by the user.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
A part of the test was commented out due to #6200.
Now #6200 has been fixed and it can be uncommented.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The test test_multi_column_restrictions_and_filtering was marked as xfail,
because issue #6200 wasn't fixed. Now that filtering
multi column and other restrictions together has been fixed
the test passes.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
When filtering with a multi column restriction present, all other restrictions were ignored.
So a query like:
`SELECT * FROM WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.
This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)
When multi column restrictions were detected,
the code checked if they are satisfied and returned immediately.
This is fixed by returning only when these restrictions
are not satisfied. When they are satisfied the other
restrictions are checked as well to ensure all
of them are satisfied.
This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi column
and regular columns and this approach was correct.
Fixes: #6200
Fixes: #12014
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
The currently used "<unknown>" marker for invalid values/types is
indistinguishable from a normal value in some cases. Use the much more
distinct and unique JSON null instead.
Instead of trying to be clever and switching the output on the type of
collection, use the same format always: a list of objects, where the
object has key and value attributes, corresponding to the respective
collection item key and value. This makes processing much easier for
machines (and humans too since the previous system wasn't working well).
There are several slightly different cell types in scylla: regular
cells, collection cells (frozen and non-frozen) and counter cells
(update and shards). In C++ code the type of the cell is always
available for code wishing to make out exactly what kind of cell a cell
is. In the JSON output of the dump-data this is currently really hard to
do as there is not enough information to disambiguate all the different
cell types. We wish to make the JSON output self-sufficient so in this
patch we introduce a "type" field which contains one of:
* regular
* counter-update
* counter-shards
* frozen-collection
* collection
Furthermore, we bring the different types closer by also printing the
counter shards under the 'value' key, not under the 'shards' key as
before. The separate 'shards' is no longer needed to disambiguate.
The documentation and the write operation is also updated to reflect the
changes.
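An illustrative shape of the resulting JSON, built from the type names listed above; any field names beyond "type" and "value" (and the sample data) are assumptions for illustration:

```python
# Illustrative cell dumps in the self-describing format described above.
regular_cell = {"type": "regular", "value": "42"}

counter_cell = {
    "type": "counter-shards",
    # shards are now printed under "value", not a separate "shards" key
    "value": [{"id": "a3e1", "value": 7, "clock": 12}],
}

collection_cell = {
    "type": "collection",
    # always a list of {key, value} objects, regardless of collection kind
    "value": [{"key": "k1", "value": "v1"}, {"key": "k2", "value": "v2"}],
}
```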
Some tests may take longer than a few seconds to run. We want to
mark such tests in some way, so that we can run them selectively.
This patch proposes to use pytest markers for this. The markers
from the test.py command line are passed to pytest
as is via the -m parameter.
By default, the marker filter is not applied and all tests
will be run without exception. To exclude e.g. slow tests
you can write --markers 'not slow'.
The --markers parameter is currently only supported
by Python tests, other tests ignore it. We intend to
support this parameter for other types of tests in the future.
Another possible improvement is not to run suites for which
all tests have been filtered out by markers. The markers are
currently handled by pytest, which means that the logic in
test.py (e.g., running a scylla test cluster) will be run
for such suites.
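Pytest's `-m` option evaluates a boolean expression over each test's marker names. A rough model of that selection logic (this is not pytest's actual implementation, just a sketch of the idea):

```python
# Rough model of how a '-m' marker expression selects tests: evaluate the
# expression with the test's marker names bound to True/False.

def matches(marker_expr, test_markers):
    names = {name: (name in test_markers)
             for name in marker_expr.replace("(", " ").replace(")", " ").split()
             if name not in ("and", "or", "not")}
    return bool(eval(marker_expr, {"__builtins__": {}}, names))

tests = {
    "test_quick": set(),
    "test_full_scan": {"slow"},
}
# --markers 'not slow' keeps only the tests without the slow marker
selected = [t for t, marks in tests.items() if matches("not slow", marks)]
```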
Closes #11713
This reverts commit 22f13e7ca3, and reinstates
commit df8e1da8b2 ("Merge 'cql3: select_statement:
coroutinize indexed_table_select_statement::do_execute_base_query()' from
Avi Kivity"). The original commit was reverted due to failures in debug
mode on aarch64, but after commit 224a2877b9
("build: disable -Og in debug mode to avoid coroutine asan breakage"), it
works again.
Closes #12021
We plan to stop storing IP addresses in Raft configuration, and instead
use the information disseminated through gossip to locate Raft peers.
Implement patches that are building up to that:
* improve Raft API of configuration change notifications
* disseminate raft host id in Gossip
* avoid using Raft addresses from Raft configuration, and instead
consistently use the translation layer between raft server id <-> IP
address
Closes #11953
* github.com:scylladb/scylladb:
raft: persist the initial raft address map
raft: (upgrade) do not use IP addresses from Raft config
raft: (and gossip) begin gossiping raft server ids
raft: change the API of conf change notifications
The lister accepts a filter of sorts -- what kind of entries to list: regular, directories, or both. It currently uses unordered_set, but enum_set is shorter and better describes the intent.
Closes #12017
* github.com:scylladb/scylladb:
lister: Make lister::dir_entry_types an enum_set
database: Avoid useless local variable
The semaphore should admit readers as soon as it can. So at any point in
time there should be either no waiters, or the semaphore shouldn't be
able to admit new reads. Otherwise something went wrong. Detect this
when queuing up reads and dump the diagnostics if detected.
Even though tests should ensure this should never happen, recently we've
seen a race between eviction and enqueuing producing such situations.
This is very hard to write tests for, so add built-in detection and
protection instead. Detecting this is very cheap anyway.
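The invariant being checked can be sketched in a few lines. This is a minimal model with invented names, not the actual reader-concurrency semaphore:

```python
# Sketch of the invariant: at any point there are either no waiters, or
# the semaphore cannot admit new reads. Queuing a read while admission is
# possible and the queue is empty indicates broken bookkeeping.

class ReadSemaphore:
    def __init__(self, count):
        self.count = count
        self.waiters = []

    def can_admit(self):
        return self.count > 0

    def enqueue(self, read):
        # built-in detection: this should have been admitted, not queued
        assert not (self.can_admit() and not self.waiters), \
            "semaphore should have admitted this read (dump diagnostics)"
        self.waiters.append(read)
```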
Said method has a protection against concurrent (recursive more like)
calls to itself, by setting a flag `_evicting` and returning early if
this flag is set. The evicting loop, however, has at least one preemption
point between deciding there is nothing more to evict and resetting said
flag. This window provides an opportunity for new inactive reads or waiters
to be queued without this loop noticing, while any other concurrent
invocations at that time are denied from reacting too.
Eliminate this by using repeat() instead of do_until() and setting
`_evicting = false` the moment the loop's run condition becomes false.
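A synchronous sketch of the fixed shape (the real code is a Seastar fiber; in this plain-Python model there is no preemption, so the comments mark where the original could yield):

```python
# Sketch: clear _evicting at the exact point the loop decides to stop,
# leaving no gap (preemption point in the async original) during which
# concurrent invocations are turned away while new work slips in.

class Evictor:
    def __init__(self, queue):
        self.queue = queue
        self._evicting = False
        self.evicted = []

    def evict_loop(self):
        if self._evicting:      # reject recursive/concurrent invocation
            return
        self._evicting = True
        while True:
            if not self.queue:
                # reset the flag the moment the run condition turns false
                self._evicting = False
                return
            # (in the original, eviction itself may yield here)
            self.evicted.append(self.queue.pop(0))
```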
Currently this method detaches the inactive read from the handle and
notifies the permit, calls the notify handler if any and does some stat
bookkeeping. Extend it to do a complete detach: unlink the entry from
the inactive reads list and also cancel the ttl timer.
After this, all that is left to the caller is to destroy the entry.
This will prevent any recursive eviction from causing assertion failure.
Although recursive eviction shouldn't happen, it shouldn't trigger an
assert.
Since commit a980f94 (token_metadata: impl: keep the set of normal token owners as a member), we have a set, _normal_token_owners, which contains all the nodes in the ring.
We can use _normal_token_owners to check if a node is part of the ring directly instead of going through the _topology indirectly.
Fixes #11935
Closes #11936
* github.com:scylladb/scylladb:
token_metadata: Rename is_member to is_normal_token_owner
token_metadata: Add docs for is_member
token_metadata: Do not use topology info for is_member check
token_metadata: Check node is part of the topology instead of the ring
Since commit a980f94 (token_metadata: impl: keep the set of normal token
owners as a member), we have a set, _normal_token_owners, which contains
all the nodes in the ring.
We can use _normal_token_owners to check if a node is part of the ring
directly instead of going through the _topology indirectly.
Fixes #11935
update_normal_tokens is the way to add a new node into the ring. We
should not require a new node to already be in the ring to be able to
add it to the ring. The current code works accidentally because
is_member checks whether a node is in the topology.
We should use _topology.has_endpoint to check if a node is part of the
topology explicitly.
In Scylla and Cassandra, inserting an empty non-frozen
collection is interpreted as inserting a null value.
list_prepare_expression and set_prepare_expression
have an if which handles this behavior, but there
wasn't one in map_prepare_expression.
As a result, preparing an empty list or set would result in null,
but preparing an empty map wouldn't. This is inconsistent;
it's better to return null in all cases of empty nonfrozen
collections.
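The now-uniform rule can be sketched as a single check applied to all three collection kinds (names here are illustrative, not the actual prepare-expression code):

```python
# Sketch of the consistent behavior: an empty non-frozen collection of
# any kind (list, set, or map) prepares to null (None here); frozen
# collections keep their empty value.

def prepare_collection(kind, items, frozen):
    if not frozen and not items:
        return None  # empty non-frozen collection -> null
    return (kind, items)
```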
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
map_prepare_expression takes a collection_constructor
of unprepared items and prepares it.
Elements of a map collection_constructor are tuples (key and value).
map_prepare_expression creates a prepared collection_constructor
by preparing each tuple and adding it to the result.
During this preparation it needs to set the type of the tuple.
There was a bug here - it took the type from unprepared
tuple_constructor and assigned it to the prepared one.
An unprepared tuple_constructor doesn't have a type
so it ended up assigning nullptr.
Instead of that it should create a tuple_type_impl instance
by looking at the types of map key and values,
and use this tuple_type_impl as the type of the prepared tuples.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
prepare_expression treats receiver as an optional argument,
it can be set to nullptr and the preparation should
still succeed when it's possible to infer the type of an expression.
preparing a bind_variable requires the receiver to be present,
because it doesn't contain any information about the type
of the bound value.
Added a check that the receiver is present.
Allowing to prepare a bind_variable without
the receiver present was a bug.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
task_manager::task::impl contains an abort source which can
be used to check whether it is aborted and an abort method
which aborts the task (request_abort on abort_source) and all
its descendants recursively.
When the start method is called after the task was aborted,
then its state is set to failed and the task does not run.
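A compact model of that behavior (invented class, mirroring the description rather than the actual task_manager code):

```python
# Sketch: abort() propagates to descendants recursively; start() on an
# already-aborted task marks it failed and never runs it.

class Task:
    def __init__(self):
        self.abort_requested = False
        self.state = "created"
        self.children = []

    def abort(self):
        self.abort_requested = True
        for child in self.children:
            child.abort()          # recursive abort of descendants

    def start(self, fn):
        if self.abort_requested:
            self.state = "failed"  # do not run an already-aborted task
            return
        fn()
        self.state = "done"
```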
Fixes: #11995
Closes #11996
* github.com:scylladb/scylladb:
tasks: do not run tasks that are aborted
tasks: delete unused variable
tasks: add abort_source to task_manager::task::impl
`get_rpc_client` calculates a `topology_ignored` field when creating a
client which says whether the client's endpoint had topology information
when this client was created. This is later used to check if that client
needs to be dropped and replaced with a new client which uses the
correct topology information.
The `topology_ignored` field was incorrectly calculated as `true` for
pending endpoints even though we had topology information for them. This
would lead to unnecessary drops of RPC clients later. Fix this.
Remove the default parameter for `with_pending` from
`topology::has_endpoint` to avoid similar bugs in the future.
Apparently this fixes #11780. The verbs used by decommission operation
use RPC client index 1 (see `do_get_rpc_client_idx` in
message/messaging_service.cc). From local testing with additional
logging I found that by the time this client is created (i.e. the first
verb in this group is used), we already know the topology. The node is
pending at that point - hence the bug would cause us to assume we don't
know the topology, leading us to dropping the RPC client later, possibly
in the middle of a decommission operation.
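The essence of the fix, as a sketch with illustrative names (not the actual messaging_service code):

```python
# Sketch: when recording whether topology information was available for
# a new RPC client, pending endpoints must count as known. Before the
# fix, pending peers were wrongly recorded as topology-ignored.

def topology_ignored(known_endpoints, pending_endpoints, peer):
    return peer not in known_endpoints and peer not in pending_endpoints
```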
Fixes: #11780
Closes #11942
* github.com:scylladb/scylladb:
message: messaging_service: check for known topology before calling is_same_dc/rack
test: reenable test_topology::test_decommission_node_add_column
test/pylib: util: configurable period in wait_for
message: messaging_service: fix topology_ignored for pending endpoints in get_rpc_client
message: messaging_service: topology independent connection settings for GOSSIP verbs
It's interesting that prepare_expression
for column identifiers doesn't require a receiver.
I hope this won't break validation in the future.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Add a function which creates a mock instance
of data_dictionary::database.
prepare_expression requires a data_dictionary::database
as an argument, so unit tests for it need something
to pass there. make_data_dictionary_database can
be used to create an instance that is sufficient for tests.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
This type is currently an unordered_set, but only consists of at most
two elements. Making it an enum_set renders it into a size_t variable
and better describes the intention.
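Python's enum.Flag is a close analogue of the change (the C++ code uses Scylla's enum_set; this is just an illustration of the flag-set idea):

```python
# A set of at most two enum values expressed as a flag/bitmask type:
# it fits in a single integer and names the intent directly.
from enum import Flag, auto

class DirEntryTypes(Flag):
    NONE = 0
    REGULAR = auto()
    DIRECTORY = auto()

wanted = DirEntryTypes.REGULAR | DirEntryTypes.DIRECTORY
```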
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's used to run lister::scan_dir() with directory_entry_type::directory
only, but for that it is copied around in lambda captures. It's simpler
just to use the value directly.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Since recently the framework uses a separate set of unique IDs to
identify servers, but the log file and workdir is still named using the
last part of the IP address.
This is confusing: the test logs sometimes don't provide the IP addr
(only the ID), and even if they do, the reader of the test log may not
know that they need to look at the last part of the IP to find the
node's log/workdir.
Also using ID will be necessary if we want to reuse IP addresses (e.g.
during node replace, or simply not to run out of IP addresses during
testing).
As described in #11993, per-shard repair_info instances get the effective_replication_map on their own with no centralized synchronization.
This series ensures that the effective replication maps used by repair (and other associated structures like the token metadata and topology) are all in sync with the one used to initiate the repair operation.
While at it, the series includes other cleanups in this area in repair and view that are not fixes, as the calls happen in synchronous functions that do not yield.
Fixes #11993
Closes #11994
* github.com:scylladb/scylladb:
repair: pass erm down to get_hosts_participating_in_repair and get_neighbors
repair: pass effective_replication_map down to repair_info
repair: coroutinize sync_data_using_repair
repair: futurize do_repair_start
effective_replication_map: add global_effective_replication_map
shared_token_metadata: get_lock is const
repair: sync_data_using_repair: require to run on shard 0
repair: require all node operations to be called on shard 0
repair: repair_info: keep effective_replication_map
repair: do_repair_start: use keyspace erm to get keyspace local ranges
repair: do_repair_start: use keyspace erm for get_primary_ranges
repair: do_repair_start: use keyspace erm for get_primary_ranges_within_dc
repair: do_repair_start: check_in_shutdown first
repair: get_db().local() where needed
repair: get topology from erm/token_metdata_ptr
view: get_view_natural_endpoint: get topology from erm
Always use raft address map to obtain the IP addresses
of upgrade peers. Right now the map is populated
from Raft configuration, so it's an equivalent transformation,
but in the future raft address map will be populated from other sources:
discovery and gossip, hence the logic of upgrade will change as well.
Do not proceed with the upgrade if an address is
missing from the map, since it means we failed to contact a raft member.
This series moves the topology code from locator/token_metadata.{cc,hh} out to locator/topology.{cc,hh}
and introduces a shared header file: locator/types.hh contains shared, low level definitions, in anticipation of https://github.com/scylladb/scylladb/pull/11987
While at it, the token_metadata functions are turned into coroutines
and topology copy constructor is deleted. The copy functionality is moved into an async `clone_gently` function that allows yielding while copying the topology.
Closes #12001
* github.com:scylladb/scylladb:
locator: refactor topology out of token_metadata
locator: add types.hh
topology: delete copy constructor
token_metadata: coroutinize clone functions
compaction_manager::task (and thus compaction_data) can be stopped
because of many different reasons. Thus, abort can be requested more
than once on compaction_data abort source causing a crash.
To prevent this before each request_abort() we check whether an abort
was requested before.
Closes #12004
An unordered_set is more efficient and there is no need
to return an ordered set for this purpose.
This change facilitates a follow-up change of adding
topology::get_datacenters(), returning an unordered_set
of datacenter names.
Refs #11987
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #12003
Fix a typo introduced by the recent patch fixing the spelling of
Barrett. The patch introduced a typo in the aarch64 version of the code,
which wasn't found by promotion, as that only builds on x86_64.
Closes #12006
We plan to use gossip data to educate Raft RPC about IP addresses
of raft peers. Add raft server ids to application state, so
that when we get a notification about a gossip peer we can
identify which raft server id this notification is for,
specifically, we can find what IP address stands for this server
id, and, whenever the IP address changes, we can update Raft
address map with the new address.
By the same token, at boot time, we now have to start Gossip
before Raft, since Raft won't be able to send any messages
without gossip data about IP addresses.
Pass a change diff into the notification callback,
rather than add or remove servers one by one, so that
if we need to persist the state, we can do it once per
configuration change, not for every added or removed server.
For now still pass added and removed entries in two separate calls
per a single configuration change. This is done mainly to fulfill the
library contract that it never sends messages to servers
outside the current configuration. The group0 RPC
implementation doesn't need the two calls, since it simply
marks the removed servers as expired: they are not removed immediately
anyway, and messages can still be delivered to them.
However, there may be test/mock implementations of RPC which
could benefit from this contract, so we decided to keep it.
And make sure the token_metadata ring version is the same as the
reference one (from the erm on shard 0), when starting the
repair on each shard.
Refs #11993
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Turn it into a coroutine to prepare for the next patch
that will co_await make_global_effective_replication_map.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Class to hold a coherent view of a keyspace
effective replication map on all shards.
To be used in a following patch to pass the sharded
keyspace e_r_m:s to repair.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that our toolchain is based on Fedora 37, we can rely on its
libdeflate rather than have to carry our own in a submodule.
Frozen toolchain is regenerated. As a side effect clang is updated
from 15.0.0 to 15.0.4.
Closes #12000
And with that do_sync_data_using_repair can be folded into
sync_data_using_repair.
This will simplify using the effective_replication_map
throughout the operation.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than calling db.get_keyspace_local_ranges that
looks up the keyspace and its erm again.
We want all the information derived from the erm to
be based on the same source.
The function is synchronous so this change doesn't
fix anything, just cleans up the code.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Ensure that the primary ranges are in sync with the
keyspace erm.
The function is synchronous so this change doesn't fix anything,
it just cleans up the code.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Ensure the erm and topology are in sync.
The function is synchronous so this change doesn't fix
anything, just cleans up the code.
Fix mistake in comment while at it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
In several places we get the sharded database using get_db()
and then we only use db.local(). Simplify the code by keeping
reference only to the local database upfront.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We want the topology to be synchronized with the respective
effective_replication_map / token_metadata.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Get the topology from the effective replication map rather
than from the storage_proxy to ensure it's synchronized
with the natural endpoints.
Since there's no preemption between the two calls
currently there is no issue, so this is merely a clean up
of the code and not supposed to fix anything.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This patch adds a reproducer for issue #11954: Attempting an
"IF NOT EXISTS" (LWT) write with a null key crashes Scylla,
instead of producing a simple error message (like happens
without the "IF NOT EXISTS" after #7852 was fixed).
The test passed on Cassandra, but crashes Scylla. Because of this
crash, we can't just mark the test "xfail" and it's temporarily
marked "skip" instead.
Refs #11954.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #11982
When `modify_config` or `add_entry` is forwarded to the leader, it may
reach the node at "inappropriate" time and result in an exception. There
are two reasons for it - the leader is changing and, in case of
`modify_config`, other `modify_config` is currently in progress. In both
cases the command is retried, but before this patch there was no delay
before retrying, which could lead to a tight loop.
The patch adds a new exception type `transient_error`. When the client
receives it, it is obliged to retry the request after some delay.
Previously leader-side exceptions were converted to `not_a_leader`,
which is strange, especially for `conf_change_in_progress`.
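The client-side contract can be sketched as a retry loop with a mandatory pause; names here are illustrative, not the actual Raft client code:

```python
# Sketch: on transient_error, retry after a delay instead of
# immediately, avoiding the tight retry loop described above.
import time

class TransientError(Exception):
    pass

def with_retries(op, delay=0.01, max_attempts=5, sleep=time.sleep):
    for _ in range(max_attempts):
        try:
            return op()
        except TransientError:
            sleep(delay)  # mandatory pause before retrying
    raise TransientError("giving up")
```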
Fixes: #11564
Closes #11769
* github.com:scylladb/scylladb:
raft: refactor: remove duplicate code on retries delays
raft: use wait_for_next_tick in read_barrier
raft: wait for the next tick before retrying
Currently in start() method a task is run even if it was already
aborted.
When start() is called on an aborted task, its state is set to
task_manager::task_state::failed and it doesn't run.
As P. T. Barnum famously said, "write what you like but spell my name
correctly". Following that, we correct the spelling of Barrett's name
in the source tree.
Closes #11989
Topology is copied only from token_metadata_impl::clone_only_token_map
which copies the token_metadata_impl with yielding to prevent reactor
stalls. This should apply to topology as well, so
add a clone_gently function for cloning the topology
from token_metadata_impl::clone_only_token_map.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
`is_same_dc` and `is_same_rack` assume that the peer's topology is
known. If it's unknown, `on_internal_error` will be called inside
topology.
When these functions are used in `get_rpc_client`, they are already
protected by an earlier check for knowing the peer's topology
(the `has_topology()` lambda).
Another use is in `do_start_listen()`, where we create a filter for RPC
module to check if it should accept incoming connections. If cross-dc or
cross-rack encryption is enabled, we will reject connections attempts to
the regular (non-ssl) port from other dcs/rack using `is_same_dc/rack`.
However, it might happen that something (other Scylla node or otherwise)
tries to contact us on the regular port and we don't know that thing's
topology, which would result in `on_internal_error`. But this is not a
fatal error; we simply want to reject that connection. So protect these
calls as well.
Finally, there's `get_preferred_ip` with an unprotected `is_same_dc`
call which, for a given peer, may return a different IP from preferred IP
cache if the endpoint resides in the same DC. If there is no entry in
the preferred IP cache, we return the original (external) IP of the
peer. We can do the same if we don't know the peer's topology. It's
interesting that we didn't see this particular place blowing up. Perhaps
the preferred IP cache is always populated after we know the topology.
Also improve the test to increase the probability of reproducing #11780
by injecting sleeps in appropriate places.
Without the fix for #11780 from the earlier commit, the test reproduces
the issue in roughly half of all runs in dev build on my laptop.
`get_rpc_client` calculates a `topology_ignored` field when creating a
client which says whether the client's endpoint had topology information
when this client was created. This is later used to check if that client
needs to be dropped and replaced with a new client which uses the
correct topology information.
The `topology_ignored` field was incorrectly calculated as `true` for
pending endpoints even though we had topology information for them. This
would lead to unnecessary drops of RPC clients later. Fix this.
Remove the default parameter for `with_pending` from
`topology::has_endpoint` to avoid similar bugs in the future.
Apparently this fixes #11780. The verbs used by decommission operation
use RPC client index 1 (see `do_get_rpc_client_idx` in
message/messaging_service.cc). From local testing with additional
logging I found that by the time this client is created (i.e. the first
verb in this group is used), we already know the topology. The node is
pending at that point - hence the bug would cause us to assume we don't
know the topology, leading us to dropping the RPC client later, possibly
in the middle of a decommission operation.
Fixes: #11780
The gossip verbs are used to learn about topology of other nodes.
If inter-dc/rack encryption is enabled, the knowledge of topology is
necessary to decide whether it's safe to send unencrypted messages to
nodes (i.e., whether the destination lies in the same dc/rack).
The logic in `messaging_service::get_rpc_client`, which decided whether
a connection must be encrypted, was this (given that encryption is
enabled): if the topology of the peer is known, and the peer is in the
same dc/rack, don't encrypt. Otherwise encrypt.
However, it may happen that node A knows node B's topology, but B
doesn't know A's topology. A deduces that B is in the same DC and rack
and tries sending B an unencrypted message. As the code currently
stands, this would cause B to call `on_internal_error`. This is what I
encountered when attempting to fix #11780.
To guarantee that it's always possible to deliver gossiper verbs (even
if one or both sides don't know each other's topology), and to simplify
reasoning about the system in general, choose connection settings that
are independent of the topology - for the connection used by gossiper
verbs (other connections are still topology-dependent and use complex
logic to handle the situation of unknown-and-later-known topology).
This connection only contains 'rare' and 'cheap' verbs, so it's not a
performance problem to always encrypt it (given that encryption is
configured). And this is what already was happening in the past; it was
at some point removed during topology knowledge management refactors. We
just bring this logic back.
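The resulting policy can be sketched as a decision function; the names are illustrative, and the dc/rack comparison is simplified to a single field:

```python
# Sketch of the policy: the gossip connection's encryption setting is
# topology-independent; other connections still apply the same-dc/rack
# logic, treating unknown topology conservatively.

def must_encrypt(conn_kind, encryption_enabled, peer_topology, local_topology):
    if not encryption_enabled:
        return False
    if conn_kind == "gossip":
        return True   # topology-independent: always encrypt
    if peer_topology is None:
        return True   # unknown topology: play it safe
    return peer_topology != local_topology  # cross-dc/rack only
```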
Fixes #11992.
Inspired by xemul/scylla@45d48f3d02.
When we write to a materialized view, we need to know some information
defined in the base table such as the columns in its schema. We have
a "view_info" object that tracks each view and its base.
This view_info object has a couple of mutable attributes which are
used to lazily-calculate and cache the SELECT statement needed to
read from the base table. If the base-table schema ever changes -
and the code calls set_base_info() at that point - we need to forget
this cached statement. If we don't (as before this patch), the SELECT
will use the wrong schema and writes will no longer work.
This patch also includes a reproducing test that failed before this
patch, and passes afterwards. The test creates a base table with a
view that has a non-trivial SELECT (it has a filter on one of the
base-regular columns), makes a benign modification to the base table
(just a silly addition of a comment), and then tries to write to the
view - and before this patch it fails.
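The caching bug and its fix can be modeled in a few lines; the class and field names are illustrative stand-ins for view_info:

```python
# Sketch: the lazily-built base SELECT is cached; set_base_info() must
# drop the cache so a changed base schema cannot leak into later writes.

class ViewInfo:
    def __init__(self, base_schema_version):
        self._base = base_schema_version
        self._cached_select = None

    def set_base_info(self, new_base_schema_version):
        self._base = new_base_schema_version
        self._cached_select = None  # forget the stale cached statement

    def select_statement(self):
        if self._cached_select is None:
            self._cached_select = f"SELECT FROM base (schema v{self._base})"
        return self._cached_select
```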
Fixes #10026
Fixes #11542
This PR is V2 of the [PR created by @psarna](https://github.com/scylladb/scylladb/pull/11560).
I have:
- copied the content.
- applied the suggestions left by @nyh.
- made minor improvements, such as replacing "Scylla" with "ScyllaDB", fixing punctuation, and fixing the RST syntax.
Fixes https://github.com/scylladb/scylladb/issues/11378
Closes #11984
* github.com:scylladb/scylladb:
doc: label user-defined functions as Experimental
doc: restore the note for the Count function (removed by mistake)
doc: document user defined functions (UDFs)
Although the docs discourage using INADDR_ANY as the listen address, this is not disabled in code. Worse -- some snitch drivers may gossip it around as the INTERNAL_IP state. This patch set prevents that from happening and also adds a sanity check not to use this value if it somehow sneaks in.
Closes #11846
* github.com:scylladb/scylladb:
messaging_service: Deny putting INADDR_ANY as preferred ip
messaging_service: Toss preferred ip cache management
gossiping_property_file_snitch: Dont gossip INADDR_ANY preferred IP
gossiping_property_file_snitch: Make _listen_address optional
When we translate from docker/go arch names to the kernel arch
names, we use an associative array hack using computed variable
names "{$!variable_name}". But it turns out bash has real
associative arrays, introduced with "declare -A". Use them to make
the code a little clearer.
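The mapping itself, expressed here as a plain Python dictionary for illustration (the commit uses a bash "declare -A" associative array; the exact set of entries in the script may differ):

```python
# docker/go arch names -> kernel arch names, as a plain lookup table
DOCKER_TO_KERNEL_ARCH = {
    "amd64": "x86_64",
    "arm64": "aarch64",
}
```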
Closes #11985
Fix https://github.com/scylladb/scylla-docs/issues/4144
Closes #11226
* github.com:scylladb/scylladb:
Update docs/getting-started/system-requirements.rst
doc: specify the recommended AWS instance types
doc: replace the tables with a generic description of support for Im4gn and Is4gen instances
doc: add support for AWS i4g instances
doc: extend the list of supported CPUs
As indicated in #11816, we'd like to enable deserializing vectors in reverse.
The forward deserialization is achieved by reading from an input_stream. The
input stream internally is a singly linked list with complicated logic. In order to
allow going through it in reverse, when creating the reverse vector
deserializer we scan the stream and store substreams for all the places that are a
starting point for the next element. The iterator itself just deserializes elements
from the remembered substreams, this time in reverse.
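The approach can be sketched with a toy serialization format (this is an invented format purely to illustrate the forward-scan-then-reverse-iterate idea, not Scylla's serializer):

```python
# Sketch: one forward pass records where each element starts; the
# reverse iterator then decodes from those saved positions.

def reverse_elements(buf, decode, size_of):
    offsets = []
    pos = 0
    while pos < len(buf):
        offsets.append(pos)           # remember each element's start
        pos += size_of(buf, pos)
    for off in reversed(offsets):     # decode in reverse order
        yield decode(buf, off)

# toy format: 1-byte length prefix followed by that many payload bytes
size_of = lambda b, p: 1 + b[p]
decode = lambda b, p: bytes(b[p + 1:p + 1 + b[p]])
```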
Fixes #11816
Closes #11956
* github.com:scylladb/scylladb:
test/boost/serialization_test.cc: add test for reverse vector deserializer
serializer_impl.hh: add reverse vector serializer
serializer_impl: remove unneeded generic parameter
As noted in issue #11979, Scylla inconsistently (and unlike Cassandra)
requires "IS NOT NULL" on some but not all materialized-view key
columns. Specifically, Scylla does not require "IS NOT NULL" on the
base's partition key, while Cassandra does.
This patch is a test which demonstrates this inconsistency. It currently
passes on Cassandra and fails on Scylla, so is marked xfail.
Refs #11979
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #11980
The get_live_token_owners returns the nodes that are part of the ring
and live.
The get_unreachable_token_owners returns the nodes that are part of the ring
and are not alive.
The token_metadata::get_all_endpoints returns nodes that are part of the
ring.
The patch changes both functions to use the more authoritative source to
get the nodes that are part of the ring and call is_alive to check if
the node is up or down, so that the correctness does not depend on
any derived information.
This patch fixes a truncate issue in storage_proxy::truncate_blocking
where it calls get_live_token_owners and get_unreachable_token_owners to
decide the nodes to talk with for truncate operation. The truncate
failed because incorrect nodes were returned.
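The change boils down to deriving both sets from the single authoritative ring membership, split by liveness; the function names here are illustrative:

```python
# Sketch: derive live and unreachable token owners from the one
# authoritative ring membership set, instead of separately maintained
# derived state that can go stale.

def split_token_owners(ring_members, is_alive):
    live = {node for node in ring_members if is_alive(node)}
    unreachable = set(ring_members) - live
    return live, unreachable
```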
Fixes #10296
Fixes #11928
Closes #11952
This PR is a follow-up to https://github.com/scylladb/scylladb/pull/11918.
With this PR:
- The "ScyllaDB Enterprise" label is added to all the features that are only available in ScyllaDB Enterprise.
- The previous Enterprise-only note is removed (it was included in multiple files as _/rst_include/enterprise-only-note.rst_ - this file is removed as it is no longer used anywhere in the docs).
- "Scylla Enterprise" was removed from `versionadded` because now it's clear that the feature was added for Enterprise.
Closes #11975
* github.com:scylladb/scylladb:
doc: remove the enterprise-only-note.rst file, which was replaced by the ScyllaDB Enterprise label and is not used anymore
doc: add the ScyllaDB Enterprise label to the descriptions of Enterprise-only features
The helper is already widely used, one (last) test case can benefit from using it too
Closes #11978
* github.com:scylladb/scylladb:
test: Indentation fix after previous patch
test: Use with_sstable_directory() helper
We use Barrett tables (misspelled in the code unfortunately) to fold
crc computations of multiple buffers into a single crc. This is important
because it turns out to be faster to compute crc of three different buffers
in parallel rather than compute the crc of one large buffer, since the crc
instruction has latency 3.
Currently, we have a separate code generation step to compute the
fold tables. The step generates a new C++ source files with the tables.
But modern C++ allows us to do this computation at compile time, avoiding
the code generation step. This simplifies the build.
This series does that. There is some complication in that the code uses
compiler intrinsics for the computation, and these are not constexpr friendly.
So we first introduce constexpr-friendly alternatives and use them.
To prove the transformation is correct, I compared the generated code from
before the series and from just before the last step (where we use constexpr
evaluation but still retain the generated file) and saw no difference in the values.
Note that constexpr is not strictly needed - we could have run the code in the
global variables' initializer. But that would cause a crash if we run on a pre-clmul
machine, and is not as fun.
Closes #11957
* github.com:scylladb/scylladb:
test: crc: add unit tests for constexpr clmul and barrett fold
utils: crc combine table: generate at compile time
utils: barrett: inline functions in header
utils: crc combine table: generate tables at compile time
utils: crc combine table: extract table generation into a constexpr function
utils: crc combine table: extract "pow table" code into constexpr function
utils: crc combine table: store tables in std::array rather than C array
utils: barrett: make the barrett reduction constexpr friendly
utils: clmul: add 64-bit constexpr clmul
utils: barrett: extract barrett reduction constants
utils: barrett: reorder functions
utils: make clmul() constexpr
Introduce a templated function do_on_leader_with_retries,
use it in add_entries/modify_config/read_barrier. The
function implements the basic logic of retries with aborts
and leader changes handling, adds a delay between
iterations to protect against tight loops.
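A minimal synchronous sketch of this retry shape follows. The real do_on_leader_with_retries is a Seastar coroutine with abort-source handling; `transient_error` here mirrors the exception introduced by the series, but the callable parameters and the fixed 10ms delay are illustrative stand-ins.

```cpp
#include <chrono>
#include <stdexcept>
#include <thread>

// Thrown by the leader-side call when the client should retry,
// possibly after a delay (see the patch description above).
struct transient_error : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Retry op() until it succeeds. Delay between attempts only if we have
// not learned about a new leader since the last attempt, to avoid a
// tight retry loop.
template <typename Op, typename GetLeader>
auto do_on_leader_with_retries(Op op, GetLeader current_leader) {
    auto last_leader = current_leader();
    while (true) {
        try {
            return op();
        } catch (const transient_error&) {
            auto leader = current_leader();
            if (leader == last_leader) {
                // no new leader information: back off before retrying
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
            }
            last_leader = leader;
        }
    }
}
```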
Replaced the yield on transport_error
with wait_for_next_tick. Added delays for retries, similar
to add_entry/modify_config: we postpone the next
call attempt if we haven't received new information
about the current leader.
When modify_config or add_entry is forwarded
to the leader, it may reach the node at
"inappropriate" time and result in an exception.
There are two reasons for it - the leader is
changing and, in case of modify_config, other
modify_config is currently in progress. In
both cases the command is retried, but before
this patch there was no delay before retrying,
which could lead to a tight loop.
The patch adds a new exception type transient_error.
When the client node receives it, it is obliged to retry
the request, possibly after some delay. Previously, leader-side
exceptions were converted to not_a_leader exception,
which is strange, especially for conf_change_in_progress.
We add a delay before retrying in modify_config
and add_entry if the client hasn't received any new
information about the leader since the last attempt.
This can happen if the server
responds with a transient_error with an empty leader
and the current node has not yet learned the new leader.
We accept a possibly excessive delay if the newly elected leader
is the same as the previous one; this is expected to be rare.
Fixes: #11564
It's already used everywhere, but one test case wires up the
sstable_directory by hand. Fix it too, but keep in mind that the
calling function stops the directory early.
(indentation is deliberately left broken)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently when we want to deserialize mutation in reverse, we unfreeze
it and consume from the end. This new reverse vector deserializer
goes through input stream remembering substreams that contain a given
output range member, and while traversing from the back, deserialize
each substream.
Currently, when replacing a node ip, keeping the old host,
we might end up with the old endpoint in system.peers
if it is inserted back into the topology by `handle_state_normal`
when on_join is called with the old endpoint.
Then, later on, on_change sees that:
```
if (get_token_metadata().is_member(endpoint)) {
co_await do_update_system_peers_table(endpoint, state, value);
```
As described in #11925.
Fixes #11925
Closes #11930
* github.com:scylladb/scylladb:
storage_service, system_keyspace: add debugging around system.peers update
storage_service: handle_state_normal: update topology and notify_joined endpoint only if not removed
Today, compaction_backlog_tracker is managed in each compaction_strategy
implementation. So every compaction strategy is managing its own
tracker and providing a reference to it through get_backlog_tracker().
But this prevents each group from having its own tracker, because
there's only a single compaction_strategy instance per table.
To remove this limitation, compaction_strategy impl will no longer
manage trackers but will instead provide an interface for trackers
to be created, such that each compaction_group will be allowed to
create its own tracker and manage it by itself.
Now table's backlog will be the sum of all compaction_group backlogs.
The normalization factor is applied on the sum, so we don't have
to adjust each individual backlog to any factor.
Closes #11762
* github.com:scylladb/scylladb:
replica: Allow one compaction_backlog_tracker for each compaction_group
compaction: Make compaction_state available for compaction tasks being stopped
compaction: Implement move assignment for compaction_backlog_tracker
compaction: Fix compaction_backlog_tracker move ctor
compaction: Use table_state's backlog tracker in compaction_read_monitor_generator
compaction: kill undefined get_unimplemented_backlog_tracker()
replica: Refactor table::set_compaction_strategy for multiple groups
Fix exception safety when transferring ongoing charges to new backlog tracker
replica: move_sstables_from_staging: Use tracker from group owning the SSTable
replica: Move table::backlog_tracker_adjust_charges() to compaction_group
replica: table::discard_sstables: Use compaction_group's backlog tracker
replica: Disable backlog tracker in compaction_group::stop()
replica: database_sstable_write_monitor: use compaction_group's backlog tracker
replica: Move table::do_add_sstable() to compaction_group
test/sstable_compaction_test: Switch to table_state::get_backlog_tracker()
compaction/table_state: Introduce get_backlog_tracker()
Check that the constexpr variants indeed match the runtime variants.
I verified manually that exactly one computation in each test is
executed at run time (and is compared against a constant).
By now the crc combine tables are generated at compile time,
but still in a separate code generation step. We now eliminate
the code generation step and instead link the global variables
directly into the main executable. The global variables have
been conveniently named exactly as the code generation step
names them, so we don't need to touch any users.
Move the tables into global constinit variables that are
generated at compile time. Note the code that creates
the generated crc32_combine_table.cc is still called; it
transforms compile-time generated tables into a C++ source
that contains the same values, as literals.
If we generate a diff between gen/utils/gz/crc_combine_table.cc
before this series and after this patch, we see the only change
in the file is the type of the variable (which changed to
std::array), proving our constexpr code is correct.
Move the code to a constexpr function, so we can later generate the tables at
compile time. Note that although the function is constexpr, it is still
evaluated at runtime, since the calling function (main()) isn't constexpr
itself.
A "pow table" is used to generate the Barrett fold tables. Extract its
code into a constexpr function so we can later generate the fold tables
at compile time.
C arrays cannot be returned from functions and therefore aren't suitable
for constexpr processing. std::array<> is a regular value and so is
constexpr friendly.
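The pattern described here can be shown with a tiny example (the table contents and names are illustrative, not the real crc combine tables):

```cpp
#include <array>
#include <cstdint>

// C arrays cannot be returned by value, so a constexpr table generator
// must return std::array, which is a regular value type.
constexpr std::array<uint32_t, 4> make_table() {
    std::array<uint32_t, 4> t{};
    for (uint32_t i = 0; i < 4; ++i) {
        t[i] = i * i;
    }
    return t;
}

// Computed entirely at compile time; no code-generation step needed.
constexpr auto table = make_table();
static_assert(table[3] == 9);
```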
This is used when generating the Barrett reduction tables, and also when
applying the Barrett reduction at runtime, so we need it to be constexpr
friendly.
clmul() is a pure function and so should already be constexpr,
but it uses intrinsics that aren't defined as constexpr and
so the compiler can't really compute it at compile time.
Fix by defining a constexpr variant and dispatching based
on whether we're being constant-evaluated or not.
The implementation is simple, but in any case a proof that it
is correct will be provided later on.
Today, compaction_backlog_tracker is managed in each compaction_strategy
implementation. So every compaction strategy is managing its own
tracker and providing a reference to it through get_backlog_tracker().
But this prevents each group from having its own tracker, because
there's only a single compaction_strategy instance per table.
To remove this limitation, compaction_strategy impl will no longer
manage trackers but will instead provide an interface for trackers
to be created, such that each compaction group will be allowed to
have its own tracker, which will be managed by compaction manager.
On compaction strategy change, the table will update each group with
the new tracker, which is created using the previously introduced
compaction_group_sstable_set_updater.
Now table's backlog will be the sum of all compaction_group backlogs.
The normalization factor is applied on the sum, so we don't have
to adjust each individual backlog to any factor.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
compaction_backlog_tracker will be managed by compaction_manager, in the
per table state. As compaction tasks can access the tracker throughout
its lifetime, remove() can only deregister the state once we're done
stopping all tasks which map to that state.
remove() extracted the state upfront, then performed the stop, to
prevent new tasks from being registered and left behind. But we can
avoid the leak of new tasks by only closing the gate, which waits
for all tasks (which are stopped a step earlier) and once closed,
prevents new tasks from being registered.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Luckily it's not used anywhere. The default move ctor was picked, but
it won't clear _manager of the old object, meaning that its destructor
will incorrectly deregister the tracker from
compaction_backlog_manager.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
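The bug class above can be illustrated with a self-contained sketch; `manager_t` and the member names are stand-ins for the real compaction_backlog_manager plumbing, not the actual ScyllaDB types.

```cpp
#include <utility>

// Stand-in for compaction_backlog_manager: counts registered trackers.
struct manager_t {
    int registered = 0;
    void deregister() { --registered; }
};

struct backlog_tracker {
    manager_t* _manager = nullptr;

    explicit backlog_tracker(manager_t* m) : _manager(m) { ++m->registered; }

    // The fix: clear _manager of the moved-from object, so its
    // destructor does not deregister a tracker it no longer owns.
    backlog_tracker(backlog_tracker&& o) noexcept
        : _manager(std::exchange(o._manager, nullptr)) {}

    ~backlog_tracker() {
        if (_manager) {   // moved-from objects skip deregistration
            _manager->deregister();
        }
    }
};
```

With a defaulted move ctor, both the moved-from and moved-to destructors would call deregister(), corrupting the manager's bookkeeping.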
Refactor the function so it can accommodate multiple compaction
groups.
To still provide strong exception guarantees, preparation and
execution of changes will be separated.
Once multiple groups are supported, each group will be prepared
first, and the noexcept execution will be done as a last step.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
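The prepare/commit split for strong exception safety can be sketched as follows; `group_state`, `prepared_change`, and the int "sstables" are illustrative stand-ins, not the real table types.

```cpp
#include <vector>

struct group_state {
    std::vector<int> sstables;
};

// All allocating (throwing) work happens during preparation; the
// commit step is noexcept and only swaps state in.
struct prepared_change {
    std::vector<int> new_set;   // built in prepare_change, may throw

    void commit(group_state& g) noexcept {
        g.sstables.swap(new_set);   // cannot throw
    }
};

prepared_change prepare_change(const group_state& g, int extra) {
    prepared_change p;
    p.new_set = g.sstables;   // copying may throw; g is untouched on failure
    p.new_set.push_back(extra);
    return p;
}
```

With multiple groups, each group's change is prepared first; only when every preparation has succeeded are the noexcept commits executed.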
When setting a new strategy, the charges of the old tracker are
transferred to the new one.
The problem is that we're not reverting the changes if an exception is
thrown before the new strategy is successfully set.
To fix this exception-safety issue, let's copy the charges instead
of moving them. If an exception is thrown, the old tracker is still
the one used and remains intact.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
When moving SSTables from staging directory, we'll conditionally add
them to backlog tracker. As each group has its own tracker, a given
sstable will be added to the tracker of the group that owns it.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Procedures that call this function happen to be in compaction_group,
so let's move it to group. Simplifies the change where the procedure
retrieves tracker from the group itself.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
As we're moving backlog tracker to compaction group, we need to
stop the tracker there too. We're moving it a step earlier in
table::stop(), before sstables are cleared, but that's okay
because it's still done after the group was deregistered
from compaction manager, meaning no compactions are running.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
All callers of do_add_sstable() live in compaction_group, so it
should be moved into compaction_group too. It also makes easier
for the function to retrieve the backlog tracker from the group.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This interface will be helpful for allowing replica::table, unit
tests and sstables::compaction to access the compaction group's tracker
which will be managed by the compaction manager, once we complete
the decoupling work.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
In commit 544ef2caf3 we fixed a bug where
a reversed clustering-key order caused problems using a secondary index
because of incorrect type comparison. That commit also included a
regression test for this fix.
However, that fix was incomplete, and improved later in commit
c8653d1321. That later fix was labeled
"better safe than sorry", and did not include a test demonstrating
any actual bug, so unsurprisingly we never backported that second
fix to any older branches.
Recently we discovered that missing the second patch does cause real
problems, and this patch includes a test which fails when the first
patch is in, but the second patch isn't (and passes when both patches
are in, and also passes on Cassandra).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes #11943
The mutation compactor is used on most read-paths we have, so adding a
validator to it gives us a good coverage, in particular it gives us full
coverage of queries and compaction.
The validator validates mutation token (and mutation fragment kind)
monotonicity as that is quite cheap, while it is enough to catch the
most common problems. As we already have a validator on the compaction
path (in the sstable writer), the validator is disabled when the
mutation compactor is instantiated for compaction.
We should probably make this configurable at some point. The addition
of this validator should prevent the worst of the fragment-reordering
bugs from affecting reads.
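The monotonicity check at the heart of such a validator can be sketched in a few lines. This is a simplified stand-in: `int64_t` replaces dht::token, and the real validator also tracks mutation fragment kinds, not just tokens.

```cpp
#include <cstdint>
#include <optional>

// Each partition's token must be >= the previous one; anything else
// indicates an out-of-order fragment stream.
struct token_monotonicity_validator {
    std::optional<int64_t> last;

    bool operator()(int64_t token) {
        if (last && token < *last) {
            return false;   // ordering invariant violated
        }
        last = token;
        return true;
    }
};
```

The check is cheap (one comparison per partition), which is why it can run on most read paths without measurable cost.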
Which, as its name suggests, makes the validating filter not validate
anything at all. This validation level can be used effectively to make
it so as if the validator was not there at all.
The test's input data is currently out-of-order, violating a fundamental
invariant of data always being sorted. This doesn't cause any problems
right now, but soon it will. Sort it to avoid it.
Said method calls `compact_mutation_state::start_new_page()` which
requires the kind of the next fragment in the reader. When there is no
fragment (reader is at EOS), we use partition-end. This was a poor
choice: if the reader is at EOS, partition-kind was the last fragment
kind, if the stream were to continue the next fragment would be a
partition-start.
Instead of using assigned IP addresses, use a local integer ID for
managing servers. IP address can be reused by a different server.
While there, get host ID (UUID). This can also be reused with `node
replace` so it's not good enough for tracking.
Closes #11747
* github.com:scylladb/scylladb:
test.py: use internal id to manage servers
test.py: rename hostname to ip_addr
test.py: get host id
test.py: use REST api client in ScyllaCluster
test.py: remove unnecessary reference to web app
test.py: requests without aiohttp ClientSession
In the 5.1 -> 5.2 upgrade doc, include additional steps for enabling
Raft using the `consistent_cluster_management` flag. Note that we don't
have this flag yet but it's planned to replace the experimental flag in
5.2.
In the "Raft in ScyllaDB" document, add sections about:
- enabling Raft in existing clusters in Scylla 5.2,
- verifying that the internal Raft upgrade procedure finishes
successfully,
- recovering from a stuck Raft upgrade procedure or from a majority loss
situation.
Fix some problems in the documentation, e.g. it is not possible to
enable Raft in an existing cluster in 5.0, but the documentation claimed
that it is.
Follow-up items:
- if we decide for a different name for `consistent_cluster_management`,
use that name in the docs instead
- update the warnings in Scylla to link to the Raft doc
- mention Enterprise versions once we know the numbers
- update the appropriate upgrade docs for Enterprise versions
once they exist
It's a copy-paste from the 5.0 -> 5.1 guide with substitutions:
s/5.1/5.2,
s/5.0/5.1
The metric update guide is not written, I left a TODO.
Also I didn't include the guide in
docs/upgrade/upgrade-opensource/index.rst, since 5.2 is not released
yet.
The guide can be accessed by manually following the link:
/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/
Instead of using assigned IP addresses, use an internal server id.
Define types to distinguish local server id, host ID (UUID), and IP
address.
This is needed to test servers changing IP address and for node replace
(host UUID).
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
The code explicitly manages an IP as string, make it explicit in the
variable name.
Define its type and test for set in the instance instead of using an
empty string as placeholder.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
When initializing a ScyllaServer, try to get the host id instead of only
checking the REST API is up.
Use the existing aiohttp session from ScyllaCluster.
In case of HTTP error, check that the status was not an internal error (500+).
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Move the REST api client to ScyllaCluster. This will allow the cluster
to query its own servers.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
The aiohttp.web.Application only needs to be passed, so don't store a
reference in ScyllaCluster object.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Simplify REST helper by doing requests without a session.
Reusing an aiohttp.ClientSession causes knock-on effects on
`rest_api/test_task_manager` due to handling exceptions outside of an
async with block.
Requests for cluster management and the Scylla REST API don't need a
session anyway.
Raise HTTPError with status code, text reason, params, and json.
In ScyllaCluster.install_and_start() instead of adding one more custom
exception, just catch all exceptions as they will be re-raised later.
While there avoid code duplication and improve sanity, type checking,
and lint score.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Currently when we set a single value we need
to call broadcast_to_all_shards to let observers on all
shards get notified of the new value.
However, the latter broadcasts all values to all shards,
so it's terribly inefficient.
Instead, add async set_value_on_all_shards functions
to broadcast a value to all shards.
Use those in system_keyspace for db_config_table virtual table
and in task_manager_test to update the task_manager ttl.
Refs #7316
Closes #11893
* github.com:scylladb/scylladb:
tests: check ttl on different shards
utils: config_src: add set_value_on_all_shards functions
utils: config_file: add config_source::API
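The difference between the two approaches can be modeled in a toy form. Shards are represented here as plain per-shard copies of one config value; the real code runs the update on every Seastar shard via the reactor, and all names below are stand-ins.

```cpp
#include <string>
#include <vector>

// One shard's copy of a single config option, plus a counter of how
// many times its observers were notified.
struct config_value {
    std::string value;
    int notifications = 0;
};

using shard_copies = std::vector<config_value>;  // one entry per shard

// Push just the one updated value to every shard; observers of other
// options are not disturbed (unlike a full broadcast_to_all_shards).
void set_value_on_all_shards(shard_copies& shards, const std::string& v) {
    for (auto& s : shards) {
        s.value = v;
        ++s.notifications;
    }
}
```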
This mini-series introduces dht::tokens_filter and uses it for consuming staging sstable in the view_update_generator.
The tokens_filter uses the token ranges owned by the current node, as retrieved by get_keyspace_local_ranges.
Refs #9559
Closes #11932
* github.com:scylladb/scylladb:
db: view_update_generator: always clean up staging sstables
compaction: extract incremental_owned_ranges_checker out to dht
This reverts commit ba6186a47f.
Said commit violates the widely held assumption that sstable
generations can be used as sstable identity. One known problem caused
by this is a potential out-of-order partition emitted when reading from sstables
(#11843). We now also have a better fix for #11789 (the bug this commit
was meant to fix): 4aa0b16852. So we can
revert without regressions.
Fixes: #11843
Closes #11886
Wrong access to an uninitialized token instead of the actual
generated string caused the parser to crash. This wasn't
detected by the ANTLR3 compiler because all the temporary
variables defined in the ANTLR3 statements are global in the
generated code. This essentially caused a null dereference.
Tests: 1. The fixed issue scenario from github.
2. Unit tests in release mode.
Fixes#11774
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190612133151.20609-1-eliransin@scylladb.com>
Closes #11777
Currently, when replacing a node ip, keeping the old host,
we might end up with the old endpoint in system.peers
if it is inserted back into the topology by `handle_state_normal`
when on_join is called with the old endpoint.
Then, later on, on_change sees that:
```
if (get_token_metadata().is_member(endpoint)) {
co_await do_update_system_peers_table(endpoint, state, value);
```
As described in #11925.
Fixes #11925
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently when we set a single value we need
to call broadcast_to_all_shards to let observers on all
shards get notified of the new value.
However, the latter broadcasts all values to all shards,
so it's terribly inefficient.
Instead, add async set_value_on_all_shards functions
to broadcast a value to all shards.
Use those in system_keyspace for db_config_table virtual table
and in task_manager_test to update the task_manager ttl.
Refs #7316
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Since staging sstables are currently not cleaned up by cleanup
compaction, filter their tokens, processing only tokens owned by the
current node (based on the keyspace replication strategy).
Refs #9559
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
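The token filter can be sketched as a check against locally-owned ranges. This is a simplified stand-in: tokens and ranges are `int64_t` half-open intervals, whereas the real code walks dht::token_range_vector from get_keyspace_local_ranges incrementally.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

using token = int64_t;
using token_range = std::pair<token, token>;  // (start, end], simplified

// In the spirit of dht::tokens_filter / incremental_owned_ranges_checker:
// a token passes the filter only if it falls in a range owned by the
// current node.
struct tokens_filter {
    std::vector<token_range> owned;  // assumed sorted and non-overlapping

    bool belongs_to_current_node(token t) const {
        for (const auto& [start, end] : owned) {
            if (t > start && t <= end) {
                return true;
            }
        }
        return false;
    }
};
```

The real checker exploits the sortedness of both the ranges and the incoming tokens to advance through the ranges once instead of scanning them per token.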
It is currently used by cleanup_compaction partition filter.
Factor it out so it can be used to filter staging sstables in
the next patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When a class inherits from multiple virtual base classes, pointers to
instances of this class via one of its base classes, might point to
somewhere into the object, not at its beginning. Therefore, the simple
method employed currently by $downcast_vptr() of casting the provided
pointer to the type extracted from the vtable name fails. Instead when
this situation is detected (detectable by observing that the symbol name
of the partial vtable is not to an offset of +16, but larger),
$downcast_vptr() will iterate over the base classes, adjusting the
pointer with their offsets, hoping to find the true start of the object.
In the one instance I tested this with, this method worked well.
At the very least, the method will now yield a null pointer when it
fails, instead of a badly casted object with corrupt content (which the
developer might or might not attribute to the bad cast).
Closes #11892
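The layout issue behind this can be demonstrated with a small example, assuming the Itanium C++ ABI used on Linux: with virtual inheritance, a base-class pointer generally points into the middle of the complete object, so recovering the object's start requires an offset adjustment (which `dynamic_cast` performs using vtable information, and which $downcast_vptr() now approximates by iterating over base-class offsets).

```cpp
// With virtual bases, the base2 subobject of derived sits at a nonzero
// offset from the start of the complete object, so a naive pointer cast
// to derived* would land in the wrong place.
struct base1 { virtual ~base1() = default; long a = 1; };
struct base2 { virtual ~base2() = default; long b = 2; };
struct derived : virtual base1, virtual base2 { long c = 3; };
```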
This PR adds the "ScyllaDB Enterprise" label to highlight the Enterprise-only features on the following pages:
- Encryption at Rest - the label indicates that the entire page is about an Enterprise-only feature.
- Compaction - the labels indicate the sections that are Enterprise-only.
There are more occurrences across the docs that require a similar update. I'll update them in another PR if this PR is approved.
Closes #11918
* github.com:scylladb/scylladb:
doc: fix the links to resolve the warnings
doc: add the Enterprise label on the Compaction page (to a subheading and on a list of strategies) to replace the info box
doc: add the Enterprise label to the Encryption at Rest page (the entire page) to replace the info box
Prior to off-strategy compaction, streaming / repair would place
staging files into main sstable set, and wait for view building
completion before they could be selected for regular compaction.
The reason for that is that view building relies on table providing
a mutation source without data in staging files. Had regular compaction
mixed staging data with non-staging one, table would have a hard time
providing the required mutation source.
After off-strategy compaction, staging files can be compacted
in parallel to view building. If off-strategy completes first, it
will place the output into the main sstable set. So a parallel view
building (on sstables used for off-strategy) may potentially get a
mutation source containing staging data from the off-strategy output.
That will mislead view builder as it won't be able to detect
changes to data in main directory.
To fix it, we'll do what we did before. Filter out staging files
from compaction, and trigger the operation only after we're done
with view building. We're piggybacking on off-strategy timer for
still allowing the off-strategy to only run at the end of the
node operation, to reduce the amount of compaction rounds on
the data introduced by repair / streaming.
Fixes#11882.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes #11919
create-relocatable-package.py collects shared libraries used by
executables for packaging. It also adds libthread-db.so to make
debugging possible. However, the name it uses has changed in glibc,
so packaging fails in Fedora 37.
Switch to the version-agnostic name, libthread-db.so. This happens
to be a symlink, so resolve it.
Closes #11917
The --online-discard option is defined as a string parameter since it
doesn't specify "action=", yet it has a boolean default value
(default=True). This breaks "provisioning in a similar environment",
since the code assumed a boolean value implies "action='store_true'",
which is not the case.
We should change the type of the option to int and also specify
"choices=[0, 1]", just like --io-setup does.
Fixes #11700
Closes #11831
Whenever a Raft configuration change is performed, `raft::server` calls
`raft_rpc::add_server`/`raft_rpc::remove_server`. Our `raft_rpc`
implementation has a function, `_on_server_update`, passed in the
constructor, which it called in `add_server`/`remove_server`;
that function would update the set of endpoints detected by the
direct failure detector. `_on_server_update` was passed an IP address
and that address was added to / removed from the failure detector set
(there's another translation layer between the IP addresses and internal
failure detector 'endpoint ID's; but we can ignore it for the purposes
of this commit).
Therefore: the failure detector was pinging a certain set of IP
addresses. These IP addresses were updated during Raft configuration
changes.
To implement the `is_alive(raft::server_id)` function (required by
`raft::failure_detector` interface), we would translate the ID using
the Raft address map, which is currently also updated during
configuration changes, to an IP address, and check if that IP address is
alive according to the direct failure detector (which maintained an
`_alive_set` of type `unordered_set<gms::inet_address>`).
This all works well but it assumes that servers can be identified using
IP addresses - it doesn't play well with the fact that servers may
change their IP addresses. The only immutable identifier we have for a
server is `raft::server_id`. In the future, Raft configurations will not
associate IP addresses with Raft servers; instead we will assume that IP
addresses can change at any time, and there will be a different
mechanism that eventually updates the Raft address map with the latest
IP address for each `raft::server_id`.
To prepare us for that future, in this commit we no longer operate in
terms of IP addresses in the failure detector, but in terms of
`raft::server_id`s. Most of the commit is boilerplate, changing
`gms::inet_address` to `raft::server_id` and function/variable names.
The interesting changes are:
- in `is_alive`, we no longer need to translate the `raft::server_id` to
an IP address, because now the stored `_alive_set` already contains
`raft::server_id`s instead of `gms::inet_address`es.
- the `ping` function now takes a `raft::server_id` instead of
`gms::inet_address`. To send the ping message, we need to translate
this to IP address; we do it by the `raft_address_map` pointer
introduced in an earlier commit.
Thus, there is still a point where we have to translate between
`raft::server_id` and `gms::inet_address`; but observe we now do it at
the last possible moment - just before sending the message. If we
have no translation, we consider the `ping` to have failed - it's
equivalent to a network failure where no route to a given address was
found.
Closes #11759
* github.com:scylladb/scylladb:
direct_failure_detector: get rid of complex `endpoint_id` translations
service/raft: ping `raft::server_id`s, not `gms::inet_address`es
service/raft: store `raft_address_map` reference in `direct_fd_pinger`
gms: gossiper: move `direct_fd_pinger` out to a separate service
gms: gossiper: direct_fd_pinger: extract generation number caching to a separate class
This PR introduces the following changes to the documentation landing page:
- The " New to ScyllaDB? Start here!" box is added.
- The "Connect your application to Scylla" box is removed.
- Some wording has been improved.
- "Scylla" has been replaced with "ScyllaDB".
Closes #11896
* github.com:scylladb/scylladb:
Update docs/index.rst
doc: replace Scylla with ScyllaDB on the landing page
doc: improve the wording on the landing page
doc: add the link to the ScyllaDB Basics page to the documentation landing page
There were 4 different pages for upgrading Scylla 5.0 to 5.1 (and the
same is true for other version pairs, but I digress) for different
environments:
- "ScyllaDB Image for EC2, GCP, and Azure"
- Ubuntu
- Debian
- RHEL/CentOS
The Ubuntu and Debian pages used a common template:
```
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
```
with different variable substitutions.
The "Image" page used a similar template, with some extra content in the
middle:
```
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
.. include:: /upgrade/_common/upgrade-image-opensource.rst
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
```
The RHEL/CentOS page used a different template:
```
.. include:: /upgrade/_common/upgrade-guide-v4-rpm.rst
```
This was an unmaintainable mess. Most of the content was "the same" for
each of these options. The only content that must actually be different
is the part with package installation instructions (e.g. calls to `yum`
vs `apt-get`). The rest of the content was logically the same - the
differences were mistakes, typos, and updates/fixes to the text that
were made in some of these docs but not others.
In this commit I prepare a single page that covers the upgrade and
rollback procedures for each of these options. The section dependent on
the system was implemented using Sphinx Tabs.
I also fixed and changed some parts:
- In the "Gracefully stop the node" section:
Ubuntu/Debian/Images pages had:
```rst
.. code:: sh
sudo service scylla-server stop
```
RHEL/CentOS pages had:
```rst
.. code:: sh
.. include:: /rst_include/scylla-commands-stop-index.rst
```
the stop-index file contained this:
```rst
.. tabs::
.. group-tab:: Supported OS
.. code-block:: shell
sudo systemctl stop scylla-server
.. group-tab:: Docker
.. code-block:: shell
docker exec -it some-scylla supervisorctl stop scylla
(without stopping *some-scylla* container)
```
So the RHEL/CentOS version had two tabs: one for Scylla installed
directly on the system, one for Scylla running in Docker - which is
interesting, because nothing anywhere else in the upgrade documents
mentions Docker. Furthermore, the RHEL/CentOS version used `systemctl`
while the ubuntu/debian/images version used `service` to stop/start
scylla-server. Both work on modern systems.
The Docker option is completely out of place - the rest of the upgrade
procedure does not mention Docker. So I decided it doesn't make sense to
include it. Docker documentation could be added later if we actually
decide to write upgrade documentation when using Docker... Between
`systemctl` and `service` I went with `service` as it's a bit
higher-level.
- Similar change for "Start the node" section, and corresponding
stop/start sections in the Rollback procedure.
- To reuse text for Ubuntu and Debian, when referencing "ScyllaDB deb
repo" in the Debian/Ubuntu tabs, I provide two separate links: to
Debian and Ubuntu repos.
- the link to rollback procedure in the RPM guide (in 'Download and
install the new release' section) pointed to rollback procedure from
3.0 to 3.1 guide... Fixed to point to the current page's rollback
procedure.
- in the rollback procedure steps summary, the RPM version missed the
"Restore system tables" step.
- in the rollback procedure, the repository links were pointing to the
new versions, while they should point to the old versions.
There are some other pre-existing problems I noticed that need fixing:
- EC2/GCP/Azure option has no corresponding coverage in the rollback
section (Download and install the old release) as it has in the
upgrade section. There is no guide for rolling back 3rd party and OS
packages, only Scylla. I left a TODO in a comment.
- the repository links assume certain Debian and Ubuntu versions (Debian
10 and Ubuntu 20), but there are more available options (e.g. Ubuntu
22). Not sure how to deal with this problem. Maybe a separate section
with links? Or just a generic link without choice of platform/version?
Closes#11891
Flush the memtable before cleaning up the table, so as not to leave any disowned tokens in the memtable,
as they might be resurrected if left there.
Fixes#1239
Closes#11902
* github.com:scylladb/scylladb:
table: perform_cleanup_compaction: flush memtable
table: add perform_cleanup_compaction
api: storage_service: add logging for compaction operations et al
Coroutines and asan don't mix well on aarch64. This was seen in
22f13e7ca3 ("Revert "Merge 'cql3: select_statement: coroutinize
indexed_table_select_statement::do_execute_base_query()' from Avi
Kivity"") where a routine coroutinization was reverted due to failures
on aarch64 debug mode.
In clang 15 this is even worse, the existing code starts failing.
However, if we disable optimization (-O0 rather than -Og), things
begin to work again. In fact we can reinstate the patch reverted
above even with clang 12.
Fix (or rather workaround) the problem by avoiding -Og on aarch64
debug mode. There's the lingering fear that release mode is
miscompiled too, but all the tests pass on clang 15 in release mode
so it appears related to asan.
Closes#11894
We don't explicitly cleanup the memtable, while
it might hold tokens disowned by the current node.
Flush the memtable before performing cleanup compaction
to make sure all tokens in the memtable are cleaned up.
Note that non-owned ranges are invalidated in the cache
in compaction_group::update_main_sstable_list_on_compaction_completion
using desc.ranges_for_cache_invalidation.
Fixes#1239
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
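The flush-before-cleanup ordering can be sketched in a few lines of Python (all names here — perform_cleanup, owned, flush — are hypothetical stand-ins for the C++ code in table::perform_cleanup_compaction):

```python
def perform_cleanup(memtable, sstables, owned, flush):
    """Cleanup must see every token, so flush the memtable first.

    `owned` is a predicate telling whether a token is still owned by
    this node; `flush` turns memtable contents into a new sstable.
    """
    if memtable:
        sstables.append(flush(memtable))  # disowned tokens now live in sstables
        memtable.clear()
    # cleanup compaction: rewrite sstables keeping only owned tokens
    return [[t for t in sst if owned(t)] for sst in sstables]
```

Without the flush, token 9 below would survive in the memtable and could be resurrected later.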
Move the integration with compaction_manager
from the api layer to the table class so
it can also make sure the memtable is cleaned up in the next patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The test runs remove_node command with background ddl workload.
It was written in an attempt to reproduce scylladb#11228 but seems to have
value on its own.
The if_exists parameter has been added to the add_table
and drop_table functions, since the driver could retry
the request sent to a removed node, but that request
might have already been completed.
Function wait_for_host_known waits until the information
about the node reaches the destination node. Since we add
new nodes at each iteration in main, this can take some time.
A number of abort-related options were added to
SCYLLA_CMDLINE_OPTIONS as they simplify
nailing down problems.
Closes#11734
The direct failure detector operates on abstract `endpoint_id`s for
pinging. The `pinger` interface is responsible for translating these IDs
to 'real' addresses.
Earlier we used two types of addresses: IP addresses in 'production'
code (`gms::gossiper::direct_fd_pinger`) and `raft::server_id`s in test
code (in `randomized_nemesis_test`). For each of these use cases we
would maintain mappings between `endpoint_id`s and the address type.
In recent commits we switched the 'production' code to also operate on
Raft server IDs, which are UUIDs underneath.
In this commit we switch `endpoint_id`s from `unsigned` type to
`utils::UUID`. Because each use case operates on Raft server IDs, we can
perform a simple translation: `raft_id.uuid()` to get an `endpoint_id`
from a Raft ID, `raft::server_id{ep_id}` to obtain a Raft ID from
an `endpoint_id`. We no longer have to maintain complex sharded data
structures to store the mappings.
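As a tiny illustration of why no mapping table is needed, here is a hedged Python model of the two conversions (ServerId is a hypothetical stand-in for raft::server_id):

```python
import uuid
from dataclasses import dataclass

@dataclass(frozen=True)
class ServerId:          # stand-in for raft::server_id (a UUID underneath)
    id: uuid.UUID

    def uuid(self):      # mirrors raft_id.uuid() -> endpoint_id
        return self.id

# endpoint_id is just a UUID, so both directions are O(1) conversions
# and need no shared lookup structure:
def to_endpoint_id(raft_id: ServerId) -> uuid.UUID:
    return raft_id.uuid()

def to_raft_id(ep_id: uuid.UUID) -> ServerId:   # mirrors raft::server_id{ep_id}
    return ServerId(ep_id)
```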
Whenever a Raft configuration change is performed, `raft::server` calls
`raft_rpc::add_server`/`raft_rpc::remove_server`. Our `raft_rpc`
implementation has a function, `_on_server_update`, passed in the
constructor, which is called in `add_server`/`remove_server`;
that function would update the set of endpoints detected by the
direct failure detector. `_on_server_update` was passed an IP address
and that address was added to / removed from the failure detector set
(there's another translation layer between the IP addresses and internal
failure detector 'endpoint ID's; but we can ignore it for the purposes
of this commit).
Therefore: the failure detector was pinging a certain set of IP
addresses. These IP addresses were updated during Raft configuration
changes.
To implement the `is_alive(raft::server_id)` function (required by
`raft::failure_detector` interface), we would translate the ID using
the Raft address map, which is currently also updated during
configuration changes, to an IP address, and check if that IP address is
alive according to the direct failure detector (which maintained an
`_alive_set` of type `unordered_set<gms::inet_address>`).
This all works well but it assumes that servers can be identified using
IP addresses - it doesn't play well with the fact that servers may
change their IP addresses. The only immutable identifier we have for a
server is `raft::server_id`. In the future, Raft configurations will not
associate IP addresses with Raft servers; instead we will assume that IP
addresses can change at any time, and there will be a different
mechanism that eventually updates the Raft address map with the latest
IP address for each `raft::server_id`.
To prepare us for that future, in this commit we no longer operate in
terms of IP addresses in the failure detector, but in terms of
`raft::server_id`s. Most of the commit is boilerplate, changing
`gms::inet_address` to `raft::server_id` and function/variable names.
The interesting changes are:
- in `is_alive`, we no longer need to translate the `raft::server_id` to
an IP address, because now the stored `_alive_set` already contains
`raft::server_id`s instead of `gms::inet_address`es.
- the `ping` function now takes a `raft::server_id` instead of
`gms::inet_address`. To send the ping message, we need to translate
this to IP address; we do it by the `raft_address_map` pointer
introduced in an earlier commit.
Thus, there is still a point where we have to translate between
`raft::server_id` and `gms::inet_address`; but observe we now do it at
the last possible moment - just before sending the message. If we
have no translation, we consider the `ping` to have failed - it's
equivalent to a network failure where no route to a given address was
found.
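A minimal Python sketch of this last-moment translation (addr_map, send, and the string ids are illustrative, not the real API):

```python
from typing import Dict, Optional

def ping(server_id: str, addr_map: Dict[str, str], send) -> bool:
    """Resolve the IP at the last possible moment, just before sending.

    A missing translation is treated like a network failure: the ping
    simply fails, with no stale cached address involved.
    """
    ip: Optional[str] = addr_map.get(server_id)
    if ip is None:
        return False        # equivalent to 'no route to this server id'
    return send(ip)
```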
In a later commit `direct_fd_pinger` will operate in terms of
`raft::server_id`s. Decouple it from `gossiper` since we don't want to
entangle `gossiper` with Raft-specific stuff.
`gms::gossiper::direct_fd_pinger` serves multiple purposes: one of them
is to maintain a mapping between `gms::inet_address`es and
`direct_failure_detector::pinger::endpoint_id`s, another is to cache the
last known gossiper's generation number to use it for sending gossip
echo messages. The latter is the only gossiper-specific thing in this
class.
We want to move `direct_fd_pinger` outside `gossiper`. To do that, split the
gossiper-specific thing -- the generation number management -- to a
smaller class, `echo_pinger`.
`echo_pinger` is a top-level class (not a nested one like
`direct_fd_pinger` was) so we can forward-declare it and pass references
to it without including gms/gossiper.hh header.
* seastar f32ed00954...e0dabb361f (12):
> sstring: define formatter
> file: Dont violate API layering
> Add compile_commands.json to gitignore
> Merge 'Add an allocation failure metric' from Travis Downs
> Use const test objects
> Ragel chunk parser: compilation err, unused var
> build: do not expose Valgrind in SeastarTargets.cmake
> defer: mark deferred_* with [[nodiscard]]
> Log selected reactor backend during startup
> http: mark str with [[maybe_unused]]
> Merge 'reactor: open fd without O_NONBLOCK when using io_uring backend' from Kefu Chai
> reactor: add accept and connect to io_uring backend
Closes#11895
Replicating `raft_address_map` entries is needed for the following use
cases:
- the direct failure detector - currently it assumes a static mapping of
`raft::server_id`s to `gms::inet_address`es, which is obtained on Raft
group 0 configuration changes. To handle dynamic mappings we need to
modify the failure detector so it pings `raft::server_id`s and obtains
the `gms::inet_address` before sending the message from
`raft_address_map`. The failure detector is sharded, so we need the
mappings to be available on all shards.
- in the future we'll have multiple Raft groups running on different
shards. To send messages they'll need `raft_address_map`.
Initially I tried to replicate all entries - expiring and non-expiring.
The implementation turned out to be very complex - we need to handle
dropping expired entries and refreshing expiring entries' timestamps
across shards, and doing this correctly while accounting for possible
races is quite problematic.
Eventually I arrived at the conclusion that replicating only
non-expiring entries, and furthermore allowing non-expiring entries to
be added only on shard 0, is good enough for our use cases:
- The direct failure detector is pinging group 0 members only; group
0 members correspond exactly to the non-expiring entries.
- Group 0 configuration changes are handled on shard 0, so non-expiring
entries are added/removed on shard 0.
- When we have multiple Raft groups, we can reuse a single Raft server
ID for all Raft servers running on a single node belonging to
different groups; they are 'namespaced' by the group IDs. Furthermore,
every node has a server that belongs to group 0. Thus for every Raft
server in every group, it has a corresponding server in group 0 with
the same ID, which has a non-expiring entry in `raft_address_map`,
which is replicated to all shards; so every group will be able to
deliver its messages.
With these assumptions the implementation is short and simple.
We can always complicate it in the future if we find that the
assumptions are too strong.
Closes#11791
* github.com:scylladb/scylladb:
test/raft: raft_address_map_test: add replication test
service/raft: raft_address_map: replicate non-expiring entries to other shards
service/raft: raft_address_map: assert when entry is missing in drop_expired_entries
service/raft: turn raft_address_map into a service
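The shard-0 rule above can be illustrated with a small Python model (AddressMap and its methods are hypothetical; the real code is a sharded C++ service):

```python
class AddressMap:
    """Sketch: non-expiring entries may only be added on shard 0 and are
    replicated to every shard; expiring entries stay local to one shard."""
    def __init__(self, n_shards):
        self.shards = [{} for _ in range(n_shards)]

    def add_nonexpiring(self, shard, server_id, ip):
        assert shard == 0, "non-expiring entries are added on shard 0 only"
        for s in self.shards:                 # replicate to all shards
            s[server_id] = ip

    def add_expiring(self, shard, server_id, ip):
        self.shards[shard][server_id] = ip    # local to this shard only
```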
We capture `key` by reference, but it is in another continuation.
Capture it by value, and avoid the default capture specification.
Found by clang 15 + asan + aarch64.
Closes#11884
To fix CVE-2022-24675, we need a binary compiled with golang <= 1.18.1.
The only released version compiled with golang <= 1.18.1 is node_exporter
1.4.0, so we need to update to it.
See scylladb/scylla-enterprise#2317
Closes#11400
[avi: regenerated frozen toolchain]
Closes#11879
Starting from https://github.com/scylladb/scylla-pkg/pull/3035 we
removed all of the old tar.gz prefixes from being uploaded to S3 or used by
downstream jobs.
Hence, there is no point in building those tar.gz files anymore.
Closes#11865
Fix https://github.com/scylladb/scylla-doc-issues/issues/864
This PR:
- updates the introduction to add information about AArch64 and rewrite the content.
- replaces "Scylla" with "ScyllaDB".
Closes#11778
* github.com:scylladb/scylladb:
Update docs/getting-started/system-requirements.rst
doc: fix the link to the OS Support page
doc: replace Scylla with ScyllaDB
doc: update the info about supported architecture and rewrite the introduction
This patch adds a reproducing test for issue #11588, which is still open
so the test is expected to fail on Scylla ("xfail"), and passes on Cassandra.
The test shows that Scylla allows an out-of-range value to be written to
timestamp column, but then it can't be read back.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11864
The PR prepares repair for task manager integration:
- Creates repair_module
- Keeps repair_module in repair_service
- Moves tracker methods to repair_module
- Changes UUID to task_id in repair module
Closes#11851
* github.com:scylladb/scylladb:
repair: check shutdown with abort source in repair module
repair: use generic module gate for repair module operations
repair: move tracker to repair module
repair: move next_repair_command to repair_module
repair: generate repair id in repair module
repair: keep shard number in repair_uniq_id
repair: change UUID to task_id
repair: add task_manager::module to repair_service
repair: create repair module and task
Repair module uses a gate to prevent starting new tasks on shutdown.
Generic module's gate serves the same purpose, thus we can
use it also in repair specific context.
Since both the tracker and repair_module serve a similar purpose,
it is confusing where we should look for methods connected to them.
Thus, to make it more transparent, tracker class is deleted and all
its attributes and methods are moved to repair_module.
The number of the repair operation was counted both with
next_repair_command from the tracker and the sequence number
from task_manager::module.
To get rid of redundancy next_repair_command was deleted and all
methods using its value were moved to repair_module.
Execution shard is one of the traits specific to repair tasks.
Child task should freely access shard id of its parent. Thus,
the shard number is kept in a repair_uniq_id struct.
Create repair_task_impl and repair_module inheriting from respectively
task manager task_impl and module to integrate repair operations with
task manager.
We currently avoid compiling C code in "pip3 install scylla-driver", but
portable binary distributions of the package are actually provided,
so we should use them via "pip3 install --only-binary=:all: scylla-driver".
The binary distribution contains dependency libraries, so we won't have
problem loading it on relocatable python3.
Closes#11852
The PR adds changes to task manager that allow more convenient integration with modules.
Introduced changes:
- adds an internal flag in task::impl that allows the user to filter out overly specific tasks
- renames `parent_data` to the more appropriate name `task_info`
- creates `tasks/types.hh`, which allows using some types connected with the task manager without the necessity to include the whole task manager
- adds a more flexible version of the `make_task` method
Closes#11821
* github.com:scylladb/scylladb:
tasks: add alternative make_task method
tasks: rename parent_data to task_info and move it
tasks: move task_id to tasks/types.hh
tasks: add internal flag for task_manager::task::impl
Prevent copying shared_ptr across shards
in do_sync_data_using_repair by allocating
a shared_ptr<abort_source> per shard in
node_ops_meta_data and respectively in node_ops_info.
Fixes#11826
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11827
* github.com:scylladb/scylladb:
repair: use sharded abort_source to abort repair_info
repair: node_ops_info: add start and stop methods
storage_service: node_ops_abort_thread: abort all node ops on shutdown
storage_service: node_ops_abort_thread: co_return only after printing log message
storage_service: node_ops_meta_data: add start and stop methods
repair: node_ops_info: prevent accidental copy
Currently, to_string() recursively calls itself for engaged optionals.
Eliminate it. Also, use the std_optional wrapper instead of accessing
std::optional internals directly.
Scylla fiber uses a crude method of scanning inbound and outbound
references to/from other task objects of recognized type. This method
cannot detect user-instantiated promise<> objects. Add a note about this
to the printout, so users are aware of it.
We collect already seen tasks in a set to be able to detect perceived
task loops and stop when one is seen. Initialize this set with the
starting task, so if it forms a loop, we won't repeat it in the trace
before cutting the loop.
Currently there are two loops and a separate line printing the starting
task, all duplicating the formatting logic. Define a method for it and
use it in all 3 places instead.
Shard boundaries can be crossed in one direction currently: when looking
for waiters on a task, but not in the other direction (looking for
waited-on tasks). This patch fixes that.
Currently seastar threads end any attempt to follow waited-on-futures.
Seastar threads need special handling because they allocate the wake-up
task on their stack. This patch adds this special handling.
scylla_ptr.analyze() switches to the thread the analyzed object lives
on, but forgets to switch back. This was very annoying as any commands
using it (which is a bunch of them) were prone to suddenly and
unexpectedly switching threads.
This patch makes sure that the original thread context is switched back
to after analyzing the pointer.
Rename to scylla tables. Less typing and more up-to-date.
By default it now only lists tables from the local shard. Added flag -a,
which brings back the old behaviour (lists on all shards).
Added -u (only list user tables) and -k (list tables of the provided
keyspace only) filtering options.
The lsa-segment command tries to walk LSA segment objects by decoding
their descriptors and (!) object sizes as well. Some objects in LSA have
dynamic sizes, i.e. those depending on the object contents. The script
tries to drill down the object internals to get this size, but bad news
is that nowadays there are many dynamic objects that are not covered.
Once it steps upon an unsupported object, scylla-gdb likely stops because
the "next" descriptor happens to be in the middle of the object and its
parsing throws.
This patch fixes this by taking advantage of the virtual size() call of
the migrate_fn_type all LSA objects are linked with (indirectly). It
gets the migrator object, the LSA object itself and calls
((migrate_fn_type*)<migrator_ptr>)->size((const void*)<object_ptr>)
with gdb. The evaluated value is the live dynamic size of the object.
fixes: #11792
refs: #2455
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11847
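The idea behind the fix — deriving object sizes from a per-type virtual call instead of decoding each layout — can be sketched in Python (Migrator and walk_segment are illustrative stand-ins for migrate_fn_type and the lsa-segment walker):

```python
class Migrator:
    """Stand-in for migrate_fn_type: knows how to size its objects."""
    def __init__(self, size_fn):
        self._size_fn = size_fn

    def size(self, obj):        # mirrors migrate_fn_type::size(const void*)
        return self._size_fn(obj)

def walk_segment(entries):
    """entries: list of (migrator, obj) pairs packed back to back.
    Returns each object's offset, asking the migrator for the size so
    dynamically-sized objects just work — no per-type decoding needed."""
    offsets, pos = [], 0
    for migrator, obj in entries:
        offsets.append(pos)
        pos += migrator.size(obj)
    return offsets
```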
Currently, when specifying nodes to ignore for replace or removenode,
we support specifying them only using their ip address.
As discussed in https://github.com/scylladb/scylladb/issues/11839 for removenode,
we intentionally require the host uuid for specifying the node to remove,
so the nodes to ignore (that are also down, otherwise we need not ignore them),
should be consistent with that and be specified using their host_id.
The series extends the apis and allows either the nodes ip address or their host_id
to be specified, for backward compatibility.
We should deprecate the ip address method over time and convert the tests and management
software to use the ignored nodes' host_id:s instead.
Closes#11841
* github.com:scylladb/scylladb:
api: doc: remove_node: improve summary
api, service: storage_service: removenode: allow passing ignore_nodes as uuid:s
storage_service: get_ignore_dead_nodes_for_replace: use tm.parse_host_id_and_endpoint
locator: token_metadata: add parse_host_id_and_endpoint
api: storage_service: remove_node: validate host_id
The current summary of the operation is obscure.
It refers to a token in the ring and the endpoint associated with it,
while the operation uses a host_id to identify a whole node.
Instead, clarify the summary to refer to a node in the cluster,
consistent with the description for the host_id parameter.
Also, describe the effect the call has on the data the removed node
logically owned.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently the api is inconsistent: requiring a uuid for the
host_id of the node to be removed, while the ignored nodes list
is given as comma-separated ip addresses.
Instead, support identifying the ignored_nodes either
by their host_id (uuid) or ip address.
Also, require all ignore_nodes to be of the same kind:
either UUIDs or ip addresses, as a mix of the 2 is likely
indicating a user error.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
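A hedged Python sketch of the "all one kind" validation (parse_ignore_nodes is a hypothetical helper, not the actual parser):

```python
import ipaddress
import uuid

def parse_ignore_nodes(csv: str):
    """All entries must be of one kind: host_id UUIDs or IP addresses.
    A mix likely indicates a user error, so reject it."""
    items = [s.strip() for s in csv.split(",") if s.strip()]
    kinds, parsed = set(), []
    for item in items:
        try:
            parsed.append(uuid.UUID(item))
            kinds.add("host_id")
        except ValueError:
            parsed.append(ipaddress.ip_address(item))  # raises if invalid
            kinds.add("ip")
    if len(kinds) > 1:
        raise ValueError("ignore_nodes must be all host_ids or all IPs")
    return parsed
```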
To be used for specifying nodes either by their
host_id or ip address and using the token_metadata
to resolve the mapping.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The node to be removed must be identified by its host_id.
Validate that at the api layer and pass the parsed host_id
down to storage_service::removenode.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently, --disks options does not allow symlinks such as
/dev/disk/by-uuid/* or /dev/disk/azure/*.
To allow using them, is_unused_disk() should resolve symlink to
realpath, before evaluating the disk path.
Fixes#11634
Closes#11646
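The realpath resolution can be sketched as follows (is_unused_disk here is a simplified stand-in for the scylla_setup helper):

```python
import os

def is_unused_disk(path: str, used_disks) -> bool:
    """Resolve symlinks (e.g. /dev/disk/by-uuid/*) to the real device
    path before comparing against the set of used disks."""
    real = os.path.realpath(path)
    return real not in used_disks
```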
It seems like the distributions' original sysconfig files do not use double
quotes to set a parameter when the value does not contain a space.
Add a function to detect spaces in the value, and don't use double quotes
when none are detected.
Fixes#9149
Closes#9153
* github.com:scylladb/scylladb:
scylla_util.py: adding unescape for sysconfig_parser
scylla_util.py: on sysconfig_parser, don't use double quote when it's possible
Even though previous patch makes scylla not gossip this as internal_ip,
an extra sanity check may still be useful. E.g. older versions of scylla
may still do it, or this address can be loaded from system_keyspace.
refs: #11502
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Make it call cache_preferred_ip() even when the cache is loaded from
system_keyspace and move the connection reset there. This is mainly to
prepare for the next patch, but also makes the code a bit shorter
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Gossiping 0.0.0.0 as preferred IP may break the peer as it will
"interpret" this address as <myself> which is not what peer expects.
However, g.p.f.s. uses --listen-address argument as the internal IP
and it's not prohibited to configure it to be 0.0.0.0
It's better not to gossip the INTERNAL_IP property at all if the listen
address is such.
fixes: #11502
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently we use a single shared_ptr<abort_source>
that can't be copied across shards.
Instead, use a sharded<abort_source> in node_ops_info so that each
repair_info instance will use an (optional) abort_source*
on its own shard.
Added respective start and stop methods, plus a local_abort_source
getter to get the shard-local abort_source (if available).
Fixes#11826
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Prepare for adding a sharded<abort_source> member.
Wire start/stop in storage_service::node_ops_meta_data.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
A later patch adds a sharded<abort_source> to node_ops_info.
On shutdown, we must orderly stop it, so use node_ops_abort_thread
shutdown path (where node_ops_signal_abort is called with a nullopt)
to abort (and stop) all outstanding node_ops by passing
a null_uuid to node_ops_abort, and let it iterate over all
node ops to abort and stop them.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently the function co_returns if (!uuid_opt)
so the log info message indicating it's stopped
is not printed.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Delete node_ops_info copy and move constructors before
we add a sharded<abort_source> member for the per-shard repairs
in the next patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Even though we have __escape() for escaping " in the middle of the value when
writing the sysconfig file, we didn't unescape when reading from it.
So add __unescape() and call it in get().
It seems like the distributions' original sysconfig files do not use double
quotes to set a parameter when the value does not contain a space.
Add a function to detect spaces in the value, and don't use double quotes
when none are detected.
Fixes#9149
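A Python sketch of the combined quoting behaviour, assuming the escape character is a backslash (function names are illustrative, not scylla_util.py's actual ones):

```python
def quote_sysconfig_value(value: str) -> str:
    """Escape embedded double quotes, and only wrap the value in double
    quotes when it contains whitespace, matching distribution defaults."""
    escaped = value.replace('"', '\\"')
    if any(c.isspace() for c in value):
        return f'"{escaped}"'
    return escaped

def unquote_sysconfig_value(raw: str) -> str:
    """Reverse operation for reading the file back: strip wrapping
    quotes if present, then unescape embedded double quotes."""
    if raw.startswith('"') and raw.endswith('"'):
        raw = raw[1:-1]
    return raw.replace('\\"', '"')
```

Round-tripping a value through both functions should be the identity, which is the property the __unescape() fix restores.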
Task manager tasks should be created with make_task method since
it properly sets information about child-parent relationship
between tasks. Though, sometimes we may want to keep additional
task data in classes inheriting from task_manager::task::impl.
Doing it with the existing make_task method is impossible, since
implementation objects are created internally.
The commit adds a new make_task that allows the caller to provide a task
implementation pointer. All the fields except
for the ones connected with children and the parent should be set beforehand.
parent_data struct contains info that is common for each task,
not only in parent-child relationship context. To use it this way
without confusion, its name is changed to task_info.
In order to be able to widely and comfortably use task_info,
it is moved from tasks/task_manager.hh to tasks/types.hh
and slightly extended.
It is convenient to create many different task implementations
representing more and more specific parts of the operation in
a module. Presenting all of them through the api, though, makes it cumbersome
for the user to navigate and track.
Flag internal is added to task_manager::task::impl so that the tasks
could be filtered before they are sent to user.
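A minimal sketch of such filtering (Task and list_tasks are hypothetical; the real flag lives on task_manager::task::impl in C++):

```python
from dataclasses import dataclass

@dataclass
class Task:
    name: str
    internal: bool = False   # mirrors the new flag on task::impl

def list_tasks(tasks, include_internal=False):
    """Only user-facing tasks are listed by default; internal tasks that
    describe overly specific parts of an operation are filtered out."""
    return [t for t in tasks if include_internal or not t.internal]
```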
When doing shadow round for replacement the bootstrapping node needs to
know the dc/rack info about the node it replaces to configure it on
topology. This topology info is later used by e.g. repair service.
fixes: #11829
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11838
After calling filter_for_query() the extra_replica to speculate to may
be left default-initialized, which is the :0 ipv6 address. Later below, this
address is used as-is to check if it belongs to the same DC or not, which
is not nice, as :0 is not an address of any existing endpoint.
Recent move of dc/rack data onto topology made this place reveal itself
by emitting the internal error due to :0 not being present on the
topology's collection of endpoints. Prior to this move the dc filter
would count :0 as belonging to "default_dc" datacenter which may or may
not match with the dc of the local node.
The fix is to explicitly tell a set extra_replica from an unset one.
fixes: #11825
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11833
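The set-vs-unset distinction can be modelled with an optional, e.g. in Python (names are illustrative; the real fix is in the C++ replica-filtering code):

```python
from typing import Optional

def pick_extra_replica(candidates, same_dc) -> Optional[str]:
    """Return None (rather than a default ':0'-style address) when no
    replica qualifies, so callers can tell a set extra_replica from an
    unset one."""
    for ep in candidates:
        if same_dc(ep):
            return ep
    return None    # explicit 'unset', never a bogus default endpoint

def endpoints_to_query(base, candidates, same_dc):
    extra = pick_extra_replica(candidates, same_dc)
    return base + ([extra] if extra is not None else [])
```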
Most software distribution tar.gz archives have a sub-directory containing
everything, to avoid extracting contents into the current directory.
We should follow this style in our unified package too.
To do this we need to increment the relocatable package version to '3.0'.
Fixes#8349
Closes#8867
We added a UUID device file existence check in #11399; we expect the UUID
device file to be created before checking, and we wait for the creation with
"udevadm settle" after "mkfs.xfs".
However, we actually get an error which says the UUID device file is missing;
it probably means "udevadm settle" doesn't guarantee the device file is
created, under some conditions.
To avoid the error, use var-lib-scylla.mount to wait for the UUID device
file to be ready, and run the file existence check when the service
fails.
Fixes#11617
Closes#11666
Since the end bound is exclusive, the end position should be
before_key(), not after_key().
Affects only tests, as far as I know, only there we can get an end
bound which is a clustering row position.
Would cause failures once row cache is switched to v2 representation
because of violated assumptions about positions.
Introduced in 76ee3f029c
Closes#11823
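The bound semantics can be modelled with weighted positions (a simplified Python stand-in for position_in_partition; weights -1/0/+1 mean before/at/after a key):

```python
def before_key(key):
    return (key, -1)   # sorts just before the row at `key`

def after_key(key):
    return (key, 1)    # sorts just after the row at `key`

def in_range(row_key, start, end_pos):
    """A row at `key` occupies position (key, 0). An exclusive end bound
    on `key` must therefore be before_key(key), not after_key(key),
    so the row itself falls outside the range."""
    return start <= (row_key, 0) < end_pos
```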
We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).
Otherwise, the generations of new sstables created when populating staging (via reshard or reshape) may collide with generations in the base directory, leading to https://github.com/scylladb/scylladb/issues/11789
Refs scylladb/scylladb#11789
Fixes scylladb/scylladb#11793
Closes#11795
* github.com:scylladb/scylladb:
distributed_loader: populate_column_family: reindent
distributed_loader: coroutinize populate_column_family
distributed_loader: table_population_metadata: start: reindent
distributed_loader: table_population_metadata: coroutinize start_subdir
distributed_loader: table_population_metadata: start_subdir: reindent
distributed_loader: pre-load all sstables metadata for table before populating it
As described in issue #11801, we saw in Alternator when a GSI has both partition and sort keys which were non-key attributes in the base, cases where updating the GSI-sort-key attribute to the same value it already had caused the entire GSI row to be deleted.
In this series we fix this bug (it was a bug in our materialized views implementation) and add a reproducing test (plus a few more tests for similar situations which worked before the patch, and continue to work after it).
Fixes #11801
Closes #11808
* github.com:scylladb/scylladb:
test/alternator: add test for issue 11801
MV: fix handling of view update which reassign the same key value
materialized views: inline used-once and confusing function, replace_entry()
The storage_service::stop() calls repair_service::abort_repair_node_ops() but at that time the sharded<repair_service> is already stopped and call .local() on it just crashes.
The suggested fix is to remove explicit storage_service -> repair_service kick. Instead, the repair_infos generated for the sake of node-ops are subscribed on the node_ops_meta_data's abort source and abort themselves automatically.
Fixes: #10284
Closes #11797
* github.com:scylladb/scylladb:
repair: Remove ops_uuid
repair: Remove abort_repair_node_ops() altogether
repair: Subscribe on node_ops_info::as abortion
repair: Keep abort source on node_ops_info
repair: Pass node_ops_info arg to do_sync_data_using_repair()
repair: Mark repair_info::abort() noexcept
node_ops: Remove _aborted bit
node_ops: Simplify construction of node_ops_metadata
main: Fix message about repair service starting
The test wants to see that no allocations larger than 128k are present,
but sets the warning threshold to exactly 128k. Due to an off-by-one in
Seastar, this went unnoticed. However, now that the off-by-one in Seastar
is fixed [1], this test starts to fail.
Fix by setting the warning threshold to 128k + 1.
[1] 429efb5086
Closes #11817
The vector(initializer_list<T>) constructor copies the T since
initializer_list is read-only. Move the mutation instead.
This happens to fix a use-after-return on clang 15 on aarch64.
I'm fairly sure that's a miscompile, but the fix is worthwhile
regardless.
Closes#11818
This PR adds some unit tests for the `expr::evaluate()` function.
At first I wanted to add the unit tests as part of #11658, but their size grew and grew, until I decided that they deserve their own pull request.
I found a few places where I think it would be better to behave in a different way, but nothing serious.
Closes#11815
* github.com:scylladb/scylladb:
test/boost: move expr_test_utils.hh to .hh and .cc in test/lib
cql3: expr: Add unit tests for bind_variable validation of collections
cql3: expr: Add test for subscripted list and map
cql3: expr: Add test for usertype_constructor
cql3: expr: Add test for tuple_constructor
cql3: expr: Add tests for evaluation of collection constructors
cql3: expr: Add tests for evaluation of column_values and bind_variables
cql3: expr: Add constant evaluation tests
test/boost: Add expr_test_utils.hh
cql3: Add ostream operator for raw_value
cql3: add is_empty_value() to raw_value and raw_value_view
expr_test_utils.hh was a header file with helper methods for
expression tests. All functions were inline, because I didn't
know how to create and link a .cc file in test/boost.
Now the header is split into expr_test_utils.hh and expr_test_utils.cc
and moved to test/lib, which is designed to keep this kind of file.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
"
Snitch was the junction of several services' deps because it was the
holder of endpoint->dc/rack mappings. Now this information is all on
topology object, so snitch can be finally made main-local
"
* 'br-deglobalize-snitch' of https://github.com/xemul/scylla:
code: Deglobalize snitch
tests: Get local reference on global snitch instance once
gossiper: Pass current snitch name into checker
snitch: Add sharded<snitch_ptr> arg to reset_snitch()
api: Move update_snitch endpoint
api: Use local snitch reference
api: Unset snitch endpoints on stop
storage_service: Keep local snitch reference
system_keyspace: Don't use global snitch instance
snitch: Add const snitch_ptr::operator->()
These cannot be meaningfully defined for a vector value like resources.
To prevent instinctive misuse, remove them. Operator bool is replaced
with `non_zero()`, which hopefully better expresses what to expect.
The comparison operator is just removed and inlined into its only user,
which actually helps said user's readability.
Closes#11813
Evaluating a bind variable should validate collection values.
Test that bound collection values are validated,
even in case of a nested collection.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Test that evaluate(tuple_constructor) works
as expected.
It was necessary to implement a custom function
for serializing tuples, because some tests
require the tuple to contain unset_value
or an empty value, which is impossible
to express using the existing code.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Test that evaluate(collection_constructor) works as expected.
Added a bunch of utility methods for creating
collection values to expr_test_utils.hh.
I was forced to write custom serialization of
collections. I tried to use data_value,
but it doesn't allow expressing unset_value
and empty values.
The custom serialization isn't actually used
in this specific commit, but it's needed
in the following ones.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
All uses of snitch now have their own local reference. The global
instance can now be replaced with the one living in main (and tests).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Some tests actively use global snitch instance. This patch makes each
test get a local reference and use it everywhere. The next patch will
replace the global instance with the local one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Gossiper makes sure the local snitch name is the same as the one on other
nodes in the ring. It currently uses the global snitch to get the name;
this patch passes the name as an argument, because the caller
(storage_service) has a local snitch instance reference.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method replaces the snitch instance on the existing sharded<snitch_ptr>,
and the "existing" one is nowadays the global instance. This patch changes
it to use a local reference passed from the API code.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It currently lives in storage_service.cc, but a non-global snitch is
available in endpoint_snitch.cc, so move the endpoint handler there.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The snitch/name endpoint needs a snitch instance to get the name from.
The storage_service/reset_snitch endpoint will also need a snitch
instance to call reset on.
This patch carries a local snitch reference all the way through API setup
and patches the get_name() call. The reset_snitch() change will come in
the next patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Soon the snitch API handlers will operate on a captured local snitch
reference, so those need to be unset before the target local
variable goes away.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Storage service uses snitch in several places:
- boot
- snitch-reconfigured subscription
- preferred IP reconnection
At this point it's worth adding an explicit storage_service->snitch
dependency and patching the above to use a local reference.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two places to patch: .start() and .setup(), and both only need
the snitch to get the local dc/rack from, nothing more. Thus both can
live with the explicit argument for now.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Until now, authentication in alternator served only two purposes:
- refusing clients without proper credentials
- printing user information with logs
After this series, this user information is passed to lower layers, which also means that users are capable of attaching service levels to roles, and this service level configuration will be effective with alternator requests.
tests: manually by adding more debug logs and inspecting that per-service-level timeout value was properly applied for an authenticated alternator user
Fixes #11379
Closes #11380
* github.com:scylladb/scylladb:
alternator: propagate authenticated user in client state
client_state: add internal constructor with auth_service
alternator: pass auth_service and sl_controller to server
This reverts commit df8e1da8b2, reversing
changes made to 4ff204c028. It causes
a crash in debug mode on aarch64 (likely a coroutine miscompile).
Fixes#11809.
The return from DescribeTable which describes GSIs and LSIs is missing
the Projection field. We do not yet support all the settings Projection
(see #5036), but the default which we support is ALL, and DescribeTable
should return that in its description.
Fixes #11470
Closes #11693
To prevent stalls due to a large number of tables.
Fixes scylladb/scylladb#11574
Closes#11689
* github.com:scylladb/scylladb:
schema_tables: merge_tables_and_views reindent
schema_tables: limit parallelism
Allows changing the total or initial resources the semaphore has. After calling `set_resources()` the semaphore will look as if it had been created with the specified amount of resources.
Use the new method in `replica::database::revert_initial_system_read_concurrency_boost()` so it doesn't lead to strange semaphore diagnostics output. Currently the system semaphore has 90/100 count units when there are no reads against it, which has led to some confusion.
I also plan on using the new facility in enterprise.
Closes#11772
* github.com:scylladb/scylladb:
replica/database: revert initial boost to system semaphore with set_resources()
reader_concurrency_semaphore: add set_resources()
When moving a SSTable from staging to base dir, we reused the generation
under the assumption that no SSTable in base dir uses that same
generation. But that's not always true.
When reshaping staging dir, reshape compaction can pick a generation
taken by a SSTable in base dir. That's because staging dir is populated
first and it doesn't have awareness of generations in base dir yet.
When that happens, view building will fail to move SSTable in staging
which shares the same generation as another in base dir.
We could have played with the order of population, populating the base dir
first and then the staging dir, but the fragility wouldn't be gone; not
future-proof at all.
We can easily make this safe by picking a new generation for the SSTable
being moved from staging, making sure no clash will ever happen.
Fixes#11789.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11790
This patch adds a test reproducing issue #11801, and confirming that
the previous patch fixed it. Before the previous patch, the test passed
on DynamoDB but failed on Alternator.
The patch also adds four more passing tests which demonstrate that
issue #11801 only happened in the very specific case where:
1. A GSI has two key attributes which weren't key attributes in the
base, and
2. An update sets the second of those attributes to the same value
which it already had.
This bug was originally discovered and explained by @fee-mendes.
Refs #11801.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).
Fixes scylladb/scylladb#11793
Note: table_population_metadata::start_subdir is called
in a seastar thread to facilitate backporting to old versions
that do not support coroutines yet.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When a materialized view has a key (in Alternator, this can be two
keys) which was a regular column in the base table, and a base
update modifies that regular column, there are two distinct cases:
1. If the old and new key values are different, we need to delete the
old view row, and create a new view row (with the different key).
2. If the old and new key values are the same, we just need to update
the pre-existing row.
It's important not to confuse the two cases: If we try to delete and
create the *same* view row in the same timestamp, the result will be
that the row will be deleted (a tombstone wins over data if they have the
same timestamp) instead of updated. This is what we saw in issue #11801.
We had a bug that was seen when an update set the view key column to
the old value it already had: To compare the old and new key values
we used the function compare_atomic_cell_for_merge(), but this compared
not just the values but also, incorrectly, the metadata such as
the timestamp. Because setting a column to the same value changes its
timestamp, we wrongly concluded that these were different view keys
and used the delete-and-create code for this case, resulting in the
view row being deleted (as explained above).
The simple fix is to compare just the key values - not looking at
the metadata.
See tests reproducing this bug and confirming its fix in the next patch.
Fixes#11801
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The replace_entry() function is nothing more than a convenience for
calling delete_old_entry() and then create_entry(). But it is only used
once in the code, and we can just open-code the two calls instead of
the one.
The reason I want to change it now is that the shortcut replace_entry()
helped hide a bug (#11801) - replace_entry() works incorrectly if the
old and new row have the same key, because if they do we get a deletion
and creation of the same row with the same timestamp - and the deletion
wins. Having the two calls not hidden by a convenience function makes
this potential problem more apparent.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add tests which test that evaluate(column_value)
and evaluate(bind_variable) work as expected.
Values of columns and bind variables are
kept in evaluation_inputs, so we need to mock
them in order for evaluate() to work.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Add unit test for evaluating expr::constant values.
evaluate(constant) just returns constant.value,
so there is no point in trying all the possible combinations.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
It was supposed to be a fix for #2455, but eventually it turned out that #11792 blocks this progress and takes more effort. So for now only a couple of small improvements (so as not to lose them by chance).
Closes#11794
* github.com:scylladb/scylladb:
scylla-gdb: Make regions iterable object
scylla-gdb: Dont print 0x0x
"
There's an ongoing effort to move the endpoint -> {dc/rack} mappings
from snitch onto topology object and this set finalizes it. After it the
snitch service stops depending on gossiper and system keyspace and is
ready for de-globalization. As a nice side-effect the system keyspace no
longer needs to maintain the dc/rack info cache and its starting code gets
relaxed.
refs: #2737
refs: #2795
"
* 'br-snitch-dont-mess-with-topology-data-2' of https://github.com/xemul/scylla: (23 commits)
system_keyspace: Dont maintain dc/rack cache
system_keyspace: Indentation fix after previous patch
system_keyspace: Coroutinize build_dc_rack_info()
topology: Move all post-configuration to topology::config
snitch: Start early
gossiper: Do not export system keyspace
snitch: Remove gossiper reference
snitch: Mark get_datacenter/_rack methods const
snitch: Drop some dead dependency knots
snitch, code: Make get_datacenter() report local dc only
snitch, code: Make get_rack() report local rack only
storage_service: Populate pending endpoint in on_alive()
code: Populate pending locations
topology: Put local dc/rack on topology early
topology: Add pending locations collection
topology: Make get_location() errors more verbose
token_metadata: Add config, spread everywhere
token_metadata: Hide token_metadata_impl copy constructor
gossiper: Remove messaging service getter
snitch: Get local address to gossip via config
...
Add a header file which will contain
utilities for writing expression tests.
For now it contains simple functions
like make_int_constant(), but there
are many more to come.
I feel like it's cleaner to put
all these functions in a separate
file instead of having them spread randomly
between tests.
It also enables code reuse so that
future expression tests can reuse
these functions instead of writing
them from scratch.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
It's possible to print raw_value_view,
but not raw_value. It would be useful
to be able to print both.
Implement printing raw_value
by creating raw_value_view from it
and printing the view.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
An empty value is a value that is neither null nor unset,
but has 0 bytes of data.
Such values can be created by the user using
certain CQL functions, for example an empty int
value can be inserted using blobasint(0x).
Add a method to raw_value and raw_value_view
which allows checking whether the value is empty.
This will be used in many places in which
we need to validate that a value isn't empty.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
It used to be used to abort repair_info by the corresponding node-ops
uuid, but this code is no longer there, so it's good to drop the uuid as
well
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When node_ops_meta_data aborts it also kicks repair to find and abort
all relevant repair_infos. Now it can be simplified by subscribing
repair_meta on the abort source and aborting it without explicit kick
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patches will need to subscribe on node_ops_meta_data's abort source
inside repair code, so keep the pointer on node_ops_info too. At the
same time, the node_ops_info::abort becomes obsolete, because the same
check can be performed via the abort_source->abort_requested()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patches will need to know more than the ops_uuid. The needed info
is (well -- will be) sitting on node_ops_info, so pass it along
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patch will call it inside abort_source subscription callback which
requires the calling code to be noexcept
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
A short cleanup "while at it" -- the node_ops_meta_data doesn't need to
carry dedicated _aborted boolean -- the abort source that sets it is
available instantly
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The semaphore currently has two admission paths: the
obtain_permit()/with_permit() methods which admits permits on user
request (the front door) and the maybe_admit_waiters() which admits
permits based on internal events like memory resource being returned
(the back door). The two paths used their own admission conditions
and naturally this means that they diverged in time. Notably,
maybe_admit_waiters() did not look at inactive readers, assuming that if
there are waiters there cannot be inactive readers. This is no longer true,
however, since we merged the execution-stage into the semaphore. Waiters
can queue up even when there are inactive reads and thus
maybe_admit_waiters() has to consider evicting some of them to see if
this would allow for admitting new reads.
To avoid such divergence in the future, the admission logic was moved
into a new method can_admit_read() which is now shared between the two
method families. This method now checks for the possibility of evicting
inactive readers as well.
The admission logic was tuned slightly to only consider evicting
inactive readers if there is a real possibility that this will result
in admissions: notably, before this patch, resource availability was
checked before stalls were (used permits == blocked permits), so we
could evict readers even if this couldn't help.
Because now eviction can be started from maybe_admit_waiters(), which is
also downstream from eviction, we added a flag to avoid recursive
evict -> maybe admit -> evict ... loops.
Fixes: #11770
Closes #11784
The loop terminates when we run out of keys. There are extra
conditions such as for short read or page limit, but these are
truly discovered during the loop and qualify as special
conditions, if you squint enough.
It was just a crutch for do_with(), and now can be replaced with
ordinary coroutine-protected variables. The member names were renamed
to the final names they were assigned within the do_with().
Indentation and "infinite" for-loop left for later cleanup.
Note the last check for a utils::result<> failure is no longer needed,
since the previous checks for failure resulted in an immediate co_return
rather than propagating the failure into a variable as with continuations.
The lambda coroutine is stabilized with the new seastar::coroutine::lambda
facility.
This series is a step towards non-LRU cache algorithms.
Our cache items are able to unlink themselves from the LRU list. (In other words, they can be unlinked solely via a pointer to the item, without access to the containing list head). Some places in the code make use of that, e.g. by relying on auto-unlink of items in their destructor.
However, to implement algorithms smarter than LRU, we might want to update some cache-wide metadata on item removal. But any cache-wide structures are unreachable through an item pointer, since items only have access to themselves and their immediate neighbours. Therefore, we don't want items to unlink themselves — we want `cache.remove(item)`, rather than `item.remove_self()`, because the former can update the metadata in `cache`.
This series inserts explicit item unlink calls in places that were previously relying on destructors, gets rid of other self-unlinks, and adds an assert which ensures that every item is explicitly unlinked before destruction.
Closes#11716
* github.com:scylladb/scylladb:
utils: lru: assert that evictables are unlinked before destruction
utils: lru: remove unlink_from_lru()
cache: make all cache unlinks explicit
Previous patches introduce the assumption that evictables are manually unlinked
before destruction, to allow for correct bookkeeping within the cache.
This assert assures that this assumption is correct.
This is particularly important because the switch from automatic to explicit
unlinking had to be done manually. Destructor calls are invisible, so it's
possible that we have missed some automatic destruction site.
unlink_from_lru() allowed unlinking elements from the LRU without notifying
the cache. This messes up any potential cache bookkeeping.
Fix that by replacing all uses of unlink_from_lru() with calls to
lru::remove(), which does have access to the cache's metadata.
Our LSA cache is implemented as an auto_unlink Boost intrusive list, meaning
that elements of the list unlink themselves from the list automatically on
destruction. Some parts of the code rely on that, and don't unlink them
manually.
However, this precludes accurate bookkeeping about the cache. Elements only have
access to themselves and their neighbours, not to any bookkeeping context.
Therefore, a destructor cannot update the relevant metadata.
In this patch, we fix this by adding explicit unlink calls to places where it
would be done by a destructor. In a following patch, we will add an assert to
the destructor to check that every element is unlinked before destruction.
This patch has two reproducing tests for issue #7432, which are cases
where a paged query with a restriction backed by a secondary-index
returns pages larger than the desired page size. Because these tests
reproduce a still-open bug, they are both marked "xfail". Both tests
pass on Cassandra.
The two tests involve quite dissimilar cases - one involves requesting
an entire partition (and Scylla forgetting to page through it), and the
other involves GROUP BY - so I am not sure these two bugs even have the
same underlying cause. But they were both reported in #7432, so let's
have reproducers for both.
Refs #7432
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11586
Unlike the current method (which uses consume()), this will also adjust the
initial resources, adjusting the semaphore as if it was created with the
reduced amount of resources in the first place. This fixes the confusing
90/100 count resources seen in diagnostics dump outputs.
Allowing to change the total or initial resources the semaphore has.
After calling `set_resources()` the semaphore will look as if it had
been created with the specified amount of resources in the first place.
Modern (as of Fedora 37) pytest has the "-sP" flags in the Python command
line, as found in /usr/bin/pytest. This means it will reject the
site-packages directory, where we install the Scylla Python driver. This
causes all the tests to fail.
Work around it by supplying an alternative pytest script that does not
have this change.
Closes#11764
The tests in test_permissions.py use the new_session() utility function
to create a new connection with a different logged-in user.
It models the new connection on the existing one, but incorrectly
assumed that the connection is NOT ssl. This made this test fail
when cql-pytest/run is passed the "--ssl" option.
In this patch we correctly infer the is_ssl state from the existing
cql fixture, instead of assuming it is false. After this patch,
"cql-pytest/run --ssl" works as expected for this test.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11742
The test verifies that a row which participated in earlier merge, and
its cells lost on the timestamp check, behaves exactly like an empty
row and can accept any mutation.
This wasn't the case in versions prior to f006acc.
Closes#11787
It turned out that the boost/tracing test is not run because its name doesn't match the *_test.cc pattern. While fixing it, it turned out that the test cannot even start, because it uses future<>.get() calls outside of seastar::thread context. While patching this place, the trace-backend registry was removed for simplicity. And, while at it, a few more cleanups.
Closes#11779
* github.com:scylladb/scylladb:
tracing: Wire tracing test back
tracing: Indentation fix after previous patch
tracing: Move test into thread
tracing: Dismantle trace-backend registry
tracing: Use class-registrator for backends
tracing: Add constraint to trace_state::begin()
tracing: Remove copy-n-paste comments from test
tracing: Outline may_create_new_session
Fix https://github.com/scylladb/scylladb/issues/11773
This PR fixes the notes by removing repetition and improving the clarity of the notes on the OS Support page.
In addition, "Scylla" was replaced with "ScyllaDB" on related pages.
Closes#11783
* github.com:scylladb/scylladb:
doc: replace Scylla with ScyllaDB
doc: add a comment to remove in future versions any information that refers to previous releases
doc: rewrite the notes to improve clarity
doc: remove the repetitions from the notes
Following up on 69aea59d97, which added fencing support
for simple reads and writes, this series does the same for the
complex ops:
- partition scan
- counter mutation
- paxos
With this done, the coordinator knows about all in-flight requests and
can delay topology changes until they are retired.
Closes#11296
* github.com:scylladb/scylladb:
storage_proxy: hold effective_replication_map for the duration of a paxos transaction
storage_proxy: move paxos_response_handler class to .cc file
storage_proxy: deinline paxos_response_handler constructor/destructor
storage_proxy: use consistent effective_replication_map for counter coordinator
storage_proxy: improve consistency in query_partition_key_range{,_concurrent}
storage_proxy: query_partition_key_range_concurrent: reduce smart pointer use
storage_proxy: query_partition_key_range_concurrent: improve token_metadata consistency
storage_proxy: query_singular: use fewer smart pointers
storage_proxy: query_singular: simplify lambda captures
locator: effective_replication_map: provide non-smart-pointer accessor to token_metadata
storage_proxy: use consistent token_metadata with rest of singular read
A collection of small cleanups, and a bug fix.
Closes#11750
* github.com:scylladb/scylladb:
dirty_memory_manager: move region_group data members to top-of-class
dirty_memory_manager: update region_group comment
dirty_memory_manager: remove outdated friend
dirty_memory_manager: fold region_group::push_back() into its caller
dirty_memory_manager: simplify blocked calculation in region_group::run_when_memory_available
dirty_memory_manager: remove unneeded local from region_group::run_when_memory_is_available
dirty_memory_manager: tidy up region_group::execution_permitted()
dirty_memory_manager: reindent region_group::release_queued_allocations()
dirty_memory_manager: convert region_group::release_queued_allocations() to a coroutine
dirty_memory_manager: move region_group::_releaser after _shutdown_requested
dirty_memory_manager: move region_group queued allocation releasing into a function
dirty_memory_manager: fold allocation_queue into region_group
dirty_memory_manager: don't ignore timeout in allocation_queue::push_back()
The `raft_address_map` code was "clever": it used two intrusive data structures and did a lot of manual lifetime management; raw pointer manipulation, manual deletion of objects... It wasn't clear who owns which object, who is responsible for deleting what. And there was a lot of code.
In this PR we replace one of the intrusive data structures with a good old `std::unordered_map` and make ownership clear by replacing the raw pointers with `std::unique_ptr`. Furthermore, some invariants which were not clear and enforced in runtime are now encoded in the type system.
The code also became shorter: we reduced its length from ~360 LOC to ~260 LOC.
Closes#11763
* github.com:scylladb/scylladb:
service/raft: raft_address_map: get rid of `is_linked` checks
service/raft: raft_address_map: get rid of `to_list_iterator`
service/raft: raft_address_map: simplify ownership of `expiring_entry_ptr`
service/raft: raft_address_map: move _last_accessed field from timestamped_entry to expiring_entry_ptr
service/raft: raft_address_map: don't use intrusive set for timestamped entries
service/raft: raft_address_map: store reference to `timestamped_entry` in `expiring_entry_ptr`
The boost/tracing test is not run, because test.py boost suite collects
tests that match *_test.cc pattern. The tracing one apparently doesn't
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The test calls future<>.get()'s in its lambda which is only allowed in
seastar threads. It's not stepped upon because (surprise, surprise) this
test is not run at all. Next patch fixes it.
Meanwhile, the fix is to use the cql_env_thread helper for the whole
lambda, which runs it in a seastar::async() context.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently the code uses its own class registration engine, but there's a
generic one in utils/ that applies here too. In fact, the tracing
backend registry is just a transparent wrapper over the generic one :\
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's a private method used purely in tracing.cc; no need to compile it
every time the header is included somewhere else.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The owner of `expiring_entry_ptr` was almost uniquely its corresponding
`timestamped_entry`; it would delete the expiring entry when it itself got
destroyed. There was one call to explicit `unlink_and_dispose`, which
made the picture unclear.
Make the picture clear: `timestamped_entry` now contains a `unique_ptr`
to its `expiring_entry_ptr`. The `unlink_and_dispose` was replaced with
`_lru_entry = nullptr`.
We can also get rid of the back-reference from `expiring_entry_ptr` to
`timestamped_entry`.
The code becomes shorter and simpler.
If a removenode is run for a recently stopped node,
the gossiper may not yet know that the node is down,
and the removenode will fail with a Stream failed error
trying to stream data from that node.
In this patch we explicitly reject the removenode operation
if the gossiper still considers the leaving node to be up.
Closes#11704
This adds support for stripped binary installation in the relocatable package.
After this change, the scylla and unified packages only contain stripped binaries,
and a new "scylla-debuginfo" package is introduced for the debug symbols.
For the scylla-debuginfo package, the install.sh script will extract the debug symbols
to /opt/scylladb/<dir>/.debug.
Note that we need to keep an unstripped version of the relocatable package for rpm/deb,
since otherwise rpmbuild/debuild fails to create the debug symbol package.
This version is renamed to scylla-unstripped-$version-$release.$arch.tar.gz.
See #8918
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Closes#9005
- Start n1, n2, n3 (n3 = 127.0.0.3)
- Stop n3
- Change ip address of n3 to 127.0.0.33 and restart n3
- Decommission n3
- Start new node n4
The node n4 will learn from the gossip entry for 127.0.0.3 that node
127.0.0.3 is in shutdown status which means 127.0.0.3 is still part of
the ring.
This patch prevents this by checking the status for the host id on all
the entries. If any of the entries shows that the node with the host id is in
LEFT status, we refuse to put the node in NORMAL status.
Fixes#11355
Closes#11361
Requests like `col IN NULL` used to cause
an error - Invalid null value for column col.
We would like to allow NULLs everywhere.
When a NULL occurs on either side
of a binary operator, the whole operation
should just evaluate to NULL.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
Closes#11775
Luckily, all topology calculations are done in get_paxos_participants(),
so all we have to do is hold the effective_replication_map for the
duration of the transaction, and pass it to get_paxos_participants().
This ensures that the coordinator knows about all in-flight requests
and can fence them from topology changes.
Hold the effective_replication_map while talking to the counter leader,
to allow for fencing in the future. The code is somewhat awkward because
the API allows for multiple keyspaces to be in use.
The error code generation, already broken as it doesn't use the correct
table, continues to be broken in that it doesn't use the correct
effective_replication_map, for the same reason.
query_partition_key_range captures a token_metadata_ptr and uses
it consistently in sequential calls to query_partition_key_range_concurrent
(via tail recursion), but each invocation of
query_partition_key_range_concurrent captures its own
effective_replication_map_ptr. Since these are captured at different times,
they can be inconsistent after the first iteration.
Fix by capturing it once in the caller and propagating it everywhere.
Derive the token_metadata from the effective_replication_map rather than
getting it independently. Not a real bug since these were in the same
continuation, but safer this way.
Capture token_metadata by reference since we're protecting it with
the mighty effective_replication_map_ptr. This saves a few instructions
to manage smart pointers.
The lambdas in query_singular do not outlive the enclosing coroutine,
so they can capture everything by reference. This simplifies life
for a future update of the lambda, since there's one thing less to
worry about.
token_metadata is protected by holders of an effective_replication_map_ptr,
so it's just as safe and less expensive for them to obtain a reference
to token_metadata rather than a smart pointer, so give them that option with
a new accessor.
query_singular() uses get_token_metadata_ptr() and later, in
get_read_executor(), captures the effective_replication_map(). This
isn't a bug, since the two are captured in the same continuation and
are therefore consistent, but a way to ensure it stays so is to capture
the effective_replication_map earlier and derive the token_metadata from
it.
Nicer and faster. We have a rare case where we hold a lock for the duration
of a call but we don't want to hold it until the future it returns is
resolved, so we have to resort to a minor trick.
The function that is attached to _releaser depends on
_shutdown_requested. There is currently no use-before-init, since
the function (release_queued_allocations) starts with a yield(),
moving the first use to after the initialization.
Since I want to get rid of the yield, reorder the fields so that
they are initialized in the right order.
Today, we're completely blind about the progress of view updating
on Staging files. We don't know how long it will take, nor how much
progress we've made.
This patch adds visibility with a new metric that will inform
the number of bytes to be processed from Staging files.
Before any work is done, the metric tells us the total size to be
processed. As view updating progresses, the metric value is
expected to decrease, unless work is being produced faster than
we can consume it.
We're piggybacking on sstables::read_monitor, which allows the
progress metric to be updated whenever the SSTable reader makes
progress.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11751
std::experimental::source_location is provided by <experimental/source_location>,
not <source_location>. libstdc++ 12 insists, so change the header.
Closes#11766
The EC2 instance metadata service can be busy; let's retry connecting at an
interval, just like we do in scylla-machine-image.
Fixes#10250
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Closes#11688
`timestamped_entry` had two fields:
```
optional<clock_time_point> _last_accessed
expiring_entry_ptr* _lru_entry
```
The `raft_address_map` data structure maintained an invariant:
`_last_accessed` is set if and only if `_lru_entry` is not null.
This invariant could be broken for a while when constructing an expiring
`timestamped_entry`: the constructor was given an `expiring = true`
flag, which set the `_last_accessed` field; this was redundant, because
immediately afterwards a corresponding `expiring_entry_ptr` was constructed,
which again reset the `_last_accessed` field and set `_lru_entry`.
The code becomes simpler and shorter when we move `_last_accessed` field
into `expiring_entry_ptr`. The invariant is now guaranteed by the type
system: `_last_accessed` is no longer `optional`.
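A tiny sketch of the invariant-by-construction point (simplified, illustrative names; not the real types):

```cpp
#include <cassert>
#include <chrono>

// _last_accessed lives inside expiring_entry_ptr and is initialized by its
// constructor, so it can no longer be "unset while linked" -- a non-expiring
// entry simply has no expiring_entry_ptr at all.
struct expiring_entry_ptr_sketch {
    using clock = std::chrono::steady_clock;

    explicit expiring_entry_ptr_sketch(clock::time_point now)
        : _last_accessed(now) {} // not std::optional: always valid once constructed

    clock::time_point _last_accessed;
};
```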
Intrusive data structures are harder to reason about. In
`raft_address_map` there's a good reason to use an intrusive list for
storing `expiring_entry_ptr`s: we move the entries around in the list
(when their expiration times change) but we want the objects to stay
in place because `timestamped_entry`s may point to them (although we
could simply update the pointers using the existing back-reference...)
However, there's not much reason to store `timestamped_entry` in an
intrusive set. It was basically used in one place: when dropping expired
entries, we iterate over the list of `expiring_entry_ptr`s and we want
to drop the corresponding `timestamped_entry` as well, which is easy
when we have a pointer to the entry and it's a member of an intrusive
container. But we can deal with it when using non-intrusive containers:
just `find` the element in the container to erase it.
The code becomes shorter with this change.
I also use a map instead of a set because we need to modify the
`timestamped_entry` which wouldn't be possible if it was used as an
`unordered_set` key. In fact using a map here makes more sense: we were
using the intrusive set similarly to a map anyway, because all lookups
were performed using the `_id` field of `timestamped_entry` (the
field has now been moved outside the struct and is used as the map's key).
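An illustrative sketch of the container change, with simplified stand-in types: the `_id` field becomes the map key, lookups become plain map operations, and dropping an expired entry is a find + erase rather than unlinking an intrusive hook.

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

using server_id = int; // stands in for the real id type

struct timestamped_entry_sketch {
    std::string address;
};

class entries_map_sketch {
    std::unordered_map<server_id, timestamped_entry_sketch> _entries;
public:
    void set(server_id id, std::string addr) {
        _entries[id].address = std::move(addr); // mapped values stay mutable
    }
    const timestamped_entry_sketch* find(server_id id) const {
        auto it = _entries.find(id);
        return it == _entries.end() ? nullptr : &it->second;
    }
    // Called while iterating the expiring_entry_ptr list to drop the
    // corresponding entry -- just an erase on a non-intrusive container.
    void drop(server_id id) {
        _entries.erase(id);
    }
};
```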
When code was moved to the new directory, a bug was reintroduced with an `ssl` local variable hiding the `ssl` module. Fix again.
Closes#11755
* github.com:scylladb/scylladb:
test.py: improve pylint score for conftest
test.py: fix variable name collision with ssl
fmt 9 deprecates automatic fallback to std::ostream formatting.
We should migrate, but in order to do so incrementally, first enable
the deprecated fallback so the code continues to compile.
Closes#11768
Abseil is not under our control, so if a header generates a
warning, we can do nothing about it. So far this wasn't a problem,
but under clang 15 it spews a harmless deprecation warning. Silence
the warning by treating the header as a system header (which it is,
for us).
Closes#11767
- Start a cluster with n1, n2, n3
- Full cluster shutdown n1, n2, n3
- Start n1, n2 and keep n3 as shutdown
- Add n4
Node n4 will learn the ip and uuid of n3 but it does not know the gossip
status of n3 since gossip status is published only by the node itself.
After full cluster shutdown, gossip status of n3 will not be present
until n3 is restarted again. So n4 will not think n3 is part of the
ring.
In this case, it is better to reject the bootstrap.
With this patch, one would see the following when adding n4:
```
ERROR 2022-09-01 13:53:14,480 [shard 0] init - Startup failed:
std::runtime_error (Node 127.0.0.3 has gossip status=UNKNOWN. Try fixing it
before adding new node to the cluster.)
```
The user needs to perform either of the following before adding a new node:
1) Run nodetool removenode to remove n3
2) Restart n3 to get it back to the cluster
Fixes#6088
Closes#11425
Refactor the existing upgrade tests, extracting some common functionality to
helper functions.
Add more tests. They are checking the upgrade procedure and recovery from
failure in scenarios like when a node fails causing the procedure to get stuck
or when we lose a majority in a fully upgraded cluster.
Add some new functionalities to `ScyllaRESTAPIClient` like injecting errors and
obtaining gossip generation numbers.
Extend the removenode function to allow ignoring dead nodes.
Improve checking for CQL availability when starting nodes to speed up testing.
Closes#11725
* github.com:scylladb/scylladb:
test/topology_raft_disabled: more Raft upgrade tests
test/topology_raft_disabled: refactor `test_raft_upgrade`
test/pylib: scylla_cluster: pass a list of ignored nodes to removenode
test/pylib: rest_client: propagate errors from put_json
test/pylib: fix some type hints
test/pylib: scylla_cluster: don't create and drop keyspaces to check if cql is up
In preparation for supporting IP address changes of Raft Group 0:
1) Always use start_server_for_group0() to start a server for group 0.
This will provide a single extension point when it's necessary to
prompt raft_address_map with gossip data.
2) Don't use raft::server_address in discovery, since going forward
discovery won't store raft::server_address. By the same token, stop
using discovery::peer_set anywhere outside discovery (for persistence);
use a peer_list instead, which is easier to marshal.
Closes#11676
* github.com:scylladb/scylladb:
raft: (discovery) do not use raft::server_address to carry IP data
raft: (group0) API refactoring to avoid raft::server_address
raft: rename group0_upgrade.hh to group0_fwd.hh
raft: (group0) move the code around
raft: (discovery) persist a list of discovered peers, not a set
raft: (group0) always start group0 using start_server_for_group0()
- Start n1, n2, n3
- Apply network nemesis as below:
+ Block gossip traffic going from nodes 1 and 2 to node 3.
+ All the other rpc traffic flows normally, including gossip traffic
from node 3 to nodes 1 and 2 and responses to node_ops commands from
nodes 1 and 2 to node 3.
- Decommission n3
Currently, the decommission will be successful because all the network
traffic is ok. But n3 could not advertise status STATUS_LEFT to the rest
of the cluster due to the network nemesis applied. As a result, n1 and
n2 could not move n3 from STATUS_LEAVING to STATUS_LEFT, so n3 will
stay in DL forever.
The reason the node stays in DL forever is that, with node_ops_cmd
based node operations, we still rely on the STATUS_LEFT gossip status
from the node being decommissioned to notify the other nodes that
this node has finished decommissioning and can be moved from STATUS_LEAVING
to STATUS_LEFT.
This patch fixes the issue by checking gossip liveness before running the
decommission, and rejecting it if required peer nodes are down.
With the fix, the decommission of n3 will fail like this:
$ nodetool decommission -p 7300
nodetool: Scylla API server HTTP POST to URL
'/storage_service/decommission' failed: std::runtime_error
(decommission[adb3950e-a937-4424-9bc9-6a75d880f23d]: Rejected
decommission operation, removing node=127.0.0.3, sync_nodes=[127.0.0.2,
127.0.0.3, 127.0.0.1], ignore_nodes=[], nodes_down={127.0.0.1})
Fixes#11302
Closes#11362
"
There's one via the database's compaction manager and large data handler
sub-services. Both need system keyspace to put their info into, but the
latter needs database naturally via query_processor->storage_proxy link.
The solution is to make the c.m. | l.d.h. -> sys.ks. dependencies weak with
the help of shared_from_this(), described in detail in the patch #2 commit
message.
As a (not-that-)side effect this set removes a bunch of global qctx
calls.
refs: #11684 (this set seems to increase the chance of stepping on it)
"
* 'br-sysks-async-users' of https://github.com/xemul/scylla:
large_data_handler: Use local system_keyspace to update entries
system_keyspace: De-static compaction history update
compaction_manager: Relax history paths
database: Plug/unplug system_keyspace
system_keyspace: Add .shutdown() method
This patch adds a couple of simple tests for the USE statement: that
without USE one cannot create a table without explicitly specifying
a keyspace name, and with USE, it is possible.
Beyond testing these specific features, this patch also serves as an
example of how to write more tests that need to control the effective USE
setting. Specifically, it adds a "new_cql" function that can be used to
create a new connection with a fresh USE setting. This is necessary
in such tests, because if multiple tests use the same cql fixture
and its single connection, they will share their USE setting and there
is no way to undo or reset it after being set.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11741
Some good news finally. The saved dc/rack info about the ring is now
only loaded once on start. So the whole cache is not needed and the
loading code in storage_service can be greatly simplified
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Because of snitch ex-dependencies some bits on topology were initialized
with nasty post-start calls. Now it all can be removed and the initial
topology information can be provided by topology::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Snitch code doesn't need anything to start working, but it is needed by
the low-level token-metadata, so move the snitch to start early (and to
stop late)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
No users of it are left. Although the gossiper->system_keyspace dependency is
not needed either, keep it alive, because the gossiper still updates the system
keyspace with feature masks, so chances are it will be reactivated some
time later.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It doesn't need gossiper any longer. This change will allow starting
snitch early by the next patch, and eventually improving the
token-metadata start-up sequence
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
They are in fact such, but weren't marked as const before because they
needed to call non-const gossiper and system_keyspace methods and
updated the snitch's internal caches.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
After previous patches and merged branches snitch no longer needs its
method that gets dc/rack for endpoints from gossiper, system keyspace
and its internal caches.
This cuts the last, and biggest, snitch->gossiper dependency.
Also this removes implicit snitch->system_keyspace dependency loop
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The continuation of the previous patch -- all the code uses
topology::get_datacenter(endpoint) to get peers' dc string. The topology
still uses snitch for that, but it already contains the needed data.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
All the code out there now calls snitch::get_rack() to get rack for the
local node. For other nodes the topology::get_rack(endpoint) is used.
Since now the topology is properly populated with endpoints, it can
finally be patched to stop using snitch and get rack from its internal
collections
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
A special-purpose add-on to the previous patch.
When messaging service accepts a new connection it sometimes may want to
drop it early based on whether the client is from the same dc/rack or
not. However, at this stage the information might not yet have had a
chance to spread via the storage service's pending-tokens update paths,
so here's one more place to provide it -- the on_alive() callback.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Previous patches added the concept of pending endpoints in the topology,
this patch populates endpoints in this state.
Also, set_pending_ranges() is patched to make sure that the tokens
added for the endpoint(s) are added for something that's known by the
topology. The same check exists in update_normal_tokens().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Startup code needs to know the dc/rack of the local node early, way
before the node starts any communication with the ring. This information is
available when snitch activates, but it starts _after_ token-metadata,
so the only way to put local dc/rack in topology is via a startup-time
special API call. This new init_local_endpoint() is temporary and will
be removed later in this set
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Nowadays the topology object only keeps info about nodes that are normal
members of the ring. Nodes that are joining or bootstrapping or leaving
are out of it. However, one of the goals of this patchset is to make
topology object provide dc/rack info for _all_ nodes, even those in
transitive state.
The introduced _pending_locations is about to hold the dc/rack info for
transitive endpoints. When a node becomes a member of the ring it is moved
from pending (if it's there) to current locations; when it leaves the
ring it's moved back to pending.
For now the new collection is just added and the add/remove/get API is
extended to maintain it, but it's not really populated. It will come in
the next patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently if topology.get_location() doesn't find an entry in its
collection(s) it throws a standard out-of-range exception, which is very
hard to debug.
Also, next patches will extend this method; the
`if (_current_locations.contains())` introduced here makes this future change look nicer.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patches will need to provide some early-start data for topology.
The standard way of doing it is via service config, so this patch adds
one. The new config is empty in this patch, to be filled later
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Copying of token_metadata_impl is a heavy operation and it's performed
internally with the help of the dedicated clone_async() method. This
method, in turn, doesn't copy the whole object in its copy-ctor, but
rather default-initializes it and carries the remaining fields over later.
Having said that, the standard copy-ctor is better made private
and, for the sake of being more explicit, marked as a shallow-copy-ctor.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
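A minimal, self-contained sketch of this pattern (class and field names are illustrative, not the real token_metadata_impl):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// The copy constructor is a cheap "shallow" copy of trivial fields only and
// is private, so external code must go through clone_async(), which
// default-initializes the copy and then carries the heavy collections over
// piecemeal (the real code yields between chunks; the loop stands in for that).
class token_metadata_impl_sketch {
    long _version = 0;         // cheap field, copied by the shallow copy-ctor
    std::vector<long> _tokens; // heavy state, copied by clone_async()

    token_metadata_impl_sketch(const token_metadata_impl_sketch& o)
        : _version(o._version) {} // shallow: deliberately skips _tokens
public:
    token_metadata_impl_sketch() = default;
    void add_token(long t) { _tokens.push_back(t); }
    std::size_t size() const { return _tokens.size(); }

    token_metadata_impl_sketch clone_async() const {
        token_metadata_impl_sketch copy(*this); // shallow part
        for (long t : _tokens) {                // chunked copy of heavy state
            copy._tokens.push_back(t);
        }
        return copy;
    }
};
```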
The property-file snitch gossips listen_address as internal-IP state.
It obtains this value via the snitch->gossiper->messaging_service
chain. This change provides the needed value via config, thus cutting yet
another snitch->gossiper dependency and allowing gossiper not to export
messaging service in the future
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
No functional changes, just keep some conditions from if()s as local
variables. This is the churn-reducing preparation for one of the
next patches.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When an endpoint is not in the ring, the snitch get_{rack|datacenter} API
still returns some value. The value is, in fact, the default one,
because this is how snitch resolves it -- when it cannot find a node in
gossiper and system keyspace it just returns defaults.
When this happens the API should better return some error (bad param?)
but there's a bug in nodetool -- when the 'status' command collects info
about the ring it first collects the endpoints, then gets status for
each. If between getting an endpoint and getting its status the endpoint
disappears, the API would fail, but nodetool doesn't handle it.
Next patches will make .get_rack/_dc calls use in-topology collections
that don't fall back to default values if the entry is not found, so
prepare the API in advance to return defaults.
refs: #11706
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
We plan to remove IP information from Raft addresses.
raft::server_address is used in Raft configuration and
also in discovery, which is a separate algorithm, as a handy data
structure, to avoid having new entities in RPC.
Since we plan to remove IP addresses from Raft configuration,
using raft::server_address in discovery and still storing
IPs in it would create ambiguity: in some uses raft::server_address
would store an IP, and in others it would not.
So switch to an own data structure for the purposes of discovery,
discovery_peer, which contains a pair ip, raft server id.
Note to reviewers: ideally we should switch to URIs
in discovery_peer right away. Otherwise we may have to
deal with incompatible changes in discovery when adding URI
support to Scylla.
The l._d._h.'s way to update system keyspace is not like in other code.
Instead of a dedicated helper on the system_keyspace's side it executes
the insertion query directly with the help of qctx.
Now when the l._d._h. has the weak system keyspace reference it can
execute queries on _it_ rather than on the qctx.
Just like in the previous patch, it needs to keep the sys._k.s. weak
reference alive until the query's future resolves.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Compaction manager now has the weak reference on the system keyspace
object and can use it to update its stats. It only needs to take care
to keep the shared pointer until the respective future resolves.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a virtual method on table_state to update the entry in the system
keyspace. It's overkill just to facilitate tests that don't want this.
With the new system_keyspace weak referencing it can be made simpler by
moving the update call to the compaction_manager itself.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a circular dependency between system_keyspace and database. The
former needs the latter because it needs to execute local requests via
query_processor. The latter needs the former via the compaction manager and
large data handler: database depends on both, and these two need to
insert their entries into the system keyspace.
To cut this loop, the compaction manager and large data handler both get
a weak reference on the system keyspace. Once the system keyspace starts, it
activates this reference via a database call. When the system keyspace is
shut down on stop, it deactivates the reference.
Technically, the weak reference is implemented by marking the system_k.s.
object as async_sharded_service, and the "reference" in question is the
shared_from_this() pointer. When the compaction manager or large data
handler needs to update a system keyspace table, it holds an
extra reference on the system keyspace until the entry is committed,
thus making sure that the sys._k.s. doesn't stop from under its feet. At
the same time, unplugging the reference on shutdown makes sure that no
new entry updates will appear and the system_k.s. will eventually be
released.
It's not a classical C++ reference, because system_keyspace starts after
and stops before database.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
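As a rough illustration of this scheme, here is a sketch using std::enable_shared_from_this in place of seastar's async_sharded_service, and a stored callable in place of a future; all names here are invented stand-ins:

```cpp
#include <cassert>
#include <functional>
#include <memory>

// The updater keeps an extra shared_ptr for the duration of the in-flight
// update, so the keyspace object cannot be destroyed from under it;
// unplugging the pointer on shutdown stops any new updates.
struct system_keyspace_sketch
    : std::enable_shared_from_this<system_keyspace_sketch> {
    int entries = 0;
};

struct compaction_manager_sketch {
    std::shared_ptr<system_keyspace_sketch> _sys_ks; // the plugged reference

    void plug(std::shared_ptr<system_keyspace_sketch> ks) { _sys_ks = std::move(ks); }
    void unplug() { _sys_ks = nullptr; }

    // Returns the deferred update; capturing `ks` keeps the keyspace alive
    // until the "future" (here: the callable) has run and been dropped.
    std::function<void()> update_history() {
        auto ks = _sys_ks; // extra reference held across the async update
        if (!ks) {
            return [] {};  // already shut down: drop the update
        }
        return [ks] { ks->entries++; };
    }
};
```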
Replace raft::server_address in a few raft_group0 API
calls with raft::server_id.
These API calls do not need raft::server_address, i.e. the
address part, anyway, and since going forward raft::server_address
will not contain the IP address, stop using it in these calls.
This is a beginning of a multi-patch series to reduce
raft::server_address usage to core raft only.
Move load/store functions for discovered peers up,
since going forward they'll be used in start_server_for_group0(),
to extend the address map prior to start (and thus speed up
bootstrap).
We plan to reuse the discovery table to store the peers
after discovery is over, so the load/store API must be generalized
for use outside discovery. This includes sending
the list of persisted peers over to a new member of the cluster.
When IP addresses are removed from raft::configuration, it's key
to initialize raft_address_map with IP addresses before we start group
0. Best place to put this initialization is start_server_for_group0(),
so make sure all paths which create group 0 use
start_server_for_group0().
The tests are checking the upgrade procedure and recovery from failure
in scenarios like when a node fails causing the procedure to get stuck
or when we lose a majority in a fully upgraded cluster.
Added some new functionalities to `ScyllaRESTAPIClient` like injecting
errors and obtaining gossip generation numbers.
Many services out there have one (sometimes called .drain()) that's
called early on stop and that's responsible for preparing the service for
stop -- aborting pending/in-flight fibers and the like.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The `removenode` operation normally requires the removing node to
contact every node in the cluster except the one that is being removed.
But if more than 1 node is down it's possible to specify a list of nodes
to ignore for the operation; the `/storage_service/remove_node` endpoint
accepts an `ignore_nodes` param which is a comma-separated list of IPs.
Extend `ScyllaRESTAPIClient`, `ScyllaClusterManager` and `ManagerClient`
so it's possible to pass the list of ignored nodes.
We also modify the `/cluster/remove-node` Manager endpoint to use
`put_json` instead of `get_text` and pass all parameters except the
initiator IP (the IP of the node who coordinates the `removenode`
operation) through JSON. This simplifies the URL greatly (it was already
messy with 3 parameters) and more closely resembles Scylla's endpoint.
Change variable name to avoid collision with module ssl.
This bug was reintroduced when moving code.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
This method just jumps into topology.has_endpoint(). The change is
for consistency with other users of it and as a preparation for
future enhancements of topology.has_endpoint().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There is a flaw in how the raft rpc endpoints are
currently managed. The io_fiber in raft::server
is supposed to first add new servers to rpc, then
send all the messages and then remove the servers
which have been excluded from the configuration.
The problem is that the send_messages function
isn't synchronous, it schedules send_append_entries
to run after all the current requests to the
target server, which can happen
after we have already removed the server from address_map.
In this patch the remove_server function is changed to mark
the server_id as expiring rather than synchronously dropping it.
This means all currently scheduled requests to
that server will still be able to resolve
the ip address for that server_id.
Fixes: #11228
Closes#11748
It's nicer to see a function release_queued_allocations() in a stack
trace rather than start_releaser(), which has done its work during
initialization.
allocation_queue was extracted out of region_group in
71493c253 and 34d532236. But now that the region_group refactoring is
mostly done, we can move it back in. allocation_queue has just one
user and is not useful standalone.
In 34d5322368 ("dirty_memory_manager: move more allocation_queue
functions out of region_group") we accidentally started ignoring the
timeout parameter. Fix that.
No release branch has the breakage.
The `add_entry` and `modify_config` methods sometimes do an rpc to
execute the request on the current leader. If the tcp connection
was broken, a `seastar::rpc::closed_error` would be thrown to the client.
This exception was not documented in the method comments and the
client could have missed handling it. For example, this exception
was not handled when calling `modify_config` in `raft_group0`,
which sometimes broke the `removenode` command.
An `intermittent_connection_error` exception was added earlier to
solve a similar problem with the `read_barrier` method. In this patch it
is renamed to `transport_error`, as it seems to better describe the
situation, and an explicit specification for this exception
was added - the rpc implementation can throw it if it is not known
whether the call reached the destination and whether any mutations were made.
In case of `read_barrier` it does not matter and we just retry, in case
of `add_entry` and `modify_config` we cannot retry because of possible mutations,
so we convert this exception to `commit_status_unknown`, which
the client has to handle.
Explicit comments have also been added to `raft::server` methods
describing all possible exceptions.
Closes#11691
* github.com:scylladb/scylladb:
raft_group0: retry modify_config on commit_status_unknown
raft: convert raft::transport_error to raft::commit_status_unknown
modify_config can throw commit_status_unknown in case
of a leader change or when the leader is unavailable,
but the information about it has not yet reached the
current node. In this patch modify_config is run
again after some time in this case.
The add_entry and modify_config methods sometimes do an rpc to
execute the request on the current leader. If the tcp connection
was broken, a seastar::rpc::closed_error would be thrown to the client.
This exception was not documented in the method comments and the
client could have missed handling it. For example, this exception
was not handled when calling modify_config in raft_group0,
which sometimes broke the removenode command.
An intermittent_connection_error exception was added earlier to
solve a similar problem with the read_barrier method. In this patch it
is renamed to transport_error, as it seems to better describe the
situation, and an explicit specification for this exception
was added - the rpc implementation can throw it if it is not known
whether the call reached the target node and whether any
actions were performed on it.
In case of read_barrier it does not matter and we just retry. In case
of add_entry and modify_config we cannot retry
because the rpc calls are not idempotent, so we convert this
exception to commit_status_unknown, which the client has to handle.
Explicit comments have also been added to raft::server methods
describing all possible exceptions.
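A small sketch of the conversion described above; the exception types here are simplified stand-ins for the raft:: ones, and forward_to_leader is an invented helper name:

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

// A transport_error means we don't know whether the request reached the
// leader; since add_entry/modify_config are not idempotent we cannot simply
// retry, so translate it into commit_status_unknown for the client to handle.
struct transport_error : std::runtime_error {
    using std::runtime_error::runtime_error;
};
struct commit_status_unknown : std::runtime_error {
    using std::runtime_error::runtime_error;
};

template <typename Rpc>
void forward_to_leader(Rpc do_rpc) {
    try {
        do_rpc();
    } catch (const transport_error& e) {
        throw commit_status_unknown(
            std::string("connection broke mid-call: ") + e.what());
    }
}
```

For read_barrier, by contrast, the caller can simply catch transport_error and retry, since the barrier is idempotent.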
Yet another user of the global qctx object. Making the method(s) non-static requires pushing the system_keyspace all the way down to size_estimate_virtual_reader and a small update of the cql_test_env.
Closes#11738
* github.com:scylladb/scylladb:
system_keyspace: Make get_{local|saved}_tokens non static
size_estimates_virtual_reader: Pass sys_ks argument to get_local_ranges()
cql_test_env: Keep sharded<system_keyspace> reference
size_estimate_virtual_reader: Keep system_keyspace reference
system_keyspace: Pass sys_ks argument to install_virtual_readers()
system_keyspace: Make make() non-static
distributed_loader: Pass sys_ks argument to init_system_keyspace()
system_keyspace: Remove dangling forward declaration
do_with() is a sure indicator for coroutinization, since it adds
an allocation (like the coroutine does with its frame). Therefore
translating a function with do_with is at least a break-even, and
usually a win since other continuations no longer allocate.
This series converts most of storage_proxy's function that have
do_with to coroutines. Two remain, since they are not simple
to convert (the do_with() is kept running in the background and
its future is discarded).
Individual patches favor minimal changes over final readability,
and there is a final patch that restores indentation.
The patches leave some moves from coroutine reference parameters
to the coroutine frame, this will be cleaned up in a follow-up. I wanted
this series not to touch headers to reduce rebuild times.
Closes#11683
* github.com:scylladb/scylladb:
storage_proxy: reindent after coroutinization
storage_proxy: convert handle_read_digest() to a coroutine
storage_proxy: convert handle_read_mutation_data() to a coroutine
storage_proxy: convert handle_read_data() to a coroutine
storage_proxy: convert handle_write() to a coroutine
storage_proxy: convert handle_counter_mutation() to a coroutine
storage_proxy: convert query_nonsingular_mutations_locally() to a coroutine
In August 2022, DynamoDB added a "S3 Import" feature, which we don't yet
support - so let's document this missing feature in the compatibility
document.
Refs #11739.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11740
This series adds support for detecting collections that have too many items
and recording them in `system.large_cells`.
A configuration variable was added to db/config: `compaction_collection_items_count_warning_threshold` set by default to 10000.
Collections that have more items than this threshold will be warned about and will be recorded as a large cell in the `system.large_cells` table. Documentation has been updated respectively.
A new column was added to system.large_cells: `collection_items`.
Similar to the `rows` column in system.large_partitions, `collection_items` holds the number of items in a collection when the large cell is a collection, or 0 if it isn't. Note that the collection may be recorded in system.large_cells either due to its size, like any other cell, and/or due to the number of items in it, if it crosses the said threshold.
Note that #11449 called for a new system.large_collections table, but extending system.large_cells follows the logic of system.large_partitions and is a smaller change overall, hence it was preferred.
Since the system keyspace schema is hard coded, the schema version of system.large_cells was bumped, and since the change is not backward compatible, we added a cluster feature - `LARGE_COLLECTION_DETECTION` - to enable using it.
The large_data_handler large cell detection record function will populate the new column only when the new cluster feature is enabled.
In addition, unit tests were added in sstable_3_x_test for testing large cells detection by cell size, and large_collection detection by the number of items.
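A minimal standalone sketch of the detection rule described above (the names and signature are illustrative, not the actual large_data_handler API): a cell is recorded if its size crosses the cell threshold and/or, for collections, its item count crosses the configured count threshold.

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// Illustrative model: what would end up as a row in system.large_cells.
struct large_cell_entry {
    uint64_t cell_size;
    uint64_t collection_items; // 0 when the cell is not a collection
};

std::optional<large_cell_entry> maybe_record_large_cell(
        uint64_t cell_size, std::optional<uint64_t> collection_items,
        uint64_t size_threshold, uint64_t items_threshold) {
    bool too_big = cell_size > size_threshold;
    bool too_many = collection_items && *collection_items > items_threshold;
    if (!too_big && !too_many) {
        return std::nullopt;
    }
    return large_cell_entry{cell_size, collection_items.value_or(0)};
}
```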
Closes#11449
Closes#11674
* github.com:scylladb/scylladb:
sstables: mx/writer: optimize large data stats members order
sstables: mx/writer: keep large data stats entry as members
db: large_data_handler: dynamically update config thresholds
utils/updateable_value: add transforming_value_updater
db/large_data_handler: cql_table_large_data_handler: record large_collections
db/large_data_handler: pass ref to feature_service to cql_table_large_data_handler
db/large_data_handler: cql_table_large_data_handler: move ctor out of line
docs: large-rows-large-cells-tables: fix typos
db/system_keyspace: add collection_elements column to system.large_cells
gms/feature_service: add large_collection_detection cluster feature
test: sstable_3_x_test: add test_sstable_too_many_collection_elements
test: lib: simple_schema: add support for optional collection column
test: lib: simple_schema: build schema in ctor body
test: lib: simple_schema: cql: define s1 as static only if built this way
db/large_data_handler: maybe_record_large_cells: consider collection_elements
db/large_data_handler: debug cql_table_large_data_handler::delete_large_data_entries
sstables: mx/writer: pass collection_elements to writer::maybe_record_large_cells
sstables: mx/writer: add large_data_type::elements_in_collection
db/large_data_handler: get the collection_elements_count_threshold
db/config: add compaction_collection_elements_count_warning_threshold
test: sstable_3_x_test: add test_sstable_write_large_cell
test: sstable_3_x_test: pass cell_threshold_bytes to large_data_handler
test: sstable_3_x_test: large_data_handler: prepare callback for testing large_cells
test: sstable_3_x_test: large_data tests: use BOOST_REQUIRE_[GL]T
test: sstable_3_x_test: test_sstable_log_too_many_rows: use tests::random
What's contained in this series:
- Refactored compaction tests (and utilities) for integration with multiple groups
- The idea is to write a new class of tests that will stress multiple groups, whereas the existing ones will still stress a single group.
- Fixed a problem when cloning compound sstable set (cannot be triggered today so I didn't open a GH issue)
- Many changes in replica::table for allowing integration with multiple groups
Next:
- Introduce for_each_compaction_group() for iterating over groups wherever needed.
- Use for_each_compaction_group() in replica::table operations spanning all groups (API, readers, etc).
- Decouple backlog tracker from compaction strategy, to allow for backlog isolation across groups
- Introduce static option for defining number of compaction groups and implement function to map a token to its respective group.
- Testing infrastructure for multiple compaction groups (helpful when testing the dynamic behavior: i.e. merging / splitting).
Closes#11592
* github.com:scylladb/scylladb:
sstable_resharding_test: Switch to table_for_tests
replica: Move compacted_undeleted_sstables into compaction group
replica: Use correct compaction_group in try_flush_memtable_to_sstable()
replica: Make move_sstables_from_staging() robust and compaction group friendly
test: Rename column_family_for_tests to table_for_tests
sstable_compaction_test: Use column_family_for_tests::as_table_state() instead
test: Don't expose compound set in column_family_for_tests
test: Implement column_family_for_tests::table_state::is_auto_compaction_disabled_by_user()
sstable_compaction_test: Merge table_state_for_test into column_family_for_tests
sstable_compaction_test: use table_state_for_test itself in fully_expired_sstables()
sstable_compaction_test: Switch to table_state in compact_sstables()
sstable_compaction_test: Reduce boilerplate by switching to column_family_for_tests
Instead of `test.py.log`, use:
`test.py.dev.log`
when running with `--mode dev`,
`test.py.dev-release.log`
when running with `--mode dev --mode release`,
and so on.
This is useful in Jenkins, which runs test.py multiple times in
different modes; a later run would overwrite the previous run's test.py
log file. With this change we can preserve the log files of all of these
runs.
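The naming rule can be sketched as follows (test.py itself is Python; this standalone model just illustrates the scheme of joining the selected modes with '-'):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Build the per-mode log file name: "test.py." + modes joined by '-' + ".log".
std::string testpy_log_name(const std::vector<std::string>& modes) {
    std::string joined;
    for (const auto& m : modes) {
        if (!joined.empty()) {
            joined += "-";
        }
        joined += m;
    }
    return "test.py." + joined + ".log";
}
```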
Closes#11678
The test was added recently and since then causes CI failures.
We suspect that it happens if the node being removed was the Raft group
0 leader. The removenode coordinator tries to send to it the
`remove_from_group0` request and fails.
A potential fix is in review: #11691.
Now all callers have system_keyspace reference at hand. This removes one
more user of the global qctx object
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This method statically calls system_keyspace::get_local_tokens(). Having a
system_keyspace reference will make this method non-static
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a test_get_local_ranges() call in size-estimate reader which
will need system keyspace reference. There's no other place for tests to
get it from but the cql_test_env thing
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The s._e._v._reader::fill_buffer() method needs system keyspace to get
node's local tokens. Now it's a static method, having system_keyspace
reference will make it non-static
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The size-estimate-virtual-reader will need it; now it's available as
"this" from the system_keyspace::make() method
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This helper needs a system_keyspace reference, and using "this" for it
looks natural. This de-static-ification also makes it possible to put
some sense into the invoke_on_all() call from init_system_keyspace()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Its final destination is the virtual tables registration code, eventually
called from init_system_keyspace()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It doesn't match the real system_keyspace::make() definition and is in
fact not needed, as there's another "real" one in database.hh
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Fixes a regression introduced in 80917a1054:
"scylla_prepare: stop generating 'mode' value in perftune.yaml"
When cpuset.conf contains a "full" CPU set, negating it against the
"full" CPU set generates a zero mask as the irq_cpu_mask.
This is an illegal value that will eventually end up in the generated
perftune.yaml, which in turn will make the scylla service fail to start
until the issue is resolved.
In such a case the irq_cpu_mask must represent a "full" CPU set,
mimicking the former 'MQ' mode.
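With CPU sets modeled as bitmasks, the fix described above amounts to the following sketch (the function name is illustrative; the real logic lives in the perftune/scylla_prepare scripts):

```cpp
#include <cassert>
#include <cstdint>

// irq_cpu_mask is normally the negation of the compute CPU set within the
// full set. When cpuset.conf covers all CPUs the negation is zero, which
// is illegal -- fall back to the full mask, mimicking the former 'MQ' mode.
uint64_t compute_irq_cpu_mask(uint64_t cpuset_mask, uint64_t full_mask) {
    uint64_t irq_mask = full_mask & ~cpuset_mask;
    if (irq_mask == 0) {
        irq_mask = full_mask;
    }
    return irq_mask;
}
```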
Fixes#11701
Tested:
- Manually on a 2 vCPU VM in an 'auto-selection' mode.
- Manually on a large VM (48 vCPUs) with an 'MQ' manually
enforced.
Message-Id: <20221004004237.2961246-1-vladz@scylladb.com>
Currently doing `CONTAINS NULL` or `CONTAINS KEY NULL` on a collection evaluates to `true`.
This is a really weird behaviour. Collections can't contain `NULL`, even if they wanted to.
Any operation that has a NULL on either side should evaluate to `NULL`, which is interpreted as `false`.
In Cassandra trying to do `CONTAINS NULL` causes an error.
Fixes: #10359
The only problem is that this change is not backwards compatible. Some existing code might break.
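The NULL-propagation rule above can be modeled in standalone C++ with `std::optional<bool>` standing in for CQL's three-valued logic (this is an illustrative model, not the cql3 evaluation code):

```cpp
#include <cassert>
#include <optional>
#include <set>

// CONTAINS with a NULL right-hand side evaluates to NULL (nullopt).
std::optional<bool> contains(const std::set<int>& collection,
                             std::optional<int> value) {
    if (!value) {
        return std::nullopt; // NULL operand -> NULL result
    }
    return collection.count(*value) > 0;
}

// The WHERE clause interprets a NULL result as false.
bool where_clause_accepts(std::optional<bool> v) {
    return v.value_or(false);
}
```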
Closes#11730
* github.com:scylladb/scylladb:
cql3: Make CONTAINS KEY NULL return false
cql3: Make CONTAINS NULL return false
The removenode_abort logic that follows the warning
may throw, in which case information about
the original exception was lost.
Fixes: #11722
Closes#11735
To keep the idl definition of plan_id from
getting out of sync with the one in stream_fwd.hh.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11720
The raft_group0 code needs system_keyspace and now it gets one from gossiper. This gossiper->system_keyspace dependency is in fact artificial: gossiper doesn't need the system keyspace; it's there only to let raft and snitch call gossiper.get_system_keyspace().
This makes raft use system ks directly, snitch is patched by another branch
Closes#11729
* github.com:scylladb/scylladb:
raft_group0: Use local reference
raft_group0: Add system keyspace reference
Commit aba475fe1d accidentally fixed a race, which happens in
the following sequence of events:
1) storage service starts drain() via API for example
2) main's abort source is triggered, calling compaction_manager's do_stop()
via subscription.
2.1) do_stop() initiates the stop but doesn't wait for it.
2.2) compaction_manager's state is set to stopped, such that
compaction_manager::stop() called in defer_verbose_shutdown()
will wait for the stop and not start a new one.
3) drain() calls compaction_manager::drain() changing the state from
stopped to disabled.
4) main calls compaction_manager::stop() (as described in 2.2) and
incorrectly tries to stop the manager again, because the state was
changed in step 3.
aba475fe1d accidentally fixed this problem because drain() will no
longer take place if it detects the shutdown process was initiated
(it does so by ignoring drain request if abort source's subscription
was unlinked).
This shows us that looking at the state to determine if stop should be
performed is fragile, because once the state changes from A to B,
manager doesn't know the state was A. To make it robust, we can instead
check if the future that stores stop's promise is engaged, meaning that
the stop was already initiated and we don't have to start a new one.
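The idea above can be sketched in isolation (the class and members are illustrative, not the real compaction_manager): instead of inspecting a state enum that later transitions (e.g. stopped -> disabled on drain), record whether stop was ever initiated and make a second stop() a no-op.

```cpp
#include <cassert>

class manager {
    bool _stop_initiated = false; // stands in for "stop future engaged"
    int _stops_performed = 0;
public:
    void stop() {
        if (_stop_initiated) {
            return; // stop already in flight; just wait for it
        }
        _stop_initiated = true;
        ++_stops_performed;
    }
    // May change user-visible state, but never resets _stop_initiated.
    void drain() {}
    int stops_performed() const { return _stops_performed; }
};
```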
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11711
This series undoes some recent damage to clarity, then
goes further by renaming terms around dirty_memory_manager
to be clearer. Documentation is added.
Closes#11705
* github.com:scylladb/scylladb:
dirty_memory_manager: re-term "virtual dirty" to "unspooled dirty"
dirty_memory_manager: rename _virtual_region_group
api: column_family: fix memtable off-heap memory reporting
dirty_memory_manager: unscramble terminology
This is the continuation of the a980510654 that tries to catch ENOSPCs reported via storage_io_error similarly to how defer_verbose_shutdown() does on stop
Closes#11664
* github.com:scylladb/scylladb:
table: Handle storage_io_error's ENOSPC when flushing
table: Rewrap retry loop
Compacted undeleted sstables are relevant for avoiding data resurrection
in the purge path. As token ranges of groups won't overlap, it's
better to isolate this data, so to prevent one group from interfering
with another.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
We need to pass the compaction_group received as a param, not the one
retrieved via as_table_state(). Needed for supporting multiple
groups.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Off-strategy can happen in parallel to view building.
A semaphore is used to ensure they don't step on each other's
toe.
If off-strategy completes first, then move_sstables_from_staging()
won't find the SSTable alive and won't reach code to add
the file to the backlog tracker.
If view building completes first, the SSTable exists, but it's
not reshaped yet (has repair origin) and shouldn't be
added to the backlog tracker.
Off-strategy completion code will make sure new sstables added
to the main set are accounted for by the backlog tracker, so
move_sstables_from_staging() only needs to add to the tracker files
which are certainly not going through a reshape compaction.
So let's take these facts into account to make the procedure
more robust and compaction group friendly. Very welcome change
for when multiple groups are supported.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
That's important for multiple compaction groups. Once replica::table
supports multiple groups, there will be no table::as_table_state(),
so for testing table with a single group, we'll be relying on
column_family_for_tests::as_table_state().
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The compound set shouldn't be exposed in main_sstables() because,
once we complete the switch to column_family_for_tests::table_state,
compaction may try to remove or add elements to its set snapshot,
and the compound set doesn't allow either operation.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Needed once we switch to column_family_for_tests::table_state, so unit
tests relying on correct value will still work
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This change will make table_state_for_test the table_state of
column_family_for_tests. Today, a unit test has to keep a reference
to them both and logically couple them, but that's error prone.
This change is also important when replica::table supports multiple
compaction groups, so unit tests won't have to directly reference
the table_state of table, but rather use the one managed by
column_family_for_tests.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The switch is important once we have multiple compaction groups,
as a single table may own several groups. There will no longer be
a replica::table::as_table_state().
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Lots of boilerplate is reduced, and will also help to complete the
switch from replica::table to compaction::table_state in the unit
tests.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
A binary operator like this:
{1: 2, 3: 4} CONTAINS KEY NULL
used to evaluate to `true`.
This is wrong, any operation involving null
on either side of the operator should evaluate
to NULL, which is interpreted as false.
This change is not backwards compatible.
Some existing code might break.
partially fixes: #10359
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
A binary operator like this:
[1, 2, 3] CONTAINS NULL
used to evaluate to `true`.
This is wrong, any operation involving null
on either side of the operator should evaluate
to NULL, which is interpreted as false.
This change is not backwards compatible.
Some existing code might break.
partially fixes: #10359
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
It now grabs one from gossiper which is weird. A bit later it will be
possible to remove gossiper->system_keyspace dependency
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
All calls in the try block have been noexcept for some time.
Remove the try...catch and the associated misleading comment to avoid confusing
source code readers.
Closes#11715
Since `_partition_size_entry` and `_rows_in_partition_entry`
are accessed at the same time when updated, and similarly
`_cell_size_entry` and `_elements_in_collection_entry`,
place the member pairs closely together to improve data
cache locality.
Follow the same order when preparing the
`scylla_metadata::large_data_stats` map.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
To save the map lookup on the hot write path,
keep each large data stats entry as a member in the writer
object and build a map for storing the disk_hash in the
scylla metadata only when finalizing it in consume_end_of_stream.
Fixes#11686
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Make the various large data thresholds live-updateable
and construct the observers and updaters in
cql_table_large_data_handler to dynamically update
the base large_data_handler class threshold members.
Fixes#11685
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Automatically updates a value from a utils::updateable_value,
where the two can be of different types.
An optional transform function can apply an additional transformation
when updating the value, for example multiplying it by a factor for
unit conversion.
To be used for auto-updating the large data thresholds
from the db::config.
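A minimal sketch of the idea (the class and names here are illustrative, not the real utils::updateable_value API): an updater keeps a target of one type in sync with a source value of another type through a transform function.

```cpp
#include <cassert>
#include <functional>
#include <utility>

template <typename From, typename To>
class transforming_updater {
    To& _target;
    std::function<To(const From&)> _transform;
public:
    transforming_updater(To& target, std::function<To(const From&)> t)
        : _target(target), _transform(std::move(t)) {}
    // Called whenever the observed source value changes.
    void on_update(const From& v) { _target = _transform(v); }
};
```

For example, a threshold configured in megabytes can be kept in sync with a byte-valued member by transforming with a factor of 1024*1024.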
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This fixes a regression introduced by 1e7a444, where table::get_sstable_set() isn't exposing all sstables, but rather only the ones in the main set. That causes users of the interface, such as get_sstables_by_partition_key() (used by the API to return the list of sstable names containing a particular key), to miss files in the maintenance set.
Fixes https://github.com/scylladb/scylladb/issues/11681.
Closes#11682
* github.com:scylladb/scylladb:
replica: Return all sstables in table::get_sstable_set()
sstables: Fix cloning of compound_sstable_set
get_sstable_set() as its name implies is not confined to the main
or maintenance set, nor to a specific compaction group, so let's
make it return the compound set which spans all groups, meaning
all sstables tracked by a table will be returned.
This is a regression introduced in 1e7a444. It affects the API
to return sstable list containing a partition key, as sstables
in maintenance would be missed, fooling users of the API like
tools that could trust the output.
Each compaction group is returning the main and maintenance set
in table_state's main_sstable_set() and maintenance_sstable_set(),
respectively.
Fixes#11681.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The intention was that its clone() would copy the content
of an existing set into a new one, but the current implementation
moves the sets instead of copying them, so the original set
becomes invalid. Luckily, this problem isn't triggered as we're
not exposing the compound set in the table's interface, so the
compound_sstable_set::clone() method isn't being called.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The "virtual dirty" term is not very informative. "Virtual" means
"not real", but it doesn't say in which way it isn't real.
In this case, virtual dirty refers to real dirty memory, minus
the portion of memtables that has been written to disk (but not
yet sealed - in that case it would not be dirty in the first
place).
I chose to call "the portion of memtables that has been written
to disk" as "spooled memory". At least the unique term will cause
people to look it up and may be easier to remember. From that
we have "unspooled memory".
I plan to further change the accounting to account for spooled memory
rather than unspooled, as that is a more natural term, but that is left
for later.
The documentation, config item, and metrics are adjusted. The config
item is practically unused so it isn't worth keeping compatibility here.
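The arithmetic behind the renamed terms, under the definitions above: "spooled" is the portion of dirty memtable memory already written to disk, and "unspooled" dirty (formerly "virtual dirty") is what remains.

```cpp
#include <cassert>
#include <cstdint>

// unspooled dirty = total dirty - the part already written to disk.
uint64_t unspooled_dirty(uint64_t total_dirty, uint64_t spooled) {
    return total_dirty - spooled;
}
```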
We report virtual memory used, but that's not a real accounting
of the actual memory used. Use the correct real_memory_used() instead.
Note that this isn't a recent regression and was probably broken forever.
However nobody looks at this measure (and it's usually close to the
correct value) so nobody noticed.
Since it's so minor, I didn't bother filing an issue.
Before 95f31f37c1 ("Merge 'dirty_memory_manager: simplify
region_group' from Avi Kivity"), we had two region_group
objects, one _real_region_group and another _virtual_region_group,
each with a set of "soft" and "hard" limits and related functions
and members.
In 95f31f37c1, we merged _real_region_group into _virtual_region_group,
but unfortunately the _real_region_group members received the "hard"
prefix when they got merged. This overloads the meaning of "hard" -
is it related to soft/hard limit or is it related to the real/virtual
distinction?
This patch applies some renaming to restore consistency. Anything
that came from _virtual_region_group now has "virtual" in its name.
Anything that came from _real_region_group now has "real" in its name.
The terms are still pretty bad but at least they are consistent.
- Separate `aiohttp` client code
- Helper to access Scylla server REST API
- Use helper both in `ScyllaClusterManager` (test.py process) and `ManagerClient` (pytest process)
- Add `removenode` and `decommission` operations.
Closes#11653
* github.com:scylladb/scylladb:
test.py: Scylla REST methods for topology tests
test.py: rename server_id to server_ip
test.py: HTTP client helper
test.py: topology pass ManagerClient instead of...
test.py: delete unimplemented remove server
test.py: fix variable name ssl name clash
This class exists for one purpose only: to serve as glue code between
dht::ring_position and boost::icl::interval_map. The latter requires
that keys in its intervals are:
* default constructible
* copyable
* have standalone compare operations
For this reason we have to wrap `dht::ring_position` in a class,
together with a schema to provide all this. This is
`compatible_ring_position`. There is one further requirement by code
using the interval map: it wants to do lookups without copying the
lookup key(s). To solve this, we came up with
`compatible_ring_position_or_view` which is a union of a key or a key
view + schema. As we recently found out, boost::icl copies its keys **a
lot**. It seems to assume these keys are cheap to copy and carelessly
copies them around even when iterating over the map. But
`compatible_ring_position_or_view` is not cheap to copy as it copies a
`dht::ring_position` which allocates, and it does that via an
`std::optional` and `std::variant` to add insult to injury.
This patch makes said class cheap to copy, by getting rid of the variant
and storing the `dht::ring_position` via a shared pointer. The view is
stored separately and either points to the ring position stored in the
shared pointer or to an outside ring position (for lookups).
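The layout described above can be sketched in standalone C++, with `std::string` standing in for `dht::ring_position` (the class name is illustrative): the owned value sits behind a shared_ptr so copies are cheap, and a raw view points either into the shared value or at an external object for copy-free lookups.

```cpp
#include <cassert>
#include <memory>
#include <string>

class cheap_position {
    std::shared_ptr<const std::string> _owned; // null for view-only lookups
    const std::string* _view;
public:
    // Owning constructor: allocates once, then copies are just refcounts.
    explicit cheap_position(std::string pos)
        : _owned(std::make_shared<const std::string>(std::move(pos)))
        , _view(_owned.get()) {}
    // View-only constructor: no allocation; caller keeps `pos` alive.
    explicit cheap_position(const std::string* pos) : _view(pos) {}
    const std::string& view() const { return *_view; }
};
```

Copying an owning instance copies a shared_ptr and a pointer, which is what makes boost::icl's frequent key copies tolerable.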
Fixes: #11669
Closes#11670
The decision to reject a read operation can either be made by replicas,
or by the coordinator. In the second case, the
scylla_storage_proxy_coordinator_read_rate_limited
metric was not incremented, but it should have been. This commit fixes the issue.
Fixes: #11651
Closes#11694
The seastar defer_stop() helper is cool, but it forwards any exception from the .stop() towards the caller. In case the caller is main() the exception causes Scylla to abort(). This fires, for example, in compaction_manager::stop() when it steps on ENOSPC
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11662
* github.com:scylladb/scylladb:
compaction_manager: Swallow ENOSPCs in ::stop()
exceptions: Mark storage_io_error::code() with noexcept
The test `test_metrics.py::test_ttl_stats` tests the metrics associated
with Alternator TTL expiration events. It normally finishes in less than a
second (the TTL scanning is configured to run every 0.5 seconds), so we
arbitrarily set a 60 second timeout for this test to allow for extremely
slow test machines. But in some extreme cases even this was not enough -
in one case we measured the TTL scan to take 63 seconds.
So in this patch we increase the timeout in this test from 60 seconds
to 120 seconds. We already did the same change in other Alternator TTL
tests in the past - in commit 746c4bd.
Fixes#11695
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11696
When the large_collection_detection cluster feature is enabled,
select the internal_record_large_cells_and_collections method
to record the large collection cell, storing also the collection_elements
column.
We want to do that only when the cluster feature is enabled
to facilitate rollback in case rolling upgrade is aborted,
otherwise system.large_cells won't be backward compatible
and will have to be deleted manually.
Delete the sstable entry from system.large_cells if it records
elements_in_collection above the threshold.
Closes#11449
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
For recording collection_elements of large_collections when
the large_collection_detection feature is enabled.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And bump the schema version offset since the new schema
should be distinguishable from the previous one.
Refs scylladb/scylladb#11660
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And a corresponding db::schema_feature::SCYLLA_LARGE_COLLECTIONS
We want to enable the schema change supporting collection_elements
only when all nodes are upgraded so that we can roll back
if the rolling upgrade process is aborted.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Keep the with_static ctor parameter as a private member
to be used by the cql() method to define s1 either as static or not.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Detect large_collections when the number of collection_elements
is above the configured threshold.
Next step would be to record the number of collection_elements
in the system.large_cells table, when the respective
cluster feature is enabled.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And update the sstable elements_in_collection
stats entry.
Next step would be to forward it to
large_data_handler().maybe_record_large_cells().
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Add a new large_data_stats type and entry for keeping
the collection_elements_count_threshold and the maximum value
of collection_elements.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
`set_group0_upgrade_state` writes the on-disk state first, then
in-memory state second, both under a write lock.
`get_group0_upgrade_state` would only take the lock if the in-memory
state was `use_pre_raft_procedures`.
If there's an external observer who watches the on-disk state to decide
whether Raft upgrade finished yet, the following could happen:
1. The node wrote `use_post_raft_procedures` to disk but didn't update
the in-memory state yet, which is still `synchronize`.
2. The external client reads the table and sees that the state is
`use_post_raft_procedures`, and deduces that upgrade has finished.
3. The external client immediately tries to perform a schema change. The
schema change code calls `get_group0_upgrade_state` which does not
take the read lock and returns `synchronize`. The schema change gets
denied because schema changes are not allowed in `synchronize`.
Make sure that `get_group0_upgrade_state` cannot execute in-between
writing to disk and updating the in-memory state by always taking the
read lock before reading the in-memory state. As it was before, it will
immediately drop the lock if the state is not `use_pre_raft_procedures`.
This is useful for upgrade tests, which read the on-disk state to decide
whether upgrade has finished and often try to perform a schema change
immediately afterwards.
Closes#11672
Provide a helper client for Scylla REST requests. Use it on both
ScyllaClusterManager (e.g. remove node, test.py process) and
ManagerClient (e.g. get uuid, pytest process).
For now keep using IPs as key in ScyllaCluster, but this will be changed
to UUID -> IP in the future. So, for now, pass both independently. Note
the UUID must be obtained from the server before stopping it.
Refresh client driver connection when decommissioning or removing
a node.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
In ScyllaCluster, servers are currently tracked by the host IP. This is
not the host id (UUID). Fix the variable name accordingly
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Split aiohttp client to a shared helper file.
While there, move aiohttp session setup back to constructors. When there
were teardown issues it looked like they could be caused by the aiohttp
session being created outside a coroutine, but this was proven not to be
the case after recent fixes. So move it back to the ManagerClient
constructor. On the other hand, create a close() coroutine to stop the
aiohttp session.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
cql connection
When there are topology changes, the driver needs to be updated. Instead
of passing the CassandraCluster.Connection, pass the ManagerClient
instance which manages the driver connection inside of it.
Remove workaround for test_raft_upgrade.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
The do_with() makes it at least a break-even, but there's some allocating
continuations that make it a win.
A variable named cmd had two different definitions (a value and a
lw_shared_ptr) that lived in different scopes. I renamed one to cmd1
to disambiguate. We should probably move that to the caller, but that
is not done here.
The do_with() makes it at least a break-even, but there's some allocating
continuations that make it a win.
A variable named cmd had two different definitions (a value and a
lw_shared_ptr) that lived in different scopes. I renamed one to cmd1
to disambiguate. We should probably move that to the caller, but that
is not done here.
The do_with() makes it at least a break-even, but there's some allocating
continuations that make it a win.
A variable named cmd had two different definitions (a value and a
lw_shared_ptr) that lived in different scopes. I renamed one to cmd1
to disambiguate. We should probably move that to the caller, but that
is not done here.
A do_with() makes this at least a break-even.
Some internal lambdas were not converted since they commonly
do not allocate or block.
A finally() continuation is converted to seastar::defer().
The do_with means the coroutine conversion is free, and conversion
of parallel_for_each to coroutine::parallel_for_each saves a
possible allocation (though it would usually not have been allocated).
An inner continuation is not converted since it usually doesn't
block, and therefore doesn't allocate.
When being stopped compaction manager may step on ENOSPC. This is not a
reason to fail stopping process with abort, better to warn this fact in
logs and proceed as if nothing happened
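The behavior can be sketched in standalone C++ (the function is illustrative; the real code catches Scylla's storage_io_error inside compaction_manager::stop()): an ENOSPC during stop is logged and swallowed rather than allowed to propagate to main(), where it would abort the process.

```cpp
#include <cassert>
#include <cerrno>
#include <system_error>

bool stop_swallowing_enospc(bool hit_enospc, int& warnings) {
    try {
        if (hit_enospc) {
            throw std::system_error(ENOSPC, std::generic_category());
        }
        return true; // clean stop
    } catch (const std::system_error& e) {
        if (e.code().value() == ENOSPC) {
            ++warnings; // warn and proceed as if nothing happened
            return true;
        }
        throw; // anything else is still fatal
    }
}
```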
refs: #11245
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It was passed to `raft_group_registry::direct_fd_proxy` by value. That
is a bug: we want to pass a reference to the instance that is living
inside `gossiper`.
Fortunately this bug didn't cause problems, because the pinger is only
used for one function, `get_address`, which looks up an address in a map
and if it doesn't find it, accesses the map that lives inside
`gossiper` on shard 0 (and then caches it in the local copy).
Explicitly delete the copy constructor of `direct_fd_pinger` so this
doesn't happen again.
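The guard described above is a C++ `= delete` on the copy constructor. As a rough illustration of the same idea in Python (the `DirectFdPinger` class and its `gossiper` field here are hypothetical stand-ins, not the real types):

```python
import copy

class DirectFdPinger:
    """Illustrative stand-in: mimics '= delete' on a C++ copy constructor.

    The pinger must keep referring to state that lives inside the
    gossiper; silently copying it by value would reintroduce the bug."""

    def __init__(self, gossiper):
        self.gossiper = gossiper  # shared reference, never a private copy

    def __copy__(self):
        raise TypeError("direct_fd_pinger is non-copyable; pass it by reference")
```

With the copy hook deleted, an accidental pass-by-value fails loudly at the call site instead of quietly duplicating state.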
Closes#11661
The low-level `mutation_fragment_stream_validator` gets `reset()` methods that until now only the high-level `mutation_fragment_stream_validating_filter` had.
Active tombstone validation is pushed down to the low level validator.
The low level validator, which was a pain to use until now due to being very fussy on which subset of its API one used, is made much more robust, not requiring the user to stick to a subset of its API anymore.
Closes#11614
* github.com:scylladb/scylladb:
mutation_fragment_stream_validator: make interface more robust
mutation_fragment_stream_validator: add reset() to validating filter
mutation_fragment_stream_validator: move active tombstone validation into low level validator
region_group evolved as a tree, each node of which contains some
regions (memtables). Each node has some constraints on memory, and
can start flushing and/or stop allocation into its memtables and those
below it when those constraints are violated.
Today, the tree has exactly two nodes, only one of which can hold memtables.
However, all the complexity of the tree remains.
This series applies some mechanical code transformations that remove
the tree structure and all the excess functionality, leaving a much simpler
structure behind.
Before:
- a tree of region_group objects
- each with two parameters: soft limit and hard limit
- but only two instances ever instantiated
After:
- a single region_group object
- with three parameters - two from the bottom instance, one from the top instance
Closes#11570
* github.com:scylladb/scylladb:
dirty_memory_manager: move third memory threshold parameter of region_group constructor to reclaim_config
dirty_memory_manager: simplify region_group::update()
dirty_memory_manager: fold region_group::notify_hard_pressure_relieved into its callers
dirty_memory_manager: clean up region_group::do_update_hard_and_check_relief()
dirty_memory_manager: make do_update_hard_and_check_relief() a member of region_group
dirty_memory_manager: remove accessors around region_group::_under_hard_pressure
dirty_memory_manager: merge memory_hard_limit into region_group
dirty_memory_manager: rename members in memory_hard_limit
dirty_memory_manager: fold do_update() into region_group::update()
dirty_memory_manager: simplify memory_hard_limit's do_update
dirty_memory_manager: drop soft limit / soft pressure members in memory_hard_limit
dirty_memory_manager: de-template do_update(region_group_or_memory_hard_limit)
dirty_memory_manager: adjust soft_limit threshold check
dirty_memory_manager: drop memory_hard_limit::_name
dirty_memory_manager: simplify memory_hard_limit configuration
dirty_memory_manager: fold region_group_reclaimer into {memory_hard_limit,region_group}
dirty_memory_manager: stop inheriting from region_group_reclaimer
dirty_memory_manager: test: unwrap region_group_reclaimer
dirty_memory_manager: change region_group_reclaimer configuration to a struct
dirty_memory_manager: convert region_group_reclaimer to callbacks
dirty_memory_manager: consolidate region_group_reclaimer constructors
dirty_memory_manager: rename {memory_hard_limit,region_group}::notify_relief
dirty_memory_manager: drop unused parameter to memory_hard_limit constructor
dirty_memory_manager: drop memory_hard_limit::shutdown()
dirty_memory_manager: split region_group hierarchy into separate classes
dirty_memory_manager: extract code block from region_group::update
dirty_memory_manager: move more allocation_queue functions out of region_group
dirty_memory_manager: move some allocation queue related function definitions outside class scope
dirty_memory_manager: move region_group::allocating_function and related classes to new class allocation_queue
dirty_memory_manager: remove support for multiple subgroups
The view builder builds the views from a given base table in
view_builder::batch_size batches of rows. After processing this many
rows, it suspends so the view builder can switch to building views for
other base tables in the name of fairness. When resuming the build step
for a given base table, it reuses the reader used previously (also
serving the role of a snapshot, pinning sstables read from). The
compactor however is created anew. As the reader can be in the middle of
a partition, the view builder injects a partition start into the
compactor to prime it for continuing the partition. This however only
included the partition-key, crucially missing any active tombstones:
partition tombstone or -- since the v2 transition -- active range
tombstone. This can result in base rows covered by either of these being
resurrected, and in the view builder generating view updates for them.
This patch solves this by using the detach-state mechanism of the
compactor which was explicitly developed for situations like this (in
the range scan code) -- resuming a read with the readers kept but the
compactor recreated.
Also included are two test cases reproducing the problem, one with a
range tombstone, the other with a partition tombstone.
Fixes: #11668
Closes#11671
* abseil 9e408e05...7f3c0d78 (193):
> Allows absl::StrCat to accept types that implement AbslStringify()
> Merge pull request #1283 from pateldeev:any_inovcable_rename_true
> Cleanup: SmallMemmove nullify should also be limited to 15 bytes
> Cleanup: implement PrependArray and PrependPrecise in terms of InlineData
> Cleanup: Move BitwiseCompare() to InlineData, and make it layout independent.
> Change kPower10Table bounds to be half-open
> Cleanup some InlineData internal layout specific details from cord.h
> Improve the comments on the implementation of format hooks adl tricks.
> Expand LogEntry method docs.
> Documentation: Remove an obsolete note about the implementation of `Cord`.
> `absl::base_internal::ReadLongFromFile` should use `O_CLOEXEC` and handle interrupts to `read`
> Allows absl::StrFormat to accept types which implement AbslStringify()
> Add common_policy_traits - a subset of hash_policy_traits that can be shared between raw_hash_set and btree.
> Split configuration related to cycle clock into separate headers
> Fix -Wimplicit-int-conversion and -Wsign-conversion warnings in btree.
> Implement Eisel-Lemire for from_chars<float>
> Import of CCTZ from GitHub.
> Adds support for "%v" in absl::StrFormat and related functions for bool values. Note that %v prints bool values as "true" and "false" rather than "1" and "0".
> De-pointerize LogStreamer::stream_, and fix move ctor/assign preservation of flags and other stream properties.
> Explicitly disallows modifiers for use with %v.
> Change the macro ABSL_IS_TRIVIALLY_RELOCATABLE into a type trait - absl::is_trivially_relocatable - and move it from optimization.h to type_traits.h.
> Add sparse and string copy constructor benchmarks for hash table.
> Make BTrees work with custom allocators that recycle memory.
> Update the readme, and (internally) fix some export processes to better keep it up-to-date going forward.
> Add the fact that CHECK_OK exits the program to the comment of CHECK_OK.
> Adds support for "%v" in absl::StrFormat and related functions for numeric types, including integer and floating point values. Users may now specify %v and have the format specifier deduced. Integer values will print according to %d specifications, unsigned values will use %u, and floating point values will use %g. Note that %v does not work for `char` due to ambiguity regarding the intended output. Please continue to use %c for `char`.
> Implement correct move constructor and assignment for absl::strings_internal::OStringStream, and mark that class final.
> Add more options for `BM_iteration` in order to see better picture for choosing trade off for iteration optimizations.
> Change `EndComparison` benchmark to not measure iteration. Also added `BM_Iteration` separately.
> Implement Eisel-Lemire for from_chars<double>
> Add `-llog` to linker options when building log_sink_set in logging internals.
> Apply clang-format to btree.h.
> Improve failure message: tell the values we don't like.
> Increase the number of per-ObjFile program headers we can expect.
> Fix "unsafe narrowing" warnings in absl, 8/n.
> Fix format string error with an explicit cast
> Add a case to detect when the Bazel compiler string is explicitly set to "gcc", instead of just detecting Bazel's default "compiler" string.
> Fix "unsafe narrowing" warnings in absl, 10/n.
> Fix "unsafe narrowing" warnings in absl, 9/n.
> Fix stacktrace header includes
> Add a missing dependency on :raw_logging_internal
> CMake: Require at least CMake 3.10
> CMake: install artifacts reflect the compiled ABI
> Fixes bug so that `%v` with modifiers doesn't compile. `%v` is not intended to work with modifiers because the meaning of modifiers is type-dependent and `%v` is intended to be used in situations where the type is not important. Please continue using if `%s` if you require format modifiers.
> Convert algorithm and container benchmarks to cc_binary
> Merge pull request #1269 from isuruf:patch-1
> InlinedVector: Small improvement to the max_size() calculation
> CMake: Mark hash_testing as a public testonly library, as it is with Bazel
> Remove the ABSL_HAVE_INTRINSIC_INT128 test from pcg_engine.h
> Fix ClangTidy warnings in btree.h and btree_test.cc.
> Fix log StrippingTest on windows when TCHAR = WCHAR
> Refactors checker.h and replaces recursive functions with iterative functions for readability purposes.
> Refactors checker.h to use if statements instead of ternary operators for better readability.
> Import of CCTZ from GitHub.
> Workaround for ASAN stack safety analysis problem with FixedArray container annotations.
> Rollback of fix "unsafe narrowing" warnings in absl, 8/n.
> Fix "unsafe narrowing" warnings in absl, 8/n.
> Changes mutex profiling
> InlinedVector: Correct the computation of max_size()
> Adds support for "%v" in absl::StrFormat and related functions for string-like types (support for other builtin types will follow in future changes). Rather than specifying %s for strings, users may specify %v and have the format specifier deduced. Notably, %v does not work for `const char*` because we cannot be certain if %s or %p was intended (nor can we be certain if the `const char*` was properly null-terminated). If you have a `const char*` you know is null-terminated and would like to work with %v, please wrap it in a `string_view` before using it.
> Fixed header guards to match style guide conventions.
> Typo fix
> Added some more no_test.. tags to build targets for controlling testing.
> Remove includes which are not used directly.
> CMake: Add an option to build the libraries that are used for writing tests without requiring Abseil's tests be built (default=OFF)
> Fix "unsafe narrowing" warnings in absl, 7/n.
> Fix "unsafe narrowing" warnings in absl, 6/n.
> Release the Abseil Logging library
> Switch time_state to explicit default initialization instead of value initialization.
> spinlock.h: Clean up includes
> Fix minor typo in absl/time/time.h comment: "ToDoubleNanoSeconds" -> "ToDoubleNanoseconds"
> Support compilers that are unknown to CMake
> Import of CCTZ from GitHub.
> Change bit_width(T) to return int rather than T.
> Import of CCTZ from GitHub.
> Merge pull request #1252 from jwest591:conan-fix
> Don't try to enable use of ARM NEON intrinsics when compiling in CUDA device mode. They are not available in that configuration, even if the host supports them.
> Fix "unsafe narrowing" warnings in absl, 5/n.
> Fix "unsafe narrowing" warnings in absl, 4/n.
> Import of CCTZ from GitHub.
> Update Abseil platform support policy to point to the Foundational C++ Support Policy
> Import of CCTZ from GitHub.
> Add --features=external_include_paths to Bazel CI to ignore warnings from dependencies
> Merge pull request #1250 from jonathan-conder-sm:gcc_72
> Merge pull request #1249 from evanacox:master
> Import of CCTZ from GitHub.
> Merge pull request #1246 from wxilas21:master
> remove unused includes and add missing std includes for absl/status/status.h
> Sort INTERNAL_DLL_TARGETS for easier maintenance.
> Disable ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE for clang-cl.
> Map the absl::is_trivially_* functions to their std impl
> Add more SimpleAtod / SimpleAtof test coverage
> debugging: handle alternate signal stacks better on RISCV
> Revert change "Fix "unsafe narrowing" warnings in absl, 4/n.".
> Fix "unsafe narrowing" warnings in absl, 3/n.
> Fix "unsafe narrowing" warnings in absl, 4/n.
> Fix "unsafe narrowing" warnings in absl, 2/n.
> debugging: honour `STRICT_UNWINDING` in RISCV path
> Fix "unsafe narrowing" warnings in absl, 1/n.
> Add ABSL_IS_TRIVIALLY_RELOCATABLE and ABSL_ATTRIBUTE_TRIVIAL_ABI macros for use with clang's __is_trivially_relocatable and [[clang::trivial_abi]].
> Merge pull request #1223 from ElijahPepe:fix/implement-snprintf-safely
> Fix frame pointer alignment check.
> Fixed sign-conversion warning in code.
> Import of CCTZ from GitHub.
> Add missing include for std::unique_ptr
> Do not re-close files on EINTR
> Renamespace absl::raw_logging_internal to absl::raw_log_internal to match (upcoming) non-raw logging namespace.
> Check for negative return values from ReadFromOffset
> Use HTTPS RFC URLs, which work regardless of the browser's locale.
> Avoid signedness change when casting off_t
> Internal Cleanup: removing unused internal function declaration.
> Make Span complain if constructed with a parameter that won't outlive it, except if that parameter is also a span or appears to be a view type.
> any_invocable_test: Re-enable the two conversion tests that used to fail under MSVC
> Add GetCustomAppendBuffer method to absl::Cord
> debugging: add hooks for checking stack ranges
> Minor clang-tidy cleanups
> Support [[gnu::abi_tag("xyz")]] demangling.
> Fix -Warray-parameter warning
> Merge pull request #1217 from anpol:macos-sigaltstack
> Undo documentation change on erase.
> Improve documentation on erase.
> Merge pull request #1216 from brjsp:master
> string_view: conditional constexpr is no longer needed for C++14
> Make exponential_distribution_test a bigger test (timeout small -> moderate).
> Move Abseil to C++14 minimum
> Revert commit f4988f5bd4176345aad2a525e24d5fd11b3c97ea
> Disable C++11 testing, enable C++14 and C++20 in some configurations where it wasn't enabled
> debugging: account for differences in alternate signal stacks
> Import of CCTZ from GitHub.
> Run flaky test in fewer configurations
> AnyInvocable: Move credits to the top of the file
> Extend visibility of :examine_stack to an upcoming Abseil Log.
> Merge contiguous mappings from the same file.
> Update versions of WORKSPACE dependencies
> Use ABSL_INTERNAL_HAS_SSE2 instead of __SSE2__
> PR #1200: absl/debugging/CMakeLists.txt: link with libexecinfo if needed
> Update GCC floor container to use Bazel 5.2.0
> Update GoogleTest version used by Abseil
> Release absl::AnyInvocable
> PR #1197: absl/base/internal/direct_mmap.h: fix musl build on mips
> absl/base/internal/invoke: Ignore bogus warnings on GCC >= 11
> Revert GoogleTest version used by Abseil to commit 28e1da21d8d677bc98f12ccc7fc159ff19e8e817
> Update GoogleTest version used by Abseil
> explicit_seed_seq_test: work around/disable bogus warnings in GCC 12
> any_test: expand the any emplace bug suppression, since it has gotten worse in GCC 12
> absl::Time: work around bogus GCC 12 -Wrestrict warning
> Make absl::StdSeedSeq an alias for std::seed_seq
> absl::Optional: suppress bogus -Wmaybe-uninitialized GCC 12 warning
> algorithm_test: suppress bogus -Wnonnull warning in GCC 12
> flags/marshalling_test: work around bogus GCC 12 -Wmaybe-uninitialized warning
> counting_allocator: suppress bogus -Wuse-after-free warning in GCC 12
> Prefer to fallback to UTC when the embedded zoneinfo data does not contain the requested zone.
> Minor wording fix in the comment for ConsumeSuffix()
> Tweak the signature of status_internal::MakeCheckFailString as part of an upcoming change
> Fix several typos in comments.
> Reformulate documentation of ABSL_LOCKS_EXCLUDED.
> absl/base/internal/invoke.h: Use ABSL_INTERNAL_CPLUSPLUS_LANG for language version guard
> Fix C++17 constexpr storage deprecation warnings
> Optimize SwissMap iteration by another 5-10% for ARM
> Add documentation on optional flags to the flags library overview.
> absl: correct the stack trace path on RISCV
> Merge pull request #1194 from jwnimmer-tri:default-linkopts
> Remove unintended defines from config.h
> Ignore invalid TZ settings in tests
> Add ABSL_HARDENING_ASSERTs to CordBuffer::SetLength() and CordBuffer::IncreaseLengthBy()
> Fix comment typo about absl::Status<T*>
> In b-tree, support unassignable value types.
> Optimize SwissMap for ARM by 3-8% for all operations
> Release absl::CordBuffer
> InlinedVector: Limit the scope of the maybe-uninitialized warning suppression
> Improve the compiler error by removing some noise from it. The "deleted" overload error is useless to users. By passing some dummy string to the base class constructor we use a valid constructor and remove the unintended use of the deleted default constructor.
> Merge pull request #714 from kgotlinux:patch-2
> Include proper #includes for POSIX thread identity implementation when using that implementation on MinGW.
> Rework NonsecureURBGBase seed sequence.
> Disable tests on some platforms where they currently fail.
> Fixed typo in a comment.
> Rollforward of commit ea78ded7a5f999f19a12b71f5a4988f6f819f64f.
> Add an internal helper for logging (upcoming).
> Merge pull request #1187 from trofi:fix-gcc-13-build
> Merge pull request #1189 from renau:master
> Allow for using b-tree with `value_type`s that can only be constructed by the allocator (ignoring copy/move constructors).
> Stop using sleep timeouts for Linux futex-based SpinLock
> Automated rollback of commit f2463433d6c073381df2d9ca8c3d8f53e5ae1362.
> time.h: Use uint32_t literals for calls to overloaded MakeDuration
> Fix typos.
> Clarify the behaviour of `AssertHeld` and `AssertReaderHeld` when the calling thread doesn't hold the mutex.
> Enable __thread on Asylo
> Add implementation of is_invocable_r to absl::base_internal for C++ < 17, define it as alias of std::is_invocable_r when C++ >= 17
> Optimize SwissMap iteration for aarch64 by 5-6%
> Fix detection of ABSL_HAVE_ELF_MEM_IMAGE on Haiku
> Don’t use generator expression to build .pc Libs lines
> Update Bazel used on MacOS CI
> Import of CCTZ from GitHub.
Closes#11687
To recap: the Nix devenv ({default,shell,flake}.nix and friends) in Scylla is a nicer (for those who consider it so, that is) alternative to dbuild: a completely deterministic build environment without Docker.
In theory we could support much more (creating installable packages, container images, various deployment affordances, etc. -- Nix is, among other things, a kind of parallel-to-everything-else devops realm) but there is clearly no demand and besides duplicating the work the release team is already doing (and doing just fine, needless to say) would be pointless and wasteful.
This PR reflects the accumulated changes that I have been carrying locally for the past year or so. The version currently in master _probably_ can still build Scylla, but that Scylla certainly would not pass unit tests.
What the previous paragraph seems to mean is, apparently I'm the only active user of Nix devenv for Scylla. Which, in turn, presents some obvious questions for the maintainers:
- Does this need to live in the Scylla source at all? (The changes to non-Nix-specific parts are minimal and unobtrusive, but they are still changes)
- If it's left in, who is going to maintain it going forward, should more users somehow appear? (I'm perfectly willing to fix things up when alerted, but no timeliness guarantees)
Closes#9557
* github.com:scylladb/scylladb:
nix: add README.md
build: improvements & upgrades to Nix dev environment
build: allow setting SCYLLA_RELEASE from outside
The combination is hard to read and modify.
Closes#11665
* github.com:scylladb/scylladb:
readers/multishard: restore shard_reader_v2::do_fill_buffer() indentation
readers/multishard: convert shard_reader_v2::do_fill_buffer() to a pure coroutine
Include the unique test name (the unique name distinguishes between different test repeats) and the test case name where possible. Improve printing of clusters: include the cluster name and stopped servers. Fix some logging calls and add new ones.
Examples:
```
------ Starting test test_topology ------
```
became this:
```
------ Starting test test_topology.1::test_add_server_add_column ------
```
This:
```
INFO> Leasing Scylla cluster {127.191.142.1, 127.191.142.2, 127.191.142.3} for test test_add_server_add_column
```
became this:
```
INFO> Leasing Scylla cluster ScyllaCluster(name: 02cdd180-40d1-11ed-8803-3c2c30d32d96, running: {127.144.164.1, 127.144.164.2, 127.144.164.3}, stopped: {}) for test test_topology.1::test_add_server_add_column
```
Closes#11677
* github.com:scylladb/scylladb:
test/pylib: scylla_cluster: improve cluster printing
test/pylib: don't pass test_case_name to after-test endpoint
test/pylib: scylla_cluster: track current test case name and print it
test.py: pass the unique test name (e.g. `test_topology.1`) to cluster manager
test/pylib: scylla_cluster: pass the test case name to `before_test`
test/pylib: use "test_case_name" variable name when talking about test cases
reclaim_timer uses a coarse clock, but does not account for
the measurement error introduced by that -- it can falsely
report reclaims as stalls, even if they are shorter by a full
coarse clock tick from the requested threshold
(blocked-reactor-notify-ms).
Notably, if the stall threshold happens to be smaller or equal to coarse
clock resolution, Scylla's log gets spammed with false stall reports.
The resolution of coarse clocks in Linux is 1/CONFIG_HZ. This is
typically equal to 1 ms or 4 ms, and stall thresholds of this order
can occur in practice.
Eliminate false positives by requiring the measured reclaim duration to
be at least 1 clock tick longer than the configured threshold for it to
be considered a stall.
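The fix boils down to one tick of slack in the stall predicate. A minimal sketch of the idea (the function name and the 4 ms tick are illustrative assumptions, not Scylla's actual code):

```python
COARSE_TICK_MS = 4  # e.g. CONFIG_HZ=250; the real resolution is 1/CONFIG_HZ

def is_stall(measured_ms, threshold_ms, tick_ms=COARSE_TICK_MS):
    # A coarse clock can misreport a duration by up to one tick, so only
    # report a stall when the measurement exceeds the threshold by at
    # least a full tick. The naive `measured_ms >= threshold_ms` check
    # spams false positives whenever threshold_ms <= tick_ms.
    return measured_ms >= threshold_ms + tick_ms
```

With a 4 ms threshold and a 4 ms tick, a single-tick measurement no longer counts as a stall.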
Fixes#10981
Closes#11680
"
This series adds a long-awaited transition of our auto-generation
code to irq_cpu_mask instead of 'mode' in perftune.yaml.
And then it fixes a regression in scylla_prepare perftune.yaml
auto-generation logic.
"
* 'scylla_prepare_fix_regression-v1' of https://github.com/vladzcloudius/scylla:
scylla_prepare + scylla_cpuset_setup: make scylla_cpuset_setup idempotent without introducing regressions
scylla_prepare: stop generating 'mode' value in perftune.yaml
* Add some more useful stuff to the shell environment, so it actually
works for debugging & post-mortem analysis.
* Wrap ccache & distcc transparently (distcc will be used unless
NODISTCC is set to a non-empty value in the environment; ccache will
be used if CCACHE_DIR is not empty).
* Package the Scylla Python driver (instead of the C* one).
* Catch up to misc build/test requirements (including optional) by
requiring or custom-packaging: wasmtime 0.29.0, cxxbridge,
pytest-asyncio, liburing.
* Build statically-linked zstd in a saner and more idiomatic fashion.
* In pure builds (where sources lack Git metadata), derive
SCYLLA_RELEASE from source hash.
* Refactor things for more parameterization.
* Explicitly stub out installPhase (seeing that "nix build" succeeds
up to installPhase means we didn't miss any dependencies).
* Add flake support.
* Add copious comments.
Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
The extant logic for deriving the value of SCYLLA_RELEASE from the
source tree has those assumptions:
* The tree being built includes Git metadata.
* The value of `date` is trustworthy and interesting.
* There are no uncommitted changes (those relevant to building,
anyway).
The above assumptions are either irrelevant or problematic in pure
build environments (such as the sandbox set up by `nix-build`):
* Pure builds use cleaned-up sources with all timestamps reset to Unix
time 0. Those cleaned-up sources are saved (in the Nix store, for
example) and content-hashed, so leaving the (possibly huge) Git
metadata increases the time to copy the sources and wastes disk
space (in fact, Nix in flake mode strips `.git` unconditionally).
* Pure builds run in a sandbox where time is, likewise, reset to Unix
time 0, so the output of `date` is neither informative nor useful.
Now, the only build step that uses Git metadata in the first place is
the SCYLLA_RELEASE value derivation logic. So, essentially, answering
the question "is the Git metadata needed to build Scylla" is a matter
of definition, and is up to us. If we elect to ignore Git metadata
and current time, we can derive SCYLLA_RELEASE value from the content
hash of the cleaned-up tree, regardless of the way that tree was
arrived at.
This change makes it possible to skip the derivation of SCYLLA_RELEASE
value from Git metadata and current time by way of setting
SCYLLA_RELEASE in the environment.
Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
We notice there are two separate conditions controlling a call to
a single outcome, notify_pressure_relief(). Merge them into a single
boolean variable.
It started life as something shared between memory_hard_limit and
region_group, but now that they are back being the same thing, we
can make it a member again.
The two classes always have a 1:1 or 0:1 relationship, and
so we can just move all the members of memory_hard_limit
into region_group, with the functions that track the relationship
(memory_hard_limit::{add,del}()) removed.
The 0:1 relationship is maintained by initializing the
hard limit parameter with std::numeric_limits<size_t>::max().
The _hard_total_memory variable is always checked against this
parameter before anything is done, and with this default the check
can never trigger.
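The sentinel trick can be sketched in a few lines of Python (class and member names here mirror the C++ ones only loosely and are illustrative):

```python
import sys

UNLIMITED = sys.maxsize  # stand-in for std::numeric_limits<size_t>::max()

class RegionGroup:
    def __init__(self, hard_limit=UNLIMITED):
        self.hard_limit = hard_limit
        self.hard_total_memory = 0

    def under_hard_pressure(self):
        # With the UNLIMITED default this comparison can never hold, which
        # models the "0:1" case without a separate memory_hard_limit object.
        return self.hard_total_memory > self.hard_limit
```

The 1:1 case simply passes a real limit; the 0:1 case needs no special-casing anywhere else.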
In preparation for merging memory_hard_limit into region_group,
disambiguate similarly named members by adding the word "hard" in
random places.
memory_hard_limit and region_group are candidates for merging
because they constantly reference each other, and memory_hard_limit
does very little by itself.
do_fill_buffer() is an eclectic mix of coroutines and continuations.
That makes it hard to follow what is running sequentially and
concurrently.
Convert it into a pure coroutine by changing internal continuations
to lambda coroutines. These lambda coroutines are guarded with
seastar::coroutine::lambda. Furthermore, a future that is co_awaited
is converted to immediate co_await (without an intermediate future),
since seastar::coroutine::lambda only works if the coroutine is awaited
in the same statement it is defined on.
Print the cluster name and stopped servers in addition to the running
servers.
Fix a logging call which tried to print a server in place of a cluster
and even at that it failed (the server didn't have a hostname yet so it
printed as an empty string). Add another logging call.
Use `_before_test` calls to track the current test case name.
Concatenate it with the unique test name like this:
`test_topology.1::test_add_server_add_column`, and print it
instead of the test case name.
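The concatenation rule is trivial but worth pinning down; a hypothetical helper (this exact function does not exist in pylib) would look like:

```python
def full_test_name(unique_name, case_name=None):
    # "test_topology.1" + "test_add_server_add_column"
    #   -> "test_topology.1::test_add_server_add_column"
    # Fall back to the unique test name alone when no case name is known yet.
    return f"{unique_name}::{case_name}" if case_name else unique_name
```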
We pass the test case name to `after_test` - so make it consistent.
Arguably, the test case name is more useful (as it's more precise) than
the test name.
Reduce the false dependencies on db/large_data_handler.hh by
not including it from commonly used header files, and rather including
it only in the source files that actually need it.
This is in preparation for https://github.com/scylladb/scylladb/issues/11449
Closes#11654
* github.com:scylladb/scylladb:
test: lib: do not include db/large_data_handler.hh in test_service.hh
test: lib: move sstable test_env::impl ctor out of line
sstables: do not include db/large_data_handler.hh in sstables.hh
api/column_family: add include db/system_keyspace.hh
The generator was first setting the marker then applied tombstones.
The marker was set like this:
row.marker() = random_row_marker();
Later, when shadowable tombstones were applied, they were compacted
with the marker as expected.
However, the key for the row was chosen randomly in each iteration and
there are multiple keys set, so there was a possibility of a key clash
with an earlier row. This could override the marker without applying
any tombstones, which is conditional on random choice.
This could generate rows with markers uncompacted with shadowable tombstones.
This broke row_cache_test::test_concurrent_reads_and_eviction on the
comparison between expected and read mutations. The latter was
compacted because it went through an extra merge path, which compacts
the row.
Fix by making sure there are no key clashes.
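One straightforward way to guarantee clash-free keys is to sample without replacement; a minimal sketch of the idea (not the generator's actual code):

```python
import random

def distinct_row_keys(n, key_space):
    # random.sample draws without replacement, so no two generated rows
    # can share a key, and a later marker can never clobber an earlier
    # row's marker without its tombstones having been applied.
    return random.sample(key_space, n)
```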
Closes#11663
The `server_remove` function did a very weird thing: it shut down a
server and made the framework 'forget' about it. From the point of view
of the Scylla cluster and the driver the server was still there.
Replace the function's body with `raise NotImplementedError`. In the
future it can be replaced with an implementation that calls
`removenode` on the Scylla cluster.
Remove `test_remove_server_add_column` from `test_topology`. It
effectively does the same thing as `test_stop_server_add_column`, except
that the framework also 'forgets' about the stopped server. This could
lead to weird situations because the forgotten server's IP could be
reused in another test that was running concurrently with this test.
Closes#11657
Commit a9805106 (table: seal_active_memtable: handle ENOSPC error)
made the memtable flushing code withstand ENOSPC and retry flushing
in the hope that the node administrator would provide some free space.
However, it looks like the IO code may report back ENOSPC with some
exception type this code doesn't expect. This patch tries to fix that.
refs: #11245
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The existing loop is very branchy in its attempts to find out whether or
not to abort. The "allowed_retries" count can be a good indicator of the
decision taken. This makes the code notably shorter and easier to extend.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
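The shape of the resulting loop can be sketched as follows (a Python illustration of the retry-budget idea, not the actual C++ code):

```python
import errno

def run_with_enospc_retries(op, allowed_retries):
    # A single retry budget replaces the branchy abort-or-retry decisions:
    # ENOSPC is retried while the budget lasts; any other error, or an
    # exhausted budget, propagates to the caller.
    while True:
        try:
            return op()
        except OSError as e:
            if e.errno != errno.ENOSPC or allowed_retries <= 0:
                raise
            allowed_retries -= 1
```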
It was needed for defining and referencing nop_lp_handler
and in sstable_3_x_test for testing the large_data_handler.
Remove the include from the commonly used header file
to reduce the false dependencies on large_data_handler.hh
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The logic to reject explicit snapshot of views/indexes was improved in aa127a2dbb. However, we never implemented auto-snapshot of
view/indexes when taking a snapshot of the base table.
This is implemented in this patch.
The implementation is built on top of
ba42852b0e
so it would be hard to backport to 5.1 or earlier
releases.
Fixes#11612
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11616
* github.com:scylladb/scylladb:
database: automatically take snapshot of base table views
api: storage_service: reject snapshot of views in api layer
For db::system_keyspace::load_view_build_progress, which is currently
indirectly satisfied via sstables/sstables.hh ->
db/large_data_handler.hh
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Due to issue #11567, Alternator does not yet support adding a GSI to an
existing table via UpdateTable with the GlobalSecondaryIndexUpdates
parameter.
However, currently, we print a misleading error message in this case,
complaining about the AttributeDefinitions parameter. This parameter
is also required with GlobalSecondaryIndexUpdates, but it's not the
main problem, and the user is likely to be confused why the error message
points to that specific parameter and what it means that this parameter
is claimed to be "not supported" (while it is supported, in CreateTable).
With this patch, we report that GlobalSecondaryIndexUpdates is not
supported.
This patch does not fix the unsupported feature - it just improves
the error message saying that it's not supported.
Refs #11567
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11650
When walking through the ranges, we should yield to prevent stalls. We
do similar yield in other node operations.
Fix a stall in 5.1.dev.20220724.f46b207472a3 with build-id
d947aaccafa94647f71c1c79326eb88840c5b6d2
```
!INFO | scylla[6551]: Reactor stalled for 10 ms on shard 0. Backtrace:
0x4bbb9d2 0x4bba630 0x4bbb8e0 0x7fd365262a1f 0x2face49 0x2f5caff
0x36ca29f 0x36c89c3 0x4e3a0e1
```
Fixes#11146
Closes#11160
Extend the cql3 truncate statement to accept attributes,
similar to modification statements.
To achieve that we define cql3::statements::raw::truncate_statement
derived from raw::cf_statement, and implement its pure virtual
prepare() method to make a prepared truncate_statement.
The latter is no longer derived from raw::cf_statement,
and just stores a schema_ptr to get to the keyspace and column_family.
`test_truncate_using_timeout` cql-pytest was added to test
the new USING TIMEOUT feature.
Fixes#11408
Also, update the docs/cql/ddl.rst truncate-statement section accordingly.
Closes#11409
* github.com:scylladb/scylladb:
docs: cql-extensions: add TRUNCATE to USING TIMEOUT section.
docs: cql: ddl: add support for TRUNCATE USING TIMEOUT
cql3, storage_proxy: add support for TRUNCATE USING TIMEOUT
cql3: selectStatement: restrict to USING TIMEOUT in grammar
cql3: deleteStatement: restrict to USING TIMEOUT|TIMESTAMP in grammar
The series contains fixes for system.large_* log warning and respective documentation.
This prepares the way for adding a new system.large_collections table (See #11449):
Fixes#11620
Fixes#11621
Fixes#11622
The respective fixes should be backported to different release branches, based on the respective patches they depend on (mentioned in each issue).
Closes#11623
* github.com:scylladb/scylladb:
docs: adjust to sstable base name
docs: large-partition-table: adjust for additional rows column
docs: debugging-large-partition: update log warning example
db/large_data_handler: print static cell/collection description in log warning
db/large_data_handler: separate pk and ck strings in log warning with delimiter
Fix the type of `create_server`, rename `topology_for_class` to `get_cluster_factory`, simplify the suite definitions and parameters passed to `get_cluster_factory`
Closes#11590
* github.com:scylladb/scylladb:
test.py: replace `topology` with `cluster_size` in Topology tests
test.py: rename `topology_for_class` to `get_cluster_factory`
test/pylib: ScyllaCluster: fix create_server parameter type
The test was disabled due to a bug in the Python driver which caused the
driver not to reconnect after a node was restarted (see
scylladb/python-driver#170).
Introduce a workaround for that bug: we simply create a new driver
session after restarting the nodes. Reenable the test.
Closes#11641
Extended the queries language to support bind variables which are bound in the
execution stage, before creating a raft command.
Adjusted `test_broadcast_tables.py` to prepare statements at the beginning of the test.
Fixed a small bug in `strongly_consistent_modification_statement::check_access`.
Closes#11525
Before this patch we could get an OOM if we
received several big commands. The number of
commands was small, but their total size
in bytes was large.
snapshot_trailing_size is needed to guarantee
progress. Without this limit the fsm could
get stuck if the size of the next item is greater than
max_log_size - (size of trailing entries).
Closes#11397
* github.com:scylladb/scylladb:
raft replication_test, make backpressure test to do actual backpressure
raft server, shrink_to_fit on log truncation
raft server, release memory if add_entry throws
raft server, log size limit in bytes
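The byte-size limit and the trailing-size guarantee described above can be sketched as a simplified model (illustrative only, not the actual raft fsm code; `max_log_size` and `snapshot_trailing_size` mirror the limits named in the message):

```python
# Sketch of a byte-bounded raft log: when total size exceeds max_log_size,
# snapshot and truncate, keeping at most snapshot_trailing_size bytes of
# trailing entries so the next entry always fits (progress guarantee).
class BoundedLog:
    def __init__(self, max_log_size, snapshot_trailing_size):
        # trailing budget must leave room for new entries, or the fsm
        # could get stuck exactly as described above
        assert snapshot_trailing_size < max_log_size
        self.max_log_size = max_log_size
        self.snapshot_trailing_size = snapshot_trailing_size
        self.entries = []          # list of entry sizes in bytes
        self.snapshots_taken = 0

    def total_size(self):
        return sum(self.entries)

    def add_entry(self, size):
        self.entries.append(size)
        if self.total_size() > self.max_log_size:
            self._snapshot_and_truncate()

    def _snapshot_and_truncate(self):
        # Drop oldest entries until the trailing part fits its budget.
        self.snapshots_taken += 1
        while self.entries and self.total_size() > self.snapshot_trailing_size:
            self.entries.pop(0)
```

Keeping `snapshot_trailing_size` strictly below `max_log_size` is what prevents the stuck state: after truncation there is always at least `max_log_size - snapshot_trailing_size` bytes of headroom for the next item.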
When there are errors starting the first cluster(s), the server logs are needed. So move `.start()` to the `try` block in `test.py` (out of `asynccontextmanager`).
While there, make `ScyllaClusterManager.start()` idempotent.
Closes#11594
* github.com:scylladb/scylladb:
test.py: fix ScyllaClusterManager start/stop
test.py: fix topology init error handling
We don't want to keep memory we don't use, shrink_to_fit guarantees that.
In fact, boost::deque frees up memory when items are deleted, so this change has little effect at the moment, but it may pay off if we change the container in the future.
List the queries that support the TIMEOUT parameter.
Mention the newly added support for TRUNCATE
USING TIMEOUT.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Extend the cql3 truncate statement to accept attributes,
similar to modification statements.
To achieve that we define cql3::statements::raw::truncate_statement
derived from raw::cf_statement, and implement its pure virtual
prepare() method to make a prepared truncate_statement.
The latter, statements::truncate_statement, is no longer derived
from raw::cf_statement, and just stores a schema_ptr to get to the
keyspace and column_family names.
`test_truncate_using_timeout` cql-pytest was added to test
the new USING TIMEOUT feature.
Fixes#11408
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
It is preferred to reject USING TTL / TIMESTAMP at the grammar
level rather than functionally validating the USING attributes.
test_using_timeout was adjusted accordingly to expect the
`SyntaxException` error rather than `InvalidRequest`.
Note that cql3::statements::raw::select_statement validate_attrs
now asserts that the ttl or the timestamp attributes aren't set.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
It is preferred to reject USING TTL / TIMESTAMP at the grammar
level rather than functionally validating the USING attributes.
test_using_timeout was adjusted accordingly to expect the
`SyntaxException` error rather than `InvalidRequest`.
Note that now delete_statement ctor asserts that the ttl
attribute is not set.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
First, a reminder of a few basic concepts in Scylla:
- "topology" is a mapping: for each node, its DC and Rack.
- "replication strategy" is a method of calculating replica sets in
a cluster. It is not a cluster-global property; each keyspace can have
a different replication strategy. A cluster may have multiple
keyspaces.
- "cluster size" is the number of nodes in a cluster.
Replication strategy is orthogonal to topology. Cluster size can be
derived from topology and is also orthogonal to replication strategy.
test.py was confusing the three concepts together. For some reason,
Topology suites were specifying a "topology" parameter which contained
replication strategy details - having nothing to do with topology. Also
it's unclear why a test suite would specify anything to do with
replication strategies - after all, a test may create keyspaces with
different replication strategies, and a suite may contain multiple
different tests.
Get rid of the "topology" parameter, replace it with a simple
"cluster_size". In the future we may re-introduce it when we actually
implement the possibility to start clusters with custom topologies
(which involves configuring the snitch etc.) Simplify the test.py code.
The validator has several API families with increasing amount of detail.
E.g. there is an `operator()(mutation_fragment_v2::kind)` and an
overload also taking a position. These different API families
currently cannot be mixed. If one uses one overload-set, one has to
stick with it, not doing so will generate false-positive failures.
This is hard to explain in documentation to users (provided they even
read it). Instead, just make the validator robust enough such that the
different API subsets can be mixed in any order. The validator will try
to make the most of the situation and validate as much as possible.
Behind the scenes all the different validation methods are consolidated
into just two: one for the partition level, the other for the
intra-partition level. All the different overloads just call these
methods passing as much information as they have.
A test is also added to make sure this works.
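The consolidation can be sketched like so (an illustrative Python model; the real validator is C++'s `mutation_fragment_stream_validator`): every public overload funnels into shared internal checks, each tolerating missing detail instead of producing false positives.

```python
# Sketch: overloads with increasing detail all delegate to the same two
# internal validation methods, so callers may freely mix the API subsets.
class StreamValidator:
    def __init__(self):
        self.last_pos = None  # last intra-partition position seen, if any

    def _validate_partition_level(self, kind):
        # Partition-level transition checks would go here; elided.
        return True

    def _validate_intra_partition(self, kind, pos=None):
        # With a position we can check ordering; without one we only
        # check kind transitions, avoiding false-positive failures.
        if pos is not None:
            if self.last_pos is not None and pos < self.last_pos:
                return False
            self.last_pos = pos
        return True

    # The "overload set": kind-only and kind+position entry points.
    def validate_kind(self, kind):
        return (self._validate_partition_level(kind)
                and self._validate_intra_partition(kind))

    def validate_kind_and_pos(self, kind, pos):
        return (self._validate_partition_level(kind)
                and self._validate_intra_partition(kind, pos))
```

Because the kind-only overload simply passes less information to the same internal method, switching between the two entry points mid-stream cannot by itself trigger a failure.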
The previous name had nothing to do with what the function calculated
and returned (it returned a `create_cluster` function; the standard name
for a function that constructs objects would be 'factory', so
`get_cluster_factory` is an appropriate name for a function that returns
cluster factories).
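The naming rationale can be illustrated with a minimal sketch (the dict "cluster" is a hypothetical stand-in; the real code is in test.py's `PythonTestSuite`):

```python
# Sketch: get_cluster_factory returns a factory -- a zero-argument callable
# that builds clusters -- hence the "factory" naming.
def get_cluster_factory(cluster_size):
    def create_cluster():
        # Stand-in for starting `cluster_size` Scylla servers.
        return {"servers": [f"server-{i}" for i in range(cluster_size)]}
    return create_cluster
```

Each call to the returned factory yields a fresh cluster, which is exactly the behavior the old name `topology_for_class` failed to convey.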
The only usage of `ScyllaCluster` constructor passed a `create_server`
function which expected a `List[str]` for the second parameter, while
the constructor specified that the function should expect an
`Optional[List[str]]`. There was no reason for the latter, we can easily
fix this type error.
Also give a type hint for `create_cluster` function in
`PythonTestSuite.topology_for_class`. This is actually what caught the
type error.
Before this patch we could get an OOM if we
received several big commands. The number of
commands was small, but their total size
in bytes was large.
snapshot_trailing_size is needed to guarantee
progress. Without this limit the fsm could
get stuck if the size of the next item is
greater than max_log_size - (size of trailing entries).
The logic to reject explicit snapshot of views/indexes
was improved in aa127a2dbb.
However, we never implemented auto-snapshot of
view/indexes when taking a snapshot of the base table.
This is implemented in this patch.
The implementation is built on top of
ba42852b0e
so it would be hard to backport to 5.1 or earlier
releases.
Fixes#11612
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than pushing the check to
`snapshot_ctl::take_column_family_snapshot`, just check
that explicitly when taking a snapshot of a particular
table by name over the api.
Other paths that call snapshot_ctl::take_column_family_snapshot
are internal and use it to snap views already.
With that, we can get rid of the allow_view_snapshots flag
that was introduced in aab4cd850c.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Allow the high level filtering validator to be reset() to a certain
position, so it can be used in situations where the consumption is not
continuous (fast-forwarding or paging).
Currently the active range tombstone change is validated in the high
level `mutation_fragment_stream_validating_stream`, meaning that users of
the low-level `mutation_fragment_stream_validator` don't benefit from
checking that tombstones are properly closed.
This patch moves the validation down to the low-level validator (which
is what the high-level one uses under the hood too), and requires all
users to pass information about changes to the active tombstone for each
fragment.
This test reproduces issue #10365: It shows that although "IS NOT NULL" is
not allowed in regular SELECT filters, in a materialized view it is allowed,
even for non-key columns - but then outright ignored and does not actually
filter out anything - a fact which already surprised several users.
The test also fails on Cassandra - it also wrongly allows IS NOT NULL
on the non-key columns but then ignores this in the filter. So the test
is marked with both xfail (known to fail on Scylla) and cassandra_bug
(fails on Cassandra because of what we consider to be a Cassandra bug).
Refs #10365
Refs #11606
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11615
The goal is to avoid default-initializing an object when its fields are about to be immediately overwritten by the subsequent code.
Closes#11619
* github.com:scylladb/scylladb:
replication_strategy: Construct temp tokens in place
topology: Define copy-constructor with init-lists
boolean_factors is a function that takes an expression
and extracts all children of the top level conjunction.
The problem is that it returns a vector<expression>,
which is inefficient.
Sometimes we would like to iterate over all boolean
factors without allocations. for_each_boolean_factor
is implemented for this purpose.
boolean_factors() can be implemented using
for_each_boolean_factor, so it's done to
reduce code duplication.
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
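In Python terms, the relationship between the two functions looks like this (a sketch mirroring the cql3 expression code; `Conjunction` is a stand-in for the C++ expression variant):

```python
# Sketch: for_each_boolean_factor walks the top-level conjunction without
# allocating a container; boolean_factors is implemented on top of it.
class Conjunction:
    def __init__(self, children):
        self.children = children

def for_each_boolean_factor(expr, func):
    """Call func on every factor of the top-level conjunction."""
    if isinstance(expr, Conjunction):
        for child in expr.children:
            for_each_boolean_factor(child, func)
    else:
        func(expr)

def boolean_factors(expr):
    """Collect factors into a list, reusing the allocation-free walker."""
    result = []
    for_each_boolean_factor(expr, result.append)
    return result
```

Callers that only need to visit each factor call `for_each_boolean_factor` directly and skip the vector allocation entirely.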
Snitch uses gossiper for several reasons, one of which is to re-gossip the topology-related app states when the property-file snitch config changes. This set cuts that link by moving the re-gossiping into the existing storage_service::snitch_reconfigured() subscription. Since initial snitch state gossiping happens in storage service as well, this change is not unreasonable.
Closes#11630
* github.com:scylladb/scylladb:
storage_service: Re-gossiping snitch data in reconfiguration callback
storage_service: Coroutinize snitch_reconfigured()
storage_service: Indentation fix after previous patch
storage_service: Reshard to shard-0 earlier
storage_service: Refactor snitch reconfigured kick
Since 244df07771 (scylla 5.1),
only the sstable basename is kept in the large_* system tables.
The base path can be determined from the keyspace and
table name.
Fixes#11621
Adjust the examples in the documentation accordingly.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Since a7511cf600 (scylla 5.0),
sstables containing partitions with too many rows are recorded in system.large_partitions.
Adjust the doc accordingly.
Fixes#11622
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The log warning format has changed since f3089bf3d1
and was fixed in the previous patch to include
a delimiter between the partition key, clustering key, and
column name.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When warning about a large cell/collection in a static row,
print that fact in the log warning to make it clearer.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently (since f3089bf3d1),
when printing a warning to the log about large rows and/or cells
the clustering key string is concatenated to the partition key string,
rendering the warning confusing and much less useful.
This patch adds a '/' delimiter to separate the fields,
and also uses one to separate the clustering key from the column name
for large cells. In case of a static cell, the clustering key is null
hence the warning will look like: `pk//column`.
This patch does NOT change anything in the large_* system
table schema or contents. It changes only the log warning format
that need not be backward compatible.
Fixes#11620
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Nowadays it's done inside snitch, and snitch needs to carry gossiper
reference for that. There's an ongoing effort in de-globalizing snitch
and fixing its dependencies. This patch cuts this snitch->gossiper link
to facilitate the mentioned effort.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patch will add more sleeping code to it and it's simpler if the new
call is co_await-ed rather than .then()-ed
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The snitch_reconfigured calls update_topology with local node bcast
address argument. Things get simpler if the callee gets the address
itself.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Add Cassandra-compatible functionality: show a warning/error when tombstone_warn_threshold/tombstone_failure_threshold is reached on select, per partition. Propagate the raw query_string from the coordinator to the replicas.
Closes#11356
* github.com:scylladb/scylladb:
add utf8:validate to operator<< partition_key with_schema.
Show warn message if `tombstone_warn_threshold` reached on querier.
Otherwise, the token_metadata object is default-initialized, then it's
move-assigned from another object.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Otherwise the topology is default-constructed, then its fields
are copy-assigned with the data from the copy-from reference.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Found by a fragment stream validator added to the mutation-compactor (https://github.com/scylladb/scylladb/pull/11532). As that PR moves very slowly, the fixes for the issues found are split out into a PR of their own.
The first two of these issues seem benign, but it is important to remember that how benign an invalid fragment stream is depends entirely on the consumer of said stream. The present consumer of said streams may swallow the invalid stream without problem now, but any future change may cause it to enter a corrupt state.
The last one is a non-benign problem (again because the consumer reacts badly already) causing problems when building query results for range scans.
Closes#11604
* github.com:scylladb/scylladb:
shard_reader: do_fill_buffer(): only update _end_of_stream after buffer is copied
readers/mutation_readers: compacting_reader: remember injected partition-end
db/view: view_builder::execute(): only inject partition-start if needed
When the querier reads a page with more tombstones than the `tombstone_warn_threshold` limit, a warning message appears in the logs.
If `tombstone_warn_threshold` is 0, the feature is disabled.
Refs scylladb#11410
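A minimal model of the check (hypothetical function name; the real check lives in the querier's C++ code):

```python
# Sketch: count tombstones seen while reading a page; once the count
# exceeds the threshold, emit a warning. A threshold of 0 disables it.
def check_tombstones(tombstones_read, tombstone_warn_threshold, log=print):
    if tombstone_warn_threshold > 0 and tombstones_read > tombstone_warn_threshold:
        log(f"Read {tombstones_read} tombstones "
            f"(threshold {tombstone_warn_threshold})")
        return True  # warned
    return False
```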
do_update() has an output parameter (top_relief) which can either
be set to an input parameter or left alone. Simplify it by returning
bool and letting the caller reuse the parameter's value instead.
They are write-only.
This corresponds to the fact that memory_hard_limit does not do
flushing (which is initiated by crossing the soft limit), it only
blocks new allocations.
We have added the finished percentage for repair-based node operations.
This patch adds the finished percentage for node ops using the old
streaming.
Example output:
scylla_streaming_finished_percentage{ops="bootstrap",shard="0"} 1.000000
scylla_streaming_finished_percentage{ops="decommission",shard="0"} 1.000000
scylla_streaming_finished_percentage{ops="rebuild",shard="0"} 0.561945
scylla_streaming_finished_percentage{ops="removenode",shard="0"} 1.000000
scylla_streaming_finished_percentage{ops="repair",shard="0"} 1.000000
scylla_streaming_finished_percentage{ops="replace",shard="0"} 1.000000
In addition to the metrics, the finished percentage is shown in the log:
[shard 0] range_streamer - Finished 2698 out of 2817 ranges for rebuild, finished percentage=0.95775646
Fixes#11600
Closes#11601
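The metric in the log line above is a straightforward ratio, e.g. (a sketch):

```python
# Sketch: the finished percentage reported per node operation.
def finished_percentage(finished_ranges, total_ranges):
    if total_ranges == 0:
        return 1.0  # nothing to stream counts as done
    return finished_ranges / total_ranges
```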
We made this function a template to prevent code duplication, but now
memory_hard_limit was sufficiently simplified so that the implementations
can start to diverge.
Use `>` rather than `>=` to match the hard limit check. This will
aid simplification, since for memory_hard_limit the soft and hard limits
are identical.
This should not cause any material behavior change, we're not sensitive
to single byte accounting. Typical limits are on the order of gigabytes.
We observe that memory_hard_limit's reclaim_config is only ever
initialized as default, or with just the hard_limit parameter.
Since soft_limit defaults to hard_limit, we can collapse the two
into a limit. The reclaim callbacks are always left as the default
no-op functions, so we can eliminate them too.
This fits with memory_hard_limit only being responsible for the hard
limit, and for it not having any memtables to reclaim on its own.
region_group_reclaimer is used to initialize (by reference) instances of
memory_hard_limit and region_group. Now that it is a final class, we
can fold it into its users by pasting its contents into those users,
and using the initializer (reclaim_config) to initialize the users. Note
there is a 1:1 relationship between a region_group_reclaimer instance
and a {memory_hard_limit,region_group} instance.
It may seem like code duplication to paste the contents of one class into
two, but the two classes use region_group_reclaimer differently, and most
of the code is just used to glue different classes together, so the
next patches will be able to get rid of much of it.
Some notes:
- no_reclaimer was replaced by a default reclaim_config, as that's how
no_reclaimer was initialized
- all members were added as private, except when a caller required one
to be public
- an under_pressure() member already existed, forwarding to the reclaimer;
this was just removed.
This inheritance makes it harder to get rid of the class. Since
there are no longer any virtual functions in the class (apart from
the destructor), we can just convert it to a data member. In a few
places, we need forwarding functions to make formerly-inherited functions
visible to outside callers.
The virtual destructor is removed and the class is marked final to
verify it is no longer a base class anywhere.
In one test, region_group_reclaimer is wrapped in another class just
to toggle a bool, but with the new callbacks it's easy to just use
a bool instead.
It's just so much nicer.
The "threshold" limit was renamed to "hard_limit" to contrast it with
"soft_limit" (in fact threshold is a good name for soft_limit, since
it's a point where the behavior begins to change, but that's too much
of a change).
region_group_reclaimer is partially policy (deciding when to reclaim)
and partially mechanism (implementing reclaim via virtual functions).
Move the mechanism to callbacks. This will make it easy to fold the
policy part into region_group and memory_hard_limit. This folding is
expected to simplify things since most of region_group_reclaimer is
cross-class communication.
It clashes with region_group_reclaimer::notify_relief, which does something
different. Since we plan to merge region_group_reclaimer into
memory_hard_limit and region_group (this can simplify the code), we
need to avoid duplicate function names.
Currently, region_group forms a hierarchy. Originally it was a tree,
but previous work whittled it down to a parent-child relationship
(with a single, possible optional parent, and a single child).
The actual behavior of the parent and child are very different, so
it makes sense to split them. The main difference is that the parent
does not contain any regions (memtables), but the child does.
This patch mechanically splits the class. The parent is named
memory_hard_limit (reflecting its role to prevent lsa allocation
above the memtable configured hard limit). The child is still named
region_group.
Details of the transformation:
- each function or data member in region_group is either moved to
memory_hard_limit, duplicated in memory_hard_limit, or left in
region_group.
- the _regions and _blocked_requests members, which were always
empty in the parent, were not duplicated. Any member that only accessed
them was similarly left alone.
- the "no_reclaimer" static member which was only used in the parent
was moved there. Similarly the constructor which accepted it
was moved.
- _child was moved to the parent, and _parent was kept in the child
(more or less the defining change of the split). Similarly
add(region_group*) and del(region_group*) (which manage _child) were moved.
- do_for_each_parent(), which iterated to the top of the tree, was removed
and its callers manually unroll the loop. For the parent, this is just
a single iteration (since we're iterating towards the root), for the child,
this can be two iterations, but the second one is usually simpler since
the parent has many members removed.
- do_update(), introduced in the previous patch, was made a template that
can act on either the parent or the child. It will be further simplified
later.
- some tests that check now-impossible topologies were removed.
- the parent's shutdown() is trivial since it has no _blocked_requests,
but it was kept to reduce churn in the callers.
A mechanical transformation intended to allow reuse later. The function
doesn't really deserve to exist on its own, so it will be swallowed back
by its callers later.
region_group currently fulfills two roles: in one role, when instantiated
as dirty_memory_manager::_virtual_region_group, it is responsible
for holding functions that allocate memtable memory (writes) and only
allowing them to run when enough dirty memory has been flushed from other
memtables. The other role, when instantiated as
dirty_memory_manager::_real_region_group, is to provide a hard stop when
the total amount of dirty memory exceeds the limit, since the other limit
is only estimated.
We want to simplify the whole thing, which means not using the same class
for two different roles (or rather, we can use it for both roles if we
simplify the internals significantly).
As a first step towards clarifying what functionality is used in what
role, move some classes related to holding allocating functions to a new
class allocation_queue. We will gradually move more content there, reducing
the amount of role confusion in region_group.
Type aliases are added to reduce churn.
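The two roles can be sketched as a pair of checks over a shared model of dirty memory (an illustrative model only; the real classes are dirty_memory_manager's region groups):

```python
# Sketch: one limit throttles new writes once estimated ("virtual") dirty
# memory is high, the other is a hard stop on real dirty memory.
class DirtyMemoryModel:
    def __init__(self, soft_limit, hard_limit):
        self.soft_limit = soft_limit    # queues writers, triggers flushing
        self.hard_limit = hard_limit    # absolute stop for allocations
        self.virtual_dirty = 0          # estimated unflushed memory
        self.real_dirty = 0             # actual unflushed memory

    def can_admit_write(self):
        # Role 1 (_virtual_region_group): hold new writes above soft limit.
        return self.virtual_dirty <= self.soft_limit

    def can_allocate(self):
        # Role 2 (_real_region_group): hard stop above the hard limit,
        # needed because the virtual accounting is only an estimate.
        return self.real_dirty <= self.hard_limit
```

Splitting the class means each role can carry only the machinery it actually uses, which is what the subsequent patches in this series do.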
We only have one parent/child relationship in the region group
hierarchy, so support for more is unneeded complexity. Replace
the subgroup vector with a single pointer, and delete a test
for the removed functionality.
Commit 8ab57aa added a yield to the buffer-copy loop, which means that
the copy can yield before done and the multishard reader might see the
half-copied buffer and consider the reader done (because
`_end_of_stream` is already set) resulting in dropping the remaining
part of the buffer and in an invalid stream if the last copied fragment
wasn't a partition-end.
Fixes: #11561
Currently injecting a partition-end doesn't update
`_last_uncompacted_kind`, which will allow for a subsequent
`next_partition()` call to trigger injecting a partition-end, leading to
an invalid mutation fragment stream (partition-end after partition-end).
Fix by changing `_last_uncompacted_kind` to `partition_end` when
injecting a partition-end, making subsequent injection attempts noop.
Fixes: #11608
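The fix can be modeled as follows (a sketch; `last_uncompacted_kind` mirrors the member named in the message):

```python
# Sketch: injecting a partition-end must record itself as the last seen
# fragment kind, so a second injection attempt becomes a no-op.
class CompactingReaderModel:
    def __init__(self):
        self.last_uncompacted_kind = "clustering_row"
        self.emitted = []

    def maybe_inject_partition_end(self):
        if self.last_uncompacted_kind != "partition_end":
            self.emitted.append("partition_end")
            # The fix: remember the injected fragment, too.
            self.last_uncompacted_kind = "partition_end"

    def next_partition(self):
        self.maybe_inject_partition_end()
```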
When resuming a build-step, the view builder injects the partition-start
fragment of the last processed partition, to bring the consumer
(compactor) into the correct state before it starts to consume the
remainder of the partition content. This results in an invalid fragment
stream when the partition was actually over or there is nothing left for
the build step. Make the inject conditional on when the reader contains
more data for the partition.
Fixes: #11607
Update several aspects of the alternator/getting-started.md which were
not up-to-date:
* When the document was written, Alternator was moving quickly so we
recommended running a nightly version. This is no longer the case, so
we should recommend running the latest stable build.
* The link to the download link is no longer helpful for getting Docker
instructions (it shows some generic download options). Instead point to
our dockerhub page.
* Replace mentions of "Scylla" by the new official name, "ScyllaDB".
* Miscellaneous copy-edits.
Fixes#11218
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11605
We had quite a few tests for Alternator TTL in test/alternator, but most
of them did not run as part of the usual Jenkins test suite, because
they were considered "very slow" (and require a special "--runveryslow"
flag to run).
In this series we enable six tests which run quickly enough to run by
default, without an additional flag. We also make them even quicker -
the six tests now take around 2.5 seconds.
I also noticed that we don't have a test for the Alternator TTL metrics
- and added one.
Fixes#11374.
Refs https://github.com/scylladb/scylla-monitoring/issues/1783
Closes#11384
* github.com:scylladb/scylladb:
test/alternator: insert test names into Scylla logs
rest api: add a new /system/log operation
alternator ttl: log warning if scan took too long.
alternator,ttl: allow sub-second TTL scanning period, for tests
test/alternator: skip fewer Alternator TTL tests
test/alternator: test Alternator TTL metrics
The mutation fragment stream validator filter has a detailed debug log
in its constructor. To avoid putting together this message when the log
level is above debug, it is enclosed in an if, activated when log level
is debug or trace... at least that was intended. Actually the if is
activated when the log level is debug or above (info, warn or error) but
is only actually logged if the log level is exactly debug. Fix the logic
to work as intended.
Closes#11603
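The condition mix-up can be modeled with numeric levels (a sketch; the real code uses the logger's level enum):

```python
# Sketch of the fixed guard: build the expensive message only when the
# logger would actually print at debug level, i.e. when the configured
# level is debug or the more verbose trace -- not debug "or above".
TRACE, DEBUG, INFO, WARN, ERROR = range(5)  # increasing severity

def should_build_debug_message(configured_level):
    # Broken version was effectively: configured_level >= DEBUG,
    # which matches info/warn/error where nothing gets logged.
    return configured_level <= DEBUG  # matches trace and debug only
```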
The token_metadata::calculate_pending_ranges_for_bootstrap() makes a
clone of itself and adds bootstrapping nodes to the clone to calculate
ranges. Currently added nodes lack the dc/rack which affects the
calculations in a bad way.
Unfortunately, the dc/rack for those nodes is not available on topology
(yet) and needs pretty heavy patching to have. Fortunately, the only
caller of this method has gossiper at hand to provide the dc/rack from.
fixes: #11531
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11596
applier_fiber could create multiple snapshots between
io_fiber run. The fsm_output.snp variable was
overwritten by applier_fiber and io_fiber didn't drop
the previous snapshot.
In this patch we introduce the variable
fsm_output.snps_to_drop, store in it
the current snapshot id before applying
a new one, and then sequentially drop them in
io_fiber after storing the last snapshot_descriptor.
_sm_events.signal() is added to fsm::apply_snapshot,
since this method mutates the _output and thus gives a
reason to run io_fiber.
The new test test_frequent_snapshotting demonstrates
the problem by causing frequent snapshots and
setting the applier queue size to one.
Closes#11530
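The accumulate-then-drop scheme can be sketched as (simplified; the real code is in raft's server fibers):

```python
# Sketch: applier_fiber may install several snapshots between io_fiber runs;
# instead of overwriting a single `snp` slot, superseded snapshot ids are
# queued in snps_to_drop and dropped after the newest one is persisted.
class FsmOutputModel:
    def __init__(self):
        self.snp = None            # latest snapshot descriptor
        self.snps_to_drop = []     # ids superseded since last io_fiber run
        self.dropped = []

    def apply_snapshot(self, snapshot_id):
        if self.snp is not None:
            self.snps_to_drop.append(self.snp)  # remember superseded snapshot
        self.snp = snapshot_id

    def io_fiber_run(self):
        # persist self.snp (elided), then drop superseded snapshots in order
        self.dropped.extend(self.snps_to_drop)
        self.snps_to_drop.clear()
```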
For some reason, the test is currently flaky on Jenkins. Apparently the
Python driver does not reconnect to the cluster after the cluster
restarts (well it does, but then it disconnects from one of the nodes
and never reconnects again). This causes the test to hang on "waiting
until driver reconnects to every server" until it times out.
Disable it for now so it doesn't block next promotion.
Fix https://github.com/scylladb/scylladb/issues/11373
- Updated the information on the "Counting all rows in a table is slow" page.
- Added COUNT to the list of selectors of the SELECT statement (somehow it was missing).
- Added the note to the description of the COUNT() function with a link to the KB page for troubleshooting if necessary. This will allow the users to easily find the KB page.
Closes#11417
* github.com:scylladb/scylladb:
doc: add a comment to remove the note in version 5.1
doc: update the information on the Counting all rows page and add the recommendation to upgrade ScyllaDB
doc: add a note to the description of COUNT with a reference to the KB article
doc: add COUNT to the list of acceptable selectors of the SELECT statement
Check existing is_running member to avoid re-starting.
While there, set it to false after stopping.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Start ScyllaClusterManager within error handling so the ScyllaCluster
logs are available in case of error starting up.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Tools want to be as minimally disruptive as possible to the environment they run in, because they might be run in a production environment, next to a running scylladb production server. As such, the usual behavior of seastar applications w.r.t. memory is an anti-pattern for tools: they don't want to reserve most of the system memory; in fact they don't want to reserve any fixed amount, instead consuming as much as needed on-demand.
To achieve this, tools want to use the standard allocator: they need a seastar option to instruct seastar to *not* configure and use the seastar allocator, and they need LSA to cooperate with the standard allocator.
The former is provided by https://github.com/scylladb/seastar/pull/1211.
The latter is solved by introducing the concept of a `segment_store_backend`, which abstracts away how the memory arena for segments is acquired and managed. We then refactor the existing segment store so that the seastar allocator specific parts are moved to an implementation of this backend concept, then we introduce another backend implementation appropriate to the standard allocator.
Finally, tools configure seastar with the newly introduced option to use the standard allocator and similarly configure LSA to use the standard allocator appropriate backend.
Refs: https://github.com/scylladb/scylladb/issues/9882
This is the last major code piece in scylla for making tools production ready.
Closes#11510
* github.com:scylladb/scylladb:
test/boost: add alternative variant of logalloc test
tools: use standard allocator
utils/logalloc: add use_standard_allocator_segment_pool_backend()
utils/logalloc: introduce segment store backend for standard allocator
utils/logalloc: rebase release segment-store on segment-store-backend
utils/logalloc: introduce segment_store_backend
utils/logalloc: push segment alloc/dealloc to segment_store
test/boost/logalloc_test: make test_compaction_with_multiple_regions exception-safe
Fix https://github.com/scylladb/scylladb/issues/11376
This PR adds the upgrade guide from version 5.0 to 5.1. It involves adding new files (5.0-to-5.1) and language/formatting improvements to the existing content (shared by several upgrade guides).
Closes#11577
* github.com:scylladb/scylladb:
doc: upgrade the command to upgrade the ScyllaDB image from 5.0 to 5.1
doc: add the guide to upgrade ScyllaDB from 5.0 to 5.1
This PR adds the missing upgrade guides for upgrading the ScyllaDB image to a patch release:
- ScyllaDB 5.0: /upgrade/upgrade-opensource/upgrade-guide-from-5.x.y-to-5.x.z/upgrade-guide-from-5.x.y-to-5.x.z-image/
- ScyllaDB Enterprise: /upgrade/upgrade-enterprise/upgrade-guide-from-2021.1-to-2022.1/upgrade-guide-from-2022.1-to-2022.1-image/ (the file name is wrong and will be fixed with another PR)
In addition, the section regarding the recommended upgrade procedure has been improved.
Fixes https://github.com/scylladb/scylladb/issues/11450
Fixes https://github.com/scylladb/scylladb/issues/11452
Closes#11460
* github.com:scylladb/scylladb:
doc: update the commands to upgrade the ScyllaDB image
doc: fix the filename in the index to resolve the warnings and fix the link
doc: apply feedback by adding the step to load the new repo and fixing the links
doc: fix the version name in file upgrade-guide-from-2021.1-to-2022.1-image.rst
doc: rename the upgrade-image file to upgrade-image-opensource and update all the links to that file
doc: update the Enterprise guide to include the Enterprise-only image file
doc: update the image files
doc: split the upgrade-image file to separate files for Open Source and Enterprise
doc: clarify the alternative upgrade procedures for the ScyllaDB image
doc: add the upgrade guide for ScyllaDB Image from 2022.x.y. to 2022.x.z
doc: add the upgrade guide for ScyllaDB Image from 5.x.y. to 5.x.z
It points to a private scylladb repo, which has no place in user-facing
documentation. For now there is no public replacement, but a similar
functionality is in the works for Scylla Manager.
Fixes: #11573
Closes#11580
Scylla's Bloom filter implementation has a minimal false-positive rate
that it can support (6.71e-5). When setting bloom_filter_fp_chance any
lower than that, the compute_bloom_spec() function, which writes the bloom
filter, throws an exception. However, this is too late - it only happens
while flushing the memtable to disk, and a failure at that point causes
Scylla to crash.
Instead, we should refuse the table creation with the unsupported
bloom_filter_fp_chance. This is also what Cassandra did six years ago -
see CASSANDRA-11920.
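The schema-time validation described above can be sketched as follows (illustrative Python, not the actual compute_bloom_spec() code; the constant follows from capping the bloom filter at roughly 20 buckets per element, where each bucket multiplies the false-positive rate by about 0.6185, giving 0.6185^20 ≈ 6.71e-5):

```python
# Assumed cap on bloom filter buckets per element; with optimal hashing
# each bucket contributes a factor of ~0.6185 to the false-positive rate.
MAX_BUCKETS_PER_ELEMENT = 20
MIN_SUPPORTED_FP_CHANCE = 0.6185 ** MAX_BUCKETS_PER_ELEMENT  # ~6.71e-5

def validate_bloom_filter_fp_chance(fp_chance: float) -> None:
    """Reject an unsupported fp chance at CREATE/ALTER TABLE time,
    instead of throwing later during memtable flush and crashing."""
    if not (MIN_SUPPORTED_FP_CHANCE <= fp_chance <= 1.0):
        raise ValueError(
            f"bloom_filter_fp_chance must be in "
            f"[{MIN_SUPPORTED_FP_CHANCE:.3g}, 1.0], got {fp_chance}")

validate_bloom_filter_fp_chance(0.01)   # accepted
# validate_bloom_filter_fp_chance(1e-5) # would raise ValueError
```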
This patch also includes a regression test, which crashes Scylla before
this patch but passes after the patch (and also passes on Cassandra).
Fixes#11524.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11576
Changes done to avoid pitfalls and fix issues of sstable-related unit tests
Closes#11578
* github.com:scylladb/scylladb:
test: Make fake sstables implicitly belong to current shard
test: Make it clearer that sstables::test::set_values() modify data size
The test changes the servers' configuration to include `raft`
in the `experimental-features` list, then restarts them.
It waits until driver reconnects to every server after restarting.
Then it checks that upgrade eventually finishes on every server by
querying `group0_upgrade_state` key in `system.scylla_local`. Finally,
it performs a schema change and verifies that a corresponding entry has
appeared in `system.group0_history`.
The commit also increases the number of clusters in the suite cluster
pool. Since the suite contains only one test at this time this only has
an effect if we run the test multiple times (using `--repeat`).
Closes#11563
* github.com:scylladb/scylladb:
test/topology_raft_disabled: write basic raft upgrade test
test: setup logging in topology suites
Fake SSTables will be implicitly owned by the shard that created them,
allowing them to be used in procedures that assert the SSTables
are owned by the current shard, like the table's procedure that rebuilds
the sstable set.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
By adding a param with default value, we make it clear in the interface
that the procedure modifies sstable data size.
It can happen that one calls this function without noticing that it
overrides the data size previously set using a different function.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The test changes the servers' configuration to include `raft`
in the `experimental-features` list, then restarts them.
It waits until driver reconnects to every server after restarting.
Then it checks that upgrade eventually finishes on every server by
querying `group0_upgrade_state` key in `system.scylla_local`. Finally,
it performs a schema change and verifies that a corresponding entry has
appeared in `system.group0_history`.
The commit also increases the number of clusters in the suite cluster
pool. Since the suite contains only one test at this time this only has
an effect if we run the test multiple times (using `--repeat`).
Make it possible to use logging from within tests in the topology
suites. The tests are executed using `pytest`, which uses a `pytest.ini`
file for logging configuration.
Also cleanup the `pytest.ini` files a bit.
In compatibility.md where we refer to the missing ability to add a GSI
to an existing table - let's refer to a new issue specifically about this
feature, instead of the old bigger issue about UpdateItem.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11568
when using ":attrs" attribute' from Nadav Har'El
This PR improves the testing for issue #5009 and fixes most of it (but
not all - see below). Issue #5009 is about what happens when a user
tries to use the name `:attrs` for an attribute - while Alternator uses
a map column with that name to hold all the schema-less attributes of an
item. The tests we had for this issue were partial, and missed the
worst cases which could result in Scylla crashing on specially-crafted
PutItem or UpdateItem requests.
What the tests missed were the cases that `:attrs` is used as a
**non-key**. So in this PR we add additional tests for this case,
several of them fail or even crash Scylla, and then we fix all these
cases.
Issue #5009 remains open because using `:attrs` as the name of a **key**
is still not allowed. But because it results in a clean error message
when attempting to create a table with such a key, I consider this
remaining problem very minor.
Refs #5009.
Closes#11572
* github.com:scylladb/scylladb:
alternator: fix crashes and errors when using ":attrs" attribute
alternator: improve tests for reserved attribute name ":attrs"
Alternator uses a single column, a map, with the deliberately strange
name ":attrs", to hold all the schema-less attributes of an item.
The existing code is buggy when the user tries to write to an attribute
with this strange name ":attrs". Although it is extremely unlikely that
any user would happen to choose such a name, it is nevertheless a legal
attribute name in DynamoDB, and should definitely not cause Scylla to crash
as it does in some cases today.
The bug was caused by the code assuming that to check whether an attribute
is stored in its own column in the schema, we just need to check whether
a column with that name exists. This is almost true, except for the name
":attrs" - a column with this name exists, but it is a map - the attribute
with that name should be stored *in* the map, not as the map. The fix
is to modify that check to special-case ":attrs".
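The corrected check boils down to something like this (illustrative Python, not the actual Alternator code; the function name is hypothetical):

```python
ATTRS_COLUMN = ":attrs"  # the map column holding all schema-less attributes

def is_stored_as_own_column(schema_columns: set, attr_name: str) -> bool:
    """An attribute lives in its own column only if a column with that
    name exists AND it is not the special ":attrs" map itself; an
    attribute literally named ":attrs" must be stored *inside* the map,
    not as the map."""
    return attr_name in schema_columns and attr_name != ATTRS_COLUMN
```

For example, with columns `{"p", "c", ":attrs"}`, the key column "p" is its own column, while the attribute name ":attrs" is routed into the map like any other schema-less attribute.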
This fix makes the relevant tests, which used to crash or fail, now pass.
This fix solves most of #5009, but one point is not yet solved (and
perhaps we don't need to solve): It is still not allowed to use the
name ":attrs" for a **key** attribute. But trying to do that fails cleanly
(during the table creation) with an appropriate error message, so is only
a very minor compatibility issue.
Refs #5009
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
As explained in issue #5009, Alternator currently forbids the special
attribute name ":attrs", whereas DynamoDB allows any string of appropriate
length (including the specific string ":attrs") to be used.
We had only a partial test for this incompatibility, and this patch
improves the testing of this issue. In particular, we were missing a
test for the case that the name ":attrs" was used for a non-key
attribute (we only tested the case it was used as a sort key).
It turns out that Alternator crashes on the new test, when the test tries
to write to a non-key attribute called ":attrs", so we needed to mark
the new test with "skip". Moreover, it turns out that different code paths
handle the attribute name ":attrs" differently, and also crash or fail
in other ways - so we added several xfailing and skipped tests,
each failing in a different place (and also a few tests that do pass).
As usual, the new tests were checked to pass on DynamoDB.
Refs #5009
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This tiny series fixes some small error and out-of-date information in Alternator documentation and code comments.
Closes#11547
* github.com:scylladb/scylladb:
alternator ttl: comment fixes
docs/alternator: fix mention of old alternator-test directory
Some tests mark clusters as 'dirty', which makes them non-reusable by
later tests; we don't want to return them to the pool of clusters.
This use-case was covered by the `add_one` function in the `Pool` class.
However, it had the unintended side effect of creating extra clusters
even if there were no more tests that were waiting for new clusters.
Rewrite the implementation of `Pool` so it provides 3 interface
functions:
- `get` borrows an object, building it first if necessary
- `put` returns a borrowed object
- `steal` is called by a borrower to free up space in the pool;
the borrower is then responsible for cleaning up the object.
Both `put` and `steal` wake up any outstanding `get` calls. Objects are
built only in `get`, so no objects are built if none are needed.
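The rewritten interface can be sketched like this (illustrative asyncio Python, not the actual test/pylib `Pool` code; the `build` factory and the bookkeeping details are assumptions):

```python
import asyncio

class Pool:
    """Sketch of the rewritten pool: objects are built only in get(),
    so nothing is constructed unless a caller actually needs it."""
    def __init__(self, max_size, build):
        self.max_size = max_size   # max objects alive at once
        self.build = build         # async factory for new objects
        self.pool = []             # idle objects
        self.total = 0             # idle + borrowed
        self.cond = asyncio.Condition()

    async def get(self):
        async with self.cond:
            await self.cond.wait_for(
                lambda: self.pool or self.total < self.max_size)
            if self.pool:
                return self.pool.pop()
            self.total += 1
        return await self.build()  # build outside the lock

    async def put(self, obj):
        async with self.cond:
            self.pool.append(obj)
            self.cond.notify()

    async def steal(self):
        """Called by a borrower that keeps (and later disposes of) a
        dirty object; frees up its slot so a future get() may build a
        replacement."""
        async with self.cond:
            self.total -= 1
            self.cond.notify()
```

A dirty cluster is thus never returned via `put`; its borrower calls `steal` and disposes of it, and a new cluster is built only if some later `get` actually asks for one.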
Closes#11558
Which initializes LSA with use_standard_allocator_segment_pool_backend()
running the logalloc_test suite on the standard allocator segment pool
backend. To avoid duplicating the test code, the new test-file pulls in
the test code via #include. I'm not proud of it, but it works and we
test LSA with both the debug and standard memory segment stores without
duplicating code.
"
Messaging service checks dc/rack of the target node when creating a
socket. However, this information is not available for all verbs, in
particular gossiper uses RPC to get topology from other nodes.
This generates a chicken-and-egg problem -- to create a socket messaging
service needs topology information, but in order to get one gossiper
needs to create a socket.
Other than gossiper, raft starts sending its APPEND_ENTRY messages early
enough so that topology info is not available either.
The situation is extra-complicated with the fact that sockets are not
created for individual verbs. Instead, verbs are grouped into several
"indices" and a socket is created per index. Thus, the "gossiping" index
that includes non-gossiper verbs will create a topology-less socket for
all verbs in it. Worse -- when raft sends messages before topology has
been solicited, the corresponding socket is created with the assumption
that the peer lives in the default dc and rack; this doesn't match the
local node's dc/rack, and the whole index group gets this "randomly"
configured socket.
Also, the tcp-nodelay code tries to implement a similar check, but uses
the wrong index of 1, so it's also fixed here.
"
* 'br-messaging-topology-ignoring-clients' of https://github.com/xemul/scylla:
messaging_service: Fix gossiper verb group
messaging_service: Mind the absence of topology data when creating sockets
messaging_service: Templatize and rename remove_rpc_client_one
Use the new seastar option to instruct seastar to not initialize and use
the seastar allocator, relying on the standard allocator instead.
Configure LSA with the standard allocator based segment store backend:
* scylla-types reserves 1MB for LSA -- in theory nothing here should use
LSA, but just in case...
* scylla-sstable reserves 100MB for LSA, to avoid excessive trashing in
the sstable index caches.
With this, tools now should allocate memory on demand, without reserving
a large chunk of (or all of) the available memory, as regular seastar
apps do.
Creating a standard-memory-allocator backend for the segment store.
This is targeted towards tools, which want to configure LSA with a
segment store backend that is appropriate for the standard allocator
(which they want to use).
We want to be able to use this in both release and debug mode. The
former will be used by tools and the latter will be used to run the
logalloc tests with this new backend, making sure it works and doesn't
regress. For the latter, we have to allow the release and debug stores
to coexist in the same build and for the debug store to be able to
delegate to the release store when the standard allocator backend is
used.
There's a bunch of helpers for CDC gen service in db/system_keyspace.cc. All are static and use global qctx to make queries. Fortunately, both callers -- storage_service and cdc_generation_service -- already have local system_keyspace references and can call the methods via it, thus reducing the global qctx usage.
Closes#11557
* github.com:scylladb/scylladb:
system_keyspace: De-static get_cdc_generation_id()
system_keyspace: De-static cdc_is_rewritten()
system_keyspace: De-static cdc_set_rewritten()
system_keyspace: De-static update_cdc_generation_id()
- Raise on response not HTTP 200 for `.get_text()` helper
- Fix API paths
- Close and start a fresh driver when restarting a server and it's the only server in the cluster
- Handle stop/restart responses as text instead of inspecting them (errors are status 500 and raise exceptions)
Closes#11496
* github.com:scylladb/scylladb:
test.py: handle duplicate result from driver
test.py: log server restarts for topology tests
test.py: log actions for topology tests
Revert "test.py: restart stopped servers before...
test.py: ManagerClient API fix return text
test.py: ManagerClient raise on HTTP != 200
test.py: ManagerClient fix paths to updated resource
Rebase the seastar allocator based segment store implementation on the
recently introduced segment store backend, which now abstracts away
how memory for segments is obtained.
This patch also introduces an explicit `segment_npos` to be used for
cases when a segment -> index mapping fails (segment doesn't belong to
the store). Currently the seastar allocator based store simply doesn't
handle this case, while the standard allocator based store uses 0 as the
implicit invalid index.
We want to make it possible to select the segment-store to be used for
LSA -- the seastar allocator based one or the standard allocator based
one -- at runtime. Currently this choice is made at compile time via
preprocessor switches.
The current standard memory based store is specialized for the debug
build; we want something more similar to the seastar memory allocator
based one. So we introduce a segment store backend for the current
seastar allocator based store, which abstracts how the backing memory
for all segments is allocated/freed, while keeping the segment <-> index
mapping common. In the next patches we will rebase the current seastar
allocator based segment store on this backend and later introduce
another backend for standard allocator, targeted for release builds.
Currently the actual alloc/dealloc of memory for segments is located
outside the segment stores. We want to abstract away how segments are
allocated, so we move this logic too into the segment store. For now
this results in duplicate code in the two segment store implementations,
but this will soon be gone.
Said test creates two vectors, the vector storage being allocated with
the default allocator, while its content being allocated on LSA. If an
exception is thrown however, both are freed via the default allocator,
triggering an assert in LSA code. Move the cleanup into a `defer()` so
the correct cleanup sequence is executed even on exceptions.
When a cross-shard barrier is abort()-ed it spawns a background fiber
that will wake up other shards (if they are sleeping) with an exception.
This fiber is implicitly waited on by the owning sharded service's
.stop(), because barrier usage is like this:
sharded<service> s;
co_await s.invoke_on_all([] {
...
barrier.abort();
});
...
co_await s.stop();
If abort happens, the invoke_on_all() will only resolve _after_ it
queues up the waking lambdas into smp queues, thus the subsequent stop
will queue its stopping lambdas after barrier's ones.
However, in debug mode the queue can be shuffled, so the owning service
can suddenly be freed from under the barrier's feet causing use after
free. Fortunately, this can be easily fixed by capturing a shared
pointer to the shared barrier instead of a regular pointer to the
shard-local barrier.
fixes: #11303
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11553
The test is supposed to give a helpful error message when the user forgets to
run --populate before the benchmark. But this must have become broken at some
point, because execute_cql() terminates the program with an unhelpful
("unconfigured table config") message, which doesn't mention --populate.
Fix that by catching the exception and adding the helpful tip.
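The fix amounts to catching the exception at the call site and re-raising it with the hint attached, roughly (illustrative Python; the real tool is C++ and the wrapper name is hypothetical):

```python
def execute_with_populate_hint(execute_cql, query):
    """Wrap the failing call so the user gets an actionable hint instead
    of a bare "unconfigured table" error."""
    try:
        return execute_cql(query)
    except Exception as e:
        if "unconfigured table" in str(e):
            raise RuntimeError(
                f"{e}\nTip: did you forget to run --populate first?") from e
        raise
```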
Closes#11533
The logger is proof against allocation failures, except if
--abort-on-seastar-bad-alloc is specified. If it is, it will crash.
The reclaim stall report is likely to be called in low memory conditions
(reclaim's job is to alleviate these conditions, after all), so we're
likely to crash here if we're reclaiming in a very low memory condition
and have a large stall simultaneously (AND we're running in a debug
environment).
Prevent all this by disabling --abort-on-seastar-bad-alloc temporarily.
Fixes#11549
Closes#11555
Long-term index caching in the global cache, as introduced in 4.6, is a major
pessimization for workloads where accesses to the index are (spatially) sparse.
We want to have a way to disable it for the affected workloads.
There is already infrastructure in place for disabling it for BYPASS CACHE
queries. One way of solving the issue is hijacking that infrastructure.
This patch adds a global flag (and a corresponding CLI option) which controls
index caching. Setting the flag to `false` causes all index reads to behave
like they would in BYPASS CACHE queries.
Consequences of this choice:
- The per-SSTable partition_index_cache is unused. Every index_reader has
its own, and they die together. Independent reads can no longer reuse the
work of other reads which hit the same index pages. This is not crucial,
since partition accesses have no (natural) spatial locality. Note that
the original reason for partition_index_cache -- the ability to share
reads for the lower and upper bound of the query -- is unaffected.
- The per-SSTable cached_file is unused. Every index_reader has its own
(uncached) input stream from the index file, and every
bsearch_clustered_cursor has its own cached_file, which dies together with
the cursor. Note that the cursor still can perform its binary search with
caching. However, it won't be able to reuse the file pages read by
index_reader. In particular, if the promoted index is small, and fits inside
the same file page as its index_entry, that page will be re-read.
It can also happen that index_reader will read the same index file page
multiple times. When the summary is so dense that multiple index pages fit in
one index file page, advancing the upper bound, which reads the next index
page, will read the same index file page. Since summary:disk ratio is 1:2000,
this is expected to happen for partitions with size greater than 2000
partition keys.
Fixes#11202
Sometimes the driver calls the callback twice on the ready done future
with a None result. Log it and avoid setting the local future twice.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Introduces support to split large partitions during compaction. Today, compaction can only split input data at partition boundary, so a large partition is stored in a single file. But that can cause many problems, like memory pressure (e.g.: https://github.com/scylladb/scylladb/issues/4217), and incremental compaction can also not fulfill its promise as the file storing the large partition can only be released once exhausted.
The first step was to add clustering range metadata for first and last partition keys (retrieved from promoted index), which is crucial to determine disjointness at clustering level, and also the order at which the disjoint files should be opened for incremental reading.
The second step was to extend sstable_run to look at clustering dimension, so a set of files storing disjoint ranges for the same partition can live in the same sstable run.
The final step was to introduce the option for compaction to split large partition being written if it has exceeded the size threshold.
What's next? Following this series, a reader will be implemented for sstable_run that will incrementally open the readers. It can be safely built on the assumption of the disjoint invariant after the second step aforementioned.
Closes#11233
* github.com:scylladb/scylladb:
test: Add test for large partition splitting on compaction
compaction: Add support to split large partitions
sstable: Extend sstable_run to allow disjointness on the clustering level
sstables: simplify will_introduce_overlapping()
test: move sstable_run_disjoint_invariant_test into sstable_datafile_test
test: lib: Fix inefficient merging of mutations in make_sstable_containing()
sstables: Keep track of first partition's first pos and last partition's last pos
sstables: Rename min/max position_range to a descriptive name
sstables_manager: Add sstable metadata reader concurrency semaphore
sstables: Add ability to find first or last position in a partition
teardown..."
This reverts commit df1ca57fda.
In order to prevent timeouts on teardown queries, the previous commit
added functionality to restart servers that were down. This issue is
fixed in fc0263fc9b so there's no longer need to restart stopped servers
on test teardown.
For ManagerClient request API, don't return status, raise an exception.
Server side errors are signaled by status 500, not text body.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Halted background fibers render raft server effectively unusable, so
report this explicitly to the clients.
Fix: #11352
Closes#11370
* github.com:scylladb/scylladb:
raft server, status metric
raft server, abort group0 server on background errors
raft server, provide a callback to handle background errors
raft server, check aborted state on public server APIs
Pool.get() might have waiting callers, so if an item is not returned
to the pool after use, tell the pool to add a new one and tell the pool
an entry was taken (used for total running entries, i.e. clusters).
Use it when a ScyllaCluster is dirty and not returned.
While there, improve logging and docstrings.
Issue reported by @kbr-.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11546
These two are just getting in the way when touching inter-component
dependencies around the messaging service. Without them, messaging
service start/stop looks just like that of any other service out there.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11535
It's been ~1 year (2bf47c902e) since we set restrict_dtcs
config option to WARN, meaning users have been warned about the
deprecation process of DTCS.
Let's set the config to TRUE, meaning that create and alter statements
specifying DTCS will be rejected at the CQL level.
Existing tables will still be supported. But the next step will
be about throwing DTCS code into the shadow realm, and after that,
Scylla will automatically fall back to STCS (or ICS) for users who
ignored the deprecation process.
Refs #8914.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11458
When a server is down, the driver expects multiple schema timeouts
within the same request to handle it properly.
Found by @kbr-
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11544
If the user stops off-strategy via the API, the compaction manager can decide
to give up on it completely, so data will sit unreshaped in
maintenance set, preventing it from being compacted with data
in the main set. That's problematic because it will probably lead
to a significant increase in read and space amplification until
off-strategy is triggered again, which cannot happen anytime
soon.
Let's handle it by moving data in maintenance set into main one,
even if unreshaped. Then regular compaction will be able to
continue from where off-strategy left off.
Fixes#11543.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11545
This patch fixes a few errors and out-of-date descriptions in comments
in alternator/ttl.cc. No functional changes.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The directory that used to be called alternator-test is now (and has
been for a long time) really test/alternator. So let's fix the
references to it in docs/alternator/alternator.md.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When configuring tcp-nodelay unconditionally, messaging service thinks
gossiper uses group index 1, though it had changed some time ago and now
those verbs belong to group 0.
fixes: #11465
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When a socket is created to serve a verb there may be no topology
information regarding the target node. In this case current code
configures socket as if the peer node lived in "default" dc and rack of
the same name. If topology information appears later, the client is not
re-connected, even though it could provide a more relevant configuration
(e.g. -- w/o encryption)
This patch checks if the topology info is needed (sometimes it's not)
and if missing it configures the socket in the most restrictive manner,
but notes that the socket ignored the topology on creation. When
topology info appears -- and this happens when a node joins the cluster
-- the messaging service is kicked to drop all sockets that ignored the
topology, so that they reconnect later.
The mentioned "kick" comes from storage service on-join notification.
A more correct fix would be if topology had an on-change notification and
messaging service subscribed on it, but there are two cons:
- currently dc/rack do not change on the fly (though they can, e.g. if
gossiping property file snitch is updated without restart) and
topology update effectively comes from a single place
- updating topology on token-metadata is not a simple topology.update()
call. Instead, a clone of the token metadata is created, then the update
happens on the clone, then the clone is committed into the t.m. Though
it's possible to find out at commit time which nodes changed their
topology, since this only happens on join the complexity likely
isn't worth the effort (yet)
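The mark-and-kick mechanism can be sketched as follows (illustrative Python; the real code lives in the C++ messaging service, and the class shape, the topology map, and the method names here are assumptions):

```python
class MessagingService:
    """Sketch: a socket created without topology info for its peer is
    configured in the most restrictive manner and marked, so that the
    on-join "kick" can drop exactly those sockets and let them reconnect
    with real dc/rack information."""
    def __init__(self):
        self.sockets = {}  # peer -> {"topology_ignored": bool}

    def get_socket(self, peer, topology):
        """topology maps known peers to (dc, rack)."""
        if peer not in self.sockets:
            self.sockets[peer] = {"topology_ignored": peer not in topology}
        return self.sockets[peer]

    def kick_topology_ignored(self):
        """Called from the storage service on-join notification: drop
        every socket that was created without topology info."""
        self.sockets = {p: s for p, s in self.sockets.items()
                        if not s["topology_ignored"]}
```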
fixes: #11514
fixes: #11492
fixes: #11483
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It actually finds and removes a client, and in its new form it also
applies a filtering function to it, so a better name is called for
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Adds support for splitting large partitions during compaction.
Large partitions introduce many problems, like memory overhead, and
break incremental compaction's promise. We want to split large
partitions across fixed-size fragments. We'll allow a partition
to exceed the size limit by 10%, as we don't want to unnecessarily split
partitions that just crossed the limit boundary.
To avoid having to open a minimum of 2 fragments in a read, the
partition tombstone will be replicated to every fragment storing the
partition.
The splitting isn't enabled by default, and can be used by
strategies that are run-aware, like ICS. LCS still cannot support
it, as it still uses physical level metadata, not the run id.
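The splitting policy above can be sketched like this (illustrative Python over a simplified row model, not the actual compaction writer; the 10% slack and tombstone replication follow the description, everything else is an assumption):

```python
def split_partition(rows, size_limit, partition_tombstone=None):
    """Cut one large partition into run fragments.  A fragment is
    closed only once it exceeds size_limit by 10%, so a partition that
    barely crossed the limit is not split; the partition tombstone is
    replicated into every fragment so a read never needs a second
    fragment just to find it.  rows are (clustering_key, size) pairs."""
    fragments, current, current_size = [], [], 0
    for row in rows:
        current.append(row)
        current_size += row[1]
        if current_size > size_limit * 1.10:
            fragments.append((partition_tombstone, current))
            current, current_size = [], 0
    if current:
        fragments.append((partition_tombstone, current))
    return fragments
```

For example, with a 100-unit limit, a 105-unit partition stays in one fragment (within the 10% slack), while a 300-unit partition is cut into several.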
An incremental reader for sstable runs will follow soon.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
After commit 0796b8c97a, sstable_run won't accept a fragment
that introduces key overlapping. But once we split large partitions,
fragments in the same run may store disjoint clustering ranges
of the same partition. So we're extending sstable_run to look
at clustering dimension, so fragments storing disjoint clustering
ranges of the same large partition can co-exist in the same run.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
An element S1 is completely ordered before S2 if S1's last key is
lower than S2's first key.
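The definition, and the run-wide disjointness invariant built on it, can be sketched as (illustrative Python; function names are hypothetical, keys are any comparable values):

```python
def completely_ordered_before(s1_last_key, s2_first_key):
    """S1 is completely ordered before S2 iff S1's last key is lower
    than S2's first key; an equal boundary key means the two elements
    share a key and therefore overlap."""
    return s1_last_key < s2_first_key

def run_is_disjoint(fragments):
    """fragments: list of (first_key, last_key) pairs, sorted by
    first_key.  The run invariant holds iff each fragment is completely
    ordered before the next one."""
    return all(completely_ordered_before(a[1], b[0])
               for a, b in zip(fragments, fragments[1:]))
```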
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
make_sstable_containing() was absurdly slow when merging thousands of
mutations belonging to the same key, as it was unnecessarily copying
the mutation for every merge, producing bad complexity.
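The gist of the fix can be illustrated with a toy model (illustrative Python over dicts of cells, not the actual mutation code): accumulate each mutation into a single per-key accumulator instead of copying the merged mutation on every step, which is what produced the quadratic cost.

```python
def merge_mutations(mutations):
    """mutations: list of (key, cells) pairs, cells being a dict of
    column -> value.  Merge per key by updating one in-place
    accumulator per key, so each mutation is applied exactly once
    without re-copying the merged state."""
    merged = {}
    for key, cells in mutations:
        merged.setdefault(key, {}).update(cells)
    return merged
```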
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
With the first partition's first position and the last partition's last
position, we'll be able to determine which fragments composing an
sstable run store a large partition that was split.
Then the sstable run will be able to detect whether all fragments
storing a given large partition are disjoint at the clustering level.
Fixes#10637.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
The new descriptive name is important to make a distinction when
sstable stores position range for first and last rows instead
of min and max.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Let's introduce a reader_concurrency_semaphore for reading sstable
metadata, to avoid an OOM due to unlimited concurrency.
The concurrency on startup is not controlled, so it's important
to enforce a limit on the amount of memory used by the parallel
readers.
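The memory-bounding idea can be sketched as follows (illustrative asyncio Python, not the actual reader_concurrency_semaphore; the class shape and names are assumptions): a reader is admitted only while its memory estimate fits in the remaining budget, so an uncontrolled number of startup metadata reads cannot exhaust memory.

```python
import asyncio

class MemoryBoundedSemaphore:
    """Admit work units by memory estimate rather than by count."""
    def __init__(self, memory_budget):
        self.available = memory_budget
        self.cond = asyncio.Condition()

    async def with_memory(self, estimate, reader):
        """Wait until `estimate` bytes fit in the budget, run the
        reader, then return the memory and wake up any waiters."""
        async with self.cond:
            await self.cond.wait_for(lambda: self.available >= estimate)
            self.available -= estimate
        try:
            return await reader()
        finally:
            async with self.cond:
                self.available += estimate
                self.cond.notify_all()
```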
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This new method allows sstable to load the first row of the first
partition and last row of last partition.
That's useful for incremental reading of sstable run which will
be split at clustering boundary.
To get the first row, it consumes the first row (which can be
either a clustering row or range tombstone change) and returns
its position_in_partition.
To get the last row, it does the same as above but in reverse
mode instead.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
We introduce `server_get_config` to fetch the entire configuration dict
and `update_config` to update a value under the given key.
Closes#11493
* github.com:scylladb/scylladb:
test/pylib: APIs to read and modify configuration from tests
test/pylib: ScyllaServer: extract _write_config_file function
test/pylib: ScyllaCluster: extend ActionReturn with dict data
test/pylib: ManagerClient: introduce _put_json
test/pylib: ManagerClient: replace `_request` with `_get`, `_get_text`
test: pylib: store server configuration in `ScyllaServer`
`_request` performed a GET request and extracted a text body out of the
response.
Split it into `_get`, which only performs the request, and `_get_text`,
which calls `_get` and extracts the body as text.
Also extract a `_resource_uri` function which will be used for other
request types.
Add a suite which is basically equivalent to `topology` except that it
doesn't start servers with Raft enabled.
The suite will be used to test the Raft upgrade procedure.
The suite contains a basic test just to check the suite itself can run;
the test will be removed when 'real' tests are added.
Closes#11487
* github.com:scylladb/scylladb:
test.py: PythonTestSuite: sum default config params with user-provided ones
test: add a topology suite with Raft disabled
test: pylib: use Python dicts to manipulate `ScyllaServer` configuration
test: pylib: store `config_options` in `ScyllaServer`
The intention was for these logs to be printed during the
database shutdown sequence, but it was overlooked that it's not
the only place where commitlog::shutdown is called.
Commitlogs are started and shut down periodically by hinted handoff.
When that happens, these messages spam the log.
Fix that by adding INFO commitlog shutdown logs to database::stop,
and change the level of the commitlog::shutdown log call to DEBUG.
Fixes #11508
Closes #11536
Due to slow debug machines timing out, bump up all timeouts
significantly.
The cause was ExecutionProfile request_timeout. Also set a high
heartbeat timeout and bump already set timeouts to be safe, too.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11516
Per-partition rate limiting added a new error type which should be
returned when Scylla decides to reject an operation due to per-partition
rate limit being exceeded. The new error code requires drivers to
negotiate support for it, otherwise Scylla will report the error as
`Config_error`. The existing error code override logic works properly,
however due to a mistake Scylla will report the `Config_error` code even
if the driver correctly negotiated support for it.
This commit fixes the problem by specifying the correct error code in
`rate_limit_exception`'s constructor.
Tested manually with a modified version of the Rust driver which
negotiates support for the new error. Additionally, tested what happens
when the driver doesn't negotiate support (Scylla properly falls back to
`Config_error`).
Branches: 5.1
Fixes: #11517
Closes #11518
This series introduces two configurable options when working with TWCS tables:
- `restrict_twcs_default_ttl` - a LiveUpdate-able tri_mode_restriction which defaults to WARN and will notify the user whenever a TWCS table is created without a `default_time_to_live` setting
- `twcs_max_window_count` - which forbids the user from creating TWCS tables whose window count (buckets) is past a certain threshold. We default to 50, which should be enough for most use cases, and a setting of 0 effectively disables the check.
Refs: #6923
Fixes: #9029
Closes #11445
* github.com:scylladb/scylladb:
tests: cql_query_test: add mixed tests for verifying TWCS guard rails
tests: cql_query_test: add test for TWCS window size
tests: cql_query_test: add test for TWCS tables with no TTL defined
cql: add configurable restriction of default_time_to_live when for TimeWindowCompactionStrategy tables
cql: add max window restriction for TimeWindowCompactionStrategy
time_window_compaction_strategy: reject invalid window_sizes
cql3 - create/alter_table_statement: Make check_restricted_table_properties accept a schema_ptr
* seastar 2b2f6c08...cbb0e888 (10):
> memory: allow user to select allocator to be used at runtime
> perftune.py: correct typos
> Merge 'seastar-addr2line: support more flexible syslog-style backtraces' from Benny Halevy
> Fix instruction count for start_measuring_time
> build: s/c-ares::c-ares/c-ares::cares/
> Merge 'shared_ptr_debug_helper: turn assert into on_internal_error_abort' from Benny Halevy
> test: fix use after free in the loopback socket
> doc/tutorial.md: fix docker command for starting hello-world_demo
> httpd: add a ctor without addr parameter
> dns: dns_resolver: sock_entry: move-construct tcp/udp entries in place
Closes#11526
"
This set makes messaging service notify connection drop listeners
when connection is dropped for _any_ reason and cleans things up
around it afterwards
"
* 'br-messaging-notify-connection-drop' of https://github.com/xemul/scylla:
messaging_service: Relax connection drop on re-caching
messaging_service: Simplify remove_rpc_client_one()
messaging_service: Notify connection drop when connection is removed
`feature_service` provided two sets of features: `known_feature_set` and
`supported_feature_set`. The purpose of both and the distinction between
them was unclear and undocumented.
The 'supported' features were gossiped by every node. Once a feature is
supported by every node in the cluster, it becomes 'enabled'. This means
that whatever piece of functionality is covered by the feature, it can
be used by the cluster from now on.
The 'known' set was used to perform feature checks on node start; if the
node saw that a feature is enabled in the cluster, but the node does not
'know' the feature, it would refuse to start. However, if the feature
was 'known', but wasn't 'supported', the node would not complain. This
means that we could in theory allow the following scenario:
1. all nodes support feature X.
2. X becomes enabled in the cluster.
3. the user changes the configuration of some node so feature X will
become unsupported but still known.
4. The node restarts without error.
So now we have a feature X which is enabled in the cluster, but not
every node supports it. That does not make sense.
It is not clear whether it was accidental or purposeful that we used the
'known' set instead of the 'supported' set to perform the feature check.
What I think is clear, is that having two sets makes the entire thing
unnecessarily complicated and hard to think about.
Fortunately, at the base to which this patch is applied, the sets are
always the same. So we can easily get rid of one of them.
I decided that the name which should stay is 'supported', I think it's
more specific than 'known' and it matches the name of the corresponding
gossiper application state.
Closes#11512
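The startup check described above can be sketched as follows (illustrative only; the real check lives in feature_service):

```python
# With a single 'supported' set, the node-start check is simple: refuse to
# start if the cluster has enabled a feature this node does not support.
def check_features(cluster_enabled: set, node_supported: set):
    unknown = cluster_enabled - node_supported
    if unknown:
        raise RuntimeError(
            f"Features enabled in cluster but unsupported by this node: {sorted(unknown)}")

check_features({"X", "Y"}, {"X", "Y", "Z"})  # ok: everything enabled is supported
try:
    check_features({"X"}, set())
except RuntimeError as e:
    print(e)
```

This closes the loophole from the scenario above: a feature cannot be 'known' but not 'supported', so step 4 (a clean restart) becomes impossible.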
Since we fail to write files to $USER/.config on Jenkins jobs, we need
an option to skip installing systemd units.
Let's add --without-systemd to do that.
Also, to detect the option availability, we need to increment
relocatable package version.
See scylladb/scylla-dtest#2819
Closes#11345
Previously, if the suite.yaml file provided
`extra_scylla_config_options` but didn't provide values for `authorizer`
or `authenticator` inside the config options, the harness wouldn't give
any defaults for these keys. It would only provide defaults for these
keys if suite.yaml didn't specify `extra_scylla_config_options` at all.
It makes sense to give the user the ability to provide extra options
while relying on harness defaults for `authenticator` and `authorizer`
if the user doesn't care about them.
Add a suite which is basically equivalent to `topology` except that it
doesn't start servers with Raft enabled.
The suite will be used to test the Raft upgrade procedure.
The suite contains a basic test just to check the suite itself can run;
the test will be removed when 'real' tests are added.
Previously we used a formattable string to represent the configuration;
values in the string were substituted by Python's formatting mechanism
and the resulting string was stored to obtain the config file.
This approach had some downsides, e.g. it required boilerplate work to
extend: to add a new config options, you would have to modify this
template string.
Instead we can represent the configuration as a Python dictionary. Dicts
are easy to manipulate, for example you can sum two dicts; if a key
appears in both, the second dict 'wins':
```
{1:1} | {1:2} == {1:2}
```
This makes the configuration easy to extend without having to write
boilerplate: if the user of `ScyllaServer` wants to add or override a
config option, they can simply add it to the `config_options` dict and
that's it - no need to modify any internal template strings in
`ScyllaServer` implementation like before. The `config_options` dict is
simply summed with the 'base' config dict of `ScyllaServer`
(`config_options` is the right summand so anything in there overrides
anything in the base dict).
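The dict-summing scheme can be sketched as follows (option names here are just examples):

```python
# Illustrative config dicts; keys are examples, not the full Scylla config.
base_config = {"workdir": "/var/lib/scylla", "authenticator": "AllowAllAuthenticator"}
config_options = {"authenticator": "PasswordAuthenticator", "authorizer": "CassandraAuthorizer"}

# dict union (Python 3.9+): the right-hand operand wins on key conflicts,
# so config_options overrides the base config.
effective = base_config | config_options
print(effective["authenticator"])  # PasswordAuthenticator
print(effective["workdir"])        # /var/lib/scylla
```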
An example of this extensibility is the `authenticator` and `authorizer`
options which no longer appear in `scylla_cluster.py` module after this
change, they only appear in the suite.yaml file.
Also, use "workdir" option instead of specifying data dir, commitlog
dir etc. separately.
Previously the code extracted `authenticator` and `authorizer` keys from
the config options and stored them.
Store the entire dict instead. The new code is easier to extend if we
want to make more options configurable.
When messaging_service::get_rpc_client() picks up cached socket and
notices error on it, it drops the connection and creates a new one. The
method used to drop the connection is the one that re-lookups the verb
index again, which is excessive. Tune this up while at it
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The output of test/alternator/run ends in Scylla's full log file, where
it is hard to understand which log messages are related to which test.
In this patch, we add a log message (using the new /system/log REST API)
every time a test is started and ends.
The messages look like this:
INFO 2022-08-29 18:07:15,926 [shard 0] api - /system/log:
test/alternator: Starting test_ttl.py::test_describe_ttl_without_ttl
...
INFO 2022-08-29 18:07:15,930 [shard 0] api - /system/log:
test/alternator: Ended test_ttl.py::test_describe_ttl_without_ttl
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add a new REST API operation, taking a log level and a message, and
printing it into the Scylla log.
This can be useful when a test wants to mark certain positions in the
log (e.g., to see which other log messages we get between the two
positions). An alternative way to achieve this could have been for the
test to write directly into the log file - but an on-disk log file is
only one of the logging options that Scylla supports, and the approach
in this patch allows adding log messages regardless of how Scylla keeps
the logs.
The motivation for this feature is that in the following patch the
test/alternator framework will add log messages when starting and
ending tests, which can help debug test failures.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
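Assuming the endpoint takes the level and message as query parameters (an assumption; check the actual REST API for the exact parameter names), a test could mark the log roughly like this:

```python
import urllib.parse
import urllib.request

def mark_log(base_url: str, message: str, level: str = "info") -> urllib.request.Request:
    # Hypothetical parameter names ('message', 'level') for the /system/log API.
    query = urllib.parse.urlencode({"message": message, "level": level})
    return urllib.request.Request(f"{base_url}/system/log?{query}", method="POST")

req = mark_log("http://127.0.0.1:10000", "test/alternator: Starting test_x")
print(req.get_full_url())
print(req.get_method())  # POST
```

A test framework would build such a request at the start and end of each test, producing the bracketing log lines shown above.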
Currently, we log at "info" level how much time remained at the end of
a full TTL scan until the next scanning period (we sleep for that time).
If the scan was slower than the period, we didn't print anything.
Let's print a warning in this case - it can be useful for debugging,
and also users should know when their desired scan period is not being
honored because the full scan is taking longer than the desired scan
period.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
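The warning logic amounts to a small computation; a sketch (hypothetical function, not the actual Scylla code):

```python
def time_to_sleep(scan_duration: float, period: float):
    """After a full TTL scan, decide how long to sleep and whether to warn."""
    remaining = period - scan_duration
    if remaining < 0:
        # The scan overran the period: don't sleep, and warn the user.
        return 0.0, (f"TTL scan took {scan_duration}s, longer than the "
                     f"configured period of {period}s")
    return remaining, None

print(time_to_sleep(0.25, 1.0))   # (0.75, None): sleep out the rest of the period
print(time_to_sleep(90.0, 60.0))  # sleep 0, with a warning: period not honored
```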
Alternator has the "alternator_ttl_period_in_seconds" parameter for
controlling how often the expiration thread looks for expired items to
delete. It is usually a very large number of seconds, but for tests
to finish quickly, we set it to 1 second.
With 1 second expiration latency, test/alternator/test_ttl.py took 5
seconds to run.
In this patch, we change the parameter to allow a floating-point number
of seconds instead of just an integer. Then, this allows us to halve the
TTL period used by tests to 0.5 seconds, and as a result, the run time of
test_ttl.py halves to 2.5 seconds. I think this is fast enough for now.
I verified that even if I change the period to 0.1, there is no noticeable
slowdown to other Alternator tests, so 0.5 is definitely safe.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Most of the Alternator TTL tests are extremely slow on DynamoDB because
item expiration may be delayed up to 24 hours (!), and in practice for
10 to 30 minutes. Because of this, we marked most of these tests
with the "veryslow" mark, causing them to be skipped by default - unless
pytest is given the "--runveryslow" option.
The result was that the TTL tests were not run in the normal test runs,
which can allow regressions to be introduced (luckily, this hasn't happened).
However, this "veryslow" mark was excessive. Many of the tests are very
slow only on DynamoDB, but aren't very slow on Scylla. In particular,
many of the tests involve waiting for an item to expire, something that
happens after the configurable alternator_ttl_period_in_seconds, which
is just one second in our tests.
So in this patch, we remove the "veryslow" mark from 6 of the Alternator TTL
tests, and instead use two new fixtures - waits_for_expiration and
veryslow_on_aws - to only skip the test when running on DynamoDB or
when alternator_ttl_period_in_seconds is high - but in our usual test
environment they will not get skipped.
Because 5 of these 6 tests wait for an item to expire, they take one
second each and this patch adds 5 seconds to the Alternator test
runtime. This is unfortunate (it's more than 25% of the total Alternator
test runtime!) but not a disaster, and we plan to reduce this 5-second
time further in the following patch, by decreasing the TTL scanning
period even further.
This patch also increases the timeout of several of these tests, to 120
seconds from the previous 10 seconds. As mentioned above, normally,
these tests should always finish in alternator_ttl_period_in_seconds
(1 second) with a single scan taking less than 0.2 seconds, but in
extreme cases of debug builds on overloaded test machines, we saw even
60 seconds being passed, so let's increase the maximum. I also needed
to make the sleep time between retries smaller, not a function of the
new (unrealistic) timeout.
4 more tests remain "veryslow" (and won't run by default) because they
take 5-10 seconds each (e.g., a test which waits to see that an item
does *not* get expired, and a test involving writing a lot of data).
We should reconsider this in the future - to perhaps run these tests in
our normal test runs - but even for now, the 6 extra tests that we
start running are a much better protection against regressions than what
we had until now.
Fixes#11374
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
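The skip condition behind the two fixtures can be sketched as follows (hypothetical helper; the real fixtures live in the test/alternator conftest):

```python
def should_skip(on_aws: bool, ttl_period_seconds: float, runveryslow: bool) -> bool:
    """A test that waits for item expiration is skipped when running against
    DynamoDB, or when the configured TTL scan period makes it very slow,
    unless --runveryslow was requested."""
    if runveryslow:
        return False
    return on_aws or ttl_period_seconds > 1.0

print(should_skip(on_aws=True,  ttl_period_seconds=1.0, runveryslow=False))  # True
print(should_skip(on_aws=False, ttl_period_seconds=0.5, runveryslow=False))  # False
```

In the usual test environment (Scylla with a sub-second TTL period) the condition is false, so the 6 un-marked tests run by default.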
x
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds a test for the metrics generated by the background
expiration thread run for Alternator's TTL feature.
We test three of the four metrics: scylla_expiration_scan_passes,
scylla_expiration_scan_table and scylla_expiration_items_deleted.
The fourth metric, scylla_expiration_secondary_ranges_scanned, counts the
number of times that this node took over another node's expiration duty.
so requires a multi-node cluster to test, and we can't test it in the
single-node cluster test framework.
To see TTL expiration in action this test may need to wait up to the
setting of alternator_ttl_period_in_seconds. For a setting of 1
second (the default set by test/alternator/run), this means this
test can take up to 1 second to run. If alternator_ttl_period_in_seconds
is set higher, the test is skipped unless --runveryslow is requested.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The purpose of this PR is to update the information about the default SSTable format.
Closes#11431
* github.com:scylladb/scylladb:
doc: simplify the information about default formats in different versions
doc: update the SSTables 3.0 Statistics File Format to add the UUID host_id option of the ME format
doc: add the information regarding the ME format to the SSTables 3.0 Data File Format page
doc: fix additional information regarding the ME format on the SStable 3.x page
doc: add the ME format to the table
add a comment to remove the information when the documentation is versioned (in 5.1)
doc: replace Scylla with ScyllaDB
doc: fix the formatting and language in the updated section
doc: fix the default SStable format
"
The test in question plays with snitches to simulate the topology
over which tokens are spread. This set replaces explicit snitch
usage with temporary topology object.
Some snitch traces are still left, but those are for token_metadata
internal which still call global snitch for DC/RACK.
"
* 'br-tests-use-topology-not-snitch' of https://github.com/xemul/scylla:
network_topology_strategy_test: Use topology instead of snitch
network_topology_strategy_test: Populate explicit topology
dirty_memory_manager tracks lsa regions (memtables) under region_group:s,
in order to be able to pick up the largest memtable as a candidate for
flushing.
Just as region_group:s contain regions, they can also contain other
region_group:s in a nested structure. It also tracks the nested region_group
that contains the largest region in a binomial heap.
This latter facility is no longer used. It saw use when we had the system
dirty_memory_manager nested under the user dirty_memory_manager, but
that proved too complicated so it was undone. We still nest a virtual
region_group under the real region_group, and in fact it is the
virtual region_group that holds the memtables, but it is accessed
directly to find the largest memtable (region_group::get_largest_region)
and so all the mechanism that sorts region_group:s is bypassed.
Start to dismantle this house of cards by removing the subgroup
sorting. Since the hierarchy has exactly one parent and one child,
it's clearly useless. This is seen by the fact that we can just remove
everything related.
We still need the _subgroups member to hold the virtual region_group;
it's replaced by a vector. I verified that the non-intrusive vector
is exception safe since push_back() happens at the very end; in any
case this is early during setup where we aren't under memory pressure.
A few tests that check the removed functionality are deleted.
Closes#11515
Compaction group can be defined as a set of files that can be compacted together. Today, all sstables belonging to a table in a given shard belong to the same group. So we can say there's one group per table per shard. As we want to eventually allow isolation of data that shouldn't be mixed, e.g. data from different vnodes, then we want to have more than one group per table per shard. That's why compaction groups is being introduced here.
Today, all memtables and sstables are stored in a single structure per table. After compaction groups, there will be memtables and sstables for each group in the table.
As we're taking an incremental approach, table still supports a single group, but work was done to prepare table for supporting multiple groups; completing that work is the next step. Also, a procedure for deriving the group from a token is introduced, but today it always returns the single group owned by the table. Once multiple groups are supported, that procedure should be implemented to map a token to a group.
No semantics were changed by this series.
Closes#11261
* github.com:scylladb/scylladb:
replica: Move memtables to compaction_group
replica: move compound SSTable set to compaction group
replica: move maintenance SSTable set to compaction_group
replica: move main SSTable set to compaction_group
replica: Introduce compaction_group
replica: convert table::stop() into coroutine
compaction_manager: restore indentation
compaction_manager: Make remove() and stop_ongoing_compactions() noexcept
test: sstable_compaction_test: Don't reference main sstable set directly
test: sstable_utils: Set data size fields for fake SSTable
test: sstable_compaction_test: remove needless usage of column_family_test::add_sstable
Task manager for observing and managing long-running, asynchronous tasks in Scylla
with the interface for the user. It will allow listing of tasks, getting detailed
task status and progression, waiting for their completion, and aborting them.
The task manager will be configured with a “task ttl” that determines how long
the task status is kept in memory after the task completes.
At first it will support repair and compaction tasks, and possibly more in the future.
Currently:
Sharded `task_manager` is started in `main.cc` where it is further passed
to `http_context` for the purpose of user interface.
Task manager's tasks are implemented in two layers: the abstract one
and the implementation one. The latter is a pure virtual class which needs
to be overridden by each module. The abstract layer provides the methods that
are shared by all modules and the access to module-specific methods.
Each module can access task manager, create and manage its tasks through
`task_manager::module` object. This way data specific to a module can be
separated from the other modules.
User can access task manager rest api interface to track asynchronous tasks.
The available options consist of:
- getting a list of modules
- getting a list of basic stats of all tasks in the requested module
- getting the detailed status of the requested task
- aborting the requested task
- waiting for the requested task to finish
To enable testing of the provided api, test specific task implementation and module
are provided. Their lifetime can be simulated with the standalone test api.
These components are compiled and the tests are run in all but release build modes.
Fixes: #9809
Closes #11216
* github.com:scylladb/scylladb:
test: task manager api test
task_manager: test api layer implementation
task_manager: add test specific classes
task_manager: test api layer
task_manager: api layer implementation
task_manager: api layer
task_manager: keep task_manager reference in http_context
start sharded task manager
task_manager: create task manager object
This patch adds a set of 10 scenarios that have been uncovered during additional testing.
In particular, most of the scenarios cover ALTER TABLE statements, which - if not handled -
may break the guardrails safe-mode. The situations covered are:
- STCS->TWCS with no TTL defined
- STCS->TWCS with small TTL
- STCS->TWCS with large TTL value
- TWCS table with small to large TTL
- No TTL TWCS to large TTL and then small TTL
- twcs_max_window_count LiveUpdate - Decrease TTL
- twcs_max_window_count LiveUpdate - Switch CompactionStrategy
- No TTL TWCS table to STCS
- Large TTL TWCS table, modify attribute other than compaction and default_time_to_live
- Large TTL STCS table, fail to switch to TWCS with no TTL explicitly defined
This patch adds a test for checking the validity of tables using TimeWindowCompactionStrategy
with an incorrect number of compaction windows.
The twcs_max_window_count LiveUpdate-able parameter is also disabled during the execution of the
test in order to ensure that users can effectively disable the enforcement, should they want to.
This patch adds a testcase for TimeWindowCompactionStrategy tables created with no
default_time_to_live defined. It makes use of the LiveUpdate-able restrict_twcs_default_ttl
parameter in order to determine whether TWCS tables without TTL should be forbidden or not.
The test replays all 3 possible variations of the tri_mode_restriction and verifies tables
are correctly created/altered according to the current setting on the replica which receives
the request.
TimeWindowCompactionStrategy (TWCS) tables are known for being used explicitly for time-series workloads. In particular, most of the time users should specify a default_time_to_live during table creation to ensure data is expired as in a sliding window. Failure to do so may create unbounded windows, which, depending on the compaction window chosen, may introduce severe latency and operational problems due to unbounded window growth.
However, there may be some use cases which explicitly ingest data by using the `USING TTL` keyword, which effectively has the same effect. Therefore, we can not simply forbid table creations without a default_time_to_live explicitly set to any value other than 0.
The new restrict_twcs_without_default_ttl option has three values: "true", "false", and "warn":
We default to "warn", which will notify the user of the consequences when creating a TWCS table without a default_time_to_live value set. However, users are encouraged to switch it to "true", as, ideally, a default_time_to_live value should always be expected, to guard against applications ingesting data into the database while omitting the `USING TTL` keyword.
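A sketch of the tri-mode check (illustrative Python; the real validation lives in the CQL statement code):

```python
def check_twcs_default_ttl(mode: str, default_ttl: int):
    """mode is the tri_mode_restriction value: 'true', 'false', or 'warn'."""
    if default_ttl > 0 or mode == "false":
        return None  # nothing to report
    msg = "TWCS table created without default_time_to_live"
    if mode == "true":
        raise ValueError(msg)   # forbid the statement
    return "warning: " + msg    # mode == 'warn' (the default)

print(check_twcs_default_ttl("warn", 0))      # warns, but allows the table
print(check_twcs_default_ttl("true", 86400))  # None: a TTL is set, nothing to do
```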
The number of potential compaction windows (or buckets) is defined by the default_time_to_live / sstable_window_size ratio. Every now and then, users of TWCS end up underestimating their window bucket count. Unfortunately, scenarios in which one employs a default_time_to_live setting of 1 year but a window size of 30 minutes are not rare enough.
Such a configuration is known to only harm a workload: as more and more windows are created, the number of SSTables will grow at the same pace, and the situation will only get worse as the number of shards increases.
This commit introduces the twcs_max_window_count option, which defaults to 50, and will forbid creating or altering tables which get past this threshold. A value of 0 will explicitly skip this check.
Note: this option does not forbid the creation of tables with a default_time_to_live=0 as - even though not recommended - it is perfectly possible for a TWCS table with default TTL=0 to have a bound window, provided any ingestion statements make use of 'USING TTL' within the CQL statement, in addition to it.
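The window-count arithmetic from the paragraphs above, as a sketch (the default threshold of 50 comes from the text; helper names are hypothetical):

```python
def window_count(default_ttl_seconds: int, window_size_seconds: int) -> int:
    # Number of potential compaction windows (buckets):
    # default_time_to_live / sstable_window_size.
    return default_ttl_seconds // window_size_seconds

def violates_max_window_count(count: int, max_count: int = 50) -> bool:
    return max_count != 0 and count > max_count  # 0 disables the check

# The pathological example from above: 1-year TTL with 30-minute windows.
year, half_hour = 365 * 24 * 3600, 30 * 60
print(window_count(year, half_hour))                            # 17520 buckets
print(violates_max_window_count(window_count(year, half_hour)))  # True
```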
Scylla mistakenly allows a user to configure an invalid TWCS window_size <= 0, which effectively breaks the notion of compaction windows.
Interestingly enough, a <= 0 window size should be considered undefined behavior, as either we would create a new window every 0 duration (?) or the table would behave as STCS; the reader is encouraged to figure out which one of these is true. :-)
Cassandra, on the other hand, will properly throw a ConfigurationException when receiving such invalid window sizes, and we now match Cassandra's behavior.
Refs: #2336
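The validation amounts to the following (a sketch; Scylla's actual exception type and message differ):

```python
def validate_twcs_window_size(window_size: int) -> int:
    # Mirror Cassandra's behavior: reject non-positive window sizes
    # at table creation/alteration time instead of accepting them.
    if window_size <= 0:
        raise ValueError(
            f"compaction_window_size must be positive, got {window_size}")
    return window_size

print(validate_twcs_window_size(10))  # 10: accepted
```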
Now memtables live in compaction_group. Also introduced a function
that selects a group based on token, but today table always returns
the single group managed by it. Once multiple groups are supported,
then the function should interpret token content to select the
group.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
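The single-group stage can be sketched as follows (illustrative Python; the real code is C++ in replica/table):

```python
class Table:
    """Sketch of the incremental stage described above: the token-to-group
    lookup exists, but always resolves to the table's only compaction group."""
    def __init__(self):
        self._single_group = object()  # stand-in for a compaction_group

    def compaction_group_for_token(self, token: int):
        # Later: interpret the token (e.g. per vnode range) to pick a group.
        return self._single_group

t = Table()
# Every token maps to the same group for now.
print(t.compaction_group_for_token(1) is t.compaction_group_for_token(2**63))  # True
```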
The group is now responsible for providing the compound set.
table still has one compound set, which will span all groups for
the cases we want to ignore the group isolation.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This commit is restricted to moving maintenance set into compaction_group.
Next, we'll introduce compound set into it.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This commit is restricted to moving main set into compaction_group.
Next, we'll move maintenance set into it and finally the memtable.
A method is introduced to figure out which group an sstable belongs
to, but it's still unimplemented as table is still limited to
a single group.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Compaction group is a new abstraction used to group SSTables
that are eligible to be compacted together. By this definition,
a table in a given shard has a single compaction group.
The problem with this approach is that data from different vnodes
is intermixed in the same sstable, making it hard to move data
in a given sstable around.
Therefore, we'll want to have multiple groups per table.
A group can be thought of as an isolated LSM tree where its memtable
and sstable files are isolated from other groups.
As for the implementation, the idea is to take a very incremental
approach.
In this commit, we're introducing a single compaction group to
table.
Next, we'll migrate sstable and maintenance set from table
into that single compaction group. And finally, the memtable.
Cache will be shared among the groups, for simplicity.
It works due to its ability to invalidate a subset of the
token range.
There will be 1:1 relationship between compaction_group and
table_state.
We can later rename table_state to compaction_group_state.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
await_pending_ops() is today marked noexcept, so it doesn't have to
be implemented with finally() semantics.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
stop_ongoing_compactions() is made noexcept too as it's called from
remove() and we want to make the latter noexcept, to allow compaction
group to qualify its stop function as noexcept too.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Preparatory change for main sstable set to be moved into compaction
group. After that, tests can no longer directly access the main
set.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
So methods that look at data size and require it to be higher than 0
will work on fake SSTables created using set_values().
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
column_family_test::add_sstable will soon be changed to run in a thread,
and it's not needed in this procedure, so let's remove its usage.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Otherwise it crashes some python versions.
The cast was there before commit a2dd64f68f
explicitly dropped it while moving the code between files.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11511
It was not and won't be used for anything.
Note that the feature was always disabled or masked so no node ever
announced it, thus it's safe to get rid of.
Closes#11505
This std::function causes allocations, both on construction
and in other operations. This costs ~2200 instructions
for a DC-local query. Fix that.
Closes#11494
Option names given in db/config.cc are handled for the command line by passing
them to boost::program_options, and by YAML by comparing them with YAML
keys.
boost::program_options has logic for understanding the
long_name,short_name syntax, so for a "workdir,W" option both --workdir and -W
worked, as intended. But our YAML config parsing doesn't have this logic
and expected "workdir,W" verbatim, which is obviously not intended. Fix that.
Fixes #7478
Fixes #9500
Fixes #11503
Closes #11506
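The fix boils down to stripping the short-name suffix before the YAML lookup; a sketch (hypothetical helper name):

```python
def yaml_key(option_name: str) -> str:
    """Strip the boost::program_options short-name suffix ('workdir,W')
    so the YAML config lookup uses just the long name."""
    return option_name.split(",", 1)[0]

print(yaml_key("workdir,W"))  # workdir
print(yaml_key("smp"))        # smp (no short name; unchanged)
```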
Broadcast tables are tables for which all statements are strongly
consistent (linearizable), replicated to every node in the cluster and
available as long as a majority of the cluster is available. If a user
wants to store a “small” volume of metadata that is not modified “too
often” but provides high resiliency against failures and strong
consistency of operations, they can use broadcast tables.
The main goal of the broadcast tables project is to solve problems which
need to be solved when we eventually implement general-purpose strongly
consistent tables: designing the data structure for the Raft command,
ensuring that the commands are idempotent, handling snapshots correctly,
and so on.
In this MVP (Minimum Viable Product), statements are limited to simple
SELECT and UPDATE operations on the built-in table. In the future, other
statements and data types will be available but with this PR we can
already work on features like idempotent commands or snapshotting.
Snapshotting is not handled yet which means that restarting a node or
performing too many operations (which would cause a snapshot to be
created) will give incorrect results.
In a follow-up, we plan to add end-to-end Jepsen tests
(https://jepsen.io/). With this PR we can already simulate operations on
lists and test linearizability in linear complexity. This can also test
Scylla's implementation of persistent storage, failure detector, RPC,
etc.
Design doc: https://docs.google.com/document/d/1m1IW320hXtsGulzSTSHXkfcBKaG5UlsxOpm6LN7vWOc/edit?usp=sharing
Closes #11164
* github.com:scylladb/scylladb:
raft: broadcast_tables: add broadcast_kv_store test
raft: broadcast_tables: add returning query result
raft: broadcast_tables: add execution of intermediate language
raft: broadcast_tables: add compilation of cql to intermediate language
raft: broadcast_tables: add definition of intermediate language
db: system_keyspace: add broadcast_kv_store table
db: config: add BROADCAST_TABLES feature flag
The implementation of a test API that helps testing the task manager
API. It provides methods to simulate the operations that can happen
on modules and their tasks. Through the API the user can register
and unregister the test module and the tasks belonging to the module,
and finish the tasks with success or a custom error.
The implementation of a task manager API layer. It provides
methods to list the modules registered in the task_manager, list
tasks belonging to a given module, and abort, wait for, or retrieve
the status of a given task.
Implementation of a task manager that allows tracking
and managing asynchronous tasks.
Tasks are represented by the task_manager::task class, which provides
members common to all types of tasks. The methods that differ
among tasks of different modules can be overridden in a class
inheriting from the task_manager::task::impl class. Each task stores
its status, containing parameters like id, sequence number, begin
and end time, state, etc. After a task finishes, it is kept
in memory for a configurable time or until it is unregistered.
Tasks need to be created with the make_task method.
Each module is represented by the task_manager::module type and should
access the task manager through task_manager::module methods.
This allows easily separating and collectively managing the data
belonging to each module.
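The lifecycle described above can be sketched as a toy model. This is an illustrative Python sketch with invented names (`TaskManager`, `make_task`, `finish`, `reap`), not Scylla's actual C++ API:

```python
import uuid

class TaskManager:
    """Toy model of the task manager described above: each task has a status,
    and finished tasks are kept in memory for a configurable time (ttl)
    or until explicitly unregistered."""

    def __init__(self, ttl=60.0):
        self._ttl = ttl
        self._tasks = {}  # task id -> status dict

    def make_task(self, module, now):
        # tasks must be created through this method
        tid = str(uuid.uuid4())
        self._tasks[tid] = {"id": tid, "module": module, "state": "running",
                            "begin": now, "end": None, "error": None}
        return tid

    def finish(self, tid, now, error=None):
        # finish with success or a custom error
        status = self._tasks[tid]
        status["state"] = "failed" if error else "done"
        status["error"] = error
        status["end"] = now

    def unregister(self, tid):
        self._tasks.pop(tid, None)

    def reap(self, now):
        # drop finished tasks whose retention time has elapsed
        self._tasks = {tid: s for tid, s in self._tasks.items()
                       if s["end"] is None or now - s["end"] < self._ttl}

    def status(self, tid):
        return self._tasks.get(tid)
```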
There are two methods to close an RPC socket in the messaging
service -- one called on the error path of messaging_service::send_...
and the other called upon gossiper down/leave/cql-off notifications.
The former notifies listeners about the connection drop, the latter doesn't.
The only listener is the storage proxy, which, in turn, kicks the database to
release per-table cache hit-rate data. As a result, when a node goes down
(or when an operator shuts down its transport) the hit-rate stats
for this node are leaked.
This patch moves the notification so that any socket drop triggers it,
thus releasing the hit rates.
fixes: #11497
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
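The shape of the fix can be sketched as follows. This is a toy observer-pattern model with invented names (`Messaging`, `close_on_send_error`, etc.), not the actual messaging_service code:

```python
class Messaging:
    """Toy sketch: every socket-close path goes through one notification
    point, so listeners (such as the storage proxy) always get a chance to
    release per-endpoint state like cached hit rates."""

    def __init__(self):
        self._drop_listeners = []

    def register_connection_drop_listener(self, fn):
        self._drop_listeners.append(fn)

    def _notify_drop(self, endpoint):
        for fn in self._drop_listeners:
            fn(endpoint)

    def close_on_send_error(self, endpoint):
        self._notify_drop(endpoint)

    def close_on_down_notification(self, endpoint):
        # before the fix this path skipped the notification, leaking the
        # per-endpoint hit-rate stats; now it notifies as well
        self._notify_drop(endpoint)

# usage: the listener releases the per-endpoint stats on any drop
hit_rates = {"10.0.0.1": 0.9}
ms = Messaging()
ms.register_connection_drop_listener(lambda ep: hit_rates.pop(ep, None))
ms.close_on_down_notification("10.0.0.1")
```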
- To isolate the different pytest suites, remove the top level conftest
and move needed contents to existing `test/pylib/cql_repl/conftest.py`
and `test/topology/conftest.py`.
- Add logging to CQL and Python suites.
- Log driver version for CQL and topology tests.
Closes#11482
* github.com:scylladb/scylladb:
test.py: enable log capture for Python suite
test.py: log driver name/version for cql/topology
test.py: remove top level conftest.py
Test queries Scylla with the following statements:
* SELECT value FROM system.broadcast_kv_store WHERE key = CONST;
* UPDATE system.broadcast_kv_store SET value = CONST WHERE key = CONST;
* UPDATE system.broadcast_kv_store SET value = CONST WHERE key = CONST IF value = CONST;
where CONST is a string randomly chosen from a small set of random strings,
and half of the conditional updates have a condition comparing against the
last written value.
The intermediate language added a new layer of abstraction between a CQL
statement and querying mutations; this commit adds a new layer of
abstraction between mutations and returning the query result.
A result can't be directly returned from `group0_state_machine::apply`, so
we decided to hold query results in a map inside `raft_group0_client`. It can
be safely read after `add_entry_unguarded`, because this method waits
for the raft command to be applied. After the result is translated to a
`result_message`, or in case of an exception, the map entry is erased.
Extended `group0_command` to enable transmission of `raft::broadcast_tables::query`.
Added `add_entry_unguarded` method in `raft_group0_client` for dispatching raft
commands without `group0_guard`.
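The result-parking scheme can be sketched like this. A minimal Python model with assumed names (`ResultHolder`, `store`, `take`), not the actual raft_group0_client code:

```python
class ResultHolder:
    """Toy sketch of the result plumbing: apply() cannot return a value to
    the caller, so results are parked in a map keyed by query id; the caller
    that awaited the command's application reads and erases the entry."""

    def __init__(self):
        self._results = {}

    def store(self, query_id, result):
        # called from the state machine's apply path
        self._results[query_id] = result

    def take(self, query_id):
        # called once the raft command has been applied; the entry is erased
        # so the map does not grow without bound (it is also erased if
        # translating the result raises)
        return self._results.pop(query_id)
```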
Queries on group0_kv_store are executed in `group0_state_machine::apply`,
but for now don't return results. They don't use the previous state id, so
they will block concurrent schema changes, but those changes won't block
queries. In this version snapshots are ignored.
We decided to extend `cql_statement` hierarchy with `strongly_consistent_modification_statement`
and `strongly_consistent_select_statement`. Statements operating on
system.broadcast_kv_store will be compiled to these new subclasses if
BROADCAST_TABLES flag is enabled.
If the query is executed on a shard other than 0 it's bounced to that shard.
An instance may be invalidated before we try to recycle it;
invalidation is done by setting its value to nullopt.
This patch adds a check for that when calculating its size.
This behavior didn't cause issues before because the catch
clause below caught the errors caused by calling value() on
a nullopt, even though it was intended for errors from
get_instance_size.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
Closes#11500
With EverywhereStrategy, we know that all tokens will be on the same node and the data is typically sparse like LocalStrategy.
Result of testing the feature:
Cluster: 2 DC, 2 nodes in each DC, 256 tokens per nodes, 14 shards per node
Before: 154 scanning operations
After: 14 scanning operations (~10x improvement)
On bigger clusters, it will probably be even more efficient.
Closes#11403
In broadcast tables, the raft command contains a whole program to be executed.
Sending the entire CQL statement to each node and parsing it there is
inefficient, so we decided to compile it to an intermediate language which
can be easily serialized.
This patch adds a definition of such a language. For now, only the following
types of statements can be compiled:
* select value where key = CONST from system.broadcast_kv_store;
* update system.broadcast_kv_store set value = CONST where key = CONST;
* update system.broadcast_kv_store set value = CONST where key = CONST if value = CONST;
where CONST is a string literal.
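The three statement shapes above map naturally to a tiny IR. This is a hypothetical Python sketch (invented node names, a dict standing in for the kv table), not the actual serialized format:

```python
from dataclasses import dataclass

# One IR node per supported statement shape.
@dataclass
class Select:
    key: str

@dataclass
class Update:
    key: str
    value: str

@dataclass
class ConditionalUpdate:
    key: str
    value: str
    expected: str

def execute(op, store):
    """Evaluate one IR node against a dict standing in for the kv table."""
    if isinstance(op, Select):
        return store.get(op.key)
    if isinstance(op, Update):
        store[op.key] = op.value
        return None
    if isinstance(op, ConditionalUpdate):
        # conditional update: apply only if the current value matches
        applied = store.get(op.key) == op.expected
        if applied:
            store[op.key] = op.value
        return applied
    raise TypeError(f"unknown IR node: {op!r}")
```

Because the nodes are plain flat records, they are straightforward to serialize and ship inside a raft command.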
sstable::get_filename() constructs the filename from components, which
takes some work. It happens to be called on every
index_reader::index_reader() call even though it's only used for TRACE
logs. That's 1700 instructions (~1% of a full query) wasted on every
SSTable read. Fix that.
Closes#11485
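The general remedy for this kind of waste is to defer expensive formatting until the log line is actually emitted. A small Python illustration of the idea (the `LazyFilename` class is invented for this sketch; it merely stands in for `sstable::get_filename()`):

```python
import logging

logger = logging.getLogger("index_reader")

class LazyFilename:
    """Defers the (relatively expensive) filename construction until the
    message is actually formatted, i.e. only when the log level is enabled."""

    def __init__(self, components):
        self._components = components

    def __str__(self):
        # stands in for assembling the name from the sstable components
        return "-".join(self._components) + "-Data.db"

# The cost is paid only if DEBUG/TRACE logging is actually enabled:
logger.debug("opening index for %s", LazyFilename(("ks", "table", "me", "3", "big")))
```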
Change a8ad385ecd introduced
```
thread_local std::unordered_map<utils::UUID, seastar::lw_shared_ptr<repair_history_map>> repair_history_maps;
```
We're trying to avoid globally scoped variables as much as we can, so this should probably be embedded in some sharded service.
This series moves the thread-local `repair_history_maps` instances to `compaction_manager`
and passes a reference to the shard's compaction_manager to the functions that need it for compact_for_query
and compact_for_compaction.
Since some paths don't need it and don't have access to the compaction_manager,
the series introduces `utils::optional_reference<T>`, which allows passing nullopt.
In this case, `get_gc_before_for_key` behaves in `tombstone_gc_mode::repair` as if the table wasn't repaired and tombstones are not garbage-collected.
Fixes #11208
Closes#11366
* github.com:scylladb/scylladb:
tombstone_gc: deglobalize repair_history_maps
mutation_compactor: pass tombstone_gc_state to compact_mutation_state
mutation_partition: compact_for_compaction_v2: get tombstone_gc_state
mutation_partition: compact_for_compaction: get tombstone_gc_state
mutation_readers: pass tombstone_gc_state to compating_reader
sstables: get_gc_before_*: get tombstone_gc_state from caller
compaction: table_state: add virtual get_tombstone_gc_state method
db: view: get_tombstone_gc_state from compaction_manager
db: view: pass base table to view_update_builder
repair: row_level: repair_update_system_table_handler: get get_tombstone_gc_state for db compaction_manager
replica: database: get_tombstone_gc_state from compaction_manager
compaction_manager: add tombstone_gc_state
replica: table: add get_compaction_manager function
tombstone_gc: introduce tombstone_gc_state
repair_service: simplify update_repair_time error handling
tombstone_gc: update_repair_time: get table_id rather than schema_ptr
tombstone_gc: delete unused forward declaration
database: do not drop_repair_history_map_for_table in detach_column_family
The scope of this PR:
- Removing support for Ubuntu 16.04 and Debian 9.
- Adding support for Debian 11.
Closes#11461
* github.com:scylladb/scylladb:
doc: remove support for Debian 9 from versions 2022.1 and 2022.2
doc: remove support for Ubuntu 16.04 from versions 2022.1 and 2022.2
doc: add support for Debian 11 to versions 2022.1 and 2022.2
Enable pytest log capture for Python suite. This will help debugging
issues in remote machines.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Remove top level conftest so different suites have their own (as it was
before).
Move minimal functionality into existing test/pylib/cql_repl/conftest.py
so cql tests can run on their own.
Move param setting into test/topology/conftest.py.
Use uuid module for unique keyspace name for cql tests.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Before/after test checks are done per test case, there's no longer need
to check after pytest finishes.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11489
As check_restricted_table_properties() is invoked within both CREATE TABLE and ALTER TABLE CQL statements,
we currently have no way to determine whether the operation was a CREATE or an ALTER. In many situations,
it is important to be able to distinguish between the two operations -- for example, to tell whether a table already has
a particular property set or whether we are defining it within the statement.
This patch simply adds a std::optional<schema_ptr> parameter to check_restricted_table_properties() and updates its callers.
Whenever a CREATE TABLE statement is issued, the method is called with std::nullopt, whereas if an ALTER TABLE is
issued instead, we call it with a schema_ptr.
When importing from `pylib`, don't modify `sys.path` but use the fact
that both `test/` and `test/pylib/` directories contain an `__init__.py`
file, so `test.pylib` is a valid module if we start with `test/` as the
Python package root.
Both `pytest` and `mypy` (and I guess other tools) understand this
setup.
Also add an `__init__.py` to `test/topology/` so other modules under the
`test/` directory will be able to import stuff from `test/topology/`
(i.e. from `test.topology.X import Y`).
Closes#11467
I created new issues for each missing field in DescribeTable's
response for GSIs and LSIs, so in this patch we edit the xfail
messages in the test to refer to these issues.
Additionally, we only had a test for these fields for GSIs, so this
patch also adds a similar test for LSIs. It turns out there is a
difference between the two tests - the two fields IndexStatus and
ProvisionedThroughput are returned for GSIs, but not for LSIs.
Refs #7750
Refs #11466
Refs #11470
Refs #11471
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11473
Move the thread-local instances of the
per-table repair history maps into compaction_manager.
Fixes#11208
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Pass the tombstone_gc_state from the compaction_strategy
to sstables get_gc_before_* functions using the table state
to get to the tombstone_gc_state.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
and override it in table::table_state to get the tombstone_gc_state
from the table's compaction_manager.
It is going to be used in the next patches to pass the gc state
from the compaction_strategy down to sstables and compaction.
table_state_for_test was modified to just keep a null
tombstone_gc_state.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
To be used by generate_update() for getting the
tombstone_gc_state via the table's compaction_manager.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Add a tombstone_gc_state member and methods to get it.
Currently the tombstone_gc_state is default constructed,
but a following patch will move the thread-local
repair history maps into the compaction_manager as a member
and then the _tombstone_gc_state member will be initialized
from that member.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
and use it to access the repair history maps.
In this introductory patch, we use a default-constructed
tombstone_gc_state to access the thread-local maps
temporarily and those use sites will be replaced
in following patches that will gradually pass
the tombstone_gc_state down from the compaction_manager
to where it's used.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
There's no need for per-shard try/catch here.
Just catch exceptions from the overall sharded operation
to update_repair_time.
Also, update warning to indicate that only updating the repair history
time failed, not "Loading repair history".
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
drop_repair_history_map_for_table is called on each shard
when database::truncate is done, and the table is stopped.
dropping it before the table is stopped is too early.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Fix https://github.com/scylladb/scylla-doc-issues/issues/816
Fix https://github.com/scylladb/scylla-docs/issues/1613
This PR fixes the CQL version in the Interfaces page, so that it is the same as in other places across the docs and in sync with the version reported by the ScyllaDB (see https://github.com/scylladb/scylla-doc-issues/issues/816#issuecomment-1173878487).
To make sure the same CQL version is used across the docs, we should use the `|cql-version|` variable rather than hardcode the version number on several pages.
The variable is specified in the conf.py file:
```
rst_prolog = """
.. |cql-version| replace:: 3.3.1
"""
```
Closes#11320
* github.com:scylladb/scylladb:
doc: add the Cassandra version on which the tools are based
doc: fix the version number
doc: update the Enterprise version where the ME format was introduced
doc: add the ME format to the Cassandar Compatibility page
doc: replace Scylla with ScyllaDB
doc: rewrite the Interfaces table to the new format to include more information about CQL support
doc: remove the CQL version from pages other than Cassandra compatibility
doc: fix the CQL version in the Interfaces table
Most of the test's cases use rack-inferring snitch driver and get
DC/RACK from it via the test_dc_rack() helper. The helper was introduced
in one of the previous sets to populate token metadata with some DC/RACK
as normal tokens manipulations required respective endpoint in topology.
This patch removes the usage of the global snitch and replaces it with a
pre-populated topology. The pre-population is done in a rack-inferring
snitch like manner, since token_metadata still uses the global snitch, and
the locations from the snitch and from this temporary topology should match.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a test case that makes its own snitch driver that generates
pre-calculated DC/RACK data for test endpoints. This patch replaces this
custom snitch driver with a standalone topology object.
Note: to get DC/RACK info from this topo the get_location() is used
since the get_rack()/get_datacenter() are still wrappers around global
snitch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently abort-mode scrub exits with a message which basically says
"some problem was found", with no details on what problem it found. Add
a detailed error report on the found problem before aborting the scrub.
Closes#11418
`tokenof` calculates and prints the token of a partition-key.
`shardof` calculates the token and finds the owner shard of a partition-key. The number of shards has to be provided by the `--shards` parameter. The ignore-msb-bits value can be tweaked with the `--ignore-msb-bits` parameter, which defaults to 12.
Examples:
```
$ scylla types tokenof --full-compound -t UTF8Type -t SimpleDateType -t UUIDType 000d66696c655f696e7374616e63650004800049190010c61a3321045941c38e5675255feb0196
(file_instance, 2021-03-27, c61a3321-0459-41c3-8e56-75255feb0196): -5043005771368701888
$ scylla types shardof --full-compound -t UTF8Type -t SimpleDateType -t UUIDType --shards=7 000d66696c655f696e7374616e63650004800049190010c61a3321045941c38e5675255feb0196
(file_instance, 2021-03-27, c61a3321-0459-41c3-8e56-75255feb0196): token: -5043005771368701888, shard: 1
```
Closes#11436
* github.com:scylladb/scylladb:
tools/scylla-types: add shardof action
tools/scylla-types: pass variable_map to action handlers
tools/scylla-types: add tokenof action
tools/scylla-types: extract printing code into functions
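The token-to-shard mapping that `shardof` computes can be sketched in a few lines of Python. This follows Scylla's token-to-shard algorithm as I understand it (bias the signed token into unsigned space, shift out the ignored most-significant bits, then take the high 64 bits of a 128-bit product); the function name and signature are invented for this sketch:

```python
def shard_of(token, shard_count, ignore_msb_bits=12):
    """Map a signed 64-bit token to a shard in [0, shard_count)."""
    mask = (1 << 64) - 1
    # bias the signed token into the unsigned 64-bit range
    biased = (token + (1 << 63)) & mask
    # discard the top ignore_msb_bits by shifting them out (mod 2^64)
    shifted = (biased << ignore_msb_bits) & mask
    # scale into shard_count buckets: high 64 bits of the 128-bit product
    return (shifted * shard_count) >> 64
```

For the token in the example above, `shard_of(-5043005771368701888, 7)` yields shard 1, matching the tool's output.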
Continuation to debfcc0e (snitch: Move sort_by_proximity() to topology).
The passed addresses are not modified by the helper. They are not yet
const because the method was copy-n-pasted from snitch where it wasn't
such.
tests: unit(dev)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220906074708.29574-1-xemul@scylladb.com>
The mutable get_datacenter_endpoints() and get_datacenter_racks() are
dangerous since they expose internal members without enforcing class
invariants. Fortunately they are unused, so delete them.
Closes#11454
"
There are two helpers on snitch that manipulate lists of nodes taking their
dc/rack into account. This set moves these methods from snitch to topology
and storage proxy.
"
* 'br-snitch-move-proximity-sorters' of https://github.com/xemul/scylla:
snitch: Move sort_by_proximity() to topology
topology: Add "enable proximity sorting" bit
code: Call sort_endpoints_by_proximity() via topology
snitch, code: Remove get_sorted_list_by_proximity()
snitch: Move is_worth_merging_for_range_query to proxy
There's one corner case in sorting nodes by snitch. The simple snitch
code overloads the call and doesn't sort anything. The same behavior
should be preserved by the (future) topology implementation, but it doesn't
know the snitch name. To address that, the patch adds a boolean switch on
topology that's turned off by the main code when it sees the snitch is the
"simple" one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
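The behavior the switch preserves can be summarized in a short sketch (invented signature; `distance_to` stands in for whatever proximity metric the snitch provides):

```python
def sort_endpoints_by_proximity(endpoints, distance_to, proximity_sorting=True):
    """When proximity sorting is disabled (the SimpleSnitch case), sorting
    must be a no-op and the original order is preserved."""
    if not proximity_sorting:
        return list(endpoints)
    return sorted(endpoints, key=distance_to)
```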
The method is about to be moved from snitch to topology, this patch
prepares the rest of the code to use the latter to call it. The
topology's method just calls snitch, but it's going to change in the
next patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two sorting methods in snitch -- one sorts the list of
addresses in place, the other one creates a sorted copy of the passed
const list (in fact -- the passed reference is not const, but it's not
modified by the method). However, both callers of the latter anyway
create their own temporary list of address, so they don't really benefit
from snitch generating another copy.
So this patch leaves just one sorting method -- the in-place one.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Proxy is the only place that calls this method. Also the method name
suggests it's not something "generic", but rather an internal logic of
proxy's query processing.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
On #11399, I mistakenly committed the bug fix of the first patch (40134ef) to the second one (8835a34).
So the script will be broken with 40134ef alone, which is not good when we backport it to older versions.
Let's revert the commits and squash them into a single commit.
Closes#11448
* github.com:scylladb/scylladb:
scylla_raid_setup: prevent mount failed for /var/lib/scylla
Revert "scylla_raid_setup: check uuid and device path are valid"
Revert "scylla_raid_setup: prevent mount failed for /var/lib/scylla"
The first implementation of strongly consistent everywhere tables operates on a simple table
representing a string-to-string map.
Add hard-coded schema for broadcast_kv_store table (key text primary key,
value text). This table is under system keyspace and is created if and only if
BROADCAST_TABLES feature is enabled.
Add experimental flag 'broadcast-tables' for enabling BROADCAST_TABLES feature.
This feature requires raft group0, thus enabling it without RAFT will cause an error.
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.
Also added code to check that the uuid and the uuid-based device path are valid.
Fixes#11359
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
From now on, when an alternator user correctly passed an authentication
step, their assigned client_state will have that information,
which also means proper access to service level configuration.
Previously the username was only used in tracing.
The constructor can be used as backdoor from frontends other than
CQL to create a session with an authenticated user, with access
to its attached service level information.
Fix https://github.com/scylladb/scylladb/issues/11430
@tzach I've added support for Ubuntu 22.04 to the row for version 2022.2. Does that version support Debian 11? That information is also missing (it was only added to OSS 5.0 and 5.1).
Closes#11437
* github.com:scylladb/scylladb:
doc: add support for Ubuntu 22.04 to the Enterprise table
doc: rename the columns in the Enterpise section to be in sync with the OSS section
Decorates a partition key and calculates which shard it belongs to,
given the shard count (--shards) and the ignore-msb-bits
(--ignore-msb-bits) parameters. The latter is optional and defaults
to 12.
Example:
$ scylla types shardof --full-compound -t UTF8Type -t SimpleDateType -t UUIDType --shards=7 000d66696c655f696e7374616e63650004800049190010c61a3321045941c38e5675255feb0196
(file_instance, 2021-03-27, c61a3321-0459-41c3-8e56-75255feb0196): token: -5043005771368701888, shard: 1
Said method currently emits a partition-end. This method is only called
when the last fragment in the stream is a range tombstone change with a
position after all clustered rows. The problem is that
consume_partition_end() is also called unconditionally, resulting in two
partition-end fragments being emitted. The fix is simple: make this
method a no-op, there is nothing to do there.
Also add two tests: one targeted to this bug and another one testing the
crawling reader with random mutations generated for random schema.
Fixes: #11421
Closes#11422
Instead of the entire object. Repair meta is a large object, its
printout floods the output of the command. Print only its address, the
user can print the objects it is interested in.
Closes#11428
Ubuntu 20.04 has less than 3 years of OS support remaining.
We should switch to Ubuntu 22.04 to reduce the need for OS upgrades in newly installed clusters.
Closes#11440
"
Messaging needs to know DC/RACK for nodes to decide whether it needs to
do encryption or compression depending on the options. Like all the other
services, it still uses the snitch to get it, but a simple switch to using
topology needs extra care.
The thing is that messaging can use internal IPs instead of endpoints.
Currently it's the snitch that somehow tries to resolve this; in
particular, if the DC/RACK is not found for the given argument it
assumes that it might be an internal IP and calls back into messaging to
convert it to the endpoint. However, messaging does know when it uses which
address and can do this conversion itself.
So this set eliminates a few more global snitch usages and drops the
knot tying snitch, gossiper and messaging to each other.
"
* 'br-messaging-use-topology-1.2' of https://github.com/xemul/scylla:
messaging: Get DC/RACK from topology
messaging, topology: Keep shared_token_metadata* on messaging
messaging: Add is_same_{dc|rack} helpers
snitch, messaging: Dont relookup dc/rack on internal IP
A recent change in topology (commit 4cbe6ee9 titled
"topology: Require entry in the map for update_normal_tokens()")
made token_metadata::update_normal_tokens() require the entry's presence
in the embedded topology object. Respectively, the commit in question
equipped most callers of update_normal_tokens() with a preceding
topology-update call to satisfy the requirement.
However, tokens are put into token_metadata not only for the normal state,
but also for bootstrapping, and one place that added bootstrapping
tokens erroneously got a topology update. This is wrong -- a node must
not be present in the topology until switching into the normal state. As
a result several tests with bootstrapping nodes started to fail.
The fix removes the topology update for bootstrapping nodes, but this
change reveals a few other places that piggy-backed on this mistaken
update, so now _they_ need to update the topology themselves.
tests: https://jenkins.scylladb.com/job/releng/job/Scylla-CI/2040/
update_cluster_layout_tests.py::test_simple_add_new_node_while_schema_changes_with_repair
update_cluster_layout_tests.py::test_simple_kill_new_node_while_bootstrapping_with_parallel_writes_in_multidc
repair_based_node_operations_test.py::test_lcs_reshape_efficiency
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220902082753.17827-1-xemul@scylladb.com>
* seastar f2d70c4a17...2b2f6c080e (4):
> perftune.py: special case a former 'MQ' mode in the new auto-detection code
> iostream: Generalize flush and batched flush
> Merge "Equip sharded<>::invoke_on_all with unwrap_sharded_args" from Pavel E
> Merge "perftune.py: cosmetic fixes" from VladZ
Closes#11434
Fix https://github.com/scylladb/scylladb/issues/11393
- Rename the tool names across the docs.
- Update the examples to replace `scylla-sstable` and `scylla-types` with `scylla sstable` and `scylla types`, respectively.
Closes#11432
* github.com:scylladb/scylladb:
doc: update the tool names in the toctree and reference pages
doc: rename the scylla-types tool as Scylla Types
doc: rename the scylla-sstable tool as Scylla SStable
Messaging will need to call topology methods to compare DC/RACK of peers
with local node. Topology now resides on token metadata, so messaging
needs to get the dependency reference.
However, messaging only needs the topology when it's up and running, so
instead of producing a life-time reference, add a pointer, that's set up
on .start_listen(), before any client pops up, and is cleared on
.shutdown() after all connections are dropped.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When getting dc/rack, the snitch may perform two lookups -- first it
uses the provided IP; if nothing is found, the snitch assumes that
the IP is an internal one, gets the corresponding public one, and searches
again.
The thing is that the only code that may come to the snitch with an internal
IP is the messaging service. It does so in two places: when it tries
to connect to a given endpoint and when it accepts a connection.
In the former case messaging performs the public->internal IP conversion
itself and goes to the snitch with the internal IP value. This place can get
simpler by just feeding the public IP to the snitch, and converting it to the
internal one only to initiate the connection.
In the latter case the accepted IP can be either, but the messaging service
has the public<->private map onboard and can do the conversion itself.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
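The conversion messaging takes over can be sketched as a pair of maps. This is an illustrative Python model with invented names (`AddressBook`, `cache_preferred_ip`, etc.), not the actual messaging_service code:

```python
class AddressBook:
    """Toy sketch: messaging owns the public<->internal map, so it can
    normalize an accepted peer address itself and only translate to the
    internal IP when actually initiating a connection."""

    def __init__(self):
        self._to_internal = {}  # public IP -> internal (preferred) IP
        self._to_public = {}    # internal IP -> public IP

    def cache_preferred_ip(self, public, internal):
        self._to_internal[public] = internal
        self._to_public[internal] = public

    def public_of(self, addr):
        # an accepted address may be either form; map it back if internal
        return self._to_public.get(addr, addr)

    def internal_of(self, public):
        # only needed when initiating the connection
        return self._to_internal.get(public, public)
```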
Just like 4a8ed4cc6f, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.
Also added code to check that the uuid and the uuid-based device path are valid.
Fixes#11359
Closes#11399
* github.com:scylladb/scylladb:
scylla_raid_setup: prevent mount failed for /var/lib/scylla
scylla_raid_setup: check uuid and device path are valid
this setting was removed back in
dcdd207349, so even though we are still
passing `storage_service_config` to the ctor of `storage_service`,
`storage_service::storage_service()` just drops it on the floor.
in this change, `storage_service_config` class is removed, and all
places referencing it are updated accordingly.
Signed-off-by: Kefu Chai <tchaikov@gmail.com>
Closes#11415
Google Groups recently started rewriting the From: header, garbling
our git log. This script rewrites it back, using the Reply-To header
as a still working source.
Closes#11416
It was pointed out to me that our description of the synchronous_updates
materialized-view option does not make it clear enough what is the
default setting, or why a user might want to use this option.
This patch changes the description to (I hope) better address these
issues.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11404
* github.com:scylladb/scylladb:
doc: cql-extensions.md: replace "Scylla" by "ScyllaDB"
doc: cql-extensions.md: improve description of synchronous views
This is a very important aspect of the tool that was completely missing from the document before. Also add a comparison with SStableDump.
Fixes: https://github.com/scylladb/scylladb/issues/11363
Closes#11390
* github.com:scylladb/scylladb:
docs: scylla-sstable.rst: add comparison with SStableDump
docs: scylla-sstable.rst: add section about providing the schema
It was recently decided that the database should be referred to as
"ScyllaDB", not "Scylla". This patch changes existing references
in docs/cql/cql-extensions.md.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
It was pointed out to me that our description of the synchronous_updates
materialized-view option does not make it clear enough what is the
default setting, or why a user might want to use this option.
This patch changes the description to (I hope) better address these
issues.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
"
The topology object maintains all sorts of node/DC/RACK mappings on
board. When new entries are added to it, the DC and RACK are taken
from the global snitch instance which, in turn, checks the gossiper,
the system keyspace and its local caches.
This set makes the topology-population API require DC and RACK via the
call arguments. In most of the cases the populating code is the
storage service, which knows exactly where to get those from.
After this set it will be possible to remove the dependency knot
consisting of snitch, gossiper, system keyspace and messaging.
"
* 'br-topology-dc-rack-info' of https://github.com/xemul/scylla:
toplogy: Use the provided dc/rack info
test: Provide testing dc/rack infos
storage_service: Provide dc/rack for snitch reconfiguration
storage_service: Provide dc/rack from system ks on start
storage_service: Provide dc/rack from gossiper for replacement
storage_service: Provide dc/rack from gossiper for remotes
storage_service,dht,repair: Provide local dc/rack from system ks
system_keyspace: Cache local dc-rack on .start()
topology: Some renames after previous patch
topology: Require entry in the map for update_normal_tokens()
topology: Make update_endpoint() accept dc-rack info
replication_strategy: Accept dc-rack as get_pending_address_ranges argument
dht: Carry dc-rack over boot_strapper and range_streamer
storage_service: Make replacement info a real struct
* seastar f9f5228b74...f2d70c4a17 (51):
> cmake: attach property to Valgrind not to hwloc
> Create the seastar_memory logger in all builds
> drop unused parameters
> Merge "Unify pollable_fd shutdown and abort_{reader|writer}" from Pavel E
> > pollable_fd: Replace two booleans with a mask
> > pollable_fd: Remove abort_reader/_writer
> Merge "Improve Rx channels assignment" from Vlad
> > perftune.py: fix comments of IRQ ordering functors
> > perftune.py: add VIRTIO fast path IRQs ordering functor
> > perftune.py: reduce number of Rx channels to the number of IRQ CPUs
> > perftune.py: introduce a --num-rx-queues parameter
> program_options: enable optional selection_value
> .gitignore: ignore the directories generated by VS Code and CLion.
> httpd: compare the Connection header value in a case-insensitive manner.
> httpd: move the logic of keepalive to a separate method.
> register one default priority class for queue
> Reset _total_stats before each run
> log: add colored logging support
> Merge "perftune.py: add NUMA aware auto-detection for big machines" from Vlad
> > perftune.py: mention 'irq_cpu_mask' in the description of the script operation
> > perftune.py: NetPerfTuner: fix bits counting in self.irqs_cpu_mask wider than 32 bits
> > perftune.py: PerfTuneBase.cpu_mask_is_zero(cpu_mask): cosmetics: fix a comment and a variable name
> > perftune.py: PerfTuneBase.cpu_mask_is_zero(cpu_mask): take into account omitted zero components of the mask
> > perftune.py: PerfTuneBase.compute_cpu_mask_for_mode(): cosmetics: fix a variable name
> > perftune.py: stop printing 'mode' in --dump-options-file
> > perftune.py: introduce a generic auto_detect_irq_mask(cpu_mask) function
> > perftune.py: DiskPerfTuner: use self.irqs_cpu_mask for tuning non-NVME disks
> > perftune.py: stop auto-detecting and using 'mode' internally
> > perftune.py: introduce --get-irq-cpu-mask command line parameter
> > perftune.py: introduce --irq-core-auto-detection-ratio parameter
> build: add a space after function name
> Update HACKING.md
> log: do not inherit formatter<seastar::log_level> from formatter<string_view>
> Merge "Mark connected_socket::shutdown_...'s internals noexcept" from Pavel E
> > native-stack: Mark tcp::in_state() (and its wrappers) const noexcept
> > native-stack: Mark tcb::close and tcb::abort_reader noexcept
> > native-stack: Mark tcp::connection::close_{read|write}() noexcept
> > native-stack: Mark tcb::clear_delayed_ack() and tcb::stop_retransmit_timer() noexcept
> > tls: Mark session::close() noexcept
> > file_desc: Add fdinfo() helper
> > posix-stack: Mark posix_connected_socket_impl::shutdown_{input|output}() noexcept
> > tests: Mark loopback_buffer::shutdown() noexcept
> Merge "Enhance RPC connection error injector" from Pavel E
> > loopback_socket: Shuffle error injection
> > loopback_socket: Extend error injection
> > loopback_socket: Add one-shot errors
> > loopback_socket: Add connection error injection
> > rpc_test: Extend error injector with kind
> > rpc_test: Inject errors on all paths
> > rpc_test: Use injected connect error
> > rpc_test: De-duplicate test socket creation
> Merge 'tls: vec_push: handle async errors rather than throwing on_internal_error' from Benny Halevy
> > tls: do_handshake: handle_output_error of gnutls_handshake
> > tls: session: vec_push: return output_pending error
> > tls: session: vec_push: reindent
> log: disambiguate formatter<log_level> from operator<<
> tls_test: Fix spurious fail in test_x509_client_with_builder_system_trust_multiple (et al)
Fixes scylladb/scylladb#11252
Closes#11401
This PR is related to https://github.com/scylladb/scylla-docs/issues/4124 and https://github.com/scylladb/scylla-docs/issues/4123.
**New Enterprise Upgrade Guide from 2021.1 to 2022.2**
I've added the upgrade guide for ScyllaDB Enterprise image. It consists of 3 files:
/upgrade/_common/upgrade-guide-v2022-ubuntu-and-debian-p1.rst
upgrade/_common/upgrade-image.rst
/upgrade/_common/upgrade-guide-v2022-ubuntu-and-debian-p2.rst
**Modified Enterprise Upgrade Guides 2021.1 to 2022.2**
I've modified the existing guides for Ubuntu and Debian to use the same files as above, but exclude the image-related information:
/upgrade/_common/upgrade-guide-v2022-ubuntu-and-debian-p1.rst + /upgrade/_common/upgrade-guide-v2022-ubuntu-and-debian-p2.rst = /upgrade/_common/upgrade-guide-v2022-ubuntu-and-debian.rst
To make things simpler and remove duplication, I've replaced the guides for Ubuntu 18 and 20 with a generic Ubuntu guide.
**Modified Enterprise Upgrade Guides from 4.6 to 5.0**
These guides contained a bug: they included the image-related information (about updating OS packages), because a file with that information was included by mistake. What's worse, it was duplicated. After the includes were removed, image-related information is no longer included in the Ubuntu and Debian guides (this fixes https://github.com/scylladb/scylla-docs/issues/4123).
I've modified the index file to be in sync with the updates.
Closes#11285
* github.com:scylladb/scylladb:
doc: reorganize the content to list the recommended way of upgrading the image first
doc: update the image upgrade guide for ScyllaDB image to include the location of the manifest file
doc: fix the upgrade guides for Ubuntu and Debian by removing image-related information
doc: update the guides for Ubuntu and Debian to remove image information and the OS version number
doc: add the upgrade guide for ScyllaDB image from 2021.1 to 2022.1
Having an error while pinging a peer is not a critical error. The code
retries and moves on. Let's log the message with less severity since
sometimes those errors may happen (for instance, during a node replace
operation some nodes refuse to answer pings) and dtest complains that
there are unexpected errors in the logs.
Message-Id: <Ywy5e+8XVwt492Nc@scylladb.com>
on_compaction_completion() is not very descriptive. Let's rename
it, following the example of
update_sstable_lists_on_off_strategy_completion().
Also let's coroutinize it, so as to remove the restriction of running
it inside a thread only.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11407
Test teardown involves dropping the test keyspace. If there are stopped servers, we would occasionally see timeouts.
Start stopped servers after a test is finished (and passed).
Revert previous commit making teardown async again.
Closes#11412
* github.com:scylladb/scylladb:
test.py: restart stopped servers before teardown...
Revert "test.py: random tables make DDL queries async"
Said command is broken since 4.6, as the type of `reader_concurrency_semaphore::_permit_list` was changed without an accompanying update to this command. This series updates said command and adds it to the list of tested commands so we notice if it breaks in the future.
Closes#11389
* github.com:scylladb/scylladb:
test/scylla-gdb: test scylla read-stats
scylla-gdb.py: read_stats: update w.r.t. post 4.5 code
scylla-gdb.py: improve string_view_printer implementation
This PR changes the CentOS 8 support to Rocky, and adds 5.1, 2022.1, and 2022.2 rows to the list of Scylla releases
Closes#11383
* github.com:scylladb/scylladb:
OS support page: use CentOS not Centos
OS support page: add 5.1, 2022.1 and 2022.2
OS support page: Update CentOS 8 to Rocky 8
for topology tests
Test teardown involves dropping the test keyspace. If there are stopped
servers, we would occasionally see timeouts.
Start stopped servers after a test is finished.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Issuing two CREATE TABLE statements with a different name for one of
the partition key columns leads to the following assertion failure on
all replicas:
scylla: schema.cc:363: schema::schema(const schema::raw_schema&, std::optional<raw_view_info>): Assertion `!def.id || def.id == id - column_offset(def.kind)' failed.
The reason is that once the create table mutations are merged, the
columns table contains two entries for the same position in the
partition key tuple.
If the schemas were the same, or not conflicting in a way which leads
to abort, the current behavior would be to drop the older table as if
the last CREATE TABLE was preceded by a DROP TABLE.
The proposed fix is to make CREATE TABLE mutation include a tombstone
for all older schema changes of this table, effectively overriding
them. The behavior will be the same as if the schemas were not
different, older table will be dropped.
Fixes#11396
Closes#11398
* github.com:scylladb/scylladb:
db: schema_tables: Make table creation shadow earlier concurrent changes
db: schema_tables: Fix formatting
db: schema_mutations: Make operator<<() print all mutations
schema_mutations: Make it a monoid by defining appropriate += operator
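The shadowing mechanism described above can be pictured with a minimal sketch (the types and merge rule below are illustrative, not Scylla's actual schema-mutation API): a tombstone whose timestamp is at least as new as an earlier write suppresses that write during merge, so a CREATE TABLE carrying such a tombstone behaves as if DROP TABLE ran first.

```cpp
#include <cassert>
#include <optional>
#include <string>

// Hypothetical miniature of timestamp-based shadowing.
struct cell {
    long timestamp;
    std::string value;
};

// Merging a cell with a tombstone: the tombstone wins unless the cell
// is strictly newer. A CREATE TABLE mutation that includes a tombstone
// covering all older schema mutations therefore drops the older table.
std::optional<cell> merge(const cell& c, long tombstone_ts) {
    if (c.timestamp > tombstone_ts) {
        return c;        // cell written after the tombstone survives
    }
    return std::nullopt; // older cell is shadowed (deleted)
}
```

With this rule, only columns written by the latest CREATE TABLE survive the merge, avoiding two entries for the same partition-key position.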
Issuing two CREATE TABLE statements with a different name for one of
the partition key columns leads to the following assertion failure on
all replicas:
scylla: schema.cc:363: schema::schema(const schema::raw_schema&, std::optional<raw_view_info>): Assertion `!def.id || def.id == id - column_offset(def.kind)' failed.
The reason is that once the create table mutations are merged, the
columns table contains two entries for the same position in the
partition key tuple.
If the schemas were the same, or not conflicting in a way which leads
to abort, the current behavior would be to drop the older table as if
the last CREATE TABLE was preceded by a DROP TABLE.
The proposed fix is to make CREATE TABLE mutation include a tombstone
for all older schema changes of this table, effectively overriding
them. The behavior will be the same as if the schemas were not
different, older table will be dropped.
Fixes#11396
Patch 765d2f5e46 did not
evaluate the #if SCYLLA_BUILD_MODE directives properly
and it always matched SCYLLA_BUILD_MODE == release.
This change fixes that by defining numerical codes
for each build mode and using macro expansion to match
the define SCYLLA_BUILD_MODE against these codes.
Also, ./configure.py was changed to pass SCYLLA_BUILD_MODE
to all .cc source files, and makes sure it is defined
in build_mode.hh.
Support was added for coverage build mode,
and an #error was added if SCYLLA_BUILD_MODE
was not recognized by the #if ladder directives.
Additional checks verifying the expected SEASTAR_DEBUG
against SCYLLA_BUILD_MODE were added as well.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11387
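The root cause is that in an `#if` directive, undefined identifiers evaluate to 0, so comparing string-like tokens (e.g. `SCYLLA_BUILD_MODE == release`) is always `0 == 0` and matches every mode. A minimal sketch of the numeric-code approach (macro names and values here are illustrative; the real definitions live in build_mode.hh):

```cpp
#include <cassert>
#include <cstring>

// Numeric codes per build mode make #if comparisons meaningful,
// unlike comparing undefined identifiers (which all expand to 0).
#define SCYLLA_BUILD_MODE_RELEASE 1
#define SCYLLA_BUILD_MODE_DEV 2
#define SCYLLA_BUILD_MODE_DEBUG 3

// Normally supplied by configure.py, e.g.
// -DSCYLLA_BUILD_MODE=SCYLLA_BUILD_MODE_DEV
#ifndef SCYLLA_BUILD_MODE
#define SCYLLA_BUILD_MODE SCYLLA_BUILD_MODE_DEV
#endif

#if SCYLLA_BUILD_MODE == SCYLLA_BUILD_MODE_RELEASE
static const char* build_mode_str = "release";
#elif SCYLLA_BUILD_MODE == SCYLLA_BUILD_MODE_DEV
static const char* build_mode_str = "dev";
#elif SCYLLA_BUILD_MODE == SCYLLA_BUILD_MODE_DEBUG
static const char* build_mode_str = "debug";
#else
#error "unrecognized SCYLLA_BUILD_MODE"
#endif
```

The `#error` branch is what catches a mode that the ladder does not recognize, instead of silently falling through to the release case.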
There are async timeouts for ALTER queries. Seems related to other issues
with the driver and async.
Make these queries synchronous for now.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11394
This commit introduces the following changes to the Alternator compatibility doc:
* As of https://github.com/scylladb/scylladb/pull/11298 Alternator will return ProvisionedThroughput in DescribeTable API calls. We add the fact that tables will default to a BillingMode of PAY_PER_REQUEST (this wasn't made explicit anywhere in the docs), and that the values for RCUs/WCUs are hardcoded to 0.
* Mention the fact that ScyllaDB's (thus Alternator's) hashing function is different from AWS's proprietary implementation for DynamoDB. This is mostly an implementation detail rather than a bug, but it may cause user confusion when/if comparing the ResultSet returned from Table Scans between DynamoDB and Alternator.
Refs: https://github.com/scylladb/scylladb/issues/11222
Fixes: https://github.com/scylladb/scylladb/issues/11315
Closes#11360
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.
Fixes#11359
Commitlog imposes a limit on the size of mutations
and throws an exception if it's exceeded. In case of
schema changes before raft this exception was delivered
to the client. Now it happens while saving the raft
command in io_fiber in persistence->store_log_entries
and what the client gets is just a timeout exception,
which doesn't say much about the cause of the problem.
This patch introduces an explicit command size limit
and provides a clear error message in this case.
Closes#11318
* github.com:scylladb/scylladb:
raft, use max_command_size to satisfy commitlog limit
raft, limit for command size
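Validating the size up front gives the client a descriptive error instead of a timeout. A minimal sketch of such a check (names and the limit value are hypothetical, not Scylla's actual identifiers):

```cpp
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <string>

// Hypothetical sketch: reject oversized raft commands before they reach
// the commitlog in io_fiber, so the failure is reported synchronously.
struct command_is_too_big_error : std::invalid_argument {
    command_is_too_big_error(size_t size, size_t limit)
        : std::invalid_argument("raft command of size " + std::to_string(size) +
                                " exceeds max_command_size " + std::to_string(limit)) {}
};

void check_command_size(size_t command_size, size_t max_command_size) {
    if (command_size > max_command_size) {
        throw command_is_too_big_error(command_size, max_command_size);
    }
}
```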
Previous patches made all the callers of topology.update_endpoint()
(via token_metadata.update_topology()) provide correct dc/rack info
for the endpoint. It's now possible to stop using global snitch by
topology and just rely on the dc/rack argument.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
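The reshaped interface can be pictured with a simplified sketch (types and names below are illustrative stand-ins, not the real topology class): the caller supplies the dc/rack pair, and the topology no longer consults the global snitch.

```cpp
#include <cassert>
#include <map>
#include <string>

// Hypothetical miniature: dc/rack is provided by the caller.
struct endpoint_dc_rack {
    std::string dc;
    std::string rack;
};

class topology {
    std::map<std::string, endpoint_dc_rack> _endpoints;
public:
    // No global snitch lookup here; the caller knows the correct values.
    void update_endpoint(const std::string& ep, endpoint_dc_rack dr) {
        _endpoints[ep] = std::move(dr);
    }
    const endpoint_dc_rack& get_location(const std::string& ep) const {
        return _endpoints.at(ep);
    }
};
```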
There's a test that's sensitive to correct dc/rack info for testing
entries. To populate them it uses global rack-inferring snitch instance
or a special "testing" snitch. To make it continue working add a helper
that would populate the topology properly (spoiler: next branch will
replace it with explicitly populated topology object).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When snitch reconfigures (gossiper-property-file one) it kicks storage
service so that it updates itself. This place also needs to update the
dc/rack info about itself, the correct (new) values are taken from the
snitch itself.
There's a bug here -- the system.local table is not updated with new data
until restart.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When a node starts it loads the information about peers from
system.peers table and populates token metadata and topology with this
information. The dc/rack are taken from the sys-ks cache here.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When a node is started to replace another node it updates token metadata
and topology with the target information early. The tokens are now taken
from the gossiper shadow round; this patch does the same for dc/rack info.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When a node is notified about other nodes' state changes it may want to
update the topology information about them. In all those places the
dc/rack info about the peer is provided by the gossiper.
Basically, these updates mirror the relevant updates of tokens on the
token metadata object.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When a node starts it adds itself to the topology. Mostly it's done in
the storage_service::join_cluster() and whoever it calls. In all those
places the dc/rack for the added node is taken from the system keyspace
(its cache was populated with local dc/rack by the previous patch).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a cache of endpoint:{dc,rack} on system keyspace cache, but the
local node is not there, because this data is populated from the peers
table, while local node's dc/rack is in snitch (or system.local table).
At the same time, storage_service::join_cluster() and whoever it calls
(e.g. -- the repair) will need this info on start and it's convenient
to have this data on sys-ks cache.
It's not on the peers part of the cache because the next branch removes this
map and it's going to be very clumsy to have a whole container with just
one entry in it.
There's a piece of code in system_keyspace::setup() that gets the local
node's dc/rack and commits it into the system.local table. However, putting
the data into the cache is done on .start(). This is because cql-test-env
needs this data cached too, but it doesn't call sys_ks.setup(). Will be
cleaned up some other day.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The topology::update_endpoint() is now a plain wrapper over the private
::add_endpoint() method of the same class. It's simpler to merge them.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method in question tries to be on the safest side and adds the
endpoint for which it updates the tokens into the topology. From now on
it's up to the caller to put the endpoint into the topology in advance.
So most of what this patch does is places topology.update_endpoint()
into the relevant places of the code.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method in question populates topology's internal maps with endpoint
vs dc/rack relations. As for today the dc/rack values are taken from the
global snitch object (which, in turn, goes to gossiper, system keyspace
and its internal non-updateable cache for that).
This patch prepares the ground for providing the dc/rack externally via
an argument. For now it's just an argument with empty strings, but the
next patches will populate it with real values (spoiler: in 99% of cases
it's the storage service that calls this method and each call will know
where to get it from for sure).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The method creates a copy of token metadata and pushes an endpoint (with
some tokens) into it. Next patches will require providing dc/rack info
together with the endpoint, this patch prepares for that.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Both classes may populate (temporary clones of) the token metadata object
with endpoint:tokens pairs for the endpoint they work with. Next patches
will require that endpoint comes with the dc/rack info. This patch makes
sure dht classes have the necessary information at hand (for now it's
just empty pair of strings).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
scylla_read_stats is not up-to-date w.r.t. the type of
`reader_concurrency_semaphore::_permit_list`, which was changed in 4.6.
Bring it up-to-date, keeping it backwards compatible with 4.5 and older
releases.
The `_M_str` member of an `std::string_view` is not guaranteed to be a
valid C string (e.g. be null terminated). Printing it directly often
resulted in printing partial strings or gibberish, affecting in
particular the semaphore diagnostics dumps (scylla read-stats).
Use a more reliable method: read `_M_len` amount of bytes from `_M_str`
and decode as UTF-8.
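The same rule applies in C++ itself: a string_view frequently refers into the middle of a larger buffer, so rendering it must read exactly size() bytes rather than scan for a terminator. A tiny sketch of that invariant (the gdb script does the analogue in Python by reading `_M_len` bytes from `_M_str`):

```cpp
#include <cassert>
#include <string>
#include <string_view>

// data() is not null-terminated in general; copy a length-bounded range.
std::string render(std::string_view sv) {
    return std::string(sv.data(), sv.size());
}
```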
This patch fixes the regression introduced by 3a51e78 which broke
a very important contract: perftune.yaml should not be "touched"
by Scylla scriptology unless explicitly requested.
And a call for scylla_cpuset_setup is such an explicit request.
The issue that the offending patch was intending to fix was that
cpuset.conf was always generated anew for every call of
scylla_cpuset_setup - even if a resulting cpuset.conf would come
out exactly the same as the one present on the disk before that call.
And since the original code was following the contract mentioned above
it was also deleting perftune.yaml every time too.
However, this was just an unavoidable side-effect of that cpuset.conf
re-generation.
The above also means that if scylla_cpuset_setup doesn't write to cpuset.conf
we should not "touch" perftune.yaml and vice versa.
This patch implements exactly that together with reverting the dangerous
logic introduced by 3a51e78.
Fixes#11385
Fixes#10121
Modern perftune.py supports a more generic way of defining IRQ CPUs:
'irq_cpu_mask'.
This patch makes our auto-generation code create a perftune.yaml
that uses this new parameter instead of using outdated 'mode'.
As a side effect, this change eliminates the notion of "incorrect"
value in cpuset.conf - every value is valid now as long as it fits into
the 'all' CPU set of the specific machine.
Auto-generated 'irq_cpu_mask' is going to include all bits from 'all'
CPU mask except those defined in cpuset.conf.
Fixes#9903
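The mask derivation described above is a simple set complement. A sketch under simplifying assumptions (function names are hypothetical, and 64-bit values stand in for the arbitrary-width hex mask strings the real perftune.py works with):

```cpp
#include <cassert>
#include <cstdint>

// IRQ CPUs are all CPUs of the machine minus those reserved for Scylla
// in cpuset.conf.
uint64_t derive_irq_cpu_mask(uint64_t all_cpus_mask, uint64_t scylla_cpuset_mask) {
    return all_cpus_mask & ~scylla_cpuset_mask;
}

// Any cpuset value is valid as long as it fits into the machine's CPU set.
bool cpuset_fits(uint64_t all_cpus_mask, uint64_t scylla_cpuset_mask) {
    return (scylla_cpuset_mask & ~all_cpus_mask) == 0;
}
```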
Currently SCYLLA_BUILD_MODE is defined as a string by the cxxflags
generated by configure.py. This is not very useful since one cannot use
it in an #if preprocessor directive.
Instead, use -DSCYLLA_BUILD_MODE=release, for example, and define
SCYLLA_BUILD_MODE_STR as the string representation of it.
In addition, define the respective
SCYLLA_BUILD_MODE_{RELEASE,DEV,DEBUG,SANITIZE} macros that can be easily
used in #ifdef (or #ifndef :)) for conditional compilation.
The planned use case for it is to enable a task_manager test module only
in non-release modes.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11357
Currently, if a keyspace has an aggregate and the keyspace
is dropped, the keyspace becomes corrupted and another keyspace
with the same name cannot be created again
This is caused by the fact that when removing an aggregate, we
call create_aggregate() to get values for its name and signature.
In the create_aggregate(), we check whether the row and final
functions for the aggregate exist.
Normally, that's not an issue, because when dropping an existing
aggregate alone, we know that its UDFs also exist. But when dropping
an entire keyspace, we first drop the UDFs, making us unable to drop
the aggregate afterwards.
This patch fixes this behavior by removing the create_aggregate()
from the aggregate dropping implementation and replacing it with
specific calls for getting the aggregate name and signature.
Additionally, a test that would previously fail is added to
cql-pytest/test_uda.py where we drop a keyspace with an aggregate.
Fixes#11327
Closes#11375
Changing configuration involves two entries in the log: a 'joint
configuration entry' and a 'non-joint configuration entry'. We use
`wait_for_entry` to wait on the joint one. To wait on the non-joint one,
we use a separate promise field in `server`. This promise wasn't
connected to the `abort_source` passed into `set_configuration`.
The call could get stuck if the server got removed from the
configuration and lost leadership after committing the joint entry but
before committing the non-joint one, waiting on the promise. Aborting
wouldn't help. Fix this by subscribing to the `abort_source` and
resolving the promise exceptionally on abort.
Furthermore, make sure that two `set_configuration` calls don't step on
each other's toes by one setting the other's promise. To do that, reset
the promise field at the end of `set_configuration` and check that it's
not engaged at the beginning.
Fixes#11288.
Closes#11325
* github.com:scylladb/scylladb:
test: raft: randomized_nemesis_test: additional logging
raft: server: handle aborts when waiting for config entry to commit
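The fix can be sketched with simplified stand-ins for seastar's abort_source and promise (the real API uses subscriptions returning guards, futures, and exceptions; everything below is an illustrative toy): the waiter for the non-joint configuration entry must also resolve when the caller aborts.

```cpp
#include <cassert>
#include <functional>
#include <optional>
#include <vector>

// Toy abort_source: callbacks run on abort (or immediately if already aborted).
struct abort_source {
    bool aborted = false;
    std::vector<std::function<void()>> subscriptions;
    void subscribe(std::function<void()> cb) {
        if (aborted) { cb(); } else { subscriptions.push_back(std::move(cb)); }
    }
    void request_abort() {
        aborted = true;
        for (auto& cb : subscriptions) cb();
        subscriptions.clear();
    }
};

enum class wait_result { committed, aborted };

struct non_joint_waiter {
    std::optional<wait_result> result; // the "promise" field in server
    void start(abort_source& as) {
        // Without this subscription the waiter could hang forever if the
        // server loses leadership between the joint and non-joint entries.
        as.subscribe([this] { if (!result) result = wait_result::aborted; });
    }
    void on_non_joint_committed() {
        if (!result) result = wait_result::committed;
    }
};
```

Resolving the promise at most once also models the "don't step on each other's toes" part: whichever outcome arrives first wins, and the later one is a no-op.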
"
On token_metadata there are two update_normal_tokens() overloads --
one updates tokens for a single endpoint, another one -- for a set
(well -- std::map) of them. Other than updating the tokens both
methods also may add an endpoint to the t.m.'s topology object.
There's an ongoing effort in moving the dc/rack information from
snitch to topology, and one of the changes made in it is -- when
adding an entry to topology, the dc/rack info should be provided
by the caller (which is in 99% of the cases is the storage service).
The batched tokens update is extremely unfriendly to the latter
change. Fortunately, this helper is only used by tests, the core
code always uses fine-grained tokens updating.
"
* 'br-tokens-update-relax' of https://github.com/xemul/scylla:
token_metadata: Indentation fix after previous patch
token_metadata: Remove excessive empty tokens check
token_metadata: Remove batch tokens updating method
tests: Use one-by-one tokens updating method
Some cases in test_wasm.py assumed that all cases
are run in the same order every time and depended
on values that should have been added to tables in
previous cases. Because of that, they were sometimes
failing. This patch removes this assumption by
adding the missing inserts to the affected cases.
Additionally, an assert that confirms a low miss
rate of UDFs is made more precise, and a comment is
added to explain it clearly.
Closes#11367
It could happen that we accessed the failure detector service after it was
stopped if a reconfiguration happened at the 'right' moment. This would
result in an assertion failure. Fix this.
Closes#11326
Start with a cluster with Raft disabled, end up with a cluster that performs
schema operations using group 0.
Design doc:
https://docs.google.com/document/d/1PvZ4NzK3S0ohMhyVNZZ-kCxjkK5URmz1VP65rrkTOCQ/
(TODO: replace this with .md file - we can do it as a follow-up)
The procedure, on a high level, works as follows:
- join group 0
- wait until every peer joined group 0 (peers are taken from `system.peers`
table)
- enter `synchronize` upgrade state, in which group 0 operations are disabled
- wait until all members of group 0 entered `synchronize` state or some member
entered the final state
- synchronize schema by comparing versions and pulling if necessary
- enter the final state (`use_new_procedures`), in which group 0 is used for
schema operations.
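The state progression above can be sketched as an enum with forward-only transitions. The identifiers below follow this description (plus the recovery mode introduced next); the actual enum in the source may use different names:

```cpp
#include <cassert>

// Sketch of the upgrade state machine described above.
enum class group0_upgrade_state {
    use_pre_raft_procedures, // initial: schema changes via old path
    synchronize,             // group 0 operations disabled, schema synced
    use_new_procedures,      // final: group 0 used for schema operations
    recovery,                // manual escape hatch, set by the admin
};

// Upgrade only moves forward; the final and recovery states are not
// left automatically.
bool can_advance(group0_upgrade_state from, group0_upgrade_state to) {
    if (from == group0_upgrade_state::use_pre_raft_procedures) {
        return to == group0_upgrade_state::synchronize;
    }
    if (from == group0_upgrade_state::synchronize) {
        return to == group0_upgrade_state::use_new_procedures;
    }
    return false;
}
```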
With the procedure comes a recovery mode in case the upgrade procedure gets
stuck (and it may if we lose a node during upgrade - the procedure, to
correctly establish a single group 0 cluster, requires contacting every node).
This recovery mode can also be used to recover clusters with group 0 already
established if they permanently lose a majority of nodes - killing two birds with
one stone. Details in the last commit message.
Read the design doc, then read the commits in topological order
for best reviewing experience.
---
I did some manual tests: upgrading a cluster, using the cluster to add nodes,
remove nodes (both with `decommission` and `removenode`), replacing nodes.
Performing recovery.
As a follow-up, we'll need to implement tests using the new framework (after
it's ready). It will be easy to test upgrades and recovery even with a single
Scylla version - we start with a cluster with the RAFT flag disabled, then
rolling-restart while enabling the flag (and recovery is done through simple
CQL statements).
Closes#10835
* github.com:scylladb/scylladb:
service/raft: raft_group0: implement upgrade procedure
service/raft: raft_group0: extract `tracker` from `persistent_discovery::run`
service/raft: raft_group0: introduce local loggers for group 0 and upgrade
service/raft: raft_group0: introduce GET_GROUP0_UPGRADE_STATE verb
service/raft: raft_group0_client: prepare for upgrade procedure
service/raft: introduce `group0_upgrade_state`
db: system_keyspace: introduce `load_peers`
idl-compiler: introduce cancellable verbs
message: messaging_service: cancellable version of `send_schema_check`
After the previous patch, empty passed tokens make the helper co_return
early, so this `if` is dead code.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
No users left.
The endpoint_tokens.empty() check is removed, only tests could trigger
it, but they didn't and are patched out.
Indentation is left broken
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Tests are the only users of batch tokens updating "sugar" which
actually makes things more complicated
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The get_pending_address_ranges() overload accepting a single token is not
in use; its peer that accepts a set of tokens is.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes#11358
Currently the state of LSA is scattered across a handful of global variables. This series consolidates all these into a single one: the shard tracker. Beyond reducing the number of globals (the fewer globals, the better) this paves the way for a planned de-globalization of the shard tracker itself.
There is one separate global left, the static migrators registry. This is left as-is for now.
Closes#11284
* github.com:scylladb/scylladb:
utils/logalloc: remove reclaim_timer:: globals
utils/logalloc: make s_sanitizer_report_backtrace global a member of tracker
utils/logalloc: tracker_reclaimer_lock: get shard tracker via constructor arg
utils/logalloc: move global stat accessors to tracker
utils/logalloc: allocating_section: don't use the global tracker
utils/logalloc: pass down tracker::impl reference to segment_pool
utils/logalloc: move segment pool into tracker
utils/logalloc: add tracker member to basic_region_impl
utils/logalloc: make segment independent of segment pool
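The consolidation pattern looks roughly like this (member and function names are illustrative, not logalloc's real ones): state that used to live in file-scope globals becomes members of the tracker, leaving a single access point that can later be de-globalized.

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical sketch of moving free-standing globals into the tracker.
class tracker {
    size_t _segments_allocated = 0; // formerly a file-scope global
    size_t _memory_reclaimed = 0;   // formerly a file-scope global
public:
    void on_segment_allocated() { ++_segments_allocated; }
    void on_reclaim(size_t bytes) { _memory_reclaimed += bytes; }
    size_t segments_allocated() const { return _segments_allocated; }
    size_t memory_reclaimed() const { return _memory_reclaimed; }
};

// The one remaining global: the shard-local tracker instance.
tracker& shard_tracker() {
    static thread_local tracker t;
    return t;
}
```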
Aborting too soon on ENOSPC is too harsh, leading to loss of
availability of the node for reads, while restarting it won't
solve the ENOSPC condition.
Fixes#11245
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11246
When `io_fiber` fetched a batch with a configuration that does not
contain this node, it would send the entries committed in this batch to
`applier_fiber` and proceed by any remaining entry dropping waiters (if
the node was no longer a leader).
If there were waiters for entries committed in this batch, it could
either happen that `applier_fiber` received and processed those entries
first, notifying the waiters that the entries were committed and/or
applied, or it could happen that `io_fiber` reaches the dropping waiters
code first, causing the waiters to be resolved with
`commit_status_unknown`.
The second scenario is undesirable. For example, when a follower tries
to remove the current leader from the configuration using
`modify_config`, if the second scenario happens, the follower will get
`commit_status_unknown` - this can happen even though there are no node
or network failures. In particular, this caused
`randomized_nemesis_test.remove_leader_with_forwarding_finishes` to fail
from time to time.
Fix it by serializing the notifying and dropping of waiters in a single
fiber - `applier_fiber`. We decided to move all management of waiters
into `applier_fiber`, because most of that management was already there
(there was already one `drop_waiters` call, and two `notify_waiters`
calls). Now, when `io_fiber` observes that we've been removed from the
config and are no longer a leader, instead of dropping waiters, it sends a
message to `applier_fiber`. `applier_fiber` will drop waiters when
receiving that message.
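Serializing the notify/drop operations through one consumer can be sketched as a message queue with an explicit "drop waiters" variant (a simplified toy; the real code uses seastar fibers and futures, and the types below are illustrative):

```cpp
#include <cassert>
#include <deque>
#include <variant>
#include <vector>

// io_fiber no longer drops waiters itself; it enqueues a message, so
// applier_fiber handles "entries committed" and "drop remaining waiters"
// strictly in order.
struct entries_batch { std::vector<int> indices; }; // committed entries
struct removed_from_config {};                      // request to drop waiters
using applier_message = std::variant<entries_batch, removed_from_config>;

enum class waiter_status { waiting, committed, commit_status_unknown };

struct applier {
    std::vector<waiter_status> waiters; // one waiter per log index (toy model)
    void process(const applier_message& m) {
        if (auto* batch = std::get_if<entries_batch>(&m)) {
            for (int idx : batch->indices) {
                waiters.at(idx) = waiter_status::committed;
            }
        } else { // removed_from_config: drop whoever is still waiting
            for (auto& w : waiters) {
                if (w == waiter_status::waiting) {
                    w = waiter_status::commit_status_unknown;
                }
            }
        }
    }
};
```

Because the committed batch is enqueued before the drop request, a waiter for an entry in that batch is always notified of the commit rather than resolved with `commit_status_unknown`.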
Improve an existing test to reproduce this scenario more frequently.
Fixes#11235.
Closes#11308
* github.com:scylladb/scylladb:
test: raft: randomized_nemesis_test: more chaos in `remove_leader_with_forwarding_finishes`
raft: server: drop waiters in `applier_fiber` instead of `io_fiber`
raft: server: use `visit` instead of `holds_alternative`+`get`
Fixes#11349
In 7396de7 (and refactorings before it) the set of prioritized keyspaces (and processing thereof)
was removed, due to apparent non-usage (which is true for open-source version).
This functionality is however required for certain features of the enterprise version (ear).
As such it needs to be restored and reenabled. This patch set does so, adapted
to the recent version of this file.
Closes#11350
* github.com:scylladb/scylladb:
distributed_loader: Restore separate processing of keyspace init prio/normal
Revert "distributed_loader: Remove unused load-prio manipulations"
A listener is created inside `raft_group0` for acting when the
SUPPORTS_RAFT feature is enabled. The listener is established after the
node enters NORMAL status (in `raft_group0::finish_setup_after_join()`,
called at the end of `storage_service::join_cluster()`).
The listener starts the `upgrade_to_group0` procedure.
The procedure, on a high level, works as follows:
- join group 0
- wait until every peer joined group 0 (peers are taken from
`system.peers` table)
- enter `synchronize` upgrade state, in which group 0 operations are
disabled (see earlier commit which implemented this logic)
- wait until all members of group 0 entered `synchronize` state or some
member entered the final state
- synchronize schema by comparing versions and pulling if necessary
- enter the final state (`use_new_procedures`), in which group 0 is used
for schema operations (only those for now).
The devil lies in the details, and the implementation is ugly compared
to this nice description; for example there are many retry loops for
handling intermittent network failures. Read the code.
`leave_group0` and `remove_group0` were adjusted to handle the upgrade
procedure being run correctly; if necessary, they will wait for the
procedure to finish.
If the upgrade procedure gets stuck (and it may, since it requires all
nodes to be available to contact them to correctly establish a single
group 0 raft cluster); or if a running cluster permanently loses a
majority of nodes, causing group 0 unavailability; the cluster admin
is not left without help.
We introduce a recovery mode, which allows the admin to
completely get rid of traces of existing group 0 and restart the
upgrade procedure - which will establish a new group 0. This works even
in clusters that never upgraded but were bootstrapped using group 0 from
scratch.
To do that, the admin does the following on every node:
- writes 'recovery' under 'group0_upgrade_state' key
in `system.scylla_local` table,
- truncates the `system.discovery` table,
- truncates the `system.group0_history` table,
- deletes group 0 ID and group 0 server ID from `system.scylla_local`
(the keys are `raft_group0_id` and `raft_server_id`);
then the admin performs a rolling restart of their cluster. The nodes
restart in a "group 0 recovery mode", which simply means that the nodes
won't try to perform any group 0 operations. Then the admin calls
`removenode` to remove the nodes that are down. Finally, the admin
removes the `group0_upgrade_state` key from `system.scylla_local`,
rolling-restarts the cluster, and the cluster should establish group 0
anew.
Note that this recovery procedure will have to be extended when new
stuff is added to group 0 - like topology change state. Indeed, observe
that a minority of nodes aren't able to receive committed entries from a
leader, so they may end up in inconsistent group 0 states. It wouldn't
be safe to simply create group 0 on those nodes without first ensuring
that they have the same state from which group 0 will start.
Right now the state consists only of schema tables, and the upgrade
procedure makes sure to synchronize them, so even if the nodes started
in inconsistent schema states, group 0 will be established correctly.
(TODO: create a tracking issue? something needs to remind us of this
whenever we extend group 0 with new stuff...)
Add some more logging to `randomized_nemesis_test` such as logging the
start and end of a reconfiguration operation in a way that makes it easy
to find one given the other in the logs.
Changing configuration involves two entries in the log: a 'joint
configuration entry' and a 'non-joint configuration entry'. We use
`wait_for_entry` to wait on the joint one. To wait on the non-joint one,
we use a separate promise field in `server`. This promise wasn't
connected to the `abort_source` passed into `set_configuration`.
The call could get stuck if the server got removed from the
configuration and lost leadership after committing the joint entry but
before committing the non-joint one, waiting on the promise. Aborting
wouldn't help. Fix this by subscribing to the `abort_source` and
resolving the promise exceptionally on abort.
Furthermore, make sure that two `set_configuration` calls don't step on
each other's toes by one setting the other's promise. To do that, reset
the promise field at the end of `set_configuration` and check that it's
not engaged at the beginning.
Fixes#11288.
Fixes#11349
In 7396de7 (and refactorings before it) the set of prioritized keyspaces (and processing thereof)
was removed due to apparent non-usage (which is true for the open-source version).
This functionality is, however, required for certain features of the enterprise version (ear).
As such, it needs to be restored and re-enabled. This patch, together with the revert before it,
does so, adapted to the recent version of this file.
This reverts commit 7396de72b1.
In 7396de7 (and refactorings before it) the set of prioritized keyspaces (and processing thereof)
was removed due to apparent non-usage (which is true for the open-source version).
This functionality is, however, required for certain features of the enterprise version (ear).
As such, it needs to be restored and re-enabled. This reverts the actual commit; the patch
after it ensures we use the prio set.
This series turns plan_id from a generic UUID into a strong type so it can't be used interchangeably with other UUIDs.
While at it, streaming/stream_fwd.hh was added for forward declarations and the definition of plan_id.
Also, `stream_manager::update_progress` parameter name was renamed to plan_id to represent its assumed content, before changing its type to `streaming::plan_id`.
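The strong-typing idea can be sketched in Python terms. This is a toy illustration: `PlanId`, `TableId`, and `update_progress` below are stand-ins, not Scylla's actual API (the C++ code uses a tagged UUID template to get the same compile-time distinction).

```python
# A distinct type per id kind, so a plan id cannot be passed where a
# table id is expected. NewType gives static (type-checker) enforcement
# only; Scylla's C++ tagged_uuid enforces this at compile time.
import uuid
from typing import NewType

PlanId = NewType("PlanId", uuid.UUID)
TableId = NewType("TableId", uuid.UUID)

def update_progress(plan_id: PlanId, bytes_streamed: int) -> str:
    # A type checker flags update_progress(TableId(...), ...) as an
    # error, even though both wrap the same underlying UUID.
    return f"plan {plan_id}: {bytes_streamed} bytes"
```

A type checker such as mypy rejects mixing the two types, while the runtime representation stays a plain UUID.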
Closes#11338
* github.com:scylladb/scylladb:
streaming: define plan_id as a strong tagged_uuid type
stream_manager: update_progress: rename cf_id param to plan_id
streaming: add forward declarations in stream_fwd.hh
Commitlog imposes a limit on the size of mutations
and throws an exception if it's exceeded. In case of
schema changes before raft this exception was delivered
to the client. Now it happens while saving the raft
command in io_fiber in persistence->store_log_entries
and what the client gets is just a timeout exception,
which doesn't say much about the cause of the problem.
This patch introduces an explicit command size limit
and provides a clear error message in this case.
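A minimal sketch of the approach, assuming illustrative names and an illustrative limit (not Scylla's actual values): validate the serialized command size up front and fail with a descriptive error, instead of letting the commitlog write fail later and surface as a bare timeout.

```python
# Illustrative limit; Scylla derives its actual limit from the
# commitlog configuration.
MAX_COMMAND_SIZE = 1024 * 1024

class CommandTooLarge(Exception):
    pass

def check_command_size(serialized: bytes, limit: int = MAX_COMMAND_SIZE) -> None:
    """Reject an oversized raft command before it is submitted, with an
    error message that names the actual cause of the failure."""
    if len(serialized) > limit:
        raise CommandTooLarge(
            f"raft command size {len(serialized)} exceeds the limit of {limit} bytes")
```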
Reversing the whole range_tombstone_list
into reversed_range_tombstones is inefficient
and can lead to reactor stalls with a large number of
range tombstones.
Instead, iterate over the range_tombstone_list in reverse
direction and reverse each range_tombstone as we go,
keeping the result in the optional cookie.reversed_rt member.
While at it, this series contains some other cleanups on this path
to improve code readability and maybe make the compiler's life
easier when optimizing the cleaned-up code.
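The difference between the two approaches can be modeled with a toy Python sketch (names and types are illustrative, not Scylla's):

```python
from typing import Iterator, NamedTuple

class RangeTombstone(NamedTuple):
    start: int  # clustering bounds in forward order
    end: int

def reversed_tombstones(rts: list[RangeTombstone]) -> Iterator[RangeTombstone]:
    """Walk the list backwards and reverse one tombstone at a time.

    Unlike eagerly materializing a fully reversed copy (O(n) work in one
    uninterruptible pass, which can stall the reactor for a large list),
    this yields each reversed tombstone lazily, so the caller can keep
    only the current one around (the cookie.reversed_rt member)."""
    for rt in reversed(rts):
        # In reversed order, the forward end bound becomes the start.
        yield RangeTombstone(start=rt.end, end=rt.start)
```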
Closes#11271
* github.com:scylladb/scylladb:
mutation: consume_clustering_fragments: get rid of reversed_range_tombstones;
mutation: consume_clustering_fragments: reindent
mutation: consume_clustering_fragments: shuffle emit_rt logic around
mutation: consume, consume_gently: simplify partition_start logic
mutation: consume_clustering_fragments: pass iterators to mutation_consume_cookie ctor
mutation: consume_clustering_fragments: keep the reversed schema in cookie
mutation: clustering_iterators: get rid of current_rt
mutation_test: test_mutation_consume_position_monotonicity: test also consume_gently
We want to consolidate all the logalloc state into a single object: the
shard tracker. Replacing this global with a member in said object is
part of this effort.
These are pretend free functions, accessing globals in the background;
make them members of the tracker instead, which has everything needed
locally to compute them. Callers still have to access these stats
through the global tracker instance, but this can be changed to happen
through a local instance. Soon....
Instead, get the tracker instance from the region. This requires adding
a `region&` parameter to `with_reserve()`.
This brings us one step closer to eliminating the global tracker.
Instead of a separate global segment pool instance, make it a member of
the already global tracker. Most users are inside the tracker instance
anyway. Outside users can access the pool through the global tracker
instance.
For now this member is initialized from the global tracker instance. But
it allows the members of region impl to be detached from said global,
making a step towards removing it.
segment has some member functions which simply forward the call to a
segment_pool method via the global segment_pool instance. Remove these
and make the callers use the segment pool directly instead.
Topology tests do async requests using the Python driver. The driver's
API for async doesn't use the session timeout.
Pass 60 seconds timeout (default is 10) to match the session's.
Fixes https://github.com/scylladb/scylladb/issues/11289
Closes #11348
* github.com:scylladb/scylladb:
test.py: bump schema agreement timeout for topology tests
test.py: bump timeout of async requests for topology
test.py: fix bad indent
Currently, frozen_mutation is not consumed in position_in_partition
order as all range tombstones are consumed before all rows.
This violates the range_tombstone_generator invariants
as its lower_bound needs to be monotonically increasing.
Fix this by adding mutation_partition_view::accept_ordered
and rewriting do_accept_gently to do the same,
both making sure to consume the range tombstones
and clustering rows in position_in_partition order,
similar to the mutation consume_clustering_fragments function.
Add a unit test that verifies that.
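Conceptually, the in-order consumption is a merge of two position-sorted fragment streams. A toy sketch of the idea, with plain integers standing in for Scylla's position_in_partition:

```python
import heapq

def consume_in_order(rows, range_tombstones):
    """Merge two position-sorted fragment streams so the consumer sees
    fragments with monotonically increasing positions, instead of all
    range tombstones first and all clustering rows after.

    Each fragment is a (position, kind) pair; both inputs must already
    be sorted by position, as they are in a mutation partition."""
    return list(heapq.merge(rows, range_tombstones, key=lambda f: f[0]))
```

This is what lets the range_tombstone_generator's lower_bound stay monotonically increasing.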
Fixes #11198
Closes #11269
* github.com:scylladb/scylladb:
mutation_partition_view: make mutation_partition_view_virtual_visitor stoppable
frozen_mutation: consume and consume_gently in-order
frozen_mutation: frozen_mutation_consumer_adaptor: rename rt to rtc
frozen_mutation: frozen_mutation_consumer_adaptor: return early when flush returns stop_iteration::yes
frozen_mutation: frozen_mutation_consumer_adaptor: consume static row unconditionally
frozen_mutation: frozen_mutation_consumer_adaptor: flush current_row before rt_gen
Topology tests do async requests using the Python driver. The driver's
API for async doesn't use the session timeout.
Pass 60 seconds timeout (default is 10) to match the session's.
This will hopefully fix timeout failures in debug mode.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
So that the frozen_mutation consumer can return
stop_iteration::yes if it wishes to stop consuming at
some clustering position.
In this case, on_end_of_partition must still be called
so a closing range_tombstone_change can be emitted to the consumer.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently, frozen_mutation is not consumed in position_in_partition
order as all range tombstones are consumed before all rows.
This violates the range_tombstone_generator invariants
as its lower_bound needs to be monotonically increasing.
Fix this by adding mutation_partition_view::accept_ordered
and rewriting do_accept_gently to do the same,
both making sure to consume the range tombstones
and clustering rows in position_in_partition order,
similar to the mutation consume_clustering_fragments function.
Add a unit test that verifies that.
Fixes#11198
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Improve the randomness of this test, making it a bit easier to
reproduce the scenarios that the test aims to catch.
Increase timeouts a bit to account for this additional randomness.
When `io_fiber` fetched a batch with a configuration that does not
contain this node, it would send the entries committed in this batch to
`applier_fiber` and then proceed to drop the waiters for any remaining
entries (if the node was no longer a leader).
If there were waiters for entries committed in this batch, it could
either happen that `applier_fiber` received and processed those entries
first, notifying the waiters that the entries were committed and/or
applied, or it could happen that `io_fiber` reaches the dropping waiters
code first, causing the waiters to be resolved with
`commit_status_unknown`.
The second scenario is undesirable. For example, when a follower tries
to remove the current leader from the configuration using
`modify_config`, if the second scenario happens, the follower will get
`commit_status_unknown` - this can happen even though there are no node
or network failures. In particular, this caused
`randomized_nemesis_test.remove_leader_with_forwarding_finishes` to fail
from time to time.
Fix it by serializing the notifying and dropping of waiters in a single
fiber - `applier_fiber`. We decided to move all management of waiters
into `applier_fiber`, because most of that management was already there
(there was already one `drop_waiters` call, and two `notify_waiters`
calls). Now, when `io_fiber` observes that we've been removed from the
config and are no longer a leader, instead of dropping waiters, it sends a
message to `applier_fiber`. `applier_fiber` will drop waiters when
receiving that message.
Fixes#11235.
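The "single fiber owns all waiter management" idea can be modeled with asyncio. This is a toy sketch under assumed names (Scylla's fibers are Seastar futures, not asyncio tasks): one fiber consumes both commit notifications and the drop request from a single queue, so a drop can never overtake the notification for an entry committed before it.

```python
import asyncio

async def applier_fiber(queue: asyncio.Queue, waiters: dict):
    """waiters maps a log index to a future its client is waiting on.
    Because notifications and drops go through one queue consumed here,
    they are applied strictly in the order they were enqueued."""
    while True:
        msg = await queue.get()
        if msg == "drop":
            # We were removed from the config: remaining waiters can only
            # learn that their commit status is unknown.
            for fut in waiters.values():
                fut.set_result("commit_status_unknown")
            waiters.clear()
            return
        # msg is the index of the latest committed entry.
        for idx in [i for i in waiters if i <= msg]:
            waiters.pop(idx).set_result("committed")
```

In the buggy scheme, the drop ran on a separate fiber (`io_fiber`) and could resolve a waiter with `commit_status_unknown` even though its entry had already been committed.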
In the `std::holds_alternative`+`std::get` version, the `get` performs a
redundant check. Also `std::visit` gives a compile-time exhaustiveness
check (whether we handled all possible cases of the `variant`).
Reversing the whole range_tombstone_list
into reversed_range_tombstones is inefficient
and can lead to reactor stalls with a large number of
range tombstones.
Instead, iterate over the range_tombstone_list in reverse
direction and reverse each range_tombstone as we go,
keeping the result in the optional cookie.reversed_rt member.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
DescribeTable is currently hard-coded to return PAY_PER_REQUEST billing
mode. Nevertheless, even in PAY_PER_REQUEST mode, the DescribeTable
operation must return a ProvisionedThroughput structure, listing both
ReadCapacityUnits and WriteCapacityUnits as 0. This requirement is not
stated in some DynamoDB documentation but is explicitly mentioned in
https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_ProvisionedThroughput.html
Also, empirically, DynamoDB returns ProvisionedThroughput with zeros
even in PAY_PER_REQUEST mode. We even had an xfailing test to confirm this.
The ProvisionedThroughput structure being missing was a problem for
applications like DynamoDB connectors for Spark, if they implicitly
assume that ProvisionedThroughput is returned by DescribeTable, and
fail (as described in issue #11222) if it's outright missing.
So this patch adds the missing ProvisionedThroughput structure, and
the xfailing test starts to pass.
Note that this patch doesn't change the fact that attempting to set
a table to PROVISIONED billing mode is ignored: DescribeTable continues
to always return PAY_PER_REQUEST as the billing mode and zero as the
provisioned capacities.
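The shape of the fixed response can be sketched as follows. Only the fields discussed above are shown; a real DescribeTable response contains many more, and `BillingModeSummary` is included here as an assumption about the response layout.

```python
def describe_table_stub(table_name: str) -> dict:
    """Even under PAY_PER_REQUEST billing, DescribeTable must include a
    ProvisionedThroughput structure, with both capacity units at 0."""
    return {
        "Table": {
            "TableName": table_name,
            "BillingModeSummary": {"BillingMode": "PAY_PER_REQUEST"},
            "ProvisionedThroughput": {
                "ReadCapacityUnits": 0,
                "WriteCapacityUnits": 0,
            },
        }
    }
```

Clients like the DynamoDB Spark connector read ProvisionedThroughput unconditionally, which is why omitting it broke them.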
Fixes#11222
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11298
On recent versions of systemd, StandardOutput=syslog is obsolete.
We should use StandardOutput=journal instead, but since it is the
default value, we can just drop it.
Fixes #11322
Closes #11339
Provides separate control over debuginfo for perf tests,
since enabling --tests-debuginfo currently affects both,
causing the Jenkins archives of perf-test binaries to
inflate considerably.
Refs https://github.com/scylladb/scylla-pkg/issues/3060
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes#11337
Before changing its type to streaming::plan_id
this patch clarifies that the parameter actually represents
the plan id and not the table id as its name suggests.
For reference, see the call to update_progress in
`stream_transfer_task::execute`, as well as the function
using _stream_bytes, whose map key is the plan id.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
from Tomasz Grabiec
This series fixes lack of mutation associativity which manifests as
sporadic failures in
row_cache_test.cc::test_concurrent_reads_and_eviction due to differences
in mutations applied and read.
No known production impact.
Refs https://github.com/scylladb/scylladb/issues/11307
Closes #11312
* github.com:scylladb/scylladb:
test: mutation_test: Add explicit test for mutation commutativity
test: random_mutation_generator: Workaround for non-associativity of mutations with shadowable tombstones
db: mutation_partition: Drop unnecessary maybe_shadow()
db: mutation_partition: Maintain shadowable tombstone invariant when applying a hard tombstone
mutation_partition: row: make row marker shadowing symmetric
Now, whether a 'group 0 operation' (today this means a schema change) is
performed using the old or new method depends not on the local RAFT
feature being enabled, but on the state of the upgrade procedure.
In this commit the state of the upgrade is always
`use_pre_raft_procedures` because the upgrade procedure is not
implemented yet. But stay tuned.
The upgrade procedure will need certain guarantees: at some point it
switches from `use_pre_raft_procedures` to `synchronize` state. During
`synchronize` schema changes must be disabled, so the procedure can
ensure that schema is in sync across the entire cluster before
establishing group 0. Thus, when the switch happens, no schema change
can be in progress.
To handle all this weirdness we introduce `_upgrade_lock` and
`get_group0_upgrade_state` which takes this lock whenever it returns
`use_pre_raft_procedures`. Creating a `group0_guard` - which happens at
the start of every group 0 operation - will take this lock, and the lock
holder shall be stored inside the guard (note: the holder only holds the
lock if `use_pre_raft_procedures` was returned, no need to hold it for
other cases). Because `group0_guard` is held for the entire duration of
a group 0 operation, and because the upgrade procedure will also have to
take this lock whenever it wants to change the upgrade state (it's an
rwlock), this ensures that no group 0 operation that uses the old ways
is happening when we change the state.
We also implement `wait_until_group0_upgraded` using a condition
variable. It will be used by certain methods during upgrade (later
commits; stay tuned).
Some additional comments were written.
Define an enum class, `group0_upgrade_state`, describing the state of
the upgrade procedure (implemented in later commits).
Provide IDL definitions for (de)serialization.
The node will have its current upgrade state stored on disk in
`system.scylla_local` under the `group0_upgrade_state` key. If the key
is not present we assume `use_pre_raft_procedures` (meaning we haven't
started upgrading yet or we're at the beginning of upgrade).
Introduce `system_keyspace` accessor methods for storing and retrieving
the on-disk state.
The compiler allowed passing a `with_timeout` flag to a verb definition;
it then generated functions for sending and handling RPCs that accepted
a timeout parameter.
We would like to generate functions that accept an `abort_source` so an
RPC can be cancelled from the sender side. This is both more and less
powerful than `with_timeout`. More powerful because you can abort on
other conditions than just reaching a certain point in time. Less
powerful because you can't abort the receiver. In any case, sometimes
useful.
For this the `cancellable` flag was added.
You can't use `with_timeout` and `cancellable` on the same verb.
Note that this uses an already existing function in RPC module,
`send_message_cancellable`.
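Sender-side cancellation can be modeled with asyncio. This is a toy sketch, not the Seastar RPC API; the real implementation uses `send_message_cancellable`.

```python
import asyncio

async def send_cancellable(rpc_coro, abort_event: asyncio.Event):
    """Race the RPC against an abort signal. Like Scylla's `cancellable`
    verbs, aborting stops only the sender-side wait -- the receiver's
    handler is not interrupted."""
    rpc = asyncio.ensure_future(rpc_coro)
    abort = asyncio.ensure_future(abort_event.wait())
    done, _ = await asyncio.wait({rpc, abort},
                                 return_when=asyncio.FIRST_COMPLETED)
    abort.cancel()
    if rpc in done:
        return rpc.result()
    rpc.cancel()
    raise asyncio.CancelledError("RPC aborted by sender")
```

Unlike a timeout, the abort condition here can be anything the caller chooses to signal, which is exactly the "more powerful" half of the trade-off above.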
This RPC will be used during the Raft upgrade procedure during schema
synchronization step.
Make a version which can be cancelled when the upgrade procedure gets
aborted.
- Remove `ScyllaCluster.__getitem__()` (pending request by @kbr- in a previous pull request), for this remove all direct access to servers from caller code
- Increase Python driver timeouts (req by @nyh)
- Improve `ManagerClient` API requests: use `http+unix://<sockname>/<resource>` instead of `http://localhost/<resource>` and callers of the helper method only pass the resource
- Improve lint and type hints
Closes#11305
* github.com:scylladb/scylladb:
test.py: remove ScyllaCluster.__getitem__()
test.py: ScyllaCluster check keyspace with any server
test.py: ScyllaCluster server error log method
test.py: ScyllaCluster read_server_log()
test.py: save log point for all running servers
test.py: ScyllaCluster provide endpoint
test.py: build host param after before_test
test.py: manager client disable lint warnings
test.py: scylla cluster lint and type hint fixes
test.py: increase more timeouts
test.py: ManagerClient improve API HTTP requests
Dtest fails if it sees unknown errors in the logs. This series
reduces severity of some errors (since they are actually expected during
shutdown) and removes some others that duplicate already existing errors
that dtest knows how to deal with. Also fix one case of unhandled
exception in schema management code.
* 'dtest-fixes-v1' of github.com:gleb-cloudius/scylla:
raft: getting abort_requested_exception exception from a sm::apply is not a critical error
schema_registry: fix abandoned feature warning
service: raft: silence rpc::closed_errors in raft_rpc
Given 3 row mutations:
m1 = {
marker: {row_marker: dead timestamp=-9223372036854775803},
tombstone: {row_tombstone: {shadowable tombstone: timestamp=-9223372036854775807, deletion_time=0}, {tombstone: none}}
}
m2 = {
marker: {row_marker: timestamp=-9223372036854775805}
}
m3 = {
tombstone: {row_tombstone: {shadowable tombstone: timestamp=-9223372036854775806, deletion_time=2}, {tombstone: none}}
}
We get different shadowable tombstones depending on the order of merging:
(m1 + m2) + m3 = {
marker: {row_marker: dead timestamp=-9223372036854775803},
tombstone: {row_tombstone: {shadowable tombstone: timestamp=-9223372036854775806, deletion_time=2}, {tombstone: none}}
}
m1 + (m2 + m3) = {
marker: {row_marker: dead timestamp=-9223372036854775803},
tombstone: {row_tombstone: {shadowable tombstone: timestamp=-9223372036854775807, deletion_time=0}, {tombstone: none}}
}
The reason is that in the second case the shadowable tombstone in m3
is shadowed by the row marker in m2. In the first case, the marker in
m2 is cancelled by the dead marker in m1, so the shadowable tombstone in
m3 is not cancelled (the marker in m1 does not cancel it because it's
dead).
This wouldn't happen if the dead marker in m1 was accompanied by a
hard tombstone of the same timestamp, which would effectively make the
difference in shadowable tombstones irrelevant.
Found by row_cache_test.cc::test_concurrent_reads_and_eviction.
I'm not sure if this situation can be reached in practice (dead marker
in mv table but no row tombstone).
Work it around for tests by producing a row tombstone if there is a
dead marker.
Refs #11307
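The non-associativity can be reproduced in a toy Python model of the merge rules. Small timestamps stand in for the large negative ones above, and the names and rules are deliberately simplified illustrations of Scylla's actual semantics.

```python
from typing import NamedTuple, Optional

class Marker(NamedTuple):
    ts: int
    live: bool

class Row(NamedTuple):
    marker: Optional[Marker]
    shadowable: Optional[int]  # shadowable tombstone timestamp, or None

def maybe_shadow(row: Row) -> Row:
    # A live marker newer than the shadowable tombstone cancels it;
    # a dead marker does not.
    if (row.marker is not None and row.marker.live
            and row.shadowable is not None
            and row.marker.ts > row.shadowable):
        return Row(row.marker, None)
    return row

def merge(a: Row, b: Row) -> Row:
    """Merge two rows: newest marker wins, newest tombstone wins, then
    apply shadowing. Once a tombstone is shadowed, it is gone for good,
    which is what makes the operation order-sensitive."""
    markers = [m for m in (a.marker, b.marker) if m is not None]
    marker = max(markers, key=lambda m: m.ts, default=None)
    tombs = [t for t in (a.shadowable, b.shadowable) if t is not None]
    return maybe_shadow(Row(marker, max(tombs, default=None)))
```

With m1 = dead marker ts=5 plus shadowable tombstone ts=1, m2 = live marker ts=3, and m3 = shadowable tombstone ts=2 (mirroring the mutations above), the two merge orders end with different tombstones.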
When the row has a live row marker which shadows the shadowable
tombstone, the shadowable tombstone should not be effective. The code
assumes that _shadowable always reflects the current tombstone, so
maybe_shadow() needs to be called whenever marker or regular tombstone
changes. This was not ensured by row::apply(tombstone).
This causes problems in tests which use random_mutation_generator,
which generates mutations which would violate this invariant, and as a
result, mutation commutativity would be violated.
I am not aware of problems in production code.
Currently row marker shadowing the shadowable tombstone is only checked
in `apply(row_marker)`. This means that shadowing will only be checked
if the shadowable tombstone and row marker are set in the correct order.
This at the very least can cause flakiness in tests when a mutation
produced just the right way has a shadowable tombstone that can be
eliminated when the mutation is reconstructed in a different way,
leading to artificial differences when comparing those mutations.
This patch fixes this by checking shadowing in
`apply(shadowable_tombstone)` too, making the shadowing check symmetric.
There is still one vulnerability left: `row_marker& row_marker()`, which
allows overwriting the marker without triggering the corresponding
checks. We cannot remove this overload as it is used by compaction, so we
just add a comment to it warning that `maybe_shadow()` has to be manually
invoked if it is used to mutate the marker (compaction takes care of
that). A caller which didn't do the manual check is
mutation_source_test: this patch updates it to use `apply(row_marker)`
instead.
Fixes: #9483
Tests: unit(dev)
Closes#9519
and set crs and rts only in the block where they are used,
so we can get rid of reversed_range_tombstones.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than reversing the schema on every call,
just keep the potentially reversed schema in the cookie.
Otherwise, cookie.schema was write-only.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Provide server error logs to caller (test.py).
Avoids direct access to list of servers.
To be done later: pick the failed server. For now it just provides the
log of one server.
While there, fix type hints.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Instead of accessing the first server, now test.py asks ScyllaCluster
for the server log.
In a later commit, ScyllaCluster will pick the appropriate server.
Also removes another direct access to the list of servers we want to get
rid of.
For error reporting, a mark of the log position is saved before each
test. Previously, only the log of the first server was marked. Now it's
done for all running servers.
While there, remove direct access to servers on test.py.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
If no server started, there is no server in the cluster list, so only
build the pytest --host param after the before_test check is done.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Increase Python driver connection timeouts to deal with extreme cases
for slow debug builds in slow machines as done (and explained) in
95bd02246a.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Use the AF Unix socket name as host name instead of localhost and avoid
repeating the full URL for callers of _request() for the Manager API
requests from the client.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
If the consumer returns stop_iteration::yes for a flushed
row (static or clustering), we should return early and
not consume any more fragments until `on_end_of_partition`,
where we may still consume a closing range_tombstone_change
past the last consumed row.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Consuming the static row is the first opportunity for
the consumer to return stop_iteration::yes, so there's no
point in checking `_stop_consuming` before consuming it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We already flushed rt_gen when building the current_row.
When we get to flush_rows_and_tombstones, we should
just consume it, as the passed position is not that of the
current_row but rather a position following it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
In commit 7eda6b1e90, we increased the
request_timeout parameter used by cql-pytest tests from the default of
10 seconds to 120 seconds. 10 seconds was usually more than enough for
finishing any Scylla request, but it turned out that in some extreme
cases of a debug build running on an extremely over-committed machine,
the default timeout was not enough.
Recently, in issue #11289 we saw additional cases of timeouts which
the request_timeout setting did *not* solve. It turns out that the Python
CQL driver has two additional timeout settings - connect_timeout and
control_connection_timeout, which default to 5 seconds and 2 seconds
respectively. I believe that most of the timeouts in issue #11289
come from the control_connection_timeout setting - by changing it
to a tiny number (e.g., 0.0001) I got the same error messages as those
reported in #11289. The default of that timeout - 2 seconds - is
certainly low enough to be reached on an extremely over-committed
machine.
So this patch significantly increases both connect_timeout and
control_connection_timeout to 60 seconds. We don't care that this timeout
is ridiculously large - under normal operations it will never be reached.
There is no code which loops for this amount of time, for example.
Refs #11289 (perhaps even Fixes, we'll need to see that the test errors
go away).
NOTE: This patch only changes test/cql-pytest/util.py, which is only
used by the cql-pytest test suite. We have multiple other test suites which
copied this code, and those test suites might need fixing separately.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11295
Right now, if there's a node for which we don't know the features
supported by this node (they are neither persisted locally, nor gossiped
by that node), we would skip this node in calculating the set
of enabled features and potentially enable a feature which shouldn't be
enabled - because that node may not know it. We should only enable a
feature when we know that all nodes have upgraded and know the feature.
This bug caused us problems when we tried to move RAFT out of
experimental. There are dtests such as `partitioner_tests.py` in which
nodes would enable features prematurely, which caused the Raft upgrade
procedure to break (the procedure starts only when all nodes upgrade
and announce that they know the SUPPORTS_RAFT cluster feature).
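The corrected rule can be sketched as a toy model (the bookkeeping names here are illustrative):

```python
from typing import Optional

def enabled_features(nodes: dict[str, Optional[set[str]]]) -> set[str]:
    """nodes maps a node id to its advertised feature set, or None when
    the node's features are unknown (neither persisted locally nor
    gossiped by that node).

    A feature may be enabled only when every node is known to support
    it; a node with unknown features must block enabling, rather than
    being skipped as the buggy version did."""
    if not nodes or any(fs is None for fs in nodes.values()):
        return set()
    return set.intersection(*nodes.values())
```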
Closes#11225
This pull request introduces global secondary indexes for non-frozen collections.
The intent is to enable such queries:
```
CREATE TABLE test(id int, somemap map<int, int>, somelist list<int>, someset set<int>, PRIMARY KEY(id));
CREATE INDEX ON test(keys(somemap));
CREATE INDEX ON test(values(somemap));
CREATE INDEX ON test(entries(somemap));
CREATE INDEX ON test(values(somelist));
CREATE INDEX ON test(values(someset));
-- index on test(c) is the same as index on (values(c))
CREATE INDEX IF NOT EXISTS ON test(somelist);
CREATE INDEX IF NOT EXISTS ON test(someset);
CREATE INDEX IF NOT EXISTS ON test(somemap);
SELECT * FROM test WHERE someset CONTAINS 7;
SELECT * FROM test WHERE somelist CONTAINS 7;
SELECT * FROM test WHERE somemap CONTAINS KEY 7;
SELECT * FROM test WHERE somemap CONTAINS 7;
SELECT * FROM test WHERE somemap[7] = 7;
```
We use here all-familiar materialized views (MVs). Scylla treats all the
collections the same way - they're a list of (key, value) pairs. In the case
of sets, the value type is a dummy one. In the case of lists, the key type is
TIMEUUID. When describing the design, I will forget that there is more
than one collection type. Suppose that the columns in the base table
were as follows:
```
pkey int, ckey1 int, ckey2 int, somemap map<int, text>, PRIMARY KEY(pkey, ckey1, ckey2)
```
The MV schema is as follows (the names of columns which are not the same
as in base might be different). All the columns here form the primary
key.
```
-- for index over entries
indexed_coll (int, text), idx_token long, pkey int, ckey1 int, ckey2 int
-- for index over keys
indexed_coll int, idx_token long, pkey int, ckey1 int, ckey2 int
-- for index over values
indexed_coll text, idx_token long, pkey int, ckey1 int, ckey2 int, coll_keys_for_values_index int
```
The reason for the last additional column is that the values from a collection might not be unique.
Fixes #2962
Fixes #8745
Fixes #10707
This patch does not implement **local** secondary indexes for collection columns: Refs #10713.
Closes#10841
* github.com:scylladb/scylladb:
test/cql-pytest: un-xfail yet another passing collection-indexing test
secondary index: fix paging in map value indexing
test/cql-pytest: test for paging with collection values index
cql, view: rename and explain bytes_with_action
cql, index: make collection indexing a cluster feature
test/cql-pytest: failing tests for oversized key values in MV and SI
cql: fix secondary index "target" when column name has special characters
cql, index: improve error messages
cql, index: fix default index name for collection index
test/cql-pytest: un-xfail several collecting indexing tests
test/cql-pytest/test_secondary_index: verify that local index on collection fails.
docs/design-notes/secondary_index: add `VALUES` to index target list
test/cql-pytest/test_secondary_index: add randomized test for indexes on collections
cql-pytest/cassandra_tests/.../secondary_index_test: fix error message in test ported from Cassandra
cql-pytest/cassandra_tests/.../secondary_index_on_map_entries,select_test: test ported from Cassandra is expected to fail, since Scylla assumes that comparison with null doesn't throw error, just evaluates to false. Since it's not a bug, but expected behavior from the perspective of Scylla, we don't mark it as xfail.
test/boost/secondary_index_test: update for non-frozen indexes on collections
test/cql-pytest: Uncomment collection indexes tests that should be working now
cql, index: don't use IS NOT NULL on collection column
cql3/statements/select_statement: for index on values of collection, don't emit duplicate rows
cql/expr/expression, index/secondary_index_manager: needs_filtering and index_supports_expression rewrite to accommodate indexes over collections
cql3, index: Use entries() indexes on collections for queries
cql3, index: Use keys() and values() indexes on collections for queries.
types/tuple: Use std::begin() instead of .begin() in tuple_type_impl::build_value_fragmented
cql3/statements/index_target: throw exception to signalize that we didn't miss returning from function
db/view/view.cc: compute view_updates for views over collections
view info: has_computed_column_depending_on_base_non_primary_key
column_computation: depends_on_non_primary_key_column
schema, index/secondary_index_manager: make schema for index-induced mv
index/secondary_index_manager: extract keys, values, entries types from collection
cql3/statements/: validate CREATE INDEX for index over a collection
cql3/statements/create_index_statement,index_target: rewrite index target for collection
column_computation.hh, schema.cc: collection_column_computation
column_computation.hh, schema.cc: compute_value interface refactor
Cql.g, treewide: support cql syntax `INDEX ON table(VALUES(collection))`
Commit 23acc2e848 broke the "--ssl" option of test/cql-pytest/run
(which makes Scylla - and cqlpytest - use SSL-encrypted CQL).
The problem was that there was a confusion between the "ssl" module
(Python's SSL support) and a new "ssl" variable. A rename and a missing
"import" solves the breakage.
We never noticed this because Jenkins does *not* run cql-pytest/run
with --ssl (actually, it no longer runs cql-pytest/run at all).
It is still a useful option for checking SSL-related problems in Scylla
and Seastar.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11292
Test schema changes when there was an underlying topology change.
- per test case checks of cluster health and cycling
- helper class to do cluster manager API requests
- tests can perform topology changes: stop/start/restart servers
- modified clusters are marked dirty and discarded after the test case
- cql connection is updated per topology change and per cluster change
Closes#11266
* github.com:scylladb/scylladb:
test.py: test topology and schema changes
test.py: ClusterManager API mark cluster dirty
test.py: call before/after_test for each test case
test.py: handle driver connection in ManagerClient
test.py: ClusterManager API and ManagerClient
test.py: improve topology docstring
Currently, when detaching a table from the database, we force-evict all queriers for said table. This series broadens the scope of this force-evict to include all inactive reads registered at the semaphore. This ensures that any regular inactive read "forgotten" for any reason in the semaphore will not end up accessing a dangling table reference when destroyed later.
Fixes: https://github.com/scylladb/scylladb/issues/11264
Closes#11273
* github.com:scylladb/scylladb:
querier: querier_cache: remove now unused evict_all_for_table()
database: detach_column_family(): use reader_concurrency_semaphore::evict_inactive_reads_for_table()
reader_concurrency_semaphore: add evict_inactive_reads_for_table()
It should have had one: derived instances are stored and destroyed via
the base class. The only reason this hasn't caused bugs yet is that
derived instances happen to not have any non-trivial members yet.
Closes#11293
A mixed bag of improvements developed as part of another PR (https://github.com/scylladb/scylladb/pull/10736). Said PR was closed so I'm submitting these improvements separately.
Closes#11294
* github.com:scylladb/scylladb:
test/lib: move convenience table config factory to sstable_test_env
test/lib/sstable_test_env: move members to impl struct
test/lib/sstable_utils: use test_env::do_with_async()
Instead of querier_cache::evict_all_for_table(). The new method covers
all queriers and, in addition, any other inactive reads registered on the
semaphore. In theory, by the time we detach a table, no regular inactive
reads should be in the semaphore anymore, but if any are still there, we
had better evict them before the table is destroyed, as they might
attempt to access it when destroyed later.
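The eviction described above can be sketched as follows. This is a minimal illustrative model, not the Scylla `reader_concurrency_semaphore` code; all class and method names are stand-ins for the C++ originals.

```python
class InactiveRead:
    """A registered-but-paused read, tagged with the table it belongs to."""
    def __init__(self, table_id):
        self.table_id = table_id
        self.evicted = False


class ReaderConcurrencySemaphore:
    def __init__(self):
        self._inactive_reads = []

    def register_inactive_read(self, read):
        self._inactive_reads.append(read)

    def evict_inactive_reads_for_table(self, table_id):
        # Evict every inactive read belonging to the table - queriers and
        # any other "forgotten" inactive reads alike - so none of them can
        # touch the table after it is detached and destroyed.
        kept = []
        for r in self._inactive_reads:
            if r.table_id == table_id:
                r.evicted = True
            else:
                kept.append(r)
        self._inactive_reads = kept
```

The point of doing this at the semaphore rather than the querier cache is that the semaphore sees every inactive read, not just those the cache still remembers.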
All users of `column_family_test_config()` get the semaphore parameter
for it from `sstable_test_env`. It is clear that the latter serves as
the storage space for stable objects required by the table config. This
patch just enshrines this fact by moving the config factory method to
`sstable_test_env`, so it can get what it needs from its members.
All present members of sstable_test_env are std::unique_ptr<>:s because
they require stable addresses. This makes their handling somewhat
awkward. Move all of them into an internal `struct impl` and make that
member a unique ptr.
Fixes#11184
Fixes#11237
In the previous (broken) fix for https://github.com/scylladb/scylladb/issues/11184 we added the footprint for left-over
files (replay candidates) to the disk footprint on commitlog init.
This effectively prevents us from creating segments if we have tight limits. Since we nowadays do quite a bit of inserts _before_ commitlog replay (system.local, but...) we can end up in a situation where startup deadlocks because we cannot get to the actual replay that would eventually free things.
Another, not-thought-through consequence is that we add a single footprint to _all_ commitlog shard instances, even though only shard 0 will actually replay and delete (i.e. drop the footprint).
So shards 1-X would all be either locked out or performance degraded.
The simplest fix is to add the footprint in the delete call instead. This locks out segment creation until the delete call is done, but that is fast. It also ensures that only the replaying shard is involved.
To further emphasize this, don't store segments found in the init scan in all shard instances;
instead, retrieve them (based on the low time-pos for the current generation) when required. This changes very little, but we at least don't store
pointless string lists in shards 1 to X, and we can also ask for the list more than once if needed.
More to the point, it goes better hand-in-hand with the semantics of "delete_segments", where any file sent in is
considered a candidate for recycling and included in the footprint.
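The accounting change above can be modeled in a few lines. This is a hedged toy model, not the Scylla commitlog: the real code tracks bytes of live segments per shard, while here the replay leftovers count against the footprint only while `delete_segments()` is in flight.

```python
class Commitlog:
    """Toy model: footprint normally counts only live segments; replay
    leftovers are accounted only for the duration of delete_segments()."""

    def __init__(self, limit):
        self.limit = limit
        self.footprint = 0

    def can_create_segment(self, size):
        return self.footprint + size <= self.limit

    def delete_segments(self, sizes):
        # Any file sent in is a candidate for recycling and is included
        # in the footprint while the (fast) delete is in progress, then
        # the space is released again.
        total = sum(sizes)
        self.footprint += total
        try:
            pass  # stand-in for the actual file deletion / recycling
        finally:
            self.footprint -= total
```

With the old scheme the leftover footprint was added at init on every shard, so `can_create_segment()` could return False forever; here only the shard calling `delete_segments()` is briefly affected.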
Closes#11251
* github.com:scylladb/scylladb:
commitlog: Make get_segments_to_replay on-demand
commitlog: Revert/modify fac2bc4 - do footprint add in delete
Fix https://github.com/scylladb/scylladb/issues/11197
This PR adds a new page where specifying workload attributes with service levels is described and adds it to the menu.
Also, I had to fix some links because of the warnings.
Closes#11209
* github.com:scylladb/scylladb:
doc: remove the redundant space from index
doc: update the syntax for defining service level attributes
doc: rewording
doc: update the links to fix the warnings
doc: add the new page to the toctree
doc: add the description of specifying workload attributes with service levels
doc: add the definition of workloads to the glossary
In preparation for effective_replication_map hygiene, convert
some counter functions to coroutines to simplify the changes.
Closes#11291
* github.com:scylladb/scylladb:
storage_proxy: mutate_counters_on_leader: coroutinize
storage_proxy: mutate_counters: coroutinize
storage_proxy: mutate_counters: reorganize error handling
Simplify ahead of refactoring for consistent effective_replication_map.
This is probably a pessimization of the error case, but the error case
will be terrible in any case unless we resultify it.
Move the error handling function where it's used so the code
is more straightforward.
Due to some std::move()s later, we must still capture the schema early.
Move the termination condition to the front of the loop so it's
clear why we're looping and when we stop.
It's less than perfectly clean since we widen the scope of some variables
(from loop-internal to loop-carried), but IMO it's clearer.
It's much easier to maintain this way. Since it uses ranges_to_vnodes,
it interacts with topology and needs integration into
effective_replication_map management.
The patch leaves bad indentation and an infinite-looking loop in
the interest of minimization, but that will be corrected later.
Note, the test for `!r.has_value()` was eliminated since it was
short-circuited by the test for `!rqr.has_value()` returning from
the coroutine rather than propagating an error.
We use result_wrap() in two places, but that makes coroutinizing the
containing function a little harder, since it's composed of more lambdas.
Remove the wrappers, gaining a bit of performance in the error case.
The function `check_exists` checks whether a given table exists, giving
an error otherwise. It previously used `on_internal_error`.
`check_exists` is used in some old functions that insert CDC metadata to
CDC tables. These tables are no longer used in newer Scylla versions
(they were replaced with other tables with different schema), and this
function is no longer called. The table definitions were removed and
these tables are no longer created. They will only exist in clusters
that were upgraded from old versions of Scylla (4.3) through a sequence
of upgrades.
If you tried to upgrade from a very old version of Scylla which had
neither the old nor the new tables to a modern version, say from 4.2 to
5.0, you would get `on_internal_error` from this `check_exists`
function. Fortunately:
1. we don't support such upgrade paths
2. `on_internal_error` in production clusters does not crash the system,
only throws. The exception would be caught, printed, and the system
would keep running (just without CDC - until you finished the upgrade
and called the proper nodetool command to fix the CDC module).
Unfortunately, there is a dtest (`partitioner_tests.py`) which performs
an unsupported upgrade scenario - it starts Scylla from Cassandra (!)
work directories, which is like upgrading from a very old version of
Scylla.
This dtest was not failing due to another bug which masked the problem.
When we try to fix the bug - see #11225 - the dtest starts hitting the
assertion in `check_exists`. Because it's a test, we configure
`on_internal_error` to crash the system.
The point of this commit is to not crash the system in this rare
scenario which happens only in some weird tests. We now throw
`std::runtime_error` instead of calling `on_internal_error`. In the
dtest, we already ignore the resulting CDC error appearing in the logs
(see scylladb/scylla-dtest#2804). Together with this change, we'll be
able to fix the #11225 bug and pass this test.
Closes#11287
After collection indexing has been implemented, yet another test which
failed because of #2962 now passes. So remove the "xfail" marker.
Refs #2962
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When indexing a map column's *values*, if the same value appears more
than once, the same row will appear in the index more than once. We had
code that removed these duplicates, but this deduplication did not work
across page boundaries. We had two xfailing tests to demonstrate this bug.
In this patch we fix this bug by looking at the page's start and not
generating the same row again, thereby getting the same deduplication
we had inside pages - now across pages.
The previously-xfailing tests now pass, and their xfail tag is removed.
I also added another test, for the case where the base table has only
partition keys without clustering keys. This second test is important
because the code path for the partition-key-only case is different,
and the second test exposed a bug in it as well (which is also fixed
in this patch).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
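The cross-page deduplication can be sketched as follows. This is an illustrative model (names assumed, not Scylla's paging code): matches for the same base row are known to arrive consecutively, so remembering the last row emitted before the page boundary lets us skip its duplicates at the start of the next page.

```python
def paged_unique(matches, page_size):
    """Split an iterable of (possibly consecutively duplicated) base-table
    primary keys into pages of unique keys, deduplicating across page
    boundaries by tracking the last key emitted."""
    pages = []
    page = []
    last_emitted = object()  # sentinel: nothing emitted yet
    for pk in matches:
        if pk == last_emitted:  # duplicate match for the same base row
            continue
        page.append(pk)
        last_emitted = pk
        if len(page) == page_size:
            pages.append(page)
            page = []
    if page:
        pages.append(page)
    return pages
```

Without the carried `last_emitted` state, a duplicate straddling a page boundary would be emitted once per page, which is exactly the bug the xfailing tests demonstrated.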
If a map has several keys with the same value, then the "values(m)" index
must remember all of them as matching the same row - because later we may
remove one of these keys from the map but the row would still need to
match the value because of the remaining keys.
We already had a test (test_index_map_values) that although the same row
appears more than once for this value, when we search for this value the
result only returns the row once. Under the hood, Scylla does find the
same value multiple times, but then eliminates the duplicate matched row
and returns it only once.
But there is a complication, that this de-duplication does not easily
span *paging*. So in this patch we add a test that checks that paging
does not cause the same row to be returned more than once.
Unfortunately, this test currently fails on Scylla, so it is marked "xfail".
It passes on Cassandra.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The structure "bytes_with_action" was very hard to understand because of
its mysterious and general-sounding name, and no comments.
In this patch I add a large comment explaining its purpose, and rename
it to a more suitable name, view_key_and_action, which suggests that
each such object is about one view key (where to add a view row), and
an additional "action" that we need to take beyond adding the view row.
This is the best I can do to make this code easier to understand without
completely reorganizing it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Prevent a user from creating a secondary index on a collection column if
the cluster has any nodes which don't support this feature. Such nodes
will not be able to correctly handle requests related to this index,
so better not allow creating one.
Attempting to create an index on a collection before the entire cluster
supports this feature will result in the error:
Indexing of collection columns not supported by some older nodes
in this cluster. Please upgrade them.
Tested by manually disabling this feature in feature_service.cc and
seeing this error message during collection indexing test.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In issue #9013, we noticed that if a value larger than 64 KB is indexed,
the write fails in a bad way, and we fixed it. But the test we wrote
when fixing that issue already suggested that something was still wrong:
Cassandra failed the write cleanly, with an InvalidRequest, while Scylla
failed with a mysterious WriteFailure (with a relevant error message
only in the log).
This patch adds several xfailing tests which demonstrate what's still
wrong. This is also summarized in issue #8627:
1. A write of an oversized value to an indexed column returns the wrong
error message.
2. The same problem also exists when indexing a collection, and the indexed
key or value is oversized.
3. The situation is even less pleasant when adding an index to a table
with pre-existing data and an oversized value. In this case, the
view building will fail on the bad row, and never finish.
4. We have exactly the same bugs not just with indexes but also with
materialized views. Interestingly, Cassandra has similar bugs in
materialized views as well (but not in the secondary index case,
where Cassandra does behave as expected).
Refs #8627.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
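The expected, clean behavior is a simple size check before the write is attempted. A minimal sketch, assuming the 64 KB ceiling comes from a 16-bit length prefix on indexed cell values (the constant and function names are illustrative, not Scylla's):

```python
MAX_INDEXED_VALUE_SIZE = 64 * 1024 - 1  # 65535 bytes, fits a 16-bit length

def validate_indexed_value(value: bytes) -> None:
    """Reject an oversized value for an indexed column up front with a
    clean InvalidRequest-style error, instead of a late WriteFailure."""
    if len(value) > MAX_INDEXED_VALUE_SIZE:
        raise ValueError(
            f"Indexed value of size {len(value)} exceeds the maximum "
            f"of {MAX_INDEXED_VALUE_SIZE} bytes")
```

Cassandra performs this validation at request time, which is why it fails with InvalidRequest; doing the same check early (including for collection keys/values, and during view building) is what items 1-3 above call for.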
Unfortunately, we encode the "target" of a secondary index in one of
three ways:
1. It can be just a column name
2. It can be a string like keys(colname) - for the new type of
collection indexes introduced in this series.
3. It can be a JSON map ({ ... }). This form is used for local indexes.
The code parsing this target - target_parser::parse() - needs not to
confuse these different formats. Before this patch, if the column name
contains special characters like braces or parentheses (this is allowed
in CQL syntax, via quoting), we can confuse case 1, 2, and 3: A column
named "keys(colname)" will be confused for case 2, and a column named
"{123}" will be confused with case 3.
This problem can break indexing of some specially-crafted column names -
as reproduced by test_secondary_index.py::test_index_quoted_names.
The solution adopted in this patch is that the column name in case 1
should be escaped somehow so it cannot be possibly confused with either
cases 2 and 3. The way we chose is to convert the column name to CQL (with
column_definition::as_cql_name()). In other words, if the column name
contains non-alphanumeric characters, it is wrapped in quotes and also
quotes are doubled, as in CQL. The result of this can't be confused
with case 2 or 3, neither of which may begin with a quote.
This escaping is not the minimal we could have done, but incidentally it
is exactly what Cassandra does as well, so I used it as well.
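The escaping and disambiguation rule can be sketched like this. This is a hedged model of the scheme described above, not the actual `target_parser::parse()` or `column_definition::as_cql_name()`; the lowercase-alphanumeric test is a simplification of real CQL identifier rules.

```python
def as_cql_name(column_name: str) -> str:
    """Escape a column name as CQL does: plain names pass through,
    anything else is quoted with embedded quotes doubled."""
    if column_name.isalnum() and column_name.islower():  # simplification
        return column_name
    return '"' + column_name.replace('"', '""') + '"'

def parse_target(target: str):
    """Disambiguate the three index-target encodings."""
    if target.startswith('"'):   # case 1, escaped column name
        return ("column", target[1:-1].replace('""', '"'))
    if target.startswith('{'):   # case 3, JSON map (local indexes)
        return ("json", target)
    for kind in ("keys", "values", "entries"):
        if target.startswith(kind + "(") and target.endswith(")"):
            return (kind, target[len(kind) + 1:-1])  # case 2
    return ("column", target)    # legacy unescaped plain name
```

Since an escaped name always begins with a quote, a column literally named `keys(colname)` or `{123}` can no longer be mistaken for cases 2 or 3, while legacy unescaped targets still parse as plain column names.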
This change is *mostly* backward compatible: Already-existing indexes will
still have unescaped column names stored for their "target" string,
and the unescaping code will see they are not wrapped in quotes, and
not change them. Backward compatibility will only fail on existing indexes
on columns whose name begin and end in the quote characters - but this
case is extremely unlikely.
This patch illustrates how un-ideal our index "target" encoding is,
but isn't what made it un-ideal. We should not have used three different
formats for the index target - the third representation (JSON) should
have sufficed. However, the two other representations are identical
to Cassandra's, so using them when we can has its compatibility
advantages.
The patch makes test_secondary_index.py::test_index_quoted_names pass.
Fixes#10707.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Before this patch, trying to create an index on entries(x) where x is
not a map results in an error message:
Cannot create index on index_keys_and_values of column x
The string "index_keys_and_values" is strange - Cassandra prints the
easier to understand string "entries()" - which better corresponds to
what the user actually did.
It turns out that this string "index_keys_and_values" comes from an
elaborate set of variables and functions spanning multiple source files,
used to convert our internal target_type variable into such a string.
But although this code was called "index_option" and sounded very
important, it was actually used just for one thing - error messages!
So in this patch we drop the entire "index_option" abstraction,
replacing it by a static trivial function defined exactly where
it's used (create_index_statement.cc), which prints a target type.
While at it, we print "entries()" instead of "index_keys_and_values" ;-)
After this patch, the
test_secondary_index.py::test_index_collection_wrong_type
finally passes (the previous patch fixed the default table names it
assumes, and this patch fixes the expected error messages), so its
"xfail" tag is removed.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When creating an index "CREATE INDEX ON tbl(keys(m))", the default name
of the index should be tbl_m_idx - with just "m". The current code
incorrectly used the default name tbl_m_keys_idx, so this patch adds
a test (which passes on Cassandra, and after this patch also on Scylla)
and fixes the default name.
It turns out that the default index name was based on a mysterious
index_target::as_string(), which printed the target "keys(m)" as
"m_keys" without explaining why it was so. This method was actually
used only in three places, and all of them wanted just the column
name, without the "_keys" suffix! So in this patch we rename the
mysterious as_string() to column_name(), and use this function instead.
Now that the default index name uses column_name() and gets just
column_name(), the correct default index name is generated, and the
test passes.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
After the previous patches implemented collection indexing, several
tests in test/cql-pytest/test_secondary_index.py that were marked
with "xfail" started to pass - so here we remove the xfail.
Only three collection indexing tests continue to xfail:
test_secondary_index.py::test_index_collection_wrong_type
test_secondary_index.py::test_index_quoted_names (#10707)
test_secondary_index.py::test_local_secondary_index_on_collection (#10713)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
collection fails.
Collection indexing is being tracked by #2962. Global secondary index
over collection is enabled by #10123. Leave this test to track this
behaviour.
Related issue: #10713
ported from Cassandra is expected to fail, since Scylla assumes that
comparison with null doesn't throw an error but just evaluates to false. Since
it's not a bug, but expected behavior from the perspective of Scylla, we
don't mark it as xfail.
When the secondary-index code builds a materialized view on column x, it
adds "x IS NOT NULL" to the where-clause of the view, as required.
However, when we index a collection column, we index individual pieces
of the collection (keys, values), not the entire collection, so checking
if the entire collection is null does not make sense. Moreover, for a
collection column x, "x IS NOT NULL" currently doesn't work and throws
errors when evaluating that expression when data is written to the table.
The solution used in this patch is to simply avoid adding the "x IS NOT
NULL" when creating the materialized view for a collection index.
Everything works just fine without it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
don't emit duplicate rows
The index on collection values is special in a way, as its clustering key contains not only the
base primary key, but also a column that holds the keys of the cells in the collection, which
makes it possible to distinguish cells with different keys but the same value.
This has an unwanted consequence: it is possible to receive two identical base-table primary
keys from indexed_table_select_statement::find_index_clustering_rows. Thankfully, the duplicate
primary keys are guaranteed to occur consecutively.
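Because duplicates are guaranteed to be consecutive, they can be dropped in a streaming pass with O(1) state. A minimal sketch (the function name is illustrative):

```python
def dedup_consecutive(primary_keys):
    """Yield each primary key once, dropping consecutive repeats."""
    last = object()  # sentinel distinct from any real key
    for pk in primary_keys:
        if pk != last:
            yield pk
            last = pk
```

Note this only works under the consecutiveness guarantee: a key reappearing after a different key is (correctly) emitted again.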
Previous commit added the ability to use GSI over non-frozen collections in queries,
but only the keys() and values() indexes. This commit adds support for the missing
index type - entries() index.
Signed-off-by: Karol Baryła <karol.baryla@scylladb.com>
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Previous commits added the possibility of creating GSI on non-frozen collections.
This (and next) commit allow those indexes to actually be used by queries.
This commit enables both keys() and values() indexes, as they are pretty similar.
didn't miss returning from function
GCC doesn't consider switches over enums to be exhaustive. Replace the
bogus return value after a switch, in which every case returns, with an
exception.
For collection indexes, the logic of computing values for each of the
columns needed to change, since a single column might produce more
than one value as a result.
The liveness info from individual cells of the collection impacts the
liveness info of the resulting rows. Therefore the control flow needed
to be rewritten - instead of functions getting a row from get_view_row
and later computing row markers and applying them, they compute these
values by themselves.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In the case of secondary indexes, if an index does not contain any base
column beyond those making up the primary key, then it is assumed that,
during an update, a change to some cells of the base table cannot mean
that we are dealing with a different row in the view. This, however,
doesn't take into account the possibility of computed columns which do
in fact depend on some non-primary-key columns. Introduce an additional
property of an index,
has_computed_column_depending_on_base_non_primary_key.
depends_on_non_primary_key_column for a column computation is needed to
detect the case where the primary key of a materialized view depends on a
non-primary-key column of the base table, while the view itself has no
non-primary-key columns. This is an issue because, until now, it was
assumed that having no non-primary-key columns in the view schema meant
that an update cannot change the primary key of the view, and therefore
the update path can be simplified.
Indexes over collections use materialized views. Supposing that we're
dealing with global indexes, and that pk, ck were the partition and
clustering keys of the base table, the schema of the materialized view,
apart from having idx_token (which is used to preserve the order on the
entries in the view), has a computed column coll_value (the name is not
guaranteed to be exactly this) and potentially also
coll_keys_for_values_index, if the index was over collection values.
This is needed, since values in a specific collection need not be
unique.
To summarize, the primary key is as follows:
coll_value, idx_token, pk, ck, coll_keys_for_values_index?
where coll_value is the computed value from the collection, be it a key
from the collection, a value from the collection, or the tuple containing
both.
These functions are relevant for indexes over collections (creating
schema for a materialized view related to the index).
Signed-off-by: Michał Radwański <michal.radwanski@scylladb.com>
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Allow CQL like this:
CREATE INDEX idx ON table(some_map);
CREATE INDEX idx ON table(KEYS(some_map));
CREATE INDEX idx ON table(VALUES(some_map));
CREATE INDEX idx ON table(ENTRIES(some_map));
CREATE INDEX idx ON table(some_set);
CREATE INDEX idx ON table(VALUES(some_set));
CREATE INDEX idx ON table(some_list);
CREATE INDEX idx ON table(VALUES(some_list));
This is needed to support creating indexes on collections.
The syntax used for creating indexes on collections that is present in
Cassandra is unintuitive from the internal representation point of view.
For instance, index on VALUES(some_set) indexes the set elements, which
in the internal representation are keys of collection. Rewrite the index
target after receiving it, so that the index targets are consistent with
the representation.
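The rewrite can be sketched as a small mapping. This is an illustrative model (assumed names, not the create_index_statement code), based on the fact stated above that set elements live in the cell *keys* of the internal collection representation:

```python
def rewrite_index_target(target_type: str, column_kind: str) -> str:
    """Map a user-facing index target (keys/values/entries) to the target
    matching the internal collection representation."""
    if column_kind == "set" and target_type == "values":
        # A set stores its elements as collection keys (with empty
        # values), so VALUES(some_set) really indexes the keys.
        return "keys"
    return target_type
```

Map targets (keys, values, entries) and list values already match the internal layout, so they pass through unchanged.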
This type of column computation will be used for creating updates to
materialized views that are indexes over collections.
This type features an additional function, compute_values_with_action,
which, depending on an (optional) old row and a new row (the update to
the base table), returns multiple bytes_with_action values - a vector of
pairs (computed value, action), where the action signifies whether the
view row with a specific key needs to be deleted or created.
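As a toy model of this interface (not the Scylla one - names and the set-diff formulation are illustrative), a keys() index computation can diff the old and new collection state and emit one (value, action) pair per affected view row:

```python
def compute_values_with_action(old_keys, new_keys):
    """Return (computed value, action) pairs for a collection update:
    keys appearing only in the new row need view rows created, keys
    appearing only in the old row need view rows deleted."""
    old_keys, new_keys = set(old_keys), set(new_keys)
    actions = []
    for k in sorted(new_keys - old_keys):
        actions.append((k, "create"))
    for k in sorted(old_keys - new_keys):
        actions.append((k, "delete"))
    return actions
```

Keys present in both rows produce no action here; in the real code their liveness info may still update the corresponding view rows.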
The compute_value function of column_computation has had previously the
following signature:
virtual bytes_opt compute_value(const schema& schema, const partition_key& key, const clustering_row& row) const override;
This is superfluous: never in the history of Scylla was the last
parameter (row) used in any implementation, nor did it ever happen that
it returned bytes_opt. The absurdity of this interface can be seen
especially when looking at call sites like the following, where a dummy
empty row was created:
```
token_column.get_computation().compute_value(
*_schema, pkv_linearized, clustering_row(clustering_key_prefix::make_empty()));
```
Brings support of cql syntax `INDEX ON table(VALUES(collection))`, even
though there is still no support for indexes over collections.
Previously, index_target::target_type::values was referring to values of
a regular (non-collection) column. Rename it to `regular_values`.
Fixes#8745.
Previously, the `system.local`'s `rpc_address` column kept local node's
`rpc_address` from the scylla.yaml configuration. Although it sounds
like it makes sense, there are a few reasons to change it to the value
of scylla.yaml's `broadcast_rpc_address`:
- The `broadcast_rpc_address` is the address that the drivers are
supposed to connect to. `rpc_address` is the address that the node
binds to - it can be set for example to 0.0.0.0 so that Scylla listens
on all addresses, however this gives no useful information to the
driver.
- The `system.peers` table also has the `rpc_address` column and it
already keeps other nodes' `broadcast_rpc_address`es.
- Cassandra is going to do the same change in the upcoming version 4.1.
Fixes: #11201
Closes#11204
* github.com:scylladb/scylladb:
db/system_keyspace: fix indentation after previous patch
db/system_keyspace: in system.local, use broadcast_rpc_address in rpc_address column
Currently, the initial values of UDA accumulators are converted
to strings using the to_string() method and from strings using the
from_string() method. The from_string() method is not implemented
for collections, and it can't be implemented without changing the
string format, because in that format, we cannot differentiate
whether a separator is a part of a value or is an actual separator
between values. In particular, the separators are not escaped
in the collection values.
Instead of from_string()/to_string(), the CQL parser is used
for creating a value from a string, and to_parsable_string()
is used for converting a value into a string.
A test using a list as an accumulator is added to
cql-pytest/test_uda.py.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
Closes#11250
* github.com:scylladb/scylladb:
cql3: enable collections as UDA accumulators
cql3: extend implementation of to_bytes for raw_value
Fix https://github.com/scylladb/scylla-doc-issues/issues/438
In addition, I've replaced "Scylla" with "ScyllaDB" on that page.
Closes#11281
* github.com:scylladb/scylladb:
doc: replace Scylla with ScyllaDB on the Fault Tolerance page
doc: fix the typo in the note
This patch fixes the test test_scan.py::test_scan_paging_missing_limit
which failed in a Jenkins run once (that we know of).
That test verifies that an Alternator Scan operation *without* an explicit
"Limit" is nevertheless paged: DynamoDB (and also Scylla) wanted this page
size to be 1 MB, but it turns out (see #10327) that because of the details
of how Scylla's scan works, the page size can be larger than 1 MB. How much
larger? I ran this test hundreds of times and never saw it exceed a 3 MB
page - so the test asserted the page must be smaller than 4 MB. But now
in one run - we got to this 4 MB and failed the test.
So in this patch we increase the table to be scanned from 4 MB to 6 MB,
and assert the page size isn't the full 6 MB. The chance that this size will
eventually fail as well should be (famous last words...) very small for
two reasons: First because 6 MB is even higher than the maximum I saw
in practice, and second because empirically I noticed that adding more
data to the table reduces the variance of the page size, so it should
become closer to 1 MB and reduce the chance of it reaching 6 MB.
Refs #10327
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11280
Fix https://github.com/scylladb/scylla-doc-issues/issues/857
Closes#11253
* github.com:scylladb/scylladb:
doc: language improvements to the Counters page
doc: fix the external link
doc: clarify the disclaimer about reusing deleted counter column values
Fix https://github.com/scylladb/scylla-doc-issues/issues/867
Plus some language, formatting, and organization improvements.
Closes#11248
* github.com:scylladb/scylladb:
doc: language, formatting, and organization improvements
doc: add a disclaimer about not supporting local counters by SSTableLoader
Replication is a mix of several inputs: tokens and token->node mappings (topology),
the replication strategy, replication strategy parameters. These are all captured
in effective_replication_map.
However, if we use effective_replication_map:s captured at different times in a single
query, then different uses may see different inputs to effective_replication_map.
This series protects against that by capturing an effective_replication_map just
once in a query, and then using it. Furthermore, the captured effective_replication_map
is held until the query completes, so topology code can know when a topology is no
longer in use (although this isn't exploited in this series).
Only the simple read and write paths are covered. Counters and paxos are left for
later.
I don't think the series fixes any bugs - as far as I could tell everything was happening
in the same continuation. But this series ensures it.
Closes#11259
* github.com:scylladb/scylladb:
storage_proxy: use consistent topology
storage_proxy: use consistent replication map on read path
storage_proxy: use consistent replication map on write path
storage_proxy: convert get_live{,_sorted}_endpoints() to accept an effective_replication_map
consistency_level: accept effective_replication_map as parameter, rather than keyspace
consistency_level: be more const when using replication_strategy
Add support for topology changes: add/stop/remove/restart/replace node.
Test simple schema changes when changing topology.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Preparing for topology tests with changing clusters, run before and
after checks per test case.
Change the scope of pytest fixtures to function, as we need them per test
case.
Add server and client API logic.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Add an API via Unix socket to Manager so pytests can query information
about the cluster. Requests are managed by ManagerClient helper class.
The socket is placed inside a unique temporary directory for the
Manager (as a safe temporary socket filename is not possible in Python).
Initial API services are manager up, cluster up, if cluster is dirty,
cql port, configured replicas (RF), and list of host ids.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
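A minimal sketch of a ManagerClient-style helper talking to a manager over a Unix socket. The request format and the "/up" endpoint name are hypothetical, not the actual test.py API; this only illustrates the line-oriented request/response pattern over AF_UNIX.

```python
import socket

class ManagerClient:
    """Talks to a cluster manager over a Unix-domain stream socket using
    one request line and one response line per call (illustrative)."""

    def __init__(self, sock_path):
        self.sock_path = sock_path

    def _request(self, line: str) -> str:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
            s.connect(self.sock_path)
            s.sendall(line.encode() + b"\n")
            return s.makefile().readline().strip()

    def is_manager_up(self) -> bool:
        # "/up" is a hypothetical endpoint name for this sketch.
        return self._request("/up") == "True"
```

Placing the socket under a per-manager temporary directory (as the commit does) keeps paths short enough for the AF_UNIX path-length limit and avoids name collisions between concurrent test runs.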
Derive the topology from captured and stable effective_replication_map
instead of getting a fresh topology from storage_proxy, since the
fresh topology may be inconsistent with the running query.
digest_read_resolver did not capture an effective_replication_map, so
that is added.
Capture a replication map just once in
abstract_read_executor::_effective_replication_map_ptr. Although it isn't
used yet, it serves to keep a reference count on topology (for fencing),
and some accesses to topology within reads still remain, which can be
converted to use the member in a later patch.
Capture a replication map just once in
abstract_write_handler::_effective_replication_map_ptr and use it
in all write handlers. A few accesses to get the topology still remain,
they will be fixed up in a later patch.
A keyspace is a mutable object that can change from time to time. An
effective_replication_map captures the state of a keyspace at a point in
time and can therefore be consistent (with care from the caller).
Change consistency_level's functions to accept an effective_replication_map.
This allows the caller to ensure that separate calls use the same
information and are consistent with each other.
Current callers are likely correct since they are called from one
continuation, but it's better to be sure.
Currently, the initial values of UDA accumulators are converted
to strings using the to_string() method and from strings using the
from_string() method. The from_string() method is not implemented
for collections, and it can't be implemented without changing the
string format, because in that format, we cannot differentiate
whether a separator is a part of a value or is an actual separator
between values. In particular, the separators are not escaped
in the collection values. For example, a list with string elements:
'a, b', 'c' would be represented as a string 'a, b, c', while now
it is represented as "['a, b', 'c']".
Some types that were parsable are now represented in a different
way. For example, a tuple ('a', null, 0) was represented as
"a:\@:0", and now it is "('a', null, 0)".
Instead of from_string()/to_string(), the CQL parser is used for
creating a value from a string, and to_parsable_string() is used for
converting a value into a string.
A test using a list as an accumulator is added to
cql-pytest/test_uda.py.
Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
For replication strategies like "everywhere"
and "local" that return the same set of endpoints
for all tokens, we can call rs->calculate_natural_endpoints
once and reuse the result for all tokens.
Note that ideally the replication_map could contain only
a single token range for this case, but that doesn't seem to work yet.
Add `maybe_yield()` calls to the tight loop
to prevent reactor stalls on large clusters when copying
a long vector returned by everywhere_replication_strategy
to potentially thousands of tokens in the map.
Nicholas Peshek wrote in
https://github.com/scylladb/scylladb/issues/10337#issuecomment-1211152370
about similar patch by Geoffrey Beausire:
994c6ecf3c
> Yep. That dropped our startup from 3000+ seconds to about 40.
Fixes #10337
Closes #11277
* github.com:scylladb/scylladb:
abstract_replication_strategy: calculate_effective_replication_map: optimize for static replication strategies
abstract_replication_strategy: add has_uniform_natural_endpoints
During shutdown it is normal to get abort_requested_exception exception
from a state machine "apply" method. Do not rethrow it as
state_machine_error, just abort an applier loop with an info message.
Before the patch if an RPC connection was established already then the
close error was reported by the RPC layer and then duplicated by
raft_rpc layer. If a connection cannot be established because the remote
node is already dead RPC does not report the error since we decided that
in that case gossiper and failure detector messages can be used to
detect the dead node case and there is no reason to pollute the logs
with recurring errors. This aligns raft behaviour with what we already
have in storage_proxy, which does not report closed errors either.
If the leader was unavailable during read_barrier,
closed_error occurs, which was not handled in any way
and eventually reached the client. This patch adds retries in this case.
Fix: scylladb#11262
Refs: #11278
Closes #11263
This patch reduces the number of metrics ScyllaDB generates.
Motivation: The combination of per-shard with per-scheduling group
generates a lot of metrics. When combined with histograms, which require
many metrics, the problem becomes even bigger.
The two tools we are going to use:
1. Replace per-shard histograms with summaries
2. Do not report unused metrics.
The storage_proxy stats object holds information for the API and the metrics
layer. We replaced timed_rate_moving_average_and_histogram and
time_estimated_histogram with the unified
timed_rate_moving_average_summary_and_histogram, which gives us the option
to report per-shard summaries instead of histograms.
All the counters, histograms, and summaries were marked as
skip_when_empty.
The API was modified to use
timed_rate_moving_average_summary_and_histogram.
Closes#11173
For replication strategies like "everywhere"
and "local" that return the same set of endpoints
for all tokens, we can call rs->calculate_natural_endpoints
once and reuse the result for all tokens.
Note that ideally the replication_map could contain only
a single token range for this case, but that doesn't seem to work yet.
Add maybe_yield() calls to the tight loop
to prevent reactor stalls on large clusters when copying
a long vector returned by everywhere_replication_strategy
to potentially thousands of tokens in the map.
Nicholas Peshek wrote in
https://github.com/scylladb/scylladb/issues/10337#issuecomment-1211152370
about similar patch by Geoffrey Beausire:
994c6ecf3c
> Yep. That dropped our startup from 3000+ seconds to about 40.
Fixes #10337
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
So that using calculate_natural_endpoints can be optimized
for strategies that return the same endpoints for all tokens,
namely everywhere_replication_strategy and local_strategy.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Refs #11237
Don't store segments found on the init scan in all shard instances;
instead retrieve them (based on low time-pos for the current gen) when
required. This changes very little, but at least we don't store
pointless string lists in shards 1 to X, though we can potentially
ask for the list twice. More to the point, it goes better hand-in-hand
with the semantics of "delete_segments", where any file sent in is
considered a candidate for recycling, and included in the footprint.
This reverts commit 8e892426e2 and fixes
the code in a different way:
That commit moved the scylla_inject_error function from
test/alternator/util.py to test/cql-pytest/util.py and renamed
test/alternator/util.py. I found the rename confusing and unnecessary.
Moreover, the moved function isn't even usable today by the test suite
that includes it, cql-pytest, because it lacks the "rest_api" fixture :-)
so test/cql-pytest/util.py wasn't the right place for it anyway.
test/rest_api/rest_util.py could have been a good place for this function,
but there is another complication: Although the Alternator and rest_api
tests both had a "rest_api" fixture, it has a different type, which led
to the code in rest_api which used the moved function to have to jump
through hoops to call it instead of just passing "rest_api".
I think the best solution is to revert the above commit, and duplicate
the short scylla_inject_error() function. The duplication isn't an
exact copy - the test/rest_api/rest_util.py version now accepts the
"rest_api" fixture instead of the URL that the Alternator version used.
In the future we can remove some of this duplication by having some
shared "library" code but we should do it carefully and starting with
agreeing on the basic fixtures like "rest_api" and "cql", without that
it's not useful to share small functions that operate on them.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes#11275
When called with a null_value or an unset_value,
raw_value::to_bytes() threw a wrong-variant std::get
error. This patch adds a description to the errors
thrown, and adds a to_bytes_opt() method that returns
std::nullopt instead of throwing.
Many tombstones in a partition is a problem that has been plaguing queries since the inception of Scylla (and even before that as they are a pain in Apache Cassandra too). Tombstones don't count towards the query's page limit, neither the size nor the row number one. Hence, large spans of tombstones (be that row- or range-tombstones) are problematic: the query can time out while processing this span of tombstones, as it waits for more live rows to fill the page. In the extreme case a partition becomes entirely unreadable, all read attempts timing out, until compaction manages to purge the tombstones.
The solution proposed in this PR is to pass down a tombstone limit to replicas: when this limit is reached, the replica cuts the page and marks it as short one, even if the page is empty currently. To make this work, we use the last-position infrastructure added recently by 3131cbea62, so that replicas can provide the position of the last processed item to continue the next page from. Without this no forward progress could be made in the case of an empty page: the query would continue from the same position on the next page, having to process the same span of tombstones.
The limit can be configured with the newly added `query_tombstone_limit` configuration item, defaulted to 10000. The coordinator will pass this to the newly added `tombstone_limit` field of `read_command`, if the `replica_empty_pages` cluster feature is set.
Upgrade sanity test was conducted as follows:
* Created cluster of 3 nodes with RF=3 with master version
* Wrote small dataset of 1000 rows.
* Deleted prefix of 980 rows.
* Started read workload: `scylla-bench -mode=read -workload=uniform -replication-factor=3 -nodes 127.0.0.1,127.0.0.2,127.0.0.3 -clustering-row-count=10000 -duration=10m -rows-per-request=9000 -page-size=100`
* Also did some manual queries via `cqlsh` with smaller page size and tracing on.
* Stopped and upgraded each node one by one. New nodes were started with `--query-tombstone-page-limit=10`.
* Confirmed there are no errors or read-repairs.
Perf regression test:
```
build/release/test/perf/perf_simple_query_g -c1 -m2G --concurrency=1000 --task-quota-ms 10 --duration=60
```
Before:
```
median 133665.96 tps ( 62.0 allocs/op, 12.0 tasks/op, 43007 insns/op, 0 errors)
median absolute deviation: 973.40
maximum: 135511.63
minimum: 104978.74
```
After:
```
median 129984.90 tps ( 62.0 allocs/op, 12.0 tasks/op, 43181 insns/op, 0 errors)
median absolute deviation: 2979.13
maximum: 134538.13
minimum: 114688.07
```
Diff: +~200 instruction/op.
Fixes: https://github.com/scylladb/scylla/issues/7689
Fixes: https://github.com/scylladb/scylla/issues/3914
Fixes: https://github.com/scylladb/scylla/issues/7933
Refs: https://github.com/scylladb/scylla/issues/3672
Closes #11053
* github.com:scylladb/scylladb:
test/cql-pytest: add test for query tombstone page limit
query-result-writer: stop when tombstone-limit is reached
service/pager: prepare for empty pages
service/storage_proxy: set smallest continue pos as query's continue pos
service/storage_proxy: propagate last position on digest reads
query: result_merger::get() don't reset last-pos on short-reads and last pages
query: add tombstone-limit to read-command
service/storage_proxy: add get_tombstone_limit()
query: add tombstone_limit type
db/config: add config item for query tombstone limit
gms: add cluster feature for empty replica pages
tree: don't use query::read_command's IDL constructor
Fix https://github.com/scylladb/scylla-doc-issues/issues/842
This PR changes the default for the `overprovisioned` option from `disabled` to `enabled`, according to https://github.com/scylladb/scylla-doc-issues/issues/842.
In addition, I've used this opportunity to replace "Scylla" with "ScyllaDB" on the updated page.
Closes#11256
* github.com:scylladb/scylladb:
doc: replace Scylla with ScyllaDB in the product name
doc: change the default for the overprovisioned option
Give cluster control to pytests. While there add missing stop gracefully and add server to ScyllaCluster.
Clusters can be marked dirty but they are not recycled yet. This will be done in a later series.
Closes#11219
* github.com:scylladb/scylladb:
test.py: ScyllaCluster add_server() mark dirty
test.py: ScyllaCluster add server management
test.py: improve seeds for new servers
test.py: Topology tests and Manager for Scylla clusters
test.py: rename scylla_server to scylla_cluster
test.py: function for python driver connection
test.py: ScyllaCluster add_server helper
test.py: shutdown control connection during graceful shutdown
test.py: configurable authenticator and authorizer
test.py: ScyllaServer stop gracefully
test.py: FIXME for bad cluster log handling logic
Currently messaging_service.o takes the longest of all core objects to
compile. For a full build of build/release/scylla, with current ninja
scheduling, on a 32-hyperthread machine, the last ~16% of the total
build time is spent just waiting on messaging_service.o to finish
compiling.
Moving the file to the top of the list makes ninja start its compilation
early and gets rid of that single-threaded tail, improving the total build
time.
Closes#11255
Fixes #11184
Fixes #11237
In the previous (broken) fix for #11184, we added the footprint of
left-over files (replay candidates) to the disk footprint on commitlog
init. This effectively prevents us from creating segments if we have
tight limits. Since we nowadays do quite a few inserts _before_
commitlog replay (system.local, etc.), we can end up in a situation
where startup deadlocks because we cannot get to the actual replay that
would eventually free things.
Another, not thought through, consequence is that we add a single
footprint to _all_ commitlog shard instances - even though only
shard 0 will get to actually replay + delete (i.e. drop footprint).
So shards 1-X would all be either locked out or performance degraded.
Simplest fix is to add the footprint in delete call instead. This will
lock out segment creation until delete call is done, but this is fast.
Also ensures that only replay shard is involved.
Check that the replica returns empty pages as expected, when a large
tombstone prefix/span is present. Large = larger than the configured
query_tombstone_limit (using a tiny value of 10 in the test to avoid
having to write many tombstones).
The query result writer now counts tombstones and cuts the page (marking
it as a short one) when the tombstone limit is reached. This is to avoid
timing out on large spans of tombstones, especially prefixes.
In the case of unpaged queries, we fail the read instead, similarly to
how we do with max result size.
If the limit is 0, the previous behaviour is used: tombstones are not
taken into consideration at all.
The pager currently assumes that an empty page means the query is
exhausted. Lift this assumption, as we will soon have empty short
pages.
Also, paging with filtering needs to use the replica-provided
last-position when the page is empty.
We expect each replica to stop at exactly the same position when the
digests match. Soon, however, if replicas have a lot of tombstones, some
may stop earlier than the others. As long as all digests match, this is
fine, but we need to make sure we continue from the smallest such
position on the next page.
We want to transmit the last position as determined by the replica on
both result and digest reads. Result reads already do that via the
query::result, but digest reads don't yet as they don't return the full
query::result structure, just the digest field from it. Add the last
position to the digest read's return value and collect these in the
digest resolver, along with the returned digests.
When merging multiple query-results, we use the last-position of the
last result in the combined one as the combined result's last position.
This only works however if said last result was included fully.
Otherwise we have to discard the last-position included with the result
and the pager will use the position of the last row in the combined
result as the last position.
The commit introducing the above logic mistakenly discarded the last
position when the result is a short read or a page is not full. This is
not necessary and even harmful as it can result in an empty combined
result being delivered to the pager, without a last-position.
When changing topology, tests will add servers. Make add_server mark the
cluster dirty. But mark the cluster as not dirty after calling
add_server when installing the cluster.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Preparing for topology changes, implement the primitives for managing
ScyllaServers in ScyllaCluster. The states are started, stopped, and
removed. Started servers can be stopped or restarted. Stopped servers
can be started. Stopped servers can be removed (destroyed).
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Instead of only using last started server as seed, use all started
servers as seed for new servers.
This also avoids tracking last server's state.
Pass empty list instead of None.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Preparing to cycle clusters modified (dirty) and use multiple clusters
per topology pytest, introduce Topology tests and Manager class to
handle clusters.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Isolate python driver connection on its own function. Preparing for
harness client fixture to handle the connection.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
For scylla servers, keep default PasswordAuthenticator and
CassandraAuthorizer but allow this to be configurable per test suite.
Use AllowAll* for topology test suite.
Disabling authentication avoids complications later for topology tests,
as the system_auth keyspace starts with RF=1 and tests take down nodes.
The keyspace would need to change RF and run repair. Using AllowAll
avoids this problem altogether.
A different cql fixture is created without auth for topology tests.
Topology tests require servers without auth from scylla.yaml conf.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
The code in test.py using a ScyllaCluster is getting a server id and
taking logs from only the first server.
If there is a failure in another server it's not reported properly.
And CQL connection will go only to the first server.
Also, it might be better to have ScyllaCluster to handle these matters
and be more opaque.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
This series converts the synchronous `effective_replication_map::get_range_addresses` to async
by calling the replication strategy async entry point with the same name, as its callers are already async
or can be made so easily.
To allow it to yield and work on a coherent view of the token_metadata / topology / replication_map,
let the callers of this patch hold an effective_replication_map per keyspace and pass it down
to the (now asynchronous) functions that use it (making affected storage_service methods static where possible
if they no longer depend on the storage_service instance).
Also, the repeated calls to everywhere_replication_strategy::calculate_natural_endpoints
are optimized in this series by introducing a virtual abstract_replication_strategy::has_static_natural_endpoints predicate
that is true for local_strategy and everywhere_replication_strategy, and is false otherwise.
With it, functions repeatedly calling calculate_natural_endpoints in a loop, for every token, will call it only once since it will return the same result every time anyhow.
Refs #11005
Doesn't fix the issue, as the large allocation still remains until we make dht::token_range_vector chunked (chunked_vector cannot be used as-is at the moment since we require the ability to also push to the front when unwrapping).
Closes#11009
* github.com:scylladb/scylladb:
effective_replication_map: make get_range_addresses asynchronous
range_streamer: add_ranges and friends: get erm as param
storage_service: get_new_source_ranges: get erm as param
storage_service: get_changed_ranges_for_leaving: get erm as param
storage_service: get_ranges_for_endpoint: get erm as param
repair: use get_non_local_strategy_keyspaces_erms
database: add get_non_local_strategy_keyspaces_erms
database: add get_non_local_strategy_keyspaces
storage_service: coroutinize update_pending_ranges
effective_replication_map: add get_replication_strategy
effective_replication_map: get_range_addresses: use the precalculated replication_map
abstract_replication_strategy: get_pending_address_ranges: prevent extra vector copies
abstract_replication_strategy: reindent
utils: sequenced_set: expose set and `contains` method
abstract_replication_strategy: calculate_natural_endpoints: return endpoint_set
utils: sequenced_set: templatize VectorType
utils: sanitize sequenced_set
utils: sequenced_set: delete mutable get_vector method
Ubuntu 22.04 is supported by both ScyllaDB Open Source 5.0 and Enterprise 2022.1.
Closes#11227
* github.com:scylladb/scylladb:
doc: add the redirects from Ubuntu version specific to version generic pages
doc: remove version-speific content for Ubuntu and add the generic page to the toctree
doc: rename the file to include Ubuntu
doc: remove the version number from the document and add the link to Supported Versions
doc: add a generic page for Ubuntu
doc: add the upgrade guide from 5.0 to 2022.1 on Ubuntu 2022.1
1) Start node1,2,3
2) Stop node3
3) Run nodetool removenode $host_id_of_node3
4) Restart node3
Step 4 is wrong and not allowed. If it happens it will bring back node3
to the cluster.
This patch adds a check during node restart to detect such operation
error and reject the restart.
With this patch, we would see the following in step 4.
```
init - Startup failed: std::runtime_error (The node 127.0.0.3 with
host_id fa7e500a-8617-4de4-8efd-a0e177218ee8 is removed from the
cluster. Can not restart the removed node to join the cluster again!)
```
Refs #11217
Closes #11244
Scenario:
cache = [
row(pos=2, continuous=false),
row(pos=after(2), dummy=true)
]
Scanning read starts, starts populating [-inf, before(2)] from sstables.
row(pos=2) is evicted.
cache = [
row(pos=after(2), dummy=true)
]
Scanning read finishes reading from sstables.
Refreshes cache cursor via
partition_snapshot_row_cursor::maybe_refresh(), which calls
partition_snapshot_row_cursor::advance_to() because iterators are
invalidated. This advances the cursor to
after(2). no_clustering_row_between(2, after(2)) returns true, so
advance_to() returns true, and maybe_refresh() returns true. This is
interpreted by the cache reader as "the cursor has not moved forward",
so it marks the range as complete, without emitting the row with
pos=2. Also, it marks row(pos=after(2)) as continuous, so later reads
will also miss the row.
The bug is in advance_to(), which is using
no_clustering_row_between(a, b) to determine its result, which by
definition excludes the starting key.
Discovered by row_cache_test.cc::test_concurrent_reads_and_eviction
with reduced key range in the random_mutation_generator (1024 -> 16).
Fixes #11239
Closes #11240
* github.com:scylladb/scylladb:
test: mvcc: Fix illegal use of maybe_refresh()
tests: row_cache_test: Add test_eviction_of_upper_bound_of_population_range()
tests: row_cache_test: Introduce one_shot mode to throttle
row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy
We can use std::in_place_type<> to avoid constructing op before
calling emplace_back(). As a result, we can avoid reserving space. The
reserving was there to avoid the need to roll back in case
emplace_back() throws.
Kudos to Kamil for suggesting this.
Closes#11238
This reverts commit bcadd8229b, reversing
changes made to cf528d7df9. Since
4bd4aa2e88 ("Merge 'memtable, cache: Eagerly
compact data with tombstones' from Tomasz Grabiec"), memtable is
self-compacting and the extra compaction step only reduces throughput.
The unit test in memtable_test.cc is not reverted as proof that the
revert does not cause a regression.
Closes#11243
Avoid about log2(256)=8 reallocations when pushing partition ranges to
be fetched. Additionally, avoid copying the range into the ranges
container. current_range will not contain the last range after being
moved, but will still be engaged by the end of the loop, allowing the
next iteration to happen as expected.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes#11242
To be used by coordinator side code to determine the correct tombstone
limit to pass to read-command (tombstone limit field added in the next
commit). When this limit is non-zero, the replica will start cutting
pages after the tombstone limit is surpassed.
This getter works similarly to `get_max_result_size()`: if the cluster
feature for empty replica pages is set, it will return the value
configured via db::config::query_tombstone_limit. System queries always
use a limit of 0 (unlimited tombstones).
This will be the value used to break pages, after processing the
specified amount of tombstones. The page will be cut even if empty.
We could maybe use the already existing tombstone_{warn,fail}_threshold
instead and use them as a soft/hard limit pair, like we did with page
sizes.
It is not type safe: it has multiple limits passed to it as raw ints, as
well as other types that ints implicitly convert to. Furthermore, the row
limit is passed in two separate fields (lower 32 bits and upper 32
bits). All this makes this constructor a minefield for humans to use. We
have had a safer constructor for some time, but some users of the old one
remain. Move them to the safe one.
maybe_refresh() can only be called if the cursor is pointing at a row.
The code was calling it before the cursor was advanced, and was
thus relying on implementation detail.
Scenario:
cache = [
row(pos=2, continuous=false),
row(pos=after(2), dummy=true)
]
Scanning read starts, starts populating [-inf, before(2)] from sstables.
row(pos=2) is evicted.
cache = [
row(pos=after(2), dummy=true)
]
Scanning read finishes reading from sstables.
Refreshes cache cursor via
partition_snapshot_row_cursor::maybe_refresh(), which calls
partition_snapshot_row_cursor::advance_to() because iterators are
invalidated. This advances the cursor to
after(2). no_clustering_row_between(2, after(2)) returns true, so
advance_to() returns true, and maybe_refresh() returns true. This is
interpreted by the cache reader as "the cursor has not moved forward",
so it marks the range as complete, without emitting the row with
pos=2. Also, it marks row(pos=after(2)) as continuous, so later reads
will also miss the row.
The bug is in advance_to(), which is using
no_clustering_row_between(a, b) to determine its result, which by
definition excludes the starting key.
Discovered by row_cache_test.cc::test_concurrent_reads_and_eviction
with reduced key range in the random_mutation_generator (1024 -> 16).
Fixes#11239
Rather than getting it in the callee, let the caller
(e.g. storage_service)
hold the erm and pass it down to potentially multiple
async functions.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than getting it in the callee, let the caller
hold the erm and pass it down to potentially multiple
async functions.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than getting it in the callee, let the caller
hold the erm and pass it down to potentially multiple
async functions.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Let its caller pass the effective_replication_map ptr
so we can get it at the top level and keep it alive
(and coherent) through multiple asynchronous calls.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Use get_non_local_strategy_keyspaces_erms for getting
a coherent set of keyspace names and their respective
effective replication strategy.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
To be used for getting a coherent set of all keyspaces
with non-local replication strategy and their respective
effective_replication_map.
As an example, use it in this patch in
storage_service::update_pending_ranges.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
For node operations, we currently call get_non_system_keyspaces
but really want to work on all keyspaces that have a non-local
replication strategy, as they are replicated on other nodes.
Reflect that in the replica::database function name.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And use it in storage_service::get_changed_ranges_for_leaving.
A following patch will pass the e_r_m to
storage_service::get_changed_ranges_for_leaving, rather than
getting it there.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
There is no need to call get_natural_endpoints for every token
in sorted_tokens order, since we can just get the precalculated
per-token endpoints already in the _replication_map member.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reduce large allocations and reactor stalls seen in #11005
by open coding `get_address_ranges` and using std::vector::insert
to efficiently append the ranges returned by `get_primary_ranges_for`
onto the returned token_range_vector, in contrast to building
an unordered_multimap<inet_address, dht::token_range> first in
`get_address_ranges` and traversing it, adding one token_range
at a time.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And use it in call sites that use the endpoint set
returned by abstract_replication_strategy::calculate_natural_endpoints.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
And templatize its Vector type so it can be used
with a small_vector for inet_address_vector_replica_set.
Mark the methods const/noexcept as needed.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
It is dangerous to use since modifying the
sequenced_set vector will make it go out of sync
with the associated unordered_set member, making
the object unusable.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When starting the `Build` job, we have a situation where `x86` and `arm`
start on different dates, causing the whole process to fail.
As suggested by @avikivity, add a date-stamp parameter and pass
it through downstream jobs to get one release for each job.
Ref: scylladb/scylla-pkg#3008
Closes#11234
We would like to define more distinct types
that are currently defined as aliases to utils::UUID to identify
resources in the system, like table id and schema
version id.
As with counter_id, the motivation is to restrict
the usage of the distinct types so they can be used
(assigned, compared, etc.) only with objects of
the same type. Using with a generic UUID will
then require explicit conversion, that we want to
expose.
This series starts with cleaning up the idl header definition
by adding support for `import` and `include` statements in the idl-compiler.
These allow the idl header to become self-sufficient
and then remove manually-added includes from source files.
The latter usually need only the top level idl header
and it, in turn, should include other headers if it depends on them.
Then, a UUID_class template was defined as shared boilerplate
for the various uuid-classes. First, we convert counter_id to use it,
rather than mimicking utils::UUID on its own.
On top of utils::UUID_class<T>, we define table_id,
table_schema_version, and query_id.
Following up on this series, we should define more commonly used
types like: host_id, streaming_plan_id, paxos_ballot_id.
Fixes #11207
Closes #11220
* github.com:scylladb/scylladb:
query-request, everywhere: define and use query_id as a strong type
schema, everywhere: define and use table_schema_version as a strong type
schema, everywhere: define and use table_id as a strong type
schema: include schema_fwd.hh in schema.hh
system_keyspace: get_truncation_record: delete unused lambda capture
utils: uuid: define appending_hash<utils::tagged_uuid<Tag>>
utils: tagged_uuid: rename to_uuid() to uuid()
counters: counter_id: use base class create_random_id
counters: base counter_id on utils::tagged_uuid
utils: tagged_uuid: mark functions noexcept
utils: tagged_uuid: bool: reuse uuid::bool operator
raft: migrate tagged_id definition to utils::tagged_uuid
utils: uuid: mark functions noexcept
counters: counter_id delete requirement for triviality
utils: bit_cast: require TriviallyCopyable To
repair: delete unused include of utils/bit_cast.hh
bit_cast: use std::bit_cast
idl: make idl headers self-sufficient
db: hints: sync_point: do not include idl definition file
db/per_partition_rate_limit: tidy up headers self-sufficiency
idl-compiler: include serialization impl and visitors in generated dist.impl.hh files
idl-compiler: add include statements
idl_test: add a struct depending on UUID
Define table_schema_version as a distinct tagged_uuid class,
so it can be differentiated from other uuid-class types,
in particular table_id.
Added reversed(table_schema_version) for convenience
and uniformity, since the same logic is currently open-coded
in several places.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Define table_id as a distinct utils::tagged_uuid modeled after raft
tagged_id, so it can be differentiated from other uuid-class types,
in particular from table_schema_version.
Fixes#11207
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Rather than defining generate_random, use the base class's
create_random_id, here and in the unit tests respectively.
(It was inherited from raft::internal::tagged_id.)
This allows us to shorten counter_id's definition
to just using utils::tagged_uuid<struct counter_id_tag>.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Use the common base class for uuid-based types.
tagged_uuid::to_uuid is defined here for backward
compatibility, but it will be renamed to uuid()
in the next patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
So it can be used for other types in the system outside
of raft, like counter_id, table_id, table_schema_version,
and more.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This stemmed from utils/bit_cast's overly strict requirement.
Now that it has been relaxed, there is no need for this static assert,
as counter_id is trivially copyable, and that is checked
by bit_cast's {read,write}_unaligned.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that Scylla requires C++20, there's no
need to define our own implementation in utils/bit_cast.hh.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Add include statements to satisfy dependencies.
Delete, now unneeded, include directives from the upper level
source files.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
idl definition files are not intended for direct
inclusion in .cc files.
The data types they represent are supposed to be defined
in a regular C++ header, so define them in db/hints/sync_point.hh
and include that rather than idl/hinted_handoff.idl.hh.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
They are generally required by the serialization implementation.
This will simplify using them, without having to hand-pick
which headers to include in the .cc files that include them.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Generate #include directives in the generated files,
so we don't have to hand-craft the includes of dependencies
in the right order.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This series is aimed at fixing #11132.
To get there, the series untangles the functions that currently depend on the cross-shard coordination in table::snapshot,
namely database::truncate and consequently database::drop_column_family.
database::get_table_on_all_shards is added here as a helper to get a foreign shared ptr of the table shard from all shards,
and it is later used by multiple functions to truncate and then take a snapshot of the sharded table.
database::truncate_table_on_all_shards is defined to orchestrate the truncate process end-to-end, flushing or clearing all table shards before taking a snapshot if needed, using the newly defined table::snapshot_on_all_shards, and by that leaving only the discard_sstables job to the per-shard database::truncate function.
The latter, snapshot_on_all_shards, orchestrates the snapshot process on all shards - getting rid of the per-shard table::snapshot function (after refactoring take_snapshot and finalize_snapshot out of it), and the associated dreaded data structures: snapshot_manager and pending_snapshots.
Fixes #11132.
Closes#11133
* github.com:scylladb/scylladb:
table: reindent write_schema_as_cql
table: coroutinize write_schema_as_cql
table: seal_snapshot: maybe_yield when iterating over the table names
table: reindent seal_snapshot
table: coroutinize seal_snapshot
table: delete unused snapshot_manager and pending_snapshots
table: delete unused snapshot function
table: snapshot_on_all_shards: orchestrate snapshot process
table: snapshot: move pending_snapshots.erase from seal_snapshot
table: finalize_snapshot: take the file sets as a param
table: make seal_snapshot a static member
table: finalize_snapshot: reindent
table: refactor finalize_snapshot out of snapshot
table: snapshot: keep per-shard file sets in snapshot_manager
table: take_snapshot: return foreign unique ptr
table: take_snapshot: maybe yield in per-sstable loop
table: take_snapshot: simplify tables construction code
table: take_snapshot: reindent
table: take_snapshot: simplify error handling
table: refactor take_snapshot out of snapshot
utils: get rid of joinpoint
database: get rid of timestamp_func
database: truncate: snapshot table in all-shards layer
database: truncate: flush table and views in all-shards layer
database: truncate: stop and disable compaction in all-shards layer
database: truncate: move call to set_low_replay_position_mark to all-shards layer
database: truncate: enter per-shard table async_gate in all-shards layer
database: truncate: move check for schema_tables keyspace to all-shards layer.
database: snapshot_table_on_all_shards: reindent
table: add snapshot_on_all_shards
database: add snapshot_table_on_all_shards
database: rename {flush,snapshot}_on_all and make static
database: drop_table_on_all_shards: truncate and stop table in upper layer
database: drop_table_on_all_shards: get all table shards before drop_column_family on each
database: drop_column_family: define table& cf
database: drop_column_family: reuse uuid for evict_all_for_table
database: drop_column_family: move log message up a layer
database: truncate: get rid of the unused ks param
database: add truncate_table_on_all_shards
database: drop_table_on_all_shards: do not accept a truncated_at timestamp_func
database: truncate: get optional snapshot_name from caller
database: truncate: fix assert about replay_position low_mark
database_test: apply_mutation on the correct db shard
Add maybe_yield calls in tight loop, potentially
over thousands of sstable names to prevent reactor stalls.
Although the per-sstable cost is very small, we've experienced
stalls related to printing in O(#sstables) in compaction.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Handle exceptions, making sure the output
stream is properly closed in all cases,
and an intermediate error, if any, is returned as the
final future.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that snapshot orchestration in snapshot_on_all_shards
doesn't use snapshot_manager, get rid of the data structure.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that snapshot orchestration is done solely
in snapshot_on_all_shards, the per-shard
snapshot function can be deleted.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Call take_snapshot on each shard and collect the
returned snapshot_file_sets.
When all are done, move the vector<snapshot_file_set>
to finalize_snapshot.
All that without resorting to the snapshot_manager
or calling table::snapshot.
Both will be deleted in the following patches.
Fixes#11132
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that seal_snapshot doesn't need to look up
the snapshot_manager in pending_snapshots to
get to the file_sets, erasing the snapshot_manager
object can be done in table::snapshot which
also inserted it there.
This will make it easier to get rid of it in a later patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
and pass it to seal_snapshot, so that the latter won't
need to look up and access the snapshot_manager object.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Write schema.cql and the files manifest in finalize_snapshot.
Currently call it from table::snapshot, but it will
be called in a later patch by snapshot_on_all_shards.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
To simplify processing of the per-shard file names
for generating the manifest.
We only need to print them to the manifest at the
end of the process, so there's no point in copying
them around in the process, just move the
foreign unique unordered_set.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Currently, the sstable file names are created
and destroyed on each shard and are copied to the
"coordinator" shard using submit_to, while the
coroutine holds the source on its stack frame.
To prepare for the next patches that refactor this
code so that the coordinator shard will submit_to
each shard to perform `take_snapshot` and return
the set of sstrings in the future result, we need
to wrap the result in a foreign_ptr so it gets
freed on the shard that created it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
There could be thousands of sstables, so we'd better
consider yielding in the tight loop that copies
the sstable names into the unordered_set we return.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Don't catch exceptions; rather, just return
them in the returned future, as they
are handled by the caller.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Move the actual snapshot-taking code into a per-shard
take_snapshot function, to be called from
snapshot_on_all_shards in a following patch.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Pass an optional truncated_at time_point to
truncate_table_on_all_shards instead of the over-complicated
timestamp_func that returns the same time_point on all shards
anyhow, and was only used for coordination across shards.
Since we now synchronize the internal execution phase in
truncate_table_on_all_shards, there is no longer a need
for this timestamp_func.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
With that, the database layer no longer needs
to invoke the private table::snapshot function,
so it can be de-friended from class table.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Start moving the per-shard state establishment logic
to truncate_table_on_all_shards, so that we eventually
do only the truncate logic per se in the per-shard truncate function.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Now that the per-shard truncate function is called
only from truncate_table_on_all_shards, we can reject the schema_tables
keyspace in the upper layer. There's no need to check that on each shard.
While at it, reuse `is_system_keyspace`.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Called from the respective database entry points.
Will be called also from the database drop / truncate path
and will be used for central coordination of per-shard
table::snapshot so we don't have to depend on the snapshot_manager
mechanism that is fragile and currently causes abort if we fail
to allocate it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Truncate the table on all shards, then stop it on all shards,
in the upper layer rather than in the per-shard drop_column_family()
function, so we can further refactor truncate later, flushing
and taking a snapshot on all shards before truncating.
With that, rename drop_column_family to detach_column_family,
as now it only deregisters the column family from containers
that refer to it (even via its uuid), and its caller
is then responsible for taking it from there.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
cf->schema()->id() is the same one returned
by find_uuid(ks_name, cf_name);
As a follow up, we should define a concrete
table_id type and rename schema::id() to schema::table_id()
to return it.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Print once on "coordinator" shard.
And promote to info level as it's important to log
when we're dropping a table (and if we're going to take a snapshot).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
timestamp_func
Since in the drop_table case we want to discard ALL
sstables in the table, not only those with `max_data_age()`
up until drop started.
Fixes#11232
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Before we change drop_table_on_all_shards to always
pass db_clock::time_point::max() in the next patch,
let it pass a unique snapshot name, otherwise
the snapshot name will always be based on the constant, max
time_point.
Refs #11232
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This assert was tweaked several times:
Introduced in 83323e155e,
then fixed in b2b1a1f7e1 to account
for no rp from discard_sstables, then in
9620755c7f to account for
cases we do not flush the table, then again in
71c5dc82df to make that more accurate.
But the assert wasn't correct in the first place,
in the sense that we first get `low_mark`, which
represents the highest replay_position at the time truncate
was called; but then we call discard_sstables with a time_point
of `truncated_at` that we get from the caller via the timestamp_func,
and that one could be in the past, before truncate was called.
Hence discard_sstables with that timestamp may very well
return a replay_position from older sstables, prior to the flush,
that can be smaller than the low_mark.
Fix this assert to account for that case.
The real fix for this issue is to have a truncate_tombstone
that will carry an authoritative api::timestamp (#11230).
Fixes #11231
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Following up on 1c26d49fba,
apply mutations on the correct db shard in all test cases
before we define and use database::truncate_table_on_all_shards.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Range tombstones are kept in memory (cache/memtable) in
range_tombstone_list. It keeps them deoverlapped, so applying a range
tombstone which covers many range tombstones will erase existing range
tombstones from the list. This operation needs to be exception-safe,
so range_tombstone_list maintains an undo log. This undo log will
receive a record for each range tombstone which is removed. For
exception safety reasons, before pushing an undo log entry, we reserve
space in the log by calling std::vector::reserve(size() + 1). This is
O(N) where N is the number of undo log entries. Therefore, the whole
application is O(N^2).
This can cause reactor stalls and availability issues when replicas
apply such deletions.
This patch avoids the problem by reserving exponentially increasing
amount of space. Also, to avoid large allocations, switches the
container to chunked_vector.
Fixes #11211
Closes #11215
These are the first commits out of #10815.
It starts by moving pytest logic out of the common `test/conftest.py`
and into `test/topology/conftest.py`, including removing the async
support as it's not used anywhere else.
There's a fix for a bug where tables were left in `RandomTables.tables` after
dropping all of them.
Keyspace creation is moved out of `conftest.py` into `RandomTables` as
it makes more sense and this way topology tests avoid all the
workarounds for old version (topology needs ScyllaDB 5+ for Raft,
anyway).
And a minor fix.
Closes#11210
* github.com:scylladb/scylladb:
test.py: fix type hint for seed in ScyllaServer
test.py: create/drop keyspace in tables helper
test.py: RandomTables clear list when dropping all tables
test.py: move topology conftest logic to its own
test.py: async topology tests auto run with pytest_asyncio
Since all topology tests will use the helper, create the keyspace in the
helper.
Avoid the need to drop all tables per test; just drop the
keyspace.
While there, use blocking CQL execution so it can be used in the
constructor and avoids possible issues with scheduling on cleanup. Also,
creation and drop should happen only once per cluster and no test should
be running changes (either not started or finished).
All topology tests are for Scylla with Raft. So don't use the Cassandra
this_dc workaround as it's unnecessary for Scylla.
Remove return type of random_tables fixture to match other fixtures
everywhere else.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Clear the list of active tables when dropping them.
While there do the list element exchange atomically across active and
removed tables lists.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Move asyncio, Raft checks, and RandomTables to topology test suite's own
conftest file.
While there, use non-async version of pre-checks to avoid unnecessary
complexity (we want async tests, not async setup, for now).
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Async tests and fixtures in the topology directory are expected to run
with pytest_asyncio (not other async frameworks). Force this with auto
mode.
CI has an older pytest_asyncio version lacking pytest_asyncio.fixture.
Auto mode helps avoid the need for it, and tests and fixtures can just
be marked with the regular @pytest.mark.asyncio.
This way tests can run in both older and newer versions of the packages.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
"
There are several helpers in this .cc file that need to get the datacenter
for endpoints. For that they use the global snitch, because there's no other
place to get that data from.
The whole dc/rack info is now moving to topology, so this patch set changes
consistency_level.cc to get the topology. This is done in two ways.
First, the helpers that have a keyspace at hand may get the topology via the
ks's effective_replication_map.
Two difficult cases are db::is_local() and db::count_local_endpoints(),
because both have just an inet_address at hand. Those are turned into
methods of topology itself, and all their callers already mess with
token metadata and can get topology from it.
"
* 'br-consistency-level-over-topology' of https://github.com/xemul/scylla:
consistency_level: Remove is_local() and count_local_endpoints()
storage_proxy: Use topology::local_endpoints_count()
storage_proxy: Use proxy's topology for DC checks
storage_proxy: Keep shared_ptr<proxy> on digest_read_resolver
storage_proxy: Use topology local_dc_filter in its methods
storage_proxy: Mark some digest_read_resolver methods private
forwarding_service: Use topology local_dc_filter
storage_service: Use topology local_dc_filter
consistency_level: Use topology local_dc_filter
consitency-level: Call count_local_endpoints from topology
consistency_level: Get datacenter from topology
replication_strategy: Remove hold snitch reference
effective_replication_map: Get datacenter from topology
topology: Add local-dc detection shugar
No code uses them now -- switched to use topology -- so these two can be
dropped together with their calls to the global snitch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
A continuation of the previous patches -- now all the code that needs
this helper has a proxy pointer at hand.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Several proxy helper classes need to filter endpoints by datacenter.
Since they now have a shared_ptr<proxy> on board, they can get topology
via the proxy's token metadata.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It will be needed to get token metadata from the proxy. The resolver in
question is created and maintained by abstract_read_executor, which
already has a shared_ptr<proxy>, so it just passes along a copy.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The proxy has token metadata pointer, so it can use its topology
reference to filter endpoints by datacenter
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The service needs to filter out non-local endpoints. It
carries a token metadata pointer and can get topology from it to
fulfill this goal.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The storage-service API calls use db::is_local() helper to filter out
tokens from non-local datacenter. In all those places topology is
available from the token metadata pointer
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Similar to previous patch, in those places with keyspace object at
hand the topology can be obtained from ks' replication map
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
In some of db/consistency_level.cc helpers the topology can be
obtained from keyspace's effective replication map
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When the strategy is constructed there's no place to get a snitch from,
so the global instance is used. However, after the previous patch the
replication strategy no longer needs the snitch, so this dependency can
be dropped.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now it gets it from snitch, but the dc/rack info is being relocated
onto topology. The topology is in turn already there
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Fixes#11184
Not including it here can cause our estimate of "delete or not" after replay
to be skewed in favour of retaining segments as (new) recycles (or even flip
a counter), and with repeated crash+restarts we could be accumulating
an effectively ever-increasing segment footprint.
Closes#11205
Introduce a `remote` class that handles all remote communication in `storage_proxy`: sending and receiving RPCs, checking the state of other nodes by accessing the gossiper, and fetching schema.
The `remote` object lives inside `storage_proxy` and right now it's initialized and destroyed together with `storage_proxy`.
The long game here is to split the initialization of `storage_proxy` into two steps:
- the first step, which constructs `storage_proxy`, initializes it "locally" and does not require references to `messaging_service` and `gossiper`.
- the second step will take those references and add the `remote` part to `storage_proxy`.
This will allow us to remove some cycles from the service (de)initialization order and in general clean it up a bit. We'll be able to start `storage_proxy` right after the `database` (without messaging/gossiper). Similar refactors are planned for `query_processor`.
Closes#11088
* github.com:scylladb/scylladb:
service: storage_proxy: pass `migration_manager*` to `init_messaging_service`
service: storage_proxy: `remote`: make `_gossiper` a const reference
gms: gossiper: mark some member functions const
db: consistency_level: `filter_for_query`: take `const gossiper&`
replica: table: `get_hit_rate`: take `const gossiper&`
gms: gossiper: move `endpoint_filter` to `storage_proxy` module
service: storage_proxy: pass `shared_ptr<gossiper>` to `start_hints_manager`
service: storage_proxy: establish private section in `remote`
service: storage_proxy: remove `migration_manager` pointer
service: storage_proxy: remove calls to `storage_proxy::remote()` from `remote`
service: storage_proxy: remove `_gossiper` field
alternator: ttl: pass `gossiper&` to `expiration_service`
service: storage_proxy: move `truncate_blocking` implementation to `remote`
service: storage_proxy: introduce `is_alive` helper
service: storage_proxy: remove `_messaging` reference
service: storage_proxy: move `connection_dropped` to `remote`
service: storage_proxy: make `encode_replica_exception_for_rpc` a static function
service: storage_proxy: move `handle_write` to `remote`
service: storage_proxy: move `handle_paxos_prune` to `remote`
service: storage_proxy: move `handle_paxos_accept` to `remote`
service: storage_proxy: move `handle_paxos_prepare` to `remote`
service: storage_proxy: move `handle_truncate` to `remote`
service: storage_proxy: move `handle_read_digest` to `remote`
service: storage_proxy: move `handle_read_mutation_data` to `remote`
service: storage_proxy: move `handle_read_data` to `remote`
service: storage_proxy: move `handle_mutation_failed` to `remote`
service: storage_proxy: move `handle_mutation_done` to `remote`
service: storage_proxy: move `handle_paxos_learn` to `remote`
service: storage_proxy: move `receive_mutation_handler` to `remote`
service: storage_proxy: move `handle_counter_mutation` to `remote`
service: storage_proxy: remove `get_local_shared_storage_proxy`
service: storage_proxy: (de)register RPC handlers in `remote`
service: storage_proxy: introduce `remote`
Previously, if pytest itself failed (e.g. bad import or unexpected
parameter), there was no output file but test.py tried to copy it and
failed.
Change the logic of handling the output file to first check whether the
file is there. Then, if it's worth keeping, *move* it to the test
directory for easier comparison and maintenance. Else, if it's not worth
keeping, discard it.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Closes#11193
When update_streams_description() fails, it spawns a fiber and retries
the update in the background once every 60s. If the sleep between
attempts is aborted, the resulting exceptional future ends up being
ignored and warned about in the logs.
fixes: #11192
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220802132148.20688-1-xemul@scylladb.com>
`migration_manager` lifetime is longer than the lifetime of "storage
proxy's messaging service part" - that is, `init_messaging_service` is
called after `migration_manager` is started, and `uninit_messaging_service`
is called before `migration_manager` is stopped. Thus we don't need to
hold an owning pointer to `migration_manager` here.
Later, when `init_messaging_service` will actually construct `remote`,
this will be a reference, not a pointer.
Also observe that `_mm` in `remote` is only used in handlers, and
handlers are unregistered before `_mm` is nullified, which ensures that
handlers are not running when `_mm` is nullified. (This argument shows
why the code made sense regardless of our switch from shared_ptr to raw
ptr).
The function only uses one public function of `gossiper` (`is_alive`)
and is used only in one place in `storage_proxy`.
Make it a static function private to the `storage_proxy` module.
The function used a `default_random_engine` field in `gossiper` for
generating random numbers. Turn this field into a static `thread_local`
variable inside the function - no other `gossiper` members used the
field.
Access `gossiper` through `_remote`.
Later, all those accesses will handle missing `remote`.
Note that there are also accesses through the `remote()` internal getter.
The plan is as follows:
- direct accesses through `_remote` will be modified to handle missing
`_remote` (these won't cause an error)
- `remote()` will throw if `_remote` is missing (`remote()` is only used
for operations which actually need to send a message to a remote node).
The truncate operation always truncates a table on the entire cluster,
even for local tables. And it always does it by sending RPCs (the node
sends an RPC to itself too). Thus it fits in the remote class.
If we want to add a possibility to "truncate locally only" and/or change
the behavior for local tables, we can add a branch in
`storage_proxy::truncate_blocking`.
Refs: #11087
A helper is introduced both in `remote` and in `storage_proxy`.
The `storage_proxy` one calls the `remote` one. In the future it will
also handle a missing `remote`. Then it will report only the local node
to be alive and other nodes dead while `remote` is missing.
The change reduces the number of functions using the `_gossiper` field
in `storage_proxy`.
"
The helper is in charge of receiving INTERNAL_IP app state from
gossiper join/change notifications, updating system.peers with it
and kicking messaging service to update its preferred ip cache
along with initiating clients reconnection.
Effectively this helper duplicates the topology tracking code in
storage-service notifiers. Removing it makes less code and drops
a bunch of unwanted cross-components dependencies, in particular:
- one qctx call is gone
- snitch (almost) no longer needs to get messaging from gossiper
- public:private IP cache becomes local to messaging and can be
moved to topology at low cost
A nice minor side effect: this helper was never unsubscribed
from the gossiper on stop or snitch rename. Now it's all gone.
"
* 'br-remove-reconnectible-snitch-helper-2' of https://github.com/xemul/scylla:
snitch: Remove reconnectable snitch helper
snitch, storage_service: Move reconnect to internal_ip kick
snitch, storage_service: Move system.peers preferred_ip update
snitch: Export prefer-local
Previously, the `system.local`'s `rpc_address` column kept local node's
`rpc_address` from the scylla.yaml configuration. Although it sounds
like it makes sense, there are a few reasons to change it to the value
of scylla.yaml's `broadcast_rpc_address`:
- The `broadcast_rpc_address` is the address that the drivers are
supposed to connect to. `rpc_address` is the address that the node
binds to - it can be set for example to 0.0.0.0 so that Scylla listens
on all addresses, however this gives no useful information to the
driver.
- The `system.peers` table also has the `rpc_address` column and it
already keeps other nodes' `broadcast_rpc_address`es.
- Cassandra is going to do the same change in the upcoming version 4.1.
Fixes: #11201
Its remaining uses are trivial to remove.
Note: in `handle_counter_mutation` we had this piece of code:
```
}).then([trace_state_ptr = std::move(trace_state_ptr), &mutations, cl, timeout] {
auto sp = get_local_shared_storage_proxy();
return sp->mutate_counters_on_leader(...);
```
Obtaining a `shared_ptr` to `storage_proxy` at this point is
no different from obtaining a regular pointer:
- The pointer is obtained inside `then` lambda body, not in the capture
list. So if the goal of obtaining a `shared_ptr` here was to keep
`storage_proxy` alive until the `then` lambda body is executed, that
goal wasn't achieved because the pointer was obtained too late.
- The `shared_ptr` is destroyed as soon as `mutate_counters_on_leader`
returns, it's not stored anywhere. So it doesn't prolong the lifetime
of the service.
I replaced this with a simple capture of `this` in the lambda.
It's often needed to check whether an endpoint sits in the same DC as the
current node. That can be done with
topo.get_datacenter() == topo.get_datacenter(endpoint)
but in some cases a ready-made filter function can be helpful.
Also, there's a db::count_local_endpoints() that is surprisingly in use,
so add it to topology as well. The next patches will make use of both.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The same thing as in previous patch -- when gossiper issues
on_join/_change notification, storage service can kick messaging
service to update its internal_ip cache and reconnect to the peer.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Currently the INTERNAL_IP state is updated using the reconnectable helper
by subscribing to on_join/on_change events from the gossiper. The same
subscription exists in the storage service (it's a bit more elaborate,
checking whether the node is part of the ring, which is OK).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The boolean bit says whether "the system" should prefer connecting to
the address gossiped around via INTERNAL_IP. Currently only the gossiping
property file snitch allows tuning it, and the ec2-multiregion snitch
prefers the internal IP unconditionally.
So exporting consists of 2 pieces:
- add a prefer_local() snitch method that's false by default, or returns
the (existing) _prefer_local bit for production_snitch_base
- set _prefer_local to true in the ec2-multiregion snitch
While at it, _prefer_local is moved to production_snitch_base for
uniformity with the new prefer_local() call.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
-      "summary":"Removes token (and all data associated with endpoint that had it) from the ring",
+      "summary":"Removes a node from the cluster. Replicated data that logically belonged to this node is redistributed among the remaining nodes.",
       "type":"void",
       "nickname":"remove_node",
       "produces":[
@@ -1245,7 +1245,7 @@
       },
       {
          "name":"ignore_nodes",
-         "description":"List of dead nodes to ingore in removenode operation",
+         "description":"Comma-separated list of dead nodes to ignore in removenode operation. Use the same method for all nodes to ignore: either Host IDs or ip addresses.",
          "required":false,
          "allowMultiple":false,
          "type":"string",
@@ -1946,7 +1946,7 @@
       "operations":[
          {
             "method":"POST",
-            "summary":"Reset local schema",
+            "summary":"Forces this node to recalculate versions of schema objects.",
_c.log_debug("Splitting large partition {} in order to respect SSTable size limit of {}", *_current_partition.dk, pretty_printed_data_size(_c._max_sstable_size));
// Close partition in current writer, and open it again in a new writer.
do_consume_end_of_partition();
stop_current_writer();
do_consume_new_partition(*_current_partition.dk);
// Replicate partition tombstone to every fragment, allowing the SSTable run reader
-leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so compacting everything on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
// Unfortunately no good limit to limit input size to max_sstables for LCS major
+leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
help="Build modes to generate ninja files for. The available build modes are:\n{}".format("; ".join(["{} - {}".format(m, cfg['description']) for m, cfg in modes.items()])))
-        throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
}
}
}
@@ -123,7 +123,7 @@ usertype_constructor_prepare_expression(const usertype_constructor& u, data_dict
-        throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));