Compare commits

...

1516 Commits

Author SHA1 Message Date
Anna Stuchlik
2ccd51844c doc: remove wrong image upgrade info (5.2-to-2023.1)
This commit removes the information about the recommended way of upgrading
ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade
procedure is not supported (it was implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733

Closes scylladb/scylladb#21876
Fixes https://github.com/scylladb/scylla-enterprise/issues/5041
Fixes https://github.com/scylladb/scylladb/issues/21898

(cherry picked from commit 98860905d8)
2024-12-12 15:28:20 +02:00
Lakshmi Narayanan Sreethar
705ec24977 db/config.cc: increment components_memory_reclaim_threshold config default
Incremented the components_memory_reclaim_threshold config's default
value to 0.2 as the previous value was too strict and caused unnecessary
eviction in otherwise healthy clusters.

Fixes #18607

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 3d7d1fa72a)

Closes #19011
2024-06-04 07:13:28 +03:00
Botond Dénes
e89eb41e70 Merge '[Backport 5.2]: Reload reclaimed bloom filters when memory is available' from Lakshmi Narayanan Sreethar
PR https://github.com/scylladb/scylladb/pull/17771 introduced a threshold for the total memory used by all bloom filters across SSTables. When the total usage surpasses the threshold, the largest bloom filter will be removed from memory, bringing the total usage back under the threshold. This PR adds support for reloading such reclaimed bloom filters back into memory when memory becomes available (i.e., within the 10% of available memory earmarked for the reclaimable components).

The SSTables manager now maintains a list of all SSTables whose bloom filter was removed from memory and attempts to reload them when an SSTable, whose bloom filter is still in memory, gets deleted. The manager reloads from the smallest to the largest bloom filter to maximize the number of filters being reloaded into memory.
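The smallest-to-largest reload order described above can be sketched roughly as follows; the `reclaimed_filter` type, byte sizes, and budget handling are illustrative stand-ins, not the actual ScyllaDB types:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <set>

// Illustrative stand-in for the manager's list of SSTables whose bloom
// filter was reclaimed, kept sorted by filter size (smallest first).
struct reclaimed_filter {
    uint64_t size_bytes;
    bool operator<(const reclaimed_filter& o) const { return size_bytes < o.size_bytes; }
};

// Reload filters smallest-to-largest while they fit in the freed budget;
// returns how many filters were brought back into memory.
inline size_t reload_reclaimed(std::multiset<reclaimed_filter>& candidates,
                               uint64_t available_bytes) {
    size_t reloaded = 0;
    for (auto it = candidates.begin(); it != candidates.end();) {
        if (it->size_bytes > available_bytes) {
            break; // everything after this is larger; nothing else fits
        }
        available_bytes -= it->size_bytes;
        it = candidates.erase(it); // filter is back in memory
        ++reloaded;
    }
    return reloaded;
}
```

Reloading smallest-first maximizes the count of filters restored for a given freed budget, which is the goal stated above.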

Backported from https://github.com/scylladb/scylladb/pull/18186 to 5.2.

Closes #18666

* github.com:scylladb/scylladb:
  sstable_datafile_test: add testcase to test reclaim during reload
  sstable_datafile_test: add test to verify auto reload of reclaimed components
  sstables_manager: reload previously reclaimed components when memory is available
  sstables_manager: start a fiber to reload components
  sstable_directory_test: fix generation in sstable_directory_test_table_scan_incomplete_sstables
  sstable_datafile_test: add test to verify reclaimed components reload
  sstables: support reloading reclaimed components
  sstables_manager: add new intrusive set to track the reclaimed sstables
  sstable: add link and comparator class to support new intrusive set
  sstable: renamed intrusive list link type
  sstable: track memory reclaimed from components per sstable
  sstable: rename local variable in sstable::total_reclaimable_memory_size
2024-05-30 11:11:39 +03:00
Kefu Chai
45814c7f14 docs: fix typos in upgrade document
s/Montioring/Monitoring/

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit f1f3f009e7)

Closes #18910
2024-05-30 11:10:49 +03:00
Botond Dénes
331e0c4ca7 Merge '[Backport 5.2] mutation_fragment_stream_validating_filter: respect validating_level::none' from ScyllaDB
Even when configured to not do any validation at all, the validator still did some. This small series fixes this, and adds a test to check that validation levels in general are respected, and the validator doesn't validate more than it is asked to.
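The fix amounts to an early return before any checks run; a minimal sketch (the enum and member names are illustrative, not ScyllaDB's actual declarations):

```cpp
#include <cassert>

// Illustrative sketch of the fix: with level none, the catch-all
// operator() returns immediately, so no per-fragment check ever runs.
enum class validation_level_sketch { none, token, partition_key, clustering_key };

struct validating_filter_sketch {
    validation_level_sketch level;
    int checks_run = 0;

    bool operator()(/* fragment elided */) {
        if (level == validation_level_sketch::none) {
            return true; // short-circuit: asked for no validation, do none
        }
        ++checks_run; // stand-in for the real per-fragment checks
        return true;
    }
};
```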

Fixes: #18662

(cherry picked from commit f6511ca1b0)

(cherry picked from commit e7b07692b6)

(cherry picked from commit 78afb3644c)

 Refs #18667

Closes #18723

* github.com:scylladb/scylladb:
  test/boost/mutation_fragment_test.cc: add test for validator validation levels
  mutation: mutation_fragment_stream_validating_filter: fix validation_level::none
  mutation: mutation_fragment_stream_validating_filter: add raises_error ctor parameter
2024-05-27 08:52:06 +03:00
Alexey Novikov
32be38dae5 make timestamp string format cassandra compatible
When we convert a timestamp into a string, it must look like: '2017-12-27T11:57:42.500Z'.
This concerns any conversion except the JSON timestamp format.
The JSON string has a space as the time separator and must look like: '2017-12-27 11:57:42.500Z'.
Both formats always contain milliseconds and a timezone specification.
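A simplified illustration of the two formats, using POSIX gmtime_r and snprintf (the real conversion code differs):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <string>

// Format an epoch-milliseconds timestamp as described above: 'T'
// separator for the general case, ' ' for the JSON case; both always
// carry milliseconds and the 'Z' UTC suffix.
inline std::string format_timestamp(int64_t epoch_ms, bool json) {
    std::time_t secs = epoch_ms / 1000;
    int ms = static_cast<int>(epoch_ms % 1000);
    std::tm tm{};
    gmtime_r(&secs, &tm); // UTC broken-down time (POSIX)
    char buf[32];
    std::snprintf(buf, sizeof(buf), "%04d-%02d-%02d%c%02d:%02d:%02d.%03dZ",
                  tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
                  json ? ' ' : 'T',
                  tm.tm_hour, tm.tm_min, tm.tm_sec, ms);
    return buf;
}
```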

Fixes #14518
Fixes #7997

Closes #14726
Fixes #16575

(cherry picked from commit ff721ec3e3)

Closes #18852
2024-05-26 16:30:06 +03:00
Botond Dénes
3dacf6a4b1 test/boost/mutation_fragment_test.cc: add test for validator validation levels
To make sure that the validator doesn't validate what the validation
level doesn't include.

(cherry picked from commit 78afb3644c)
2024-05-24 03:36:28 -04:00
Botond Dénes
3d360c7caf mutation: mutation_fragment_stream_validating_filter: fix validation_level::none
Despite its name, this validation level still did some validation. Fix
this by short-circuiting the catch-all operator(), preventing any
validation when the user asked for none.

(cherry picked from commit e7b07692b6)
2024-05-24 03:34:05 -04:00
Botond Dénes
f7a3091734 mutation: mutation_fragment_stream_validating_filter: add raises_error ctor parameter
When set to false, no exceptions will be raised from the validator on
validation error. Instead, it will just return false from the respective
validator methods. This makes testing simpler, as asserting exceptions
is clunky.
When true (default), the previous behaviour will remain: any validation
error will invoke on_internal_error(), resulting in either std::abort()
or an exception.

Backporting notes:
* Added a const mutation_fragment_stream_validating_filter&
  param to on_validation_error()
* Made full_name() public

(cherry picked from commit f6511ca1b0)
2024-05-24 03:33:10 -04:00
Botond Dénes
6f0d32a42f Merge '[Backport 5.2] utils: chunked_vector: fill ctor: make exception safe' from ScyllaDB
Currently, if the fill ctor throws an exception,
the destructor won't be called, as the object is not fully constructed yet.

Call the default ctor first (which doesn't throw)
to make sure the destructor will be called on exception.
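The pattern behind the fix can be sketched with a toy container; `fill_vector` and `counted` are illustrative, not chunked_vector itself:

```cpp
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <vector>

int live_elements = 0;

// Element type whose third construction fails, to simulate an exception
// thrown half-way through filling.
struct counted {
    counted() {
        if (live_elements == 2) {
            throw std::runtime_error("boom");
        }
        ++live_elements;
    }
    ~counted() { --live_elements; }
};

template <typename T>
struct fill_vector {
    std::vector<T*> chunks;
    fill_vector() noexcept = default; // never throws
    // Delegating to the default ctor means the object counts as fully
    // constructed, so ~fill_vector() runs on exception and releases the
    // elements constructed so far. Without the delegation, only the
    // member destructors would run and the elements would leak.
    explicit fill_vector(size_t n) : fill_vector() {
        chunks.reserve(n);
        for (size_t i = 0; i < n; ++i) {
            chunks.push_back(new T());
        }
    }
    ~fill_vector() {
        for (T* p : chunks) { delete p; }
    }
};
```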

Fixes scylladb/scylladb#18635

- [x] Although the fix is for a rare bug, it has very low risk, so it's worth backporting to all live versions

(cherry picked from commit 64c51cf32c)

(cherry picked from commit 88b3173d03)

(cherry picked from commit 4bbb66f805)

 Refs #18636

Closes #18680

* github.com:scylladb/scylladb:
  chunked_vector_test: add more exception safety tests
  chunked_vector_test: exception_safe_class: count also moved objects
  utils: chunked_vector: fill ctor: make exception safe
2024-05-21 16:30:23 +03:00
Benny Halevy
d947f1e275 chunked_vector_test: add more exception safety tests
For insertion, with and without reservation,
and for fill and copy constructors.

Reproduces https://github.com/scylladb/scylladb/issues/18635

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2024-05-21 11:33:42 +03:00
Benny Halevy
d727382cc1 chunked_vector_test: exception_safe_class: count also moved objects
We have to account for moved objects as well
as copied objects so that they are balanced with
the respective `del_live_object` calls made
by the destructor.

However, since chunked_vector requires the
value_type to be nothrow_move_constructible,
just count the additional live object, but
do not modify _countdown or, respectively, throw
an exception, as this should be considered only
for the default and copy constructors.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2024-05-21 11:33:42 +03:00
Benny Halevy
15a090f711 utils: chunked_vector: fill ctor: make exception safe
Currently, if the fill ctor throws an exception,
the destructor won't be called, as the object is not
fully constructed yet.

Call the default ctor first (which doesn't throw)
to make sure the destructor will be called on exception.

Fixes scylladb/scylladb#18635

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2024-05-21 11:33:42 +03:00
Yaron Kaikov
2fc86cc241 release: prepare for 5.2.19 2024-05-19 16:25:28 +03:00
Lakshmi Narayanan Sreethar
d4c523e9ef sstable_datafile_test: add testcase to test reclaim during reload
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 4d22c4b68b)
2024-05-14 19:20:06 +05:30
Lakshmi Narayanan Sreethar
b505ce4897 sstable_datafile_test: add test to verify auto reload of reclaimed components
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit a080daaa94)
2024-05-14 19:20:06 +05:30
Lakshmi Narayanan Sreethar
80861e3bce sstables_manager: reload previously reclaimed components when memory is available
When an SSTable is dropped, the associated bloom filter gets discarded
from memory, bringing down the total memory consumption of bloom
filters. Any bloom filter that was previously reclaimed from memory due
to the total usage crossing the threshold, can now be reloaded back into
memory if the total usage can still stay below the threshold. Added
support to reload such reclaimed filters back into memory when memory
becomes available.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 0b061194a7)
2024-05-14 19:20:02 +05:30
Lakshmi Narayanan Sreethar
9004c9ee38 sstables_manager: start a fiber to reload components
Start a fiber that gets notified whenever an sstable gets deleted. The
fiber doesn't do anything yet but the following patch will add support
to reload reclaimed components if there is sufficient memory.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit f758d7b114)
2024-05-14 19:19:22 +05:30
Lakshmi Narayanan Sreethar
72494af137 sstable_directory_test: fix generation in sstable_directory_test_table_scan_incomplete_sstables
The testcase uses an sstable whose mutation key and the generation are
owned by different shards. Due to this, when process_sstable_dir is
called, the sstable gets loaded into a different shard than the one that
was intended. This also means that the sstable and the sstable manager
end up in different shards.

The following patch will introduce a condition variable in the sstables
manager which will be signalled from the sstables. If the sstable and
the sstable manager are in different shards, the signalling will cause
the testcase to fail in debug mode with this error: "Promise task was
set on shard x but made ready on shard y". So, fix it by supplying a
generation number owned by the same shard that owns the mutation key.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 24064064e9)
2024-05-14 19:19:22 +05:30
Lakshmi Narayanan Sreethar
c15e72695d sstable_datafile_test: add test to verify reclaimed components reload
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 69b2a127b0)
2024-05-14 19:19:18 +05:30
Lakshmi Narayanan Sreethar
83dd78fb9d sstables: support reloading reclaimed components
Added support to reload components from which memory was previously
reclaimed as the total memory of reclaimable components crossed a
threshold. The implementation is kept simple as only the bloom filters
are considered reclaimable for now.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 54bb03cff8)
2024-05-14 19:17:03 +05:30
Lakshmi Narayanan Sreethar
62338d3ad0 compaction: improve partition estimates for garbage collected sstables
When a compaction strategy uses garbage collected sstables to track
expired tombstones, do not use complete partition estimates for them;
instead, use a fraction of the estimate based on the droppable
tombstone ratio.
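The arithmetic behind the change, in illustrative form (the function name is hypothetical, not the actual compaction code):

```cpp
#include <cassert>
#include <cstdint>

// A garbage-collected sstable's partition estimate is scaled by the
// droppable-tombstone ratio rather than taken whole.
inline uint64_t gc_partition_estimate(uint64_t full_estimate, double droppable_tombstone_ratio) {
    return static_cast<uint64_t>(full_estimate * droppable_tombstone_ratio);
}
```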

Fixes #18283

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#18465

(cherry picked from commit d39adf6438)

Closes #18659
2024-05-14 15:42:12 +03:00
Lakshmi Narayanan Sreethar
1bd6584478 sstables_manager: add new intrusive set to track the reclaimed sstables
The new set holds the sstables from which memory has been reclaimed
and is sorted in ascending order of the total memory reclaimed.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 2340ab63c6)
2024-05-14 01:46:36 +05:30
Lakshmi Narayanan Sreethar
19f3e42583 sstable: add link and comparator class to support new intrusive set
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 140d8871e1)
2024-05-14 01:46:17 +05:30
Lakshmi Narayanan Sreethar
bb9ceae2c3 sstable: renamed intrusive list link type
Renamed the intrusive list link type to differentiate it from the set
link type that will be added in an upcoming patch.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 3ef2f79d14)
2024-05-14 01:45:27 +05:30
Lakshmi Narayanan Sreethar
fa154a8d00 sstable: track memory reclaimed from components per sstable
Added a member variable _total_memory_reclaimed to the sstable class
that tracks the total memory reclaimed from an sstable.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 02d272fdb3)
2024-05-14 01:45:20 +05:30
Lakshmi Narayanan Sreethar
a9101f14f6 sstable: rename local variable in sstable::total_reclaimable_memory_size
Renamed local variable in sstable::total_reclaimable_memory_size in
preparation for the next patch which adds a new member variable
_total_memory_reclaimed to the sstable class.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit a53af1f878)
2024-05-14 01:45:13 +05:30
Kamil Braun
b68c06cc3a direct_failure_detector: increase ping timeout and make it tunable
The direct failure detector design is simplistic. It sends pings
sequentially and times out listeners that reached the threshold (i.e.
didn't hear from a given endpoint for too long) in-between pings.

Given the sequential nature, the previous ping must finish so the next
ping can start. We timeout pings that take too long. The timeout was
hardcoded and set to 300ms. This is too low for wide-area setups --
latencies across the Earth can indeed go up to 300ms. 3 subsequent timed
out pings to a given node were sufficient for the Raft listener to "mark
server as down" (the listener used a threshold of 1s).

Increase the ping timeout to 600ms which should be enough even for
pinging the opposite side of Earth, and make it tunable.

Increase the Raft listener threshold from 1s to 2s. Without the
increased threshold, one timed out ping would be enough to mark the
server as down. Increasing it to 2s requires 3 timed out pings which
makes it more robust in presence of transient network hiccups.

In the future we'll most likely want to decrease the Raft listener
threshold again, if we use Raft for data path -- so leader elections
start quickly after leader failures. (Faster than 2s). To do that we'll
have to improve the design of the direct failure detector.
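A back-of-the-envelope model for the numbers above, ignoring the interval between pings: with sequential pings that each take up to the ping timeout when they fail, integer division gives how many consecutive timeouts fit before the listener threshold elapses.

```cpp
#include <cassert>
#include <chrono>

using namespace std::chrono;

// Simplified model: each failed ping consumes the full ping timeout, so
// listener_threshold / ping_timeout is the number of consecutive
// timed-out pings needed before a node is marked down.
constexpr auto pings_before_mark_down(milliseconds ping_timeout, milliseconds listener_threshold) {
    return listener_threshold / ping_timeout;
}
```

This reproduces the commit's reasoning: the old tuning needed 3 timed-out pings, the new timeout with the old threshold would have needed only one, and raising the threshold to 2s restores the requirement of 3.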

Ref: scylladb/scylladb#16410
Fixes: scylladb/scylladb#16607

---

I tested the change manually using `tc qdisc ... netem delay`, setting
network delay on local setup to ~300ms with jitter. Without the change,
the result is as observed in scylladb/scylladb#16410: interleaving
```
raft_group_registry - marking Raft server ... as dead for Raft groups
raft_group_registry - marking Raft server ... as alive for Raft groups
```
happening once every few seconds. The "marking as dead" happens whenever
we get 3 subsequent failed pings, which happens with a certain (high)
probability depending on the latency jitter. Then, as soon as we get a
successful ping, we mark the server as alive again.

With the change, the phenomenon no longer appears.

(cherry picked from commit 8df6d10e88)

Closes #18558
2024-05-08 15:46:59 +02:00
Pavel Emelyanov
1cb959fc84 Update seastar submodule (iotune iodepth underflow fix)
* seastar b9fd21d8...5ab9a7cf (1):
  > iotune: ignore shards with id above max_iodepth

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-05-06 19:25:41 +03:00
Pavel Emelyanov
5f5acc813a view-builder: Print correct exception in built ste exception handler
Inside a .handle_exception() continuation, std::current_exception() doesn't
work; the exception is available as the handler lambda's std::exception ex
argument instead.

fixes #18423

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#18349

(cherry picked from commit 4ac30e5337)
2024-05-01 10:20:26 +03:00
Anna Stuchlik
f71a687baf doc: run repair after changing RF of system_auth
This commit adds the requirement to run repair after changing
the replication factor of the system_auth keyspace
in the procedure of adding a new node to a cluster.

Refs: https://github.com/scylladb/scylla-enterprise/issues/4129

Closes scylladb/scylladb#18466

(cherry picked from commit d85d37921a)
2024-04-30 19:18:15 +03:00
Asias He
b2858e4028 streaming: Fix use after move in fire_stream_event
The event is used in a loop.

Found by clang-tidy:

```
streaming/stream_result_future.cc:80:49: warning: 'event' used after it was moved [bugprone-use-after-move]
        listener->handle_stream_event(std::move(event));
                                                ^
streaming/stream_result_future.cc:80:39: note: move occurred here
        listener->handle_stream_event(std::move(event));
                                      ^
streaming/stream_result_future.cc:80:49: note: the use happens in a later loop iteration than the move
        listener->handle_stream_event(std::move(event));
                                                ^
```
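A minimal reproduction of this bug class (not ScyllaDB's actual streaming code): moving the same object inside a loop leaves later iterations a moved-from value, and copying per iteration is the fix.

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Deliver one event to several listeners. Copying per iteration is
// correct; writing received.push_back(std::move(event)) here would be
// the bug clang-tidy flagged: every iteration after the first would
// push a moved-from (empty) string.
inline std::vector<std::string> deliver(const std::string& event, size_t listeners) {
    std::vector<std::string> received;
    for (size_t i = 0; i < listeners; ++i) {
        received.push_back(event); // copy each time
    }
    return received;
}
```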

Fixes #18332

(cherry picked from commit 4fd4e6acf3)

Closes #18430
2024-04-30 15:07:43 +02:00
Lakshmi Narayanan Sreethar
0fc0474ccc sstables: reclaim_memory_from_components: do not update _recognised_components
When reclaiming memory from bloom filters, do not remove them from
_recognised_components, as that leads to the on-disk filter component
being left back on disk when the SSTable is deleted.

Fixes #18398

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#18400

(cherry picked from commit 6af2659b57)

Closes #18437
2024-04-29 10:02:52 +03:00
Kefu Chai
119dbb0d43 thrift: avoid use-after-move in make_non_overlapping_ranges()
In handler.cc, `make_non_overlapping_ranges()` references a moved
instance of `ColumnSlice` when something unexpected happens and the
error message for an exception is formatted. The move constructor of
`ColumnSlice` is default-generated, so the members' move constructors
are used to construct the new instance. This could lead to undefined
behavior when dereferencing the moved-from instance.

In this change, in order to avoid the use-after-move, keep a copy of
the referenced member variables and reference them when formatting the
error message in the exception.

This use-after-move issue was introduced in 822a315dfa, which implemented
the `get_multi_slice` verb and this piece in the first place. Since both
5.2 and 5.4 include that commit, we should backport this change to them.

Refs 822a315dfa
Fixes #18356
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 1ad3744edc)

Closes #18373
2024-04-25 11:36:00 +03:00
Anna Mikhlin
dae9bef75f release: prepare for 5.2.18 2024-04-19 13:30:48 +03:00
Asias He
065f7178ab repair: Improve estimated_partitions to reduce memory usage
Currently, we use the sum of the estimated_partitions from each
participant node as the estimated_partitions for sstable produced by
repair. This way, the estimated_partitions is the biggest possible
number of partitions repair would write.

Since repair will write only the difference between repair participant
nodes, using the biggest possible estimation will overestimate the
partitions written by repair, most of the time.

The problem is that overestimated partitions make the bloom filter
consume more memory. This has been observed to cause OOM in the field.

This patch changes the estimation to use a fraction of the average
partitions per node instead of sum. It is still not a perfect estimation
but it already improves memory usage significantly.
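The estimation change above, in sketch form; the `fraction` parameter and the function shape are illustrative, not repair's actual code:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Instead of summing every node's estimate (the worst case), take a
// fraction of the per-node average.
inline uint64_t estimated_partitions(const std::vector<uint64_t>& per_node, double fraction) {
    if (per_node.empty()) {
        return 0;
    }
    uint64_t sum = std::accumulate(per_node.begin(), per_node.end(), uint64_t(0));
    double avg = double(sum) / double(per_node.size());
    return static_cast<uint64_t>(avg * fraction);
}
```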

Fixes #18140

Closes scylladb/scylladb#18141

(cherry picked from commit 642f9a1966)
2024-04-18 16:37:05 +03:00
Botond Dénes
f17e480237 Merge '[Backport 5.2]: Track and limit memory used by bloom filters' from Lakshmi Narayanan Sreethar
Added support to track and limit the memory usage by sstable components. A reclaimable component of an SSTable is one from which memory can be reclaimed. SSTables and their managers now track such reclaimable memory and limit the component memory usage accordingly. A new configuration variable defines the memory reclaim threshold. If the total memory of the reclaimable components exceeds this limit, memory will be reclaimed to keep the usage under the limit. This PR considers only the bloom filters as reclaimable and adds support to track and limit them as required.

The feature can be manually verified by doing the following:

1. run a single-node single-shard 1GB cluster
2. create a table with bloom-filter-false-positive-chance of 0.001 (to intentionally cause large bloom filter)
3. populate with tiny partitions
4. watch the bloom filter metrics get capped at 100MB

The default value of the `components_memory_reclaim_threshold` config variable which controls the reclamation process is `.1`. This can also be reduced further during manual tests to easily hit the threshold and verify the feature.

Fixes https://github.com/scylladb/scylladb/issues/17747

Backported from #17771 to 5.2.

Closes #18247

* github.com:scylladb/scylladb:
  test_bloom_filter.py: disable reclaiming memory from components
  sstable_datafile_test: add tests to verify auto reclamation of components
  test/lib: allow overriding available memory via test_env_config
  sstables_manager: support reclaiming memory from components
  sstables_manager: store available memory size
  sstables_manager: add variable to track component memory usage
  db/config: add a new variable to limit memory used by table components
  sstable_datafile_test: add testcase to verify reclamation from sstables
  sstables: support reclaiming memory from components
2024-04-17 14:34:19 +03:00
Lakshmi Narayanan Sreethar
dd9ab15bb5 test_bloom_filter.py: disable reclaiming memory from components
Disabled reclaiming memory from sstable components in the testcase as it
interferes with the false positive calculation.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit d86505e399)
2024-04-16 15:50:22 +05:30
Lakshmi Narayanan Sreethar
96db5ae5e3 sstable_datafile_test: add tests to verify auto reclamation of components
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit d261f0fbea)
2024-04-16 15:49:58 +05:30
Lakshmi Narayanan Sreethar
beea229deb test/lib: allow overriding available memory via test_env_config
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 169629dd40)
2024-04-16 15:30:39 +05:30
Lakshmi Narayanan Sreethar
89367c4310 sstables_manager: support reclaiming memory from components
Reclaim memory from the SSTable that has the most reclaimable memory if
the total reclaimable memory has crossed the threshold. Only the bloom
filter memory is considered reclaimable for now.

Fixes #17747

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit a36965c474)
2024-04-16 15:30:39 +05:30
Lakshmi Narayanan Sreethar
32de41ecb4 sstables_manager: store available memory size
The available memory size is required to calculate the reclaim memory
threshold, so store that within the sstables manager.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 2ca4b0a7a2)
2024-04-16 15:30:39 +05:30
Lakshmi Narayanan Sreethar
0841c0084c sstables_manager: add variable to track component memory usage
sstables_manager::_total_reclaimable_memory variable tracks the total
memory that is reclaimable from all the SSTables managed by it.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit f05bb4ba36)
2024-04-16 15:30:39 +05:30
Lakshmi Narayanan Sreethar
786c08aa59 db/config: add a new variable to limit memory used by table components
A new configuration variable, components_memory_reclaim_threshold, has
been added to configure the maximum allowed percentage of available
memory for all SSTable components in a shard. If the total memory usage
exceeds this threshold, it will be reclaimed from the components to
bring it back under the limit. Currently, only the memory used by the
bloom filters will be restricted.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit e8026197d2)
2024-04-16 15:30:39 +05:30
Lakshmi Narayanan Sreethar
31251b37dd sstable_datafile_test: add testcase to verify reclamation from sstables
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit e0b6186d16)
2024-04-16 15:30:30 +05:30
Lakshmi Narayanan Sreethar
1b390ceb24 sstables: support reclaiming memory from components
Added support to track total memory from components that are reclaimable
and to reclaim memory from them if and when required. Right now only the
bloom filters are considered as reclaimable components but this can be
extended to any component in the future.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 4f0aee62d1)
2024-04-16 13:03:45 +05:30
Tzach Livyatan
0bfe016beb Update Driver root page
The right term is Amazon DynamoDB not AWS DynamoDB
See https://aws.amazon.com/dynamodb/

Closes scylladb/scylladb#18214

(cherry picked from commit 289793d964)
2024-04-16 09:55:41 +03:00
Botond Dénes
280956f507 Merge '[Backport 5.2] repair: fix memory counting in repair' from Aleksandra Martyniuk
The repair memory limit includes only the size of the frozen mutation
fragments in a repair row. The size of the other members of a repair
row may grow uncontrollably and cause an out-of-memory condition.

Modify what is counted toward the repair memory limit.

Fixes: https://github.com/scylladb/scylladb/issues/16710.

(cherry picked from commit a4dc6553ab)

(cherry picked from commit 51c09a84cc)

Refs https://github.com/scylladb/scylladb/pull/17785

Closes #18237

* github.com:scylladb/scylladb:
  test: add test for repair_row::size()
  repair: fix memory accounting in repair_row
2024-04-16 07:07:15 +03:00
Aleksandra Martyniuk
97671eb935 test: add test for repair_row::size()
Add a test which checks whether repair_row::size() considers external
memory.

(cherry picked from commit 51c09a84cc)
2024-04-09 13:29:33 +02:00
Aleksandra Martyniuk
8144134545 repair: fix memory accounting in repair_row
In repair, only the size of the frozen mutation fragments of a repair
row is counted toward the memory limit. So, huge keys of repair rows
may lead to OOM.

Include the other repair_row members' memory size in the repair memory
limit.
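The accounting fix in sketch form; the member names are illustrative, not the actual repair_row layout:

```cpp
#include <cassert>
#include <cstddef>
#include <string>

// Before the fix, only the frozen fragment counted toward the memory
// limit; after it, the other members (e.g. a possibly huge key) and the
// object itself count as well.
struct repair_row_sketch {
    std::string frozen_fragment; // stand-in for the serialized fragment
    std::string key;             // stand-in for the (possibly huge) key

    size_t size_before_fix() const { return frozen_fragment.size(); }
    size_t size_after_fix() const {
        return frozen_fragment.size() + key.size() + sizeof(*this);
    }
};
```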

(cherry picked from commit a4dc6553ab)
2024-04-06 22:44:51 +00:00
Ferenc Szili
2bb5fe7311 logging: Don't log PK/CK in large partition/row/cell warning
Currently, Scylla logs a warning when it writes a cell, row or partition that is larger than a certain configured size. These warnings contain the partition key and, in the case of rows and cells, also the clustering key, which allows the large row or partition to be identified. However, these keys can contain user-private, sensitive information. The information which identifies the partition/row/cell is also inserted into the tables system.large_partitions, system.large_rows and system.large_cells respectively.

This change removes the partition and cluster keys from the log messages, but still inserts them into the system tables.

The logged data will look like this:

Large cells:
WARN  2024-04-02 16:49:48,602 [shard 3:  mt] large_data - Writing large cell ks_name/tbl_name: cell_name (SIZE bytes) to sstable.db

Large rows:
WARN  2024-04-02 16:49:48,602 [shard 3:  mt] large_data - Writing large row ks_name/tbl_name: (SIZE bytes) to sstable.db

Large partitions:
WARN  2024-04-02 16:49:48,602 [shard 3:  mt] large_data - Writing large partition ks_name/tbl_name: (SIZE bytes) to sstable.db

Fixes #18041

Closes scylladb/scylladb#18166

(cherry picked from commit f1cc6252fd)
2024-04-05 16:03:08 +03:00
Kefu Chai
4595f51d5c utils/logalloc: do not allocate memory in reclaim_timer::report()
Before this change, `reclaim_timer::report()` calls

```c++
fmt::format(", at {}", current_backtrace())
```

which allocates a `std::string` on the heap, so it can fail and throw; in
that case, `std::terminate()` is called. But the very reason
`reclaim_timer::report()` gets called is that we failed to reclaim
memory for the caller, so we are especially likely to run into this
issue. Either way, we should not allocate memory on this path.

In this change, a dedicated printer is created so that we don't format
into a temporary `std::string`, and instead write directly to the
logger's buffer. This avoids the memory allocation.
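The shape of the fix, sketched with a caller-provided stack buffer instead of a heap-allocated std::string; the function name and message format are illustrative, not the actual logalloc printer:

```cpp
#include <cassert>
#include <cstdio>
#include <cstring>

// Formatting into a fixed stack buffer cannot fail with bad_alloc, which
// matters on a path that runs precisely because memory is scarce.
inline const char* report_to_buffer(char* buf, size_t len, unsigned reclaims, long bytes) {
    std::snprintf(buf, len, "reclaim: %u runs, %ld bytes freed", reclaims, bytes);
    return buf; // no heap allocation on this path
}
```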

Fixes #18099
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes scylladb/scylladb#18100

(cherry picked from commit fcf7ca5675)
2024-04-02 16:38:17 +03:00
Wojciech Mitros
c0c34d2af0 mv: keep semaphore units alive until the end of a remote view update
When a view update has both a local and remote target endpoint,
it extends the lifetime of its memory tracking semaphore units
only until the end of the local update, while the resources are
actually used until the remote update finishes.
This patch changes how the semaphore units are transferred so that,
when there are both local and remote endpoints, both view updates share
the units, causing them to be released only after the longer-running
update finishes.

Fixes #17890

(cherry picked from commit 9789a3dc7c)

Closes #18104
2024-04-02 10:09:01 +02:00
Pavel Emelyanov
c34a503ef3 Update seastar submodule (iotune error path crash fix)
* seastar eb093f8a...b9fd21d8 (1):
  > iotune: Don't close file that wasn't opened

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-03-28 10:53:51 +03:00
Beni Peled
447a3beb47 release: prepare for 5.2.17 2024-03-27 14:35:37 +02:00
Botond Dénes
aca50c46b7 tools/toolchain: update python driver
Backports scylladb/scylladb#17604 and scylladb/scylladb#17956.

Fixes scylladb/scylladb#16709
Fixes scylladb/scylladb#17353

Closes #17661
2024-03-27 08:48:25 +02:00
Wojciech Mitros
44bcaca929 mv: adjust memory tracking of single view updates within a batch
Currently, when dividing memory tracked for a batch of updates
we do not take into account the overhead that we have for processing
every update. This patch adds the overhead for single updates
and joins the memory calculation path for batches and their parts
so that both use the same overhead.

Fixes #17854

(cherry picked from commit efcb718)

Closes #17999
2024-03-26 09:38:17 +02:00
Botond Dénes
2e2bf79092 Merge '[Backport 5.2] tests: utils: error injection: print time duration instead of count' from ScyllaDB
Before this change, we always cast the wait duration to milliseconds,
even if it could be using a higher resolution. In fact,
`std::chrono::steady_clock` uses nanoseconds for its duration,
so if we inject a deadline using `steady_clock`, we could be woken
earlier due to the narrowing of the duration type caused by the
duration_cast.

In this change, we just use the duration as it is. This should allow
the caller to use the resolution provided by Seastar without losing
precision. The tests are updated to print the time duration
instead of the count, to provide information with a higher resolution.
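The narrowing in one line; the helper is purely illustrative:

```cpp
#include <cassert>
#include <chrono>

using namespace std::chrono;

// duration_cast to milliseconds truncates a finer-grained wait
// duration, so a deadline expressed in steady_clock's nanoseconds could
// fire up to a millisecond early. The fix keeps the native resolution.
inline nanoseconds truncated_wait(nanoseconds d) {
    return duration_cast<milliseconds>(d); // loses the sub-millisecond part
}
```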

Fixes #15902

(cherry picked from commit 8a5689e7a7)

(cherry picked from commit 1d33a68dd7)

Closes #17911

* github.com:scylladb/scylladb:
  tests: utils: error injection: print time duration instead of count
  error_injection: do not cast to milliseconds when injecting timeout
2024-03-25 17:41:23 +02:00
Pavel Emelyanov
616199f79c Update seastar submodule (duplex IO queue activation fix)
* seastar ad0f2d5d...eb093f8a (1):
  > fair_queue: Do not pop unplugged class immediately

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-03-25 12:58:24 +03:00
Wojciech Mitros
5dfb6c9ead mv: adjust the overhead estimation for view updates
In order to avoid running out of memory, we can't
underestimate the memory used when processing a view
update. Particularly, we need to handle the remote
view updates well, because we may create many of them
at the same time in contrast to local updates which
are processed synchronously.

After investigating a coredump generated in a crash
caused by running out of memory due to these remote
view updates, we found that the current estimation
is much lower than what we observed in practice; we
identified overhead of up to 2288 bytes for each
remote view update. The overhead consists of:
- 512 bytes - a write_response_handler
- less than 512 bytes - excessive memory allocation
for the mutation in bytes_ostream
- 448 bytes - the apply_to_remote_endpoints coroutine
started in mutate_MV()
- 192 bytes - a continuation to the coroutine above
- 320 bytes - the coroutine in result_parallel_for_each
started in mutate_begin()
- 112 bytes - a continuation to the coroutine above
- 192 bytes - 5 unspecified allocations of 32, 32, 32,
48 and 48 bytes

This patch changes the previous overhead estimate
of 256 bytes to 2288 bytes, which should take into
account all allocations in the current version of the
code. It's worth noting that changes in the related
pieces of code may result in a different overhead.

The allocations seem to be mostly captures for the
background tasks. Coroutines seem to allocate extra,
however testing shows that replacing a coroutine with
continuations may result in generating a few smaller
futures/continuations with a larger total size.
Besides that, considering that we're waiting for
a response for each remote view update, we need the
relatively large write_response_handler, which also
includes the mutation in case we needed to reuse it.

The change should not majorly affect workloads with many
local updates because we don't keep many of them at
the same time anyway, and an added benefit of correct
memory utilization estimation is avoiding evictions
of other memory that would be otherwise necessary
to handle the excessive memory used by view updates.

Fixes #17364

(cherry picked from commit 5ab3586135)

Closes #17858
2024-03-20 13:52:23 +02:00
Kefu Chai
6209f5d6d4 tests: utils: error injection: print time duration instead of count
instead of casting / comparing the count of the duration unit, let's just
compare the durations, so that boost.test is able to print the duration
in a more informative and user friendly way (line wrapped)

test/boost/error_injection_test.cc(167): fatal error:
    in "test_inject_future_disabled":
      critical check wait_time > sleep_msec has failed [23839ns <= 10ms]

Refs #15902
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 1d33a68dd7)
2024-03-20 09:40:16 +00:00
Kefu Chai
ac288684c6 error_injection: do not cast to milliseconds when injecting timeout
before this change, we always cast the wait duration to milliseconds,
even if it could be using a higher resolution. actually
`std::chrono::steady_clock` is using `nanosecond` for its duration,
so if we inject a deadline using `steady_clock`, we could be awakened
earlier due to the narrowing of the duration type caused by the
duration_cast.

in this change, we just use the duration as it is. this should allow
the caller to use the resolution provided by Seastar without losing
the precision.

Fixes #15902

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 8a5689e7a7)
2024-03-20 09:40:16 +00:00
Raphael S. Carvalho
fc1d126f31 replica: Fix major compaction semantics by performing off-strategy first
Major compaction semantics are that all data of a table will be compacted
together, so the user can expect e.g. a recently introduced tombstone to be
compacted with the data it shadows.
Today, it can happen that all data in the maintenance set won't be included
in major compaction until it is promoted into the main set by off-strategy.
So the user might be left wondering why major is not having the expected
effect.
To fix this, let's perform off-strategy first, so data in the maintenance
set will be made available to major. A similar approach is taken for
data in memtables, so a flush is performed before major starts.
The only exception is data in staging, which cannot be compacted
until view building is done with it, to avoid inconsistency in view
replicas.
The serialization of reshape jobs in the compaction manager guarantees
correctness if there's an ongoing off-strategy on behalf of the
table.

Fixes #11915.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#15792

(cherry picked from commit ea6c281b9f)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #17901
2024-03-20 08:48:17 +02:00
Anna Stuchlik
70edebd8d7 doc: fix the image upgrade page
This commit updates the Upgrade ScyllaDB Image page.

- It removes the incorrect information that updating underlying OS packages is mandatory.
- It adds information about the extended procedure for non-official images.

(cherry picked from commit fc90112b97)

Closes #17885
2024-03-19 16:47:06 +02:00
Petr Gusev
dffc0fb720 repair_meta: get_estimated_partitions fix
The shard_range parameter was unused.

Fixes: #17863

(cherry picked from commit b9f527bfa8)
2024-03-18 14:27:45 +02:00
Kamil Braun
0bb338c521 test: remove test_writes_to_recent_previous_cdc_generations
The test in its original form relies on the
`error_injections_at_startup` feature, which 5.2 doesn't have, so I
adapted the test to enable error injections after bootstrapping nodes in
the backport (9c44bbce67). That is however
incorrect, it's important for the injection to be enabled while the
nodes are booting, otherwise the test will be flaky, as we observed.
Details in scylladb/scylladb#17749.

Remove the test from 5.2 branch.

Fixes scylladb/scylladb#17749

Closes #17750
2024-03-15 10:22:22 +02:00
Tomasz Grabiec
cefa19eb93 Merge 'migration_manager: take group0 lock during raft snapshot taking' from Kamil Braun
This is a backport of 0c376043eb and follow-up fix 57b14580f0 to 5.2.

We haven't identified any specific issues in test or field in 5.2/2023.1 releases, but the bug should be fixed either way, it might bite us in unexpected ways.

Closes #17640

* github.com:scylladb/scylladb:
  migration_manager: only jump to shard 0 in migration_request during group 0 snapshot transfer
  raft_group0_client: assert that hold_read_apply_mutex is called on shard 0
  migration_manager: fix indentation after the previous patch.
  messaging_service: process migration_request rpc on shard 0
  migration_manager: take group0 lock during raft snapshot taking
2024-03-14 23:41:02 +01:00
Nadav Har'El
08077ff3e8 alternator, mv: fix case of two new key columns in GSI
A materialized view in CQL allows AT MOST ONE view key column that
wasn't a key column in the base table. This is because if there were
two or more of those, the "liveness" (timestamp, ttl) of these different
columns can change at every update, and it's not possible to pick what
liveness to use for the view row we create.

We made an exception for this rule for Alternator: DynamoDB's API allows
creating a GSI whose partition key and range key are both regular columns
in the base table, and we must support this. We claim that the fact that
Alternator allows neither TTL (Alternator's "TTL" is a different feature)
nor user-defined timestamps, does allow picking the liveness for the view
row we create. But we did it wrong!

We claimed in a comment - and implemented in the code before this patch -
that in Alternator we can assume that both GSI key columns will have the
*same* liveness, and in particular timestamp. But this is only true if
one modifies both columns together! In fact, in general it is not true:
We can have two non-key attributes 'a' and 'b' which are the GSI's key
columns, and we can modify *only* b, without modifying a, in which case
the timestamp of the view modification should be b's newer timestamp,
not a's older one. The existing code took a's timestamp, assuming it
will be the same as b's, which is incorrect. The result was that if
we repeatedly modify only b, all view updates will receive the same
timestamp (a's old timestamp), and a deletion will always win over
all the modifications. This patch includes a reproducing test written by
a user (@Zak-Kent) that demonstrates how after a view row is deleted
it doesn't get recreated - because all the modifications use the same
timestamp.

The fix is, as suggested above, to use the *higher* of the two
timestamps of both base-regular-column GSI key columns as the timestamp
for the new view rows or view row deletions. The reproducer that
failed before this patch passes with it. As usual, the reproducer
passes on AWS DynamoDB as well, proving that the test is correct and
should really work.

Fixes #17119

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#17172

(cherry picked from commit 21e7deafeb)
2024-03-13 15:08:32 +02:00
Kamil Braun
405387c663 test: unflake test_topology_remove_garbage_group0
The test is booting nodes, and then immediately starts shutting down
nodes and removing them from the cluster. The shutdown and
removal may happen before the driver manages to connect to all nodes in the
cluster. In particular, the driver hasn't yet connected to the last
bootstrapped node. Or it can even happen that the driver has connected,
but the control connection is established to the first node, and the
driver fetched topology from the first node when the first node didn't
yet consider the last node to be normal. So the driver decides to close
the connection to the last node like this:
```
22:34:03.159 DEBUG> [control connection] Removing host not found in
   peers metadata: <Host: 127.42.90.14:9042 datacenter1>
```

Eventually, at the end of the test, only the last node remains, all
other nodes have been removed or stopped. But the driver does not have a
connection to that last node.

Fix this problem by ensuring that:
- all nodes see each other as NORMAL,
- the driver has connected to all nodes
at the beginning of the test, before we start shutting down and removing
nodes.

Fixes scylladb/scylladb#16373

(cherry picked from commit a68701ed4f)

Closes #17703
2024-03-12 13:43:21 +01:00
Kamil Braun
b567364af1 migration_manager: only jump to shard 0 in migration_request during group 0 snapshot transfer
Jumping to shard 0 during group 0 snapshot transfer is required because
we take the group 0 lock, only available on shard 0. But outside of Raft
mode it only pessimizes performance unnecessarily, so don't do it.
2024-03-12 11:19:31 +01:00
Botond Dénes
3897d44893 repair: resolve start-up deadlock
Repairs have to obtain a permit to the reader concurrency semaphore on
each shard they have a presence on. This is prone to deadlocks:

node1                              node2
repair1_master (takes permit)      repair1_follower (waits on permit)
repair2_master (waits for permit)  repair2_follower (takes permit)

In lieu of strong central coordination, we solved this by making permits
evictable: repair2 can evict repair1's permit so it can obtain one
and make progress. This is not efficient, as evicting a permit usually
means discarding already done work, but it prevents the deadlocks.
We recently discovered that there is a window when deadlocks can still
happen. The permit is made evictable when the disk reader is created.
This reader is an evictable one, which effectively makes the permit
evictable. But the permit is obtained when the repair control
structure -- repair meta -- is created. Between creating the repair meta
and reading the first row from disk, the deadlock is still possible. And
we know that what is possible will happen (and did happen). Fix by
making the permit evictable as soon as the repair meta is created. This
is very clunky and we should have a better API for this (refs #17644),
but for now we go with this simple patch, to make it easy to backport.

Refs: #17644
Fixes: #17591

Closes #17646

(cherry picked from commit c6e108a)

Backport notes:

The fix above does not apply to 5.2, because on 5.2 the reader is
created immediately when the repair-meta is created. So we don't need
the game with a fake inactive read, we can just pause the already
created reader in the repair-reader constructor.

Closes #17730
2024-03-12 08:24:26 +02:00
Michał Chojnowski
54048e5613 sstables: fix a use-after-free in key_view::explode()
key_view::explode() contains a blatant use-after-free:
unless the input is already linearized, it returns a view to a local temporary buffer.

This is rare, because partition keys are usually not large enough to be fragmented.
But for a sufficiently large key, this bug causes a corrupted partition_key down
the line.

Fixes #17625

(cherry picked from commit 7a7b8972e5)

Closes #17725
2024-03-11 16:17:32 +02:00
Lakshmi Narayanan Sreethar
e8736ae431 reader_permit: store schema_ptr instead of raw schema pointer
Store schema_ptr in reader permit instead of storing a const pointer to
schema to ensure that the schema doesn't get changed elsewhere when the
permit is holding on to it. Also update the constructors and all the
relevant callers to pass down schema_ptr instead of a raw pointer.

Fixes #16180

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#16658

(cherry picked from commit 76f0d5e35b)

Closes #17694
2024-03-08 10:56:12 +02:00
Gleb Natapov
42d25f1911 raft_group0_client: assert that hold_read_apply_mutex is called on shard 0
group0 operations are valid on shard 0 only. Assert that.

(cherry picked from commit 9847e272f9)
2024-03-05 16:51:13 +01:00
Gleb Natapov
619f75d1de migration_manager: fix indentation after the previous patch.
(cherry picked from commit 77907b97f1)
2024-03-05 16:50:24 +01:00
Gleb Natapov
6dd31dcade messaging_service: process migration_request rpc on shard 0
Commit 0c376043eb added access to the group0
semaphore, which can be done on shard0 only. Unlike all other group0 rpcs
(which are already always forwarded to shard0), migration_request is not,
since it is an rpc that was reused from non-raft days. The patch adds
the missing jump to shard0 before executing the rpc.

(cherry picked from commit 4a3c79625f)
2024-03-05 16:49:23 +01:00
Gleb Natapov
dd65bf151b migration_manager: take group0 lock during raft snapshot taking
Group0 state machine access atomicity is guaranteed by a mutex in the group0
client. Code that reads or writes the state needs to hold the lock. To
transfer the schema part of the snapshot we used the existing "migration
request" verb, which did not follow the rule. Fix the code to take the group0
lock before accessing the schema in case the verb is called as part of a
group0 snapshot transfer.

Fixes scylladb/scylladb#16821

(cherry picked from commit 0c376043eb)

Backport note: introduced missing
`raft_group0_client::hold_read_apply_mutex`
2024-03-05 16:40:00 +01:00
Yaron Kaikov
f4a7804596 release: prepare for 5.2.16 2024-03-03 14:33:44 +02:00
Botond Dénes
ba373f83e4 Merge '[Backport 5.2] repair: streaming: handle no_such_column_family from remote node' from Aleksandra Martyniuk
RPC calls lose information about the type of returned exception.
Thus, if a table is dropped on receiver node, but it still exists
on a sender node and sender node streams the table's data, then
the whole operation fails.

To prevent that, add a method which synchronizes schema and then
checks, if the exception was caused by table drop. If so,
the exception is swallowed.

Use the method in streaming and repair to continue them when
the table is dropped in the meantime.

Fixes: https://github.com/scylladb/scylladb/issues/17028.
Fixes: https://github.com/scylladb/scylladb/issues/15370.
Fixes: https://github.com/scylladb/scylladb/issues/15598.

Closes #17528

* github.com:scylladb/scylladb:
  repair: handle no_such_column_family from remote node gracefully
  test: test drop table on receiver side during streaming
  streaming: fix indentation
  streaming: handle no_such_column_family from remote node gracefully
  repair: add methods to skip dropped table
2024-02-28 16:33:01 +02:00
Kamil Braun
d82c757323 Merge 'misc_services: fix data race from bad usage of get_next_version' from Piotr Dulikowski
The function `gms::version_generator::get_next_version()` can only be called from shard 0 as it uses a global, unsynchronized counter to issue versions. Notably, the function is used as a default argument for the constructor of `gms::versioned_value` which is used from shorthand constructors such as `versioned_value::cache_hitrates`, `versioned_value::schema` etc.

The `cache_hitrate_calculator` service runs a periodic job which updates the `CACHE_HITRATES` application state in the local gossiper state. Each time the job is scheduled, it runs on the next shard (it goes through shards in a round-robin fashion). The job uses the `versioned_value::cache_hitrates` shorthand to create a `versioned_value`, therefore risking a data race if it is not currently executing on shard 0.

The PR fixes the race by moving the call to `versioned_value::cache_hitrates` to shard 0. Additionally, in order to help detect similar issues in the future, a check is introduced to `get_next_version` which aborts the process if the function was called on a shard other than 0.

There is a possibility that it is a fix for #17493. Because `get_next_version` uses a simple incrementation to advance the global counter, a data race can occur if two shards call it concurrently and it may result in shard 0 returning the same or smaller value when called two times in a row. The following sequence of events is suspected to occur on node A:

1. Shard 1 calls `get_next_version()`, loads version `v - 1` from the global counter and stores in a register; the thread then is preempted,
2. Shard 0 executes `add_local_application_state()` which internally calls `get_next_version()`, loads `v - 1` then stores `v` and uses version `v` to update the application state,
3. Shard 0 executes `add_local_application_state()` again, increments version to `v + 1` and uses it to update the application state,
4. Gossip message handler runs, exchanging application states with node B. It sends its application state to B. Note that the max version of any of the local application states is `v + 1`,
5. Shard 1 resumes and stores version `v` in the global counter,
6. Shard 0 executes `add_local_application_state()` and updates the application state - again - with version `v + 1`.
7. After that, node B will never learn about the application state introduced in point 6. as gossip exchange only sends endpoint states with version larger than the previous observed max version, which was `v + 1` in point 4.

Note that the above scenario was _not_ reproduced. However, I managed to observe a race condition by:

1. modifying Scylla to run update of `CACHE_HITRATES` much more frequently than usual,
2. putting an assertion in `add_local_application_state` which fails if the version returned by `get_next_version` was not larger than the previous returned value,
3. running a test which performs schema changes in a loop.

The assertion from the second point was triggered. While it's hard to tell how likely it is to occur without making updates of cache hitrates more frequent - not to mention the full theorized scenario - for now this is the best lead that we have, and the data race being fixed here is a real bug anyway.

Refs: #17493

Closes scylladb/scylladb#17499

* github.com:scylladb/scylladb:
  version_generator: check that get_next_version is called on shard 0
  misc_services: fix data race from bad usage of get_next_version

(cherry picked from commit fd32e2ee10)
2024-02-28 14:28:03 +01:00
Aleksandra Martyniuk
78aeb990a6 repair: handle no_such_column_family from remote node gracefully
If no_such_column_family is thrown on a remote node, then the repair
operation fails as the type of the exception cannot be determined.

Use repair::with_table_drop_silenced in repair to continue operation
if a table was dropped.

(cherry picked from commit cf36015591)
2024-02-28 11:46:02 +01:00
Aleksandra Martyniuk
23493bb342 test: test drop table on receiver side during streaming
(cherry picked from commit 2ea5d9b623)
2024-02-28 11:46:02 +01:00
Aleksandra Martyniuk
d19afd7059 streaming: fix indentation
(cherry picked from commit b08f539427)
2024-02-28 11:46:02 +01:00
Aleksandra Martyniuk
4e200aa250 streaming: handle no_such_column_family from remote node gracefully
If no_such_column_family is thrown on a remote node, then the streaming
operation fails as the type of the exception cannot be determined.

Use repair::with_table_drop_silenced in streaming to continue
operation if a table was dropped.

(cherry picked from commit 219e1eda09)
2024-02-28 11:46:02 +01:00
Aleksandra Martyniuk
afca1142cd repair: add methods to skip dropped table
Schema propagation is async so one node can see the table while on
the other node it is already dropped. So, if the nodes stream
the table data, the latter node throws no_such_column_family.
The exception is propagated to the other node, but its type is lost,
so the operation fails on the other node.

Add a method which waits until all raft changes are applied and then
checks whether a given table exists.

Add a function which uses the above to determine whether an operation
failed because of a dropped table (e.g. on the remote node, where the exact
exception type is unknown). If so, the exception isn't rethrown.

(cherry picked from commit 5202bb9d3c)
2024-02-28 11:45:54 +01:00
Botond Dénes
ce1a422c9c Merge '[Backport 5.2] sstables: close index_reader in has_partition_key' from Aleksandra Martyniuk
If index_reader isn't closed before it is destroyed, then ongoing
sstables reads won't be awaited and assertion will be triggered.

Close index_reader in has_partition_key before destroying it.

Fixes: https://github.com/scylladb/scylladb/issues/17232.

Closes #17532

* github.com:scylladb/scylladb:
  test: add test to check if reader is closed
  sstables: close index_reader in has_partition_key
2024-02-27 16:12:17 +02:00
Aleksandra Martyniuk
296be93714 test: add test to check if reader is closed
Add test to check if reader is closed in sstable::has_partition_key.

(cherry picked from commit 4530be9e5b)
2024-02-26 16:17:12 +01:00
Aleksandra Martyniuk
6feb802d54 sstables: close index_reader in has_partition_key
If index_reader isn't closed before it is destroyed, then ongoing
sstables reads won't be awaited and an assertion will be triggered.

Close index_reader in has_partition_key before destroying it.

(cherry picked from commit 5227336a32)
2024-02-26 16:17:12 +01:00
Avi Kivity
9c44bbce67 Merge 'cdc: metadata: allow sending writes to the previous generations' from Patryk Jędrzejczak
Before this PR, writes to the previous CDC generations would
always be rejected. After this PR, they will be accepted if the
write's timestamp is greater than `now - generation_leeway`.

This change was proposed around 3 years ago. The motivation was
to improve user experience. If a client generates timestamps by
itself and its clock is desynchronized with the clock of the node
the client is connected to, there could be a period during
generation switching when writes fail. We didn't consider this
problem critical because the client could simply retry a failed
write with a higher timestamp. Eventually, it would succeed. This
approach is safe because these failed writes cannot have any side
effects. However, it can be inconvenient. Writing to previous
generations was proposed to improve it.

The idea was rejected 3 years ago. Recently, it turned out that
there is a case when the client cannot retry a write with the
increased timestamp. It happens when a table uses CDC and LWT,
which makes timestamps permanent. Once Paxos commits an entry
with a given timestamp, Scylla will keep trying to apply that entry
until it succeeds, with the same timestamp. Applying the entry
involves writing to the CDC log table. If it fails, we get stuck.
It's a major bug with an unknown perfect solution.

Allowing writes to previous generations for `generation_leeway` is
a probabilistic fix that should solve the problem in practice.

Apart from this change, this PR adds tests for it and updates
the documentation.

This PR is sufficient to enable writes to the previous generations
only in the gossiper-based topology. The Raft-based topology
needs some adjustments in loading and cleaning CDC generations.
These changes won't interfere with the changes introduced in this
PR, so they are left for a follow-up.

Fixes scylladb/scylladb#7251
Fixes scylladb/scylladb#15260

Closes scylladb/scylladb#17134

* github.com:scylladb/scylladb:
  docs: using-scylla: cdc: remove info about failing writes to old generations
  docs: dev: cdc: document writing to previous CDC generations
  test: add test_writes_to_previous_cdc_generations
  cdc: generation: allow increasing generation_leeway through error injection
  cdc: metadata: allow sending writes to the previous generations

(cherry picked from commit 9bb4482ad0)

Backport note: replaced `servers_add` with `server_add` loop in tests
replaced `error_injections_at_startup` (not implemented in 5.2) with
`enable_injection` post-boot
2024-02-22 15:05:19 +01:00
Nadav Har'El
6a6115cd86 mv: fix missing view deletions in some cases of range tombstones
For efficiency, if a base-table update generates many view updates that
go the same partition, they are collected as one mutation. If this
mutation grows too big it can lead to memory exhaustion, so since
commit 7d214800d0 we split the output
mutation to mutations no longer than 100 rows (max_rows_for_view_updates)
each.

This patch fixes a bug where this split was done incorrectly when
the update involved range tombstones, a bug which was discovered by
a user in a real use case (#17117).

Range tombstones are read in two parts, a beginning and an end, and the
code could split the processing between these two parts, with the result
that some of the range tombstones in the update could be missed - and the
view could miss some deletions that happened in the base table.

This patch fixes the code in two places to avoid breaking up the
processing between range tombstones:

1. The counter "_op_count" that decides where to break the output mutation
   should only be incremented when adding rows to this output mutation.
   The existing code strangely incremented it on every read (!?) which
   resulted in the counter being incremented on every *input* fragment,
   and in particular could reach the limit 100 between two range
   tombstone pieces.

2. Moreover, the length of output was checked in the wrong place...
   The existing code could get to 100 rows, not check at that point,
   read the next input - half a range tombstone - and only *then*
   check that we reached 100 rows and stop. The fix is to calculate
   the number of rows in the right place - exactly when it's needed,
   not before the step.

The first change needs more justification: The old code, that incremented
_op_count on every input fragment and not just output fragments did not
fit the stated goal of its introduction - to avoid large allocations.
In one test it resulted in breaking up the output mutation to chunks of
25 rows instead of the intended 100 rows. But, maybe there was another
goal, to stop the iteration after 100 *input* rows and avoid the possibility
of stalls if there are no output rows? It turns out the answer is no -
we don't need this _op_count increment to avoid stalls: The function
build_some() uses `co_await on_results()` to run one step of processing
one input fragment - and `co_await` always checks for preemption.
I verified that indeed no stalls happen by using the existing test
test_long_skipped_view_update_delete_with_timestamp. It generates a
very long base update where all the view updates go to the same partition,
but all but the last few updates don't generate any view updates.
I confirmed that the fixed code loops over all these input rows without
increasing _op_count and without generating any view update yet, but it
does NOT stall.

This patch also includes two tests reproducing this bug and confirming
it's fixed, and also two additional tests for breaking up long deletions
that I wanted to make sure don't fail after this patch (they don't).

By the way, this fix would have also fixed issue #12297 - which we
fixed a year ago in a different way. That issue happened when the code
went through 100 input rows without generating *any* output rows,
and incorrectly concluding that there's no view update to send.
With this fix, the code no longer stops generating the view
update just because it saw 100 input rows - it would have waited
until it generated 100 output rows in the view update (or the
input is really done).

Fixes #17117

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#17164

(cherry picked from commit 14315fcbc3)
2024-02-22 15:36:58 +02:00
Avi Kivity
e0e46fbc50 Regenerate frozen toolchain
For gnutls 3.8.3.

Since Fedora 37 is end-of-life, pick the package from Fedora 38. libunistring
needs to be updated to satisfy the dependency solver.

Fixes #17285.

Closes scylladb/scylladb#17287

Signed-off-by: Avi Kivity <avi@scylladb.com>

Closes #17411
2024-02-20 12:34:46 +02:00
Wojciech Mitros
27ab3b1744 rust: update dependencies
The currently used version of the "rustix" dependency had a minor
security vulnerability. This patch updates the corresponding
crate.
The update was performed using "cargo update" on the "rustix"
package, bumping it to version "0.36.17".

Refs #15772

Closes #17408
2024-02-19 22:12:50 +02:00
Michał Jadwiszczak
0d22471222 schema::describe: print 'synchronous_updates' only if it was specified
While describing materialized view, print `synchronous_updates` option
only if the tag is present in schema's extensions map. Previously if the
key wasn't present, the default (false) value was printed.

Fixes: #14924

Closes #14928

(cherry picked from commit b92d47362f)
2024-02-19 09:10:34 +02:00
Botond Dénes
422a731e85 query: do not kill unpaged queries when they reach the tombstone-limit
The reason we introduced the tombstone-limit
(query_tombstone_page_limit), was to allow paged queries to return
incomplete/empty pages in the face of large tombstone spans. This works
by cutting the page after the tombstone-limit amount of tombstones were
processed. If the read is unpaged, it is killed instead. This was a
mistake. First, it doesn't really make sense, the reason we introduced
the tombstone limit, was to allow paged queries to process large
tombstone-spans without timing out. It does not help unpaged queries.
Furthermore, the tombstone-limit can kill internal queries done on
behalf of user queries, because all our internal queries are unpaged.
This can cause denial of service.

So in this patch we disable the tombstone-limit for unpaged queries
altogether, they are allowed to continue even after having processed the
configured limit of tombstones.

Fixes: #17241

Closes scylladb/scylladb#17242

(cherry picked from commit f068d1a6fa)
2024-02-15 12:50:30 +02:00
Yaron Kaikov
1fa8327504 release: prepare for 5.2.15 2024-02-11 14:17:31 +02:00
Pavel Emelyanov
f3c215aaa1 Update seastar submodule
* seastar 29badd99...ad0f2d5d (1):
  > Merge "Slowdown IO scheduler based on dispatched/completed ratio" into branch-5.2

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-02-09 12:22:58 +03:00
Botond Dénes
94af1df2cf Merge 'Fix mintimeuuid() call that could crash Scylla' from Nadav Har'El
This PR fixes a bug where certain calls to the `mintimeuuid()` CQL function with large negative timestamps could crash Scylla. It turns out we already had protections in place against very large positive timestamps, but very negative timestamps could still cause bugs.

The actual fix in this series is just a few lines, but the bigger effort was improving the test coverage in this area. I added tests for the "date" type (the original reproducer for this bug used totimestamp() which takes a date parameter), and also reproducers for this bug directly, without totimestamp() function, and one with that function.

Finally, this PR also replaces the assert(), which made this molehill of a bug into a mountain, with a throw.

Fixes #17035

Closes scylladb/scylladb#17073

* github.com:scylladb/scylladb:
  utils: replace assert() by on_internal_error()
  utils: add on_internal_error with common logger
  utils: add a timeuuid minimum, like we had maximum
  test/cql-pytest: tests for "date" type

(cherry picked from commit 2a4b991772)
2024-02-07 14:19:32 +02:00
Botond Dénes
9291eafd4a Merge '[Backport 5.2] Raft snapshot fixes' from Kamil Braun
Backports required to fix scylladb/scylladb#16683 in 5.2:
- when creating first group 0 server, create a snapshot with non-empty ID, and start it at index 1 instead of 0 to force snapshot transfer to servers that join group 0
- add an API to trigger Raft snapshot
- use the API when we restart and see that the existing snapshot is at index 0, to trigger a new one --- in order to fix broken deployments that already bootstrapped with index-0 snapshot.

Closes #17087

* github.com:scylladb/scylladb:
  test_raft_snapshot_request: fix flakiness (again)
  test_raft_snapshot_request: fix flakiness
  Merge 'raft_group0: trigger snapshot if existing snapshot index is 0' from Kamil Braun
  Merge 'Add an API to trigger snapshot in Raft servers' from Kamil Braun
  raft: server: add workaround for scylladb/scylladb#12972
  raft: Store snapshot update and truncate log atomically
  service: raft: force initial snapshot transfer in new cluster
  raft_sys_table_storage: give initial snapshot a non zero value
2024-02-07 11:55:20 +02:00
Michał Chojnowski
4546d0789f row_cache: update _prev_snapshot_pos even if apply_to_incomplete() is preempted
Commit e81fc1f095 accidentally broke the control
flow of row_cache::do_update().

Before that commit, the body of the loop was wrapped in a lambda.
Thus, to break out of the loop, `return` was used.

The bad commit removed the lambda, but didn't update the `return` accordingly.
Thus, since the commit, the statement doesn't just break out of the loop as
intended, but also skips the code after the loop, which updates `_prev_snapshot_pos`
to reflect the work done by the loop.

As a result, whenever `apply_to_incomplete()` (the `updater`) is preempted,
`do_update()` fails to update `_prev_snapshot_pos`. It remains in a
stale state, until `do_update()` runs again and either finishes or
is preempted outside of `updater`.

If we read a partition processed by `do_update()` but not covered by
`_prev_snapshot_pos`, we will read stale data (from the previous snapshot),
which will be remembered in the cache as the current data.

This results in outdated data being returned by the replica.
(And perhaps in something worse if range tombstones are involved.
I didn't investigate this possibility in depth).

Note: for queries with CL>1, occurrences of this bug are likely to be hidden
by reconciliation, because the reconciled query will only see stale data if
the queried partition is affected by the bug on *all* queried replicas
at the time of the query.

Fixes #16759

Closes scylladb/scylladb#17138

(cherry picked from commit ed98102c45)
2024-02-04 14:46:57 +02:00
Kamil Braun
4e257c5c74 test_raft_snapshot_request: fix flakiness (again)
At the end of the test, we wait until a restarted node receives a
snapshot from the leader, and then verify that the log has been
truncated.

To check the snapshot, the test used the `system.raft_snapshots` table,
while the log is stored in `system.raft`.

Unfortunately, the two tables are not updated atomically when Raft
persists a snapshot (scylladb/scylladb#9603). We first update
`system.raft_snapshots`, then `system.raft` (see
`raft_sys_table_storage::store_snapshot_descriptor`). So after the wait
finishes, there's no guarantee the log has been truncated yet -- there's
a race between the test's last check and Scylla doing that last delete.

But we can check the snapshot using `system.raft` instead of
`system.raft_snapshots`, as `system.raft` has the latest ID. And since
1640f83fdc, storing that ID and truncating
the log in `system.raft` happens atomically.

Closes scylladb/scylladb#17106

(cherry picked from commit c911bf1a33)
2024-02-02 11:31:19 +01:00
Kamil Braun
08021dc906 test_raft_snapshot_request: fix flakiness
Add workaround for scylladb/python-driver#295.

Also, an assert made at the end of the test was wrong; it is fixed, with
an appropriate comment added.

(cherry picked from commit 74bf60a8ca)
2024-02-02 11:31:19 +01:00
Botond Dénes
db586145aa Merge 'raft_group0: trigger snapshot if existing snapshot index is 0' from Kamil Braun
The persisted snapshot index may be 0 if the snapshot was created in
older version of Scylla, which means snapshot transfer won't be
triggered to a bootstrapping node. Commands present in the log may not
cover all schema changes --- group 0 might have been created through the
upgrade procedure, on a cluster with existing schema. So a
deployment with index=0 snapshot is broken and we need to fix it. We can
use the new `raft::server::trigger_snapshot` API for that.

Also add a test.

Fixes scylladb/scylladb#16683

Closes scylladb/scylladb#17072

* github.com:scylladb/scylladb:
  test: add test for fixing a broken group 0 snapshot
  raft_group0: trigger snapshot if existing snapshot index is 0

(cherry picked from commit 181f68f248)

Backport note: test_raft_fix_broken_snapshot had to be removed because
the "error injections enabled at startup" feature does not yet exist in
5.2.
2024-02-01 15:39:14 +01:00
Botond Dénes
ce0ed29ad6 Merge 'Add an API to trigger snapshot in Raft servers' from Kamil Braun
This allows the user of `raft::server` to cause it to create a snapshot
and truncate the Raft log (leaving no trailing entries; in the future we
may extend the API to specify number of trailing entries left if
needed). In a later commit we'll add a REST endpoint to Scylla to
trigger group 0 snapshots.

One use case for this API is to create group 0 snapshots in Scylla
deployments which upgraded to Raft in version 5.2 and started with an
empty Raft log with no snapshot at the beginning. This causes problems,
e.g. when a new node bootstraps to the cluster, it will not receive a
snapshot that would contain both schema and group 0 history, which would
then lead to inconsistent schema state and trigger assertion failures as
observed in scylladb/scylladb#16683.

In 5.4 the logic of initial group 0 setup was changed to start the Raft
log with a snapshot at index 1 (ff386e7a44)
but a problem remains with these existing deployments coming from 5.2,
we need a way to trigger a snapshot in them (other than performing 1000
arbitrary schema changes).

Another potential use case in the future would be to trigger snapshots
based on external memory pressure in tablet Raft groups (for strongly
consistent tables).

The PR adds the API to `raft::server` and a HTTP endpoint that uses it.

In a follow-up PR, we plan to modify the group 0 server startup logic to
automatically call this API if it sees that no snapshot is present yet (to
automatically fix the aforementioned 5.2 deployments once they upgrade).

Closes scylladb/scylladb#16816

* github.com:scylladb/scylladb:
  raft: remove `empty()` from `fsm_output`
  test: add test for manual triggering of Raft snapshots
  api: add HTTP endpoint to trigger Raft snapshots
  raft: server: add `trigger_snapshot` API
  raft: server: track last persisted snapshot descriptor index
  raft: server: framework for handling server requests
  raft: server: inline `poll_fsm_output`
  raft: server: fix indentation
  raft: server: move `io_fiber`'s processing of `batch` to a separate function
  raft: move `poll_output()` from `fsm` to `server`
  raft: move `_sm_events` from `fsm` to `server`
  raft: fsm: remove constructor used only in tests
  raft: fsm: move trace message from `poll_output` to `has_output`
  raft: fsm: extract `has_output()`
  raft: pass `max_trailing_entries` through `fsm_output` to `store_snapshot_descriptor`
  raft: server: pass `*_aborted` to `set_exception` call

(cherry picked from commit d202d32f81)

Backport notes:
- `has_output()` has a smaller condition in the backported version
  (because the condition was smaller in `poll_output()`)
- `process_fsm_output` has a smaller body (because `io_fiber` had a
  smaller body) in the backported version
- the HTTP API is only started if `raft_group_registry` is started
2024-02-01 15:38:51 +01:00
Kamil Braun
cbe8e05ef6 raft: server: add workaround for scylladb/scylladb#12972
When a node joins the cluster, it closes connections after learning
topology information from other nodes, in order to reopen them with
correct encryption, compression etc.

In ScyllaDB 5.2, this mechanism may interrupt an ongoing Raft snapshot
transfer. This was fixed in later versions by putting some order into
the bootstrap process with 50e8ec77c6 but
the fix was not backported due to many prerequisites and complexity.

Raft automatically recovers from interrupted snapshot transfer by
retrying it eventually, and everything works. However, an ERROR is
reported due to that one failed snapshot transfer, and dtests don't like
ERRORs -- they report the test case as failed if an ERROR happened in
any node's logs, even if the test passed otherwise.

Here we apply a simple workaround to please dtests -- in this particular
scenario, turn the ERROR into a WARN.
2024-02-01 14:29:56 +01:00
Michael Huang
84004ab83c raft: Store snapshot update and truncate log atomically
If the snapshot update fails, we don't truncate the commit log.

Fixes scylladb/scylladb#9603

Closes scylladb/scylladb#15540

(cherry picked from commit 1640f83fdc)
2024-02-01 13:10:05 +01:00
Kamil Braun
753e2d3c57 service: raft: force initial snapshot transfer in new cluster
When we upgrade a cluster to use Raft, or perform manual Raft recovery
procedure (which also creates a fresh group 0 cluster, using the same
algorithm as during upgrade), we start with a non-empty group 0 state
machine; in particular, the schema tables are non-empty.

In this case we need to ensure that nodes which join group 0 receive the
group 0 state. Right now this is not the case. In previous releases,
where group 0 consisted only of schema, and schema pulls were also done
outside Raft, those nodes received schema through this outside
mechanism. In 91f609d065 we disabled
schema pulls outside Raft; we're also extending group 0 with other
things, like topology-specific state.

To solve this, we force snapshot transfers by setting the initial
snapshot index on the first group 0 server to `1` instead of `0`. During
replication, Raft will see that the joining servers are behind,
triggering snapshot transfer and forcing them to pull group 0 state.

It's unnecessary to do this for a cluster which bootstraps with Raft
enabled right away, but it also doesn't hurt, so we keep the logic simple
and don't introduce branches based on that.

Extend Raft upgrade tests with a node bootstrap step at the end to
prevent regressions (without this patch, the step would hang - node
would never join, waiting for schema).

Fixes: #14066

Closes #14336

(cherry picked from commit ff386e7a44)

Backport note: contrary to the claims above, it turns out that it is
actually necessary to create snapshots in clusters which bootstrap with
Raft, because tombstones in the current schema state expire, hence
applying schema mutations from old Raft log entries is not really
idempotent. Snapshot transfer, which transfers group 0 history and
state_ids, prevents old entries from applying schema mutations over the
latest schema state.

Ref: scylladb/scylladb#16683
2024-01-31 17:00:10 +01:00
Gleb Natapov
42cf25bcbb raft_sys_table_storage: give initial snapshot a non zero value
We create a snapshot (config only, but still), but do not assign it any
id. Because of that, it is not loaded on start. We do want it to be
loaded though, since the state of group 0 will not be re-created from
the log on restart, because the entries will have an outdated id and
will be skipped. As a result, the in-memory state machine state will not
be restored. This is not a problem now, since the schema state is
restored outside of the raft code.

Message-Id: <20230316112801.1004602-5-gleb@scylladb.com>
(cherry picked from commit a690070722)
2024-01-31 16:50:42 +01:00
Aleksandra Martyniuk
f85375ff99 api: ignore future in task_manager_json::wait_task
Before returning a task's status, wait_task waits for it to finish
with the done() method and calls get() on the resulting future.

If the requested task fails, an exception will be thrown and the user
will get an internal server error instead of the failed task's status.

The result of the done() method is now ignored.

Fixes: #14914.
(cherry picked from commit ae67f5d47e)

Closes #16438
2024-01-30 10:54:33 +02:00
Aleksandra Martyniuk
35a0a459db compaction: ignore future explicitly
discard_result ignores only successful futures. Thus, if the
perform_compaction<regular_compaction_task_executor> call fails,
the failure is considered abandoned, causing tests to fail.

Explicitly ignore the failed future.

Fixes: #14971.

Closes #15000

(cherry picked from commit 7a28cc60ec)

Closes #16441
2024-01-30 10:53:09 +02:00
Kamil Braun
784695e3ac system_keyspace: use system memory for system.raft table
`system.raft` was using the "user memory pool", i.e. the
`dirty_memory_manager` for this table was set to
`database::_dirty_memory_manager` (instead of
`database::_system_dirty_memory_manager`).

This meant that if a write workload caused memory pressure on the user
memory pool, internal `system.raft` writes would have to wait for
memtables of user tables to get flushed before the write would proceed.

This was observed in SCT longevity tests which ran a heavy workload on
the cluster and concurrently, schema changes (which underneath use the
`system.raft` table). Raft would often get stuck waiting many seconds
for user memtables to get flushed. More details in issue #15622.
Experiments showed that moving Raft to system memory fixed this
particular issue, bringing the waits to reasonable levels.

Currently `system.raft` stores only one group, group 0, which is
internally used for cluster metadata operations (schema and topology
changes) -- so it makes sense to use system memory.

In the future we'd like to have other groups, for strongly consistent
tables. These groups should use the user memory pool. It means we won't
be able to use `system.raft` for them -- we'll just have to use a
separate table.

Fixes: scylladb/scylladb#15622

Closes scylladb/scylladb#15972

(cherry picked from commit f094e23d84)
2024-01-25 17:59:49 +01:00
Avi Kivity
351d6d6531 Merge 'Invalidate prepared statements for views when their schema changes.' from Eliran Sinvani
When a base table is altered, so are the views that might refer to the
added column (which includes "SELECT *" views, and also views that might
need to use this column for row lifetime via virtual columns).
However, the query processor implementation for view change notification
was an empty function.
Since views are tables, the query processor needs to at least treat them
as such (and maybe, in the future, also do some MV-specific stuff).
This commit adds a call to `on_update_column_family` from within
`on_update_view`.
The side effect, true to this date, is that prepared statements for views
which changed due to a base table change will be invalidated.

Fixes https://github.com/scylladb/scylladb/issues/16392

This series also adds a test which fails without this fix and passes when the fix is applied.

Closes scylladb/scylladb#16897

* github.com:scylladb/scylladb:
  Add test for mv prepared statements invalidation on base alter
  query processor: treat view changes at least as table changes

(cherry picked from commit 5810396ba1)
2024-01-23 21:31:47 +02:00
Takuya ASADA
5a05ccc2f8 scylla_raid_setup: fall back to other paths when UUID not available
On some environments, such as VMware instances, /dev/disk/by-uuid/<UUID> is
not available, and scylla_raid_setup will fail while mounting the volume.

To avoid failing to mount /dev/disk/by-uuid/<UUID>, fetch all available
paths to mount the disk and fall back to other paths like by-partuuid,
by-id, by-path, or just use the real device path like /dev/md0.

To get the device path, and also to dump the device status when the UUID
is not available, this introduces a UdevInfo class which communicates
with udev using pyudev.

Related #11359

Closes scylladb/scylladb#13803

(cherry picked from commit 58d94a54a3)

[syuu: regenerate tools/toolchain/image for new python3-pyudev package]

Closes #16938
2024-01-23 16:05:28 +02:00
Botond Dénes
a1603bcb40 readers/multishard: evictable_reader::fast_forward_to(): close reader on exception
When the reader is currently paused, it is resumed, fast-forwarded, then
paused again. The fast-forwarding part can throw, and this will lead to
destroying the reader without it being closed first.
Add a try-catch surrounding this part of the code. Also mark
`maybe_pause()` and `do_pause()` as noexcept, to make it clear why
that part doesn't need to be in the try-catch.

Fixes: #16606

Closes scylladb/scylladb#16630

(cherry picked from commit 204d3284fa)
2024-01-16 16:57:28 +02:00
Michał Jadwiszczak
29da20b9e0 schema: add scylla specific options to schema description
Add `paxos_grace_seconds`, `tombstone_gc`, `cdc` and `synchronous_updates`
options to schema description.

Fixes: #12389
Fixes: scylladb/scylla-enterprise#2979

Closes #16786
2024-01-16 09:56:08 +02:00
Botond Dénes
7c4ec8cf4b Update tools/java submodule
* tools/java 843096943e...a1eed2f381 (1):
  > Update JNA dependency to 5.14.0

Fixes: https://github.com/scylladb/scylla-tools-java/issues/371
2024-01-15 15:51:32 +02:00
Aleksandra Martyniuk
5def443cf0 tasks: keep task's children in list
If a std::vector is resized, its iterators and references may
get invalidated. While task_manager::task::impl::_children's
iterators are avoided throughout the code, references to its
elements are being used.

Since the children vector does not need random access to its
elements, change its type to std::list<foreign_task_ptr>, whose
iterators and references aren't invalidated on element insertion.

Fixes: #16380.

Closes scylladb/scylladb#16381

(cherry picked from commit 9b9ea1193c)

Closes #16777
2024-01-15 15:38:00 +02:00
Anna Mikhlin
c0604a31fa release: prepare for 5.2.14 2024-01-14 16:34:38 +02:00
Pavel Emelyanov
96bb602c62 Update seastar submodule (token bucket duration underflow)
* seastar 43a1ce58...29badd99 (1):
  > shared_token_bucket: Fix duration_for() underflow

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-01-12 15:15:56 +03:00
Botond Dénes
d96440e8b6 Merge '[Backport 5.2] Validate compaction strategy options in prepare' from Aleksandra Martyniuk
Table properties validation is performed on statement execution.
Thus, when one attempts to create a table with invalid options,
an incorrect command gets committed in Raft. But then its
application fails, leading to the Raft state machine being stopped.

Check table properties when create and alter statements are prepared.

Fixes: https://github.com/scylladb/scylladb/issues/14710.

Closes #16750

* github.com:scylladb/scylladb:
  cql3: statements: delete execute override
  cql3: statements: call check_restricted_table_properties in prepare
  cql3: statements: pass data_dictionary::database to check_restricted_table_properties
2024-01-12 10:56:54 +02:00
Aleksandra Martyniuk
ea41a811d6 cql3: statements: delete execute override
Delete the overridden create_table_statement::execute, as it only calls its
direct parent's (schema_altering_statement) execute method anyway.

(cherry picked from commit 6c7eb7096e)
2024-01-11 16:43:17 +01:00
Aleksandra Martyniuk
8b77fbc904 cql3: statements: call check_restricted_table_properties in prepare
Table properties validation is performed on statement execution.
Thus, when one attempts to create a table with invalid options,
an incorrect command gets committed in Raft. But then its
application fails, leading to the Raft state machine being stopped.

Check table properties when create and alter statements are prepared.

The error is no longer returned as an exceptional future, but it
is thrown. Adjust the tests accordingly.

(cherry picked from commit 60fdc44bce)
2024-01-11 16:10:26 +01:00
Aleksandra Martyniuk
3ab3a2cc1b cql3: statements: pass data_dictionary::database to check_restricted_table_properties
Pass data_dictionary::database to check_restricted_table_properties
as an argument instead of query_processor, as the method will be called
from a context which does not have access to the query processor.

(cherry picked from commit ec98b182c8)
2024-01-11 16:10:26 +01:00
Botond Dénes
7e9107cc97 Update tools/java submodule
* tools/java 79fa02d8a3...843096943e (1):
  > build.xml: update io.airlift to 0.9

Fixes: scylladb/scylla-tools-java#374
2024-01-11 11:03:29 +02:00
Botond Dénes
abb7ae4309 Update ./tools/jmx submodule
* tools/jmx f21550e...50909d6 (1):
  > scylla-apiclient: drop hk2-locator dependency

Fixes: scylladb/scylla-jmx#231
2024-01-10 14:22:14 +02:00
Botond Dénes
2820c63734 Update tools/java submodule
* tools/java d7ec9bf45f...79fa02d8a3 (2):
  > build.xml: update scylla-driver-core to 3.11.5.1
  > treewide: update "guava" package

Fixes: scylla-tools-java#365
Fixes: scylla-tools-java#343

Closes #16693
2024-01-10 08:19:43 +02:00
Nadav Har'El
ac0056f4bc Merge 'Fix partition estimation with TWCS tables during streaming' from Raphael "Raph" Carvalho
TWCS tables require partition estimation adjustment, as incoming streaming data can be segregated into time windows.

Turns out we had two problems in this area that lead to suboptimal bloom filters.

1) With off-strategy enabled, data segregation is postponed, but partition estimation was adjusted as if segregation wasn't postponed. Solved by not adjusting estimation if segregation is postponed.
2) With off-strategy disabled, data segregation is not postponed, but streaming didn't feed any metadata into the partition estimation procedure, meaning it had to assume the maximum number of windows input data can be segregated into (100). Solved by using the schema's default TTL for a precise estimation of the window count.

For the future, we want to dynamically size filters (see https://github.com/scylladb/scylladb/issues/2024), especially for TWCS that might have SSTables that are left uncompacted until they're fully expired, meaning that the system won't heal itself in a timely manner through compaction on a SSTable that had partition estimation really wrong.

Fixes https://github.com/scylladb/scylladb/issues/15704.

Closes scylladb/scylladb#15938

* github.com:scylladb/scylladb:
  streaming: Improve partition estimation with TWCS
  streaming: Don't adjust partition estimate if segregation is postponed

(cherry picked from commit 64d1d5cf62)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #16672
2024-01-08 09:06:43 +02:00
Calle Wilund
aaa25e1a78 Commitlog replayer: Range-check skip call
Fixes #15269

If the segment being replayed is corrupted/truncated, we can attempt to skip
completely bogus byte amounts, which can cause an assert (i.e. a crash) in
file_data_source_impl. This is not a crash-level error, so ensure we
range-check the distance in the reader.

v2: Add to corrupt_size if trying to skip more than available. The amount added is "wrong", but at least it will
    ensure we log the fact that things are broken

Closes scylladb/scylladb#15270

(cherry picked from commit 6ffb482bf3)
2024-01-05 09:19:45 +02:00
Beni Peled
c57a0a7a46 release: prepare for 5.2.13 2024-01-03 17:48:59 +02:00
Botond Dénes
740ba3ac2a tools/schema_loader: read_schema_table_mutation(): close the reader
The reader used to read the sstables was not closed. This could
sometimes trigger an abort(), because the reader was destroyed without
being closed first.
Why only sometimes? This is due to two factors:
* read_mutation_from_flat_mutation_reader() - the method used to extract
  a mutation from the reader, uses consume(), which does not trigger
  `set_close_is_required()` (#16520). Due to this, the top-level
  combined reader did not complain when destroyed without close.
* The combined reader closes underlying readers that have no more data
  for the current range. If the circumstances are just right, all
  underlying readers are closed before the combined reader is
  destroyed. Looks like this is what happens most of the time.

This bug was discovered in SCT testing. After fixing #16520, all
invocations of `scylla-sstable` which use this code would trigger the
abort without this patch. So no further testing is required.

Fixes: #16519

Closes scylladb/scylladb#16521

(cherry picked from commit da033343b7)
2023-12-31 18:13:10 +02:00
Gleb Natapov
76c3dda640 storage_service: register schema version observer before joining group0 and starting gossiper
The schema version is updated by group0, so if group0 starts before the
schema version observer is registered, some updates may be missed. Since
the observer is used to update the node's gossiper state, the gossiper
may contain a wrong schema version.

Fix by registering the observer before starting group0, and even before
starting the gossiper, to avoid a theoretical case where something may
pull the schema after gossiping starts and before the observer is
registered.

Fixes: #15078

Message-Id: <ZOYZWhEh6Zyb+FaN@scylladb.com>
(cherry picked from commit d1654ccdda)
2023-12-20 11:14:27 +01:00
Kamil Braun
287546923e Merge 'db: hints: add checksum to sync_point encoding' from Patryk Jędrzejczak
Fixes #9405

The `sync_point` API, provided with an incorrect sync point id, might
allocate a huge amount of memory and fail with `std::bad_alloc`.

To fix this, we can check if the encoded sync point has been modified
before decoding. We can achieve this by calculating a checksum before
encoding, appending it to the encoded sync point, and comparing it with
a checksum calculated in `db::hints::decode` before decoding.

Closes #14534

* github.com:scylladb/scylladb:
  db: hints: add checksum to sync point encoding
  db: hints: add the version_size constant

(cherry picked from commit eb6202ef9c)

The only difference from the original merge commit is the include
path of `xx_hasher.hh`. On branch 5.2, this file is in the root
directory, not `utils`.

Closes #16458
2023-12-19 17:39:50 +02:00
Botond Dénes
c0dab523f9 Update tools/java submodule
* tools/java e2aad6e3a0...d7ec9bf45f (1):
  > Merge "build: take care of old libthrift" from Piotr Grabowski

Fixes: scylladb/scylla-tools-java#352

Closes #16464
2023-12-19 17:37:27 +02:00
Michael Huang
5499f7b5a8 cdc: use chunked_vector for topology_description entries
Lists can grow very big. Let's use a chunked vector to prevent large contiguous
allocations.
Fixes: #15302.

Closes scylladb/scylladb#15428

(cherry picked from commit 62a8a31be7)
2023-12-19 13:43:23 +01:00
Piotr Grabowski
7055ac45d1 test: use more frequent reconnection policy
The default reconnection policy in Python Driver is an exponential
backoff (with jitter) policy, which starts at 1 second reconnection
interval and ramps up to 600 seconds.

This is a problem in tests (refs #15104), especially in tests that restart
or replace nodes. In such a scenario, a node can be unavailable for an
extended period of time and the driver will try to reconnect to it
multiple times, eventually reaching very long reconnection interval
values, exceeding the timeout of a test.

Fix the issue by using an exponential reconnection policy with a maximum
interval of 4 seconds. A smaller value was not chosen, as each retry
clutters the logs with a reconnection exception stack trace.

Fixes #15104

Closes #15112

(cherry picked from commit 17e3e367ca)
2023-12-19 13:43:23 +01:00
Gleb Natapov
4ff29d1637 raft: drop assert in server_impl::apply_snapshot for a condition that may happen
server_impl::apply_snapshot() assumes that it cannot receive a snapshot
from the same host until the previous one is handled, and usually this is
true, since a leader will not send another snapshot until it gets a
response to a previous one. But it may happen that the snapshot-sending
RPC fails after the snapshot was sent but before the reply is received,
because of a connection disconnect. In this case the leader may send
another snapshot, and there is no guarantee that the previous one was
already handled, so the assumption may break.

Drop the assert that verifies the assumption and return an error in this
case instead.

Fixes: #15222

Message-ID: <ZO9JoEiHg+nIdavS@scylladb.com>
(cherry picked from commit 55f047f33f)
2023-12-19 13:43:23 +01:00
Alexey Novikov
6bcf9e6631 When adding a duration field to a UDT, check whether the UDT is used in some clustering key
Having values of the duration type is not allowed for clustering
columns, because durations can't be ordered. This is correctly validated
when creating a table, but not validated when we alter the type.
Fixes #12913

Closes scylladb/scylladb#16022

(cherry picked from commit bd73536b33)
2023-12-19 06:58:41 -05:00
Takuya ASADA
74dd8f08e3 dist: fix local-fs.target dependency
systemd man page says:

systemd-fstab-generator(3) automatically adds dependencies of type Before= to
all mount units that refer to local mount points for this target unit.

So "Before=local-fs.target" is the correct dependency for local mount
points, but we currently specify "After=local-fs.target"; it should be
fixed.

Also replaced "WantedBy=multi-user.target" with "WantedBy=local-fs.target",
since .mount units are not related to multi-user but depend on local
filesystems.

Fixes #8761

Closes scylladb/scylladb#15647

(cherry picked from commit a23278308f)
2023-12-19 13:15:00 +02:00
Botond Dénes
68507ed4d9 Merge '[Backport 5.2] Shard of shard repair task impl' from Aleksandra Martyniuk
Shard id is logged twice in repair (once explicitly, once added by logger).
Redundant occurrence is deleted.

shard_repair_task_impl::id (which contains global repair shard)
is renamed to avoid further confusion.

Fixes: https://github.com/scylladb/scylladb/issues/12955

Closes #16439

* github.com:scylladb/scylladb:
  repair: rename shard_repair_task_impl::id
  repair: delete redundant shard id from logs
2023-12-19 10:28:57 +02:00
Botond Dénes
46a29e9a02 Merge 'alternator: fix isolation of concurrent modifications to tags' from Nadav Har'El
Alternator's implementation of TagResource, UntagResource and UpdateTimeToLive (the latter uses tags to store the TTL configuration) was unsafe for concurrent modifications - some of these modifications may be lost. This short series fixes the bug, and also adds (in the last patch) a test that reproduces the bug and verifies that it's fixed.

The cause of the incorrect isolation was that we separately read the old tags and wrote the modified tags. In this series we introduce a new function, `modify_tags()` which can do both under one lock, so concurrent tag operations are serialized and therefore isolated as expected.

Fixes #6389.

Closes #13150

* github.com:scylladb/scylladb:
  test/alternator: test concurrent TagResource / UntagResource
  db/tags: drop unsafe update_tags() utility function
  alternator: isolate concurrent modification to tags
  db/tags: add safe modify_tags() utility functions
  migration_manager: expose access to storage_proxy

(cherry picked from commit dba1d36aa6)

Closes #16453
2023-12-19 10:19:31 +02:00
Botond Dénes
23fd6939eb Merge '[Backport to 5.2] gossiper: mark_alive: use deferred_action to unmark pending' from Benny Halevy
Backport the following patches to 5.2:
- gossiper: mark_alive: enter background_msg gate (#14791)
- gossiper: mark_alive: use deferred_action to unmark pending (#14839)

Closes #16452

* github.com:scylladb/scylladb:
  gossiper: mark_alive: use deferred_action to unmark pending
  gossiper: mark_alive: enter background_msg gate
2023-12-19 09:06:37 +02:00
Botond Dénes
1cf499cfea Update tools/java submodule
* tools/java 80701efa8d...e2aad6e3a0 (2):
  > build: update logback dependency
  > build: update `netty` dependency

Fixes: https://github.com/scylladb/scylla-tools-java/issues/363
Fixes: https://github.com/scylladb/scylla-tools-java/issues/364

Closes #16444
2023-12-18 18:19:20 +02:00
Nadav Har'El
91e05dc646 cql: fix SELECT toJson() or SELECT JSON of time column
The implementation of "SELECT TOJSON(t)" or "SELECT JSON t" for a column
of type "time" forgot to put the time string in quotes. The result was
invalid JSON. This patch is a one-liner fixing this bug.

This patch also removes the "xfail" marker from one xfailing test
for this issue which now starts to pass. We also add a second test for
this issue - the existing test was for "SELECT TOJSON(t)", and the second
test shows that "SELECT JSON t" had exactly the same bug - and both are
fixed by the same patch.

We also had a test translated from Cassandra which exposed this bug,
but that test continues to fail because of other bugs, so we just
need to update the xfail string.

The patch also fixes one C++ test, test/boost/json_cql_query_test.cc,
which enshrined the *wrong* behavior - JSON output that isn't even
valid JSON - and had to be fixed. Unlike the Python tests, the C++ test
can't be run against Cassandra, and doesn't even run a JSON parser
on the output, which explains how it came to enshrine wrong output
instead of helping to discover the bug.

Fixes #7988

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#16121

(cherry picked from commit 8d040325ab)
2023-12-18 18:19:20 +02:00
Benny Halevy
a2009c4a8c gossiper: mark_alive: use deferred_action to unmark pending
Make sure _pending_mark_alive_endpoints is unmarked in
any case, including exceptions.
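The pattern can be sketched with a minimal scope guard (the real code uses `seastar::deferred_action`; the types and names below are illustrative):

```cpp
#include <functional>
#include <set>
#include <stdexcept>
#include <string>

// Minimal scope guard: runs its action on every exit path.
struct deferred_action {
    std::function<void()> action;
    ~deferred_action() { action(); }
};

// Sketch: the endpoint is unmarked even when the body throws, so the
// pending set can never leak an entry.
inline void mark_alive(std::set<std::string>& pending, const std::string& ep,
                       bool fail) {
    pending.insert(ep);
    deferred_action unmark{[&] { pending.erase(ep); }};
    if (fail) {
        throw std::runtime_error("echo message failed");  // simulated error
    }
    // ... normal mark-alive work ...
}
```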

Fixes #14839

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #14840

(cherry picked from commit 1e7e2eeaee)
2023-12-18 14:44:22 +02:00
Benny Halevy
999a6bfaae gossiper: mark_alive: enter background_msg gate
The function dispatches a background operation that must be
waited on in stop().

\Fixes scylladb/scylladb#14791

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 868e436901)
2023-12-18 14:42:52 +02:00
Kefu Chai
faef786c88 reloc: strip.sh: always generate symbol list with posix format
we compare the symbol list of the stripped ELF file ($orig.stripped) and
that of the one including debugging symbols ($orig.debug) to get
an ELF file which includes only the necessary bits as the debuginfo
($orig.minidebug).

but we generate the symbol list of the stripped ELF file using the
sysv format, while generating the one from the unstripped file using the
posix format. the former always pads the symbol names with spaces
so that their length is at least the same as the section name's after
we split the fields with "|".

that's why the diff includes the stuff we don't expect. and hence,
we have tons of warnings like:

```
objcopy: build/node_exporter/node_exporter.keep_symbols:4910: Ignoring rubbish found on this line
```

when using objcopy to filter the ELF file to keep only the
symbols we are interested in.

so, in this change

* use the same format when dumping the symbols from unstripped ELF
  file
* include the symbols in the text area -- the code, by checking
  "T" and "t" in the dumped symbols. this was achieved by matching
  the lines with "FUNC" before this change.
* include the symbols in the .init data section -- the global
  variables which are initialized at compile time. they could
  also be interesting when debugging an application.

Fixes #15513
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes scylladb/scylladb#15514

(cherry picked from commit 50c937439b)
2023-12-18 13:58:14 +02:00
Michał Chojnowski
7e9bdef8bb row_cache: when the constructor fails, clear _partitions in the right allocator
If the constructor of row_cache throws, `_partitions` is cleared in the
wrong allocator, possibly causing allocator corruption.

Fix that.

Fixes #15632

Closes scylladb/scylladb#15633

(cherry picked from commit 330d221deb)
2023-12-18 13:55:16 +02:00
Michael Huang
af38b255c8 cql3: Fix invalid JSON parsing for JSON objects with ASCII keys
For JSON objects represented as map<ascii, int>, don't treat ASCII keys
as a nested JSON string. We were doing that prior to the patch, which
led to parsing errors.

Included the error offset where JSON parsing failed for
rjson::parse related functions to help identify parsing errors
better.

Fixes: #7949

Signed-off-by: Michael Huang <michaelhly@gmail.com>

Closes scylladb/scylladb#15499

(cherry picked from commit 75109e9519)
2023-12-18 13:45:57 +02:00
Kefu Chai
c4b699525a sstables: throw at seeing invalid chunk_len
before this change, when running into a zero chunk_len, scylla
crashes with `assert(chunk_size != 0)`. but we can do better than
printing a backtrace like:
```
scylla: sstables/compress.cc:158: void
sstables::compression::segmented_offsets::init(uint32_t): Assertion `chunk_size != 0' failed.
```
so, in this change, a `malformed_sstable_exception` is thrown in place
of an `assert()`, which is supposed to verify programming
invariants, not to identify corrupted data files.
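The shape of the fix can be sketched like this (hypothetical, simplified signatures; the real check lives in `sstables::compression::segmented_offsets::init`):

```cpp
#include <cstdint>
#include <stdexcept>

// Stand-in for sstables::malformed_sstable_exception.
struct malformed_sstable_exception : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Corrupted on-disk data is an expected runtime condition, so report it
// with a descriptive exception; assert() stays reserved for programming
// invariants that can only fail due to a bug in the code itself.
inline void init_offsets(uint32_t chunk_size) {
    if (chunk_size == 0) {
        throw malformed_sstable_exception("compression info has chunk_len 0");
    }
    // ... proceed with a valid chunk size ...
}
```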

Fixes #15265
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #15264

(cherry picked from commit 1ed894170c)
2023-12-18 13:29:02 +02:00
Nadav Har'El
3a24b8c435 sstables: stop warning when auto-snapshot leaves non-empty directory
When a table is dropped, we delete its sstables, and finally try to delete
the table's top-level directory with the rmdir system call. When the
auto-snapshot feature is enabled (this is still Scylla's default),
the snapshot will remain in that directory, so it won't be empty and
cannot be removed. Today, this results in a long, ugly and scary warning
in the log:

```
WARN  2023-07-06 20:48:04,995 [shard 0] sstable - Could not remove table directory "/tmp/scylla-test-198265/data/alternator_alternator_Test_1688665684546/alternator_Test_1688665684546-4238f2201c2511eeb15859c589d9be4d/snapshots": std::filesystem::__cxx11::filesystem_error (error system:39, filesystem error: remove failed: Directory not empty [/tmp/scylla-test-198265/data/alternator_alternator_Test_1688665684546/alternator_Test_1688665684546-4238f2201c2511eeb15859c589d9be4d/snapshots]). Ignored.
```

It is bad to log as a warning something which is completely normal - it
happens every time a table is dropped with the perfectly valid (and even
default) auto-snapshot mode. We should only log a warning if the deletion
failed because of some unexpected reason.

And in fact, this is exactly what the code **tried** to do - it does
not log a warning if the rmdir failed with EEXIST. It even had a comment
saying why it was doing this. But the problem is that in Linux, deleting
a non-empty directory does not return EEXIST, it returns ENOTEMPTY...
Posix actually allows both. So we need to check both, and this is the
only change in this patch.
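A minimal sketch of the corrected check (the helper name is illustrative):

```cpp
#include <cerrno>

// POSIX permits rmdir() to fail with either EEXIST or ENOTEMPTY for a
// non-empty directory; Linux uses ENOTEMPTY. Treat both as the benign
// "snapshots are still in there" case and warn only on anything else.
inline bool rmdir_failed_because_not_empty(int err) {
    return err == EEXIST || err == ENOTEMPTY;
}
```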

To confirm that this patch works, edit test/cql-pytest/run.py and
change auto-snapshot from 0 to 1, run test/alternator/run (for example)
and see many "Directory not empty" warnings as above. With this patch,
none of these warnings appear.

Fixes #13538

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #14557

(cherry picked from commit edfb89ef65)
2023-12-18 13:26:40 +02:00
Kefu Chai
9e9a488da3 streaming: cast the progress to a float before formatting it
before this change, we format a `long` using `{:f}`. fmtlib would
throw an exception when actually formatting it.

so, let's make the percentage a float before formatting it.
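The same idea in a standalone sketch (the original uses fmtlib's `{:f}`; printf-style `%f` makes the same point here, since passing the integer directly would be invalid either way — the function name is illustrative):

```cpp
#include <cstdio>
#include <string>

// Progress counters are integers; "%f" (like fmt's "{:f}") requires a
// floating-point argument, so cast before formatting.
inline std::string format_progress(long done, long total) {
    float pct = 100.0f * static_cast<float>(done) / static_cast<float>(total);
    char buf[32];
    std::snprintf(buf, sizeof(buf), "%.2f%%", pct);
    return std::string(buf);
}
```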

Fixes #14587
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #14588

(cherry picked from commit 1eb76d93b7)
2023-12-18 13:16:58 +02:00
Aleksandra Martyniuk
614d15b9f6 repair: rename shard_repair_task_impl::id
shard_repair_task_impl::id stores global repair id. To avoid confusion
with the task id, the field is renamed to global_repair_id.

(cherry picked from commit d889a599e8)
2023-12-18 12:08:00 +01:00
Aleksandra Martyniuk
fc2799096f repair: delete redundant shard id from logs
In repair, the shard id is logged twice. Delete the repeated occurrence.

(cherry picked from commit f7c88edec5)
2023-12-18 12:03:26 +01:00
Petr Gusev
b9178bd853 hints: send_one_hint: extend the scope of file_send_gate holder
The problem was that the holder in with_gate
call was released too early. This happened
before the possible call to on_hint_send_failure
in then_wrapped. As a result, the effects of
on_hint_send_failure (segment_replay_failed flag)
were not visible in send_one_file after
ctx_ptr->file_send_gate.close(), so we could decide
that the segment was sent in full and delete
it even if sending of some hints led to errors.

Fixes #15110

(cherry picked from commit 9fd3df13a2)
2023-12-18 13:03:23 +02:00
Kefu Chai
12aacea997 compound_compat: do not format an sstring with {:d}
before this change, we format an sstring with "{:d}"; fmtlib would throw
`fmt::format_error` at runtime when formatting it. this is not expected.

so, in this change, we just print the int8_t using `seastar::format()`
in a single pass. and with the format specifier of `#02x` instead of
adding the "0x" prefix manually.
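A standalone sketch of the intended output, using printf-style formatting in place of `seastar::format()` (the function name is illustrative):

```cpp
#include <cstdio>
#include <string>

// Format each byte with an alternate-form hex specifier instead of
// pushing the raw string through an integer ("{:d}") slot. "%#04x"
// yields e.g. "0x1f" with no manually added prefix (note: for a zero
// byte, "%#x" prints plain zero-padded digits, as "#" adds "0x" only
// for nonzero values).
inline std::string dump_bytes_hex(const std::string& bytes) {
    std::string out;
    char buf[8];
    for (unsigned char c : bytes) {
        std::snprintf(buf, sizeof(buf), "%#04x ", c);
        out += buf;
    }
    if (!out.empty()) {
        out.pop_back();  // drop the trailing space
    }
    return out;
}
```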

Fixes #14577
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #14578

(cherry picked from commit 27d6ff36df)
2023-12-18 12:47:16 +02:00
Kefu Chai
df30f66bfa tools/scylla-sstable: dump column_desc as an object
before this change, `scylla sstable dump-statistics` prints the
"regular_columns" as a list of strings, like:

```
        "regular_columns": [
          "name",
          "clustering_order",
          "type_name",
          "org.apache.cassandra.db.marshal.UTF8Type",
          "name",
          "column_name_bytes",
          "type_name",
          "org.apache.cassandra.db.marshal.BytesType",
          "name",
          "kind",
          "type_name",
          "org.apache.cassandra.db.marshal.UTF8Type",
          "name",
          "position",
          "type_name",
          "org.apache.cassandra.db.marshal.Int32Type",
          "name",
          "type",
          "type_name",
          "org.apache.cassandra.db.marshal.UTF8Type"
        ]
```

but according
https://opensource.docs.scylladb.com/stable/operating-scylla/admin-tools/scylla-sstable.html#dump-statistics,

> $SERIALIZATION_HEADER_METADATA := {
>     "min_timestamp_base": Uint64,
>     "min_local_deletion_time_base": Uint64,
>     "min_ttl_base": Uint64",
>     "pk_type_name": String,
>     "clustering_key_types_names": [String, ...],
>     "static_columns": [$COLUMN_DESC, ...],
>     "regular_columns": [$COLUMN_DESC, ...],
> }
>
> $COLUMN_DESC := {
>     "name": String,
>     "type_name": String
> }

"regular_columns" is supposed to be a list of "$COLUMN_DESC".
the same applies to "static_columns". this schema makes sense,
as each column should be considered a single object which
is composed of two properties. but we dump them as a flat list.

so, in this change, we guard each visit() call of `json_dumper()`
with `StartObject()` and `EndObject()` pair, so that each column
is printed as an object.

after the change, "regular_columns" are printed like:
```
        "regular_columns": [
          {
            "name": "clustering_order",
            "type_name": "org.apache.cassandra.db.marshal.UTF8Type"
          },
          {
            "name": "column_name_bytes",
            "type_name": "org.apache.cassandra.db.marshal.BytesType"
          },
          {
            "name": "kind",
            "type_name": "org.apache.cassandra.db.marshal.UTF8Type"
          },
          {
            "name": "position",
            "type_name": "org.apache.cassandra.db.marshal.Int32Type"
          },
          {
            "name": "type",
            "type_name": "org.apache.cassandra.db.marshal.UTF8Type"
          }
        ]
```

Fixes #15036
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #15037

(cherry picked from commit c82f1d2f57)
2023-12-18 12:26:36 +02:00
Michał Sala
2427bda737 forward_service: introduce shutdown checks
This commit introduces a new boolean flag, `shutdown`, to the
forward_service, along with a corresponding shutdown method. It also
adds checks throughout the forward_service to verify the value of the
shutdown flag before retrying or invoking functions that might use the
messaging service under the hood.

The flag is set before messaging service shutdown, by invoking
forward_service::shutdown in main. By checking the flag before each call
that potentially involves the messaging service, we can ensure that the
messaging service is still operational. If the flag is false, indicating
that the messaging service is still active, we can proceed with the
call. In the event that the messaging service is shutdown during the
call, appropriate exceptions should be thrown somewhere down in called
functions, avoiding potential hangs.

This fix should resolve the issue where forward_service retries could
block the shutdown.
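The flag-check pattern, reduced to a synchronous sketch (the real service is future-based and shards the flag per reactor; the class and method names here are illustrative):

```cpp
#include <atomic>
#include <stdexcept>

class forward_service_sketch {
    std::atomic<bool> _shutdown{false};
public:
    // Set before the messaging service goes down.
    void shutdown() { _shutdown.store(true); }

    // Checked before every call that could reach the messaging service,
    // so retries fail fast instead of hanging on a stopped service.
    void forward(int& delivered) {
        if (_shutdown.load()) {
            throw std::runtime_error("forward_service is shutting down");
        }
        ++delivered;  // stand-in for the actual RPC
    }
};
```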

Fixes #12604

Closes #13922

(cherry picked from commit e0855b1de2)
2023-12-18 12:25:25 +02:00
Petr Gusev
27adf340ef storage_proxy: mutation:: make frozen_mutation [[ref]]
We had a redundant copy in receive_mutation_handler
forward_fn callback. This frozen_mutation is
dynamically allocated and can be arbitrary large.

Fixes: #12504
(cherry picked from commit 5adbb6cde2)
2023-12-18 12:20:40 +02:00
Botond Dénes
5c33c9d6a6 Merge 'thrift: return address in listen_addresses() only after server is ready' from Marcin Maliszkiewicz
This is used for the readiness API: /storage_service/rpc_server, and the fix prevents it from returning 'true' prematurely.

Some improvement for readiness was added in a51529dd15 but thrift implementation wasn't fully done.

Fixes https://github.com/scylladb/scylladb/issues/12376

Closes #13319

* github.com:scylladb/scylladb:
  thrift: return address in listen_addresses() only after server is ready
  thrift: simplify do_start_server() with seastar:async

(cherry picked from commit 9a024f72c4)
2023-12-18 12:20:40 +02:00
Kamil Braun
9aaaa66981 Merge 'cql3: fix a few misformatted printouts of column names in error messages' from Nadav Har'El
Fix a few cases where instead of printing column names in error messages, we printed weird stuff like ASCII codes or the address of the name.

Fixes #13657

Closes #13658

* github.com:scylladb/scylladb:
  cql3: fix printing of column_specification::name in some error messages
  cql3: fix printing of column_definition::name in some error messages

(cherry picked from commit a29b8cd02b)
2023-12-18 09:55:37 +02:00
Avi Kivity
b21ec82894 Merge 'Do not yield while traversing the gossiper endpoint state map' from Benny Halevy
This series introduces a new gossiper method: get_endpoints that returns a vector of endpoints (by value) based on the endpoint state map.

get_endpoints is used here by gossiper and storage_service for iterations that may preempt,
instead of iterating directly over the endpoint state map (`_endpoint_state_map` in gossiper, or via `get_endpoint_states()`), so as to prevent the use-after-free that may happen if the map is rehashed while the function yields, invalidating the loop iterators.
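The shape of such a getter can be sketched with simplified types (the point is returning the keys by value, so a later rehash cannot invalidate anything the loop holds across a preemption point):

```cpp
#include <string>
#include <unordered_map>
#include <vector>

struct endpoint_state {};  // illustrative stand-in

// Snapshot the endpoints by value; iterating over the returned vector
// across yields is safe even if the map is rehashed meanwhile.
inline std::vector<std::string>
get_endpoints(const std::unordered_map<std::string, endpoint_state>& state_map) {
    std::vector<std::string> eps;
    eps.reserve(state_map.size());
    for (const auto& kv : state_map) {
        eps.push_back(kv.first);
    }
    return eps;
}
```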

\Fixes #13899

\Closes #13900

* github.com:scylladb/scylladb:
  storage_service: do not preempt while traversing endpoint_state_map
  gossiper: do not preempt while traversing endpoint_state_map

(cherry picked from commit d2d53fc1db)

Closes #16431
2023-12-18 09:35:42 +02:00
Yaron Kaikov
5052890ae8 release: prepare for 5.2.12 2023-12-17 14:28:03 +02:00
Kefu Chai
0da3453f95 db: schema_tables: capture reference to temporary value by value
`clustering_key_columns()` returns a range view, and `front()` returns
the reference to its first element. so we cannot assume the availability
of this reference after the expression is evaluated. to address this
issue, let's capture the returned range by value, and keep the first
element by reference.

this also silences warning from GCC-13:

```
/home/kefu/dev/scylladb/db/schema_tables.cc:3654:30: error: possibly dangling reference to a temporary [-Werror=dangling-reference]
 3654 |     const column_definition& first_view_ck = v->clustering_key_columns().front();
      |                              ^~~~~~~~~~~~~
/home/kefu/dev/scylladb/db/schema_tables.cc:3654:79: note: the temporary was destroyed at the end of the full expression ‘(& v)->view_ptr::operator->()->schema::clustering_key_columns().boost::iterator_range<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> > >::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::random_access_traversal_tag>::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::bidirectional_traversal_tag>::<anonymous>.boost::iterator_range_detail::iterator_range_base<__gnu_cxx::__normal_iterator<const column_definition*, std::vector<column_definition> >, boost::iterators::incrementable_traversal_tag>::front()’
 3654 |     const column_definition& first_view_ck = v->clustering_key_columns().front();
      |                                              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~
```
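The fix pattern in isolation (illustrative types standing in for the schema's column range):

```cpp
#include <string>
#include <vector>

struct column_definition { std::string name; };

// Stand-in for schema::clustering_key_columns(): returns a range object
// by value, like the boost::iterator_range view in the real code.
inline std::vector<column_definition> clustering_key_columns() {
    return {{"ck1"}, {"ck2"}};
}

// Keep the returned range alive in a local, then reference its first
// element. Binding a reference directly to
// clustering_key_columns().front() would dangle once the temporary dies
// at the end of the full expression.
inline std::string first_clustering_key_name() {
    auto cols = clustering_key_columns();              // held by value
    const column_definition& first_ck = cols.front();  // safe: cols outlives it
    return first_ck.name;
}
```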

Fixes #13720
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #13721

(cherry picked from commit 135b4fd434)
2023-12-15 13:55:57 +02:00
Benny Halevy
6d7b2bc02f sstables: compressed_file_data_source_impl: get: throw malformed_sstable_exception on premature eof
Currently, the reader might dereference a null pointer
if the input stream reaches eof prematurely,
and read_exactly returns an empty temporary_buffer.

Detect this condition before dereferencing the buffer
and throw sstables::malformed_sstable_exception.
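Reduced to its essentials, the check looks like this (a hypothetical helper; the real code inspects the `temporary_buffer` returned by `read_exactly`, and `std::runtime_error` stands in for `sstables::malformed_sstable_exception`):

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

// A short or empty read where bytes were required means the stream hit
// eof prematurely (truncated or corrupt file): surface a descriptive
// error instead of dereferencing an empty buffer.
inline const char* checked_chunk_data(const std::string& buf, size_t needed) {
    if (buf.size() < needed) {
        throw std::runtime_error("compressed file truncated: premature eof");
    }
    return buf.data();
}
```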

Fixes #13599

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #13600

(cherry picked from commit 77b70dbdb7)
2023-12-15 13:54:42 +02:00
Wojciech Mitros
119c8279dd rust: update wasmtime dependency
The previous version of wasmtime had a vulnerability that possibly
allowed causing undefined behavior when calling UDFs.

We're directly updating to wasmtime 8.0.1, because the update only
requires a slight code modification and the Wasm UDF feature is
still experimental. As a result, we'll benefit from a number of
new optimizations.

Fixes #13807

Closes #13804

(cherry picked from commit 6bc16047ba)
2023-12-15 13:54:42 +02:00
Michał Chojnowski
3af6dfe4ac database: fix reads_memory_consumption for system semaphore
The metric shows the opposite of what its name suggests.
It shows available memory rather than consumed memory.
Fix that.

Fixes #13810

Closes #13811

(cherry picked from commit 0813fa1da0)
2023-12-15 13:54:42 +02:00
Eliran Sinvani
0230798db3 use_statement: Convert an exception to a future exception
The use statement execution code can throw if the keyspace
doesn't exist. This can be a problem for code that calls
execute in a fiber, since the exception will break the fiber even
if `then_wrapped` is used.

Fixes #14449

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>

Closes scylladb/scylladb#14394

(cherry picked from commit c5956957f3)
2023-12-15 13:54:42 +02:00
Botond Dénes
64503a7137 Merge 'mutation_query: properly send range tombstones in reverse queries' from Michał Chojnowski
reconcilable_result_builder passes range tombstone changes to _rt_assembler
using table schema, not query schema.
This means that a tombstone with bounds (a; b), where a < b in query schema
but a > b in table schema, will not be emitted from mutation_query.

This is a very serious bug, because it means that such tombstones in reverse
queries are not reconciled with data from other replicas.
If *any* queried replica has a row, but not the range tombstone which deleted
the row, the reconciled result will contain the deleted row.

In particular, range deletes performed while a replica is down will not
later be visible to reverse queries which select this replica, regardless of the
consistency level.

As far as I can see, this doesn't result in any persistent data loss.
Only in that some data might appear resurrected to reverse queries,
until the relevant range tombstone is fully repaired.

This series fixes the bug and adds a minimal reproducer test.

Fixes #10598

Closes scylladb/scylladb#16003

* github.com:scylladb/scylladb:
  mutation_query_test: test that range tombstones are sent in reverse queries
  mutation_query: properly send range tombstones in reverse queries

(cherry picked from commit 65e42e4166)
2023-12-14 12:53:07 +02:00
Yaron Kaikov
b013877629 build_docker.sh: Upgrade package during creation and remove sshd service
When scanning our latest docker image using `trivy` (command: `trivy
image docker.io/scylladb/scylla-nightly:latest`), it shows we have OS
packages which are out of date.

Also removing `openssh-server` and `openssh-client`, since we don't use
them for our docker images.

Fixes: https://github.com/scylladb/scylladb/issues/16222

Closes scylladb/scylladb#16224

(cherry picked from commit 7ce6962141)

Closes #16360
2023-12-11 10:57:16 +02:00
Botond Dénes
33d2da94ab reader_concurrency_semaphore: execution_loop(): trigger admission check when _ready_list is empty
The execution loop consumes permits from the _ready_list and executes
them. The _ready_list usually contains a single permit. When the
_ready_list is not empty, new permits are queued until it becomes empty.
The execution loop relies on admission checks, triggered by the read
releasing resources, to bring any queued read into the _ready_list
while it is executing the current read. But in some cases the current
read might not free any resources and thus fails to trigger an admission
check, and the currently queued permits will sit in the queue until
another source triggers an admission check.
I don't yet know how this situation can occur, if at all, but it is
reproducible with a simple unit test, so it is best to cover this
corner-case in the off-chance it happens in the wild.
Add an explicit admission check to the execution loop, after the
_ready_list is exhausted, to make sure any waiters that can be admitted
with an empty _ready_list are admitted immediately and execution
continues.

Fixes: #13540

Closes #13541

(cherry picked from commit b790f14456)
2023-12-07 16:04:55 +02:00
Paweł Zakrzewski
dac69be4a4 auth: fix error message when consistency level is not met
Propagate `exceptions::unavailable_exception` error message to the
client such as cqlsh.

Fixes #2339

(cherry picked from commit 400aa2e932)
2023-12-07 14:49:47 +02:00
Botond Dénes
763e583cf2 Merge 'row_cache: abort on exteral_updater::execute errors' from Benny Halevy
Currently the cache updaters aren't exception safe
yet they are intended to be.

Instead of allowing exceptions from
`external_updater::execute` escape `row_cache::update`,
abort using `on_fatal_internal_error`.

Future changes should harden all `execute` implementations
to effectively make them `noexcept`, then the pure virtual
definition can be made `noexcept` to cement that.

\Fixes scylladb/scylladb#15576

\Closes scylladb/scylladb#15577

* github.com:scylladb/scylladb:
  row_cache: abort on exteral_updater::execute errors
  row_cache: do_update: simplify _prev_snapshot_pos setup

(cherry picked from commit 4a0f16474f)

Closes scylladb/scylladb#16256
2023-12-07 09:16:42 +02:00
Nadav Har'El
b331b4a4bb Backport fixes for nodetool commands with Alternator GSI in the database
Fixes #16153

* java e716e1bd1d...80701efa8d (1):
  > NodeProbe: allow addressing table name with colon in it

/home/nyh/scylla/tools$ git submodule summary jmx | cat
* jmx bc4f8ea...f21550e (3):
  > ColumnFamilyStore: only quote table names if necessary
  > APIBuilder: allow quoted scope names
  > ColumnFamilyStore: don't fail if there is a table with ":" in its name

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #16296
2023-12-06 10:48:49 +02:00
Anna Stuchlik
d9448a298f doc: fix rollback in the 4.6-to-5.0 upgrade guide
This commit fixes the rollback procedure in
the 4.6-to-5.0 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
  is fixed.
- The "Gracefully shutdown ScyllaDB" command
  is fixed.

In addition, there are the following updates
to be in sync with the tests:

- The "Backup the configuration file" step is
  extended to include a command to backup
  the packages.
- The Rollback procedure is extended to restore
  the backup packages.
- The Reinstallation section is fixed for RHEL.

Refs https://github.com/scylladb/scylladb/issues/11907

This commit must be backported to branch-5.4, branch-5.2, and branch-5.1

Closes scylladb/scylladb#16155

(cherry picked from commit 1e80bdb440)
2023-12-05 15:10:21 +02:00
Anna Stuchlik
a82fd96b6a doc: fix rollback in the 5.0-to-5.1 upgrade guide
This commit fixes the rollback procedure in
the 5.0-to-5.1 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
  is fixed.
- The "Gracefully shutdown ScyllaDB" command
  is fixed.

In addition, there are the following updates
to be in sync with the tests:

- The "Backup the configuration file" step is
  extended to include a command to backup
  the packages.
- The Rollback procedure is extended to restore
  the backup packages.
- The Reinstallation section is fixed for RHEL.

Also, I've removed the rollback
section for images, as it's not correct or
relevant.

Refs https://github.com/scylladb/scylladb/issues/11907

This commit must be backported to branch-5.4, branch-5.2, and branch-5.1

Closes scylladb/scylladb#16154

(cherry picked from commit 7ad0b92559)
2023-12-05 15:08:25 +02:00
Anna Stuchlik
ae79fb9ce0 doc: fix rollback in the 5.1-to-5.2 upgrade guide
This commit fixes the rollback procedure in
the 5.1-to-5.2 upgrade guide:
- The "Restore system tables" step is removed.
- The "Restore the configuration file" command
  is fixed.
- The "Gracefully shutdown ScyllaDB" command
  is fixed.

In addition, there are the following updates
to be in sync with the tests:

- The "Backup the configuration file" step is
  extended to include a command to backup
  the packages.
- The Rollback procedure is extended to restore
  the backup packages.
- The Reinstallation section is fixed for RHEL.

Also, I've removed the rollback
section for images, as it's not correct or
relevant.

Refs https://github.com/scylladb/scylladb/issues/11907

This commit must be backported to branch-5.4 and branch-5.2.

Closes scylladb/scylladb#16152

(cherry picked from commit 91cddb606f)
2023-12-05 14:58:21 +02:00
Pavel Emelyanov
d83f4b9240 Update seastar submodule
* seastar eda297fc...43a1ce58 (2):
  > io_queue: Add iogroup label to metrics
  > io_queue: Remove ioshard metrics label

refs: scylladb/seastar#1591

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-12-05 10:46:07 +03:00
Raphael S. Carvalho
1b8c078cab test: Fix sporadic failures of database_test
database_test is failing sporadically and the cause was traced back
to commit e3e7c3c7e5.

The commit forces a subset of tests in database_test, to run once
for each of predefined x_log2_compaction_group settings.

That causes two problems:
1) test becomes 240% slower in dev mode.
2) queries on system.auth are timing out, and the reason is a small
table being spread across hundreds of compaction groups in each
shard. so to satisfy a range scan, there will be multiple hops,
making the overhead huge. additionally, the compaction group
aware sstable set is not merged yet. so even point queries will
unnecessarily scan through all the groups.

Fixes #13660.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13851

(cherry picked from commit a7ceb987f5)
2023-11-30 17:31:07 +02:00
Benny Halevy
1592a84b80 task_manager: module: make_task: enter gate when the task is created
Passing the gate_closed_exception to the task promise in start()
ends up as an abandoned exception, since no one is waiting
for it.

Instead, enter the gate when the task is made
so it will fail make_task if the gate is already closed.

Fixes scylladb/scylladb#15211

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit f9a7635390)
2023-11-30 17:16:57 +02:00
Michał Chojnowski
bfeadae1bd position_in_partition: make operator= exception-safe
The copy assignment operator of _ck can throw
after _type and _bound_weight have already been changed.
This leaves position_in_partition in an inconsistent state,
potentially leading to various weird symptoms.

The problem was witnessed by test_exception_safety_of_reads.
Specifically: in cache_flat_mutation_reader::add_to_buffer,
which requires the assignment to _lower_bound to be exception-safe.

The easy fix is to perform the only potentially-throwing step first.
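The pattern, with illustrative stand-in members for `_type`, `_bound_weight` and `_ck`:

```cpp
#include <string>
#include <utility>

// Do the only potentially-throwing copy first, into a temporary; mutate
// the trivial members only after it succeeded. If the copy throws,
// *this is left completely unchanged, so the object stays consistent.
struct position_sketch {
    int type = 0;
    int bound_weight = 0;
    std::string ck;  // copy-assignment of this can throw (allocation)

    position_sketch& operator=(const position_sketch& other) {
        std::string ck_copy = other.ck;  // may throw -- do it first
        ck = std::move(ck_copy);         // noexcept from here on
        type = other.type;
        bound_weight = other.bound_weight;
        return *this;
    }
};
```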

Fixes #15822

Closes scylladb/scylladb#15864

(cherry picked from commit 93ea3d41d8)
2023-11-30 15:01:22 +02:00
Avi Kivity
2c219a65f8 Update seastar submodule (spins on epoll)
* seastar 45f4102428...eda297fcb5 (1):
  > epoll: Avoid spinning on aborted connections

Fixes #12774
Fixes #7753
Fixes #13337
2023-11-30 14:09:22 +02:00
Piotr Grabowski
7054b1ab1e install-dependencies.sh: update node_exporter to 1.7.0
Update node_exporter to 1.7.0.

The previous version (1.6.1) was flagged by security scanners (such as
Trivy) with HIGH-severity CVE-2023-39325. 1.7.0 release fixed that
problem.

[Botond: regenerate frozen toolchain]

Fixes #16085

Closes scylladb/scylladb#16086

Closes scylladb/scylladb#16090

(cherry picked from commit 321459ec51)

[avi: regenerate frozen toolchain]
2023-11-27 18:17:38 +00:00
Anna Mikhlin
a65838ee9c re-spin: 5.2.11 2023-11-26 16:17:58 +02:00
Botond Dénes
68faf18ad9 Update ./tools/jmx and ./tools/java submodules
* tools/jmx 88d9bdc...bc4f8ea (1):
  > Merge "scylla-apiclient: update several Java dependencies" from Piotr Grabowski

* tools/java f8f556d802...e716e1bd1d (1):
  > Merge 'build: update several dependencies' from Piotr Grabowski

Update build dependencies which were flagged by security scanners.

Refs: scylladb/scylla-jmx#220
Refs: scylladb/scylla-tools-java#351

Closes #16150
2023-11-23 15:29:00 +02:00
Beni Peled
44d1b55253 release: prepare for 5.2.11 2023-11-22 14:22:13 +02:00
Tomasz Grabiec
bfd8401477 api, storage_service: Recalculate table digests on relocal_schema api call
Currently, the API call recalculates only per-node schema version. To
workaround issues like #4485 we want to recalculate per-table
digests. One way to do that is to restart the node, but that's slow
and has impact on availability.

Use like this:

  curl -X POST http://127.0.0.1:10000/storage_service/relocal_schema

Fixes #15380

Closes #15381

(cherry picked from commit c27d212f4b)
2023-11-21 01:29:28 +01:00
Botond Dénes
e31f2224f5 migration_manager: also reload schema on enabling digest_insensitive_to_expiry
Currently, when said feature is enabled, we recalculate the schema
digest. But this feature also influences how table versions are
calculated, so it has to trigger a recalculation of all table versions,
so that we can guarantee correct versions.
Before, this used to happen by happy accident. Another feature --
table_digest_insensitive_to_expiry -- used to take care of this, by
triggering a table version recalculation. However, this feature only takes
effect if digest_insensitive_to_expiry is also enabled. This used to be
the case incidently, by the time the reload triggered by
table_digest_insensitive_to_expiry ran, digest_insensitive_to_expiry was
already enabled. But this was not guaranteed whatsoever and as we've
recently seen, any change to the feature list, which changes the order
in which features are enabled, can cause this intricate balance to
break.
This patch makes digest_insensitive_to_expiry also kick off a schema
reload, to eliminate our dependence on (unguaranteed) feature order, and
to guarantee that table schemas have a correct version after all features
are enabled. In fact, all schema feature notification handlers now kick
off a full schema reload, to ensure bugs like this don't creep in, in
the future.

Fixes: #16004

Closes scylladb/scylladb#16013

(cherry picked from commit 22381441b0)
2023-11-21 01:29:28 +01:00
Kamil Braun
4101c8beab schema_tables: remove default value for reload in merge_schema
To avoid bugs like the one fixed in the previous commit.

(cherry picked from commit 4376854473)
2023-11-21 01:29:28 +01:00
Kamil Braun
c994ed2057 schema_tables: pass reload flag when calling merge_schema cross-shard
In 0c86abab4d `merge_schema` obtained a new flag, `reload`.

Unfortunately, the flag was assigned a default value, which I think is
almost always a bad idea, and indeed it was in this case. When
`merge_schema` is called on a shard other than 0, it recursively calls
itself on shard 0. That recursive call forgot to pass the `reload` flag.

Fix this.
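The pitfall generalizes: a defaulted flag silently resets whenever a call site forgets to forward it. A minimal self-contained C++ sketch (hypothetical names, not the actual ScyllaDB code) of the cross-shard recursion:

```cpp
#include <cassert>

// Hypothetical stand-in for merge_schema's cross-shard recursion. With
// `reload` defaulted, a recursive call that omits the argument silently
// resets it to false.
bool merge_on_shard(int shard, bool reload = false) {
    if (shard != 0) {
        // Pre-fix bug: `return merge_on_shard(0);` -- the caller's `reload`
        // value is lost and the default kicks in.
        return merge_on_shard(0, reload); // the fix: forward the flag
    }
    return reload; // shard 0 does the real work; report the flag it saw
}
```

With the forwarding in place, the flag survives the hop to shard 0; the buggy variant would return false for `merge_on_shard(3, true)`.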

(cherry picked from commit 48164e1d09)
2023-11-21 01:29:28 +01:00
Avi Kivity
40eed1f1c5 Merge 'schema_mutations, migration_manager: Ignore empty partitions in per-table digest' from Tomasz Grabiec
Schema digest is calculated by querying for mutations of all schema
tables, then compacting them so that all tombstones in them are
dropped. However, even if the mutation becomes empty after compaction,
we still feed its partition key. If the same mutations were compacted
prior to the query, because the tombstones expire, we won't get any
mutation at all and won't feed the partition key. So schema digest
will change once an empty partition of some schema table is compacted
away.

Tombstones expire 7 days after schema change which introduces them. If
one of the nodes is restarted after that, it will compute a different
table schema digest on boot. This may cause performance problems. When
sending a request from the coordinator to a replica, the replica needs
a schema_ptr of the exact schema version requested by the coordinator. If it
doesn't know that version, it will request it from the coordinator and
perform a full schema merge. This adds latency to every such request.
Schema versions which are not referenced are currently kept in cache
for only 1 second, so if the request flow has a low enough rate, this
situation results in perpetual schema pulls.

After ae8d2a550d (5.2.0), it is more likely to
run into this situation, because table creation generates tombstones
for all schema tables relevant to the table, even the ones which
will be otherwise empty for the new table (e.g. computed_columns).

This change introduces a cluster feature which, when enabled, changes
digest calculation to be insensitive to expiry by ignoring empty
partitions in digest calculation. When the feature is enabled,
schema_ptrs are reloaded so that the window of discrepancy during
transition is short and no rolling restart is required.

A similar problem was fixed for per-node digest calculation in
c2ba94dc39e4add9db213751295fb17b95e6b962. Per-table digest calculation
was not fixed at that time because we didn't persist enabled features
and they were not enabled early enough on boot for us to depend on
them in digest calculation. Now they are enabled before non-system
tables are loaded so digest calculation can rely on cluster features.
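The idea can be illustrated with a toy digest (hypothetical types; the real code feeds mutations into a cryptographic hasher): feed a partition's key only if the partition is still non-empty after compaction, so a partition consisting solely of expired tombstones contributes nothing — whether or not it has already been compacted away.

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Toy model: a schema-table partition that may be left empty once
// compaction drops its expired tombstones.
struct partition {
    std::string key;
    int live_rows; // rows that survive compaction; 0 means "empty partition"
};

// Combine one partition key into a running digest (stand-in for the hasher).
static std::size_t feed(std::size_t h, const std::string& key) {
    return h ^ (std::hash<std::string>{}(key) + 0x9e3779b9 + (h << 6) + (h >> 2));
}

std::size_t digest(const std::vector<partition>& parts, bool skip_empty) {
    std::size_t h = 0;
    for (const auto& p : parts) {
        if (skip_empty && p.live_rows == 0) {
            continue; // empty after compaction: treat like a compacted-away partition
        }
        h = feed(h, p.key);
    }
    return h;
}
```

With skip_empty enabled, the digest is the same before and after the tombstone-only partition is physically compacted away; with the old behavior the digests diverge.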

Fixes #4485.

Manually tested using ccm on cluster upgrade scenarios and node restarts.

Closes #14441

* github.com:scylladb/scylladb:
  test: schema_change_test: Verify digests also with TABLE_DIGEST_INSENSITIVE_TO_EXPIRY enabled
  schema_mutations, migration_manager: Ignore empty partitions in per-table digest
  migration_manager, schema_tables: Implement migration_manager::reload_schema()
  schema_tables: Avoid crashing when table selector has only one kind of tables

(cherry picked from commit cf81eef370)
2023-11-21 01:29:28 +01:00
Gleb Natapov
f233c8a9e4 database: fix do_apply_many() to handle empty array of mutations
Currently the code will assert because cl pointer will be null and it
will be null because there are no mutations to initialize it from.
Message-Id: <20230212144837.2276080-3-gleb@scylladb.com>

(cherry picked from commit 941407b905)

Backport needed by #4485.
2023-11-21 01:29:17 +01:00
Botond Dénes
0f3e31975d api/storage_service: start/stop native transport in the statement sg
Currently, it is started/stopped in the streaming/maintenance sg, which
is what the API itself runs in.
Starting the native transport in the streaming sg will lead to severely
degraded performance, as the streaming sg has significantly less
CPU/disk shares and reader concurrency semaphore resources.
Furthermore, it will lead to multi-paged reads possibly switching
between scheduling groups mid-way, triggering an internal error.

To fix, use `with_scheduling_group()` for both starting and stopping
native transport. Technically, it is only strictly necessary for
starting, but I added it for stop as well for consistency.

Also apply the same treatment to RPC (Thrift). Although no one uses it,
best to fix it, just to be on the safe side.

I think we need a more systematic approach for solving this once and for
all, like passing the scheduling group to the protocol server and have
it switch to it internally. This allows the server to always run on the
correct scheduling group, not depending on the caller to remember using
it. However, I think this is best done in a follow-up, to keep this
critical patch small and easily backportable.
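As a rough analogue (plain C++, not Seastar's actual API): a helper that installs the target group for the duration of the callback and restores the caller's group afterwards, so the started service observes the statement group regardless of which group the API handler ran in.

```cpp
#include <cassert>
#include <string>

// Rough analogue of scheduling groups: a thread-local "current group"
// that work inherits from whoever starts it. All names are hypothetical.
thread_local std::string current_group = "main";

template <typename Func>
auto with_scheduling_group(const std::string& sg, Func&& f) {
    std::string prev = current_group;
    current_group = sg;   // run the callback under the target group...
    auto result = f();
    current_group = prev; // ...and restore the caller's group afterwards
    return result;
}

// Stand-in for starting the native transport: reports the group it ran in.
std::string start_native_transport() {
    return current_group;
}
```

Without the wrapper, `start_native_transport()` would simply inherit the API handler's (streaming) group.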

Fixes: #15485

Closes scylladb/scylladb#16019

(cherry picked from commit dfd7981fa7)
2023-11-20 20:00:56 +02:00
Takuya ASADA
c98b22afce scylla_post_install.sh: detect RHEL correctly
$ID_LIKE = "rhel" works only on RHEL-compatible OSes, not on RHEL
itself.
To detect RHEL correctly, we also need to check $ID = "rhel".
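The fix boils down to a predicate over the /etc/os-release fields, sketched here in C++ with hypothetical names (the actual change is in a shell script):

```cpp
#include <cassert>
#include <sstream>
#include <string>

// RHEL itself sets ID="rhel" but no ID_LIKE; clones (Rocky, Alma, ...)
// set ID_LIKE to a list containing "rhel". Both must be checked.
bool is_rhel_family(const std::string& id, const std::string& id_like) {
    if (id == "rhel") {
        return true; // RHEL proper
    }
    std::istringstream words(id_like); // ID_LIKE is a space-separated list
    std::string w;
    while (words >> w) {
        if (w == "rhel") {
            return true; // RHEL-compatible clone
        }
    }
    return false;
}
```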

Fixes #16040

Closes scylladb/scylladb#16041

(cherry picked from commit 338a9492c9)
2023-11-20 19:36:22 +02:00
Marcin Maliszkiewicz
900754d377 db: view: run local materialized view mutations on a separate smp service group
When a base write triggers an MV write that needs to be sent to another
shard, it used the same smp service group, and we could end up with a
deadlock.

This fix also affects alternator's secondary indexes.

Testing was done using a not-yet-committed framework for easy alternator
performance testing: https://github.com/scylladb/scylladb/pull/13121.
I've changed the hardcoded max_nonlocal_requests config in scylla from 5000 to 500 and
then ran:

./build/release/scylla perf-alternator-workloads --workdir /tmp/scylla-workdir/ --smp 2 \
--developer-mode 1 --alternator-port 8000 --alternator-write-isolation forbid --workload write_gsi \
--duration 60 --ring-delay-ms 0 --skip-wait-for-gossip-to-settle 0 --continue-after-error true --concurrency 2000

Without the patch, when scylla is overloaded (i.e. the number of scheduled futures is close to max_nonlocal_requests), after a couple of seconds
scylla hangs, cpu usage drops to zero, and no progress is made. We can confirm we're hitting this issue by observing under gdb:

p seastar::get_smp_service_groups_semaphore(2,0)._count
$1 = 0

With the patch I wasn't able to observe the problem, even with 2x
concurrency. I was able to make the process hang with 10x concurrency
but I think it's hitting a different limit, as there wasn't any depleted
smp service group semaphore and it was also happening on non-MV loads.

Fixes https://github.com/scylladb/scylladb/issues/15844

Closes scylladb/scylladb#15845

(cherry picked from commit 020a9c931b)
2023-11-19 18:54:46 +02:00
Botond Dénes
fbb356aa88 repair/repair.cc: do_repair_ranges(): prevent stalls when skipping ranges
We have observed do_repair_ranges() receiving tens of thousands of
ranges to repair on occasion. do_repair_ranges() repairs all ranges in
parallel, with parallel_for_each(). This is normally fine, as the lambda
inside parallel_for_each() takes a semaphore and this will result in
limited concurrency.
However, in some instances, it is possible that most of these ranges are
skipped. In this case the lambda will become synchronous, only logging a
message. This can cause stalls because there are no opportunities to
yield. Solve this by adding an explicit yield.
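A simplified, non-Seastar sketch of the shape of the fix (hypothetical names): even when a range is skipped and the body does no asynchronous work, yield once per iteration so a long run of skips cannot monopolize the reactor.

```cpp
#include <cassert>
#include <functional>
#include <vector>

struct range_info { bool skip; };

// `yield` stands in for an explicit yield point where other tasks may run.
// Pre-fix, skipped ranges fell through without ever yielding, so tens of
// thousands of consecutive skips ran as one long synchronous loop.
void repair_ranges(const std::vector<range_info>& ranges,
                   const std::function<void()>& yield) {
    for (const auto& r : ranges) {
        if (!r.skip) {
            // ... the actual (asynchronous) repair work would go here ...
        }
        yield(); // explicit yield, taken even when the range is skipped
    }
}
```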

Fixes: #14330

Closes scylladb/scylladb#15879

(cherry picked from commit 90a8489809)
2023-11-08 21:10:30 +02:00
Michał Jadwiszczak
e8871c02a1 cql3:statements:describe_statement: check pointer to UDF/UDA
While looking for a specific UDF/UDA, the result of
`functions::functions::find()` needs to be filtered based on the
function's type.
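Since the lookup returns pointers to the common function base, candidates of the wrong kind have to be discarded; a minimal C++ sketch of such filtering (hypothetical types, not the actual ScyllaDB classes):

```cpp
#include <cassert>
#include <memory>
#include <vector>

struct function { virtual ~function() = default; };
struct user_function : function {};  // UDF
struct user_aggregate : function {}; // UDA

// Narrow a lookup result down to entries of the requested kind; a UDA with
// the same name must not be returned when a UDF was asked for.
template <typename Kind>
std::shared_ptr<Kind> find_of_kind(const std::vector<std::shared_ptr<function>>& found) {
    for (const auto& f : found) {
        if (auto p = std::dynamic_pointer_cast<Kind>(f)) {
            return p; // correct kind
        }
    }
    return nullptr; // name exists, but not with this kind
}
```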

Fixes: #14360
(cherry picked from commit d498451cdf)
2023-11-08 20:16:41 +02:00
Pavel Emelyanov
f76ba217e7 Merge 'api: failure_detector: invoke on shard 0' from Kamil Braun
These APIs may return stale or simply incorrect data on shards
other than 0. Newer versions of Scylla are better at maintaining
cross-shard consistency, but we need a simple fix that can be easily and
without risk be backported to older versions; this is the fix.

Add a simple test to check that the `failure_detector/endpoints`
API returns nonzero generation.

Fixes: scylladb/scylladb#15816

Closes scylladb/scylladb#15970

* github.com:scylladb/scylladb:
  test: rest_api: test that generation is nonzero in `failure_detector/endpoints`
  api: failure_detector: fix indentation
  api: failure_detector: invoke on shard 0

(cherry picked from commit 9443253f3d)
2023-11-07 15:12:12 +01:00
Botond Dénes
17e4d535db test/cql-pytest/nodetool.py: no_autocompaction_context: use the correct API
This `with` context is supposed to disable, then re-enable
autocompaction for the given keyspaces, but it used the wrong API:
the column_family/autocompaction API, which operates on
column families, not keyspaces. This oversight led to a silent failure
because the code didn't check the result of the request.
Both are fixed in this patch:
* switch to use `storage_service/auto_compaction/{keyspace}` endpoint
* check the result of the API calls and report errors as exceptions

Fixes: #13553

Closes #13568

(cherry picked from commit 66ee73641e)
2023-11-07 13:59:01 +02:00
Aleksandra Martyniuk
75b792e260 repair: release resources of shard_repair_task_impl
Before integration with task manager the state of one shard repair
was kept in repair_info. repair_info object was destroyed immediately
after shard repair was finished.

In an integration process repair_info's fields were moved to
shard_repair_task_impl as the two served similar purposes.
However, shard_repair_task_impl isn't immediately destroyed; it is
kept in the task manager for task_ttl seconds after it's complete.
Thus, some of repair_info's fields have their lifetime prolonged,
which delays the repair state change.

Release shard_repair_task_impl resources immediately after shard
repair is finished.

Fixes: #15505.
(cherry picked from commit 0474e150a9)

Closes #15875
2023-11-07 09:40:05 +02:00
Tomasz Grabiec
573ef87245 Merge ' tool/scylla-sstable: more flexibility in obtaining the schema' from Botond Dénes
scylla-sstable currently has two ways to obtain the schema:

    * via a `schema.cql` file.
    * load schema definition from memory (only works for system tables).

This meant that in most cases it was necessary to export the schema into CQL format and write it to a file. This is very flexible: the sstable can be inspected anywhere, it doesn't have to be on the same host it originates from. Yet in many cases the sstable is inspected on the same host it originates from. In these cases, the schema is readily available in the on-disk schema tables, and it is plain annoying to have to export it into a file just to quickly inspect an sstable.
This series solves this annoyance by providing a mechanism to load schemas from the on-disk schema tables. Furthermore, an auto-detect mechanism is provided to detect the location of these schema tables based on the path of the sstable; if that fails, the tool checks the usual locations of the scylla data dir and the scylla configuration file, and even looks for environment variables that tell the location of these. The old methods are still supported. In fact, if a schema.cql is present in the working directory of the tool, it is preferred over any other method, allowing for an easy force-override.
If the auto-detection magic fails, an error is printed to the console, advising the user to turn on debug level logging to see what went wrong.
A comprehensive test is added which checks all the different schema loading mechanisms. The documentation is also updated to reflect the changes.

This change breaks the backward compatibility of the tool's command-line API, as `--system-schema` is now just a flag; the keyspace and table names are supplied separately via the new `--keyspace` and `--table` options. I don't think this will break anybody's workflow, as this tool is still lightly used, exactly because of the annoying way the schema has to be provided. Hopefully, after this series, this will change.

Example:

```
$ ./build/dev/scylla sstable dump-data /var/lib/scylla/data/ks/tbl2-d55ba230b9a811ed9ae8495671e9e4f8/quarantine/me-1-big-Data.db
{"sstables":{"/var/lib/scylla/data/ks/tbl2-d55ba230b9a811ed9ae8495671e9e4f8/quarantine//me-1-big-Data.db":[{"key":{"token":"-3485513579396041028","raw":"000400000000","value":"0"},"clustering_elements":[{"type":"clustering-row","key":{"raw":"","value":""},"marker":{"timestamp":1677837047297728},"columns":{"v":{"is_live":true,"type":"regular","timestamp":1677837047297728,"value":"0"}}}]}]}}
```

As seen above, subdirectories like quarantine, staging, etc. are also supported.

Fixes: https://github.com/scylladb/scylladb/issues/10126

Closes #13448

* github.com:scylladb/scylladb:
  test/cql-pytest: test_tools.py: add tests for schema loading
  test/cql-pytest: add no_autocompaction_context
  docs: scylla-sstable.rst: remove accidentally added copy-pasta
  docs: scylla-sstable.rst: remove paragraph with schema limitations
  docs: scylla-sstable.rst: update schema section
  test/cql-pytest: nodetool.py: add flush_keyspace()
  tools/scylla-sstable: reform schema loading mechanism
  tools/schema_loader: add load_schema_from_schema_tables()
  db/schema_tables: expose types schema

(cherry picked from commit 952b455310)

Closes #15386
2023-11-02 17:25:18 +02:00
Beni Peled
454e5a7110 release: prepare for 5.2.10 2023-11-02 15:08:11 +00:00
Avi Kivity
9967c0bda4 Update tools/pythion3 submodule (tar file timestamps)
* tools/python3 cf7030a...6ad2e5a (1):
  > create-relocatable-package.py: fix timestamp of executable files

Fixes #13415.
2023-11-02 12:37:09 +01:00
Botond Dénes
48509c5c00 Merge '[Backport 5.2] properly update storage service after schema changes' from Benny Halevy
This is a backport of https://github.com/scylladb/scylladb/pull/14158 to branch 5.2

Closes #15872

* github.com:scylladb/scylladb:
  migration_notifier: get schema_ptr by value
  migration_manager: propagate listener notification exceptions
  storage_service: keyspace_changed: execute only on shard 0
  database: modify_keyspace_on_all_shards: execute func first on shard 0
  database: modify_keyspace_on_all_shards: call notifiers only after applying func on all shards
  database: add modify_keyspace_on_all_shards
  schema_tables: merge_keyspaces: extract_scylla_specific_keyspace_info for update_keyspace
  database: create_keyspace_on_all_shards
  database: update_keyspace_on_all_shards
  database: drop_keyspace_on_all_shards
2023-10-31 10:27:08 +02:00
Botond Dénes
d606e9bfa2 Merge '[branch-5.2] Enable incremental compaction on off-strategy' from Raphael "Raph" Carvalho
Off-strategy suffers from a 100% space overhead, as it adopted
a sort of all-or-nothing approach, meaning all input sstables
living in the maintenance set are kept alive until they're all
reshaped according to the strategy criteria.

Input sstables in off-strategy are very likely to be mostly disjoint,
so it can greatly benefit from incremental compaction.

The incremental compaction approach is not only good for
decreasing disk usage, but also memory usage (as metadata of
input and output live in memory), and file desc count, which
takes memory away from the OS.

Turns out that this approach also greatly simplifies the
off-strategy impl in the compaction manager, as it no longer has
to maintain new unused sstables and mark them for
deletion on failure, and also unlink intermediary sstables
used between reshape rounds.

Fixes https://github.com/scylladb/scylladb/issues/14992.

Backport notes: relatively easy to backport, had to include
**replica: Make compaction_group responsible for deleting off-strategy compaction input**
and
**compaction/leveled_compaction_strategy: ideal_level_for_input: special case max_sstable_size==0**

Closes #15793

* github.com:scylladb/scylladb:
  test: Verify that off-strategy can do incremental compaction
  compaction/leveled_compaction_strategy: ideal_level_for_input: special case max_sstable_size==0
  compaction: Clear pending_replacement list when tombstone GC is disabled
  compaction: Enable incremental compaction on off-strategy
  compaction: Extend reshape type to allow for incremental compaction
  compaction: Move reshape_compaction in the source
  compaction: Enable incremental compaction only if replacer callback is engaged
  replica: Make compaction_group responsible for deleting off-strategy compaction input
2023-10-30 12:00:54 +02:00
Benny Halevy
cd7abb3833 migration_notifier: get schema_ptr by value
To prevent use-after-free as seen in
https://github.com/scylladb/scylladb/issues/15097
where a temp schema_ptr retrieved from a global_schema_ptr
got destroyed when the notification function yielded.

Capturing the schema_ptr on the coroutine frame
is inexpensive since it's a shared ptr, and it makes sure
that the schema remains valid throughout the coroutine's
lifetime.

\Fixes scylladb/scylladb#15097

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

\Closes #15098

(cherry picked from commit 0f54e24519)
2023-10-29 19:39:17 +02:00
Benny Halevy
8064fface9 migration_manager: propagate listener notification exceptions
1e29b07e40 claimed
to make event notification exception safe,
but swallowing the exceptions isn't safe at all,
as this might leave the node in an inconsistent state
if e.g. storage_service::keyspace_changed fails on any of the
shards.  Propagating the exception here will cause abort,
but it is better than leaving the node up, but in an
inconsistent state.

We keep notifying other listeners even if any of them failed.
Based on 1e29b07e40:
```
If one of the listeners throws an exception, we must ensure that other
listeners are still notified.
```

The decision about swallowing exceptions can't be
made in such a generic layer.
Specific notification listeners that may ignore exceptions,
like in transport/event_notifier, may decide to swallow their
local exceptions on their own (as done in this patch).

Refs #3389

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 825d617a53)
2023-10-29 19:32:55 +02:00
Benny Halevy
0cf6891c6d storage_service: keyspace_changed: execute only on shard 0
Previously all shards called `update_topology_change_info`
which in turn calls `mutate_token_metadata`, ending up
in quadratic complexity.

Now that the notifications are called after
all database shards are updated, we can apply
the changes on token metadata / effective replication map
only on shard 0 and count on replicate_to_all_cores to
propagate those changes to all other shards.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit a690f0e81f)
2023-10-29 19:27:52 +02:00
Benny Halevy
16a594d564 database: modify_keyspace_on_all_shards: execute func first on shard 0
When creating or altering a keyspace, we create a new
effective_replication_map instance.

It is more efficient to do that first on shard 0
and then on all other shards, otherwise multiple
shards might need to calculate the new e_r_m (and reach
the same result).  When the new e_r_m is "seeded" on
shard 0, other shards will find it there and clone
a local copy of it - which is more efficient.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 13dd92e618)
2023-10-29 19:22:01 +02:00
Benny Halevy
096c312821 database: modify_keyspace_on_all_shards: call notifiers only after applying func on all shards
When creating, updating, or dropping keyspaces,
first execute the database internal function to
modify the database state, and only when all shards
are updated, run the listener notifications,
to make sure they would operate when the database
shards are consistent with each other.

\Fixes #13137

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit ba15786059)
2023-10-29 19:21:34 +02:00
Benny Halevy
5c27dacad5 database: add modify_keyspace_on_all_shards
Run all keyspace create/update/drop ops
via `modify_keyspace_on_all_shards`, which
will standardize the execution on all shards
in the coming patches.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 3b8c913e61)
2023-10-29 19:16:56 +02:00
Benny Halevy
14113dc23e schema_tables: merge_keyspaces: extract_scylla_specific_keyspace_info for update_keyspace
Similar to create_keyspace_on_all_shards,
`extract_scylla_specific_keyspace_info` and
`create_keyspace_from_schema_partition` can be called
once in the upper layer, passing keyspace_metadata&
down to database::update_keyspace_on_all_shards
which now would only make the per-shard
keyspace_metadata from the reference it gets
from the schema_tables layer.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit dc9b0812e9)
2023-10-29 19:14:06 +02:00
Benny Halevy
4d5a99f3b8 database: create_keyspace_on_all_shards
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 3520c786bd)
2023-10-29 19:13:55 +02:00
Benny Halevy
ffe28b3e3f database: update_keyspace_on_all_shards
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 53a6ea8616)
2023-10-29 19:06:45 +02:00
Benny Halevy
1459306603 database: drop_keyspace_on_all_shards
Part of moving the responsibility for applying
and notifying keyspace schema changes from
schema_tables to the database so that the
database can control the order of applying the changes
across shards and when to notify its listeners.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 9d40305ef6)
2023-10-29 19:06:25 +02:00
Kefu Chai
30a4eb0ea7 sstables: writer: delegate flush() in checksummed_file_data_sink_impl
before this change, `checksummed_file_data_sink_impl` just inherits the
`data_sink_impl::flush()` from its parent class. but as a wrapper around
the underlying `_out` data_sink, this is not only an unusual design
decision in a layered design of an I/O system, but also could be
problematic. to be more specific, the typical user of `data_sink_impl`
is a `data_sink`, whose `flush()` member function is called when
the user of `data_sink` wants to ensure that the data sent to the sink
is pushed to the underlying storage / channel.

this in general works, as the typical user of `data_sink` is in turn
`output_stream`, which calls `data_sink.flush()` before closing the
`data_sink` with `data_sink.close()`. and the operating system will
eventually flush the data after application closes the corresponding
fd. to be more specific, almost none of the popular local filesystems
implement the file_operations.op, hence it's safe even if the
`output_stream` does not flush the underlying data_sink after writing
to it. this is the use case when we write to sstables stored on local
filesystem. but as explained above, if the data_sink is backed by a
network filesystem, a layered filesystem or a storage connected via
a buffered network device, then it is crucial to flush in a timely
manner, otherwise we could risk data lost if the application / machine /
network breaks when the data is considerered persisted but they are
_not_!

but the `data_sink` returned by `client::make_upload_jumbo_sink` is
a little bit different. multipart upload is used under the hood, and
we have to finalize the upload once all the parts are uploaded by
calling `close()`. but if the caller fails / chooses to close the
sink before flushing it, the upload is aborted, and the partially
uploaded parts are deleted.

the default-implemented `checksummed_file_data_sink_impl::flush()`
breaks `upload_jumbo_sink` which is the `_out` data_sink being
wrapped by `checksummed_file_data_sink_impl`. as the `flush()`
calls are short-circuited by the wrapper, the `close()` call
always aborts the upload. that's why the data and index components
just fail to upload with the S3 backend.

in this change, we just delegate the `flush()` call to the
wrapped class.
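a minimal plain-C++ sketch of the wrapper fix (hypothetical stand-ins for the data_sink classes, not the real seastar types):

```cpp
#include <cassert>

// Base sink interface with a default no-op flush().
struct data_sink_impl {
    virtual ~data_sink_impl() = default;
    virtual void flush() {} // default implementation: do nothing
};

// Stand-in for the S3 multipart-upload sink: it must observe flush()
// before close(), otherwise the upload is aborted.
struct upload_jumbo_sink : data_sink_impl {
    int flushes = 0;
    void flush() override { ++flushes; }
};

// The checksumming wrapper around an underlying sink.
struct checksummed_sink : data_sink_impl {
    data_sink_impl& out;
    explicit checksummed_sink(data_sink_impl& o) : out(o) {}
    // Pre-fix, this override was missing: the inherited no-op flush()
    // short-circuited the call and `out` never saw it.
    void flush() override { out.flush(); } // the fix: delegate to the wrapped sink
};
```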

Fixes #15079
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #15134

(cherry picked from commit d2d1141188)
2023-10-26 16:48:17 +03:00
Avi Kivity
ea198d884d cql3: grammar: reject intValue with no contents
The grammar mistakenly allows nothing to be parsed as an
intValue (itself accepted in LIMIT and similar clauses).

Easily fixed by removing the empty alternative. A unit test is
added.

Fixes #14705.

Closes #14707

(cherry picked from commit e00811caac)
2023-10-25 19:15:28 +03:00
Raphael S. Carvalho
b8c8794e14 test: Verify that off-strategy can do incremental compaction
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 17:05:33 -03:00
Benny Halevy
0c2bb5f0b3 compaction/leveled_compaction_strategy: ideal_level_for_input: special case max_sstable_size==0
Prevent div-by-zero by returning const level 1
if max_sstable_size is zero, as configured by
cleanup_incremental_compaction_test, before it's
extended to cover also offstrategy compaction.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit b1e164a241)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 17:05:33 -03:00
Raphael S. Carvalho
61316d8e88 compaction: Clear pending_replacement list when tombstone GC is disabled
pending_replacement list is used by incremental compaction to
communicate to other ongoing compactions about exhausted sstables
that must be replaced in the sstable set they keep for tombstone
GC purposes.

Reshape doesn't enable tombstone GC, so that list will not
be cleared, which prevents incremental compaction from releasing
sstables referenced by that list. It wasn't a problem until now,
when we want reshape to do incremental compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 17:05:33 -03:00
Raphael S. Carvalho
b8e2739596 compaction: Enable incremental compaction on off-strategy
Off-strategy suffers from a 100% space overhead, as it adopted
a sort of all-or-nothing approach, meaning all input sstables
living in the maintenance set are kept alive until they're all
reshaped according to the strategy criteria.

Input sstables in off-strategy are very likely to be mostly disjoint,
so it can greatly benefit from incremental compaction.

The incremental compaction approach is not only good for
decreasing disk usage, but also memory usage (as metadata of
input and output live in memory), and file desc count, which
takes memory away from the OS.

Turns out that this approach also greatly simplifies the
off-strategy impl in the compaction manager, as it no longer has
to maintain new unused sstables and mark them for
deletion on failure, and also unlink intermediary sstables
used between reshape rounds.

Fixes #14992.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 42050f13a0)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 17:05:29 -03:00
Raphael S. Carvalho
ba87dfefd1 compaction: Extend reshape type to allow for incremental compaction
That's done by inheriting from regular_compaction, which implements
incremental compaction. But reshape still implements its own
methods for creating writer and reader. One reason is that
reshape is not driven by the controller, as input sstables to it
live in the maintenance set. Another reason is customization
of things like sstable origin, etc.
stop_sstable_writer() is extended because that's used by
regular_compaction to check for the possibility of removing
exhausted sstables earlier whenever an output sstable
is sealed.
Also, incremental compaction will be unconditionally
enabled for ICS/LCS during off-strategy.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit db9ce9f35a)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 15:00:07 -03:00
Raphael S. Carvalho
6b8499f4d8 compaction: Move reshape_compaction in the source
That's in preparation to next change that will make reshape
inherit from regular compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 15:00:07 -03:00
Raphael S. Carvalho
8c8a80a03d compaction: Enable incremental compaction only if replacer callback is engaged
That's needed for enabling incremental compaction to operate, and
needed for subsequent work that enables incremental compaction
for off-strategy, which in turn uses reshape compaction type.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 15:00:07 -03:00
Raphael S. Carvalho
fdec5e62d0 replica: Make compaction_group responsible for deleting off-strategy compaction input
Compaction group is responsible for deleting SSTables of "in-strategy"
compactions, i.e. regular, major, cleanup, etc.

Both in-strategy and off-strategy compaction have their completion
handled using the same compaction group interface, which is
compaction_group::table_state::on_compaction_completion(...,
				sstables::offstrategy offstrategy)

So it's important to bring symmetry there, by moving the responsibility
of deleting off-strategy input from the manager to the group.

Another important advantage is that off-strategy deletion is now throttled
and gated, allowing for better control, e.g. table waiting for deletion
on shutdown.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13432

(cherry picked from commit 457c772c9c)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-10-22 15:00:06 -03:00
Raphael S. Carvalho
6798f9676f Resurrect optimization to avoid bloom filter checks during compaction
Commit 8c4b5e4 introduced an optimization which only
calculates the max purgeable timestamp when a tombstone satisfies the
grace period.

Commit 'repair: Get rid of the gc_grace_seconds' inverted the order,
probably under the assumption that getting the grace period can be
more expensive than calculating max purgeable, as repair-mode GC
will look up into history data in order to calculate gc_before.

This caused a significant regression on tombstone heavy compactions,
where most of tombstones are still newer than grace period.
A compaction which used to take 5s, now takes 35s. 7x slower.

The reason is simple: the calculation of max purgeable now happens
for every single tombstone (once for each key), even the ones that
cannot be GC'ed yet. And each calculation has to iterate through
(i.e. check the bloom filter of) every single sstable that doesn't
participate in compaction.

Flame graph makes it very clear that bloom filter is a heavy path
without the optimization:
    45.64%    45.64%  sstable_compact  sstable_compaction_test_g
        [.] utils::filter::bloom_filter::is_present

With its resurrection, the problem is gone.

This scenario can easily happen, e.g. after a deletion burst, and
tombstones becoming only GC'able after they reach upper tiers in
the LSM tree.

Before this patch, a compaction can be estimated to have this # of
filter checks:
(# of keys containing *any* tombstone) * (# of uncompacting sstable
runs[1])

[1] It's the # of *runs*, as each key tends to overlap with only one
fragment of each run.

After this patch, the estimation becomes:
(# of keys containing a GC'able tombstone) * (# of uncompacting
runs).

With repair mode for tombstone GC, the assumption that retrieving
gc_before is more expensive than calculating max purgeable
is kept. We can revisit it later. But in the default mode, which
is the "timeout" (i.e. gc_grace_seconds) one, we still benefit
from the optimization of deferring the calculation until
needed.
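The deferred calculation described above can be sketched as a simplified model (illustrative Python with hypothetical names, not Scylla's actual C++ API): the cheap grace-period check is done first, and the expensive bloom-filter pass over uncompacting runs only happens for tombstones that are actually GC-able.

```python
def purgeable_tombstones(tombstones, gc_before, uncompacting_runs):
    """Return GC-able tombstones and count the bloom-filter checks performed."""
    checks = 0
    purgeable = []
    for ts in tombstones:
        # Cheap check first: skip tombstones still within gc_grace_seconds.
        if ts["deletion_time"] >= gc_before:
            continue
        # Expensive check: consult the bloom filter of every uncompacting run.
        checks += uncompacting_runs
        purgeable.append(ts)
    return purgeable, checks

# 100 tombstoned keys, only 3 old enough to GC, 10 uncompacting runs:
tombstones = [{"deletion_time": t} for t in range(100)]
_, checks = purgeable_tombstones(tombstones, gc_before=3, uncompacting_runs=10)
# With the optimization: 3 * 10 = 30 checks instead of 100 * 10 = 1000.
```

This mirrors the estimation formulas above: the multiplier drops from the number of keys containing *any* tombstone to the number of keys containing a GC-able one.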

Cherry picked from commit 38b226f997

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Fixes #14091.

Closes #13908

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #15744
2023-10-20 09:34:53 +03:00
Botond Dénes
2642f32c38 Merge '[5.2 backport] doc: remove recommended image upgrade with OS from previous releases' from Anna Stuchlik
This is a backport of PR  https://github.com/scylladb/scylladb/pull/15740.

This commit removes the information about the recommended way of upgrading ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade procedure is not supported (it was implemented, but then reverted).

The scope of this commit:

- Remove the information from the 5.0-to-5.1 upgrade guide and replace it with general info.
- Remove the information from the 4.6-to-5.0 upgrade guide and replace it with general info.
- Remove the information from the 5.x.y-to-5.x.z upgrade guide and replace it with general info.
- Remove the following files as no longer necessary (they were only created to incorporate the (invalid) information about image upgrade into the upgrade guides):
    /upgrade/_common/upgrade-image-opensource.rst
    /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
    /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
    /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst

Closes #15768

* github.com:scylladb/scylladb:
  doc: remove wrong image upgrade info (5.x.y-to-5.x.y)
  doc: remove wrong image upgrade info (4.6-to-5.0)
  doc: remove wrong image upgrade info (5.0-to-5.1)
2023-10-19 12:30:55 +03:00
Anna Stuchlik
fcbcf1eafd doc: remove wrong image upgrade info (5.x.y-to-5.x.y)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 5.x.y-to-5.x.y upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733

In addition, the following files are removed as no longer
necessary (they were only created to incorporate the (invalid)
information about image upgrade into the upgrade guides):

/upgrade/_common/upgrade-image-opensource.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst

(cherry picked from commit dd1207cabb)
2023-10-19 08:47:25 +02:00
Anna Stuchlik
3a14fd31d0 doc: remove wrong image upgrade info (4.6-to-5.0)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 4.6-to-5.0 upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733

(cherry picked from commit 526d543b95)
2023-10-19 08:41:24 +02:00
Anna Stuchlik
c7b6152a81 doc: remove wrong image upgrade info (5.0-to-5.1)
This commit removes the invalid information about
the recommended way of upgrading ScyllaDB
images (by updating ScyllaDB and OS packages
in one step) from the 5.0-to-5.1 upgrade guide.
This upgrade procedure is not supported (it was
implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733

(cherry picked from commit 9852130c5b)
2023-10-19 08:40:27 +02:00
Asias He
ac45d8d092 repair: Use the updated estimated_partitions to create writer
The estimated_partitions value is computed after the repair_meta is created.

Previously, the default estimated_partitions was used to create the
writer, which is not correct.

To fix, use the updated estimated_partitions.

Reported by Petr Gusev

Closes #14179
Fixes #15748

(cherry picked from commit 4592bbe182)
2023-10-18 13:58:28 +03:00
Anna Stuchlik
d319c2a83f doc: remove recommended image upgrade with OS
This commit removes the information about
the recommended way of upgrading ScyllaDB
images - by updating ScyllaDB and OS packages
in one step.
This upgrade procedure is not supported
(it was implemented, but then reverted).

The scope of this commit:
- Remove the information from the 5.1-to.-5.2
  upgrade guide and replace with general info.
- Remove the information from the Image Upgrade
  page.
- Remove outdated info (about previous releases)
  from the Image Upgrade page.
- Rename "AMI Upgrade" as "Image Upgrade"
  in the page tree.

Refs: https://github.com/scylladb/scylladb/issues/15733
(cherry picked from commit f6767f6d6e)

Closes #15754
2023-10-18 13:57:08 +03:00
Nadav Har'El
cb7e7f15ac Cherry-pick Seastar patch
Backported Seastar commit 4f4e84bb2cec5f11b4742396da7fc40dbb3f162f:

  > sstring: refactor to_sstring() using fmt::format_to()

Refs https://github.com/scylladb/scylladb/issues/15127

Closes #15663
2023-10-09 10:02:21 +03:00
Raphael S. Carvalho
00d431bd20 reader_concurrency_semaphore: Fix stop() in face of evictable reads becoming inactive
Scylla can crash due to a complicated interaction of service level drop,
evictable readers, and the inactive read registration path.

1) a service level drop invokes stop of the reader concurrency semaphore,
which will wait for in-flight requests

2) it turns out stop first closes the gate used for closing readers that
will become inactive.

3) it proceeds to wait for in-flight reads by closing the reader permit gate.

4) one of the evictable reads takes the inactive read registration path, and
finds the gate for closing readers closed.

5) the flat mutation reader is destroyed, but finds the underlying reader was
not closed gracefully and triggers the abort.

By closing the permit gate first, evictable readers becoming inactive will
be able to properly close the underlying reader, thereby avoiding the
crash.

Fixes #15534.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#15535

(cherry picked from commit 914cbc11cf)
2023-09-29 09:24:37 +03:00
Botond Dénes
ca8723a6fd Merge 'gossiper: add get_unreachable_members_synchronized and use over api' from Benny Halevy
Modeled after get_live_members_synchronized,
get_unreachable_members_synchronized calls
replicate_live_endpoints_on_change to synchronize
the state of unreachable_members on all shards.

Fixes #12261
Fixes #15088

Also, add a rest_api unit test for those APIs

Closes #15093

* github.com:scylladb/scylladb:
  test: rest_api: add test_gossiper
  gossiper: add get_unreachable_members_synchronized

(cherry picked from commit 57deeb5d39)

Backport note: `gossiper::lock_endpoint_update_semaphore` helper
function was missing, replaced with
`get_units(g._endpoint_update_semaphore, 1)`
2023-09-27 15:09:32 +02:00
Beni Peled
5709d00439 release: prepare for 5.2.9 2023-09-20 12:34:43 +03:00
Konstantin Osipov
7202634789 raft: do not update raft address map with obsolete gossip data
It is possible that a gossip message from an old node is delivered
out of order during a slow boot and the raft address map overwrites
a new IP address with an obsolete one, from the previous incarnation
of this node. Take into account the node restart counter when updating
the address map.
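The restart-counter guard can be sketched as a simplified model (illustrative Python; the class and method names are hypothetical, not Scylla's raft address map API): an update carrying an older generation than the one already recorded is rejected, so an out-of-order gossip message from a previous incarnation cannot overwrite the newer IP.

```python
class AddressMap:
    def __init__(self):
        self._entries = {}  # host_id -> (generation, ip)

    def update(self, host_id, generation, ip):
        current = self._entries.get(host_id)
        if current is not None and generation < current[0]:
            return False  # obsolete gossip data: keep the newer mapping
        self._entries[host_id] = (generation, ip)
        return True

    def ip(self, host_id):
        return self._entries[host_id][1]

m = AddressMap()
m.update("node1", generation=2, ip="10.0.0.2")   # current incarnation
m.update("node1", generation=1, ip="10.0.0.1")   # stale message, rejected
```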

A test case requires a parameterized error injection, which
we don't support yet. Will be added as a separate commit.

Fixes #14257
Refs #14357

Closes #14329

(cherry picked from commit b9c2b326bc)

Backport note: replaced `gms::generation_type` with `int64_t` because
the branch is missing the refactor which introduced `generation_type`
(7f04d8231d)
2023-09-19 11:12:38 +02:00
Avi Kivity
34e0afb18a Merge "auth: do not grant permissions to creator without actually creating" from Wojciech Mitros
Currently, when creating the table, permissions may be mistakenly
granted to the user even if the table is already existing. This
can happen in two cases:

The query has a IF NOT EXISTS clause - as a result no exception
is thrown after encountering the existing table, and the permission
granting is not prevented.
The query is handled by a non-zero shard - as a result we accept
the query with a bounce_to_shard result_message, again without
preventing the granting of permissions.
These two cases are now avoided by checking the result_message
generated when handling the query - now we only grant permissions
when the query resulted in a schema_change message.

Additionally, a test is added that reproduces both of the mentioned
cases.
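The decision rule can be sketched as a tiny model (illustrative Python; the kind strings are stand-ins for the actual result_message types): only an actual CREATE produces a schema change, so only that outcome grants creator permissions.

```python
def should_grant_creator_permissions(result_message_kind):
    # Only an actual CREATE produces a schema_change result; IF NOT EXISTS
    # no-ops ("void") and cross-shard bounces ("bounce_to_shard") must not
    # grant anything.
    return result_message_kind == "schema_change"

should_grant_creator_permissions("schema_change")    # table was created: grant
should_grant_creator_permissions("void")             # IF NOT EXISTS no-op: don't
should_grant_creator_permissions("bounce_to_shard")  # re-routed query: don't
```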

CVE-2023-33972

Fixes #15467.

* 'no-grant-on-no-create' of github.com:scylladb/scylladb-ghsa-ww5v-p45p-3vhq:
  auth: do not grant permissions to creator without actually creating
  transport: add is_schema_change() method to result_message

(cherry picked from commit ab6988c52f)
2023-09-19 01:47:27 +03:00
Anna Stuchlik
99e906499d doc: fix internal links
Fixes https://github.com/scylladb/scylladb/issues/14490

This commit fixes multiple links that were broken
after the documentation was published (but not in
the preview) due to incorrect syntax.
I've fixed the syntax to use the :docs: and :ref:
directives for pages and sections, respectively.

Closes #14664

(cherry picked from commit a93fd2b162)
2023-09-18 09:32:12 +03:00
Anna Stuchlik
b8ff392e8b doc: add info - support for FIPS-compliant systems
This commit adds the information that ScyllaDB Enterprise
supports FIPS-compliant systems in versions
2023.1.1 and later.
The information is excluded from OSS docs with
the "only" directive, because the support was not
added in OSS.

This commit must be backported to branch-5.2 so that
it appears on version 2023.1 in the Enterprise docs.

Closes #15415

(cherry picked from commit fb635dccaa)
2023-09-18 09:17:59 +03:00
Raphael S. Carvalho
a65e5120ab compaction: base compaction throughput on amount of data read
Today, we base compaction throughput on the amount of data written,
but it should be based on the amount of input data compacted
instead, to show the amount of data compaction had to process
during its execution.

A good example is a compaction which expires 99% of its data: today,
throughput would be calculated on the 1% written, which
misleads the reader into thinking that the compaction was terribly
slow.
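The difference can be shown with a quick sketch (illustrative Python, not Scylla's metrics code): for a compaction that reads 100 GiB but writes only 1 GiB in 100 seconds, the input-based figure reflects the real work done.

```python
def compaction_throughput_mb_s(data_bytes, duration_s):
    # Throughput based on the amount of data processed over the duration.
    return data_bytes / duration_s / (1024 * 1024)

input_bytes = 100 * 1024**3   # 100 GiB read
output_bytes = 1 * 1024**3    # 1 GiB written (99% of data expired)
duration_s = 100

old = compaction_throughput_mb_s(output_bytes, duration_s)  # ~10 MB/s, looks slow
new = compaction_throughput_mb_s(input_bytes, duration_s)   # 1024 MB/s, the real rate
```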

Fixes #14533.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14615

(cherry picked from commit 3b1829f0d8)
2023-09-14 21:30:22 +03:00
Jan Ciolek
cd9458eeb1 cql.g: make the parser reject INSERT JSON without a JSON value
We allow inserting column values using a JSON value, eg:
```cql
INSERT INTO mytable JSON '{ "\"myKey\"": 0, "value": 0}';
```

When no JSON value is specified, the query should be rejected.

Scylla used to crash in such cases. A recent change fixed the crash
(https://github.com/scylladb/scylladb/pull/14706); it now fails
on unwrapping an uninitialized value. But really, the query should
be rejected at the parsing stage, so let's fix the grammar so that
it doesn't allow JSON queries without JSON values.

A unit test is added to prevent regressions.

Refs: https://github.com/scylladb/scylladb/pull/14707
Fixes: https://github.com/scylladb/scylladb/issues/14709

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #14785

(cherry picked from commit cbc97b41d4)
2023-09-14 21:07:21 +03:00
Nadav Har'El
e917b874f9 test/alternator: fix flaky test test_ttl_expiration_gsi_lsi
The Alternator test test_ttl.py::test_ttl_expiration_gsi_lsi was flaky.
The test incorrectly assumes that when we write an already expired item,
it will be visible for a short time until being deleted by the TTL thread.
But this doesn't need to be true: if the test is slow enough, it may go
look for the item after it was already deleted!
So we fix this test by splitting it into two parts - in the first part
we write a non-expiring item, and notice it eventually appears in the
GSI, LSI, and base-table. Then we write the same item again, with an
expiration time - and now it should eventually disappear from the GSI,
LSI and base-table.

This patch also fixes a small bug which prevented this test from running
on DynamoDB.

Fixes #14495

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #14496

(cherry picked from commit 599636b307)
2023-09-14 20:43:51 +03:00
Pavel Emelyanov
a27c391cba Update seastar submodule
* seastar 85147cfd...872e0bc6 (3):
  > rpc: Abort server connection streams on stop
  > rpc: Do not register stream to dying parent
  > rpc: Fix client-side stream registration race

refs: #13100

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-09-06 12:33:30 +03:00
Beni Peled
455ab99b6c release: prepare for 5.2.8 2023-08-31 22:02:06 +03:00
Michał Chojnowski
adcf296bcf reader_concurrency_semaphore: fix a deadlock between stop() and execution_loop()
Permits added to `_ready_list` remain there until
executed by `execution_loop()`.
But `execution_loop()` exits when `_stopped == true`,
even though nothing prevents new permits from being added
to `_ready_list` after `stop()` sets `_stopped = true`.

Thus, if there are reads concurrent with `stop()`,
it's possible for a permit to be added to `_ready_list`
after `execution_loop()` has already quit. Such a permit will
never be destroyed, and `stop()` will forever block on
`_permit_gate.close()`.

A natural solution is to dismiss `execution_loop()` only after
it's certain that `_ready_list` won't receive any new permits.
This is guaranteed by `_permit_gate.close()`. After this call completes,
it is certain that no permits *exist*.

After this patch, `execution_loop()` no longer looks at `_stopped`.
It only exits when `_ready_list_cv` breaks, and this is triggered
by `stop()` right after `_permit_gate.close()`.

Fixes #15198

Closes #15199

(cherry picked from commit 2000a09859)
2023-08-31 08:13:09 +03:00
Calle Wilund
198297a08a generic_server: Handle TLS error codes indicating broken pipe
Fixes  #14625

In broken pipe detection, handle also TLS error codes.

Requires https://github.com/scylladb/seastar/pull/1729

Closes #14626

(cherry picked from commit 890f1f4ad3)
2023-08-29 15:38:21 +03:00
Botond Dénes
9a9b5b691d Update seastar submodule
* seastar 534cb38c...85147cfd (1):
  > tls: Export error_category instance used by tls + some common error codes

Refs: #14625
2023-08-29 15:37:24 +03:00
Alejo Sanchez
610b682cf4 gms, service: replicate live endpoints on shard 0
Call replicate_live_endpoints on shard 0 to copy from shard 0 to the rest
of the shards, and get the list of live members from shard 0.

Move the lock to the callers.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #13240

(cherry picked from commit da00052ad8)
2023-08-29 12:28:00 +02:00
Kamil Braun
05f4640360 Merge 'api: gossiper: get alive nodes after reaching current shard 0 version' from Alecco
Add an API call to wait for all shards to reach the current shard 0
gossiper version. Throws when timeout is reached.

Closes #12540

* github.com:scylladb/scylladb:
  api: gossiper: fix alive nodes
  gms, service: lock live endpoint copy
  gms, service: live endpoint copy method

(cherry picked from commit b919373cce)
2023-08-29 12:27:52 +02:00
Kefu Chai
8ed58c7dca sstable/writer: log sstable name and pk when capping ldt
when the local_deletion_time is too large and beyond the
epoch time of INT32_MAX, we cap it to INT32_MAX - 1.
this is a signal of bad configuration or a bug in scylla.
so let's add more information in the logging message to
help track back to the source of the problem.
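The capping logic can be sketched as a simplified model (illustrative Python with hypothetical names; the real code lives in the sstable writer): values at or beyond INT32_MAX are capped to INT32_MAX - 1, and the log line now carries the sstable name and partition key.

```python
INT32_MAX = 2**31 - 1

def cap_local_deletion_time(ldt_seconds, sstable_name, partition_key, log=print):
    # Cap an out-of-range local_deletion_time and log enough context
    # (sstable name and partition key) to track down the source of the
    # bad value, which signals misconfiguration or a bug.
    if ldt_seconds >= INT32_MAX:
        log(f"capping local_deletion_time {ldt_seconds} in sstable "
            f"{sstable_name}, partition key {partition_key}")
        return INT32_MAX - 1
    return ldt_seconds
```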

Fixes #15015
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 9c24be05c3)

Closes #15150
2023-08-25 10:13:19 +03:00
Petr Gusev
a83c0a8bbc test_secondary_index_collections: change insert/create index order
Secondary index creation is asynchronous, meaning it
takes time for existing data to be reflected within
the index. However, new data added after the
index is created should appear in it immediately.

The test consisted of two parts. The first created
a series of indexes for one table, added
test data to the table, and then ran a series of checks.
In the second part, several new indexes were added to
the same table, and checks were made to make sure that
already existing data would appear in them. This
last part was flaky.

The patch just moves the index creation statements
from the second part to the first.

Fixes: #14076

Closes #14090

(cherry picked from commit 0415ac3d5f)

Closes #15101
2023-08-24 14:09:08 +03:00
Botond Dénes
df71753498 Merge '[Backport 5.2] distributed_loader: process_sstable_dir: do not verify snapshots' from Benny Halevy
This mini-series backports the fix for #12010 along with low-risk patches it depends on.

Fixes: #12010

Closes #15137

* github.com:scylladb/scylladb:
  distributed_loader: process_sstable_dir: do not verify snapshots
  utils/directories: verify_owner_and_mode: add recursive flag
  utils: Restore indentation after previous patch
  utils: Coroutinize verify_owner_and_mode()
2023-08-23 15:50:29 +03:00
Benny Halevy
6588ecd66f distributed_loader: process_sstable_dir: do not verify snapshots
Skip over verification of owner and mode of the snapshots
sub-directory as this might race with scylla-manager
trying to delete old snapshots concurrently.

Fixes #12010

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 845b6f901b)
2023-08-23 13:19:55 +03:00
Benny Halevy
03640cc15b utils/directories: verify_owner_and_mode: add recursive flag
Allow the caller to verify only the top level directories
so that sub-directories can be verified selectively
(in particular, skip validation of snapshots).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 60862c63dd)
2023-08-23 13:19:36 +03:00
Pavel Emelyanov
6d4d576460 utils: Restore indentation after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 2eb88945ea)
2023-08-23 13:19:36 +03:00
Pavel Emelyanov
96aca473b4 utils: Coroutinize verify_owner_and_mode()
There's a helper verification_error() that prints a warning and returns
an exceptional future. It is converted into a void function that throws.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 4ebb812df0)
2023-08-23 13:19:30 +03:00
Aleksandra Martyniuk
29e6dc8c1b compaction: do not swallow compaction_stopped_exception for reshape
The loop in shard_reshaping_compaction_task_impl::run relies on whether
sstables::compaction_stopped_exception is thrown from run_custom_job.
The exception is swallowed for each type of compaction
in compaction_manager::perform_task.

Rethrow the exception in perform_task for reshape compaction.

Fixes: #15058.

(cherry picked from commit e0ce711e4f)

Closes #15122
2023-08-23 12:11:58 +03:00
Vlad Zolotarov
9a414d440d scylla_raid_setup: make --online-discard argument useful
This argument had been dead since its introduction: 'discard' was
always configured regardless of its value.
This patch makes the argument actually control the configuration.

Fixes #14963

Closes #14964

(cherry picked from commit e13a2b687d)
2023-08-22 10:40:37 +03:00
Anna Mikhlin
e0ebc95025 release: prepare for 5.2.7 2023-08-21 14:44:56 +03:00
Botond Dénes
b7ab42b61c Merge 'Ignore no such column family in repair' from Aleksandra Martyniuk
While repair requested by user is performed, some tables
may be dropped. When the repair proceeds to these tables,
it should skip them and continue with others.

When no_such_column_family is thrown during user requested
repair, it is logged and swallowed. Then the repair continues with
the remaining tables.

Fixes: #13045

Closes #13068

* github.com:scylladb/scylladb:
  repair: fix indentation
  repair: continue user requested repair if no_such_column_family is thrown
  repair: add find_column_family_if_exists function

(cherry picked from commit 9859bae54f)
2023-08-20 19:49:21 +03:00
Botond Dénes
098baaef48 Merge 'cql: add missing functions for the COUNTER column type' from Nadav Har'El
We have had support for COUNTER columns for quite some time now, but some functionality was left unimplemented - various internal and CQL functions resulted in "unimplemented" messages when used, and the goal of this series is to fix those issues. The primary goal was to add the missing support for CASTing counters to other types in CQL (issue #14501), but we also add the missing CQL  `counterasblob()` and `blobascounter()` functions (issue #14742).

As usual, the series includes extensive functional tests for these features, and one pre-existing test for CAST that used to fail now begins to pass.

Fixes #14501
Fixes #14742

Closes #14745

* github.com:scylladb/scylladb:
  test/cql-pytest: test confirming that casting to counter doesn't work
  cql: support casting of counter to other types
  cql: implement missing counterasblob() and blobascounter() functions
  cql: implement missing type functions for "counters" type

(cherry picked from commit a637ddd09c)
2023-08-13 14:53:48 +03:00
Nadav Har'El
e11561ef65 cql-pytest: translate Cassandra's tests for compact tables
This is a translation of Cassandra's CQL unit test source file
validation/operations/CompactStorageTest.java into our cql-pytest
framework.

This very large test file includes 86 tests for various types of
operations and corner cases of WITH COMPACT STORAGE tables.

All 86 tests pass on Cassandra (except one using a deprecated feature
that needs to be specially enabled). 30 of the tests fail on Scylla,
reproducing 7 already-known Scylla issues and 7 previously-unknown issues:

Already known issues:

Refs #3882: Support "ALTER TABLE DROP COMPACT STORAGE"
Refs #4244: Add support for mixing token, multi- and single-column
            restrictions
Refs #5361: LIMIT doesn't work when using GROUP BY
Refs #5362: LIMIT is not doing it right when using GROUP BY
Refs #5363: PER PARTITION LIMIT doesn't work right when using GROUP BY
Refs #7735: CQL parser missing support for Cassandra 3.10's new "+=" syntax
Refs #8627: Cleanly reject updates with indexed values where value > 64k

New issues:

Refs #12471: Range deletions on COMPACT STORAGE is not supported
Refs #12474: DELETE prints misleading error message suggesting
             ALLOW FILTERING would work
Refs #12477: Combination of COUNT with GROUP BY is different from
             Cassandra in case of no matches
Refs #12479: SELECT DISTINCT should refuse GROUP BY with clustering column
Refs #12526: Support filtering on COMPACT tables
Refs #12749: Unsupported empty clustering key in COMPACT table
Refs #12815: Hidden column "value" in compact table isn't completely hidden

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12816

(cherry picked from commit 328cdb2124)
2023-08-13 14:44:19 +03:00
Nadav Har'El
e03c21a83b cql-pytest: translate Cassandra's tests for CAST operations
This is a translation of Cassandra's CQL unit test source file
functions/CastFctsTest.java into our cql-pytest framework.

There are 13 tests, 9 of them currently xfail.

The failures are caused by one recently-discovered issue:

Refs #14501: Cannot Cast Counter To Double

and by three previously unknown or undocumented issues:

Refs #14508: SELECT CAST column names should match Cassandra's
Refs #14518: CAST from timestamp to string not same as Cassandra on zero
             milliseconds
Refs #14522: Support CAST function not only in SELECT

Curiously, the careful translation of this test also caused me to
find a bug in Cassandra https://issues.apache.org/jira/browse/CASSANDRA-18647
which the test in Java missed because it made the same mistake as the
implementation.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #14528

(cherry picked from commit f08bc83cb2)
2023-08-13 14:41:36 +03:00
Nadav Har'El
79b5befe65 test/cql-pytest: add tests for data casts and inf in sums
This patch adds tests to reproduce issue #13551. The issue, discovered
by a dtest (cql_cast_test.py), claimed that either cast() or sum(cast())
from varint type broke. So we add two tests in cql-pytest:

1. A new test file, test_cast_data.py, for testing data casts (a
   CAST (...) as ... in a SELECT), starting with testing casts from
   varint to other types.

   The test uncovers a lot of interesting cases (it is heavily
   commented to explain these cases) but nothing there is wrong
   and all tests pass on Scylla.

2. An xfailing test for sum() aggregate of +Inf and -Inf. It turns out
   that this caused #13551. In Cassandra and older Scylla, the sum
   returned a NaN. In Scylla today, it generates a misleading
   error message.
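The expected behaviour from item 2 follows IEEE-754 addition, as a quick sketch shows: summing +Inf and -Inf yields NaN (what Cassandra and older Scylla return), not an error.

```python
import math

values = [float("inf"), float("-inf")]
total = sum(values)  # inf + (-inf) is NaN under IEEE-754 addition
```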

As usual, the tests were run on both Cassandra (4.1.1) and Scylla.

Refs #13551.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 78555ba7f1)
2023-08-13 14:40:08 +03:00
Petr Gusev
aca9e41a44 topology.cc: remove_endpoint: _dc_racks removal fix
The eps reference was reused to manipulate
the racks dictionary. This resulted in
assigning a set of nodes from the racks
dictionary to an element of the _dc_endpoints dictionary.

This is a backport of bcb1d7c to branch-5.2.

Refs: #14184

Closes #14893
2023-08-11 14:29:37 +03:00
Pavel Emelyanov
ff22807ed2 Update seastar submodule
* seastar 29a0e645...534cb38c (1):
  > rpc: Abort connection if send_entry() fails

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-08-09 11:30:57 +03:00
Botond Dénes
bcb8f6a8dd Merge 'semaphore mismatch: don't throw an error if both semaphores belong to user' from Michał Jadwiszczak
If a semaphore mismatch occurs, check whether both semaphores belong
to the user. If so, log a warning, bump the `querier_cache_scheduling_group_mismatches` stat, and drop the cached reader instead of throwing an error.

Until now, the semaphore mismatch was only checked in multi-partition queries. The PR pushes the check into `querier_cache` and performs it in all `lookup_*_querier` methods.

The mismatch can happen if the user's scheduling group changed during
a query. We don't want to throw an error then, but rather drop and reset
the cached reader.

This patch doesn't solve the problem of semaphores mismatched due to changes in service levels/scheduling groups; it only mitigates it.
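The mitigation can be sketched as a simplified model (illustrative Python; names and return values are stand-ins for the querier_cache lookup path): a mismatch between two user semaphores drops the cached querier and bumps the stat, while any other mismatch remains an error.

```python
def lookup_cached_querier(cached_sem, current_sem, is_user_semaphore, stats):
    # Mismatch between two *user* semaphores: drop the cached querier and
    # count it, instead of throwing. Other mismatches are still errors.
    if cached_sem == current_sem:
        return "use_cached"
    if is_user_semaphore(cached_sem) and is_user_semaphore(current_sem):
        stats["scheduling_group_mismatches"] += 1
        return "drop_cached"
    raise RuntimeError("semaphore mismatch")

stats = {"scheduling_group_mismatches": 0}
is_user = lambda s: s.startswith("user")
lookup_cached_querier("user_sl1", "user_sl2", is_user, stats)  # dropped, not thrown
```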

Refers: https://github.com/scylladb/scylla-enterprise/issues/3182
Refers: https://github.com/scylladb/scylla-enterprise/issues/3050
Closes: #14770

Closes #14736

* github.com:scylladb/scylladb:
  querier_cache: add stats of scheduling group mismatches
  querier_cache: check semaphore mismatch during querier lookup
  querier_cache: add reference to `replica::database::is_user_semaphore()`
  replica:database: add method to determine if semaphore is user one

(cherry picked from commit a8feb7428d)
2023-08-09 10:20:53 +03:00
Kefu Chai
9ce3695a0d compaction_manager: prevent gc-only sstables from being compacted
before this change, there was a chance that the temporary sstables
created for collecting the GC-able data produced by a certain
compaction could be picked up by another compaction job. this
wastes CPU cycles, adds write amplification, and causes
inefficiency.

in general, these GC-only SSTables are created with the same run id
as the non-GC SSTables, but when a new sstable exhausts its input
sstable(s), we proactively replace the old main set with a new one
so that we can free up the space as soon as possible. the
GC-only SSTables are added to the new main set along with
the non-GC SSTables, but since the former have a good chance of
overlapping the latter, they are assigned
different run ids. however, we fail to register them with the
`compaction_manager` when replacing the main sstable set.
that's why future compactions pick them up while the compaction
which created them is not yet completed.

so, in this change,

* to prevent sstables in the transient stage from being picked
  up by regular compactions, a new interface class is introduced
  so that the sstable is always added to the registration before
  it is added to the sstable set, and removed from the registration
  after it is removed from the sstable set. the struct helps to
  consolidate the registration-related logic in a single place, and
  makes it more obvious that the timespan of an sstable in
  the registration should cover its timespan in the sstable set.
* use a different run_id for the gc sstable run, as it can
  overlap with the output sstable run. the run_id for the
  gc sstable run is created only when the gc sstable writer
  is created, because gc sstables are not always created
  for all compactions.

please note, all (indirect) callers of
`compaction_task_executor::compact_sstables()` pass a non-empty
`std::function` to this function, so there is no need to check for
emptiness before calling it; hence, in this change, the check is dropped.

Fixes #14560
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #14725

(cherry picked from commit fdf61d2f7c)

Closes #14827
2023-08-04 09:59:10 +03:00
Patryk Jędrzejczak
4cd5847761 config: add schema_commitlog_segment_size_in_mb variable
In #14668, we have decided to introduce a new scylla.yaml variable
for the schema commitlog segment size. The segment size puts a limit
on the mutation size that can be written at once, and some schema
mutation writes are much larger than average, as shown in #13864.
Therefore, increasing the schema commitlog segment size is sometimes
necessary.
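A scylla.yaml fragment using the new variable might look as follows (the value shown is purely illustrative, not the shipped default):

```yaml
# Hypothetical example: raise the schema commitlog segment size to allow
# larger schema mutations to be written in one segment.
schema_commitlog_segment_size_in_mb: 128
```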

(cherry picked from commit 5b167a4ad7)
2023-08-02 18:05:39 +02:00
Botond Dénes
2b7f1cd906 Update tools/java submodule
* tools/java 83b2168b19...f8f556d802 (1):
  > Use EstimatedHistogram in metricPercentilesAsArray

Fixes: #10089
2023-07-31 12:13:01 +03:00
Nadav Har'El
e34c62c567 Merge 'view_updating_consumer: account empty partitions memory usage' from Botond Dénes
The view updating consumer uses `_buffer_size` to decide when to flush the accumulated mutations, passing them to the actual view building code. This `_buffer_size` is incremented every time a mutation fragment is consumed. This is not exact (e.g. range tombstones are represented differently in the mutation object than in the fragment), but it is good enough. There is one flaw however: `_buffer_size` is not incremented when consuming a partition-start fragment, which is when the mutation object is created in the mutation rebuilder. This is not a big problem when partitions have many rows, but if the partitions are tiny, the error in accounting quickly becomes significant. If the partitions are empty, `_buffer_size` is not bumped at all for them, and any number of these can accumulate in the buffer. We have recently seen this cause stalls and OOM as the buffer grew to an immense size while containing only empty and tiny partitions.
This PR fixes this by accounting for the size of the freshly created `mutation` object in `_buffer_size` after the partition-start fragment is consumed.
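The accounting fix can be sketched as a simplified model (illustrative Python; names approximate the consumer's interface): charging a base cost per partition means a stream of empty partitions still fills the buffer and triggers flushes, instead of accumulating unbounded.

```python
class ViewUpdatingConsumerModel:
    def __init__(self, threshold):
        self.threshold = threshold
        self.buffer_size = 0
        self.flushes = 0

    def consume_new_partition(self, partition_overhead):
        # The fix: account for the freshly created mutation object's size,
        # so even empty partitions contribute to the buffer size.
        self.buffer_size += partition_overhead
        self._maybe_flush()

    def consume_fragment(self, fragment_size):
        self.buffer_size += fragment_size
        self._maybe_flush()

    def _maybe_flush(self):
        if self.buffer_size >= self.threshold:
            self.flushes += 1
            self.buffer_size = 0

c = ViewUpdatingConsumerModel(threshold=1000)
for _ in range(100):          # 100 empty partitions, overhead 100 each
    c.consume_new_partition(partition_overhead=100)
# Flushes every 10 empty partitions; before the fix, flushes would be 0.
```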

Fixes: #14819

Closes #14821

* github.com:scylladb/scylladb:
  test/boost/view_build_test: add test_view_update_generator_buffering_with_empty_mutations
  db/view/view_updating_consumer: account for the size of mutations
  mutation/mutation_rebuilder*: return const mutation& from consume_new_partition()
  mutation/mutation: add memory_usage()

(cherry picked from commit 056d04954c)
2023-07-31 03:43:44 -04:00
Nadav Har'El
992c50173a Merge 'cql: fix crash on empty clustering range in LWT' from Jan Ciołek
LWT queries with empty clustering range used to cause a crash.
For example in:
```cql
UPDATE tab SET r = 9000 WHERE p = 1  AND c = 2 AND c = 2000 IF r = 3
```
The range of `c` is empty - there are no valid values.

This caused a segfault when accessing the `first` range:
```c++
op.ranges.front()
```

Cassandra rejects such queries at the preparation stage: it doesn't allow two `EQ` restrictions on the same clustering column when an IF is involved.
We reject them at runtime, which is a worse solution: the user can prepare a query with `c = ? AND c = ?` and run it, only for it to unexpectedly throw an `invalid_request_exception` when the two bound variables are different.

We could ban such queries as well, we already ban the usage of `IN` in conditional statements. The problem is that this would be a breaking change.

A better solution would be to allow empty ranges in `LWT` statements. When an empty range is detected we just wouldn't apply the change. This would be a larger change, for now let's just fix the crash.

Fixes: https://github.com/scylladb/scylladb/issues/13129

Closes #14429

* github.com:scylladb/scylladb:
  modification_statement: reject conditional statements with empty clustering key
  statements/cas_request: fix crash on empty clustering range in LWT

(cherry picked from commit 49c8c06b1b)
2023-07-31 09:14:55 +03:00
Beni Peled
58acf071bf release: prepare for 5.2.6 2023-07-30 14:19:28 +03:00
Raphael S. Carvalho
d2369fc546 cached_file: Evict unused pages that aren't linked to LRU yet
It was found that cached_file dtor can hit the following assert
after OOM

cached_file_test: utils/cached_file.hh:379: cached_file::~cached_file(): Assertion `_cache.empty()' failed.

cached_file's dtor iterates through all entries and evict those
that are linked to LRU, under the assumption that all unused
entries were linked to LRU.

That's partially correct. get_page_ptr() may fetch more than 1
page due to read ahead, but it will only call cached_page::share()
on the first page, the one that will be consumed now.

share() is responsible for automatically placing the page into
LRU once refcount drops to zero.

If the read is aborted midway, before cached_file has a chance
to hit the 2nd page (read ahead) in cache, that page will remain
there with refcount 0 and unlinked from the LRU, in the hope that
a subsequent read will bring it out of that state.

Our main user of cached_file is per-sstable index caching.
If the scenario above happens, and the sstable and its associated
cached_file is destroyed, before the 2nd page is hit, cached_file
will not be able to clear all the cache because some of the
pages are unused and not linked.

A page brought in by read ahead is now linked into the LRU so it
doesn't sit in memory indefinitely. This also allows the cached_file
dtor to clear the whole cache if some of the pages brought in
advance are never fetched.

A reproducer was added.

Fixes #14814.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14818

(cherry picked from commit 050ce9ef1d)
2023-07-28 13:56:28 +02:00
Kamil Braun
6273c4df35 test: use correct timestamp resolution in test_group0_history_clearing_old_entries
In 10c1f1dc80 I fixed
`make_group0_history_state_id_mutation` to use correct timestamp
resolution (microseconds instead of milliseconds) which was supposed to
fix the flakiness of `test_group0_history_clearing_old_entries`.

Unfortunately, the test is still flaky, although now it's failing at a
later step -- this is because I was sloppy and I didn't adjust this
second part of the test to also use microsecond resolution. The test is
counting the number of entries in the `system.group0_history` table that
are older than a certain timestamp, but it's doing the counting using
millisecond resolution, causing it to give results that are off by one
sometimes.

Fix it by using microseconds everywhere.

Fixes #14653

Closes #14670

(cherry picked from commit 9d4b3c6036)
2023-07-27 15:46:37 +02:00
Raphael S. Carvalho
752984e774 Fix stack-use-after-return in mutation source excluding staging
The new test detected a stack-use-after-return when using table's
as_mutation_source_excluding_staging() for range reads.

This doesn't really affect view updates that generate single
key reads only. So the problem was only stressed in the recently
added test. Otherwise, we'd have seen it when running dtests
(in debug mode) that stress the view update path from staging.

The problem happens because the closure was fed into
a noncopyable_function that was taken by reference. For range
reads, we defer before subsequent usage of the predicate.
For single key reads, we only defer after we have finished
using the predicate.

The fix introduces the sstable_predicate type, so there is no
need to construct a temporary object on the stack.

Fixes #14812.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14813

(cherry picked from commit 0ac43ea877)
2023-07-26 14:30:32 +03:00
Raphael S. Carvalho
986491447b table: Optimize creation of reader excluding staging for view building
View building from staging creates a reader from scratch (memtable
+ sstables - staging) for every partition, in order to calculate
the diff between new staging data and data in base sstable set,
and then pushes the result into the view replicas.

perf shows that the reader creation is very expensive:
+   12.15%    10.75%  reactor-3        scylla             [.] lexicographical_tri_compare<compound_type<(allow_prefixes)0>::iterator, compound_type<(allow_prefixes)0>::iterator, legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()(managed_bytes_basic_view<(mutable_view)0>, managed_bytes
+   10.01%     9.99%  reactor-3        scylla             [.] boost::icl::is_empty<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+    8.95%     8.94%  reactor-3        scylla             [.] legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()
+    7.29%     7.28%  reactor-3        scylla             [.] dht::ring_position_tri_compare
+    6.28%     6.27%  reactor-3        scylla             [.] dht::tri_compare
+    4.11%     3.52%  reactor-3        scylla             [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+    4.09%     4.07%  reactor-3        scylla             [.] sstables::index_consume_entry_context<sstables::index_consumer>::process_state
+    3.46%     0.93%  reactor-3        scylla             [.] sstables::sstable_run::will_introduce_overlapping
+    2.53%     2.53%  reactor-3        libstdc++.so.6     [.] std::_Rb_tree_increment
+    2.45%     2.45%  reactor-3        scylla             [.] boost::icl::non_empty::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+    2.14%     2.13%  reactor-3        scylla             [.] boost::icl::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+    2.07%     2.07%  reactor-3        scylla             [.] logalloc::region_impl::free
+    2.06%     1.91%  reactor-3        scylla             [.] sstables::index_consumer::consume_entry(sstables::parsed_partition_index_entry&&)::{lambda()#1}::operator()() const::{lambda()#1}::operator()
+    2.04%     2.04%  reactor-3        scylla             [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+    1.87%     0.00%  reactor-3        [kernel.kallsyms]  [k] entry_SYSCALL_64_after_hwframe
+    1.86%     0.00%  reactor-3        [kernel.kallsyms]  [k] do_syscall_64
+    1.39%     1.38%  reactor-3        libc.so.6          [.] __memcmp_avx2_movbe
+    1.37%     0.92%  reactor-3        scylla             [.] boost::icl::segmental::join_left<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::
+    1.34%     1.33%  reactor-3        scylla             [.] logalloc::region_impl::alloc_small
+    1.33%     1.33%  reactor-3        scylla             [.] seastar::memory::small_pool::add_more_objects
+    1.30%     0.35%  reactor-3        scylla             [.] seastar::reactor::do_run
+    1.29%     1.29%  reactor-3        scylla             [.] seastar::memory::allocate
+    1.19%     0.05%  reactor-3        libc.so.6          [.] syscall
+    1.16%     1.04%  reactor-3        scylla             [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+    1.07%     0.79%  reactor-3        scylla             [.] sstables::partitioned_sstable_set::insert

That shows some significant amount of work for inserting sstables
into the interval map and maintaining the sstable run (which sorts
fragments by first key and checks for overlapping).

The interval map is known for having issues with L0 sstables, as
it will have to be replicated almost to every single interval
stored by the map, causing terrible space and time complexity.
With enough L0 sstables, it can fall into quadratic behavior.

This overhead is fixed by not building a new fresh sstable set
when recreating the reader, but rather supplying a predicate
to sstable set that will filter out staging sstables when
creating either a single-key or range scan reader.

This may have another benefit over today's approach, which
can incorrectly consider a staging sstable as non-staging if
the staging sstable wasn't included in the current batch for
view building.

With this improvement, view building was measured to be 3x faster.

from
INFO  2023-06-16 12:36:40,014 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 963957ms = 50kB/s

to
INFO  2023-06-16 14:47:12,129 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 319899ms = 150kB/s

Refs #14089.
Fixes #14244.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 1d8cb32a5d)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14764
2023-07-20 16:46:15 +03:00
Takuya ASADA
a05bb26cd6 scylla_fstrim_setup: start scylla-fstrim.timer on setup
Currently, scylla_fstrim_setup does not start scylla-fstrim.timer and
only enables it, so the timer starts only after a reboot.
This is incorrect behavior; we now start it during setup.

Also, unmask is unnecessary for enabling the timer.

Fixes #14249

Closes #14252

(cherry picked from commit c70a9cbffe)

Closes #14421
2023-07-18 16:05:09 +03:00
Michał Chojnowski
41aef6dc96 partition_snapshot_reader.hh: fix iterator invalidation in do_refresh_state
do_refresh_state() keeps iterators to rows_entry in a vector.
This vector might be resized during the procedure, triggering
memory reclaim and invalidating the iterators, which can cause
arbitrarily long loops and/or a segmentation fault during make_heap().
To fix this, do_refresh_state has to always be called from the allocating
section.

Additionally, it turns out that the first do_refresh_state is useless,
because reset_state() doesn't set _change_mark. This causes do_refresh_state
to be needlessly repeated during a next_row() or next_range_tombstone() which
happens immediately after it. Therefore this patch moves the _change_mark
assignment from maybe_refresh_state to do_refresh_state, so that the change mark
is properly set even after the first refresh.

Fixes #14696

Closes #14697
2023-07-17 14:20:37 +02:00
Botond Dénes
aa5e904c40 repair: Release permit earlier when the repair_reader is done
Consider

- 10 repair instances take all the 10 _streaming_concurrency_sem

- repair readers are done but the permits are not released since they
  are waiting for view update _registration_sem

- view updates trying to take the _streaming_concurrency_sem to make
  progress of view update so it could release _registration_sem, but it
  could not take _streaming_concurrency_sem since the 10 repair
  instances have taken them

- deadlock happens

Note, when the readers are done, i.e., reaching EOS, the repair reader
replaces the underlying (evictable) reader with an empty reader. The
empty reader is not evictable, so the resources cannot be forcibly
released.

To fix, release the permits manually as soon as the repair readers are
done even if the repair job is waiting for _registration_sem.

Fixes #14676

Closes #14677

(cherry picked from commit 1b577e0414)
2023-07-14 18:18:43 +03:00
Marcin Maliszkiewicz
eff2fe79b1 alternator: close output_stream when exception is thrown during response streaming
When exception occurs and we omit closing output_stream then the whole process is brought down
by an assertion in ~output_stream.

Fixes https://github.com/scylladb/scylladb/issues/14453
Relates https://github.com/scylladb/scylladb/issues/14403

Closes #14454

(cherry picked from commit 6424dd5ec4)
2023-07-13 23:27:46 +03:00
Nadav Har'El
ee8b26167b Merge 'Yield while building large results in Alternator - rjson::print, executor::batch_get_item' from Marcin Maliszkiewicz
Adds preemption points used in Alternator when:
 - sending bigger json response
 - building results for BatchGetItem

I've tested manually by inserting in preemptible sections (e.g. before `os.write`) code similar to:

    auto start  = std::chrono::steady_clock::now();
    do { } while ((std::chrono::steady_clock::now() - start) < 100ms);

and seeing reactor stall times. After the patch they
were not increasing while before they kept building up due to no preemption.

Refs #7926
Fixes #13689

Closes #12351

* github.com:scylladb/scylladb:
  alternator: remove redundant flush call in make_streamed
  utils: yield when streaming json in print()
  alternator: yield during BatchGetItem operation

(cherry picked from commit d2e089777b)
2023-07-13 23:27:38 +03:00
Yaron Kaikov
02bc54d4b6 release: prepare for 5.2.5 2023-07-13 14:23:18 +03:00
Avi Kivity
c9a5c4c876 Merge ' message: match unknown tenants to the default tenant' from Botond Dénes
On connection setup, the isolation cookie of the connection is matched to the appropriate scheduling group. This is achieved by iterating over the known statement tenant connection types as well as the system connections and choosing the one with a matching name.

If a match is not found, it is assumed that the cluster is upgraded and the remote node has a scheduling group the local one doesn't have. To avoid demoting a scheduling group of unknown importance, in this case the default scheduling group is chosen.

This is problematic when upgrading an OSS cluster to an enterprise version, as the scheduling groups of the enterprise service-levels will match none of the statement tenants and will hence fall back to the default scheduling group. As a consequence, while the cluster is mixed, user workloads on old (OSS) nodes will be executed under the system scheduling group and concurrency semaphore. Not only does this mean that user workloads are directly competing for resources with system ones, but the two workloads are now sharing the semaphore too, reducing the available throughput. This usually manifests in queries timing out on the old (OSS) nodes in the cluster.

This PR proposes to fix this, by recognizing that the unknown scheduling group is in fact a tenant this node doesn't know yet, and matching it with the default statement tenant. With this, order should be restored, with service-level connections being recognized as user connections and being executed in the statement scheduling group and the statement (user) concurrency semaphore.

I tested this manually, by creating a cluster of 2 OSS nodes, then upgrading one of the nodes to enterprise and verifying (with extra logging) that service level connections are matched to the default statement tenant after the PR and they indeed match to the default scheduling group before.

Fixes: #13841
Fixes: #12552

Closes #13843

* github.com:scylladb/scylladb:
  message: match unknown tenants to the default tenant
  message: generalize per-tenant connection types

(cherry picked from commit a7c2c9f92b)
2023-07-12 15:31:48 +03:00
Tomasz Grabiec
1a6f4389ae Merge 'atomic_cell: compare value last' from Benny Halevy
Currently, when two cells have the same write timestamp
and both are alive or expiring, we compare their value first,
before checking if either of them is expiring
and if both are expiring, comparing their expiration time
and ttl value to determine which of them will expire
later or was written later.

This was based on an early version of Cassandra.
However, the Cassandra implementation rightfully changed in
e225c88a65 ([CASSANDRA-14592](https://issues.apache.org/jira/browse/CASSANDRA-14592)),
where the cell expiration is considered before the cell value.

To summarize, the motivation for this change is three fold:
1. Cassandra compatibility
2. Prevent an edge case where a null value is returned by select query when an expired cell has a larger value than a cell with later expiration.
3. A generalization of the above: value-based reconciliation may cause a select query to return a mixture of upserts, if multiple upserts use the same timestamp but have different expiration times.  If the cell value is considered before expiration, the select result may contain cells from different inserts, while reconciling based on the expiration times will choose cells consistently from one of the upserts, as all cells in the respective upsert carry the same expiration time.

\Fixes scylladb/scylladb#14182

Also, this series:
- updates dml documentation
- updates internal documentation
- updates and adds unit tests and cql pytest reproducing #14182

\Closes scylladb/scylladb#14183

* github.com:scylladb/scylladb:
  docs: dml: add update ordering section
  cql-pytest: test_using_timestamp: add tests for rewrites using same timestamp
  mutation_partition: compare_row_marker_for_merge: consider ttl in case expiry is the same
  atomic_cell: compare_atomic_cell_for_merge: update and add documentation
  compare_atomic_cell_for_merge: compare value last for live cells
  mutation_test: test_cell_ordering: improve debuggability

(cherry picked from commit 87b4606cd6)

Closes #14649
2023-07-12 10:09:56 +03:00
Calle Wilund
1088c3e24a storage_proxy: Make split_stats resilient to being called from different scheduling group
Fixes #11017

When doing writes, storage proxy creates types deriving from abstract_write_response_handler.
These are created in the various scheduling groups executing the write inducing code. They
pick up a group-local reference to the various metrics used by SP. Normally all code
using (and esp. modifying) these metrics are executed in the same scheduling group.
However, if gossip sees a node go down, it will notify listeners, which eventually
calls get_ep_stat and register_metrics.
This code (before this patch) uses _active_ scheduling group to eventually add
metrics, using a local dict as guard against double regs. If, as described above,
we're called in a different sched group than the original one however, this
can cause double registrations.

Fixed here by keeping a reference to creating scheduling group and using this, not
active one, when/if creating new metrics.

Closes #14636
2023-07-12 09:24:56 +03:00
Botond Dénes
c9cb8dcfd0 Merge '[backport 5.2] view: fix range tombstone handling on flushes in view_updating_consumer' from Michał Chojnowski
View update routines accept mutation objects.
But what comes out of staging sstable readers is a stream of mutation_fragment_v2 objects.
To build view updates after a repair/streaming, we have to convert the fragment stream into mutations. This is done by piping the stream to mutation_rebuilder_v2.

To keep memory usage limited, the stream for a single partition might have to be split into multiple partial mutation objects. view_update_consumer does that, but in an improper way -- when the split/flush happens inside an active range tombstone, the range tombstone isn't closed properly. This is illegal, and triggers an internal error.

This patch fixes the problem by closing the active range tombstone (and reopening in the same position in the next mutation object).

The tombstone is closed just after the last seen clustered position. This is not necessary for correctness -- for example we could delay all processing of the range tombstone until we see its end bound -- but it seems like the most natural semantic.

Backported from c25201c1a3. `view_build_test.cc` needed some tiny adjustments for the backport.

Closes #14619
Fixes #14503

* github.com:scylladb/scylladb:
  test: view_build_test: add range tombstones to test_view_update_generator_buffering
  test: view_build_test: add test_view_udate_generator_buffering_with_random_mutations
  view_updating_consumer: make buffer limit a variable
  view: fix range tombstone handling on flushes in view_updating_consumer
2023-07-11 15:04:23 +03:00
Takuya ASADA
91c1feec51 scylla_raid_setup: wipe filesystem signatures from specified disks
The discussion on the thread says that when we reformat a volume with another
filesystem, the kernel and libblkid may skip populating /dev/disk/by-* because
they detect two filesystem signatures, since mkfs.xxx did not clear the
previous filesystem signature.
To avoid this, we need to run wipefs before running mkfs.

Note that this runs wipefs twice, for target disks and also for RAID device.
wipefs for RAID device is needed since wipefs on disks doesn't clear filesystem signatures on /dev/mdX (we may see previous filesystem signature on /dev/mdX when we construct RAID volume multiple time on same disks).

Also dropped -f option from mkfs.xfs, it will check wipefs is working as we
expected.

Fixes #13737

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #13738

(cherry picked from commit fdceda20cc)
2023-07-11 15:00:03 +03:00
Piotr Dulikowski
57d0310dcc combined: mergers: remove recursion in operator()()
In mutation_reader_merger and clustering_order_reader_merger, the
operator()() is responsible for producing mutation fragments that will
be merged and pushed to the combined reader's buffer. Sometimes, it
might have to advance existing readers, open new and / or close some
existing ones, which requires calling a helper method and then calling
operator()() recursively.

In some unlucky circumstances, a stack overflow can occur:

- Readers have to be opened incrementally,
- Most or all readers must not produce any fragments and need to report
  end of stream without preemption,
- There has to be enough readers opened within the lifetime of the
  combined reader (~500),
- All of the above needs to happen within a single task quota.

In order to prevent such a situation, the code of both reader merger
classes were modified not to perform recursion at all. Most of the code
of the operator()() was moved to maybe_produce_batch which does not
recur if it is not possible for it to produce a fragment, instead it
returns std::nullopt and operator()() calls this method in a loop via
seastar::repeat_until_value.

A regression test is added.

Fixes: scylladb/scylladb#14415

Closes #14452

(cherry picked from commit ee9bfb583c)

Closes #14605
2023-07-11 11:09:25 +03:00
Michał Chojnowski
78f25f2d36 test: view_build_test: add range tombstones to test_view_update_generator_buffering
This patch adds a full-range tombstone to the compacted mutation.
This raises the coverage of the test. In particular, it reproduces
issue #14503, which should have been caught by this test, but wasn't.
2023-07-11 09:44:00 +02:00
Michał Chojnowski
14fa3ee34e test: view_build_test: add test_view_udate_generator_buffering_with_random_mutations
A random mutation test for view_updating_consumer's buffering logic.
Reproduces #14503.
2023-07-11 09:44:00 +02:00
Michał Chojnowski
75933b9906 view_updating_consumer: make buffer limit a variable
The limit doesn't change at runtime, but this patch makes it a variable
for unit testing purposes.
2023-07-11 09:44:00 +02:00
Michał Chojnowski
fc7b02c8e4 view: fix range tombstone handling on flushes in view_updating_consumer
View update routines accept `mutation` objects.
But what comes out of staging sstable readers is a stream of
mutation_fragment_v2 objects.
To build view updates after a repair/streaming, we have to
convert the fragment stream into `mutation`s. This is done by piping
the stream to mutation_rebuilder_v2.

To keep memory usage limited, the stream for a single partition might
have to be split into multiple partial `mutation` objects.
view_update_consumer does that, but in an improper way -- when the
split/flush happens inside an active range tombstone, the range
tombstone isn't closed properly. This is illegal, and triggers an
internal error.

This patch fixes the problem by closing the active range tombstone
(and reopening in the same position in the next `mutation` object).

The tombstone is closed just after the last seen clustered position.
This is not necessary for correctness -- for example we could delay
all processing of the range tombstone until we see its end
bound -- but it seems like the most natural semantic.

Fixes #14503
2023-07-11 09:44:00 +02:00
Jan Ciolek
0f4f8638c5 forward_service: fix forgetting case-sensitivity in aggregates
There was a bug that caused aggregates to fail when
used on case-sensitive columns.

For example:
```
SELECT SUM("SomeColumn") FROM ks.table;
```
would fail, with a message saying that there
is no column "somecolumn".

This is because the case-sensitivity got lost on the way.

For non case-sensitive column names we convert them to lowercase,
but for case sensitive names we have to preserve the name
as originally written.

The problem was in `forward_service` - we took a column name
and created a non case-sensitive `column_identifier` out of it.
This converted the name to lowercase, and later such column
couldn't be found.

To fix it, let's make the `column_identifier` case-sensitive.
It will preserve the name, without converting it to lowercase.

Fixes: https://github.com/scylladb/scylladb/issues/14307

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 7fca350075)
2023-07-10 15:22:58 +03:00
Botond Dénes
0ba37fa431 Merge 'doc: fix rollback in the 4.3-to-2021.1, 5.0-to-2022.1, and 5.1-to-2022.2 upgrade guides' from Anna Stuchlik
This PR fixes the Restore System Tables section of the upgrade guides by adding a command to clean upgraded SStables during rollback or adding the entire section to restore system tables (which was missing from the older documents).

This PR fixes a bug and must be backported to branch-5.3, branch-5.2, and branch-5.1.

Refs: https://github.com/scylladb/scylla-enterprise/issues/3046

- [x]  5.1-to-2022.2 - update command (backport to branch-5.3, branch-5.2, and branch-5.1)
- [x]  5.0-to-2022.1 - add "Restore system tables" to rollback (backport to branch-5.3, branch-5.2, and branch-5.1)
- [x]  4.3-to-2021.1 - add "Restore system tables" to rollback (backport to branch-5.3, branch-5.2, and branch-5.1)

(see https://github.com/scylladb/scylla-enterprise/issues/3046#issuecomment-1604232864)

Closes #14444

* github.com:scylladb/scylladb:
  doc: fix rollback in 4.3-to-2021.1 upgrade guide
  doc: fix rollback in 5.0-to-2022.1 upgrade guide
  doc: fix rollback in 5.1-to-2022.2 upgrade guide

(cherry picked from commit 8a7261fd70)
2023-07-10 15:16:24 +03:00
Raphael S. Carvalho
55edbded47 compaction: avoid excessive reallocation during input list formatting
With off-strategy compaction, the input list size can be close to 1k,
which leads to unneeded reallocations when formatting the list for
logging.

In the past, we faced stalls in this area, and excessive reallocation
(log2 of ~1k = ~10 reallocations) may have contributed to that.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13907

(cherry picked from commit 5544d12f18)

Fixes scylladb/scylladb#14071
2023-07-09 23:54:18 +03:00
Marcin Maliszkiewicz
9f79c9f41d docs: link general repairs page to RBNO page
Information was duplicated before, and the version on this page was outdated: RBNO is already enabled for the replace operation.

Closes #12984

(cherry picked from commit bd7caefccf)
2023-07-07 16:38:32 +02:00
Kamil Braun
6dd09bb4ea storage_proxy: query_partition_key_range_concurrent: don't access empty range
`query_partition_range_concurrent` implements an optimization when
querying a token range that intersects multiple vnodes. Instead of
sending a query for each vnode separately, it sometimes sends a single
query to cover multiple vnodes - if the intersection of replica sets for
those vnodes is large enough to satisfy the CL and good enough in terms
of the heat metric. To check the latter condition, the code would take
the smallest heat metric of the intersected replica set and compare them
to smallest heat metrics of replica sets calculated separately for each
vnode.

Unfortunately, there was an edge case that the code didn't handle: the
intersected replica set might be empty and the code would access an
empty range.

This was caught by an assertion added in
8db1d75c6c by the dtest
`test_query_dc_with_rf_0_does_not_crash_db`.

The fix is simple: check if the intersected set is empty - if so, don't
calculate the heat metrics because we can decide early that the
optimization doesn't apply.

Also change the `assert` to `on_internal_error`.

Fixes #14284

Closes #14300

(cherry picked from commit 732feca115)

Backport note: the original `assert` was never added to branch-5.2, but
the fix is still applicable, so I backported the fix and the
`on_internal_error` check.
2023-07-05 13:14:24 +02:00
Mikołaj Grzebieluch
f431345ab6 raft topology: wait_for_peers_to_enter_synchronize_state doesn't need to resolve all IPs
Another node can stop after it joined the group0 but before it advertised itself
in gossip. `get_inet_addrs` will try to resolve all IPs and
`wait_for_peers_to_enter_synchronize_state` will loop indefinitely.

But `wait_for_peers_to_enter_synchronize_state` can return early if one of
the nodes confirms that the upgrade procedure has finished. For that, it doesn't
need the IPs of all group 0 members - only the IP of some nodes which can do
the confirmation.

This commit restructures the code so that IPs of nodes are resolved inside the
`max_concurrent_for_each` that `wait_for_peers_to_enter_synchronize_state` performs.
Then, even if some IPs won't be resolved, but one of the nodes confirms a
successful upgrade, we can continue.

Fixes #13543

(cherry picked from commit a45e0765e4)
2023-07-05 13:01:57 +02:00
Anna Stuchlik
009601d374 doc: fix rollback in 5.2-to-2023.1 upgrade guide
This commit fixes the Restore System Tables section
in the 5.2-to-2023.1 upgrade guide by adding a command
to clean upgraded SStables during rollback.

This is a bug (an incomplete command) and must be
backported to branch-5.3 and branch-5.2.

Refs: https://github.com/scylladb/scylla-enterprise/issues/3046

Closes #14373

(cherry picked from commit f4ae2c095b)
2023-06-29 12:07:41 +03:00
Botond Dénes
8e63b2f3e3 Merge 'readers: evictable_reader: don't accidentally consume the entire partition' from Kamil Braun
The evictable reader must ensure that each buffer fill makes forward progress, i.e. the last fragment in the buffer has a position larger than the last fragment from the previous buffer-fill. Otherwise, the reader could get stuck in an infinite loop between buffer fills, if the reader is evicted in-between.

The code guaranteeing this forward progress had a bug: the comparison between the position after the last buffer-fill and the current last fragment position was done in the wrong direction.

So if the condition that we wanted to achieve was already true, we would continue filling the buffer until the end of the partition, which may lead to OOMs such as in #13491.

There was already a fix in this area to handle `partition_start` fragments correctly - #13563 - but it missed that the position comparison was done in the wrong order.

Fix the comparison and adjust one of the tests (added in #13563) to detect this case.

After the fix, the evictable reader starts generating some redundant (but expected) range tombstone change fragments since it's now being paused and resumed. For this we need to adjust mutation source tests which were a bit too specific. We modify `flat_mutation_reader_assertions` to squash the redundant `r_t_c`s.
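The reversed comparison is easy to see in a toy model (integer positions stand in for fragment positions; this is not the reader's actual code): the fill loop should stop once the buffer's last position is strictly past the previous fill's last position, but the buggy predicate tested the opposite direction.

```python
def fill(partition, prev_last_pos, stop_when):
    """Consume fragments from `partition` until `stop_when` reports that
    the buffer has made forward progress past `prev_last_pos`."""
    buf = []
    for pos in partition:
        buf.append(pos)
        if stop_when(pos, prev_last_pos):
            break
    return buf

# Intended predicate: stop once we are past the previous last position.
correct = lambda pos, prev: pos > prev
# Buggy predicate: the same comparison, accidentally reversed.
buggy = lambda pos, prev: prev > pos

# After eviction the reader resumes past prev_last_pos, so the desired
# condition is already true for the very first fragment.
partition = list(range(11, 100))
assert fill(partition, 10, correct) == [11]      # stops immediately
assert fill(partition, 10, buggy) == partition   # consumes the whole partition
```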

Fixes #13491

Closes #14375

* github.com:scylladb/scylladb:
  readers: evictable_reader: don't accidentally consume the entire partition
  test: flat_mutation_reader_assertions: squash `r_t_c`s with the same position

(cherry picked from commit 586102b42e)
2023-06-29 12:04:35 +03:00
Benny Halevy
483c0b183a repair: use fmt::join to print ks_erms|boost::adaptors::map_keys
This is a minimal fix for #13146 for branch-5.2

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #14405
2023-06-27 14:15:28 +03:00
Anna Stuchlik
d5063e6347 doc: add Ubuntu 22 to 2021.1 OS support
Fixes https://github.com/scylladb/scylla-enterprise/issues/3036

This commit adds support for Ubuntu 22.04 to the list
of OSes supported by ScyllaDB Enterprise 2021.1.

This commit fixes a bug and must be backported to
branch-5.3 and branch-5.2.

Closes #14372

(cherry picked from commit 74fc69c825)
2023-06-26 13:43:58 +03:00
Anna Stuchlik
543fa04e4d doc: update the OSS docs landing page
Fixes https://github.com/scylladb/scylladb/issues/14333

This commit replaces the documentation landing page with
the Open Source-only documentation landing page.

This change is required as now there is a separate landing
page for the ScyllaDB documentation, so the page is duplicated,
creating bad user experience.

(cherry picked from commit f60f89df17)

Closes #14370
2023-06-23 14:00:33 +02:00
Anna Mikhlin
cebbf6c5df release: prepare for 5.2.4 2023-06-22 16:23:46 +03:00
Avi Kivity
73b8669953 Update seastar submodule (default priority class shares)
* seastar 32ab15cda6...29a0e64513 (1):
  > reactor: change shares for default IO class from 1 to 200

Fixes #13753.

In 5.3: 37e6e65211
2023-06-21 21:23:14 +03:00
Botond Dénes
9efca96cf2 Merge 'Backport 5.2 test.py stability/UX improvements' from Kamil Braun
Backport the following improvements for test.py topology tests for CI stability:
- https://github.com/scylladb/scylladb/pull/12652
- https://github.com/scylladb/scylladb/pull/12630
- https://github.com/scylladb/scylladb/pull/12619
- https://github.com/scylladb/scylladb/pull/12686
- picked from https://github.com/scylladb/scylladb/pull/12726: 9ceb6aba81
- picked from https://github.com/scylladb/scylladb/pull/12173: fc60484422
- https://github.com/scylladb/scylladb/pull/12765
- https://github.com/scylladb/scylladb/pull/12804
- https://github.com/scylladb/scylladb/pull/13342
- https://github.com/scylladb/scylladb/pull/13589
- picked from https://github.com/scylladb/scylladb/pull/13135: 7309a1bd6b
- picked from https://github.com/scylladb/scylladb/pull/13134: 21b505e67c, a4411e9ec4, c1d0ee2bce, 8e3392c64f, 794d0e4000, e407956e9f
- https://github.com/scylladb/scylladb/pull/13271
- https://github.com/scylladb/scylladb/pull/13399
- picked from https://github.com/scylladb/scylladb/pull/12699: 3508a4e41e, 08d754e13f, 62a945ccd5, 041ee3ffdd
- https://github.com/scylladb/scylladb/pull/13438 (but skipped the test_mutation_schema_change.py fix since I didn't backport this new test)
- https://github.com/scylladb/scylladb/pull/13427
- https://github.com/scylladb/scylladb/pull/13756
- https://github.com/scylladb/scylladb/pull/13789
- https://github.com/scylladb/scylladb/pull/13933 (but skipped the test_snapshot.py fix since I didn't backport this new test)

Closes #14215

* github.com:scylladb/scylladb:
  test: pylib: fix `read_barrier` implementation
  test: pylib: random_tables: perform read barrier in `verify_schema`
  test: issue a read barrier before checking ring consistency
  Merge 'scylla_cluster.py: fix read_last_line' from Gusev Petr
  test/pylib: ManagerClient helpers to wait for...
  test: pylib: Add a way to create cql connections with particular coordinators
  test/pylib: get gossiper alive endpoints
  test/topology: default replication factor 3
  test/pylib: configurable replication factor
  scylla_cluster.py: optimize node logs reading
  test/pylib: RandomTables.add_column with value column
  scylla_cluster.py: add start flag to server_add
  ServerInfo: drop host_id
  scylla_cluster.py: add config to server_add
  scylla_cluster.py: add expected_error to server_start
  scylla_cluster.py: ScyllaServer.start, refactor error reporting
  scylla_cluster.py: fix ScyllaServer.start, reset cmd if start failed
  test: improve logging in ScyllaCluster
  test: topology smp test with custom cluster
  test/pylib: topology: support clusters of initial size 0
  Merge 'test/pylib: split and refactor topology tests' from Alecco
  Merge 'test/pylib: use larger timeout for decommission/removenode' from Kamil Braun
  test: Increase START_TIMEOUT
  test/pylib: one-shot error injection helper
  test: topology: wait for token ring/group 0 consistency after decommission
  test: topology: verify that group 0 and token ring are consistent
  Merge 'pytest: start after ungraceful stop' from Alecco
  Merge 'test.py: improve test failure handling' from Kamil Braun
2023-06-15 07:19:39 +03:00
Pavel Emelyanov
210e3d1999 Backport 'Merge 'Enlighten messaging_service::shutdown()''
This includes seastar update titled
  'Merge 'Split rpc::server stop into two parts''

* br-5.2-backport-ms-shutdown:
  messaging_service: Shutdown rpc server on shutdown
  messaging_service: Generalize stop_servers()
  messaging_service: Restore indentation after previous patch
  messaging_service: Coroutinize stop()
  messaging_service: Coroutinize stop_servers()
  Update seastar submodule

refs: #14031
2023-06-14 09:14:06 +03:00
Pavel Emelyanov
702d622b38 messaging_service: Shutdown rpc server on shutdown
The RPC server now has a lighter .shutdown() method that does just what
messaging_service::shutdown() needs, so call it there. On stop, call the
regular stop() to finalize the stopping process.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:04:04 +03:00
Pavel Emelyanov
db44630254 messaging_service: Generalize stop_servers()
Rename it to do_with_servers() and make it accept the method to call and the
message to print. This makes the helper reusable in the next patch.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:03:59 +03:00
Pavel Emelyanov
5d3d64bafe messaging_service: Restore indentation after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:03:53 +03:00
Pavel Emelyanov
079f5d8eca messaging_service: Coroutinize stop()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:03:48 +03:00
Pavel Emelyanov
fd7310b104 messaging_service: Coroutinize stop_servers()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:03:42 +03:00
Pavel Emelyanov
991d00964d Update seastar submodule
* seastar 8c86e6de...32ab15cd (1):
  > rpc: Introduce server::shutdown()

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-06-14 09:02:46 +03:00
Anna Stuchlik
0137ddaec8 doc: remove support for Ubuntu 18
Fixes https://github.com/scylladb/scylladb/issues/14097

This commit removes support for Ubuntu 18 from
platform support for ScyllaDB Enterprise 2023.1.

The update is in sync with the change made for
ScyllaDB 5.2.

This commit must be backported to branch-5.2 and
branch-5.3.

Closes #14118

(cherry picked from commit b7022cd74e)
2023-06-13 12:06:56 +03:00
Raphael S. Carvalho
58f88897c8 compaction: Fix incremental compaction for sstable cleanup
After c7826aa910, sstable runs are cleaned up together.

The procedure which executes cleanup was holding a reference to all
input sstables, so that it could later retry the same cleanup
job on failure.

It turned out not to take into account that incremental compaction
exhausts the input set incrementally.

Therefore cleanup is affected by the 100% space overhead.

To fix it, cleanup will now have the input set updated, by removing
the sstables that were already cleaned up. On failure, cleanup
will retry the same job with the remaining sstables that weren't
exhausted by incremental compaction.

New unit test reproduces the failure, and passes with the fix.

Fixes #14035.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14038

(cherry picked from commit 23443e0574)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #14193
2023-06-13 09:53:46 +03:00
Kamil Braun
f4115528d6 test: pylib: fix read_barrier implementation
The previous implementation didn't actually do a read barrier, because
the statement failed on an early prepare/validate step which happened
before read barrier was even performed.

Change it to a statement which does not fail and doesn't perform any
schema change but requires a read barrier.

This breaks one test which uses `RandomTables.verify_schema()` when only
one node is alive, but `verify_schema` performs a read barrier. Unbreak
it by skipping the read barrier in this case (it makes sense in this
particular test).

Closes #13933

(cherry picked from commit 64dc76db55)
Backport note: skipped the test_snapshot.py change, as the test doesn't
exist on this branch.
2023-06-12 12:40:22 +02:00
Kamil Braun
9c941aba0b test: pylib: random_tables: perform read barrier in verify_schema
`RandomTables.verify_schema` is often called in topology tests after
performing a schema change. It compares the schema tables fetched from
some node to the expected latest schema stored by the `RandomTables`
object.

However there's no guarantee that the latest schema change has already
propagated to the node which we query. We could have performed the
schema change on a different node and the change may not have been
applied yet on all nodes.

To fix that, pick a specific node and perform a read barrier on it, then
use that node to fetch the schema tables.

Fixes #13788

Closes #13789

(cherry picked from commit 3f3dcf451b)
2023-06-12 12:40:22 +02:00
Konstantin Osipov
094bcac399 test: issue a read barrier before checking ring consistency
Raft replication doesn't guarantee that all replicas see
identical Raft state at all times, it only guarantees the
same order of events on all replicas.

When comparing raft state with gossip state on a node, first
issue a read barrier to ensure the node has the latest raft state.

To issue a read barrier it is sufficient to alter a non-existing
state: in order to validate the DDL the node needs to sync with the
leader and fetch its latest group0 state.

Fixes #13518 (flaky topology test).

Closes #13756

(cherry picked from commit e7c9ca560b)
2023-06-12 12:40:22 +02:00
Kamil Braun
e49a531aaa Merge 'scylla_cluster.py: fix read_last_line' from Gusev Petr
This is a follow-up to #13399, the patch
addresses the issues mentioned there:
* linesep can be split between blocks;
* linesep can be part of UTF-8 sequence;
* avoid excessively long lines, limit to 256 chars;
* the logic of the function made simpler and more maintainable.

Closes #13427

* github.com:scylladb/scylladb:
  pylib_test: add tests for read_last_line
  pytest: add pylib_test directory
  scylla_cluster.py: fix read_last_line
  scylla_cluster.py: move read_last_line to util.py

(cherry picked from commit 70f2b09397)
2023-06-12 12:40:22 +02:00
Alejo Sanchez
bcf99a37cd test/pylib: ManagerClient helpers to wait for...
server to see other servers after start/restart

When starting/restarting a server, provide a way to wait for the server
to see at least n other servers.

Also leave the implementation methods available for manual use and
update previous tests, one to wait for a specific server to be seen, and
one to wait for a specific server to not be seen (down).

Fixes #13147

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #13438

(cherry picked from commit 11561a73cb)
Backport note: skipped the test_mutation_schema_change.py fix as the
test doesn't exist on this branch.
2023-06-12 12:40:08 +02:00
Tomasz Grabiec
fe4af95745 test: pylib: Add a way to create cql connections with particular coordinators
Usage:

  await manager.driver_connect(server=servers[0])
  manager.cql.execute(f"...", execution_profile='whitelist')

(cherry picked from commit 041ee3ffdd)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
ac5dff7de0 test/pylib: get gossiper alive endpoints
Helper to get list of gossiper alive endpoints from REST API.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 62a945ccd5)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
ad99456a9d test/topology: default replication factor 3
For most tests some nodes will be down; increase the replication factor to
3 to avoid problems with partitions belonging to the down nodes.

Use replication factor 1 for raft upgrade tests.

(cherry picked from commit 08d754e13f)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
937e890fba test/pylib: configurable replication factor
Make replication factor configurable for the RandomTables helper.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 3508a4e41e)
2023-06-12 12:38:15 +02:00
Petr Gusev
12eec5bb2b scylla_cluster.py: optimize node logs reading
There are two occasions in scylla_cluster
where we read the node logs, and in both of
them we read the entire file into memory.
This is not efficient and may cause an OOM.

In the first case we need the last line of the
log file, so we seek at the end and move backwards
looking for a new line symbol.

In the second case we look through the
log file to find the expected_error.
The readlines() method returns a Python
list object, which means it reads the entire
file into memory. It's sufficient to just remove
the call, since iterating over the file object
already yields lines lazily, one by one.
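The seek-from-the-end idea can be sketched as follows (a simplified standalone version, not the scylla_cluster.py code): read fixed-size blocks backwards from the end of the file until a newline is found, capping the returned line at 256 characters.

```python
import os

def read_last_line(path, max_len=256, block=512):
    """Return the last line of a file without loading the whole file:
    scan backwards from the end, block by block, looking for a newline."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        pos = f.tell()
        data = b""
        line = b""
        while pos > 0 and len(data) <= max_len:
            step = min(block, pos)
            pos -= step
            f.seek(pos)
            data = f.read(step) + data
            stripped = data.rstrip(b"\n")      # ignore trailing newlines
            nl = stripped.rfind(b"\n")
            if nl != -1:
                line = stripped[nl + 1:]
                break
        else:
            line = data.rstrip(b"\n")          # the file has a single line
        # Cap the length; errors="replace" tolerates a multi-byte UTF-8
        # sequence that was split at the block or truncation boundary.
        return line[-max_len:].decode("utf-8", errors="replace")

# demo: a log whose last line sits past a block boundary
with open("last_line_demo.txt", "wb") as f:
    f.write(b"x" * 2000 + b"\nfinal line\n")
last = read_last_line("last_line_demo.txt")
os.remove("last_line_demo.txt")
```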

This is a follow-up for #13134.

Closes #13399

(cherry picked from commit 09636b20f3)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
59847389d4 test/pylib: RandomTables.add_column with value column
When adding extra columns in a test, make them value columns. Name them
with the "v_" prefix and use the value-column number counter.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #13271

(cherry picked from commit 81b40c10de)
2023-06-12 12:38:15 +02:00
Petr Gusev
7a8c5db55b scylla_cluster.py: add start flag to server_add
Sometimes when creating a node it's useful
to just install it without starting it. For
example, we may want to try to start it later
with an expected error.

The ScyllaServer.install method has been made
exception safe: if an exception occurs, it
reverts to the original state. This avoids
duplicating the try/except logic
at its two call sites.

(cherry picked from commit e407956e9f)
2023-06-12 12:38:15 +02:00
Petr Gusev
15ea5bf53f ServerInfo: drop host_id
We are going to allow the
ScyllaCluster.add_server function not to
start the server if the caller has requested
that with a special parameter. The host_id
can only be obtained from a running node, so
add_server won't be able to return it in
this case. I've grepped the tests for host_id
and there doesn't seem to be any
reference to it in the code.

(cherry picked from commit 794d0e4000)
2023-06-12 12:38:15 +02:00
Petr Gusev
3ab610753e scylla_cluster.py: add config to server_add
Sometimes when creating a node it's useful
to pass a custom node config.

(cherry picked from commit 8e3392c64f)
2023-06-12 12:38:15 +02:00
Petr Gusev
1959eddf86 scylla_cluster.py: add expected_error to server_start
Sometimes it's useful to check that the node has failed
to start for a particular reason. If server_start can't
find expected_error in the node's log or if the
node has started without errors, it throws an exception.

(cherry picked from commit c1d0ee2bce)
2023-06-12 12:38:15 +02:00
Petr Gusev
43525aec83 scylla_cluster.py: ScyllaServer.start, refactor error reporting
Extract the function that encapsulates all the error
reporting logic. We are going to use it in several
other places to implement expected_error feature.

(cherry picked from commit a4411e9ec4)
2023-06-12 12:38:15 +02:00
Petr Gusev
930c4e65d6 scylla_cluster.py: fix ScyllaServer.start, reset cmd if start failed
The ScyllaServer expects cmd to be None if the
Scylla process is not running. Otherwise, if start failed
and the test called update_config, the latter will
try to send a signal to a non-existent process via cmd.

(cherry picked from commit 21b505e67c)
2023-06-12 12:38:15 +02:00
Konstantin Osipov
d2caaef188 test: improve logging in ScyllaCluster
Print IP addresses and cluster identifiers in more log messages,
it helps debugging.

(cherry picked from commit 7309a1bd6b)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
6474edd691 test: topology smp test with custom cluster
Instead of decommission of initial cluster, use custom cluster.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #13589

(cherry picked from commit ce87aedd30)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
b39cdadff3 test/pylib: topology: support clusters of initial size 0
To allow tests with custom clusters, allow configuration of initial
cluster size of 0.

Add a proof-of-concept test to be removed later.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #13342

(cherry picked from commit e3b462507d)
2023-06-12 12:38:15 +02:00
Nadav Har'El
7b60cddae7 Merge 'test/pylib: split and refactor topology tests' from Alecco
Move long running topology tests out of  `test_topology.py` and into their own files, so they can be run in parallel.

While there, merge simple schema tests.

Closes #12804

* github.com:scylladb/scylladb:
  test/topology: rename topology test file
  test/topology: lint and type for topology tests
  test/topology: move topology ip tests to own file
  test/topology: move topology test remove garbaje...
  test/topology: move topology rejoin test to own file
  test/topology: merge topology schema tests and...
  test/topology: isolate topology smp params test
  test/topology: move topology helpers to common file

(cherry picked from commit a24600a662)
2023-06-12 12:38:15 +02:00
Botond Dénes
ea80fe20ad Merge 'test/pylib: use larger timeout for decommission/removenode' from Kamil Braun
Recently we enabled RBNO by default in all topology operations. This
made the operations a bit slower (repair-based topology ops are a bit
slower than classic streaming - they do more work), and in debug mode
with large number of concurrent tests running, they might timeout.

The timeout for bootstrap was already increased before, do the same for
decommission/removenode. The previously used timeout was 300 seconds
(this is the default used by aiohttp library when it makes HTTP
requests), now use the TOPOLOGY_TIMEOUT constant from ScyllaServer which
is 1000 seconds.

Closes #12765

* github.com:scylladb/scylladb:
  test/pylib: use larger timeout for decommission/removenode
  test/pylib: scylla_cluster: rename START_TIMEOUT to TOPOLOGY_TIMEOUT

(cherry picked from commit e55f475db1)
2023-06-12 12:38:15 +02:00
Asias He
f90fe6f312 test: Increase START_TIMEOUT
It has been observed that the CI machines are slow to run the test. Increase
the timeout for adding servers.

(cherry picked from commit fc60484422)
2023-06-12 12:38:15 +02:00
Alejo Sanchez
6e2c547388 test/pylib: one-shot error injection helper
The existing helper with an async context manager only worked for non-one-shot
error injections. Fix it and add another helper for one-shot injections without
a context manager.

Fix tests using the previous helper.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit 9ceb6aba81)
2023-06-12 12:38:05 +02:00
Kamil Braun
91aa2cd8d7 test: topology: wait for token ring/group 0 consistency after decommission
There was a check for immediate consistency after a decommission
operation has finished in one of the tests, but it turns out that also
after decommission it might take some time for token ring to be updated
on other nodes. Replace the check with a wait.

Also do the wait in another test that performs a sequence of
decommissions. We won't attempt to start another decommission until
every node learns that the previously decommissioned node has left.
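The replace-an-assert-with-a-wait pattern can be sketched like this (a synchronous toy version of a wait_for-style helper; the real pylib one is async): poll the condition until it holds or a deadline passes, instead of checking it once.

```python
import time

def wait_for(check, timeout_s=10.0, period_s=0.1):
    """Poll `check` until it returns a truthy value or the deadline passes."""
    deadline = time.monotonic() + timeout_s
    while True:
        result = check()
        if result:
            return result
        if time.monotonic() > deadline:
            raise TimeoutError("condition not met within the timeout")
        time.sleep(period_s)

# Example condition: one that becomes true only on the third poll, like a
# token ring that takes a moment to converge after a decommission.
polls = {"n": 0}

def ring_consistent():
    polls["n"] += 1
    return polls["n"] >= 3
```

An immediate `assert ring_consistent()` would fail here, while `wait_for(ring_consistent)` succeeds once the state converges.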

Closes #12686

(cherry picked from commit 40142a51d0)
2023-06-12 11:58:32 +02:00
Kamil Braun
05c3f7ecef test: topology: verify that group 0 and token ring are consistent
After topology changes like removing a node, verify that the set of
group 0 members and token ring members is the same.

Modify `get_token_ring_host_ids` to only return NORMAL members. The
previous version which used the `/storage_service/host_id` endpoint
might have returned non-NORMAL members as well.

Fixes: #12153

Closes #12619

(cherry picked from commit fa9cf81af2)
2023-06-12 11:58:02 +02:00
Kamil Braun
3aa73e8b5a Merge 'pytest: start after ungraceful stop' from Alecco
If a server is stopped suddenly (i.e. not graceful), schema tables might
be in inconsistent state. Add a test case and enable Scylla
configuration option (force_schema_commit_log) to handle this.

Fixes #12218

Closes #12630

* github.com:scylladb/scylladb:
  pytest: test start after ungraceful stop
  test.py: enable force_schema_commit_log

(cherry picked from commit 5eadea301e)
2023-06-12 11:57:09 +02:00
Nadav Har'El
a0ba3b3350 Merge 'test.py: improve test failure handling' from Kamil Braun
Improve logging by printing the cluster at the end of each test.

Stop performing operations like attempting queries or dropping keyspaces on dirty clusters. Dirty clusters might be completely dead and these operations would only cause more "errors" to happen after a failed test, making it harder to find the real cause of failure.

Mark cluster as dirty when a test that uses it fails - after a failed test, we shouldn't assume that the cluster is in a usable state, so we shouldn't reuse it for another test.

Rely on the `is_dirty` flag in `PythonTest`s and `CQLApprovalTest`s, similarly to what `TopologyTest`s do.

Closes #12652

* github.com:scylladb/scylladb:
  test.py: rely on ScyllaCluster.is_dirty flag for recycling clusters
  test/topology: don't drop random_tables keyspace after a failed test
  test/pylib: mark cluster as dirty after a failed test
  test: pylib, topology: don't perform operations after test on a dirty cluster
  test/pylib: print cluster at the end of test

(cherry picked from commit 2653865b34)
2023-06-12 11:47:54 +02:00
Anna Mikhlin
ea08d409f1 release: prepare for 5.2.3 2023-06-08 22:04:50 +03:00
Avi Kivity
f32971b81f Merge 'multishard_mutation_query: make reader_context::lookup_readers() exception safe' from Botond Dénes
With regard to closing the looked-up querier if an exception is thrown. In particular, this requires closing the querier if a semaphore mismatch is detected. Move the table lookup above the line where the querier is looked up, to avoid having to handle the exception from it. As a consequence of closing the querier on the error path, the lookup lambda has to be made a coroutine. This is sad, but it is executed once per page, so its cost should be insignificant when spread over an entire page's worth of work.

Also add a unit test checking that the mismatch is detected in the first place and that readers are closed.

Fixes: #13784

Closes #13790

* github.com:scylladb/scylladb:
  test/boost/database_test: add unit test for semaphore mismatch on range scans
  partition_slice_builder: add set_specific_ranges()
  multishard_mutation_query: make reader_context::lookup_readers() exception safe
  multishard_mutation_query: lookup_readers(): make inner lambda a coroutine

(cherry picked from commit 1c0e8c25ca)
2023-06-08 04:29:51 -04:00
Michał Chojnowski
8872157422 data_dictionary: fix forgetting of UDTs on ALTER KEYSPACE
Due to a simple programming oversight, one of keyspace_metadata
constructors is using empty user_types_metadata instead of the
passed one. Fix that.

Fixes #14139

Closes #14143

(cherry picked from commit 1a521172ec)
2023-06-06 21:52:47 +03:00
Kamil Braun
b5785ed434 auth: don't use infinite timeout in default_role_row_satisfies query
A long long time ago there was an issue about removing infinite timeouts
from distributed queries: #3603. There was also a fix:
620e950fc8. But apparently some queries
escaped the fix, like the one in `default_role_row_satisfies`.

With the right conditions and timing this query may cause a node to hang
indefinitely on shutdown. A node tries to perform this query after it
starts. If we kill another node which is required to serve this query
right before that moment, the query will hang; when we try to shutdown
the querying node, it will wait for the query to finish (it's a
background task in auth service), which it never does due to infinite
timeout.

Use the same timeout configuration as other queries in this module do.
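The shape of the fix can be sketched with asyncio (hypothetical names; Scylla's actual timeouts come from its query configuration, not asyncio): bounding the query with a finite timeout turns a dead peer into an ordinary query failure instead of a shutdown hang.

```python
import asyncio

async def query_with_timeout(run_query, timeout_s):
    """Run a distributed query with a finite timeout; a peer that never
    answers produces a normal failure instead of blocking forever."""
    try:
        return await asyncio.wait_for(run_query(), timeout_s)
    except asyncio.TimeoutError:
        return None   # the caller handles this like any other query failure

async def demo():
    async def dead_peer_query():
        await asyncio.sleep(3600)   # the serving node was killed; no answer
    # With an infinite timeout this await would hang (and so would shutdown,
    # which waits for the background task to finish).
    return await query_with_timeout(dead_peer_query, timeout_s=0.05)
```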

Fixes #13545.

Closes #14134

(cherry picked from commit f51312e580)
2023-06-06 19:39:29 +03:00
Pavel Emelyanov
70f93767fd Update seastar submodule
* seastar 98504c4b...8c86e6de (1):
  > rpc: Wait for server socket to stop before killing conns

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-05-30 20:10:44 +03:00
Tzach Livyatan
bb3751334c Remove Ubuntu 18.04 support from 5.2
Ubuntu [18.04 will soon be out of standard support](https://ubuntu.com/blog/18-04-end-of-standard-support) and can be removed from the 5.2 supported list.
https://github.com/scylladb/scylla-pkg/issues/3346

Closes #13529

(cherry picked from commit e655060429)
2023-05-30 16:25:42 +03:00
Beni Peled
9dd70a58c3 release: prepare for 5.2.2 2023-05-18 14:03:20 +03:00
Anna Stuchlik
0bc6694ac5 doc: fix the links to the Enterprise docs
Fixes https://github.com/scylladb/scylladb/issues/13915

This commit fixes broken links to the Enterprise docs.
They are links to the enterprise branch, which is not
published. The links to the Enterprise docs should include
"stable" instead of the branch name.

This commit must be backported to branch-5.2, because
the broken links are present in the published 5.2 docs.

Closes #13917

(cherry picked from commit 6f4a68175b)
2023-05-18 08:40:02 +03:00
Botond Dénes
486483b379 Merge '[Backport 5.2]: node ops backports' from Benny Halevy
This branch backports to branch-5.2 several fixes related to node operations:
- ba919aa88a (PR #12980; Fixes: #11011, #12969)
- 53636167ca (part of PR #12970; Fixes: #12764, #12956)
- 5856e69462 (part of PR #12970)
- 2b44631ded (PR #13028; Fixes: #12989)
- 6373452b31 (PR #12799; Fixes #12798)

Closes #13531

* github.com:scylladb/scylladb:
  Merge 'Do not mask node operation errors' from Benny Halevy
  Merge 'storage_service: Make node operations safer by detecting asymmetric abort' from Tomasz Grabiec
  storage_service: Wait for normal state handler to finish in replace
  storage_service: Wait for normal state handler to finish in bootstrap
  storage_service: Send heartbeat earlier for node ops
2023-05-17 16:46:49 +03:00
Tzach Livyatan
9afaec5b12 Update Azure recommended instances type from the Lsv2-series to the Lsv3-series
Closes #13835

(cherry picked from commit a73fde6888)
2023-05-17 15:41:47 +03:00
Anna Stuchlik
9c99dc36b5 doc: add OS support for version 2023.1
Fixes https://github.com/scylladb/scylladb/issues/13857

This commit adds the OS support for ScyllaDB Enterprise 2023.1.
The support is the same as for ScyllaDB Open Source 5.2, on which
2023.1 is based.

After this commit is merged, it must be backported to branch-5.2.
In this way, it will be merged to branch-2023.1 and available in
the docs for Enterprise 2023.1

Closes: #13858
(cherry picked from commit 84ed95f86f)
2023-05-16 10:11:21 +03:00
Tomasz Grabiec
548a7f73d3 Merge 'range_tombstone_change_generator: fix an edge case in flush()' from Michał Chojnowski
range_tombstone_change_generator::flush() mishandles the case when two range
tombstones are adjacent and flush(pos, end_of_range=true) is called with pos
equal to the end bound of the lesser-position range tombstone.

In such case, the start change of the greater-position rtc will be accidentally
emitted, and there won't be an end change, which breaks reader assumptions by
ending the stream with an unclosed range tombstone, triggering an assertion.

This is due to a non-strict inequality used in a place where strict inequality
should be used. The modified line was intended to close range tombstones
which end exactly on the flush position, but this is unnecessary because such
range tombstones are handled by the last `if` in the function anyway.
Instead, this line caused range tombstones beginning right after the flush
position to be emitted sometimes.
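A toy model of the inequality at the heart of the bug (plain (start, end) pairs; the real generator is stateful and works on clustering positions): flushing at the exact boundary between two adjacent tombstones must not emit the start change of the second one.

```python
def starts_emitted_by_flush(tombstones, pos, strict):
    """Which start changes does flush(pos, end_of_range=True) emit?
    `tombstones` is a list of (start, end) pairs sorted by start."""
    if strict:
        return [t for t in tombstones if t[0] < pos]    # the fix
    return [t for t in tombstones if t[0] <= pos]       # the bug

adjacent = [(0, 5), (5, 10)]   # two adjacent range tombstones
# Flushing exactly at the shared boundary: the fixed version emits only the
# first tombstone's start; the buggy one also leaks the second's start,
# which then ends the stream as an unclosed range tombstone.
fixed = starts_emitted_by_flush(adjacent, 5, strict=True)
buggy = starts_emitted_by_flush(adjacent, 5, strict=False)
```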

Fixes https://github.com/scylladb/scylladb/issues/12462

Closes #13894

* github.com:scylladb/scylladb:
  tests: row_cache: Add reproducer for reader producing missing closing range tombstone
  range_tombstone_change_generator: fix an edge case in flush()
2023-05-15 23:29:08 +02:00
Raphael S. Carvalho
5c66875dbe sstables: Fix use-after-move when making reader in reverse mode
static report:
sstables/mx/reader.cc:1705:58: error: invalid invocation of method 'operator*' on object 'schema' while it is in the 'consumed' state [-Werror,-Wconsumed]
            legacy_reverse_slice_to_native_reverse_slice(*schema, slice.get()), pc, std::move(trace_state), fwd, fwd_mr, monitor);

Fixes #13394.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 213eaab246)
2023-05-15 20:27:34 +03:00
Raphael S. Carvalho
26b4d2c3c1 db/view/build_progress_virtual_reader: Fix use-after-move
Use-after-move in the ctor, which potentially leads to a failure
when locating the table from the moved schema object.

static report
In file included from db/system_keyspace.cc:51:
./db/view/build_progress_virtual_reader.hh:202:40: warning: invalid invocation of method 'operator->' on object 's' while it is in the 'consumed' state [-Wconsumed]
                _db.find_column_family(s->ks_name(), system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),

Fixes #13395.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 1ecba373d6)
2023-05-15 20:26:01 +03:00
Raphael S. Carvalho
874062b72a index/built_indexes_virtual_reader.hh: Fix use-after-move
static report:
./index/built_indexes_virtual_reader.hh:228:40: warning: invalid invocation of method 'operator->' on object 's' while it is in the 'consumed' state [-Wconsumed]
                _db.find_column_family(s->ks_name(), system_keyspace::v3::BUILT_VIEWS),

Fixes #13396.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit f8df3c72d4)
2023-05-15 20:24:35 +03:00
Raphael S. Carvalho
71ec750a59 replica: Fix use-after-move in table::make_streaming_reader
Variant used by
streaming/stream_transfer_task.cc:        , reader(cf.make_streaming_reader(cf.schema(), std::move(permit_), prs))

as full slice is retrieved after schema is moved (clang evaluates
left-to-right), the stream transfer task can be potentially working
on a stale slice for a particular set of partitions.

static report:
In file included from replica/dirty_memory_manager.cc:6:
replica/database.hh:706:83: error: invalid invocation of method 'operator->' on object 'schema' while it is in the 'consumed' state [-Werror,-Wconsumed]
        return make_streaming_reader(std::move(schema), std::move(permit), range, schema->full_slice());

Fixes #13397.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 04932a66d3)
2023-05-15 20:21:48 +03:00
Tomasz Grabiec
7c1bdc6553 tests: row_cache: Add reproducer for reader producing missing closing range tombstone
Adds a reproducer for #12462.

The bug manifests by reader throwing:

  std::logic_error: Stream ends with an active range tombstone: {range_tombstone_change: pos={position: clustered,ckp{},-1}, {tombstone: timestamp=-9223372036854775805, deletion_time=2}}

The reason is that prior to the fix range_tombstone_change_generator::flush()
was used with end_of_range=true to produce the closing range_tombstone_change
and it did not handle correctly the case when there are two adjacent range
tombstones and flush(pos, end_of_range=true) is called such that pos is the
boundary between the two.

Cherry-picked from a717c803c7.
2023-05-15 18:02:40 +02:00
Michał Chojnowski
24d966f806 range_tombstone_change_generator: fix an edge case in flush()
range_tombstone_change_generator::flush() mishandles the case when two range
tombstones are adjacent and flush(pos, end_of_range=true) is called with pos
equal to the end bound of the lesser-position range tombstone.

In such case, the start change of the greater-position rtc will be accidentally
emitted, and there won't be an end change, which breaks reader assumptions by
ending the stream with an unclosed range tombstone, triggering an assertion.

This is due to a non-strict inequality used in a place where strict inequality
should be used. The modified line was intended to close range tombstones
which end exactly on the flush position, but this is unnecessary because such
range tombstones are handled by the last `if` in the function anyway.
Instead, this line caused range tombstones beginning right after the flush
position to be emitted sometimes.

Fixes #12462
2023-05-15 17:48:24 +02:00
Asias He
05a3a1bf55 tombstone_gc: Fix gc_before for immediate mode
The immediate mode is similar to timeout mode with gc_grace_seconds
set to zero. Thus, the gc_before returned should be the query_time instead of
gc_clock::time_point::max in immediate mode.

With gc_before set to gc_clock::time_point::max, a row could be dropped
by compaction even if its TTL has not expired yet.

The following procedure reproduces the issue:

- Start 2 nodes

- Insert data

```
CREATE KEYSPACE ks2a WITH REPLICATION = { 'class' : 'SimpleStrategy',
'replication_factor' : 2 };
CREATE TABLE ks2a.tb (pk int, ck int, c0 text, c1 text, c2 text, PRIMARY
KEY(pk, ck)) WITH tombstone_gc = {'mode': 'immediate'};
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (10 ,1, 'x', 'y', 'z')
USING TTL 1000000;
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (20 ,1, 'x', 'y', 'z')
USING TTL 1000000;
INSERT into ks2a.tb (pk,ck, c0, c1, c2) values (30 ,1, 'x', 'y', 'z')
USING TTL 1000000;
```

- Run nodetool flush and nodetool compact

- Compaction drops all data

```
~128 total partitions merged to 0.
```

Fixes #13572

Closes #13800

(cherry picked from commit 7fcc403122)
2023-05-15 10:33:29 +03:00
Takuya ASADA
f148a6be1d scylla_kernel_check: suppress verbose iotune messages
Stop printing verbose iotune messages during the check; just print the
error message.

Fixes #13373.

Closes #13362

(cherry picked from commit 160c184d0b)
2023-05-14 21:25:57 +03:00
Benny Halevy
5785550e24 view: view_builder: start: demote sleep_aborted log error
This is not really an error, so print it at debug log level
rather than at error log level.

Fixes #13374

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #13462

(cherry picked from commit cc42f00232)
2023-05-14 21:21:59 +03:00
Avi Kivity
401de17c82 Update seastar submodule (condition_variable tasktrace fix)
* seastar aa46b980ec...98504c4bb6 (1):
  > condition-variable: replace the coroutine wakeup task with a promise

Fixes #13368
2023-05-14 21:12:12 +03:00
Raphael S. Carvalho
94c9553e8a Fix use-after-move when initializing row cache with dummy entry
Courtesy of clang-tidy:
row_cache.cc:1191:28: warning: 'entry' used after it was moved [bugprone-use-after-move]
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{_schema});
^
row_cache.cc:1191:60: note: move occurred here
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{_schema});
^
row_cache.cc:1191:28: note: the use and move are unsequenced, i.e. there is no guarantee about the order in which they are evaluated
_partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{*_schema});

The use-after-move is UB; whether it happens depends on evaluation order.

We haven't hit it yet because clang evaluates left-to-right.

Fixes #13400.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13401

(cherry picked from commit d2d151ae5b)
2023-05-14 21:02:24 +03:00
Anna Mikhlin
f1c45553bc release: prepare for 5.2.1 2023-05-08 22:15:46 +03:00
Botond Dénes
1a288e0a78 Update seastar submodule
* seastar 1488aaf8...aa46b980 (1):
  > core/on_internal_error: always log error with backtrace

Fixes: #13786
2023-05-08 10:30:10 +03:00
Marcin Maliszkiewicz
a2fed1588e db: view: use deferred_close for closing staging_sstable_reader
When consume_in_thread throws the reader should still be closed.

Related https://github.com/scylladb/scylla-enterprise/issues/2661

Closes #13398
Refs: scylladb/scylla-enterprise#2661
Fixes: #13413

(cherry picked from commit 99f8d7dcbe)
2023-05-08 09:41:07 +03:00
Botond Dénes
f07a06d390 Merge 'service:forward_service: use long type instead of counter in function mocking' from Michał Jadwiszczak
An aggregation query on a counter column fails because forward_service looks for a function with counter as an argument, and no such function exists. The long type should be used instead.

Fixes: #12939

Closes #12963

* github.com:scylladb/scylladb:
  test:boost: counter column parallelized aggregation test
  service:forward_service: use long type when column is counter

(cherry picked from commit 61e67b865a)
2023-05-07 14:27:29 +03:00
Anna Stuchlik
4ec531d807 doc: remove the sequential repair option from docs
Fixes https://github.com/scylladb/scylladb/issues/12132

The sequential repair mode is not supported. This commit
removes the incorrect information from the documentation.

Closes #13544

(cherry picked from commit 3d25edf539)
2023-05-07 14:27:29 +03:00
Asias He
4867683f80 storage_service: Fix removing replace node as pending
Consider

- n1, n2, n3
- n3 is down
- n4 replaces n3 with the same ip address 127.0.0.3
- Inside the storage_service::handle_state_normal callback for 127.0.0.3 on n1/n2

  ```
  auto host_id = _gossiper.get_host_id(endpoint);
  auto existing = tmptr->get_endpoint_for_host_id(host_id);
  ```

  host_id = new host id
  existing = empty

  As a result, del_replacing_endpoint() will not be called.

This means 127.0.0.3 will not be removed as a pending node on n1 and n2 when
replacing is done. This is wrong.

This is a regression since commit 9942c60d93
(storage_service: do not inherit the host_id of a replaced a node), where
the replacing node uses a different host id than the node it replaces.

To fix, call del_replacing_endpoint() when a node becomes NORMAL and existing
is empty.

Before:
n1:
storage_service - replace[cd1f187a-0eee-4b04-91a9-905ecc499cfc]: Added replacing_node=127.0.0.3 to replace existing_node=127.0.0.3, coordinator=127.0.0.3
token_metadata - Added node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - replace[cd1f187a-0eee-4b04-91a9-905ecc499cfc]: Marked ops done from coordinator=127.0.0.3
storage_service - Node 127.0.0.3 state jump to normal
storage_service - Set host_id=6f9ba4e8-9457-4c76-8e2a-e2be257fe123 to be owned by node=127.0.0.3

After:
n1:
storage_service - replace[28191ea6-d43b-3168-ab01-c7e7736021aa]: Added replacing_node=127.0.0.3 to replace existing_node=127.0.0.3, coordinator=127.0.0.3
token_metadata - Added node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - replace[28191ea6-d43b-3168-ab01-c7e7736021aa]: Marked ops done from coordinator=127.0.0.3
storage_service - Node 127.0.0.3 state jump to normal
token_metadata - Removed node 127.0.0.3 as pending replacing endpoint which replaces existing node 127.0.0.3
storage_service - Set host_id=72219180-e3d1-4752-b644-5c896e4c2fed to be owned by node=127.0.0.3

Tests: https://github.com/scylladb/scylla-dtest/pull/3126

Closes #13677

Fixes: https://github.com/scylladb/scylla-enterprise/issues/2852

(cherry picked from commit a8040306bb)
2023-05-03 14:15:13 +03:00
Botond Dénes
0e42defe06 readers: evictable_reader: skip progress guarantee when next pos is partition start
The evictable reader must ensure that each buffer fill makes forward
progress, i.e. the last fragment in the buffer has a position larger
than the last fragment from the last buffer-fill. Otherwise, the reader
could get stuck in an infinite loop between buffer fills, if the reader
is evicted in-between.
The code guaranteeing this forward progress has a bug: when the next
expected position is a partition-start (another partition), the code
would loop forever, effectively reading all there is from the underlying
reader.
To avoid this, add a special case to ignore the progress guarantee loop
altogether when the next expected position is a partition start. In this
case, progress is guaranteed anyway, because there is exactly one
partition-start fragment in each partition.

Fixes: #13491

Closes #13563

(cherry picked from commit 72003dc35c)
2023-05-02 21:58:41 +03:00
Avi Kivity
f73d017f05 tools: toolchain: regenerate
Fixes #13744
2023-05-02 13:16:59 +03:00
Pavel Emelyanov
3723678b82 scylla-gdb: Parse and eval _all_threads without quotes
I've no idea why the quotes are there at all, it works even without
them. However, with quotes gdb-13 fails to find the _all_threads static
thread-local variable _unless_ it's printed with gdb "p" command
beforehand.

fixes: #13125

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #13132

(cherry picked from commit 537510f7d2)
2023-05-02 13:16:59 +03:00
Botond Dénes
ea506f50cc Merge 'Do not mask node operation errors' from Benny Halevy
This series handles errors when aborting node operations and prints them rather than letting them leak and be exposed to the user.

Also, clean up the node_ops logging formats when aborting different node ops
and add more error logging around errors in the "worker" nodes.

Closes #12799

* github.com:scylladb/scylladb:
  storage_service: node_ops_signal_abort: print a warning when signaling abort
  storage_service: s/node_ops_singal_abort/node_ops_signal_abort/
  storage_service: node_ops_abort: add log messages
  storage_service: wire node_ops_ctl for node operations
  storage_service: add node_ops_ctl class to formalize all node_ops flow
  repair: node_ops_cmd_request: add print function
  repair: do_decommission_removenode_with_repair: log ignore_nodes
  repair: replace_with_repair: get ignore_nodes as unordered_set
  gossiper: get_generation_for_nodes: get nodes as unordered_set
  storage_service: don't let node_ops abort failures mask the real error

(cherry picked from commit 6373452b31)
2023-04-30 18:58:28 +03:00
Kamil Braun
42fd3704e4 Merge 'storage_service: Make node operations safer by detecting asymmetric abort' from Tomasz Grabiec
This patch fixes a problem which affects decommission and removenode
which may lead to data consistency problems under conditions which
lead one of the nodes to unilaterally decide to abort the node
operation without the coordinator noticing.

If this happens during streaming, the node operation coordinator would
proceed to make a change in the gossiper, and only later detect that
one of the nodes aborted during sending of decommission_done or
removenode_done command. That's too late, because the operation will
be finalized by all the nodes once gossip propagates.

It's unsafe to finalize the operation when another node has aborted. The
other node reverted to the old topology, which it was running with
for some time, without considering the pending replica when handling
requests. As a result, we may end up with consistency issues. Writes
made by those coordinators may not be replicated to CL replicas in the
new topology. Streaming may have failed to replicate those writes
depending on timing.

It's possible that some node aborts but streaming succeeds if the
abort is not due to network problems, or if the network problems are
transient and/or localized and affect only heartbeats.

There is no way to revert after we commit the node operation to the
gossiper, so it's ok to close node_ops sessions before making the
change to the gossiper, and thus detect aborts and prevent later aborts
after the change in the gossiper is made. This is already done during
bootstrap (RBNO enabled) and replacenode. This patch changes removenode
to also take this approach by moving sending of remove_done earlier.

We cannot take this approach with decommission easily, because
decommission_done command includes a wait for the node to leave the
ring, which won't happen before the change to the gossiper is
made. Separating this from decommission_done would require protocol
changes. This patch adds a second-best solution, which is to check if
sessions are still there right before making a change to the gossiper,
leaving decommission_done where it was.

The race can still happen, but the time window is now much smaller.

The PR also lays down infrastructure which enables testing the scenarios. It makes node ops
watchdog periods configurable, and adds error injections.

Fixes #12989
Refs #12969

Closes #13028

* github.com:scylladb/scylladb:
  storage_service: node ops: Extract node_ops_insert() to reduce code duplication
  storage_service: Make node operations safer by detecting asymmetric abort
  storage_service: node ops: Add error injections
  service: node_ops: Make watchdog and heartbeat intervals configurable

(cherry picked from commit 2b44631ded)
2023-04-30 18:58:28 +03:00
Asias He
c9d19b3595 storage_service: Wait for normal state handler to finish in replace
Similar to "storage_service: Wait for normal state handler to finish in
bootstrap", this patch enables the check on the replace procedure.

(cherry picked from commit 5856e69462)
2023-04-30 18:58:28 +03:00
Asias He
9a873bf4b3 storage_service: Wait for normal state handler to finish in bootstrap
In storage_service::handle_state_normal, storage_service::notify_joined
will be called, which drops the rpc connections to the node that becomes
normal. This causes rpc calls to that node to fail with a
seastar::rpc::closed_error.

Consider this:

- n1 in the cluster
- n2 is added to join the cluster
- n2 sees n1 is in normal status
- n2 starts bootstrap process
- notify_joined on n2 closes rpc connection to n1 in the middle of
  bootstrap
- n2 fails to bootstrap

For example, during bootstrap with RBNO, we saw repair failed in a
test that sets ring_delay to zero and does not wait for gossip to
settle.

repair - repair[9cd0dbf8-4bca-48fc-9b1c-d9e80d0313a2]: sync data for
keyspace=system_distributed_everywhere, status=failed:
std::runtime_error ({shard 0: seastar::rpc::closed_error (connection is
closed)})

This patch fixes the race by waiting for the handle_state_normal handler
to finish before the bootstrap process.

Fixes #12764
Fixes #12956

(cherry picked from commit 53636167ca)
2023-04-30 18:58:28 +03:00
Asias He
51a00280a2 storage_service: Send heartbeat earlier for node ops
Node ops has the following procedure:

1   for node in sync_nodes
      send prepare cmd to node

2   for node in sync_nodes
      send heartbeat cmd to node

If any of the prepare cmd in step 1 takes longer than the heartbeat
watchdog timeout, the heartbeat in step 2 will be too late to update the
watchdog, as a result the watchdog will abort the operation.

To prevent a slow prepare cmd from killing the node operation, we can start the
heartbeat earlier in the procedure.

Fixes #11011
Fixes #12969

Closes #12980

(cherry picked from commit ba919aa88a)
2023-04-30 18:58:28 +03:00
Wojciech Mitros
b0a7c02e09 rust: update dependencies
Cranelift-codegen 0.92.0 and wasmtime 5.0.0 have security issues
potentially allowing malicious UDFs to read some memory outside
the wasm sandbox. This patch updates them to versions 0.92.1
and 5.0.1 respectively, where the issues are fixed.

Fixes #13157

Closes #13171

(cherry picked from commit aad2afd417)
2023-04-27 22:01:44 +03:00
Wojciech Mitros
f18c49dcc6 rust: update dependencies
Wasmtime added some improvements in recent releases - particularly,
two security issues were patched in version 2.0.2. There were no
breaking changes for our use other than the strategy of returning
Traps - all of them are now anyhow::Errors instead, but we can
still downcast to them, and read the corresponding error message.

The cxx, anyhow and futures dependency versions now match the
versions saved in the Cargo.lock.

Closes #12830

(cherry picked from commit 8b756cb73f)

Ref #13157
2023-04-27 22:00:54 +03:00
Anna Stuchlik
35dfec78d1 doc: fixes https://github.com/scylladb/scylladb/issues/12964, removes the information that the CDC options are experimental
Closes #12973

(cherry picked from commit 4dd1659d0b)
2023-04-27 21:06:49 +03:00
Raphael S. Carvalho
dbd8ca4ade replica: Fix undefined behavior in table::generate_and_propagate_view_updates()
Undefined behavior because the evaluation order is undefined.

With GCC, where evaluation is right-to-left, schema will be moved
once it's forwarded to make_flat_mutation_reader_from_mutations_v2().

The consequence is that memory tracking of mutation_fragment_v2
(for tracking only permit used by view update), which uses the schema,
can be incorrect. However, it's more likely that Scylla will crash
when estimating memory usage for row, which access schema column
information using schema::column_at(), which in turn asserts that
the requested column does really exist.

Fixes #13093.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13092

(cherry picked from commit 3fae46203d)
2023-04-27 19:56:38 +03:00
Anna Stuchlik
1be4afb842 doc: remove incorrect info about BYPASS CACHE
Fixes https://github.com/scylladb/scylladb/issues/13106

This commit removes the information that BYPASS CACHE
is an Enterprise-only feature and replaces that info
with the link to the BYPASS CACHE description.

Closes #13316

(cherry picked from commit 1cfea1f13c)
2023-04-27 19:54:04 +03:00
Kefu Chai
7cc9f5a05f dist/redhat: enforce dependency on %{release} also
* tools/python3 279b6c1...cf7030a (1):
  > dist: redhat: provide only a single version

s/%{version}/%{version}-%{release}/ in `Requires:` sections.

this enforces that scylla packages depend at runtime on exactly
the same release of one another.

Fixes #13222
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 7165551fd7)
2023-04-27 19:27:34 +03:00
Nadav Har'El
bf7fc9709d test/rest_api: fix flaky test for toppartitions
The REST test test_storage_service.py::test_toppartitions_pk_needs_escaping
was flaky. It tests the toppartition request, which unfortunately needs
to choose a sampling duration in advance, and we chose 1 second which we
considered more than enough - and indeed typically even 1ms is enough!
but very rarely (we know of only one occurrence, in issue #13223) one
second is not enough.

Instead of increasing this 1 second and making this test even slower,
this patch takes a retry approach: The test starts with a 0.01-second
duration, and is then retried with increasing durations until it succeeds
or a 5-seconds duration is reached. This retry approach has two benefits:
1. It de-flakes the test (allowing a very slow test to take 5 seconds
instead of the 1 second which wasn't enough), and 2. At the same time it
makes a successful test much faster (it used to always take a full
second, now it takes 0.07 seconds on a dev build on my laptop).

A *failed* test may, in some cases, take 10 seconds after this patch
(although in some other cases, an error will be caught immediately),
but I consider this acceptable - this test should pass, after all,
and a failure indicates a regression and taking 10 seconds will be
the last of our worries in that case.

Fixes #13223.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13238

(cherry picked from commit c550e681d7)
2023-04-27 19:16:58 +03:00
Nadav Har'El
00a8c3a433 test/alternator: increase CQL connection timeout
This patch increases the connection timeout in the get_cql_cluster()
function in test/cql-pytest/run.py. This function is used to test
that Scylla came up, and also test/alternator/run uses it to set
up the authentication - which can only be done through CQL.

The Python driver has 2-second and 5-second default timeouts that should
have been more than enough for everybody (TM), but in #13239 we saw
that in one case it apparently wasn't enough. So to be extra safe,
let's increase the default connection-related timeouts to 60 seconds.

Note this change only affects the Scylla *boot* in the test/*/run
scripts, and it does not affect the actual tests - those have different
code to connect to Scylla (see cql_session() in test/cql-pytest/util.py),
and we already increased the timeouts there in #11289.

Fixes #13239

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13291

(cherry picked from commit 4fdcee8415)
2023-04-27 19:15:39 +03:00
Tomasz Grabiec
c08ed39a33 direct_failure_detector: Avoid throwing exceptions in the success path
sleep_abortable() is aborted on success, which causes sleep_aborted
exception to be thrown. This causes scylla to throw every 100ms for
each pinged node. Throwing may reduce performance if it happens often.

Also, it spams the logs if --logger-log-level exception=trace is enabled.

Avoid by swallowing the exception on cancellation.

Fixes #13278.

Closes #13279

(cherry picked from commit 99cb948eac)
2023-04-27 19:14:31 +03:00
Kefu Chai
04424f8956 test: cql-pytest: test_describe: clamp bloom filter's fp rate
before this change, we use `round(random.random(), 5)` for
the value of `bloom_filter_fp_chance` config option. there are
chances that this expression could return a number lower than or equal
to 6.71e-05.

but we do have a minimum for this option, which is defined by
`utils::bloom_calculations::probs`. and the minimum false positive
rate is 6.71e-05.

we are observing test failures where we are using 0 for
the option, and scylla right rejected it with the error message of
```
bloom_filter_fp_chance must be larger than 6.71e-05 and less than or equal to 1.0 (got 0)
```.

so, in this change, to address the test failure, we clamp the randomly
picked number to be greater than or equal to the minimum, to
ensure that it is in the range of supported
false positive rates.

Fixes #13313
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #13314

(cherry picked from commit 33f4012eeb)
2023-04-27 19:12:53 +03:00
Beni Peled
429b696bbc release: prepare for 5.2.0 2023-04-27 16:26:43 +03:00
Beni Peled
a89867d8c2 release: prepare for 5.2.0-rc5 2023-04-25 14:37:54 +03:00
Benny Halevy
6ad94fedf3 utils: clear_gently: do not clear null unique_ptr
Otherwise the null pointer is dereferenced.

Add a unit test reproducing the issue
and testing this fix.

Fixes #13636

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 12877ad026)
2023-04-24 17:51:01 +03:00
Anna Stuchlik
a6188d6abc doc: document tombstone_gc as not experimental
The tombstone_gc was documented as experimental in version 5.0.
It is no longer experimental in version 5.2.
This commit updates the information about the option.

Closes #13469

(cherry picked from commit a68b976c91)
2023-04-24 11:54:06 +03:00
Botond Dénes
50095cc3a5 Merge 'db: system_keyspace: use microsecond resolution for group0_history range tombstone' from Kamil Braun
in `make_group0_history_state_id_mutation`, when adding a new entry to
the group 0 history table, if the parameter `gc_older_than` is engaged,
we create a range tombstone in the mutation which deletes entries older
than the new one by `gc_older_than`. In particular if
`gc_older_than = 0`, we want to delete all older entries.

There was a subtle bug there: we were using millisecond resolution when
generating the tombstone, while the provided state IDs used microsecond
resolution. On a super fast machine it could happen that we managed to
perform two schema changes in a single millisecond; this happened
sometimes in `group0_test.test_group0_history_clearing_old_entries`
on our new CI/promotion machines, causing the test to fail because the
tombstone didn't clear the entry correspodning to the previous schema
change when performing the next schema change (since they happened in
the same millisecond).

Use microsecond resolution to fix that. The consecutive state IDs used
in group 0 mutations are guaranteed to be strictly monotonic at
microsecond resolution (see `generate_group0_state_id` in
service/raft/raft_group0_client.cc).

Fixes #13594

Closes #13604

* github.com:scylladb/scylladb:
  db: system_keyspace: use microsecond resolution for group0_history range tombstone
  utils: UUID_gen: accept decimicroseconds in min_time_UUID

(cherry picked from commit 10c1f1dc80)
2023-04-23 16:03:02 +03:00
Botond Dénes
7b2215d8e0 Merge 'Backport bugfixes regarding UDT, UDF, UDA interactions to branch-5.2' from Wojciech Mitros
This patch backports https://github.com/scylladb/scylladb/pull/12710 to branch-5.2. To resolve the conflicts that it's causing, it also includes
* https://github.com/scylladb/scylladb/pull/12680
* https://github.com/scylladb/scylladb/pull/12681

Closes #13542

* github.com:scylladb/scylladb:
  uda: change the UDF used in a UDA if it's replaced
  functions: add helper same_signature method
  uda: return aggregate functions as shared pointers
  udf: also check reducefunc to confirm that a UDF is not used in a UDA
  udf: fix dropping UDFs that share names with other UDFs used in UDAs
  pytest: add optional argument for new_function argument types
  udt: disallow dropping a user type used in a user function
2023-04-19 01:38:08 -04:00
Botond Dénes
da9f90362d Merge 'Compaction reevaluation bug fixes' from Raphael "Raph" Carvalho
A problem in compaction reevaluation can cause the SSTable set to be left uncompacted for an indefinite amount of time, potentially causing space and read amplification to be suboptimal.

Two reevaluation problems are being fixed: one after off-strategy compaction ends, and another in the compaction manager, which is meant to periodically reevaluate the need for compaction.

Fixes https://github.com/scylladb/scylladb/issues/13429.
Fixes https://github.com/scylladb/scylladb/issues/13430.

Closes #13431

* github.com:scylladb/scylladb:
  compaction: Make compaction reevaluation actually periodic
  replica: Reevaluate regular compaction on off-strategy completion

(cherry picked from commit 9a02315c6b)
2023-04-19 01:14:33 -04:00
Botond Dénes
c9a17c80f6 mutation/mutation_compactor: consume_partition_end(): reset _stop
The purpose of `_stop` is to remember whether the consumption of the
last partition was interrupted or it was consumed fully. In the former
case, the compactor allows retreiving the compaction state for the given
partition, so that its compaction can be resumed at a later point in
time.
Currently, `_stop` is set to `stop_iteration::yes` whenever the return
value of any of the `consume()` methods is also `stop_iteration::yes`.
Meaning, if the consuming of the partition is interrupted, this is
remembered in `_stop`.
However, a partition whose consumption was interrupted is not always
continued later. Sometimes consumption of a partition is interrupted
because the partition is not interesting and the downstream consumer
wants to stop it. In these cases the compactor should not return an
engaged optional from `detach_state()`, because there is no state to
detach; the state should be thrown away. This was handled incorrectly so
far and is fixed in this patch by overwriting `_stop` in
`consume_partition_end()` with whatever the downstream consumer returns.
Meaning if they want to skip the partition, then `_stop` is reset to
`stop_iteration::no` and `detach_state()` will return a disengaged
optional as it should in this case.

Fixes: #12629

Closes #13365

(cherry picked from commit bae62f899d)
2023-04-18 02:32:24 -04:00
Wojciech Mitros
7242c42089 uda: change the UDF used in a UDA if it's replaced
Currently, if a UDA uses a UDF that's being replaced,
the UDA will still keep using the old UDF until the
node is restarted.
This patch fixes this behavior by checking all UDAs
when replacing a UDF and updating them if necessary.

Fixes #12709

(cherry picked from commit 02bfac0c66)
2023-04-17 13:14:46 +02:00
Wojciech Mitros
70ff69afab functions: add helper same_signature method
When deciding whether two functions have the same
signature, we have to check if they have the same name
and parameter types. Additionally, if they're represented
by pointers, we need to check if any of them is a nullptr.
This logic is used multiple times, so it's extracted to
a separate function.
To use this function, the `used_by_user_aggregate` method
takes now a function instead of name and types list - we
can do it because we always use it with an existing user
function (that we're trying to drop).
The method will also be useful when we'll be not dropping,
but replacing a user function.

(cherry picked from commit 58987215dc)
2023-04-17 13:14:40 +02:00
Wojciech Mitros
5fd4bb853b uda: return aggregate functions as shared pointers
We will want to reuse the functions that we get from an aggregate
without making a deep copy, and it's only possible if we get
pointers from the aggregate instead of actual values.

(cherry picked from commit 20069372e7)
2023-04-17 13:14:24 +02:00
Wojciech Mitros
313649e86d udf: also check reducefunc to confirm that a UDF is not used in a UDA
When dropping a UDF we're checking if it's not being used in any UDAs
and fail otherwise. However, we're only checking its state function
and final function, and it may also be used as its reduce function.
This patch adds the missing checks and a test for them.

(cherry picked from commit ef1dac813b)
2023-04-17 13:14:16 +02:00
Wojciech Mitros
14d8cec130 udf: fix dropping UDFs that share names with other UDFs used in UDAs
Currently, when dropping a function, we only check if there exists
an aggregate that uses a function with the same name as its state
function or final function. This may cause the drop to fail when it's
just another UDF with the same name that's used in the
aggregate, and the function actually being dropped is not used there.
This patch fixes this by checking not only the names of the
UDA's sfunc and finalfunc, but also their argument types.

(cherry picked from commit 49077dd144)
2023-04-17 13:14:09 +02:00
Wojciech Mitros
203cbb79a1 pytest: add optional argument for new_function argument types
When multiple functions with the same name but different argument types
are created, the default drop statement for these functions will fail
because it does not include the argument types.
With this change, this problem can be worked around by specifying
argument types when creating the function, as this will cause the drop
statement to include them.

(cherry picked from commit 8791b0faf5)
2023-04-17 13:13:59 +02:00
Wojciech Mitros
51f19d1b8c udt: disallow dropping a user type used in a user function
Currently, nothing prevents us from dropping a user type
used in a user function, even though doing so may make us
unable to use the function correctly.
This patch prevents this behavior by checking all function
argument and return types when executing a drop type statement
and preventing it from completing if the type is referenced
by any of them.

(cherry picked from commit 86c61828e6)
2023-04-17 13:13:35 +02:00
Anna Stuchlik
83735ae77f doc: update the metrics between 5.2 and 2023.1
Related: https://github.com/scylladb/scylla-enterprise/issues/2794

This commit adds the information about the metric changes
in version 2023.1 compared to version 5.2.

This commit is part of the 5.2-to-2023.1 upgrade guide and
must be backported to branch-5.2.

Closes #13506

(cherry picked from commit 989a75b2f7)
2023-04-17 11:29:43 +02:00
Avi Kivity
9d384e3af2 Merge 'Backport "reader_concurrency_semaphore: don't evict inactive readers needlessly" to branch-5.2' from Botond Dénes
The patch doesn't apply cleanly, so a targeted backport PR was necessary.
I also needed to cherry-pick two patches from https://github.com/scylladb/scylladb/pull/13255 that the backported patch depends on. Decided against backporting the entire https://github.com/scylladb/scylladb/pull/13255 as it is quite an intrusive change.

Fixes: https://github.com/scylladb/scylladb/issues/11803

Closes #13515

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: don't evict inactive readers needlessly
  reader_concurrency_semaphore: add stats to record reason for queueing permits
  reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
2023-04-17 12:25:21 +03:00
Nadav Har'El
0da0c94f49 cql: USING TTL 0 means unlimited, not default TTL
Our documentation states that writing an item with "USING TTL 0" means it
should never expire. This should be true even if the table has a default
TTL. But Scylla mistakenly handled "USING TTL 0" exactly like having no
USING TTL at all (i.e., it took the default TTL, instead of unlimited).
We had two xfailing tests demonstrating that Scylla's behavior in this
is different from Cassandra. Scylla's behavior in this case was also
undocumented.

By the way, Cassandra used to have the same bug (CASSANDRA-11207) but
it was fixed already in 2016 (Cassandra 3.6).

So in this patch we fix Scylla's "USING TTL 0" behavior to match the
documentation and Cassandra's behavior since 2016. One xfailing test
starts to pass, and the second test gets past this bug but fails on a
different one. This patch also adds a third test for "USING TTL ?"
with UNSET_VALUE - it behaves, on both Scylla and Cassandra, like a
missing "USING TTL".

The origin of this bug was that after parsing the statement, we saved
the USING TTL in an integer, and used 0 for the case of no USING TTL
given. This meant that we couldn't tell if we have USING TTL 0 or
no USING TTL at all. This patch uses an std::optional so we can tell
the case of a missing USING TTL from the case of USING TTL 0.

Fixes #6447

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13079

(cherry picked from commit a4a318f394)
2023-04-17 10:41:08 +03:00
Nadav Har'El
1a9f51b767 cql: fix empty aggregation, and add more tests
This patch fixes #12475, where an aggregation (e.g., COUNT(*), MIN(v))
of absolutely no partitions (e.g., "WHERE p = null" or "WHERE p in ()")
resulted in an internal error instead of the "zero" result that each
aggregator expects (e.g., 0 for COUNT, null for MIN).

The problem is that normally our aggregator forwarder picks the nodes
which hold the relevant partition(s), forwards the request to each of
them, and then combines these results. When there are no partitions,
the query is sent to no node, and we end up with an empty result set
instead of the "zero" results. So in this patch we recognize this
case and build those "zero" results (as mentioned above, these aren't
always 0 and depend on the aggregation function!).

The patch also adds two tests reproducing this issue in a fairly general
way (e.g., several aggregators, different aggregation functions) and
confirming the patch fixes the bug.

The test also includes two additional tests for COUNT aggregation, which
uncovered an incompatibility with Cassandra which is still not fixed -
so these tests are marked "xfail":

Refs #12477: Combining COUNT with GROUP BY yields empty results
             in Cassandra, but one result with an empty count in Scylla.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12715

(cherry picked from commit 3ba011c2be)
2023-04-17 10:41:08 +03:00
Raphael S. Carvalho
dba0e604a7 table: Fix disk-space related metrics
The "total disk space used" metric incorrectly reports the amount of
disk space ever used. It should instead report the size of all sstables
in use plus the ones waiting to be deleted.
"Live disk space used", by this definition, shouldn't count the
sstables waiting to be deleted.
And "live sstable count" shouldn't count sstables waiting to
be deleted either.

Fix all that.

Fixes #12717.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 529a1239a9)
2023-04-16 22:14:01 +03:00
Michał Chojnowski
4ea67940cb locator: token_metadata: get rid of a quadratic behaviour in get_address_ranges()
Some callees of update_pending_ranges use the variant of get_address_ranges()
which builds a hashmap of all <endpoint, owned range> pairs. For
everywhere_topology, the size of this map is quadratic in the number of
endpoints, making it big enough to cause contiguous allocations of tens of MiB
for clusters of realistic size, potentially causing trouble for the
allocator (as seen e.g. in #12724). This deserves a correction.

This patch removes the quadratic variant of get_address_ranges() and replaces
its uses with its linear counterpart.

Refs #10337
Refs #10817
Refs #10836
Refs #10837
Fixes #12724

(cherry picked from commit 9e57b21e0c)
2023-04-16 21:59:14 +03:00
Jan Ciolek
a8c49c44e5 cql/query_options: add a check for missing bind marker name
There was a missing check in validation of named
bind markers.

Let's say that a user prepares a query like:
```cql
INSERT INTO ks.tab (pk, ck, v) VALUES (:pk, :ck, :v)
```
Then they execute the query, but specify only
values for `:pk` and `:ck`.

We should detect that a value for :v is missing
and throw an invalid_request_exception.

Until now there was no such check; in the case of a missing variable,
invalid `query_options` were created and Scylla crashed.

Sadly it's impossible to create a regression test
using `cql-pytest` or `boost`.

`cql-pytest` uses the python driver, which silently
ignores missing named bind variables, deciding
that the user meant to send an UNSET_VALUE for them.
When given values like `{'pk': 1, 'ck': 2}`, it will automatically
extend them to `{'pk': 1, 'ck': 2, 'v': UNSET_VALUE}`.

In `boost` I tried to use `cql_test_env`,
but it only has methods which take valid `query_options`
as a parameter. I could create separate unit tests
for the creation and validation of `query_options`,
but they wouldn't be true end-to-end tests like `cql-pytest`.

The bug was found using the rust driver,
the reproducer is available in the issue description.

Fixes: #12727

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #12730

(cherry picked from commit 2a5ed115ca)
2023-04-16 21:57:28 +03:00
Nadav Har'El
12a29edf90 test/alternator: fix flaky test for partition-tombstone scan
The test test_scan.py::test_scan_long_partition_tombstone_string
checks that a full-table Scan operation ends a page in the middle of
a very long string of partition tombstones, and does NOT scan the
entire table in one page (if we did that, getting a single page could
take an unbounded amount of time).

The test is currently flaky, having failed in CI runs three times in
the past two months.

The reason for the flakiness is that we don't know exactly how long
we need to make the sequence of partition tombstones in the test before
we can be absolutely sure a single page will not read this entire sequence.
For single-partition scans we have the "query_tombstone_page_limit"
configuration parameter, which tells us exactly how long we need to
make the sequence of row tombstones. But for a full-table scan of
partition tombstones, the situation is more complicated - because the
scan is done on several vnodes in parallel, and each of
them needs to read query_tombstone_page_limit before it stops.

In my experiments, using query_tombstone_page_limit * 4 consecutive tombstones
was always enough - I ran this test hundreds of times and it didn't fail
once. But since it did fail on Jenkins very rarely (3 times in the last
two months), maybe the multiplier 4 isn't enough. So this patch doubles
it to 8. Hopefully this would be enough for anyone (TM).

This makes this test even bigger and slower than it was. To make it
faster, I changed this test's write isolation mode from the default
always_use_lwt to forbid_rmw (not use LWT). This leaves the test's
total run time to be similar to what it was before this patch - around
0.5 seconds in dev build mode on my laptop.

Fixes #12817

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12819

(cherry picked from commit 14cdd034ee)
2023-04-14 11:54:45 +03:00
Botond Dénes
3e10c3fc89 reader_concurrency_semaphore: don't evict inactive readers needlessly
Inactive readers should only be evicted to free up resources for waiting
readers. Evicting them when waiters are not admitted for any reason other
than resources is wasteful and leads to extra load later on, when
these evicted readers have to be recreated and requeued.
This patch changes the logic on both the registering path and the
admission path to not evict inactive readers unless there are readers
actually waiting on resources.
A unit test is also added, reproducing the overly-aggressive eviction and
checking that it doesn't happen anymore.

Fixes: #11803

Closes #13286

(cherry picked from commit bd57471e54)
2023-04-14 10:37:30 +03:00
Botond Dénes
f11deb5074 reader_concurrency_semaphore: add stats to record reason for queueing permits
When diagnosing problems, knowing why permits were queued is very
valuable. Record the reason in a new stats, one for each reason a permit
can be queued.

(cherry picked from commit 7b701ac52e)
2023-04-14 10:37:30 +03:00
Botond Dénes
1baf9dddd7 reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
So the caller can bump the appropriate counters or log the reason why
the request cannot be admitted.

(cherry picked from commit bb00405818)
2023-04-14 09:30:02 +03:00
Kamil Braun
9717ff5057 docs: cleaning up after failed membership change
After a failed topology operation, like bootstrap / decommission /
removenode, the cluster might contain a garbage entry in either token
ring or group 0. This entry can be cleaned-up by executing removenode on
any other node, pointing to the node that failed to bootstrap or leave
the cluster.

Document this procedure, including a method of finding the host ID of a
garbage entry.

Add references in other documents.

Fixes: #13122

Closes #13186

(cherry picked from commit c2a2996c2b)
2023-04-13 10:35:02 +02:00
Anna Stuchlik
b293b1446f doc: remove Enterprise upgrade guides from OSS doc
This commit removes the Enterprise upgrade guides from
the Open Source documentation. The Enterprise upgrade guides
should only be available in the Enterprise documentation,
with the source files stored in scylla-enterprise.git.

In addition, this commit:
- adds the links to the Enterprise user guides in the Enterprise
documentation at https://enterprise.docs.scylladb.com/
- adds the redirections for the removed pages to avoid
breaking any links.

This commit must be reverted in scylla-enterprise.git.

(cherry picked from commit 61bc05ae49)

Closes #13473
2023-04-11 14:26:35 +03:00
Yaron Kaikov
e6f7ac17f6 doc: update supported os for 2022.1
Ubuntu 22.04 is already supported on both `5.0` and `2022.1`.

This updates the table accordingly.

Closes #13340

(cherry picked from commit c80ab78741)
2023-04-05 13:56:07 +03:00
Anna Stuchlik
36619fc7d9 doc: add upgrade guide from 5.2 to 2023.1
Related: https://github.com/scylladb/scylla-enterprise/issues/2770

This commit adds the upgrade guide from ScyllaDB Open Source 5.2
to ScyllaDB Enterprise 2023.1.
This commit does not cover metric updates (the metrics file has no
content, which needs to be added in another PR).

As this is an upgrade guide, this commit must be merged to master and
backported to branch-5.2 and branch-2023.1 in scylla-enterprise.git.

Closes #13294

(cherry picked from commit 595325c11b)
2023-04-05 06:43:01 +03:00
Anna Stuchlik
750414c196 doc: update Raft doc for versions 5.2 and 2023.1
Fixes https://github.com/scylladb/scylladb/issues/13345
Fixes https://github.com/scylladb/scylladb/issues/13421

This commit updates the Raft documentation page to be up to date in versions 5.2 and 2023.1.

- Irrelevant information about previous releases is removed.
- Some information is clarified.
- Mentions of version 5.2 are either removed (if possible) or version 2023.1 is added.

Closes #13426

(cherry picked from commit 447ce58da5)
2023-04-05 06:42:28 +03:00
Botond Dénes
128050e984 Merge 'commitlog: Fix updating of total_size_on_disk on segment alloc when o_dsync is off' from Calle Wilund
Fixes #12810

We did not update total_size_on_disk in commitlog totals when o_dsync was off.
This means we essentially ran with no registered footprint, also causing broken comparisons in delete_segments.

Closes #12950

* github.com:scylladb/scylladb:
  commitlog: Fix updating of total_size_on_disk on segment alloc when o_dsync is off
  commitlog: change type of stored size

(cherry picked from commit e70be47276)
2023-04-03 08:57:43 +03:00
Yaron Kaikov
d70751fee3 release: prepare for 5.2.0-rc4 2023-04-02 16:40:56 +03:00
Tzach Livyatan
1fba43c317 docs: minor improvements to the Raft Handling Failures and recovery procedure sections
Closes #13292

(cherry picked from commit 46e6c639d9)
2023-03-31 11:22:20 +02:00
Botond Dénes
e380c24c69 Merge 'Improve database shutdown verbosity' from Pavel Emelyanov
The `database::stop` method sometimes hangs, and it's always hard to spot where exactly it sleeps. A few more logging messages make this much simpler.

refs: #13100
refs: #10941

Closes #13141

* github.com:scylladb/scylladb:
  database: Increase verbosity of database::stop() method
  large_data_handler: Increase verbosity on shutdown
  large_data_handler: Coroutinize .stop() method

(cherry picked from commit e22b27a107)
2023-03-30 17:01:24 +03:00
Avi Kivity
76a76a95f4 Update tools/java submodule (hdrhistogram with Java 11)
* tools/java 1c4e1e7a7d...83b2168b19 (1):
  > Fix cassandra-stress -log hdrfile=... with java 11

Fixes #13287
2023-03-29 14:10:27 +03:00
Anna Stuchlik
f6837afec7 doc: update the Ubuntu version used in the image
Starting from 5.2 and 2023.1 our images are based on Ubuntu:22.04.
See https://github.com/scylladb/scylladb/issues/13138#issuecomment-1467737084

This commit adds that information to the docs.
It should be merged and backported to branch-5.2.

Closes #13301

(cherry picked from commit 9e27f6b4b7)
2023-03-27 14:08:57 +03:00
Botond Dénes
6350c8836d Revert "repair: Reduce repair reader eviction with diff shard count"
This reverts commit c6087cf3a0.

Said commit can cause a deadlock when 2 or more repairs compete for
locks on 2 or more nodes. Consider the following scenario:

Node n1 and n2 in the cluster, 1 shard per node, rf = 2, each shard has
1 available unit for the reader lock

    n1 starts repair r1
    r1-n1 (instance of r1 on node1) takes the reader lock on node1
    n2 starts repair r2
    r2-n2 (instance of r2 on node2) takes the reader lock on node2
    r1-n2 will fail to take the reader lock on node2
    r2-n1 will fail to take the reader lock on node1

As a result, r1 and r2 could not make progress and deadlock happens.

The complexity comes from the fact that a repair job needs the lock on
more than one node. It is not guaranteed that all the participant nodes
can take the lock in one shot.

There is no simple solution to this, so we have to revert this locking
mechanism and look for another way to prevent reader thrashing when
repairing nodes with mismatching shard count.

Fixes: #12693

Closes #13266

(cherry picked from commit 7699904c54)
2023-03-24 09:44:16 +02:00
Avi Kivity
5457948437 Update seastar submodule (rpc cancellation during negotiation)
* seastar 8889cbc198...1488aaf842 (1):
  > Merge 'Keep outgoing queue all cancellable while negotiating (again)' from Pavel Emelyanov

Fixes #11507.
2023-03-23 17:15:00 +02:00
Avi Kivity
da41001b5c .gitmodules: point seastar submodule at scylla-seastar.git
This allows us to backport seastar commits.

Ref #11507.
2023-03-23 17:11:43 +02:00
Anna Stuchlik
dd61e8634c doc: related https://github.com/scylladb/scylladb/issues/12754; add the missing information about reporting latencies to the upgrade guide 5.1 to 5.2
Closes #12935

(cherry picked from commit 26bb36cdf5)
2023-03-22 10:38:28 +02:00
Anna Stuchlik
b642b4c30e doc: fix the service name in upgrade guides
Fixes https://github.com/scylladb/scylladb/issues/13207

This commit fixes the service and package names in
the upgrade guides 5.0-to-2022.1 and 5.1-to-2022.2.
Service name: scylla-server
Package name: scylla-enterprise

Previous PRs to fix the same issue in other
upgrade guides:
https://github.com/scylladb/scylladb/pull/12679
https://github.com/scylladb/scylladb/pull/12698

This commit must be backported to branch-5.1 and branch 5.2.

Closes #13225

(cherry picked from commit 922f6ba3dd)
2023-03-22 10:37:12 +02:00
Botond Dénes
c013336121 db/view/view_update_check: check_needs_view_update_path(): filter out non-member hosts
We currently don't clean up the system_distributed.view_build_status
table after removed nodes. This can cause a false-positive check for
whether view update generation is needed for streaming.
The proper fix is to clean up this table, but that will be more
involved, and even when done, it might not be immediate. So until then,
and to be on the safe side, filter out entries belonging to unknown
hosts from said table.

Fixes: #11905
Refs: #11836

Closes #11860

(cherry picked from commit 84a69b6adb)
2023-03-22 09:03:50 +02:00
Kamil Braun
b6b35ce061 service: storage_proxy: sequence CDC preimage select with Paxos learn
`paxos_response_handler::learn_decision` was calling
`cdc_service::augment_mutation_call` concurrently with
`storage_proxy::mutate_internal`. `augment_mutation_call` was selecting
rows from the base table in order to create the preimage, while
`mutate_internal` was writing rows to the table. It was therefore
possible for the preimage to observe the update that it accompanied,
which doesn't make any sense, because the preimage is supposed to show
the state before the update.

Fix this by performing the operations sequentially. We can still perform
the CDC mutation write concurrently with the base mutation write.

`cdc_with_lwt_test` was sometimes failing in debug mode due to this bug
and was marked flaky. Unmark it.

Fixes #12098

(cherry picked from commit 1ef113691a)
2023-03-21 20:23:19 +02:00
Petr Gusev
069e38f02d transport server: fix unexpected server errors handling
If request processing ends with an error, we should send
the error to the client through make_error/write_response.
Previously in this case we just wrote a message to the log
and didn't handle the client connection in any way. As a
result, the only thing the client got was a timeout error.

A new test_batch_with_error is added. It is quite
difficult to reproduce error condition in a test,
so we use error injection instead. Passing injection_key
in the body of the request ensures that the exception
will be thrown only for this test request and
will not affect other requests that
the driver may send in the background.

Closes: scylladb#12104
(cherry picked from commit a4cf509c3d)
2023-03-21 20:23:09 +02:00
Anna Mikhlin
61a8003ad1 release: prepare for 5.2.0-rc3 2023-03-20 10:10:27 +02:00
Botond Dénes
8a17066961 Merge 'doc: Updates the recommended OS to be Ubuntu 22.04' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/13138
Fixes https://github.com/scylladb/scylladb/issues/13153

This PR:

- Fixes outdated information about the recommended OS. Since version 5.2, the recommended OS should be Ubuntu 22.04 because that OS is used for building the ScyllaDB image.
- Adds the OS support information for version 5.2.

This PR (both commits) needs to be backported to branch-5.2.

Closes #13188

* github.com:scylladb/scylladb:
  doc: Add OS support for version 5.2
  doc: Updates the recommended OS to be Ubuntu 22.04

(cherry picked from commit f4b5679804)
2023-03-17 10:30:06 +02:00
Pavel Emelyanov
487ba9f3e1 Merge '[backport] reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()' from Botond Dénes
This PR backports 2f4a793457 to branch-5.2. Said patch depends on some other patches that are not part of any release yet.
This PR should apply to 5.1 and 5.0 too.

Closes #13162

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
  reader_permit: expose operator<<(reader_permit::state)
  reader_permit: add get_state() accessor
2023-03-16 18:41:08 +03:00
Botond Dénes
bd4f9e3615 Merge 'readers/nonforwarding: don't emit partition_end on next_partition,fast_forward_to' from Gusev Petr
The series fixes the `make_nonforwardable` reader: it shouldn't emit `partition_end` for the previous partition after `next_partition()` and `fast_forward_to()`.

Fixes: #12249

Closes #12978

* github.com:scylladb/scylladb:
  flat_mutation_reader_test: cleanup, seastar::async -> SEASTAR_THREAD_TEST_CASE
  make_nonforwardable: test through run_mutation_source_tests
  make_nonforwardable: next_partition and fast_forward_to when single_partition is true
  make_forwardable: fix next_partition
  flat_mutation_reader_v2: drop forward_buffer_to
  nonforwardable reader: fix indentation
  nonforwardable reader: refactor, extract reset_partition
  nonforwardable reader: add more tests
  nonforwardable reader: no partition_end after fast_forward_to()
  nonforwardable reader: no partition_end after next_partition()
  nonforwardable reader: no partition_end for empty reader
  row_cache: pass partition_start though nonforwardable reader

(cherry picked from commit 46efdfa1a1)
2023-03-16 10:42:03 +02:00
Botond Dénes
c68deb2461 reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
Instead of open-coding the same, in an incomplete way.
clear_inactive_reads() does incomplete eviction in several ways:
* it doesn't decrement _stats.inactive_reads
* it doesn't set the permit to evicted state
* it doesn't cancel the ttl timer (if any)
* it doesn't call the eviction notifier on the permit (if there is one)

The list goes on. We already have an evict() method that does all this
correctly, so use that instead of the current badly open-coded alternative.

This patch also enhances the existing test for clear_inactive_reads()
and adds a new one specifically for `stop()` being called while having
inactive reads.

Fixes: #13048

Closes #13049

(cherry picked from commit 2f4a793457)
2023-03-14 09:50:16 +02:00
Botond Dénes
dd96d3017a reader_permit: expose operator<<(reader_permit::state)
(cherry picked from commit ec1c615029)
2023-03-14 09:50:16 +02:00
Botond Dénes
6ca80ee118 reader_permit: add get_state() accessor
(cherry picked from commit 397266f420)
2023-03-14 09:40:11 +02:00
Jan Ciolek
eee8f750cc cql3: preserve binary_operator.order in search_and_replace
There was a bug in `expr::search_and_replace`:
it didn't preserve the `order` field of binary_operator.

The `order` field is used to mark relations created
using SCYLLA_CLUSTERING_BOUND, a CQL feature used
for internal queries inside Scylla. It means that the
restriction should be handled as a raw clustering bound,
not as an expression in the CQL language.

Losing the SCYLLA_CLUSTERING_BOUND marker could cause issues,
the database could end up selecting the wrong clustering ranges.

Fixes: #13055

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #13056

(cherry picked from commit aa604bd935)
2023-03-09 12:52:39 +02:00
Botond Dénes
8d5206e6c6 sstables/sstable: validate_checksums(): force-check EOF
EOF is only guaranteed to be set if one tried to read past the end of the
file. So when checking for EOF, also try to read some more. This
should force the EOF flag into a correct value. We can then check that
the read yielded 0 bytes.
This should ensure that `validate_checksums()` will not falsely declare
the validation to have failed.

Fixes: #11190

Closes #12696

(cherry picked from commit 693c22595a)
2023-03-09 12:30:44 +02:00
Anna Stuchlik
cfa40402f4 doc: Update the documentation landing page
This commit makes the following changes to the docs landing page:

- Adds the ScyllaDB enterprise docs as one of three tiles.

- Modifies the three tiles to reflect the three flavors of ScyllaDB.

- Moves the "New to ScyllaDB? Start here!" under the page title.

- Renames "Our Products" to "Other Products" to list the products other
  than ScyllaDB itself. In addition, the boxes are enlarged to
  large-4 to look better.

The major purpose of this commit is to expose the ScyllaDB
documentation.

docs: fix the link
(cherry picked from commit 27bb8c2302)

Closes #13086
2023-03-06 14:18:15 +02:00
Botond Dénes
2d170e51cf Merge 'doc: specify the versions where Alternator TTL is no longer experimental' from Anna Stuchlik
This PR adds a note to the Alternator TTL section to specify in which Open Source and Enterprise versions the feature was promoted from experimental to non-experimental.

The challenge here is that OSS and Enterprise are (still) **documented together**, but they're **not in sync** in promoting the TTL feature: it's still experimental in 5.1 (released) but no longer experimental in 2022.2 (to be released soon).

We can take one of the following approaches:
a) Merge this PR with master and ask the 2022.2 users to refer to master.
b) Merge this PR with master and then backport to branch-5.1. If we choose this approach, it is necessary to backport https://github.com/scylladb/scylladb/pull/11997 beforehand to avoid conflicts.

I'd opt for a) because it makes more sense from the OSS perspective and helps us avoid mess and backporting.

Closes #12295

* github.com:scylladb/scylladb:
  doc: fix the version in the comment on removing the note
  doc: specify the versions where Alternator TTL is no longer experimental

(cherry picked from commit d5dee43be7)
2023-03-02 12:09:16 +02:00
Anna Stuchlik
860e79e4b1 doc: fixes https://github.com/scylladb/scylladb/issues/12954, adds the minimal version from which the 2021.1-to-2022.1 upgrade is supported for Ubuntu, Debian, and image
Closes #12974

(cherry picked from commit 91b611209f)
2023-02-28 13:02:05 +02:00
Anna Mikhlin
908a82bea0 release: prepare for 5.2.0-rc2 2023-02-28 10:13:06 +02:00
Gleb Natapov
39158f55d0 lwt: do not destroy capture in upgrade_if_needed lambda since the lambda is used more than once
If on the first call the capture is destroyed the second call may crash.

Fixes: #12958

Message-Id: <Y/sks73Sb35F+PsC@scylladb.com>
(cherry picked from commit 1ce7ad1ee6)
2023-02-27 14:19:37 +02:00
Raphael S. Carvalho
22c1685b3d sstables: Temporarily disable loading of first and last position metadata
It's known that reading large cells in reverse causes large allocations.
Source: https://github.com/scylladb/scylladb/issues/11642

The loading is preliminary work for splitting large partitions into
fragments composing a run, and then being able to later read such a run
in an efficient way using the position metadata.

The splitting is not turned on yet, anywhere. Therefore, we can
temporarily disable the loading, as a way to avoid regressions in
stable versions. Large allocations can cause stalls due to foreground
memory eviction kicking in.
The default values for position metadata say that first and last
position include all clustering rows. They aren't used anywhere
other than by sstable_run, to determine whether a run is disjoint at
the clustering level; given that no splitting is done yet, this
does not really matter.

Unit tests relying on position metadata were adjusted to enable
the loading, such that they can still pass.

Fixes #11642.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12979

(cherry picked from commit d73ffe7220)
2023-02-27 08:58:34 +02:00
Botond Dénes
9ba6fc73f1 mutation_compactor: only pass consumed range-tombstone-change to validator
Currently all consumed range tombstone changes are unconditionally
forwarded to the validator, even if they are shadowed by a higher-level
tombstone and/or are purgeable. This can result in a situation where a range
tombstone change was seen by the validator but not passed to the
consumer. The validator expects the range tombstone change to be closed
by end-of-partition but the end fragment won't come as the tombstone was
dropped, resulting in a false-positive validation failure.
Fix by passing to the validator only those tombstones that are
actually passed to the consumer too.

Fixes: #12575

Closes #12578

(cherry picked from commit e2c9cdb576)
2023-02-23 22:52:47 +02:00
Botond Dénes
f2e2c0127a types: unserialize_value for multiprecision_int,bool: don't read uninitialized memory
Check the first fragment before dereferencing it, the fragment might be
empty, in which case move to the next one.
Found by running range scan tests with random schema and random data.

Fixes: #12821
Fixes: #12823
Fixes: #12708

Closes #12824

(cherry picked from commit ef548e654d)
2023-02-23 22:38:03 +02:00
Gleb Natapov
363ea87f51 raft: abort applier fiber when a state machine aborts
After 5badf20c7a, the applier fiber does not stop
after it gets an abort error from a state machine, which may trigger an
assertion because the previous batch is not applied. Fix it.

Fixes #12863

(cherry picked from commit 9bdef9158e)
2023-02-23 14:12:12 +02:00
Kefu Chai
c49fd6f176 tools/schema_loader: do not return ref to a local variable
We should never return a reference to a local variable,
so in this change a reference to a static variable is returned
instead. This should address the following warning from Clang 17:

```
/home/kefu/dev/scylladb/tools/schema_loader.cc:146:16: error: returning reference to local temporary object [-Werror,-Wreturn-stack-address]
        return {};
               ^~
```

Fixes #12875
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12876

(cherry picked from commit 6eab8720c4)
2023-02-22 22:02:43 +02:00
Takuya ASADA
3114589a30 scylla_coredump_setup: fix coredump timeout settings
We currently configure only TimeoutStartSec, but that is probably not
enough to prevent coredump timeouts, since TimeoutStartSec is the maximum
waiting time for service startup, and there is another directive to
specify the maximum service running time (RuntimeMaxSec).

To fix the problem, we should specify RuntimeMaxSec and TimeoutSec (which
configures both TimeoutStartSec and TimeoutStopSec).

Fixes #5430

Closes #12757

(cherry picked from commit bf27fdeaa2)
2023-02-19 21:13:36 +02:00
Anna Stuchlik
34f68a4c0f doc: related https://github.com/scylladb/scylladb/issues/12658, fix the service name in the upgrade guide from 2022.1 to 2022.2
Closes #12698

(cherry picked from commit 826f67a298)
2023-02-17 12:17:48 +02:00
Botond Dénes
b336e11f59 Merge 'doc: fix the service name from "scylla-enterprise-server" "to "scylla-server"' from Anna Stuchlik
Related https://github.com/scylladb/scylladb/issues/12658.

This issue fixes the bug in the upgrade guides for the released versions.

Closes #12679

* github.com:scylladb/scylladb:
  doc: fix the service name in the upgrade guide for patch releases versions 2022
  doc: fix the service name in the upgrade guide from 2021.1 to 2022.1

(cherry picked from commit 325246ab2a)
2023-02-17 12:16:52 +02:00
Anna Stuchlik
9ef73d7e36 doc: fixes https://github.com/scylladb/scylladb/issues/12754, document the metric update in 5.2
Closes #12891

(cherry picked from commit bcca706ff5)
2023-02-17 12:16:13 +02:00
Botond Dénes
8700a72b4c Merge 'Backport compaction-backlog-tracker fixes to branch-5.2' from Raphael "Raph" Carvalho
Both patches are important to fix inefficiencies when updating the backlog tracker, which can manifest as a reactor stall on a special event like a schema change.

No conflicts when backporting.

Regression since 1d9f53c881, which is present in branch 5.1 onwards.

Closes #12851

* github.com:scylladb/scylladb:
  compaction: Fix inefficiency when updating LCS backlog tracker
  table: Fix quadratic behavior when inserting sstables into tracker on schema change
2023-02-15 07:22:25 +02:00
Raphael S. Carvalho
886dd3e1d2 compaction: Fix inefficiency when updating LCS backlog tracker
LCS backlog tracker uses STCS tracker for L0. Turns out LCS tracker
is calling STCS tracker's replace_sstables() with empty arguments
even when higher levels (> 0) *only* had sstables replaced.
This unnecessary call to STCS tracker will cause it to recompute
the L0 backlog, yielding the same value as before.

As LCS has a fragment size of 0.16G on higher levels, we may be
updating the tracker multiple times during incremental compaction,
which operates on SSTables on higher levels.

The inefficiency is fixed by updating the STCS tracker only if an
L0 sstable is being added to or removed from the table.

This may also fix a quadratic behavior during boot or refresh,
as new sstables are loaded one by one.
Higher levels have a substantially higher number of sstables, so
updating the STCS tracker only when level 0 changes significantly
reduces the number of times the L0 backlog is recomputed.
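
The guard can be sketched in Python (hypothetical names; the real trackers are C++ classes inside ScyllaDB):

```python
from dataclasses import dataclass

@dataclass
class SSTable:
    level: int

class STCSTracker:
    def __init__(self):
        self.recomputes = 0

    def replace_sstables(self, old, new):
        # recomputing the L0 backlog is the expensive part
        self.recomputes += 1

def lcs_replace_sstables(stcs, old, new):
    # forward to the STCS (L0) tracker only when level-0 sstables change;
    # higher-level-only replacements no longer trigger an L0 recompute
    old_l0 = [s for s in old if s.level == 0]
    new_l0 = [s for s in new if s.level == 0]
    if old_l0 or new_l0:
        stcs.replace_sstables(old_l0, new_l0)
```

With this guard, incremental compaction working purely on higher levels leaves the L0 backlog untouched.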

Refs #12499.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12676

(cherry picked from commit 1b2140e416)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-02-14 12:14:27 -03:00
Raphael S. Carvalho
f565f3de06 table: Fix quadratic behavior when inserting sstables into tracker on schema change
Each time the backlog tracker is informed about a new or old sstable, it
recomputes the static part of the backlog, whose complexity is
proportional to the total number of sstables.
On schema change, we call backlog_tracker::replace_sstables()
for each existing sstable, which produces O(N^2) complexity overall.
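
The difference in complexity can be sketched with a toy model (hypothetical, simplified):

```python
def recompute_static_backlog(sstables):
    # cost is proportional to the number of sstables currently tracked
    return len(sstables)

def naive_schema_change(sstables):
    # calling replace_sstables() once per sstable recomputes the static
    # backlog each time: O(N^2) total work
    tracked, work = [], 0
    for s in sstables:
        tracked.append(s)
        work += recompute_static_backlog(tracked)
    return work

def batched_schema_change(sstables):
    # inserting all sstables, then recomputing once, is O(N)
    tracked = list(sstables)
    return recompute_static_backlog(tracked)
```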

Fixes #12499.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12593

(cherry picked from commit 87ee547120)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-02-14 12:14:21 -03:00
Anna Stuchlik
76ff6d981c doc: related https://github.com/scylladb/scylladb/issues/12754, add the requirement to upgrade Monitoring to version 4.3
Closes #12784

(cherry picked from commit c7778dd30b)
2023-02-10 10:28:35 +02:00
Botond Dénes
f924f59055 Merge 'Backport test.py improvements to 5.2' from Kamil Braun
Backport the following improvements for test.py efficiency and user experience:
- https://github.com/scylladb/scylladb/pull/12542
- https://github.com/scylladb/scylladb/pull/12560
- https://github.com/scylladb/scylladb/pull/12564
- https://github.com/scylladb/scylladb/pull/12563
- https://github.com/scylladb/scylladb/pull/12588
- https://github.com/scylladb/scylladb/pull/12613
- https://github.com/scylladb/scylladb/pull/12569
- https://github.com/scylladb/scylladb/pull/12612
- https://github.com/scylladb/scylladb/pull/12549
- https://github.com/scylladb/scylladb/pull/12678

Fixes #12617

Closes #12770

* github.com:scylladb/scylladb:
  test/pylib: put UNIX-domain socket in /tmp
  Merge 'test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests' from Kamil Braun
  Merge 'test.py: manual cluster pool handling for Python suite' from Alecco
  Merge 'test.py: handle broken clusters for Python suite' from Alecco
  test/pylib: scylla_cluster: don't leak server if stopping it fails
  Merge 'test/pylib: scylla_cluster: improve server startup check' from Kamil Braun
  test/pylib: scylla_cluster: return error details from test framework endpoints
  test/pylib: scylla_cluster: release cluster IPs when stopping ScyllaClusterManager
  test/pylib: scylla_cluster: mark cluster as dirty if it fails to boot
  test: disable commitlog O_DSYNC, preallocation
2023-02-08 15:09:09 +02:00
Nadav Har'El
d5cef05810 test/pylib: put UNIX-domain socket in /tmp
The "cluster manager" used by the topology test suite uses a UNIX-domain
socket to communicate between the cluster manager and the individual tests.
The socket is currently located in the test directory but there is a
problem: in Linux, the length of the path used as a UNIX-domain socket
address is limited to just a little over 100 bytes. In Jenkins runs, the
test directory names are very long, and we sometimes go over this length
limit, with the result that test.py fails to create this socket.

In this patch we simply put the socket in /tmp instead of the test
directory. We only need to do this change in one place - the cluster
manager, as it already passes the socket path to the individual tests
(using the "--manager-api" option).

Tested by cloning Scylla in a very long directory name.
A test like ./test.py --mode=dev test_concurrent_schema fails before
this patch, and passes with it.
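
The path-length limit is easy to demonstrate (a sketch; ~108 bytes is the usual Linux `sun_path` size):

```python
import os
import socket
import tempfile

def bind_unix_socket(path):
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    s.bind(path)
    return s

# a socket under /tmp stays well below the ~108-byte sun_path limit
short_path = os.path.join(tempfile.mkdtemp(dir="/tmp"), "api")
bind_unix_socket(short_path).close()

# a very long path exceeds the limit and bind() fails with OSError
long_path = "/tmp/" + "x" * 200
try:
    bind_unix_socket(long_path)
    path_too_long_failed = False
except OSError:
    path_too_long_failed = True
```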

Fixes #12622

Closes #12678

(cherry picked from commit 681a066923)
2023-02-07 17:12:14 +01:00
Nadav Har'El
e0f4e99e9b Merge 'test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests' from Kamil Braun
`ScyllaClusterManager` is used to run a sequence of test cases from
a single test file. Between two consecutive tests, if the previous test
left the cluster 'dirty', meaning the cluster cannot be reused, it would
free up space in the pool (using `steal`), stop the cluster, then get a
new cluster from the pool.

Between the `steal` and the `get`, a concurrent test run (with its own
instance of `ScyllaClusterManager`) would start, because there was free
space in the pool.

This resulted in undesirable behavior when we ran tests with
`--repeat X` for a large `X`: we would start with e.g. 4 concurrent
runs of a test file, because the pool size was 4. As soon as one of the
runs freed up space in the pool, we would start another concurrent run.
Soon we'd end up with 8 concurrent runs. Then 16 concurrent runs. And so
on. We would have a large number of concurrent runs, even though the
original 4 runs didn't finish yet. All of these concurrent runs would
compete waiting on the pool, and waiting for space in the pool would
take longer and longer (the duration is linear w.r.t number of
concurrent competing runs). Tests would then time out because they would
have to wait too long.

Fix that by using the new `replace_dirty` function introduced to the
pool. This function frees up space by returning a dirty cluster and then
immediately takes it away to be used for a new cluster. Thanks to this,
we will only have at most as many concurrent runs as the pool size. For
example with --repeat 8 and pool size 4, we would run 4 concurrent runs
and start the 5th run only when one of the original 4 runs finishes,
then the 6th run when a second run finishes and so on.

The fix is preceded by a refactor that replaces `steal` with `put(is_dirty=True)`
and a `destroy` function passed to the pool (now the pool is responsible
for stopping the cluster and releasing its IPs).
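
The `replace_dirty` idea can be sketched with a toy pool (hypothetical API, simplified from the real `test/pylib` pool):

```python
import asyncio

class Pool:
    """Toy model of the test cluster pool (hypothetical names)."""
    def __init__(self, size, build, destroy):
        self._sem = asyncio.Semaphore(size)
        self._build, self._destroy = build, destroy

    async def get(self):
        await self._sem.acquire()           # take a slot
        return await self._build()

    async def put(self, cluster, is_dirty=False):
        await self._destroy(cluster)
        self._sem.release()                 # free the slot

    async def replace_dirty(self, cluster):
        # keep the slot: destroy the dirty cluster and build its
        # replacement without ever releasing the semaphore, so no
        # concurrent manager can start an extra run in between
        await self._destroy(cluster)
        return await self._build()
```

Because the semaphore is never released inside `replace_dirty`, the number of concurrent runs can never exceed the pool size.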

Fixes #11757

Closes #12549

* github.com:scylladb/scylladb:
  test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests
  test/pylib: pool: introduce `replace_dirty`
  test/pylib: pool: replace `steal` with `put(is_dirty=True)`

(cherry picked from commit 132af20057)
2023-02-07 17:08:17 +01:00
Kamil Braun
6795715011 Merge 'test.py: manual cluster pool handling for Python suite' from Alecco
From reviews of https://github.com/scylladb/scylladb/pull/12569, avoid
using `async with` and access the `Pool` of clusters with
`get()`/`put()`.

Closes #12612

* github.com:scylladb/scylladb:
  test.py: manual cluster handling for PythonSuite
  test.py: stop cluster if PythonSuite fails to start
  test.py: minor fix for failed PythonSuite test

(cherry picked from commit 5bc7f0732e)
2023-02-07 17:07:43 +01:00
Nadav Har'El
aa9e91c376 Merge 'test.py: handle broken clusters for Python suite' from Alecco
If the after-test check fails (is_after_test_ok is False), discard the cluster and raise an exception so the context manager (pool) does not recycle it.

Ignore the exception re-raised by the context manager.

Fixes #12360

Closes #12569

* github.com:scylladb/scylladb:
  test.py: handle broken clusters for Python suite
  test.py: Pool discard method

(cherry picked from commit 54f174a1f4)
2023-02-07 17:07:36 +01:00
Kamil Braun
ddfb9ebab2 test/pylib: scylla_cluster: don't leak server if stopping it fails
`ScyllaCluster.server_stop` had this piece of code:
```
        server = self.running.pop(server_id)
        if gracefully:
            await server.stop_gracefully()
        else:
            await server.stop()
        self.stopped[server_id] = server
```

We observed `stop_gracefully()` failing due to a server hanging during
shutdown. We then ended up in a state where neither `self.running` nor
`self.stopped` had this server. Later, when releasing the cluster and
its IPs, we would release that server's IP - but the server might have
still been running (all servers in `self.running` are killed before
releasing IPs, but this one wasn't in `self.running`).

Fix this by popping the server from `self.running` only after
`stop_gracefully`/`stop` finishes.

Make an analogous fix in `server_start`: put `server` into
`self.running` *before* we actually start it. If the start fails, the
server will be considered "running" even though it isn't necessarily,
but that is OK - if it isn't running, then trying to stop it later will
simply do nothing; if it is actually running, we will kill it (which we
should do) when clearing after the cluster; and we don't leak it.

Closes #12613

(cherry picked from commit a0ff33e777)
2023-02-07 17:05:20 +01:00
Nadav Har'El
d58a3e4d16 Merge 'test/pylib: scylla_cluster: improve server startup check' from Kamil Braun
Don't use a range scan, which is very inefficient, to perform a query for checking CQL availability.

Improve logging when waiting for server startup times out. Provide details about the failure: whether we managed to obtain the Host ID of the server and whether we managed to establish a CQL connection.

Closes #12588

* github.com:scylladb/scylladb:
  test/pylib: scylla_cluster: better logging for timeout on server startup
  test/pylib: scylla_cluster: use less expensive query to check for CQL availability

(cherry picked from commit ccc2c6b5dd)
2023-02-07 17:05:02 +01:00
Kamil Braun
2ebac52d2d test/pylib: scylla_cluster: return error details from test framework endpoints
If an endpoint handler throws an exception, the details of the exception
are not returned to the client. Normally this is desirable so that
information is not leaked, but in this test framework we do want to
return the details to the client so it can log a useful error message.

Do it by wrapping every handler into a catch clause that returns
the exception message.

Also modify a bit how HTTPErrors are rendered so it's easier to discern
the actual body of the error from other details (such as the params used
to make the request etc.)

Before:
```
E test.pylib.rest_client.HTTPError: HTTP error 500: 500 Internal Server Error
E
E Server got itself in trouble, params None, json None, uri http+unix://api/cluster/before-test/test_stuff
```

After:
```
E test.pylib.rest_client.HTTPError: HTTP error 500, uri: http+unix://api/cluster/before-test/test_stuff, params: None, json: None, body:
E Failed to start server at host 127.155.129.1.
E Check the log files:
E /home/kbraun/dev/scylladb/testlog/test.py.dev.log
E /home/kbraun/dev/scylladb/testlog/dev/scylla-1.log
```
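
The wrapping can be sketched framework-agnostically (hypothetical handler signature and endpoint; the real code uses the test framework's HTTP server):

```python
import functools

def return_error_details(handler):
    """Wrap an endpoint handler so exception text reaches the client."""
    @functools.wraps(handler)
    def wrapped(*args, **kwargs):
        try:
            return 200, handler(*args, **kwargs)
        except Exception as e:
            # deliberately leak the message: in a test framework the
            # client wants a useful error to log, not a generic 500
            return 500, str(e)
    return wrapped

@return_error_details
def before_test(name):
    # hypothetical endpoint that fails while preparing a cluster
    raise RuntimeError(f"Failed to start server for {name}")
```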

Closes #12563

(cherry picked from commit 2f84e820fd)
2023-02-07 17:04:37 +01:00
Kamil Braun
b536614913 test/pylib: scylla_cluster: release cluster IPs when stopping ScyllaClusterManager
When we obtained a new cluster for a test case after the previous test
case left a dirty cluster, we would release the old cluster's used IP
addresses (`_before_test` function). However, we would not release the
last cluster's IP after the last test case. We would run out of IPs with
sufficiently many test files or `--repeat` runs. Fix this.

Also reorder the operations a bit: stop the cluster (and release its
IPs) before freeing up space in the cluster pool (i.e. call
`self.cluster.stop()` before `self.clusters.steal()`). This reduces
concurrency a bit - fewer Scyllas running at the same time, which is
good (the pool size gives a limit on the desired max number of
concurrently running clusters). Killing a cluster is quick so it won't
make a significant difference for the next guy waiting on the pool.

Closes #12564

(cherry picked from commit 3ed3966f13)
2023-02-07 17:04:19 +01:00
Kamil Braun
85df0fd2b1 test/pylib: scylla_cluster: mark cluster as dirty if it fails to boot
If a cluster fails to boot, it saves the exception in
`self.start_exception` variable; the exception will be rethrown when
a test tries to start using this cluster. As explained in `before_test`:
```
    def before_test(self, name) -> None:
        """Check that  the cluster is ready for a test. If
        there was a start error, throw it here - the server is
        running when it's added to the pool, which can't be attributed
        to any specific test, throwing it here would stop a specific
        test."""
```
It's arguable whether we should blame some random test for a failure
that it didn't cause, but nevertheless, there's a problem here: the
`start_exception` will be rethrown and the test will fail, but then the
cluster will be simply returned to the pool and the next test will
attempt to use it... and so on.

Prevent this by marking the cluster as dirty the first time we rethrow
the exception.

Closes #12560

(cherry picked from commit 147dd73996)
2023-02-07 17:03:56 +01:00
Avi Kivity
cdf9fe7023 test: disable commitlog O_DSYNC, preallocation
Commitlog O_DSYNC is intended to make Raft and schema writes durable
in the face of power loss. To make O_DSYNC performant, we preallocate
the commitlog segments, so that the commitlog writes only change file
data and not file metadata (which would require the filesystem to commit
its own log).

However, in tests, this causes each ScyllaDB instance to write 384MB
of commitlog segments. This overloads the disks and slows everything
down.

Fix this by disabling O_DSYNC (and therefore preallocation) during
the tests. They can't survive power loss, and run with
--unsafe-bypass-fsync anyway.

Closes #12542

(cherry picked from commit 9029b8dead)
2023-02-07 17:02:59 +01:00
Beni Peled
8ff4717fd0 release: prepare for 5.2.0-rc1 2023-02-06 22:13:53 +02:00
Kamil Braun
291b1f6e7f service/raft: raft_group0: prevent double abort
There was a small chance that we called `timeout_src.request_abort()`
twice in the `with_timeout` function, first by timeout and then by
shutdown. `abort_source` fails on an assertion in this case. Fix this.
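
The guard can be sketched with a Python stand-in for `seastar::abort_source` (hypothetical model):

```python
class AbortSource:
    """Mimics abort_source: requesting an abort twice trips an assertion."""
    def __init__(self):
        self.aborted = False

    def request_abort(self):
        assert not self.aborted, "double abort"
        self.aborted = True

def request_abort_once(src):
    # shared guard for the timeout path and the shutdown path, so that
    # whichever fires second becomes a no-op
    if not src.aborted:
        src.request_abort()
```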

Fixes: #12512

Closes #12514

(cherry picked from commit 54170749b8)
2023-02-05 18:31:50 +02:00
Kefu Chai
b2699743cc db: system_keyspace: take the reserved_memory into account
Before this change, we returned the total memory managed by Seastar
in the "total" field of system.memory, but this value only reflects
the memory managed by Seastar's allocator. If
`reserve_additional_memory` is set when starting app_template,
Seastar's memory subsystem reserves a chunk of memory of the
specified size for the system, and takes the remaining memory. Since
f05d612da8, we set this value to 50MB for the wasmtime runtime. Hence
the `TestRuntimeInfoTable.test_default_content` test in dtest
fails: the test expects the size passed via the
`--memory` option to be identical to the value reported by
system.memory's "total" field.

After this change, the "total" field takes the memory reserved
for the wasm UDF runtime into account. The "total" field should reflect
the total amount of memory used by Scylla, no matter how a certain
portion of the allocated memory is used.

Fixes #12522
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12573

(cherry picked from commit 4a0134a097)
2023-02-05 18:30:05 +02:00
Botond Dénes
50ae73a4bd types: is_tuple(): handle reverse types
Currently, reverse types match the default case (false), even though they
might be wrapping a tuple type. One user-visible effect of this is that
a schema which has a reversed<frozen<UDT>> clustering key component
will have this component incorrectly represented in the schema CQL dump:
the UDT will lose the frozen attribute. When attempting to recreate
this schema based on the dump, it will fail, as only frozen UDTs are
allowed in primary key components.
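
The shape of the fix can be sketched with a toy model of the type hierarchy (hypothetical Python stand-ins for the C++ types):

```python
class Type:
    pass

class TupleType(Type):
    pass

class ReversedType(Type):
    def __init__(self, underlying):
        self.underlying = underlying

def is_tuple(t):
    # unwrap reversed types before matching, instead of letting them
    # fall through to the default (False) case
    if isinstance(t, ReversedType):
        return is_tuple(t.underlying)
    return isinstance(t, TupleType)
```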

Fixes: #12576

Closes #12579

(cherry picked from commit ebc100f74f)
2023-02-05 18:20:21 +02:00
Calle Wilund
c3dd4a2b87 alternator::streams: Sort tables in list_streams to ensure no duplicates
Fixes #12601 (maybe?)

Sort the set of tables on ID. This should ensure we never
generate duplicates in a paged listing here. Can obviously miss things if they
are added between paged calls and end up with a "smaller" UUID/ARN, but that
is to be expected.

(cherry picked from commit da8adb4d26)
2023-02-05 17:44:00 +02:00
Benny Halevy
0f9fe61d91 view: row_lock: lock_ck: find or construct row_lock under partition lock
Since we're potentially searching the row_lock in parallel to acquiring
the read_lock on the partition, we're racing with row_locker::unlock
that may erase the _row_locks entry for the same clustering key, since
there is no lock to protect it up until the partition lock has been
acquired and the lock_partition future is resolved.

This change moves the code to search for or allocate the row lock
_after_ the partition lock has been acquired to make sure we're
synchronously starting the read/write lock function on it, without
yielding, to prevent this use-after-free.

This adds an allocation for copying the clustering key in advance
even if a row_lock entry already exists, which wasn't needed before.
It only slows us down (a bit) when there is contention and the lock
already existed when we want to take it. In the fast path there
is no contention, and the code already had to create the lock
and copy the key. In any case, the penalty of copying the key once
is tiny compared to the rest of the work that view updates are doing.

This is required on top of 5007ded2c1 as
seen in https://github.com/scylladb/scylladb/issues/12632
which is closely related to #12168 but demonstrates a different race
causing use-after-free.

Fixes #12632

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4b5e324ecb)
2023-02-05 17:22:31 +02:00
Anna Stuchlik
59d30ff241 docs: fixes https://github.com/scylladb/scylladb/issues/12654, update the links to the Download Center
Closes #12655

(cherry picked from commit 64cc4c8515)
2023-02-05 17:19:56 +02:00
Anna Stuchlik
fb82dff89e doc: fixes https://github.com/scylladb/scylladb/issues/12672, fix the redirects to the Cloud docs
Closes #12673

(cherry picked from commit 2be131da83)
2023-02-05 17:17:35 +02:00
Kefu Chai
b588b19620 cql3/selection: construct string_view using char* not size
Before this change, we constructed an sstring from a comma expression,
which evaluates to the return value of `name.size()`, whereas what we
intended to call was `sstring(const char*, size_t)`.

In this change:

* instead of passing only the size of the string_view,
  both its address and size are used
* a `std::string_view` is constructed instead of an sstring, for better
  performance, as we don't need to perform a deep copy

the issue is reported by GCC-13:

```
In file included from cql3/selection/selectable.cc:11:
cql3/selection/field_selector.hh:83:60: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result]
        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
                                                           ^~~~~~~~~~
```

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12666

(cherry picked from commit 186ceea009)

Fixes #12739.
2023-02-05 13:50:48 +02:00
Michał Chojnowski
608ef92a71 commitlog: fix total_size_on_disk accounting after segment file removal
Currently, segment file removal first calls `f.remove_file()` and
only does `total_size_on_disk -= f.known_size()` later.
However, `remove_file()` resets `known_size` to 0, so in effect
the freed space is not accounted for.

`total_size_on_disk` is not just a metric. It is also responsible
for deciding whether a segment should be recycled -- it is recycled
only if `total_size_on_disk - known_size < max_disk_size`.
Therefore this bug has dire performance consequences:
if `total_size_on_disk - known_size` ever exceeds `max_disk_size`,
the recycling of commitlog segments will stop permanently, because
`total_size_on_disk - known_size` will never go back below
`max_disk_size` due to the accounting bug. All new segments from this
point will be allocated from scratch.
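
The ordering bug can be modelled in a few lines (hypothetical, simplified):

```python
class Segment:
    def __init__(self, size):
        self.known_size = size

    def remove_file(self):
        # deleting the file resets the cached size
        self.known_size = 0

def delete_segment_buggy(total_size_on_disk, seg):
    seg.remove_file()
    return total_size_on_disk - seg.known_size   # subtracts 0!

def delete_segment_fixed(total_size_on_disk, seg):
    # capture the size before remove_file() resets it
    freed = seg.known_size
    seg.remove_file()
    return total_size_on_disk - freed
```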

The bug was uncovered by a QA performance test. It isn't easy to trigger --
it took the test 7 hours of constant high load to step into it.
However, the fact that the effect is permanent, and degrades the
performance of the cluster silently, makes the bug potentially quite severe.

The bug can be easily spotted with Prometheus as infinitely rising
`commitlog_total_size_on_disk` on the affected shards.

Fixes #12645

Closes #12646

(cherry picked from commit fa7e904cd6)
2023-02-01 21:54:37 +02:00
Kamil Braun
d2732b2663 Merge 'Enable Raft by default in new clusters' from Kamil Braun
New clusters that use a fresh conf/scylla.yaml will have `consistent_cluster_management: true`, which will enable Raft, unless the user explicitly turns it off before booting the cluster.

People using existing yaml files will continue without Raft, unless consistent_cluster_management is explicitly requested during/after upgrade.

Also update the docs: cluster creation and node addition procedures.

Fixes #12572.

Closes #12585

* github.com:scylladb/scylladb:
  docs: mention `consistent_cluster_management` for creating cluster and adding node procedures
  conf: enable `consistent_cluster_management` by default

(cherry picked from commit 5c886e59de)
2023-01-26 12:21:55 +01:00
Anna Mikhlin
34ab98e1be release: prepare for 5.2.0-rc0 2023-01-18 14:54:36 +02:00
Tomasz Grabiec
563998b69a Merge 'raft: improve group 0 reconfiguration failure handling' from Kamil Braun
Make it so that failures in `removenode`/`decommission` don't lead to reduced availability, and any leftovers in group 0 can be removed by `removenode`:
- In `removenode`, make the node a non-voter before removing it from the token ring. This removes the possibility of having a group 0 voting member which doesn't correspond to a token ring member. We can still be left with a non-voter, but that doesn't reduce the availability of group 0.
- As above but for `decommission`.
- Make it possible to remove group 0 members that don't correspond to token ring members from group 0 using `removenode`.
- Add an API to query the current group 0 configuration.

Fixes #11723.

Closes #12502

* github.com:scylladb/scylladb:
  test: test_topology: test for removing garbage group 0 members
  test/pylib: move some utility functions to util.py
  db: system_keyspace: add a virtual table with raft configuration
  db: system_keyspace: improve system.raft_snapshot_config schema
  service: storage_service: better error handling in `decommission`
  service: storage_service: fix indentation in removenode
  service: storage_service: make `removenode` work for group 0 members which are not token ring members
  service/raft: raft_group0: perform read_barrier in wait_for_raft
  service: storage_service: make leaving node a non-voter before removing it from group 0 in decommission/removenode
  test: test_raft_upgrade: remove test_raft_upgrade_with_node_remove
  service/raft: raft_group0: link to Raft docs where appropriate
  service/raft: raft_group0: more logging
  service/raft: raft_group0: separate function for checking and waiting for Raft
2023-01-17 21:23:15 +01:00
Kamil Braun
d134c458e5 test/pylib: increase timeout when waiting for cluster before test
Increase the timeout from default 5 minutes to 10 minutes.
Sent as a workaround for #12546 to unblock next promotions.

Closes #12547
2023-01-17 21:03:09 +02:00
Kamil Braun
4f1c317bdc test: test_raft_upgrade: stop servers gracefully in test_recovery_after_majority_loss
This test is frequently failing due to a timeout when we try to restart
one of the nodes. The shutdown procedure apparently hangs when we try to
stop the `hints_manager` service, e.g.:
```
INFO  2023-01-13 03:18:02,946 [shard 0] hints_manager - Asked to stop
INFO  2023-01-13 03:18:02,946 [shard 0] hints_manager - Stopped
INFO  2023-01-13 03:18:02,946 [shard 0] hints_manager - Asked to stop
INFO  2023-01-13 03:18:02,946 [shard 1] hints_manager - Asked to stop
INFO  2023-01-13 03:18:02,946 [shard 1] hints_manager - Stopped
INFO  2023-01-13 03:18:02,946 [shard 1] hints_manager - Asked to stop
INFO  2023-01-13 03:18:02,946 [shard 1] hints_manager - Stopped
INFO  2023-01-13 03:22:56,997 [shard 0] hints_manager - Stopped
```
observe the 5 minute delay at the end.

There is a known issue about `hints_manager` stop hanging: #8079.

Now, for some reason, this is the only test case that is hitting this
issue. We don't completely understand why. There is one significant
difference between this test case and others: this is the only test case
which kills 2 (out of 3) servers in the cluster and then tries to
gracefully shutdown the last server. There's a hypothesis that the last
server gets stuck trying to send hints to the killed servers. We weren't
able to prove/falsify it yet. But if it's true, then this patch will:
- unblock next promotions,
- give us some important information when we see that the issue stops
  appearing.
In the patch we shutdown all servers gracefully instead of killing them,
like we do in the other test cases.

Closes #12548
2023-01-17 20:51:09 +02:00
Pavel Emelyanov
4f415413d2 raft: Fix non-existing state_machine::apply_entry in docs
The docs mention that method, but it doesn't exist. Instead, the
state_machine interface defines plain .apply() one.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12541
2023-01-17 12:53:05 +01:00
Kamil Braun
5545547d07 test: test_topology: test for removing garbage group 0 members
Verify that `removenode` can remove group 0 members which are not token
ring members.
2023-01-17 12:28:00 +01:00
Kamil Braun
c959ec455a test/pylib: move some utility functions to util.py
They were used in test_raft_upgrade, but we want to use them in other
test files too.
2023-01-17 12:28:00 +01:00
Kamil Braun
a483915c62 db: system_keyspace: add a virtual table with raft configuration
Add a new virtual table `system.raft_state` that shows the currently
operating Raft configuration for each present group. The schema is the
same as `system.raft_snapshot_config` (the latter shows the config from
the last snapshot). In the future we plan to add more columns to this
table, showing more information (like the current leader and term),
hence the generic name.

Adding the table requires some plumbing of
`sharded<raft_group_registry>&` through function parameters to make it
accessible from `register_virtual_tables`, but it's mostly
straightforward.

Also added some APIs to `raft_group_registry` to list all groups and
find a given group (returning `nullptr` if one isn't found, not throwing
an exception).
2023-01-17 12:28:00 +01:00
Kamil Braun
2bfe85ce9b db: system_keyspace: improve system.raft_snapshot_config schema
Remove the `ip_addr` column which was not used. IP addresses are not
part of Raft configuration now and they can change dynamically.

Swap the `server_id` and `disposition` columns in the clustering key, so
when querying the configuration, we first obtain all servers with the
current disposition and then all servers with the previous disposition
(note that a server may appear both in current and previous).
2023-01-17 12:28:00 +01:00
Kamil Braun
c3ed82e5fb service: storage_service: better error handling in decommission
Improve the error handling in `decommission` in case `leave_group0`
fails, informing the user what they should do (i.e. call `removenode` to
get rid of the group 0 member), and allowing decommission to finish; it
does not make sense to let the node continue to run after it leaves the
token ring. (And I'm guessing it's also not safe. Or maybe impossible.)
2023-01-17 12:28:00 +01:00
Kamil Braun
beb0eee007 service: storage_service: fix indentation in removenode 2023-01-17 12:28:00 +01:00
Kamil Braun
aba33dd352 service: storage_service: make removenode work for group 0 members which are not token ring members
Due to failures we might end up in a situation where we have a group 0
member which is not a token ring member: a decommission/removenode
which failed after leaving/removing a node from the token ring but
before leaving / removing a node from group 0.

There was no way to get rid of such a group 0 member. A node that left
the token ring must not be allowed to run further (or it can cause data
loss, data resurrection and maybe other fun stuff), so we can't run
decommission a second time (even if we tried, it would just say that
"we're not a member of the token ring" and abort). And `removenode`
would also not work, because it proceeds only if the node requested to
be removed is a member of the token ring.

We modify `removenode` so it can run in this situation and remove the
group 0 member. The parts of `removenode` related to token ring
modification are now conditioned on whether the node was a member of the
token ring. The final `remove_from_group0` step is in its own branch. Some
minor refactors were necessary. Some log messages were also modified so
it's easier to understand which messages correspond to the "token movement"
part of the procedure.

The `make_nonvoter` step happens only if token ring removal happens,
otherwise we can skip directly to `remove_from_group0`.

We also move `remove_from_group0` outside the "try...catch",
fixing #11723. The "node ops" part of the procedure is related strictly
to token ring movement, so it makes sense for `remove_from_group0` to
happen outside.

Indentation is broken in this commit for easier reviewability, fixed in
the following commit.

Fixes: #11723
2023-01-17 12:28:00 +01:00
Kamil Braun
ec2cd29e42 service/raft: raft_group0: perform read_barrier in wait_for_raft
Right now wait_for_raft is called before performing group 0
configuration changes. We want to also call it before checking for
membership, for that it's desirable to have the most recent information,
hence call read_barrier. In the existing use cases it's not strictly
necessary, but it doesn't hurt.
2023-01-17 12:28:00 +01:00
Kamil Braun
db734cd74f service: storage_service: make leaving node a non-voter before removing it from group 0 in decommission/removenode
removenode currently works roughly like this:
1. stream/repair data so it ends up on new replica sets (calculated
   without the node we want to remove)
2. remove the node from the token ring
3. remove the node from group 0 configuration.

If the procedure fails after step 2 but before step 3 finishes,
we're in trouble: the cluster is left with an additional voting group 0
member, which reduces group 0's availability, and there is no way to
remove this member because `removenode` no longer considers it to be
part of the cluster (it consults the token ring to decide).

Improve this failure scenario by including a new step at the beginning:
make the node a non-voter in group 0 configuration. Then, even if we
fail after removing the node from the token ring but before removing it
from group 0, we'll only be left with a non-voter which doesn't reduce
availability.

We make a similar change for `decommission`: between `unbootstrap()` (which
streams data) and `leave_ring()` (which removes our tokens from the
ring), become a non-voter. The difference here is that we don't become a
non-voter at the beginning, but only after streaming/repair. In
`removenode` it's desirable to make the node a non-voter as soon as
possible because it's already dead. In decommission it may be desirable
for us to remain a voter if we fail during streaming because we're still
alive and functional in that case.

In a later commit we'll also make it possible to retry `removenode` to
remove a node that is only a group 0 member and not a token ring member.
2023-01-17 12:28:00 +01:00
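The failure-scenario reasoning in the commit above can be sketched as a toy model in Python (hypothetical names, not the actual ScyllaDB code): a dead node that remains a voter counts against group 0's quorum, while a leftover non-voter does not.

```python
def majority(voters):
    """Quorum size for a set of voting members."""
    return len(voters) // 2 + 1

def simulate_removenode(voters, node, demote_first, fail_early):
    """Toy model of removenode: optionally demote the dead node to
    non-voter, then 'remove it from the token ring'; if the procedure
    fails early (before the group 0 removal), the node stays behind as
    a garbage group 0 member. Returns whether group 0 still has a
    quorum of *alive* voters afterwards."""
    voters = set(voters)
    if demote_first:
        voters.discard(node)          # node becomes a non-voter
    # ... the node is removed from the token ring here ...
    if not fail_early:
        voters.discard(node)          # removed from group 0 entirely
    alive = voters - {node}           # the removed node is dead
    return len(alive) >= majority(voters)
```

In a 2-voter group, a removenode that fails before the group 0 removal leaves the cluster without a quorum unless the node was demoted first.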
Kamil Braun
1eee349a17 test: test_raft_upgrade: remove test_raft_upgrade_with_node_remove
The test would create a scenario where one node was down while the others
started the Raft upgrade procedure. The procedure would get stuck, but
it was possible to `removenode` the downed node using one of the alive
nodes, which would unblock the Raft upgrade procedure.

This worked because:
1. the upgrade procedure starts by ensuring that all peers can be
   contacted,
2. `removenode` starts by removing the node from the token ring.

After removing the node from the token ring, the upgrade procedure
becomes able to contact all peers (the peers set no longer contains the
down node). At the end, after removing the node from the token ring,
`removenode` would actually get stuck for a while, waiting for the
upgrade procedure to finish before removing the peer from group 0.
After the upgrade procedure finished, `removenode` would also finish.
(so: first the upgrade procedure waited for removenode, then removenode
waited for the upgrade procedure).

We want to modify the `removenode` procedure and include a new step
before removing the node from the token ring: making the node a
non-voter. The purpose is to improve the possible failure scenarios.
Previously, if the `removenode` procedure failed after removing the node
from the token ring but before removing it from group 0, the cluster
would contain a 'garbage' group 0 member which is a voter - reducing
group 0's availability. If the node is made a non-voter first, then this
failure will not be as big of a problem, because the leftover group 0
member will be a non-voter.

However, to correctly perform group 0 operations including making
someone a nonvoter, we must first wait for the Raft upgrade procedure to
finish (or at least wait until everyone joins group 0). Therefore by
including this 'make the node a non-voter' step at the beginning of
`removenode`, we make it impossible to remove a token ring member in the
middle of the upgrade procedure, on which the test case relied. The test
case would get stuck waiting for the `removenode` operation to finish,
which would never finish because it would wait for the upgrade procedure
to finish, which would not finish because of the dead peer.

We remove the test case; it was "lucky" to pass in the first place. We
have a dedicated mechanism for handling dead peers during Raft upgrade
procedure: the manual Raft group 0 RECOVERY procedure. There are other
test cases in this file which are using that procedure.
2023-01-17 12:28:00 +01:00
Kamil Braun
4f0801406e service/raft: raft_group0: link to Raft docs where appropriate
Resolve some TODOs.
2023-01-17 12:28:00 +01:00
Kamil Braun
2befbaa341 service/raft: raft_group0: more logging
Make the logs in leave_group0 consistent with logs in
remove_from_group0.
2023-01-17 12:28:00 +01:00
Kamil Braun
77dc1c4c70 service/raft: raft_group0: separate function for checking and waiting for Raft
The leave_group0 and remove_from_group0 functions both start with the
following steps:
- if Raft is disabled or in RECOVERY mode, print a simple log message
  and abort
- if Raft cluster feature flag is not yet enabled, print a complex log
  message and abort
- wait for Raft upgrade procedure to finish
- then perform the actual group 0 reconfiguration.

Refactor these preparation steps to a separate function,
`wait_for_raft`. This reduces code duplication; the function will also
be used in more operations later (becoming a nonvoter or turning another
server into a nonvoter).

We also change the API so that the preparation function is called from
outside by the caller before they call the reconfiguration function.
This is because in later commits, some of the call sites (mainly
`removenode`) will want to check explicitly whether Raft is enabled and
wait for Raft's availability, then perform a sequence of steps related
to group 0 configuration depending on the result.

Also add a private function `raft_upgrade_complete()` which we use to
assert that Raft is ready to be used.
2023-01-17 12:27:58 +01:00
Wojciech Mitros
5f45b32bfa forward_service: prevent heap use-after-free of forward_aggregates
Currently, we create `forward_aggregates` inside a function that
returns the result of a future lambda that captures these aggregates
by reference. As a result, the aggregates may be destructed before
the lambda finishes, resulting in a heap use-after-free.

To prolong the lifetime of these aggregates, we cannot use a move
capture, because the lambda is wrapped in a with_thread_if_needed()
call on these aggregates. Instead, we fix this by wrapping the
entire return statement in a do_with().

Fixes #12528

Closes #12533
2023-01-17 13:25:57 +02:00
Gleb Natapov' via ScyllaDB development
15ebd59071 lwt: upgrade stored mutations to the latest schema during prepare
Currently they are upgraded during learn on a replica. There are two
problems with this. First, the column mapping may not exist on a replica
if it missed this particular schema (for instance, because it was down),
and the mapping history is not part of the schema. In this case "Failed
to look up column mapping for schema version" will be thrown. Second, the
LWT request coordinator may not have the schema for the mutation either
(because it was already freed from the registry), and when a replica
tries to retrieve the schema from the coordinator, the retrieval will
fail, causing the whole request to fail with "Schema version XXXX not found".

Both of those problems can be fixed by upgrading stored mutations
during prepare on the node they are stored at. To upgrade a mutation,
its column mapping is needed, and it is guaranteed to be present on the
node the mutation is stored at, since having the corresponding schema
available is a prerequisite for storing the mutation. After that, the
mutation is processed using the latest schema, which will be available
on all nodes.

Fixes #10770

Message-Id: <Y7/ifraPJghCWTsq@scylladb.com>
2023-01-17 11:14:46 +01:00
Raphael S. Carvalho
f2f839b9cc compaction: LCS: don't reshape all levels if only a single breaks disjointness
LCS reshape was compacting all levels even if only a single one breaks
disjointness. That's unnecessary work, because rewriting that single
level is enough to restore disjointness. If multiple levels break
disjointness, each will be reshaped in its own iteration, reducing
the operation time of each step and the disk space requirement,
as input files can be released incrementally.
Incremental compaction is not applied to reshape yet, so we need to
avoid a "major compaction" in order to avoid the space overhead.
But space overhead is not the only problem: the inefficiency of
deciding what to reshape when overlap is detected also motivated
this patch.

Fixes #12495.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12496
2023-01-17 09:55:15 +02:00
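The per-level decision above can be sketched as follows (hypothetical helpers, not the actual compaction code): a level is disjoint when its SSTables' token ranges don't overlap, and only the offending levels are picked for reshape.

```python
def level_is_disjoint(sstables):
    """sstables: list of (first_token, last_token) ranges. A level is
    disjoint if, after sorting by first token, no range overlaps the
    next one."""
    ranges = sorted(sstables)
    return all(a[1] < b[0] for a, b in zip(ranges, ranges[1:]))

def levels_to_reshape(levels):
    """Return only the levels that break disjointness, so each can be
    rewritten in its own reshape iteration instead of reshaping all
    levels at once."""
    return [i for i, lvl in enumerate(levels) if not level_is_disjoint(lvl)]
```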
Michał Chojnowski
9e17564c70 types: add some missing explicit instantiations
Some functions defined by a template in types.cc are used in other
translation units (via `cql3/untyped_result_set.hh`), but aren't
explicitly instantiated. Therefore their linking can fail, depending
on inlining decisions. (I experienced this when playing with compiler
options).
Fix that.

Closes #12539
2023-01-17 10:46:01 +02:00
Nadav Har'El
5bf94ae220 cql: allow disabling of USING TIMESTAMP sanity checking
As requested by issue #5619, commit 2150c0f7a2
added a sanity check for USING TIMESTAMP - the number specified in the
timestamp must not be more than 3 days into the future (when viewed as
a number of microseconds since the epoch).

This sanity checking helps avoid some annoying client-side bugs and
mis-configurations, but some users genuinely want to use arbitrary
or futuristic-looking timestamps and are hindered by this sanity check
(which Cassandra doesn't have, by the way).

So in this patch we add a new configuration option, restrict_future_timestamp.
If set to "true", futuristic timestamps (more than 3 days into the future)
are forbidden. The "true" setting is the default (as has been the case
since #5619). Setting this option to "false" will allow using any 64-bit
integer as a timestamp, as is allowed in Cassandra (and was allowed in
Scylla prior to #5619).

The error message in the case where a futuristic timestamp is rejected
now mentions the configuration parameter that can be used to disable this
check (this, and the option's name "restrict_*", is similar to other
so-called "safe mode" options).

This patch also includes a test, which works in Scylla and Cassandra,
with either setting of restrict_future_timestamp, checking the right
thing in all these cases (the futuristic timestamp can either be written
and read, or can't be written). I used this test to manually verify that
the new option works, defaults to "true", and when set to "false" Scylla
behaves like Cassandra.

Fixes #12527

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12537
2023-01-16 23:18:56 +02:00
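The check described above can be sketched in Python (an illustrative model with hypothetical names, not Scylla's C++ code): timestamps are microseconds since the epoch, and with the option on (the default), anything more than 3 days ahead of "now" is rejected.

```python
import time

THREE_DAYS_US = 3 * 24 * 60 * 60 * 1_000_000

def check_using_timestamp(ts_us, now_us=None, restrict_future_timestamp=True):
    """Sketch of the USING TIMESTAMP sanity check: reject timestamps
    more than 3 days into the future unless the (hypothetical)
    restrict_future_timestamp option is disabled."""
    if now_us is None:
        now_us = int(time.time() * 1_000_000)
    if restrict_future_timestamp and ts_us > now_us + THREE_DAYS_US:
        raise ValueError(
            "timestamp too far in the future; "
            "disable restrict_future_timestamp to allow it")
    return ts_us
```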
Kefu Chai
114f30016a main: use std::shift_left() to consume tool name
for better readability.

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12536
2023-01-16 21:01:34 +02:00
Nadav Har'El
feef3f9dda test/cql-pytest: test more than one restriction on same clustering column
Cassandra refuses a request with more than one relation to the same
clustering column, for example

    DELETE FROM tbl WHERE p = ? and c = ? AND c > ?

complains that

    c cannot be restricted by more than one relation if it includes an Equal

But it produces different error messages for different operators and
even order.

Currently, Scylla doesn't consider such requests an error. Whether or
not we should be compatible with Cassandra here is discussed in
issue #12472. But as long as we do accept these queries, we should be
sure we do the right thing: "WHERE c = 1 AND c > 2" should match
nothing, "WHERE c = 1 AND c > 0" should match the matches of c = 1,
and so on. This patch adds a test to verify that these requests indeed
yield correct results. The test is scylla_only because, as explained
above, Cassandra doesn't support these requests at all.

Refs #12472

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12498
2023-01-16 20:41:16 +02:00
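The intended semantics above can be sketched in Python (an illustrative model, not Scylla's expression machinery): a conjunction of restrictions on the same clustering column is simply evaluated as a logical AND.

```python
def matches(c, restrictions):
    """Evaluate a conjunction of restrictions on one clustering column,
    e.g. [("=", 1), (">", 0)] for "WHERE c = 1 AND c > 0"."""
    ops = {
        "=": lambda a, b: a == b,
        ">": lambda a, b: a > b,
        "<": lambda a, b: a < b,
        ">=": lambda a, b: a >= b,
        "<=": lambda a, b: a <= b,
    }
    return all(ops[op](c, value) for op, value in restrictions)

# "WHERE c = 1 AND c > 2" should match nothing;
# "WHERE c = 1 AND c > 0" should match exactly the matches of c = 1.
```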
Kefu Chai
86b451d45c SCYLLA-VERSION-GEN: remove unnecessary bashism
remove unnecessary bashism, so that this script can be interpreted
by a POSIX shell.

/bin/sh is specified in the shebang line. on debian derivatives,
/bin/sh is dash, which is POSIX compliant. but this script is
written in the bash dialect.

before this change, we could run into the following build failure
when building the tree on Debian:

[7/904] ./SCYLLA-VERSION-GEN
./SCYLLA-VERSION-GEN: 37: [[: not found

after this change, the build is able to proceed.

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12530
2023-01-16 20:34:01 +02:00
Avi Kivity
0b418fa7cf cql3, transport, tests: remove "unset" from value type system
The CQL binary protocol introduced "unset" values in version 4
of the protocol. Unset values can be bound to variables, which
cause certain CQL fragments to be skipped. For example, the
fragment `SET a = :var` will not change the value of `a` if `:var`
is bound to an unset value.

Unsets, however, are very limited in where they can appear. They
can only appear at the top-level of an expression, and any computation
done with them is invalid. For example, `SET list_column = [3, :var]`
is invalid if `:var` is bound to unset.

This causes the code to be littered with checks for unset, and there
are plenty of tests dedicated to catching unsets. However, a simpler
way is possible - prevent the infiltration of unsets at the point of
entry (when evaluating a bind variable expression), and introduce
guards to check for the few cases where unsets are allowed.

This is what this long patch does. It performs the following:

(general)

1. unset is removed from the possible values of cql3::raw_value and
   cql3::raw_value_view.

(external->cql3)

2. query_options is fortified with a vector of booleans,
   unset_bind_variable_vector, where each boolean corresponds to a bind
   variable index and is true when it is unset.
3. To avoid churn, two compatibility structs are introduced:
   cql3::raw_value{,_view}_vector_with_unset, which can be constructed
   from a std::vector<raw_value{,_view}>, which is what most callers
   have. They can also be constructed with explicit unset vectors, for
   the few cases they are needed.

(cql3->variables)

4. query_options::get_value_at() now throws if the requested bind variable
   is unset. This replaces all the throwing checks in expression evaluation
   and statement execution, which are removed.
5. A new query_options::is_unset() is added for the users that can tolerate
   unset; though it is not used directly.
6. A new cql3::unset_operation_guard class guards against unsets. It accepts
   an expression, and can be queried whether an unset is present. Two
   conditions are checked: the expression must be a singleton bind
   variable, and at runtime it must be bound to an unset value.
7. The modification_statement operations are split into two, via two
   new subclasses of cql3::operation. cql3::operation_no_unset_support
   ignores unsets completely. cql3::operation_skip_if_unset checks if
   an operand is unset (luckily all operations have at most one operand that
   tolerates unset) and applies unset_operation_guard to it.
8. The various sites that accept expressions or operations are modified
   to check for should_skip_operation(). These are the loops around
   operations in update_statement and delete_statement, and the checks
   for unset in attributes (LIMIT and PER PARTITION LIMIT).

(tests)

9. Many unset tests are removed. It's now impossible to enter an
   unset value into the expression evaluation machinery (there's
   just no unset value), so it's impossible to test for it.
10. Other unset tests now have to be invoked via bind variables,
   since there's no way to create an unset cql3::expr::constant.
11. Many tests have their exception message match strings relaxed.
   Since unsets are now checked very early, we don't know the context
   where they happen. It would be possible to reintroduce it (by adding
   a format string parameter to cql3::unset_operation_guard), but it
   seems not to be worth the effort. Usage of unsets is rare, and it is
   explicit (at least with the Python driver, an unset cannot be
   introduced by omission).

I tried as an alternative to wrap cql3::raw_value{,_view} (that doesn't
recognize unsets) with cql3::maybe_unset_value (that does), but that
caused huge amounts of churn, so I abandoned that in favor of the
current approach.

Closes #12517
2023-01-16 21:10:56 +02:00
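The point-of-entry check described in items 2, 4, and 5 above can be sketched as a Python analogy (hypothetical names; the real code is C++): the options object carries a parallel vector of unset flags, and value retrieval throws instead of littering the evaluation code with unset checks.

```python
class UnsetError(Exception):
    pass

class QueryOptions:
    """Holds bind-variable values plus a parallel vector of booleans
    marking which variables were bound to 'unset'."""
    def __init__(self, values, unset_flags):
        self._values = values
        self._unset = unset_flags

    def is_unset(self, i):
        """For the few callers that can tolerate unset."""
        return self._unset[i]

    def get_value_at(self, i):
        """Throw at the point of entry if the bind variable is unset."""
        if self._unset[i]:
            raise UnsetError(f"bind variable {i} is unset")
        return self._values[i]
```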
Kamil Braun
7510144fba Merge 'Add replace-node-first-boot option' from Benny Halevy
Allow replacing a node given its Host ID rather than its ip address.

This series adds a replace_node_first_boot option to db/config
and makes use of it in storage_service.

The new option takes priority over the legacy replace_address* options.
When the latter are used, a deprecation warning is printed.

Documentation updated accordingly.

And a cql unit_test is added.

Ref #12277

Closes #12316

* github.com:scylladb/scylladb:
  docs: document the new replace_node_first_boot option
  dist/docker: support --replace-node-first-boot
  db: config: describe replace_address* options as deprecated
  test: test_topology: test replace using host_id
  test: pylib: ServerInfo: add host_id
  storage_service: get rid of get_replace_address
  storage_service: is_replacing: rely directly on config options
  storage_service: pass replacement_info to run_replace_ops
  storage_service: pass replacement_info to booststrap
  storage_service: join_token_ring: reuse replacement_info.address
  storage_service: replacement_info: add replace address
  init: do not allow cfg.replace_node_first_boot of seed node
  db: config: add replace_node_first_boot option
2023-01-16 15:08:31 +01:00
Michał Sala
bbbe12af43 forward_service: fix timeout support in parallel aggregates
`forward_request` verb carried information about timeouts using
`lowres_clock::time_point` (that came from local steady clock
`seastar::lowres_clock`). The time point was produced on one node and
later compared against other node `lowres_clock`. That behavior
was wrong (`lowres_clock::time_point`s produced with different
`lowres_clock`s cannot be compared) and could lead to delayed or
premature timeout.

To fix this issue, `lowres_clock::time_point` was replaced with
`lowres_system_clock::time_point` in `forward_request` verb.
Representation to which both time point types serialize is the same
(64-bit integer denoting the count of elapsed nanoseconds), so it was
possible to do an in-place switch of those types using logic suggested
by @avikivity:
    - using steady_clock is just broken, so we aren't taking anything
        from users by breaking it further
    - once all nodes are upgraded, it magically starts to work

Closes #12529
2023-01-16 12:08:13 +02:00
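The fix can be illustrated in Python, with the wall clock standing in for `lowres_system_clock` (hypothetical helper names): an absolute epoch-based deadline is meaningful on every node, while a monotonic-clock reading is only meaningful on the machine that produced it.

```python
import time

def make_deadline_ns(timeout_s, now_ns=None):
    """Coordinator side: encode the deadline as wall-clock nanoseconds
    since the epoch, a representation every node can interpret.
    (Using time.monotonic() here would reproduce the bug: its epoch
    is machine-local.)"""
    if now_ns is None:
        now_ns = time.time_ns()
    return now_ns + int(timeout_s * 1_000_000_000)

def remaining_timeout_s(deadline_ns, now_ns=None):
    """Replica side: compare the received deadline against the local
    wall clock to recover the remaining timeout."""
    if now_ns is None:
        now_ns = time.time_ns()
    return max(0.0, (deadline_ns - now_ns) / 1_000_000_000)
```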
Botond Dénes
3d9ab1d9eb Merge 'Get recursive tasks' statuses with task manager api call' from Aleksandra Martyniuk
The PR adds an api call that returns the statuses of a given
task and all its descendants.

The parent-child tree is traversed in BFS order and the list of
statuses is returned to the user.

Closes #12317

* github.com:scylladb/scylladb:
  test: add test checking recursive task status
  api: get task statuses recursively
  api: change retrieve_status signature
2023-01-16 11:44:50 +02:00
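The BFS traversal described above can be sketched as follows (a minimal model with a hypothetical task-tree representation, not the task manager's actual API):

```python
from collections import deque

def statuses_recursively(root, children, status):
    """Return the status of the given task followed by those of all its
    descendants, in BFS order. 'children' maps task id -> child ids,
    'status' maps task id -> status string."""
    out, queue = [], deque([root])
    while queue:
        task = queue.popleft()
        out.append((task, status[task]))
        queue.extend(children.get(task, []))
    return out
```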
Tzach Livyatan
073f0f00c6 Add Scylla Summit 2023 in the top banner
Closes #12519
2023-01-16 08:05:20 +02:00
Avi Kivity
5a07641b95 Update python3 submodule (license file fix)
* tools/python3 548e860...279b6c1 (1):
  > create-relocatable-package: s/pyhton3-libs/python3-libs/
2023-01-15 17:59:27 +02:00
Benny Halevy
de3142e540 docs: document the new replace_node_first_boot option
And mention that replacing a node using the legacy
replace_addr* options is deprecated.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:41:44 +02:00
Benny Halevy
d4f1563369 dist/docker: support --replace-node-first-boot
And mention that replace_address_first_boot is deprecated

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:36:09 +02:00
Benny Halevy
1577aa8098 db: config: describe replace_address* options as deprecated
The replace_address options are still supported
But mention in their description that they are now deprecated
and the user should use replace_node_first_boot instead.

While at it fix a typo in ignore_dead_nodes_for_replace

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:36:09 +02:00
Benny Halevy
90faeedb77 test: test_topology: test replace using host_id
Add test cases exercising the --replace-node-first-boot option
by replacing nodes using their host_id rather
than ip address.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:36:09 +02:00
Benny Halevy
7d0d9e28f1 test: pylib: ServerInfo: add host_id
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:36:07 +02:00
Benny Halevy
db2b76beb5 storage_service: get rid of get_replace_address
It is unused now.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:34:29 +02:00
Benny Halevy
17f70e4619 storage_service: is_replacing: rely directly on config options
Rather than on get_replace_address, before we remove the latter.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:34:29 +02:00
Benny Halevy
7282d58d11 storage_service: pass replacement_info to run_replace_ops
So it won't need to call get_replace_address.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:34:09 +02:00
Benny Halevy
08598e4f64 storage_service: pass replacement_info to booststrap
So it won't need to call get_replace_address.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:30:48 +02:00
Benny Halevy
b863f7a75f storage_service: join_token_ring: reuse replacement_info.address
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:30:48 +02:00
Benny Halevy
add2f209b8 storage_service: replacement_info: add replace address
Populate replacement_info.address in prepare_replacement_info
as a first step towards getting rid of get_replace_address().

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:30:48 +02:00
Benny Halevy
75c8a5addc init: do not allow cfg.replace_node_first_boot of seed node
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:30:48 +02:00
Benny Halevy
32e79185d4 db: config: add replace_node_first_boot option
For replacing a node given its (now unique) Host ID.

The existing options for replace_address*
will be deprecated in the following patches
and eventually we will stop supporting them.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-13 18:30:48 +02:00
Tomasz Grabiec
abc43f97c9 Merge 'Simplify some Raft tables' from Kamil Braun
Rename `system.raft_config` to `system.raft_snapshot_config` to make it clearer
what the table stores.

Remove the `my_server_id` partition key column from
`system.raft_snapshot_config` and a corresponding column from
`system.raft_snapshots` which would store the Raft server ID of the local node.
It's unnecessary, all servers running on a given node in different groups will
use the same ID - the Raft ID of the node which is equal to its Host ID. There
will be no multiple servers running in a single Raft group on the same node.

Closes #12513

* github.com:scylladb/scylladb:
  db: system_keyspace: remove (my_)server_id column from RAFT_SNAPSHOTS and RAFT_SNAPSHOT_CONFIG
  db: system_keyspace: rename 'raft_config' to 'raft_snapshot_config'
2023-01-13 00:23:21 +01:00
Botond Dénes
4e41e7531c docs/dev/debugging.md: recommend open-coredump.sh for opening coredumps
Leave the guide for manual opening in though, the script might not work
in all cases.
Also update the version example, since we changed how development
versions look.

Closes #12511
2023-01-12 19:30:59 +02:00
Botond Dénes
ab8171ffd5 open-coredump.sh: handle dev versions
Like 5.2.0~dev, which really means master. Don't try to check out
branch-5.2 in this case (it doesn't exist yet); check out master instead.

Closes #12510
2023-01-12 19:28:58 +02:00
Kamil Braun
be390285b6 db: system_keyspace: remove (my_)server_id column from RAFT_SNAPSHOTS and RAFT_SNAPSHOT_CONFIG
A single node will run a single Raft server in any given Raft group,
so this column is not necessary.
2023-01-12 16:48:50 +01:00
Kamil Braun
bed555d1e5 db: system_keyspace: rename 'raft_config' to 'raft_snapshot_config'
Make it clear that the table stores the snapshot configuration, which is
not necessarily the currently operating configuration (the last one
appended to the log).

In the future we plan to have a separate virtual table for showing the
currently operating configuration, perhaps we will call it
`system.raft_config`.
2023-01-12 16:21:26 +01:00
Botond Dénes
f87e3993ef Merge 'configure.py: a bunch of clean-up changes' from Michał Chojnowski
The planned integration of cross-module optimizations in scylladb/scylladb-enterprise requires several changes to `configure.py`. To minimize the divergence between the `configure.py`s of both repositories, this series upstreams some of these changes to scylladb/scylladb.

The changes mostly remove dead code and fix some traps for the unaware.

Closes #12431

* github.com:scylladb/scylladb:
  configure.py: prevent deduplication of seastar compile options
  configure.py: rename clang_inline_threshold()
  configure.py: rework the seastar_cflags variable
  configure.py: hoist the pkg_config() call for seastar-testing.pc
  configure.py: unify the libs variable for tests and non-tests
  configure.py: fix indentation
  configure.py: remove a stale code path for .a artifacts
2023-01-12 16:40:02 +02:00
Wojciech Mitros
082bfea187 rust: use depfile and Cargo.lock to avoid building rust when unnecessary
Currently, we call cargo build every time we build scylla, even
when no rust files have been changed.
This is avoided by adding a depfile to the ninja rule for the rust
library.
The dep-info file is generated by default during cargo build,
but it uses the full paths of all dependencies that it includes,
while we use relative paths. This is fixed by specifying
CARGO_BUILD_DEP_INFO_BASEDIR='.', which causes the current
path to be subtracted from all generated paths.
Instead of using 'always' to specify when to run the cargo
build, a dependency on Cargo.lock is added in addition to the
depfile. As a result, the rust files are recompiled not only
when the source files included in the depfile are modified,
but also when some rust dependency is updated.
Cargo may put an old cached file as the result of the build even
when Cargo.lock was recently updated. Because of that, the
build result may be older than the Cargo.lock file even
if the build was just performed. This may cause ninja to rebuild
the file every following time. To avoid this, we 'touch' the
build result, so that its last modification time is up to date.
Because the dependency on Cargo.lock was added, the new command
for the build does not modify it. Instead, the developer must
update it when modifying the dependencies - the docs are updated
to reflect that.

Closes #12489

Fixes #12508
2023-01-12 14:44:11 +02:00
Kefu Chai
77baea2add docs/architecture: fix typo of SyllaDB
s/SyllaDB/ScyllaDB/

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12505
2023-01-12 12:25:53 +02:00
Michał Chojnowski
1ff4abef4a configure.py: prevent deduplication of seastar compile options
In its infinite wisdom, CMake deduplicates the options passed
to `target_compile_options`, making it impossible to pass options which require
duplication, such as -mllvm.
Passing e.g.
`-mllvm;-pgso=false;-mllvm;-inline-threshold=2500` invokes the compiler
with `-mllvm -pgso=false -inline-threshold=2500`, breaking the options.

As a workaround, CMake added the `SHELL:` syntax, which makes it possible to
pass the list of options not as a CMake list, but as a shell-quoted string.
Let's use it, so we can pass multiple -mllvm options.
2023-01-12 11:24:10 +01:00
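A minimal CMake fragment showing the workaround (the target name is hypothetical; `SHELL:` is standard CMake since 3.12):

```cmake
# Without SHELL:, CMake would deduplicate the repeated -mllvm,
# invoking the compiler with "-mllvm -pgso=false -inline-threshold=2500".
target_compile_options(scylla_objects PRIVATE
  "SHELL:-mllvm -pgso=false"
  "SHELL:-mllvm -inline-threshold=2500")
```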
Michał Chojnowski
85facefe45 configure.py: rename clang_inline_threshold()
There's a global variable (the CLI argument) with the same name.
Rename one of the two to avoid accidental mixups.
2023-01-12 11:24:10 +01:00
Michał Chojnowski
d9de78f6d3 configure.py: rework the seastar_cflags variable
The name of this variable is misleading. What it really does is pass flags to
static libraries compiled by us, not just to seastar.
We will need this capability to implement cross-artifact optimizations in our
build.
We will also need to pass linker flags, and we will need to vary those flags
depending on the build mode.

This patch splits the seastar_cflags variable into per-mode lib_cflags and
lib_ldflags variables. It shouldn't change the resulting build.ninja for now,
but will be needed by later planned patches.
2023-01-12 11:24:10 +01:00
Michał Chojnowski
ee462a9d3c configure.py: hoist the pkg_config() call for seastar-testing.pc
Put the pkg_config() for seastar-testing.pc in the same area as the call
for seastar.pc, outside of the loop.
This is a cosmetic change aimed at making following commits cleaner.
2023-01-12 11:24:10 +01:00
Michał Chojnowski
c9aeeeae11 configure.py: unify the libs variable for tests and non-tests
This is a cosmetic change aimed at make following commits in the same area
cleaner.
2023-01-12 11:24:09 +01:00
Michał Chojnowski
10ac881ef1 configure.py: fix indentation
Fix indentation after the preceding commit.
2023-01-12 11:23:32 +01:00
Michał Chojnowski
be419adaf8 configure.py: remove a stale code path for .a artifacts
Scylla hasn't had `.a` artifacts for a long time (since the Urchin days,
I believe), and the piece of code responsible for them is stale and untested.
Remove it.
2023-01-12 11:22:49 +01:00
Botond Dénes
8a86f8d4ef gdbinit: add ignore clause for SIG35
Another real-time signal often raised in Scylla, making debugging a live
process annoying.

Closes #12507
2023-01-12 12:13:04 +02:00
Avi Kivity
7a8a442c1e transport: drop some dead code around v1 and v2 protocols
In 424dbf43f ("transport: drop cql protocol versions 1 and 2"),
we dropped support for protocols 1 and 2, but some code remains
that checks for those versions. It is now dead code, so remove it.

Closes #12497
2023-01-12 12:52:19 +02:00
Avi Kivity
4de2524a42 build: update toolchain for scylla-driver package
Pull updated scylla-driver package, fixing an IP change related
bug [1].

[1] https://github.com/scylladb/python-driver/issues/198

Closes #12501
2023-01-11 22:16:35 +02:00
Nadav Har'El
7192283172 Merge 'doc: add the upgrade guide for ScyllaDB 5.1 to ScyllaDB Enterprise 2022.2' from Anna Stuchlik
Fix https://github.com/scylladb/scylladb/issues/12315

This PR adds the upgrade guide from ScyllaDB 5.1 to ScyllaDB Enterprise 2022.2.
Instead of adding separate guides per platform, I've merged the information to create one platform-agnostic guide, similar to what we did for [OSS->OSS](https://docs.scylladb.com/stable/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/) and [Enterprise->Enterprise ](https://github.com/scylladb/scylladb/pull/12339)guides.

Closes #12450

* github.com:scylladb/scylladb:
  doc: add the new upgrade guide to the toctree and fix its name
  docs: add the upgrade guide from ScyllaDB 5.1 to ScyllaDB Enterprise 2022.2
2023-01-11 21:01:34 +02:00
Avi Kivity
cb2cb8a606 utils: small_vector: mark throw_out_of_range() const
It can be called from the const version of small_vector::at.

Closes #12493
2023-01-11 20:58:53 +02:00
Nadav Har'El
04d6402780 docs: cql-extensions.md: explain our NULL handling
Our handling of NULLs in expressions is different from Cassandra's,
and more uniform. For example, the filter "WHERE x = NULL" is an
error in Cassandra, but supported in Scylla. Let's explain how and why.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12494
2023-01-11 20:56:50 +02:00
Wojciech Mitros
95031074a5 configure: fix the order of rust header generation
Currently, no rule enforces that the cxx.h rust header
is generated before compiling the .cc files generated
from rust. This patch adds this dependency.

Closes #12492
2023-01-11 16:55:53 +02:00
Botond Dénes
210738c9ce Merge 'test.py: improve logging' from Kamil Braun
Make it easy to see which clusters are operated on by which tests in which build modes and so on.
Add some additional logs.

These improvements would have saved me a lot of debugging time if I had had them last week, and we would have arrived at https://github.com/scylladb/scylladb/pull/12482 much faster.

Closes #12483

* github.com:scylladb/scylladb:
  test.py: harmonize topology logs with test.py format
  test/pylib: additional logging during cluster setup
  test/pylib: prefix cluster/manager logs with the current test name
  test/pylib: pool: pass *args and **kwargs to the build function from get()
  test.py: include mode in ScyllaClusterManager logs
2023-01-11 16:32:56 +02:00
Aleksandra Martyniuk
fcb3f76e78 test: add test checking recursive task status
Rest api test checking whether task manager api returns recursive tasks'
statuses properly in BFS order.
2023-01-11 12:34:17 +01:00
Aleksandra Martyniuk
6b79c92cb7 api: get task statuses recursively
Sometimes, to debug a task manager module, we may want to inspect
the whole tree of descendants of a given task.

To make it easier, an api call getting a list of statuses of the requested
task and all its descendants in BFS order is added.
2023-01-11 12:34:06 +01:00
Konstantin Osipov
f3440240ee test.py: harmonize topology logs with test.py format
We need millisecond resolution in the log to be able to
correlate test log with test.py log and scylla logs. Harmonize
the log format for tests which actively manage scylla servers.
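One way to get millisecond resolution with Python's `logging` module (an illustration, not necessarily the exact format string test.py uses):

```python
import logging

# %(msecs)03d appends the millisecond part that the default %(asctime)s
# datefmt would otherwise drop.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
    fmt="%(asctime)s.%(msecs)03d %(levelname)s> %(message)s",
    datefmt="%H:%M:%S"))
logger = logging.getLogger("topology")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("server started")  # e.g. 17:41:43.531 INFO> server started
```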
2023-01-11 10:09:42 +01:00
Kamil Braun
79712185d5 test/pylib: additional logging during cluster setup
This would have saved me a lot of debugging time.
2023-01-11 10:09:42 +01:00
Kamil Braun
4f7e5ee963 test/pylib: prefix cluster/manager logs with the current test name
The log file produced by test.py combines logs coming from multiple
concurrent test runs. Each test has its own log file as well, but this
"global" log file is useful when debugging problems with topology tests,
since many events related to managing clusters are stored there.

Make the logs easier to read by including information about the test case
that's currently performing operations such as adding new servers to
clusters and so on. This includes the mode, test run name and the name
of the test case.

We do this by using custom `Logger` objects (instead of calling
`logging.info` etc. which uses the root logger) with `LoggerAdapter`s
that include the prefixes. A bit of boilerplate 'plumbing' through
function parameters is required but it's mostly straightforward.

This doesn't apply to all events, e.g. boost test cases which don't
setup a "real" Scylla cluster. These events don't have additional
prefixes.

Example:
```

17:41:43.531 INFO> [dev/topology.test_topology.1] Cluster ScyllaCluster(name: 7a414ffc-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(1, 127.40.246.1, 29c4ec73-8912-45ca-ae19-8bfda701a6b5), ScyllaServer(4, 127.40.246.4, 75ae2afe-ff9b-4760-9e19-cd0ed8d052e7), ScyllaServer(7, 127.40.246.7, 67a27df4-be63-4b4c-a70c-aeac0506304f), stopped: ) adding server...
17:41:43.531 INFO> [dev/topology.test_topology.1] installing Scylla server in /home/kbraun/dev/scylladb/testlog/dev/scylla-10...
17:41:43.603 INFO> [dev/topology.test_topology.1] starting server at host 127.40.246.10 in scylla-10...
17:41:43.614 INFO> [dev/topology.test_topology.2] Cluster ScyllaCluster(name: 7a497fce-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(2, 127.40.246.2, f59d3b1d-efbb-4657-b6d5-3fa9e9ef786e), ScyllaServer(5, 127.40.246.5, 9da16633-ce53-4d32-8687-e6b4d27e71eb), ScyllaServer(9, 127.40.246.9, e60c69cd-212d-413b-8678-dfd476d7faf5), stopped: ) adding server...
17:41:43.614 INFO> [dev/topology.test_topology.2] installing Scylla server in /home/kbraun/dev/scylladb/testlog/dev/scylla-11...
17:41:43.670 INFO> [dev/topology.test_topology.2] starting server at host 127.40.246.11 in scylla-11...
```
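The prefixing approach described above can be sketched with a `logging.LoggerAdapter` (the prefix value here is illustrative):

```python
import logging

# A LoggerAdapter whose process() prepends the current test's
# mode/suite/case name to every message.
class PrefixAdapter(logging.LoggerAdapter):
    def process(self, msg, kwargs):
        # Prepend the per-test prefix stored in `extra`.
        return f"[{self.extra['prefix']}] {msg}", kwargs

base = logging.getLogger("cluster")
test_logger = PrefixAdapter(base, {"prefix": "dev/topology.test_topology.1"})

msg, _ = test_logger.process("adding server...", {})
assert msg == "[dev/topology.test_topology.1] adding server..."
test_logger.info("adding server...")  # emits the prefixed message
```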
2023-01-11 10:09:39 +01:00
Avi Kivity
de0c31b3b6 cql3: query_options: simplify batch query_options constructor
The batch constructor uses an unnecessarily complicated template,
where in fact it only needs vector<vector<raw_value | raw_value_view>>.

Simplify the constructor to allow exactly that. Delete some confusing
comments around it.

Closes #12488
2023-01-11 07:54:54 +02:00
Kamil Braun
2bda0f9830 test/pylib: pool: pass *args and **kwargs to the build function from get()
This will be used to specify a custom logger when building new clusters
before starting tests, allowing to easily pinpoint which tests are
waiting for clusters to be built and what's happening to these
particular clusters.
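A minimal sketch of the idea, with hypothetical names rather than the real test/pylib Pool API: when the pool is empty, `get()` forwards its arguments to the build function.

```python
import asyncio

class Pool:
    """Illustrative object pool; builds a new object on demand."""
    def __init__(self, build):
        self._build = build
        self._free = []

    async def get(self, *args, **kwargs):
        if self._free:
            return self._free.pop()
        # Forward *args/**kwargs so callers can e.g. pass a custom logger.
        return await self._build(*args, **kwargs)

async def build_cluster(logger=None):
    return {"logger": logger}

async def main():
    pool = Pool(build_cluster)
    cluster = await pool.get(logger="custom-logger")
    assert cluster["logger"] == "custom-logger"

asyncio.run(main())
```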
2023-01-10 17:41:54 +01:00
Kamil Braun
ff2c030bf9 test.py: include mode in ScyllaClusterManager logs
The logs often mention the test run and the current test case in a given
run, such as `test_topology.1` and
`test_topology.1::test_add_server_add_column`. However, if we run
test.py in multiple modes, the different modes might be running the same
test case and the logs become confusing. To disambiguate, prefix the
test run/case names with the mode name.

Example:
```
Leasing Scylla cluster ScyllaCluster(name: 7a414ffc-903c-11ed-bafb-f4d108a9e4a3, running: ScyllaServer(1, 127.40.246.1, 29c4ec73-8912-45ca-ae19-8bfda701a6b5), ScyllaServer(4, 127.40.246.4, 75ae2afe-ff9b-4760-9e19-cd0ed8d052e7), ScyllaServer(7, 127.40.246.7, 67a27df4-be63-4b4c-a70c-aeac0506304f), stopped: ) for test dev/topology.test_topology.1::test_add_server_add_column
```
2023-01-10 17:41:54 +01:00
Wojciech Mitros
e558c7d988 functions: initialize aggregates on scylla start
Currently, UDAs can't be reused if Scylla has been
restarted since they have been created. This is
caused by the missing initialization of saved
UDAs, which should have inserted them into the
cql3::functions::functions::_declared map, which
should store all (user-)created functions and
aggregates.

This patch adds the missing implementation in a way
that's analogous to how UDFs are inserted into
the _declared map.

Fixes #11309
2023-01-10 17:44:18 +02:00
Wojciech Mitros
d1b809754c database: wrap lambda coroutines used as arguments in coroutine::lambda
Using lambda coroutines as arguments can lead to a use-after-free.
Currently, the way these lambdas were used in do_parse_schema_tables
did not lead to such a problem, but it's better to be safe and wrap
them in coroutine::lambda(), so that they can't lead to this problem
as long as we ensure that the lambda finishes in the
do_parse_schema_tables() statement (for example using co_await).

Closes #12487
2023-01-10 17:24:52 +02:00
Nadav Har'El
0edb090c67 test/cql-pytest: add simple tests for SELECT DISTINCT
This patch adds a few simple functional tests for the SELECT DISTINCT
feature, and how it interacts with other features, especially GROUP BY.

2 of the 5 new tests are marked xfail, and reproduce one old and one
newly-discovered issue:

Refs #5361: LIMIT doesn't work when using GROUP BY (the test here uses
            LIMIT and GROUP BY together with SELECT DISTINCT, so the
            LIMIT isn't honored).

Refs #12479: SELECT DISTINCT doesn't refuse GROUP BY with clustering
             column.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12480
2023-01-10 13:29:26 +02:00
Michał Radwański
dcab289656 boost/mvcc_test: use failure_injecting_allocation_strategy where it is meant to
In test_apply_is_atomic, a basic form of exception testing is used.
There is failure_injecting_allocation_strategy, which however is not
used for any allocation, since for some reason,
`with_allocator(r.allocator()` is used instead of
`with_allocator(alloc`. Fix that.

Closes #12354
2023-01-10 12:01:36 +01:00
Tomasz Grabiec
ebcd736343 cache: Fix undefined behavior when populating with non-full keys
Regression introduced in 23e4c8315.

position_in_partition::after_key() returns a view_and_holder; it triggers
undefined behavior when the key is not full, because the holder is moved,
which invalidates the view.

Fixes #12367

Closes #12447
2023-01-10 12:51:54 +02:00
Jan Ciolek
8d7e35caef cql3: expr: remove reference to temporary in get_rhs_receiver
The function underlying_type() returns a data_type by value,
but the code assigned it to a reference.

At first I was sure this is an error
(assigning a temporary value to a reference), but it turns out
that this is most likely correct due to C++ lifetime
extension rules.

I think it's better to avoid such unintuitive tricks.
Assigning to a value makes it clearer that the code
is correct and there are no dangling references.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #12485
2023-01-10 09:42:49 +02:00
Raphael "Raph" Carvalho
407c7fdaf2 docs: Fix command to create a symbolic link to relocatable pkg dir
Closes #12481
2023-01-10 07:09:14 +02:00
Kamil Braun
822410c49b test/pylib: scylla_cluster: release IPs when cluster is no longer needed
With sufficiently many test cases we would eventually run out of IP
addresses, because IPs (which are leased from a global host registry)
would only be released at the end of an entire test suite.

In fact we already hit this during next promotions, causing much pain
indeed.

Release IPs when a cluster, after being marked dirty, is stopped and
thrown away.
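The lease/release life cycle can be sketched as follows (class and method names are illustrative, not the real test/pylib API):

```python
class HostRegistry:
    """Illustrative global registry that leases IP addresses to clusters."""
    def __init__(self, hosts):
        self._free = list(hosts)
        self._leased = set()

    def lease(self):
        ip = self._free.pop(0)
        self._leased.add(ip)
        return ip

    def release(self, ip):
        self._leased.discard(ip)
        self._free.append(ip)

class Cluster:
    def __init__(self, registry, n):
        self._registry = registry
        self.ips = [registry.lease() for _ in range(n)]

    def stop_and_discard(self):
        # Releasing here (instead of at suite teardown) keeps the pool of
        # addresses from running out across many test cases.
        for ip in self.ips:
            self._registry.release(ip)
        self.ips = []

registry = HostRegistry([f"127.40.246.{i}" for i in range(1, 4)])
cluster = Cluster(registry, 3)
cluster.stop_and_discard()
assert len(registry._free) == 3  # all IPs are available again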

Closes #12482
2023-01-10 06:59:41 +02:00
Avi Kivity
e71e1dc964 Merge 'tools/scylla-sstable: add lua scripting support' from Botond Dénes
Introduce a new "script" operation, which loads a script from the specified path, then feeds the mutation fragment stream to it. The script can then extract, process and present information from the sstable as it wishes.
For now only Lua scripts are supported for the simple reason that Lua is easy to write bindings for, it is simple and lightweight and more importantly we already have Lua included in the Scylla binary as it is used as the implementation language for UDF/UDA. We might consider WASM support in the future, but for now we don't have any language support in WASM available.

Example:
```lua
function new_stats(key)
    return {
        partition_key = key,
        total = 0,
        partition = 0,
        static_row = 0,
        clustering_row = 0,
        range_tombstone_change = 0,
    };
end

total_stats = new_stats(nil);

function inc_stat(stats, field)
    stats[field] = stats[field] + 1;
    stats.total = stats.total + 1;
    total_stats[field] = total_stats[field] + 1;
    total_stats.total = total_stats.total + 1;
end

function on_new_sstable(sst)
    max_partition_stats = new_stats(nil);
    if sst then
        current_sst_filename = sst.filename;
    else
        current_sst_filename = nil;
    end
end

function consume_partition_start(ps)
    current_partition_stats = new_stats(ps.key);
    inc_stat(current_partition_stats, "partition");
end

function consume_static_row(sr)
    inc_stat(current_partition_stats, "static_row");
end

function consume_clustering_row(cr)
    inc_stat(current_partition_stats, "clustering_row");
end

function consume_range_tombstone_change(crt)
    inc_stat(current_partition_stats, "range_tombstone_change");
end

function consume_partition_end()
    if current_partition_stats.total > max_partition_stats.total then
        max_partition_stats = current_partition_stats;
    end
end

function on_end_of_sstable()
    if current_sst_filename then
        print(string.format("Stats for sstable %s:", current_sst_filename));
    else
        print("Stats for stream:");
    end
    print(string.format("\t%d fragments in %d partitions - %d static rows, %d clustering rows and %d range tombstone changes",
        total_stats.total,
        total_stats.partition,
        total_stats.static_row,
        total_stats.clustering_row,
        total_stats.range_tombstone_change));
    print(string.format("\tPartition with max number of fragments (%d): %s - %d static rows, %d clustering rows and %d range tombstone changes",
        max_partition_stats.total,
        max_partition_stats.partition_key,
        max_partition_stats.static_row,
        max_partition_stats.clustering_row,
        max_partition_stats.range_tombstone_change));
end
```
Running this script will yield the following:
```
$ scylla sstable script --script-file fragment-stats.lua --system-schema system_schema.columns /var/lib/scylla/data/system_schema/columns-24101c25a2ae3af787c1b40ee1aca33f/me-1-big-Data.db
Stats for sstable /var/lib/scylla/data/system_schema/columns-24101c25a2ae3af787c1b40ee1aca33f//me-1-big-Data.db:
        397 fragments in 7 partitions - 0 static rows, 362 clustering rows and 28 range tombstone changes
        Partition with max number of fragments (180): system - 0 static rows, 179 clustering rows and 0 range tombstone changes
```

Fixes: https://github.com/scylladb/scylladb/issues/9679

Closes #11649

* github.com:scylladb/scylladb:
  tools/scylla-sstable: consume_reader(): improve pause heuristics
  test/cql-pytest/test_tools.py: add test for scylla-sstable script
  tools: add scylla-sstable-scripts directory
  tools/scylla-sstable: remove custom operation
  tools/scylla-sstable: add script operation
  tools/sstable: introduce the Lua sstable consumer
  dht/i_partitioner.hh: ring_position_ext: add weight() accessor
  lang/lua: export Scylla <-> lua type conversion methods
  lang/lua: use correct lib name for string lib
  lang/lua: fix typo in aligned_used_data (meant to be user_data)
  lang/lua: use lua_State* in Scylla type <-> Lua type conversions
  tools/sstable_consumer: more consistent method naming
  tools/scylla-sstable: extract sstable_consumer interface into own header
  tools/json_writer: add accessor to underlying writer
  tools/scylla-sstable: fix indentation
  tools/scylla-sstable: export mutation_fragment_json_writer declaration
  tools/scylla-sstable: mutation_fragment_json_writer un-implement sstable_consumer
  tools/scylla-sstable: extract json writing logic from json_dumper
  tools/scylla-sstable: extract json_writer into its own header
  tools/scylla-sstable: use json_writer::DataKey() to write all keys
  tools/scylla-types: fix use-after-free on main lambda captures
2023-01-09 20:54:42 +02:00
Raphael S. Carvalho
05ffb024bb replica: Kill table::calculate_shard_from_sstable_generation()
Inferring the shard from the generation is long gone. We still use it in
some scripts, but it's no longer needed in Scylla when loading
SSTables, and it also conflicts with the ongoing work on UUID-based
generations.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12476
2023-01-09 20:17:57 +02:00
Takuya ASADA
548c9e36a1 main: add tcp_timestamps sanity check
Check net.ipv4.tcp_timestamps and show a warning message when it's not set to 1.
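A sketch of such a check (the sysctl path is the real Linux one; the function and warning text are illustrative, and the reader is injectable for testing):

```python
# Illustrative sanity check for the net.ipv4.tcp_timestamps sysctl.

def check_tcp_timestamps(read_sysctl=None):
    """Return a warning string when tcp_timestamps is not 1, else None."""
    def default_read():
        with open("/proc/sys/net/ipv4/tcp_timestamps") as f:
            return f.read().strip()
    value = (read_sysctl or default_read)()
    if value != "1":
        return f"Warning: net.ipv4.tcp_timestamps is set to {value}, not 1"
    return None

assert check_tcp_timestamps(lambda: "1") is None
assert "not 1" in check_tcp_timestamps(lambda: "0")
```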

Fixes #12144

Closes #12199
2023-01-09 19:08:21 +02:00
Nadav Har'El
d6e6820f33 Merge 'Drop support for cql binary protocols versions 1 and 2' from Avi Kivity
The CQL binary protocol version 3 was introduced in 2014. All Scylla
versions support it, as do Cassandra versions 2.1 and newer.

Versions 1 and 2 have 16-bit collection sizes, while protocol 3 and newer
use 32-bit collection sizes.

Unfortunately, we implemented support for multiple serialization formats
very intrusively, by pushing the format everywhere. This avoids the need
to re-serialize (sometimes) but is quite obnoxious. It's also likely to be
broken, since it's almost untested and it's too easy to write
cql_serialization_format::internal() instead of propagating the client
specified value.

Since protocols 1 and 2 have been obsolete for 9 years, just drop them. It's
easy to verify that they are no longer in use on a running system by
examining the `system.clients` table before upgrade.

Fixes #10607

Closes #12432

* github.com:scylladb/scylladb:
  treewide: drop cql_serialization_format
  cql: modification_statement: drop protocol check for LWT
  transport: drop cql protocol versions 1 and 2
2023-01-09 18:52:41 +02:00
Botond Dénes
bd42da6e69 tools/scylla-sstable: consume_reader(): improve pause heuristics
The consume loop had some heuristics in place to determine whether after
pausing, the consumer wishes to skip just the partition or the remaining
content of the sstable. This heuristic was flawed, so replace it with a
non-heuristic method: track the last consumed fragment and look at this
to determine what should be done.
2023-01-09 09:46:57 -05:00
Botond Dénes
1d222220e0 test/cql-pytest/test_tools.py: add test for scylla-sstable script
To test the script operation, we use some of the example scripts from
the example directory. Namely, dump.lua and slice.lua. These two scripts
together have a very good coverage of the entire script API. Testing
their functionality therefore also provides a good coverage of the lua
bindings. A further advantage is that since both scripts dump output in
identical format to that of the data-dump operation, it is trivial to do
a comparison against this already tested operation.
A targeted test is written for the sstable skip functionality of the
consumer API.
2023-01-09 09:46:57 -05:00
Botond Dénes
ace42202df tools: add scylla-sstable-scripts directory
To be the home of example scripts for scylla-sstable. For now only a
README.md is added describing the directory's purpose and with links to
useful resources.
One example script is added in this patch, more will come later.
2023-01-09 09:46:57 -05:00
Botond Dénes
7b40463f29 tools/scylla-sstable: remove custom operation
We now have a script operation, the custom operation (poor man's script
operation) has no reason to exist anymore.
2023-01-09 09:46:57 -05:00
Botond Dénes
e5071fdeab tools/scylla-sstable: add script operation
Loads the script from the specified path, then feeds the mutation
fragment stream to it. For now only Lua scripts are supported for the
simple reason that Lua is easy to write bindings for, it is simple and
lightweight and more importantly we already have Lua included in the
Scylla binary as it is used as the implementation language for UDF/UDA.
We might consider WASM support in the future, but for now we don't have
any language support in WASM available.
2023-01-09 09:46:57 -05:00
Botond Dénes
9dd5107919 tools/sstable: introduce the Lua sstable consumer
The Lua sstable consumer loads a script from the specified path then
feeds the mutation fragment stream to the script via the
sstable_consumer methods, each method of which the script is allowed to
define, effectively overloading the virtual method in Lua.
This allows for very wide and flexible customization opportunities for
what to extract from sstables and how to process and present them,
without the need to recompile the scylla-sstable tool.
2023-01-09 09:46:57 -05:00
Botond Dénes
50b155e706 dht/i_partitioner.hh: ring_position_ext: add weight() accessor 2023-01-09 09:46:57 -05:00
Botond Dénes
8699fe5001 lang/lua: export Scylla <-> lua type conversion methods
Currently hidden in lang/lua.cc, declare these in a header so others can
use it.
2023-01-09 09:46:57 -05:00
Botond Dénes
e9a52837cf lang/lua: use correct lib name for string lib
AFAIK the mistake had no real consequence, but still it is nicer to have
it correct.
2023-01-09 09:46:57 -05:00
Botond Dénes
76663d7774 lang/lua: fix typo in aligned_used_data (meant to be user_data) 2023-01-09 09:46:57 -05:00
Botond Dénes
943fc3b6f3 lang/lua: use lua_State* in Scylla type <-> Lua type conversions
Instead of the lua_slice_state which is local to this file. We want to
reuse the Scylla type <-> Lua type conversion functions but for that
they have to use the more generic lua_State*. No functionality or
convenience is lost with the switch, the code didn't make use of the
other fields bundled in lua_slice_state.
2023-01-09 09:46:57 -05:00
Botond Dénes
8045751867 tools/sstable_consumer: more consistent method naming
Use `consume_` consistently across the entire interface, instead of having
some methods with `on_` and others with `consume_` prefixes.
2023-01-09 09:46:57 -05:00
Botond Dénes
8e117501ac tools/scylla-sstable: extract sstable_consumer interface into own header
So it can be used in code outside scylla-sstable.cc. This source file is
quite large already, and as we have yet another large chunk of code to
add, we want to add it in a separate file.
2023-01-09 09:46:57 -05:00
Botond Dénes
9b1c486051 tools/json_writer: add accessor to underlying writer 2023-01-09 09:46:57 -05:00
Botond Dénes
cfb5afbe9b tools/scylla-sstable: fix indentation
Left broken by previous patches.
2023-01-09 09:46:57 -05:00
Botond Dénes
d42b0bb5d5 tools/scylla-sstable: export mutation_fragment_json_writer declaration
To json_writer.hh. Method definitions are left in scylla-sstable.cc.
Indentation is left broken, will be fixed by the next patch.
2023-01-09 09:46:57 -05:00
Botond Dénes
517135e155 tools/scylla-sstable: mutation_fragment_json_writer un-implement sstable_consumer
There is no point in the former implementing said interface. For one it
is a futurized interface, which is not needed for something writing to
stdout. Rename the methods to follow the naming convention of rjson
writers more closely.
2023-01-09 09:46:57 -05:00
Botond Dénes
0ee1c6ca57 tools/scylla-sstable: extract json writing logic from json_dumper
We want to split this class into two parts: one with the actual logic
converting mutation fragments to json, and a wrapper over this one,
which implements the sstable_consumer interface.
As a first step we extract the class as is (no changes) and just forward
all calls from the now-empty wrapper to it.
2023-01-09 09:46:57 -05:00
Botond Dénes
55ef0ed421 tools/scylla-sstable: extract json_writer into its own header
Other source files will want to use it soon.
2023-01-09 09:46:57 -05:00
Botond Dénes
8623818a8d tools/scylla-sstable: use json_writer::DataKey() to write all keys
This method was renamed from its previous name of PartitionKey. Since in
json partition keys and clustering keys look alike, with the only
difference being that the former may also have a token, it makes sense to have
a single method to write them (with an optional token parameter). This
was the case at some point, json_dumper::write_key() taking this role.
However at a later point, json_writer::PartitionKey() was introduced and
now the code uses both. Standardize on the latter and give it a more
generic name.
2023-01-09 09:46:57 -05:00
Botond Dénes
602fca0a12 tools/scylla-types: fix use-after-free on main lambda captures
The main lambda of scylla-types, the one passed to app_template::run()
was recently made a coroutine. app_template::run() however doesn't keep
this lambda alive, and hence after the first suspension point, accessing
the lambda's captures triggers use-after-free.
The simple fix is to convert the coroutine into a continuation chain.
2023-01-09 09:46:57 -05:00
Tomasz Grabiec
f97268d8f2 row_cache: Fix violation of the "oldest version are evicted first" when evicting last dummy
Consider the following MVCC state of a partition:

   v2: ==== <7> [entry2] ==== <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

Where === means a continuous range and --- means a discontinuous range.

After two LRU items are evicted (entry1 and entry2), we will end up with:

   v2: ---------------------- <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

This will cause readers to incorrectly think there are no rows before
entry <9>, because the range is continuous in v1, and continuity of a
snapshot is a union of continuous intervals in all versions. The
cursor will see the interval before <9> as continuous and the reader
will produce no rows.

This is only temporary, because current MVCC merging rules are such
that the flag on the latest entry wins, so we'll end up with this once
v1 is no longer needed:

   v2: ---------------------- <9> ===== <last dummy>

...and the reader will go to sstables to fetch the evicted rows before
entry <9>, as expected.

The bug is in rows_entry::on_evicted(), which treats the last dummy
entry in a special way, and doesn't evict it, and doesn't clear the
continuity by omission.

The situation is not easy to trigger because it requires certain
eviction pattern concurrent with multiple reads of the same partition
in different versions, so across memtable flushes.

Closes #12452
2023-01-09 16:10:52 +02:00
Avi Kivity
1bb1855757 Merge 'replica/database: fix read related metrics' from Botond Dénes
Sstable read related metrics have been broken for a long time now. First, the introduction of inactive reads (https://github.com/scylladb/scylladb/issues/1865) diluted this metric, as it now also contained inactive reads (contrary to the metric's name). Then, after moving the semaphore in front of the cache (3d816b7c1), this metric became completely broken, as it now contains all kinds of reads: disk, in-memory and inactive ones too.
This series aims to remedy this:
* `scylla_database_active_reads` is fixed to only include active reads.
* `scylla_database_active_reads_memory_consumption` is renamed to `scylla_database_reads_memory_consumption` and its description is brought up-to-date.
* `scylla_database_disk_reads` is added to track current reads that go to disk.
* `scylla_database_sstables_read` is added to track the number of sstables read currently.

Fixes: https://github.com/scylladb/scylladb/issues/10065

Closes #12437

* github.com:scylladb/scylladb:
  replica/database: add disk_reads and sstables_read metrics
  sstables: wire in the reader_permit's sstable read count tracking
  reader_concurrency_semaphore: add disk_reads and sstables_read stats
  replica/database: fix active_reads_memory_consumption_metric
  replica/database: fix active_reads metric
2023-01-09 12:18:49 +02:00
Pavel Emelyanov
e20738cd7d azure_snitch: Handle empty zone returned from IMDS
The Azure metadata API may sometimes return an empty zone. If that happens,
shard-0 gets an empty string as its rack, but propagates UNKNOWN_RACK to
other shards.

An empty zone response should be handled regardless.

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12274
2023-01-09 11:57:45 +02:00
Nadav Har'El
2d845b6244 test/cql-pytest: a test for more than one equality in WHERE
Cassandra refuses a request with more than one equality relation to the
same column, for example

    DELETE FROM tbl WHERE partitionKey = ? AND partitionKey = ?

It complains that

    partitionkey cannot be restricted by more than one relation if it
    includes an Equal

Currently, Scylla doesn't consider such requests an error. Whether or
not we should be compatible with Cassandra here is discussed in
issue #12472. But as long as we do accept this query, we should be
sure we do the right thing: "WHERE p = 1 AND p = 2" should match
nothing (not the first, or last, value being tested), and "WHERE p = 1
AND p = 1" should match the matches of p = 1. This patch adds a test
to verify that these requests indeed yield correct results. The
test is scylla_only because, as explained above, Cassandra doesn't
support this feature at all.
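The expected semantics can be sketched as set intersection (an illustrative helper, not Scylla code):

```python
# Multiple equality restrictions on the same column behave like set
# intersection, so contradictory equalities match nothing.

def allowed_values(equalities):
    """Intersect the value sets implied by 'col = v' restrictions."""
    values = None
    for v in equalities:
        values = {v} if values is None else values & {v}
    return values

# WHERE p = 1 AND p = 2 -> matches nothing
assert allowed_values([1, 2]) == set()
# WHERE p = 1 AND p = 1 -> same matches as p = 1
assert allowed_values([1, 1]) == {1}
```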

Refs #12472

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12473
2023-01-09 11:56:39 +02:00
Anna Stuchlik
b61515c871 doc: replace Scylla with ScyllaDB on the menu tree and major links; related: https://github.com/scylladb/scylla-docs/issues/3962
Closes #12456
2023-01-09 08:39:50 +02:00
Avi Kivity
42575340ba Update seastar submodule
* seastar ca586cfb8d...8889cbc198 (14):
  > http: request_parser: fix grammar ambiguity in field_content
Fixes #12468
  > sstring: use fold expression to simply copy_str_to()
  > sstring: use fold expression to simply str_len()
  > metrics: capture by move in make_function()
  > metrics: replace homebrew is_callable<> with is_invocable_v<>
  > reactor: use std::move() to avoid copy.
  > reactor: remove redundant semicolon.
  > reactor: use mutable to make std::move() work.
  > build: install liburing explicitly on ArchLinux.
  > reactor: use a for loop for submitting ios
  > metrics: add spaces around '='
  > parallel utils: align concept with implementation
  > reactor: s/resize(0)/clear()/
  > reactor: fix a typo in comment

Closes #12469
2023-01-08 18:56:00 +02:00
Alejo Sanchez
d632e1aa7a test/pytest: add missing import, remove unused import
Add the missing `time` import and remove the unused `name` import.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #12446
2023-01-08 17:38:46 +02:00
Avi Kivity
5ffe4fee6d Merge 'Remove legacy half reverse' from Michał Radwański
This commit removes consume_in_reverse::legacy_half_reverse, an option
once used to indicate that the given key ranges are sorted descending,
based on the clustering key of the start of the range, and that the
range tombstones inside partition would be sorted (descending, as all
the mutation fragments would) according to their end (but range
tombstone would still be stored according to their start bound).

As it turns out, mutation::consume, when called with legacy_half_reverse
option, produces an invalid fragment stream, one where all the range
tombstone changes come after all the clustering rows. This was not an
issue, since when constructing results from the query, Scylla would not
pass the tombstones to the client, but instead compact data beforehand.

In this commit, the consume_in_reverse::legacy_half_reverse is removed,
along with all the uses.

As for the swap out in mutation_partition.cc in query_mutation and
to_data_query_result:

The downstream was not prepared to deal with legacy_half_reverse.
mutation::consume contains

```
     if (reverse == consume_in_reverse::yes) {
         while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::yes>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
             co_await yield();
        }
     } else {
         while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::no>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
             co_await yield();
         }
     }
```

So why did it work at all? to_data_query_result deals with a single slice.
The used consumer (compact_for_query_v2) compacts-away the range tombstone
changes, and thus the only difference between the consume_in_reverse::no
and consume_in_reverse::yes was that one was ordered increasing wrt. ckeys
and the second one was ordered decreasing. This property is maintained if
we swap out for the consume_in_reverse::yes format.

Refs: #12353

Closes #12453

* github.com:scylladb/scylladb:
  mutation{,_consumer,_partition}: remove consume_in_reverse::legacy_half_reverse
  mutation_partition_view: treat query::partition_slice::option::reversed in to_data_query_result as consume_in_reverse::yes
  mutation: move consume_in_reverse def to mutation_consumer.hh
2023-01-08 15:42:00 +02:00
Botond Dénes
c4688563e3 sstables: track decompressed buffers
Convert decompressed temporary buffers into tracked buffers just before
returning them to the upper layer. This ensures these buffers are known
to the reader concurrency semaphore and it has an accurate view of the
actual memory consumption of reads.

Fixes: #12448

Closes #12454
2023-01-08 15:34:28 +02:00
Kamil Braun
b77df84543 test: test_topology: make test_nodes_with_different_smp less hacky
The test would use a trick to start a separate Scylla cluster from the
one provided originally by the test framework. This is not supported by
the test framework and may cause unexpected problems.

Change the test to perform regular node operations. Instead of starting
a fresh cluster of 3 nodes, we join the first of these nodes to the
original framework-provided cluster, then decommission the original
nodes, then bootstrap the other 2 fresh nodes.

Also add some logging to the test.

Refs: #12438, #12442

Closes #12457
2023-01-08 15:33:17 +02:00
Avi Kivity
02c9968e73 Merge 'Add WASM UDF implementation in Rust' from Wojciech Mitros
This series adds the implementation and usage of rust wasmtime bindings.

The WASM UDFs introduced by this patch are interruptable and use memory allocated using the seastar allocator.

This series includes #11102 (the first two commits) because #11102 required disabling wasm UDFs completely. This patch disables them in the middle of the series, and enables them again at the end.
After this patch, `libwasmtime.a` can be removed from the toolchain.
This patch also removes the workaround for https://github.com/scylladb/scylladb/issues/9387 but it hasn't been tested with ARM yet - if the ARM test causes issues I'll revert this part of the change.

Closes #11351

* github.com:scylladb/scylladb:
  build: remove references to unused c bindings of wasmtime
  test: assert that WASM allocations can fail without crashing
  wasm: limit memory allocated using mmap
  wasm: add configuration options for instance cache and udf execution
  test: check that wasmtime functions yield
  wasm: use the new rust bindings of wasmtime
  rust: add Wasmtime bindings
  rust: add build profiles more aligned with ninja modes
  rust: adjust build according to cxxbridge's recommendations
  tools: toolchain: dbuild: prepare for sharing cargo cache
2023-01-08 15:31:09 +02:00
Nadav Har'El
f5cda3cfc3 test/cql-pytest: add more tests for "timestamp" column type
In issue #3668, a discussion spanning several years theorized that several
things are wrong with the "timestamp" type. This patch begins by adding
several tests that demonstrate that Scylla is in fact behaving correctly,
and mostly identically to Cassandra except one esoteric error handling
case.

However, after eliminating the red herrings, we are left with the real
issue that prompted opening #3668, which is a duplicate of issues #2693
and #2694, and this patch also adds a reproducer for that. The issue is
that Cassandra 4 added support for arithmetic expressions on values,
and durations can be added to (or subtracted from) timestamps, for example:

        '2011-02-03 04:05:12.345+0000' - 1d

is a valid timestamp - and we don't currently support this syntax.
So the new test - which passes on Cassandra 4 and fails on Scylla
(or Cassandra 3) is marked xfail.

Refs #2693
Refs #2694

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12436
2023-01-08 15:00:49 +02:00
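The Cassandra 4 syntax the new xfail test exercises looks roughly like this (the table and column names here are hypothetical; only the arithmetic expression is taken from the commit message):

```cql
-- Cassandra 4 accepts duration arithmetic on timestamp values;
-- Scylla and Cassandra 3 reject it, hence the xfail marking.
SELECT * FROM events
 WHERE id = 1
   AND ts >= '2011-02-03 04:05:12.345+0000' - 1d;
```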
Michał Chojnowski
08b3a9c786 configure: don't reduce parsers' optimization level to 1 in release
The line modified in this patch was supposed to increase the
optimization levels of parsers in debug mode to 1, because they
were too slow otherwise. But as a side effect, it also reduced the
optimization level in release mode to 1. This is not a problem
for the CQL frontend, because statement preparation is not
performance-sensitive, but it is a serious performance problem
for Alternator, where it lies in the hot path.

Fix this by only applying the -O1 to debug modes.

Fixes #12463

Closes #12460
2023-01-06 18:04:36 +02:00
Wojciech Mitros
903c4874d0 build: remove references to unused c bindings of wasmtime
Before the changes introducing the new wasmtime bindings we relied
on a downloaded static library, libwasmtime.a. Now that the bindings
are introduced, we do not rely on it anymore, so all references to
it can be removed.
2023-01-06 14:07:29 +01:00
Wojciech Mitros
996a942e05 test: assert that WASM allocations can fail without crashing
The main source of big allocations in the WASM UDF implementation
is the WASM Linear Memory. We do not want Scylla to crash even if
a memory allocation for the WASM Memory fails, so we assert that
an exception is thrown instead.

The wasmtime runtime does not actually fail on an allocation failure
(assuming the memory allocator does not abort and returns nullptr
instead - which our seastar allocator does). What happens then
depends on the failed allocation handling of the code that was
compiled to WASM. If the original code threw an exception or aborted,
the resulting WASM code will trap. To make sure that we can handle
the trap, we need to allow wasmtime to handle SIGILL signals, because
that is what is used to carry information about WASM traps.

The new test uses a special WASM Memory allocator that fails after
n allocations, and the allocations include both memory growth
instructions in WASM, as well as growing memory manually using the
wasmtime API.

Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
2023-01-06 14:07:29 +01:00
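The fail-after-n allocator idea above can be sketched in plain C++ (a minimal, hypothetical stand-in — not the actual test harness or the seastar allocator): the allocator succeeds a fixed number of times, then returns nullptr, and the caller surfaces that as an exception rather than crashing.

```cpp
#include <cstddef>
#include <new>

// Hypothetical stand-in for the test's WASM memory allocator:
// grow requests succeed `n` times, then report failure by returning
// nullptr (analogous to a non-aborting allocator).
class failing_allocator {
    std::size_t remaining_;
public:
    explicit failing_allocator(std::size_t n) : remaining_(n) {}
    void* grow(std::size_t bytes) {
        if (remaining_ == 0) {
            return nullptr;  // budget exhausted: fail, do not abort
        }
        --remaining_;
        return ::operator new(bytes, std::nothrow);
    }
};

// The caller translates allocation failure into an exception, mirroring
// the test's assertion that a failed WASM memory growth must not crash
// the process.
void* grow_or_throw(failing_allocator& a, std::size_t bytes) {
    void* p = a.grow(bytes);
    if (!p) {
        throw std::bad_alloc();
    }
    return p;
}
```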
Wojciech Mitros
f05d612da8 wasm: limit memory allocated using mmap
The wasmtime runtime allocates memory for the executable code of
the WASM programs using mmap and not the seastar allocator. As
a result, the memory that Scylla actually uses is no longer just
the memory preallocated for the seastar allocator, but the sum of
that and the memory allocated for executable code by the WASM
runtime.
To keep limiting the memory used by Scylla, we measure how much
memory the WASM programs use, and if they use too much, compiled
WASM UDFs (modules) that are currently not in use are evicted to
make room.
To evict a module it is required to evict all instances of this
module (the underlying implementation of modules and instances uses
shared pointers to the executable code). For this reason, we add
reference counts to modules. Each instance using a module is a
reference. When an instance is destroyed, a reference is removed.
If all references to a module are removed, the executable code
for this module is deallocated.
The eviction of a module is actually achieved by eviction of all
its references. When we want to free memory for a new module we
repeatedly evict instances from the wasm_instance_cache using its
LRU strategy until some module loses all its instances. This
process may not succeed if the instances currently in use (so not
in the cache) use too much memory - in this case the query also
fails. Otherwise the new module is added to the tracking system.
This strategy may evict some instances unnecessarily, but evicting
modules should not happen frequently, and any more efficient
solution requires an even bigger intervention into the code.
2023-01-06 14:07:29 +01:00
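The reference-counted eviction scheme described above can be sketched like this (hypothetical, simplified types — not the Scylla classes; std::shared_ptr stands in for the shared pointers to executable code): each cached instance pins its module, and instances are evicted in LRU order until some module loses its last reference.

```cpp
#include <list>
#include <memory>
#include <string>

// A module's `name` stands in for its compiled executable code.
struct module {
    std::string name;
};

// Each live instance holds one reference to its module.
struct instance {
    std::shared_ptr<module> mod;
};

// LRU cache of instances: front = least recently used.
using lru_cache = std::list<instance>;

// Evict instances in LRU order until some module loses all of its
// instance references (only our local handle remains), at which point
// its executable code would be deallocated. Returns false if the cache
// runs dry first — instances currently in use held too much memory,
// and in the real code the query would fail.
bool evict_until_module_freed(lru_cache& cache) {
    while (!cache.empty()) {
        std::shared_ptr<module> mod = cache.front().mod;
        cache.pop_front();           // drop the evicted instance's reference
        if (mod.use_count() == 1) {  // last reference: module is freed here
            return true;
        }
    }
    return false;
}
```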
Wojciech Mitros
b8d28a95bf wasm: add configuration options for instance cache and udf execution
Different users may require different limits for their UDFs. This
patch allows them to configure the size of their cache of wasm
instances, the maximum size of individual instances stored in the cache, the
time after which the instances are evicted, the fuel that all wasm
UDFs are allowed to consume before yielding (for the control of
latency), the fuel that wasm UDFs are allowed to consume in total
(to allow performing longer computations in the UDF without
detecting an infinite loop) and the hard limit of the size of UDFs
that are executed (to avoid large allocations)
2023-01-06 14:07:27 +01:00
Wojciech Mitros
3214f5c2db test: check that wasmtime functions yield
The new implementation for WASM UDFs allows executing the UDFs
in pieces. This commit adds a test asserting that the UDF is in fact
divided and that each of the execution segments takes no longer than
1ms.
2023-01-06 14:05:53 +01:00
Wojciech Mitros
3146807192 wasm: use the new rust bindings of wasmtime
This patch replaces all dependencies on the wasmtime
C++ bindings with our new ones.
The wasmtime.hh and wasm_engine.hh files are deleted.
The libwasmtime.a library is no longer required by
configure.py. The SCYLLA_ENABLE_WASMTIME macro is
removed and wasm udfs are now compiled by default
on all architectures.
In terms of implementation, most of the code using
wasmtime was moved to the Rust source files. The
remaining code uses names from the new bindings
(which are mostly unchanged). Most wasmtime objects
are now stored as a rust::Box<>, to make them compatible
with rust lifetime requirements.

Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
2023-01-06 14:05:53 +01:00
Wojciech Mitros
50b24cf036 rust: add Wasmtime bindings
The C++ bindings provided by wasmtime are lacking a crucial
capability: asynchronous execution of the wasm functions.
This forces us to stop the execution of the function after
a short time to prevent increasing the latency. Fortunately,
this feature is implemented in the native language
of Wasmtime - Rust. Support for Rust was recently added to
scylla, so we can implement the async bindings ourselves,
which is done in this patch.

The bindings expose all the objects necessary for creating
and calling wasm functions. The majority of code implemented
in Rust is a translation of code that was previously present
in C++.

Types exported from Rust are currently required to be defined
by the  same crate that contains the bridge using them, so
wasmtime types can't be exported directly. Instead, for each
class that was supposed to be exported, a wrapper type is
created, where its first member is the wasmtime class. Note
that the members are not visible from C++ anyway, the
difference only applies to Rust code.

Aside from wasmtime types and methods, two additional types
are exported with some associated methods.
- The first one is ValVec, which is a wrapper for a rust Vec
of wasmtime Vals. The underlying vector is required by
wasmtime methods for calling wasm functions. By having it
exported we avoid multiple conversions from a Val wrapper
to a wasmtime Val, as would be required if we exported a
rust Vec of Val wrappers (the rust Vec itself does not
require wrappers if the type it contains is already wrapped)
- The second one is Fut. This class represents a computation
that may or may not be ready. We're currently using it
to control the execution of wasm functions from C++. This
class exposes one method: resume(), which returns a bool
that signals whether the computation is finished or not.

Signed-off-by: Wojciech Mitros <wojciech.mitros@scylladb.com>
2023-01-06 14:05:53 +01:00
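The resume() protocol described for Fut can be sketched from the C++ side with a toy resumable computation (hypothetical names and fuel model — the real binding drives a wasmtime async call): each resume() advances the work by a bounded budget and returns whether the computation finished.

```cpp
#include <cstdint>

// Hypothetical stand-in for the exported Fut type: a computation that
// advances in bounded segments. resume() returns true once finished,
// mirroring the bool the Rust binding exposes.
struct fut {
    std::uint64_t done = 0;
    std::uint64_t total;
    std::uint64_t fuel_per_segment;  // per-resume work budget, bounding latency

    fut(std::uint64_t total, std::uint64_t fuel)
        : total(total), fuel_per_segment(fuel) {}

    bool resume() {
        std::uint64_t budget = fuel_per_segment;
        while (done < total && budget > 0) {
            ++done;  // one unit of the WASM function's work
            --budget;
        }
        return done == total;
    }
};

// Driver loop as the C++ side would use it: resume until finished.
// A real driver would yield to the reactor between segments (elided).
int run_to_completion(fut& f) {
    int calls = 0;
    while (!f.resume()) {
        ++calls;  // co_await yield() would go here
    }
    return calls + 1;  // total number of resume() calls
}
```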
Wojciech Mitros
33c97de25c rust: add build profiles more aligned with ninja modes
A cargo profile is created for each of build modes: dev, debug,
sanitize, release and coverage. The names of cargo profiles are
prefixed by "rust-" because cargo does not allow separate "dev"
and "debug" profiles.

The main difference between the profiles is their optimization levels,
which correlate to the levels used in configure.py. The debug info
is stripped only in the dev mode, and only this mode uses
"incremental" compilation to speed it up.
2023-01-06 14:05:53 +01:00
Wojciech Mitros
4d7858e66d rust: adjust build according to cxxbridge's recommendations
Currently, the rust build system in Scylla creates a separate
static library for each included rust package. This could cause
duplicate symbol issues when linking against multiple libraries
compiled from rust.

This issue is fixed in this patch by creating a single static library
to link against, which combines all rust packages implemented in
Scylla.

The Cargo.lock for the combined build is now tracked, so that all
users of the same scylla version also use the same versions of
imported rust modules.

Additionally, the rust package implementation and usage
docs are modified to be compatible with the build changes.

This patch also adds a new header file 'rust/cxx.hh' that contains
definitions of additional rust types available in c++.
2023-01-06 14:05:53 +01:00
Avi Kivity
eeaa475de9 tools: toolchain: dbuild: prepare for sharing cargo cache
Rust's cargo caches downloaded sources in ~/.cargo. However dbuild
won't provide access to this directory since it's outside the source
directory.

Prepare for sharing the cargo cache between the host and the dbuild
environment by:
 - Creating the cache if it doesn't already exist. This is likely if
   the user only builds in a dbuild environment.
 - Propagating the cache directory as a mounted volume.
 - Respecting the CARGO_HOME override.
2023-01-06 14:05:53 +01:00
Avi Kivity
6868dcf30b tools: toolchain: drop s390x from prepare script architecture list
It's been a long while since we built ScyllaDB for s390x, and in
fact the last time I checked it was broken on the ragel parser
generator generating bad source files for the HTTP parser. So just
drop it from the list.

I kept s390x in the architecture mapping table since it's still valid.

Closes #12455
2023-01-06 09:08:01 +02:00
Michał Radwański
1fbf433966 mutation{,_consumer,_partition}: remove consume_in_reverse::legacy_half_reverse
This commit removes consume_in_reverse::legacy_half_reverse, an option
once used to indicate that the given key ranges are sorted descending,
based on the clustering key of the start of the range, and that the
range tombstones inside partition would be sorted (descending, as all
the mutation fragments would) according to their end (but range
tombstones would still be stored according to their start bound).

As it turns out, mutation::consume, when called with the legacy_half_reverse
option, produces an invalid fragment stream, one where all the row
tombstone changes come after all the clustering rows. This was not an
issue, since when constructing results from the query, Scylla would not
pass the tombstones to the client, but instead compact data beforehand.

In this commit, the consume_in_reverse::legacy_half_reverse is removed,
along with all the uses.

As for the swap out in mutation_partition.cc in query_mutation and
to_data_query_result:

The downstream was not prepared to deal with legacy_half_reverse.
mutation::consume contains

```
    if (reverse == consume_in_reverse::yes) {
        while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::yes>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
            co_await yield();
        }
    } else {
        while (!(stop_opt = consume_clustering_fragments<consume_in_reverse::no>(_ptr->_schema, partition, consumer, cookie, is_preemptible::yes))) {
            co_await yield();
        }
    }
```

So why did it work at all? to_data_query_result deals with a single slice.
The used consumer (compact_for_query_v2) compacts-away the range tombstone
changes, and thus the only difference between the consume_in_reverse::no
and consume_in_reverse::yes was that one was ordered increasing wrt. ckeys
and the second one was ordered decreasing. This property is maintained if
we swap out for the consume_in_reverse::yes format.
2023-01-05 18:48:55 +01:00
Botond Dénes
2612f98a6c Merge 'Abort repair tasks' from Aleksandra Martyniuk
Aborting of repair operation is fully managed by task manager.
Repair tasks are aborted:
- on shutdown; top level repair tasks subscribe to global abort source. On shutdown all tasks are aborted recursively
- through node operations (applies to data_sync_repair_task_impls and their descendants only); data_sync_repair_task_impl subscribes to node_ops_info abort source
- with task manager api (top level tasks are abortable)
- with storage_service api and on failure; these cases were modified to be aborted the same way as the ones from above are.

Closes #12085

* github.com:scylladb/scylladb:
  repair: make top level repair tasks abortable
  repair: unify a way of aborting repair operations
  repair: delete sharded abort source from node_ops_info
  repair: delete unused node_ops_info from data_sync_repair_task_impl
  repair: delete redundant abort subscription from shard_repair_task_impl
  repair: add abort subscription to data sync task
  tasks: abort tasks on system shutdown
2023-01-05 15:21:35 +01:00
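The subscribe-then-abort flow the bullets above describe can be sketched with a minimal stand-in (seastar's real abort_source has a richer API — subscriptions with lifetimes, noexcept callbacks — so this is only an illustration of the pattern, not the actual classes):

```cpp
#include <functional>
#include <utility>
#include <vector>

// Minimal stand-in for an abort source: tasks subscribe a callback,
// and request_abort() (e.g. on shutdown) notifies every subscriber.
class abort_source {
    bool _aborted = false;
    std::vector<std::function<void()>> _subscribers;
public:
    // A top-level repair task registers a callback to run on abort.
    void subscribe(std::function<void()> cb) {
        _subscribers.push_back(std::move(cb));
    }
    bool abort_requested() const { return _aborted; }
    void request_abort() {
        _aborted = true;
        for (auto& cb : _subscribers) {
            cb();
        }
    }
};

// A toy repair task that cascades the abort to its children, matching
// the "all tasks are aborted recursively" behavior on shutdown.
struct repair_task {
    bool aborted = false;
    std::vector<repair_task*> children;
    void abort() {
        aborted = true;
        for (auto* child : children) {
            child->abort();
        }
    }
};
```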
Avi Kivity
cc6010b512 Merge 'Make restore_replica_count abortable' from Benny Halevy
Similar to the way we allow aborting streaming-based
removenode, subscribe to storage_service::_abort_source
to request abort locally and pass a shared_ptr<abort_source>
to `node_ops_info`, used to abort removenode_with_repair
on shutdown.

Fixes #12429

Closes #12430

* github.com:scylladb/scylladb:
  storage_service: restore_replica_count: demote status_checker related logging to debug level
  storage_service: restore_replica_count: allow aborting removenode_with_repair
  storage_service: coroutinize restore_replica_count
  storage_service: restore_replica_count: undefer stop_status_checker
  storage_service: restore_replica_count: handle exceptions from stream_async and send_replication_notification
  storage_service: restore_replica_count: coroutinize status_checker
2023-01-05 15:21:35 +01:00
Kamil Braun
09da661eeb Merge 'raft: replace experimental raft option with dedicated flag' from Gleb Natapov
Unlike other experimental features, we want raft to be opt-in even
after it leaves experimental mode. For that we need a separate
option to enable it. The patch adds the binary option "consistent-cluster-management"
for that.

* 'consistent-cluster-management-flag' of github.com:scylladb/scylla-dev:
  raft: replace experimental raft option with dedicated flag
  main: move supervisor notification about group registry start where it actually starts
2023-01-05 15:21:35 +01:00
Anna Stuchlik
44e6f18d1b doc: add the new upgrade guide to the toctree and fix its name 2023-01-05 14:13:33 +01:00
Anna Stuchlik
0ad2e3e63a docs: add the upgrade guide from ScyllaDB 5.1 to ScyllaDB Enterprise 2022.2 2023-01-05 13:30:10 +01:00
Aleksandra Martyniuk
dcb91457da api: change retrieve_status signature
Sometimes we may need task status to be nothrow move constructible.
httpd::task_manager_json::task_status does not satisfy this requirement.

retrieve_status returns future<full_task_status> instead of future<task_status>
to provide an intermediate struct with better properties. An argument
is passed by reference to avoid having to copy the foreign_ptr.
2023-01-05 13:28:51 +01:00
Kamil Braun
df72536fc5 Merge 'docs: add the upgrade guide for Enterprise from 2022.1 to 2022.2' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/12314

This PR adds the upgrade guide for ScyllaDB Enterprise - from version
2022.1 to 2022.2.  Using this opportunity, I've replaced "Scylla" with
"ScyllaDB" in the upgrade-enterprise index file.

In previous releases, we added several upgrade guides - one per platform
(and version). In this PR, I've merged the information for different
platforms to create one generic upgrade guide. It is similar to what
@kbr- added for the Open Source upgrade guide from 5.0 to 5.1. See
https://docs.scylladb.com/stable/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/.

Closes #12339

* github.com:scylladb/scylladb:
  docs: add the info about minor release
  docs: add the new upgrade guide 2022.1 to 2022.2 to the index and the toctree
  docs: add the index file for the new upgrade guide from 2022.1 to 2022.2
  docs: add the metrics update file to the upgrade guide 2022.1 to 2022.2
  docs: add the upgrade guide for ScyllaDB Enterprise from 2022.1 to 2022.2
2023-01-04 18:07:00 +01:00
Benny Halevy
086546f575 storage_service: restore_replica_count: demote status_checker related logging to debug level
the status_checker is not the main line of business
of restore_replica_count, starting and stopping it
do not seem to deserve info level logging, which
might have been useful in the past to debug issues
surrounding that.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:05:04 +02:00
Benny Halevy
3879ee1db8 storage_service: restore_replica_count: allow aborting removenode_with_repair
Similar to the way we allow aborting streaming-based
removenode, subscribe to storage_service::_abort_source
to request abort locally and pass a shared_ptr<abort_source>
to `node_ops_info`, used to abort removenode_with_repair
on shutdown.

Fixes #12429

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:05:04 +02:00
Benny Halevy
afece5bdc4 storage_service: coroutinize restore_replica_count
and unwrap the async thread started for streaming.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:05:04 +02:00
Benny Halevy
d1eadc39c1 storage_service: restore_replica_count: undefer stop_status_checker
Now that all exceptions in the rest of the function
are swallowed, just execute the stop_status_checker
deferred action serially before returning, on the
way to coroutinizing restore_replica_count (since
we can't co_await status_checker inside the deferred
action).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:05:04 +02:00
Benny Halevy
788ecb738d storage_service: restore_replica_count: handle exceptions from stream_async and send_replication_notification
On the way to coroutinizing restore_replica_count,
extract awaiting stream_async and send_replication_notification
into try/catch blocks so we can later undefer stop_status_checker.

The exception is still returned as an exceptional future
which is logged by the caller as warning.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:02:42 +02:00
Benny Halevy
b54d121dfd storage_service: restore_replica_count: coroutinize status_checker
There is no need to start a thread for the status_checker;
it can be implemented using a background coroutine.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2023-01-04 19:02:20 +02:00
Botond Dénes
1d273a98b9 readers/multishard: shard_reader::close() silence read-ahead timeouts
Timeouts are benign, especially on a read-ahead that turned out not to be
needed at all. They just introduce noise in the logs, so silence them.

Fixes: #12435

Closes #12441
2023-01-04 16:10:09 +02:00
Kamil Braun
4268b1bbc2 Merge 'raft: raft_group0, register RPC verbs on all shards' from Gusev Petr
raft_group0 used to register RPC verbs only on shard 0. This worked on
clusters with the same --smp setting on all nodes, since RPCs in this
case are processed on the same shard as the calling code, and
raft_group0 methods only run on shard 0.

A new test test_nodes_with_different_smp was added to identify the
problem. Since --smp can only be specified via the command line, a
corresponding parameter was added to the ManagerClient.server_add
method.  It allows to override the default parameters set by the
SCYLLA_CMDLINE_OPTIONS variable by changing, adding or deleting
individual items.

Fixes: #12252

Closes #12374

* github.com:scylladb/scylladb:
  raft: raft_group0, register RPC verbs on all shards
  raft: raft_append_entries, copy entries to the target shard
  test.py, allow to specify the node's command line in test
2023-01-04 11:11:21 +01:00
Marcin Maliszkiewicz
61a9816bad utils/rjson: enable inlining in rapidjson library
Due to the lack of the NDEBUG macro, inlining was disabled. It's
important for parsing and printing performance.

Testing with perf_simple_query shows that it reduced around
7000 insns/op, thus increasing median tps by 4.2% for the alternator frontend.

Because inlined functions are called for every character
in json, this scales with request/response size. When
the default write size is increased by around 7x (from ~180 to ~1255
bytes), the median tps increased by 12%.

Running:
./build/release/test/perf/perf_simple_query_g --smp 1 \
                                --alternator forbid --default-log-level error \
                                --random-seed=1235000092 --duration=60 --write

Results before the patch:

median 46011.50 tps (197.1 allocs/op,  12.1 tasks/op,  170989 insns/op,        0 errors)
median absolute deviation: 296.05
maximum: 46548.07
minimum: 42955.49

Results after the patch:

median 47974.79 tps (197.1 allocs/op,  12.1 tasks/op,  163723 insns/op,        0 errors)
median absolute deviation: 303.06
maximum: 48517.53
minimum: 44083.74

The change affects both json parsing and printing.

Closes #12440
2023-01-04 10:27:35 +02:00
Michał Jadwiszczak
83bb77b8bb test/boost/cql_query_test: enable parallelized_aggregation
Run tests for parallelized aggregation with
`enable_parallelized_aggregation` set always to true, so the tests work
even if the default value of the option is false.

Closes #12409
2023-01-04 10:11:25 +02:00
Anna Stuchlik
c4d779e447 doc: Fix https://github.com/scylladb/scylla-doc-issues/issues/854 - update the procedure to update topology strategy when nodes are on different racks
Closes #12439
2023-01-04 09:50:10 +02:00
Avi Kivity
2739ac66ed treewide: drop cql_serialization_format
Now that we don't accept cql protocol version 1 or 2, we can
drop cql_serialization format everywhere, except when in the IDL
(since it's part of the inter-node protocol).

A few functions had duplicate versions, one with and one without
a cql_serialization_format parameter. They are deduplicated.

Care is taken that `partition_slice`, which communicates
the cql_serialization_format across nodes, still presents
a valid cql_serialization_format to other nodes when
transmitting itself and rejects protocol 1 and 2 serialization
format when receiving. The IDL is unchanged.

One test checking the 16-bit serialization format is removed.
2023-01-03 19:54:13 +02:00
Avi Kivity
654b96660a cql: modification_statement: drop protocol check for LWT
CQL protocol 1 did not support LWT, but since we don't support it
any more, we can drop the check and the supporting get_protocol_version()
helper.
2023-01-03 19:51:57 +02:00
Avi Kivity
424dbf43f3 transport: drop cql protocol versions 1 and 2
Version 3 was introduced in 2014 (Cassandra 2.1) and was supported
in the very first version of Scylla (2a7da21481 "CQL binary protocol").

Cassandra 3.0 (2015) dropped protocols 1 and 2 as well.
It's safe enough to drop it now, 9 years after introduction of v3
and 7 years after Cassandra stopped supporting it.

Dropping it allows dropping cql_serialization_format, which causes
quite a lot of pain, and is probably broken. This will be dropped in the
following patch.
2023-01-03 19:47:49 +02:00
Avi Kivity
f600ad5c1b Update seastar submodule
* seastar 3db15b5681...ca586cfb8d (28):
  > reactor: trim returned buffer to received number of bytes
  > util/process: include used header
  > build: drop unused target_include_directories()
  > build: use BUILD_IN_SOURCE instead chdir <SOURCE_DIR>
  > build: specify CMake policy CMP0135 to new
  > tests: only destroy allocated pending connections
  > build: silence the output when generating private keys
  > tests, httpd: Limit loopback connection factory sharding
  > lw_shared_ptr: Add nullptr_t comparing operators
  > noncopyable_function: Add concept for (Func func) constructor
  > reactor: add process::terminate() and process::kill()
  > Merge 'tests, include: include headers without ".." in path' from Kefu Chai
  > build: customize toolset for building Boost
  > build: use different toolset base on specified compiler
  > allocator: add an option to reserve additional memory for the OS
  > Merge 'build: pass cflags and ldflags to cooking.sh' from Kefu Chai
  > build: build static library of cryptopp
  > gate: add gate holders debugging
  > build: detect debug build of yaml-cpp also
  > build: do not use pkg_search_module(IMPORTED_TARGET) for finding yaml-cpp
  > build: bump yaml-cpp to 0.7.0 in cooking_recipe
  > build: bump cryptopp to 8.7.0 in cooking_recipe
  > build: bump boost to 1.81.0 in cooking_recipe
  > build: bump fmtlib to 9.1.0 in cooking_recipe
  > shared_ptr: add overloads for fmt::ptr()
  > chunked_fifo: const_iterator: use the base class ctor
  > build: s/URING_LIBARIES/URING_LIBRARIES/
  > build: export the full path of uring with URING_LIBRARIES

Closes #12434
2023-01-03 17:58:31 +02:00
Alejo Sanchez
889acf710c test/python: increase CQL connection timeout for...
test_ssl

In very slow debug builds the default driver timeouts are too low and
tests might fail. Bump up the values to a more reasonable time.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #12408
2023-01-03 17:10:46 +02:00
Nadav Har'El
1c96d2134f docs,alternator: link to issue about missing ACL feature
The alternator compatibility.md document mentions the missing ACL
(access control) feature, but unlike other missing features we
forgot to link to the open issue about this missing feature.
So let's add that link.

Refs #5047.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12399
2023-01-03 16:50:33 +02:00
Kamil Braun
fc57626afa Merge 'docs: remove auto_bootstrap option from the documentation' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/12318

This PR removes all occurrences of the `auto_bootstrap` option in the docs.
In most cases, I've simply removed the option name and its definition, but sometimes additional changes were necessary:
- In node-joined-without-any-data.rst, I removed the `auto_bootstrap` option as one of the causes of the problem.
- In rebuild-node.rst, I removed the first step in the procedure (enabling the `auto_bootstrap` option).
- In admin.rst, I removed the section about manual bootstrapping - it's based on setting `auto_bootstrap` to false, which is not possible now.

Closes #12419

* github.com:scylladb/scylladb:
  docs: remove the auto_bootstrap option from the admin procedures - involves removing the Manual Bootstrapping section
  docs: remove the auto_bootstrap option from the procedure to replace a dead node
  docs: remove the auto_bootstrap option from the Troubleshooting article about a node joining with no data
  docs: remove the auto_bootstrap option from the procedure to rebuild a node after losing the data volume
  docs: remove the auto_bootstrap option from the procedures to create a cluster or add a DC
2023-01-03 15:44:00 +01:00
Botond Dénes
e4d5b2a373 replica/database: add disk_reads and sstables_read metrics
Tracking the current number of reads gone to disk and the current number
of sstables read by all such reads respectively.
2023-01-03 09:37:29 -05:00
Botond Dénes
2acfa950d7 sstables: wire in the reader_permit's sstable read count tracking
Hook in the relevant methods when creating and destroying sstable
readers.
2023-01-03 09:37:29 -05:00
Botond Dénes
2c0de50969 reader_concurrency_semaphore: add disk_reads and sstables_read stats
And the infrastructure to reader_permit to update them. The
infrastructure is not wired in yet.
These metrics will be used to count the number of reads gone to disk and
the number of sstables read currently respectively.
2023-01-03 09:37:29 -05:00
Botond Dénes
dcd2deb5af replica/database: fix active_reads_memory_consumption_metric
Rename to reads_memory_consumption and drop the "active" from the
description as well. This metric tracks the memory consumption of all
reads: active or inactive. We don't even currently have a way to track
the memory consumption of only active reads.
Drop the part of the description which explains the interaction with
other metrics: this part is outdated and the new interactions are much
more complicated, no way to explain in a metric description.
Also ask the semaphore to calculate the memory amount, instead of doing
it in the metric itself.
2023-01-03 09:25:47 -05:00
Petr Gusev
8417840647 raft: raft_group0, register RPC verbs on all shards
raft_group0 used to register RPC verbs only on shard 0.
This worked on clusters with the same --smp setting on
all nodes, since RPCs in this case are (usually)
processed on the same shard as the calling code,
and raft_group0 methods only run on shard 0.

A new test test_nodes_with_different_smp was added
to identify the problem.

Fixes: #12252
2023-01-03 17:04:07 +03:00
Anna Stuchlik
00ef20c3df docs: remove the auto_bootstrap option from the admin procedures - involves removing the Manual Bootstrapping section 2023-01-03 14:48:01 +01:00
Anna Stuchlik
b7d62b2fc7 docs: remove the auto_bootstrap option from the procedure to replace a dead node 2023-01-03 14:47:55 +01:00
Anna Stuchlik
bc62e61df1 docs: remove the auto_bootstrap option from the Troubleshooting article about a node joining with no data 2023-01-03 14:46:38 +01:00
Anna Stuchlik
1602f27cd7 docs: remove the auto_bootstrap option from the procedure to rebuild a node after losing the data volume 2023-01-03 14:45:08 +01:00
Botond Dénes
929481ea9c replica/database: fix active_reads metric
This metric has been broken for a long time, since inactive reads were
introduced. As calculated currently, it includes all permits that passed
admission, including inactive reads. On the other hand, it excludes
permits created bypassing admission.
Fix by using the newly introduced (in this patch)
reader_concurrency_semaphore::active_reads() as the basis of this
metric: this now includes all permits (reads) that are currently active,
excluding waiters and inactive reads.
2023-01-03 08:12:25 -05:00
Petr Gusev
7725e03a09 raft: raft_append_entries, copy entries to the target shard
If append_entries RPC was received on a non-zero shard, we may
need to pass it to a zero (or, potentially, some other) shard.
The problem is that raft::append_request contains entries in the form
of raft::log_entry_ptr == lw_shared_ptr<log_entry>, which doesn't
support cross-shard reference counting. In debug mode it contains
a special ref-counting facility debug_shared_ptr_counter_type,
which resorts to on_internal_error if it detects such a case.

To solve this, we just copy log entries to the target shard if it
isn't equal to the current one. In most cases, if --smp setting
is the same on all nodes, RPC will be handled on zero shard,
so there will be no overhead.
2023-01-03 15:25:00 +03:00
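The copy-to-target-shard fix described above can be sketched with simplified types (std::shared_ptr stands in for lw_shared_ptr, whose refcount must not be touched from a foreign shard; the names below are illustrative, not the actual raft code):

```cpp
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins for raft::log_entry and raft::log_entry_ptr.
struct log_entry {
    std::string data;
};
using log_entry_ptr = std::shared_ptr<log_entry>;

// When the handling shard differs from the receiving one, deep-copy the
// entries so the target shard owns freshly allocated pointers and no
// cross-shard reference counting takes place. In the common case
// (same --smp everywhere, RPC handled on shard 0) no copy is made.
std::vector<log_entry_ptr> entries_for_shard(
        const std::vector<log_entry_ptr>& entries,
        unsigned this_shard, unsigned target_shard) {
    if (this_shard == target_shard) {
        return entries;  // same shard: share the pointers, no overhead
    }
    std::vector<log_entry_ptr> copy;
    copy.reserve(entries.size());
    for (const auto& e : entries) {
        copy.push_back(std::make_shared<log_entry>(*e));  // fresh allocation
    }
    return copy;
}
```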
Petr Gusev
1c23390f12 test.py, allow to specify the node's command line in test
An optional parameter cmdline has been added to
the ManagerClient.server_add method.
It allows you to override the default parameters
set by the SCYLLA_CMDLINE_OPTIONS variable
by changing, adding or deleting individual
items. To change or add a parameter just specify
its name and value one after the other.
To remove a parameter, use the special keyword
__remove__ as a value. To set a parameter
without a value (such as --overprovisioned)
use the special keyword __missing__ as the value.
2023-01-03 15:24:54 +03:00
Nadav Har'El
eb85f136c8 cql-pytest: document how to write new cql-pytest tests
Add to test/cql-pytest/README.md an explanation of the philosophy
of the cql-pytest test suite, and some guidelines on how to write
good tests in that framework.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12400
2023-01-03 12:13:22 +02:00
Anna Stuchlik
994bc33147 docs: fix the command on the Manager-Monitoring Integration troubleshooting page
Closes #12375
2023-01-03 11:41:16 +02:00
Anna Stuchlik
9d17d812c0 docs: Fix https://github.com/scylladb/scylla-doc-issues/issues/870, update the nodetool rebuild command
Closes #12416
2023-01-03 11:40:40 +02:00
Gleb Natapov
1688163233 raft: replace experimental raft option with dedicated flag
Unlike other experimental features, we want raft to be optional even
after it leaves experimental mode. For that we need a separate
option to enable it. The patch adds the boolean option "consistent-cluster-management"
for that.
2023-01-03 11:15:11 +02:00
Gleb Natapov
29060cc235 main: move supervisor notification about group registry start where it actually starts
99fe580068 moved the raft_group_registry::start call a bit later, but
forgot to move the supervisor notification call. Do it now.
2023-01-03 11:09:30 +02:00
Botond Dénes
2ef71e9c70 Merge 'Improve verbosity of task manager api' from Aleksandra Martyniuk
The PR introduces changes to task manager api:
- extends tasks' list returned with get_tasks with task type,
   keyspace, table, entity, and sequence number
- extends status returned with get_task_status and wait_task
   with a list of children's ids

Closes #12338

* github.com:scylladb/scylladb:
  api: extend status in task manager api
  api: extend get_tasks in task manager api
2023-01-03 10:39:41 +02:00
Botond Dénes
82101b786d Merge 'docs: document scylla-api-client' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/11999.

This PR adds a description of scylla-api-client.

Closes #12392

* github.com:scylladb/scylladb:
  docs: fix the description of the system log POST example
  docs: update the curl tool name
  docs: describe how to use the scylla-api-client tool
  docs: fix the scylla-api-client tool name
  docs: document scylla-api-cli
2023-01-03 10:30:04 +02:00
Benny Halevy
63c2cdafe8 sstables: index_reader: close(index_bound&) reset current_list
When closing _lower_bound and *_upper_bound
in the final close() call, they are currently left with
an engaged current_list member.

If the index_reader uses a _local_index_cache,
it is evicted with evict_gently which will, rightfully,
see the respective pages as referenced, and they won't be
evicted gently (only later when the index_reader is destroyed).

Reset index_bound.current_list on close(index_bound&)
to free up the reference.
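A minimal sketch of the fix (illustrative names only, not the real index_reader API): a bound that keeps an engaged optional referencing cached state must release and disengage it on close, otherwise the cache still sees the page as referenced.

```cpp
#include <cassert>
#include <optional>

// Toy stand-in for a cached index page list with a reference count.
struct cached_list { int refs = 0; };

// Toy stand-in for index_bound: holds an engaged optional while in use.
struct index_bound {
    std::optional<cached_list*> current_list;
};

inline void close_bound(index_bound& b) {
    if (b.current_list) {
        (*b.current_list)->refs--;  // release the reference held on the cached page...
        b.current_list.reset();     // ...and disengage, so gentle eviction can reclaim it
    }
}
```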

Ref #12271

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12370
2023-01-02 16:42:33 +01:00
Avi Kivity
767b7be8be Merge 'Get rid of handle_state_replacing' from Benny Halevy
Since [repair: Always use run_replace_ops](2ec1f719de), nodes no longer publish HIBERNATE state so we don't need to support handling it.

Replace is now always done using node operations (using repair or streaming),
so nodes are never expected to change status to HIBERNATE.

Therefore storage_service::handle_state_replacing is not needed anymore.

This series gets rid of it and updates the documentation related to STATUS:HIBERNATE accordingly.

Fixes #12330

Closes #12349

* github.com:scylladb/scylladb:
  docs: replace-dead-node: get rid of hibernate status
  storage_service: get rid of handle_state_replacing
2023-01-02 13:35:29 +02:00
Gleb Natapov
28952d32ff storage_service: move leave_ring outside of unbootstrap()
We want to reuse the latter without the call.

Message-Id: <20221228144944.3299711-17-gleb@scylladb.com>
2023-01-02 12:03:29 +02:00
Gleb Natapov
229cef136d raft: add trace logging to raft::server::start
Allows seeing the initial state of the server during start.

Message-Id: <20221228144944.3299711-15-gleb@scylladb.com>
2023-01-02 11:57:53 +02:00
Gleb Natapov
96453ff75f service: raft: improve group0_state_machine::apply logging
Trace how many entries are applied as well.

Message-Id: <20221228144944.3299711-14-gleb@scylladb.com>
2023-01-02 11:57:16 +02:00
Gleb Natapov
dbd5b97201 storage_service: improve logging in update_pending_ranges() function
We pass the reason for the change. Log it as well.

Message-Id: <20221228144944.3299711-11-gleb@scylladb.com>
2023-01-02 11:54:03 +02:00
Gleb Natapov
04ab673359 messaging: check that a node knows its own topology before accessing it
We already check whether the remote node's topology is missing before creating a
connection, but the local node's topology can be missing too once we
use raft to manage it. Raft needs to be able to create connections before
the topology is known.

Message-Id: <20221228144944.3299711-7-gleb@scylladb.com>
2023-01-02 11:53:14 +02:00
Gleb Natapov
6f104982e1 topology: use std::erase_if on std::map instead of ad-hoc loop
std::erase_if has been available since C++20. We can use it here.

Message-Id: <20221228144944.3299711-6-gleb@scylladb.com>
2023-01-02 11:45:52 +02:00
Gleb Natapov
84eb5924ac system_keyspace: remove redundant include
storage_proxy.hh is included twice

Message-Id: <20221228144944.3299711-4-gleb@scylladb.com>
2023-01-02 11:39:22 +02:00
Gleb Natapov
5182543df2 raft: fix typo in read_barrier logging
The log message prints the applied index, not the appended one.

Message-Id: <20221228144944.3299711-3-gleb@scylladb.com>
2023-01-02 11:38:47 +02:00
Gleb Natapov
5a96751534 storage_service: remove start_leaving since it is no longer used
Message-Id: <20221228144944.3299711-2-gleb@scylladb.com>
2023-01-02 11:37:48 +02:00
Raphael S. Carvalho
b4e4bbd64a database_test: Reduce x_log2_compaction_group values to avoid timeout
database_test is timing out because it has to run the tests calling
do_with_cql_env_and_compaction_groups 3x, once for each compaction group
setting. Reduce it to 2 settings instead of 3 when running in debug mode.

Refs #12396.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12421
2023-01-01 13:56:18 +02:00
Raphael S. Carvalho
a7c4a129cb sstables: Bump row_reads metrics for mx version
The metric was always 0 even though rows were processed by the mx reader.

Fixes #12406.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20221227220202.295790-1-raphaelsc@scylladb.com>
2022-12-30 18:38:30 +01:00
Anna Stuchlik
601aeb924a docs: remove the auto_bootstrap option from the procedures to create a cluster or add a DC 2022-12-30 13:10:06 +01:00
Avi Kivity
8635d24424 build: drop abseil submodule, replace with distribution abseil
This lets us carry fewer things and rely on the distribution
for maintenance.

The frozen toolchain is updated. Incidental updates include clang 15.0.6,
and pytest that doesn't need workarounds.

Closes #12397
2022-12-28 19:02:23 +02:00
Avi Kivity
eced91b575 Revert "view: coroutinize maybe_mark_view_as_built"
This reverts commit ac2e2f8883. It causes
a regression ("std::bad_variant_access in load_view_build_progress").

Commit 2978052113 (a reindent) is also reverted as part of
the process.

Fixes #12395
2022-12-28 15:36:05 +02:00
Nadav Har'El
200bc82913 test/cql-pytest: exit immediately if Scylla is down
In commit acfa180766 we added to
test/cql-pytest a mechanism to detect when Scylla crashes in the middle
of a test function - in which case we report the culprit test and exit
immediately to avoid having a hundred more tests report that they failed
as well just because Scylla was down.

However, if Scylla was *never* up - e.g., if the user ran "pytest" without
ever running Scylla -  we still report hundreds of tests as having failed,
which is confusing and not helpful.

So with this patch, if a connection cannot be made to Scylla at all,
the test exits immediately, explaining what went wrong, not blaming
any specific test:

    $ pytest
    ...
    ! _pytest.outcomes.Exit: Cannot connect to Scylla at --host=localhost --port=9042 !
    ============================ no tests ran in 0.55s =============================

Beyond being a helpful reminder for a developer who runs "pytest" without
having started Scylla first (or using test/cql-pytest/run or test.py to
start Scylla easily), this patch is also important when running tests
through test.py if it reuses an instance of Scylla that crashed during an
earlier pytest file's run.

This patch does not fix test.py - it can still try to run pytest with
a dead Scylla server without checking. But at least with this patch
pytest will notice this problem immediately and won't report hundreds of
test functions having failed. The only report the user will see will be
the last test which crashed Scylla, which will make it easier to find
this failure without being hidden between hundreds of spurious failures.

Fixes #12360

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12401
2022-12-28 13:04:28 +02:00
Anna Stuchlik
d0db1a27c3 docs: fix the description of the system log POST example 2022-12-28 11:25:54 +01:00
Anna Stuchlik
b7ec99b10b docs: update the curl tool name 2022-12-28 10:33:07 +01:00
Asias He
b9e5e340aa streaming: Enable offstrategy for all classic streaming based node ops
This patch enables offstrategy compaction for all classic streaming
based node ops. We can use this method because tables are streamed one
after another. As long as there is still streamed data for a given
table, we update the automatic trigger timer. When all the streaming has
finished, the trigger timer will timeout and fire the offstrategy
compaction for the given table.

I verified that with this patch, rebuild is 3X faster. There was no compaction
in the middle of the streaming. The streamed sstables are compacted
together after streaming is done.

Time Before:
INFO  2022-11-25 10:06:08,213 [shard 0] range_streamer - Rebuild
succeeded, took 67 seconds, nr_ranges_remaining=0

Time After:
INFO  2022-11-25 09:42:50,943 [shard 0] range_streamer - Rebuild
succeeded, took 23 seconds, nr_ranges_remaining=0

Compaction Before:
88 sstables were written -> 88 sstables were added into the main set

Compaction After:
88 sstables were written -> after offstrategy, 2 sstables were added into the main set

Closes #11848
2022-12-28 11:12:02 +02:00
Michał Chojnowski
5e79d6b30b tasks: task_manager: move invoke_on_task<> to .hh
invoke_on_task is used in translation units where its definition is not
visible, yet it has no explicit instantiations. If the compiler
decides to inline the definition everywhere instead of instantiating it
implicitly, linking invoke_on_task will fail. (It happened to me when I
turned up the inline threshold.) Fix that.
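As a hedged single-file illustration of the fix (invoke_on_task_like is a hypothetical stand-in for invoke_on_task<>, not the real task_manager API):

```cpp
#include <cassert>

// Keeping a function template's definition in the header means every
// translation unit that calls it can instantiate it itself, so linking
// never depends on the compiler's inlining decisions in some other TU.
// If only a declaration were visible, a non-inlined call would reference
// a symbol that no TU ever emitted.
template <typename Func>
auto invoke_on_task_like(int task_id, Func&& f) {
    return f(task_id);  // definition visible at the call site
}
```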

Closes #12387
2022-12-28 10:55:43 +02:00
Alejo Sanchez
d408b711e3 test/python: increase CQL connection timeouts
In very slow debug builds the default driver timeouts are too low and
tests might fail. Bump up the values to something more reasonable.

These timeout values are the same as used in topology tests.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #12405
2022-12-28 10:06:33 +02:00
Anna Stuchlik
39ade2f5a5 docs: describe how to use the scylla-api-client tool 2022-12-27 14:46:16 +01:00
Anna Stuchlik
2789501023 docs: fix the scylla-api-client tool name 2022-12-27 14:28:27 +01:00
Alejo Sanchez
1bfe234133 test/pylib: API get/set logger level of Scylla server
Provide helpers to get and set logger level for Scylla servers.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #12394
2022-12-25 13:58:43 +02:00
Anna Stuchlik
ea7e23bf92 docs: fix the option name from compaction to compression on the Data Definition page
Fixes the option name in the "Other table options" table on the Data Definition page.

Fixes #12334

Closes #12382
2022-12-25 11:24:56 +02:00
Botond Dénes
b0d95948e1 mutation_compactor: reset stop flag on page start
When the mutation compactor has all the rows it needs for a page, it
saves the decision to stop in a member flag: _stop.
For single partition queries, the mutation compactor is kept alive
across pages and so it has a method, start_new_page() to reset its state
for the next page. This method didn't clear the _stop flag. This meant
that the value set at the end of the previous page could cause the new page
and subsequently the entire query to be stopped prematurely.
This can happen if the new page starts with a row that is covered by a
higher level tombstone and is completely empty after compaction.
Reset the _stop flag in start_new_page() to prevent this.

This commit also adds a unit test which reproduces the bug.
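A toy model of the bug and the fix (illustrative names only; the real compactor is far richer than this):

```cpp
#include <cassert>

// Per-page state kept in member flags must be reset when a new page
// starts; this toy compactor mirrors the _stop flag described above.
class page_compactor {
    bool _stop = false;
    unsigned _rows_in_page = 0;
    unsigned _page_limit;
public:
    explicit page_compactor(unsigned limit) : _page_limit(limit) {}
    void start_new_page() {
        _rows_in_page = 0;
        _stop = false;  // the fix: clear the previous page's stop decision
    }
    // Returns false once this page has all the rows it needs.
    bool consume_row() {
        if (++_rows_in_page >= _page_limit) {
            _stop = true;
        }
        return !_stop;
    }
    bool stopped() const { return _stop; }
};
```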

Fixes: #12361

Closes #12384
2022-12-24 13:52:45 +02:00
Takuya ASADA
642d035067 docker: prevent hostname -i failure when server address is specified
On some docker instance configurations, hostname resolution does not
work, so our script will fail on startup because we use hostname -i to
construct cqlshrc.
To prevent the error, we can use --rpc-address or --listen-address
for the address since it should be the same.

Fixes #12011

Closes #12115
2022-12-24 13:52:16 +02:00
Asias He
d819d98e78 storage_service: Ignore dropped table for repair_updater
In case a table is dropped, we should ignore it in the repair_updater,
since we cannot update the off-strategy trigger for a dropped table.

Refs #12373

Closes #12388
2022-12-24 13:48:25 +02:00
Raphael S. Carvalho
67ebd70e6e compaction_manager: Fix reactor stalls during periodic submissions
Every hour, the compaction manager will submit every registered table_state
for a regular compaction attempt, all without yielding.

This can potentially cause a reactor stall if there are 1000s of table
states, as compaction strategy heuristics will run on behalf of each,
and processing all buckets and picking the best one is not cheap.
This problem can be magnified with compaction groups, as each group
is represented by a table state.

This might appear in the dashboard as periodic stalls, every 1h, misleading
the investigator into believing that the problem is caused by a
scheduled job.

This is fixed by piggybacking on the compaction reevaluation loop, which
can yield between each submission attempt if needed.

Fixes #12390.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12391
2022-12-24 13:43:16 +02:00
Anna Stuchlik
74fd776751 docs: document scylla-api-cli 2022-12-23 11:27:37 +01:00
Benny Halevy
8797958dfc schema: operator<<: print also tombstone_gc_options
They are currently missing from the printout
when a table is created, but they are essential
to understanding the mode with which tombstones are to
be garbage-collected in the table.  gcGraceSeconds alone
is no longer enough since the introduction of
tombstone_gc_option in a8ad385ecd.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12381
2022-12-22 16:40:18 +02:00
Anna Stuchlik
7e8977bf2d docs: add the info about minor release 2022-12-22 10:26:33 +01:00
Nadav Har'El
ef2e5675ed materialized views, test: add tests for CLUSTERING ORDER BY
In issue #10767, concerns were raised that the CLUSTERING ORDER BY
clause is handled incorrectly in a CREATE MATERIALIZED VIEW definition.

The tests in this patch try to explore the different ways in which
CLUSTERING ORDER BY can be used in CREATE MATERIALIZED VIEW and allows
us to compare Scylla's behavior to Cassandra's, and to common sense.

The tests discover that the CLUSTERING ORDER BY feature in materialized
views generally works as expected, but there are *three* differences
between Scylla and Cassandra in this feature. We consider two differences
to be bugs (and hence the test is marked xfail) and one a Scylla extension:

1. When a base table has a reverse-order clustering column and this
   clustering column is used in the materialized view, in Cassandra
   the view's clustering order inherits the reversed order. In Scylla,
   the view's clustering order reverts to the default order.
   Arguably, both behaviors can be justified, but usually when in doubt
   we should implement Cassandra's behavior - not pick a different
   behavior, even if the different behavior is also reasonable. So
   this test (test_mv_inherit_clustering_order()) is marked "xfail",
   and a new issue was created about this difference: #12308.

   If we want to fix this behavior to match Cassandra's we should also
   consider backward compatibility - what happens if we change this
   behavior in Scylla now, after we had the opposite behavior in
   previous releases? We may choose to enshrine Scylla's Cassandra-
   incompatible behavior here - and document this difference.

2. The CLUSTERING ORDER BY should, as its name suggests, only list
   clustering columns. In Scylla, specifying other things, like regular
   columns, partition-key columns, or non-existent columns, is silently
   ignored, whereas it should result in an Invalid Request error (as it
   does in Cassandra). So test_mv_override_clustering_order_error()
   is marked "xfail".
   This is the difference already discovered in #10767.

3. When a materialized view has several clustering columns, Cassandra
   requires that a CLUSTERING ORDER BY clause, if present, must specify
   the order of all of *all* clustering columns. Scylla, in contrast,
   allows the user to override the order of only *some* of these columns -
   and the rest get the default order. I consider this to be a
   legitimate Scylla extension, and not a compatibility bug, so marked
   the test with "scylla_only", and no issue was opened about it.

Refs #10767
Refs #12308

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12307
2022-12-22 09:48:16 +02:00
Nadav Har'El
6d2e146aa6 test/cql-pytest.py: add scylla_inject_error() utility
This patch adds scylla_inject_error(), a context manager which tests
can use to temporarily enable some error injection while some test
code is running. It can be used to write tests that artificially
inject certain errors instead of trying to reach the elaborate (and
often requiring precise timing or high amounts of data) situation where
they occur naturally.

The error-injection API is Scylla-specific (it uses the Scylla REST API)
and does not work on "release"-mode builds (all other modes are supported),
so when Cassandra or release-mode build are being tested, the test which
uses scylla_inject_error() gets skipped.

Example usage:

```python
    from rest_api import scylla_inject_error
    with scylla_inject_error(cql, "injection_name", one_shot=True):
        # do something here
        ...
```

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12264
2022-12-22 09:39:10 +02:00
Nadav Har'El
01f0644b22 Merge 'scylla-gdb.py: introduce scylla get-config-value' from Botond Dénes
Retrieves the configuration item with the given name and prints its
value as well as its metadata.
Example:

    (gdb) scylla get-config-value compaction_static_shares
    value: 100, type: "float", source: SettingsFile, status: Used, live: MustRestart

Closes #12362

* github.com:scylladb/scylladb:
  scylla-gdb.py: add scylla get-config-value gdb command
  scylla-gdb.py: extract $downcast_vptr logic to standalone method
  test: scylla-gdb/run: improve diagnostics for failed tests
2022-12-21 18:38:23 +02:00
Aleksandra Martyniuk
599fce16cf repair: make top level repair tasks abortable 2022-12-21 11:52:58 +01:00
Aleksandra Martyniuk
e77de463e4 repair: unify a way of aborting repair operations 2022-12-21 11:52:53 +01:00
Aleksandra Martyniuk
f56e886127 repair: delete sharded abort source from node_ops_info
Sharded abort source in node_ops_info is no longer needed since
its functionality is provided by task manager's tasks structure.
2022-12-21 11:37:03 +01:00
Aleksandra Martyniuk
18efe0a4e8 repair: delete unused node_ops_info from data_sync_repair_task_impl 2022-12-21 11:28:30 +01:00
Aleksandra Martyniuk
ee13a5dde8 api: extend status in task manager api
Status of tasks returned with get_task_status and wait_task is extended
with the list of ids of child tasks.
2022-12-21 10:54:56 +01:00
Aleksandra Martyniuk
697af4ccf2 api: extend get_tasks in task manager api
Each task stats in a list returned from tm::get_task api call
is extended with info about: task type, keyspace, table, entity,
and sequence number.
2022-12-21 10:54:50 +01:00
Michał Chojnowski
19049150ef configure.py: remove --static, --pie, --so
These options have been nonsense since 2017.
--pie and --so are ignored, --static disables (sic!) static linking of
libraries.
Remove them.

Closes #12366
2022-12-21 11:01:56 +02:00
Botond Dénes
29d49e829e scylla-gdb.py: add scylla get-config-value gdb command
Retrieves the configuration item with the given name and prints its
value as well as its metadata.
Example:
    (gdb) scylla get-config-value compaction_static_shares
    value: 100, type: "float", source: SettingsFile, status: Used, live: MustRestart
2022-12-21 03:05:56 -05:00
Botond Dénes
0cdb89868a scylla-gdb.py: extract $downcast_vptr logic to standalone method
So it can be reused by regular python code.
2022-12-21 03:05:56 -05:00
Botond Dénes
24022c19a6 test: scylla-gdb/run: improve diagnostics for failed tests
By instructing gdb to print the full python stack in case of errors.
2022-12-21 03:05:56 -05:00
Michał Chojnowski
d9269abf5b sstables: index_reader: always evict the local cache gently
Due to an oversight, the local index cache isn't evicted gently
when _upper_bound existed. This is a source of reactor stalls.
Fix that.

Fixes #12271

Closes #12364
2022-12-20 18:23:27 +02:00
Michał Radwański
e7fbcd6c9d mutation_partition_view: treat query::partition_slice::option::reversed in to_data_query_result as consume_in_reverse::yes
The consume_in_reverse::legacy_half_reverse format is soon to be phased
out. This commit starts treating frozen_mutations from replicas for
reversed queries so that they are consumed with consume_in_reverse::yes.
2022-12-20 17:05:02 +01:00
Benny Halevy
1adb2bff18 mutation: move consume_in_reverse def to mutation_consumer.hh
To be used also by frozen_mutation consumer.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-20 16:23:10 +01:00
Avi Kivity
bb731b4f52 Merge 'docs: move documentation of tools online' from Botond Dénes
Currently the scylla tools (`scylla-types` and `scylla-sstable`) have documentation in two places: high level documentation can be found at `docs/operating-scylla/admin-tools/scylla-{types,sstable}.rst`, while low level, more detailed documentation is embedded in the tool itself. This is especially pronounced for `scylla-sstable`, which only has a short description of its operations online, all details being found only in the command-line help.
We want to move away from this model, such that all documentation can be found online, with the command-line help being reserved to documenting how the various switches and flags work, on top of a short description of the operation and a link to the detailed online docs.

Closes #12284

* github.com:scylladb/scylladb:
  tool/scylla-sstable: move documentation online
  docs: scylla-sstable.rst: add sstable content section
  docs: scylla-{sstable,types}.rst: drop Syntax section
2022-12-20 17:04:47 +02:00
Avi Kivity
3fce43124a Merge 'Static compaction groups' from Raphael "Raph" Carvalho
Allows static configuration of number of compaction groups per table per shard.

To bootstrap the project, the config option x_log2_compaction_groups was added, which controls both the number of groups and the partitioning within a shard.

With a value of 0 (default), it means 1 compaction group, therefore all tokens go there.
With a value of 3, it means 8 compaction groups, with the 3 most-significant bits of a token deciding which group owns it.
And so on.

It's still missing:
- integration with repair / streaming
- integration with reshard / reshape.

perf/perf_simple_query --smp 1 --memory 1G

BEFORE
-----
median 61358.55 tps ( 71.1 allocs/op,  12.2 tasks/op,   56375 insns/op,        0 errors)
median 61322.80 tps ( 71.1 allocs/op,  12.2 tasks/op,   56391 insns/op,        0 errors)
median 61058.58 tps ( 71.1 allocs/op,  12.2 tasks/op,   56386 insns/op,        0 errors)
median 61040.94 tps ( 71.1 allocs/op,  12.2 tasks/op,   56381 insns/op,        0 errors)
median 61118.40 tps ( 71.1 allocs/op,  12.2 tasks/op,   56379 insns/op,        0 errors)

AFTER
-----
median 61656.12 tps ( 71.1 allocs/op,  12.2 tasks/op,   56486 insns/op,        0 errors)
median 61483.29 tps ( 71.1 allocs/op,  12.2 tasks/op,   56495 insns/op,        0 errors)
median 61638.05 tps ( 71.1 allocs/op,  12.2 tasks/op,   56494 insns/op,        0 errors)
median 61726.09 tps ( 71.1 allocs/op,  12.2 tasks/op,   56509 insns/op,        0 errors)
median 61537.55 tps ( 71.1 allocs/op,  12.2 tasks/op,   56491 insns/op,        0 errors)

Closes #12139

* github.com:scylladb/scylladb:
  test: mutation_test: Test multiple compaction groups
  test: database_test: Test multiple compaction groups
  test: database_test: Adapt it to compaction groups
  db: Add config for setting static number of compaction groups
  replica: Introduce static compaction groups
  test: sstable_test: Stop referencing single compaction group
  api: compaction_manager: Stop a compaction type for all groups
  api: Estimate pending tasks on all compaction groups
  api: storage_service: Run maintenance compactions on all compaction groups
  replica: table: Adapt assertion to compaction groups
  replica: database: stop and disable compaction on behalf of all groups
  replica: Introduce table::parallel_foreach_table_state()
  replica: disable auto compaction on behalf of all groups
  replica: table: Rework compaction triggers for compaction groups
  replica: Adapt table::get_sstables_including_compacted_undeleted() to compaction groups
  replica: Adapt table::rebuild_statistics() to compaction groups
  replica: table: Perform major compaction on behalf of all groups
  replica: table: Perform off-strategy compaction on behalf of all groups
  replica: table: Perform cleanup compaction on behalf of all groups
  replica: Extend table::discard_sstables() to operate on all compaction groups
  replica: table: Create compound sstable set for all groups
  replica: table: Set compaction strategy on behalf of all groups
  replica: table: Return min memtable timestamp across all groups
  replica: Adapt table::stop() to compaction groups
  replica: Adapt table::clear() to compaction groups
  replica: Adapt table::can_flush() to compaction groups
  replica: Adapt table::flush() to compaction groups
  replica: Introduce parallel_foreach_compaction_group()
  replica: Adapt table::set_schema() to compaction groups
  replica: Add memtables from all compaction groups for reads
  replica: Add memtable_count() method to compaction_group
  replica: table: Reserve reader list capacity through a callback
  replica: Extract addition of memtables to reader list into a new function
  replica: Adapt table::occupancy() to compaction groups
  replica: Adapt table::active_memtable() to compaction groups
  replica: Introduce table::compaction_groups()
  replica: Preparation for multiple compaction groups
  scylla-gdb: Fix backward compatibility of scylla_memtables command
2022-12-20 17:04:47 +02:00
Avi Kivity
623be22d25 Merge 'sstables: allow bypassing min max position metadata loading' from Botond Dénes
Said mechanism broke tools and tests to some extent: the read it executes at sstable load time means that if the sstable is broken enough to fail this read, it will fail to load, preventing diagnostic tools from loading and examining it, and preventing tests from producing broken sstables for testing purposes.

Closes #12359

* github.com:scylladb/scylladb:
  sstables: allow bypassing first/last position metadata loading
  sstables: sstable::{load,open_data}(): fix indentation
  sstables: coroutinize sstable::open_data()
  sstables: sstable::open_data(): use clear_gently() to clear token ranges
  sstables: coroutinize sstable::load()
2022-12-20 17:04:47 +02:00
Aleksandra Martyniuk
60e298fda1 repair: change utils::UUID to node_ops_id
Type of the id of node operations is changed from utils::UUID
to node_ops_id. This way the id of node operations would be easily
distinguished from the ids of other entities.

Closes #11673
2022-12-20 17:04:47 +02:00
Avi Kivity
88a1fbd72f Update seastar submodule
* seastar 3a5db04197...3db15b5681 (27):
  > build: get the full path of c-ares
  > build: unbreak pkgconfig output
  > http: Add 206 Partial Content response code
  > http: Carry integer content_length on reply
  > tls_test: drop duplicated includes
  > tls_test: remove duplicated test case
  > reactor: define __NR_pidfd_open if not defined
  > sockets: Wait on socket peer closing the connection
  > tcp: Close connection when getting RST from server
  > Merge 'Enhance rpc tester with delays, timeouts and verbosity' from Pavel Emelyanov
  > Merge 'build: use pkg_search_module(.. IMPORTED_TARGET ..) ' from Kefu Chai
  > build: define GnuTLS_{LIBRARIES, INCLUDE_DIRS} only if GnuTLS is found
  > build: use pkg_search_module(.. IMPORTED_TARGET ..)
  > addr2line: extend asan regex
  > abort_source: move-assign operator: call base class unlink
  > coroutine: correct syntax error in doxygen comment
  > demo: Extend http connection demo with https
  > test: temporarily disable warning for tests triggering warnings
  > tests/unit/coroutine: Include <ranges>
  > sstring: Document why sstring exists at all
  > test: log error when read/write to pipe fails
  > test: use executables in /bin
  > tests: spawn_test: use BOOST_CHECK_EQUAL() for checking equality of temporary_buffer
  > docker: bump up to clang {14,15} and gcc {11,12}
  > shared_ptr: ignore false alarm from GCC-12
  > build: check for fix of CWG2631
  > circleci: use versioned container image

Closes #12355
2022-12-20 17:04:47 +02:00
Botond Dénes
3c8949d34c sstables: allow bypassing first/last position metadata loading
When loading an sstable. Tests and tools might want to do this to be
able to load a damaged sstable and run tests/diagnostics on it.
2022-12-20 01:45:38 -05:00
Botond Dénes
bba956c13c sstables: sstable::{load,open_data}(): fix indentation 2022-12-20 01:45:38 -05:00
Botond Dénes
c85ff7945d sstables: coroutinize sstable::open_data()
Used once when sstable is opened on startup, not performance sensitive.
2022-12-20 01:45:38 -05:00
Botond Dénes
15966a0b1b sstables: sstable::open_data(): use clear_gently() to clear token ranges
Instead of an open-coded loop. It also makes the code easier to
coroutinize (next patch).
2022-12-20 01:45:22 -05:00
Nadav Har'El
08c8e0d282 test/alternator: enable tests for long strings of consecutive tombstones
In the past we had issue #7933 where very long strings of consecutive
tombstones caused Alternator's paging to take an unbounded amount of
time and/or memory for a single page. This issue was fixed (by commit
e9cbc9ee85) but the two tests we had
reproducing that issue were left with the "xfail" mark.
They were also marked "veryslow" - each taking about 100 seconds - so
they didn't run by default so nobody noticed they started to pass.

In this patch I make these tests much faster (taking less than a second
together), confirm that they pass - and remove the "xfail" mark and
improve their descriptions.

The trick to making these tests faster is to not create a million
tombstones like we used to: We now know that after string of just 10,000
tombstones ('query_tombstone_page_limit') the page should end, so
we can check specifically this number. The story is more complicated for
partition tombstones, but there too it should be a multiple of
query_tombstone_page_limit. To make the tests even faster, we change
run.py to lower the query_tombstone_page_limit from the default 10,000
to 1000. The tests work correctly even without this change, but they are
ten times faster with it.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12350
2022-12-20 07:08:36 +02:00
Botond Dénes
94f3fb341f Merge 'Fix nix devenv' from Michael Livshin
* Update Nixpkgs base

* Clarify some comments

* Get rid of custom-packaged cxxbridge (it's now present in Nixpkgs as
  cxx-rs)

* Add missing libraries (libdeflate, libxcrypt)

* Fix expected hash of the gdb patch

* Fix a couple of small build problems

Fixes #12259

Closes #12346

* github.com:scylladb/scylladb:
  build: fix Nix devenv
  cql3: mark several private fields as maybe_unused
  configure.py: link with more abseil libs
2022-12-20 07:01:06 +02:00
Michael Livshin
7c383c6249 build: fix Nix devenv
* Update Nixpkgs base

* Clarify some comments

* Get rid of custom-packaged cxxbridge (it's now present in Nixpkgs as
  cxx-rs)

* Add missing libraries (libdeflate, libxcrypt)

* Fix expected hash of the gdb patch

* Bump Python driver to 3.25.20-scylla

Fixes #12259
2022-12-19 20:53:07 +02:00
Michael Livshin
4407828766 cql3: mark several private fields as maybe_unused
Because they are indeed unused -- they are initialized, passed down
through some layers, but not actually used.  No idea why only Clang 12
in debug mode in Nix devenv complains about it, though.
2022-12-19 20:53:07 +02:00
Michael Livshin
c0c8afb79e configure.py: link with more abseil libs
Specifically libabsl_strings{,_internal}.a.

This fixes failure to link tests in the Nix devenv; since presumably
all is good in other setups, it must be something weird having to do
with inlining?

The extra linked libraries shouldn't hurt in any case.
2022-12-19 20:53:07 +02:00
Raphael S. Carvalho
e7380bea65 test: mutation_test: Test multiple compaction groups
Extends mutation_test to run the tests with more than one
compaction group, in addition to a single one (default).

Piggyback on existing tests. Avoids duplication.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 12:36:07 -03:00
Raphael S. Carvalho
e3e7c3c7e5 test: database_test: Test multiple compaction groups
Extends database_test to run the tests with more than one
compaction group, in addition to a single one (default).

Piggyback on existing tests. Avoids duplication.

Caught a bug when snapshotting, in implementation of
table::can_flush(), showing its usefulness.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 12:36:07 -03:00
Raphael S. Carvalho
e103e41c76 test: database_test: Adapt it to compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 12:36:05 -03:00
Aleksandra Martyniuk
be529cc209 repair: delete redundant abort subscription from shard_repair_task_impl
data_sync_repair_task_impl subscribes to the corresponding node_ops_info
abort source and then, when requested, all its descendants are
aborted recursively. Thus, shard_repair_task_impl does not need
to subscribe to the node_ops_info abort source, since the parent
task will take care of aborting once it is requested.

The abort_subscription and related attributes are deleted from
shard_repair_task_impl.
2022-12-19 16:07:28 +01:00
Aleksandra Martyniuk
e48ca62390 repair: add abort subscription to data sync task
When a node operation is aborted, the same should happen to
the corresponding task manager's repair task.

Subscribe data_sync_repair_task_impl's abort() to the node_ops_info
abort_source.
2022-12-19 15:57:35 +01:00
Aleksandra Martyniuk
2b35d7df1b tasks: abort tasks on system shutdown
When the system shuts down, all of the task manager's top-level tasks are aborted.
Responsibility for aborting child tasks lies with their parents.
2022-12-19 15:57:35 +01:00
Botond Dénes
827cd0d37b sstables: coroutinize sstable::load()
It is nicely simplified by this. No regression expected; this method is
supposedly only used by tests and tools.
2022-12-19 09:33:52 -05:00
Raphael S. Carvalho
d9ab59043e db: Add config for setting static number of compaction groups
This new option allows the user to control the number of compaction groups
per table per shard. It's 0 by default, which implies a single compaction
group, as is the case today.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:24 -03:00
Raphael S. Carvalho
9cf4dc7b62 replica: Introduce static compaction groups
This is the initial support for multiple groups.

_x_log2_compaction_groups controls the number of compaction groups
and the partitioning strategy within a single table.

The value in _x_log2_compaction_groups refers to log base 2 of the
actual number of groups.

0 means 1 compaction group.
1 means 2 groups, with the most significant bit of the token being
used to pick the target group.

The group partitioner should be later abstracted for making tablet
integration easier in the future.

_x_log2_compaction_groups is still a constant but a config option
will come next.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:23 -03:00
Raphael S. Carvalho
c807e61715 test: sstable_test: Stop referencing single compaction group
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:20 -03:00
Raphael S. Carvalho
254c38c4d2 api: compaction_manager: Stop a compaction type for all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:19 -03:00
Raphael S. Carvalho
4e836cb96c api: Estimate pending tasks on all compaction groups
Estimates the number of compaction jobs to be performed on a table.
Adaptation is done by summing the estimates from all groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:17 -03:00
Raphael S. Carvalho
640436e72a api: storage_service: Run maintenance compactions on all compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:15 -03:00
Raphael S. Carvalho
e0c5cbee8d replica: table: Adapt assertion to compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:13 -03:00
Raphael S. Carvalho
d35cf88f09 replica: database: stop and disable compaction on behalf of all groups
With the compaction group model, truncate_table_on_all_shards() needs
to stop and disable compaction for all groups.
replica::table::as_table_state() will be removed once no user
remains, as each table may map to multiple groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:12 -03:00
Raphael S. Carvalho
50b02ee0bd replica: Introduce table::parallel_foreach_table_state()
This will replace table::as_table_state(). The latter will be
killed once its usage drops to zero.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:10 -03:00
Raphael S. Carvalho
fd69bd433e replica: disable auto compaction on behalf of all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:08 -03:00
Raphael S. Carvalho
6fefbe5706 replica: table: Rework compaction triggers for compaction groups
Allow a table-wide compaction trigger, as well as a fine-grained trigger,
e.g. after flushing a memtable on behalf of a single group.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:07 -03:00
Raphael S. Carvalho
6a6adea3ab replica: Adapt table::get_sstables_including_compacted_undeleted() to compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:05 -03:00
Raphael S. Carvalho
5919836da8 replica: Adapt table::rebuild_statistics() to compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:04 -03:00
Raphael S. Carvalho
70b727db31 replica: table: Perform major compaction on behalf of all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:01 -03:00
Raphael S. Carvalho
e3ccdb17a0 replica: table: Perform off-strategy compaction on behalf of all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:16:00 -03:00
Raphael S. Carvalho
6efc9fd1f6 replica: table: Perform cleanup compaction on behalf of all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:58 -03:00
Raphael S. Carvalho
36e11eb2a5 replica: Extend table::discard_sstables() to operate on all compaction groups
discard_sstables() runs in the context of truncate, which is a table-wide
operation today, and will remain so with multiple static groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:55 -03:00
Raphael S. Carvalho
24c3687c3f replica: table: Create compound sstable set for all groups
Avoids an extra compound set for a single-compaction-group table.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:52 -03:00
Raphael S. Carvalho
eb620da981 replica: table: Set compaction strategy on behalf of all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:50 -03:00
Raphael S. Carvalho
7a0e4f900f replica: table: Return min memtable timestamp across all groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:49 -03:00
Raphael S. Carvalho
ceaa8a1ef1 replica: Adapt table::stop() to compaction groups
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:47 -03:00
Raphael S. Carvalho
facf923440 replica: Adapt table::clear() to compaction groups
clear() clears memtable content and cache.

The cache is shared by all groups, therefore adaptation happens by only
clearing the memtables of all groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:45 -03:00
Raphael S. Carvalho
a9c902cd5e replica: Adapt table::can_flush() to compaction groups
can_flush() is used externally to determine if a table has an active
memtable that can be flushed. Therefore, adaptation happens by
returning true if any of the groups can be flushed. A subsequent
flush request will flush memtable of all groups that are ready
for it.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:44 -03:00
Raphael S. Carvalho
ea42090d47 replica: Adapt table::flush() to compaction groups
Adaptation of flush() happens by triggering a flush on the memtable of all
groups.
table::seal_active_memtable() will bail out if the memtable is empty, so
it's not a problem to call flush on a group whose memtable is empty.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:42 -03:00
Raphael S. Carvalho
7274c83098 replica: Introduce parallel_foreach_compaction_group()
This variant will be useful when iterating through groups
and performing async actions on each. It guarantees that all
groups are alive by the time they're reached in the loop.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:40 -03:00
Raphael S. Carvalho
89ab9d7227 replica: Adapt table::set_schema() to compaction groups
set_schema() is used by the database to apply schema changes to
table components which include memtables.
Adaptation happens by setting schema to memtable(s) of all groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:38 -03:00
Raphael S. Carvalho
0022322ae3 replica: Add memtables from all compaction groups for reads
Let's add memtables of all compaction groups. Point queries are
optimized by picking a single group.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:36 -03:00
Raphael S. Carvalho
e044001176 replica: Add memtable_count() method to compaction_group
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:34 -03:00
Raphael S. Carvalho
f2ea79f26c replica: table: Reserve reader list capacity through a callback
add_memtables_to_reader_list() will be adapted to compaction groups.
For point queries, it will add memtables of a single group.
With the callback, add_memtables_to_reader_list() can tell its
caller the exact number of memtable readers to be added, so it
can reserve the reader capacity precisely.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:33 -03:00
Raphael S. Carvalho
e841508685 replica: Extract addition of memtables to reader list into a new function
Will make it easier to add memtables of all compaction groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:19 -03:00
Raphael S. Carvalho
530956b2de replica: Adapt table::occupancy() to compaction groups
table::occupancy() provides accumulated occupancy stats from
memtables.
Adaptation happens by accumulating stats from memtables of
all groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:17 -03:00
Raphael S. Carvalho
ef8f542d75 replica: Adapt table::active_memtable() to compaction groups
active_memtable() was fine with a single group, but with multiple groups,
there will be one active memtable per group. Let's change the
interface to reflect that.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:14 -03:00
Raphael S. Carvalho
429c5aa2f9 replica: Introduce table::compaction_groups()
Useful for iterating through all groups. This is an intermediary
implementation which requires an allocation, as only one group
is supported today.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:12 -03:00
Raphael S. Carvalho
514008f136 replica: Preparation for multiple compaction groups
Adjusts scylla_memtables gdb command to multiple groups,
while keeping backward compatibility.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:10 -03:00
Raphael S. Carvalho
52b94b6dd7 scylla-gdb: Fix backward compatibility of scylla_memtables command
Fix it while refactoring the code for arrival of multiple compaction
groups.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-19 11:15:07 -03:00
Anna Stuchlik
bbfb9556fc doc: mark the in-memory tables feature as deprecated
Closes #12286
2022-12-19 15:39:31 +02:00
Avi Kivity
c70a9b0166 test: make test xml filenames more unique
ea99750de7 ("test: give tests less-unique identifiers") made
the disambiguating ids unambiguous only within a single test
case. This made all tests named "run" have the same name "run.1".

Fix that by adding the suite name everywhere: in test paths, and
in junit test case names.

Fixes #12310.

Closes #12313
2022-12-19 15:03:51 +02:00
Botond Dénes
3e6ddf21bc Merge 'storage_service: unbootstrap: avoid unnecessary copy of ranges_to_stream' from Benny Halevy
`ranges_to_stream` is a map of `std::unordered_multimap<dht::token_range, inet_address>` per keyspace.
On large clusters with a large number of keyspaces, copying it may cause reactor stalls as seen in #12332

This series eliminates this copy by using std::move and also
turns `stream_ranges` into a coroutine, adding maybe_yield calls to avoid further stalls down the road.

Fixes #12332

Closes #12343

* github.com:scylladb/scylladb:
  storage_service: stream_ranges: unshare streamer
  storage_service: stream_ranges: maybe_yield
  storage_service: coroutinize stream_ranges
  storage_service: unbootstrap: move ranges_to_stream_by_keyspace to stream_ranges
2022-12-19 12:53:16 +02:00
Benny Halevy
e8aa1182b2 docs: replace-dead-node: get rid of hibernate status
With replace using node operations, the HIBERNATE
gossip status is not used anymore.

This change updates documentation to reflect that.
During replace, the replacing node shows in gossipinfo
with STATUS:NORMAL.

Also, the replaced node shows as DN in `nodetool status`
while being replaced, so remove the paragraph saying it's
not listed in `nodetool status`.

Plus, tidy up the text alignment.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 12:19:10 +02:00
Benny Halevy
c9993f020d storage_service: get rid of handle_state_replacing
Since 2ec1f719de, nodes no longer
publish the HIBERNATE state, so we don't need to support handling it.

Replace is now always done using node operations (using
repair or streaming).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 12:19:08 +02:00
Benny Halevy
60de7d28db storage_service: stream_ranges: unshare streamer
Now that stream_ranges is a coroutine,
the streamer can be an automatic variable on the
coroutine's stack frame.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 07:42:07 +02:00
Benny Halevy
9badcd56ca storage_service: stream_ranges: maybe_yield
Prevent stalls with a large number of keyspaces
and token ranges.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 07:42:07 +02:00
Benny Halevy
2cf75319b0 storage_service: coroutinize stream_ranges
Before adding maybe_yield calls.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 07:42:01 +02:00
Benny Halevy
82486bb5d2 storage_service: unbootstrap: move ranges_to_stream_by_keyspace to stream_ranges
Avoid a potentially large memory copy causing
a reactor stall with a large number of keyspaces.

Fixes #12332

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-19 07:39:48 +02:00
Avi Kivity
7c7eb81a66 Merge 'Encapsulate filesystem access by sstable into filesystem_storage subclass' from Pavel Emelyanov
This is to define the API that sstable needs from the underlying storage. When implementing an object-storage backend, one will need to implement these. The API looks like:

        future<> snapshot(const sstable& sst, sstring dir, absolute_path abs) const;
        future<> quarantine(const sstable& sst, delayed_commit_changes* delay);
        future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay);
        void open(sstable& sst, const io_priority_class& pc); // runs in async context
        future<> wipe(const sstable& sst) noexcept;

        future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity);

It doesn't have "list" or the like, because that is not a method of an individual sstable, but rather one of sstables_manager. It will come as a separate PR.

Closes #12217

* github.com:scylladb/scylladb:
  sstable, storage: Mark dir/temp_dir private
  sstable: Remove get_dir() (well, almost)
  sstable: Add quarantine() method to storage
  sstable: Use absolute/relative path marking for snapshot()
  sstable: Remove temp_... stuff from sstable
  sstable: Move open_component() on storage
  sstable: Mark rename_new_sstable_component_file() const
  sstable: Print filename(type) on open-component error
  sstable: Reorganize new_sstable_component_file()
  sstable: Mark filename() private
  sstable: Introduce index_filename()
  tests: Disclosure private filename() calls
  sstable: Move wipe_storage() on storage
  sstable: Remove temp dir in wipe_storage()
  sstable: Move unlink parts into wipe_storage
  sstable: Remove get_temp_dir()
  sstable: Move write_toc() to storage
  sstable: Shuffle open_sstable()
  sstable: Move touch_temp_dir() to storage
  sstable: Move move() to storage
  sstable: Move create_links() to storage
  sstable: Move seal_sstable() to storage
  sstable: Tossing internals of seal_sstable()
  sstable: Move remove_temp_dir() to storage
  sstable: Move create_links_common() to storage
  sstable: Move check_create_links_replay() to storage
  sstable: Remove one of create_links() overloads
  sstable: Remove create_links_and_mark_for_removal()
  sstable: Indentation fix after previous patch
  sstable: Coroutinize create_links_common()
  sstable: Rename create_links_common()'s "dir" argument
  sstable: Make mark_for_removal bool_class
  sstable, table: Add sstable::snapshot() and use in table::take_snapshot
  sstable: Move _dir and _temp_dir on filesystem_storage
  sstable: Use sync_directory() method
  test, sstable: Use component_basename in test
  sstables: Move read_{digest|checksum} on sstable
2022-12-18 17:29:35 +02:00
Anna Stuchlik
6a8eb33284 docs: add the new upgrade guide 2022.1 to 2022.2 to the index and the toctree 2022-12-16 17:13:50 +01:00
Anna Stuchlik
36f4ef2446 docs: add the index file for the new upgrade guide from 2022.1 to 2022.2 2022-12-16 17:11:25 +01:00
Anna Stuchlik
8d8983e029 docs: add the metrics update file to the upgrade guide 2022.1 to 2022.2 2022-12-16 17:09:21 +01:00
Anna Stuchlik
252c2139c2 docs: add the upgrade guide for ScyllaDB Enterprise from 2022.1 to 2022.2 2022-12-16 17:07:00 +01:00
Michał Chojnowski
b52bd9ef6a db: commitlog: remove unused max_active_writes()
Dead and misleading code.

Closes #12327
2022-12-16 10:23:03 +02:00
Nadav Har'El
327539b15d Merge 'test.py: fix cql failure handling' from Alecco
Fix a bug in failure handling and log level.

Closes #12336

* github.com:scylladb/scylladb:
  test.py: convert param to str
  test.py: fix error level for CQL tests
2022-12-16 09:29:21 +02:00
Botond Dénes
cc03becf82 Merge 'tasks: get task's type with method' from Aleksandra Martyniuk
The type of an operation is tied to a specific implementation
of a task. Therefore, it should rather be accessed with a virtual
method in tasks::task_manager::task::impl than be
its attribute.

Closes #12326

* github.com:scylladb/scylladb:
  api: delete unused type parameter from task_manager_test api
  tasks: repair: api: remove type attribute from task_manager::task::status
  tasks: add type() method to task_manager::task::impl
  repair: add reason attribute to repair_task
2022-12-16 09:20:26 +02:00
Aleksandra Martyniuk
f81ad2d66a repair: make shard tasks internal
Shard tasks should not be visible to users by default, thus they are
made internal.

Closes #12325
2022-12-16 09:05:30 +02:00
Aleksandra Martyniuk
bae887da3b tasks: add virtual destructor to task_manager::module
When an object of a class inheriting from task_manager::module
is destroyed, destructor of the derived class should be called.

Closes #12324
2022-12-16 08:59:26 +02:00
Raphael S. Carvalho
e6fb3b3a75 compaction: Delete atomically off-strategy input sstables
After commit a57724e711, off-strategy no longer races with view
building, therefore deletion code can be simplified and piggyback
on mechanism for deleting all sstables atomically, meaning a crash
midway won't result in some of the files coming back to life,
which leads to unnecessary work on restart.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12245
2022-12-16 08:15:49 +02:00
Alejo Sanchez
9b65448d38 test.py: convert param to str
The format_unidiff() function takes str, not pathlib PosixPath, so
convert it to str.

This prevented the diff output of an unexpected result from being shown in the log
file.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-12-15 20:46:35 +01:00
Alejo Sanchez
5142d80bb1 test.py: fix error level for CQL tests
If the test fails, use error log level.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-12-15 20:45:44 +01:00
Botond Dénes
64903ba7d5 test/cql-pytest: use pytest site-packages workaround
Recently, the pytest script shipped by Fedora started invoking python
with the `-s` flag, which disables python considering user site
packages. This caused problems for our tests which install the cassandra
driver in the user site packages. This was worked around in e5e7780f32
by providing our own pytest interposer launcher script which does not
pass the above mentioned flag to python. Said patch fixed test.py but
not the run.py in cql-pytest. So if the cql-pytest suite is ran via
test.py it works fine, but if it is invoked via the run script, it fails
because it cannot find the cassandra driver. This patch patches run.py
to use our own pytest launcher script, so the suite can be run via the
run script as well.
Since run.py is shared with the alternator pytest suite, this patch also
fixes said test suite too.

Closes #12253
2022-12-15 16:05:31 +02:00
Benny Halevy
639e247734 test: cql-pytest: test_describe: test_table_options_quoting: USE test_keyspace
Without that, I often (but not always) get the following error:
```
__________________________ test_table_options_quoting __________________________

cql = <cassandra.cluster.Session object at 0x7f1aafb10650>
test_keyspace = 'cql_test_1671103335055'

    def test_table_options_quoting(cql, test_keyspace):
        type_name = f"some_udt; DROP KEYSPACE {test_keyspace}"
        column_name = "col''umn -- @quoting test!!"
        comment = "table''s comment test!\"; DESC TABLES --quoting test"
        comment_plain = "table's comment test!\"; DESC TABLES --quoting test" #without doubling "'" inside comment

>       cql.execute(f"CREATE TYPE \"{type_name}\" (a int)")

test/cql-pytest/test_describe.py:623:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
cassandra/cluster.py:2699: in cassandra.cluster.Session.execute
    ???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

>   ???
E   cassandra.InvalidRequest: Error from server: code=2200 [Invalid query] message="No keyspace has been specified. USE a keyspace, or explicitly specify keyspace.tablename"
```

The CQL driver in use is the scylla driver, version 3.25.10.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12329
2022-12-15 14:35:33 +02:00
Aleksandra Martyniuk
f0b2b00a15 api: delete unused type parameter from task_manager_test api 2022-12-15 10:50:30 +01:00
Aleksandra Martyniuk
5bc09daa7a tasks: repair: api: remove type attribute from task_manager::task::status 2022-12-15 10:49:09 +01:00
Aleksandra Martyniuk
8d5377932d tasks: add type() method to task_manager::task::impl 2022-12-15 10:41:58 +01:00
Aleksandra Martyniuk
329176c7bc repair: add reason attribute to repair_task
As a preparation for creating a type() method in task_manager::task::impl,
a streaming::stream_reason is kept in repair_task.
2022-12-15 10:38:38 +01:00
Botond Dénes
9713a5c314 tool/scylla-sstable: move documentation online
The inline-help of operations will only contain a short summary of the
operation and the link to the online documentation.
The move is not a straightforward copy-paste, first and foremost because
we move from simple markdown to RST. Informal references are also
replaced with proper RST links. Some small edits were also done on the
texts.
The intent is the following:
* the inline help serves as a quick reference for what the operation
  does and what flags it has;
* the online documentation serves as the full reference manual,
  explaining all details;
2022-12-15 04:10:21 -05:00
Botond Dénes
3cf7afdf95 docs: scylla-sstable.rst: add sstable content section
Provides a link to the architecture/sstable page for more details on the
sstable format itself. It also describes the mutation-fragment stream,
the parts of it that are relevant to the sstable operations.
The purpose of this section is to provide a target for links that want to
point to a common explanation on the topic. In particular, we will soon
move the detailed documentation of the scylla-sstable operations into
this file and we want to have a common explanation of the mutation
fragment stream that these operations can point to.
2022-12-15 04:10:21 -05:00
Botond Dénes
641fb4c8bb docs: scylla-{sstable,types}.rst: drop Syntax section
In both files, the section hierarchy is as follows:

    Usage
        Syntax
            Sections with actual content

This scheme uses up 3 levels of hierarchy, leaving not much room to
expand the sections with actual content with subsections of their own.
Remove the Syntax level altogether, directly embedding the sections with
content under the Usage section.
2022-12-15 04:03:00 -05:00
Botond Dénes
8f8284783a Merge 'Fix handling of non-full clustering keys in the read path' from Tomasz Grabiec
This PR fixes several bugs related to handling of non-full
clustering keys.

One is in trim_clustering_row_ranges_to(), which is broken for non-full keys in reverse
mode. It will trim the range to position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.

Fixes #12180

after_key() was creating a position which is after all keys prefixed
by a non-full key, rather than a position which is right after that
key.

This issue will be caught by cql_query_test::test_compact_storage
in debug mode when mutation_partition_v2 merging starts inserting
sentinels at position after_key() on preemption.

It probably already causes problems for such keys as after_key() is used
in various parts of the read path.

Refs #1446

Closes #12234

* github.com:scylladb/scylladb:
  position_in_partition: Make after_key() work with non-full keys
  position_in_partition: Introduce before_key(position_in_partition_view)
  db: Fix trim_clustering_row_ranges_to() for non-full keys and reverse order
  types: Fix comparison of frozen sets with empty values
2022-12-15 10:47:12 +02:00
Pavel Emelyanov
6d10a3448b sstable, storage: Mark dir/temp_dir private
Now all storage access via sstable happens with the help of storage
class API, so its internals can finally be made private.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
6296ca3438 sstable: Remove get_dir() (well, almost)
sstable::get_dir() is now (almost) gone; callers no longer learn from it
which filesystem path an sstable lives at. Only a few callers are left.

One is several places in code that need sstable datafile, toc and index
paths to print them in logs. The other one is sstable_directory that is
to be patched separately.

For both there's a storage.prefix() method that prepends the component name
with where the sstable is "really" located.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
7402787d16 sstable: Add quarantine() method to storage
Moving an sstable to quarantine has a specific quirk -- if the sstable is in
the staging/ directory, it is moved into the root/quarantine dir anyway, not into
the quarantine subdir of its current location.

Encapsulate this feature in a storage class method.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
f507271578 sstable: Use absolute/relative path marking for snapshot()
The snapshotting code uses full paths to files to manipulate snapshotted
sstables. Until this code is patched to use some proper snapshotting API
from sstable/ module, it will continue doing so.

However, to remove the get_dir() method from sstable, the
seal_sstable() needs to pass the relative "backup" directory to the
storage::snapshot() method. This patch adds a temporary bool_class for
this distinction.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
a46d378bee sstable: Remove temp_... stuff from sstable
There's a bunch of helpers around the XFS-specific temp-dir sitting in
the public sstable part. Drop it altogether; no code really needs it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
adba24d8ae sstable: Move open_component() on storage
Obtaining a class file object to read/write sstable from/to is now
storage-specific.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
4c22831d23 sstable: Mark rename_new_sstable_component_file() const
It is in fact const. The next patch will need it const to call this method
via const sstable reference.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
6bf3e3a921 sstable: Print filename(type) on open-component error
The file path is going to disappear soon, so print the filename() on
error. For now it's the same, but the meaning of the string returned by
filename() is changing to become a "random label for the log
reader".

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
dc72bce6d7 sstable: Reorganize new_sstable_component_file()
The helper consists of three stages:

1. open a file (probably in a temp dir)
2. decorate it with extensions and checked_file
3. optionally rename a file from temp dir

The latter is done to trigger XFS allocate this file in separate block
group if the file was created in temp dir on step 1.

This patch swaps steps 2 and 3 to keep filesystem-specific opening next
to each other.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
e55c740f49 sstable: Mark filename() private
From now on no callers should use this string to access anything on disk

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
5f579eb405 sstable: Introduce index_filename()
Currently the sstable::filename(Index) is used in several places that
get the filename as a printable or throwable string and don't treat it
as the real location of any file.

For those, add the index_filename() helper symmetrical to toc_filename()
and (in some sense) the get_filename() one.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
bbbbd6dbfc tests: Disclose private filename() calls
The sstable::filename() is going to become a private method. Lots of tests
call it, but tests already call a lot of other sstable private methods,
and that's OK. Make sstable::filename() yet another one of that kind in
advance.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
4a91f3d443 sstable: Move wipe_storage() on storage
Now that the filesystem-cleaning code sits in one method, it can
finally be made a storage-class one.

Exception-safe allocation of toc_name (spoiler: it's copied anyway one
step later, so it's "not that safe" actually) is moved into storage as
well. The caller is left with toc_filename() call in its exception
handler.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
c92d45eaa9 sstable: Remove temp dir in wipe_storage()
When unlinking an sstable for whatever reason, it's good to check if the
temp dir is hanging around. In some cases it's not (compaction), but
keeping the whole wiping code together makes it easier to move it onto
the storage class in one go.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
88ede71320 sstable: Move unlink parts into wipe_storage
Just move the code. This is to make the next patch smaller.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
0336cb3bdd sstable: Remove get_temp_dir()
Only one private caller of it is left; it's better to open-code it there

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
3326063b8b sstable: Move write_toc() to storage
This method initiates the sstable creation. Effectively it's the first
step of the sstable-creation transaction implemented on top of the
rename() call. Thus this method is moved onto storage under a respective name.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
636d49f1c1 sstable: Shuffle open_sstable()
When an sstable is prepared to be written to disk, .write_toc() is
called on it, which creates a temporary toc file. Prior to this, the writer
code calls generate_toc() to collect components on the sstable.

This patch adds the .open_sstable() API call that does both. This
prepares the write_toc() part to be moved to storage, because it's not
just "write data into TOC file", it's the first step in a transaction
implemented on top of rename()s.

The tests need care -- there's the rewrite_toc_without_scylla_component()
helper in utils that doesn't want the generate_toc() part to be called.
It's not patched here and continues calling .write_toc().

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
d3216b10d6 sstable: Move touch_temp_dir() to storage
A continuation of the previously moved remove_temp_dir().

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:49 +03:00
Pavel Emelyanov
1a34cb98fc sstable: Move move() to storage
The sstable can be "moved" in two cases -- to move from staging or to
move to quarantine. Both operations are sstable API ones, but the
implementation is storage-specific. This patch makes the latter a method
of the storage class.

One thing to note is that only quarantine() touched the target directly.
Now the move_to_new_dir() happening on load also does it, but
that's harmless.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:14:47 +03:00
Pavel Emelyanov
18f6165993 sstable: Move create_links() to storage
This method is currently used in two places: sstable::snapshot() and
sstable::seal_sstable(). The latter additionally touches the target
backup/ subdir.

This patch moves the whole thing onto storage and adds the touch for all
the cases. For snapshots this might be excessive, but harmless.

Tests get their private-disclosure way to access sstable._storage in
a few places to call create_links directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
136a8681e0 sstable: Move seal_sstable() to storage
Now the sstable sealing is split into a storage part, an internal-state
part and the seal-with-backup kick.

This move makes remove_temp_dir() private.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
334d231f56 sstable: Tossing internals of seal_sstable()
There are two of them -- one API call and the other one that just
"seals" it. The latter one also changes the _marked_for_deletion bit on
the sstable.

This patch prepares the latter method to be moved onto storage,
because sealing means committing the TOC file on disk with the help of
the rename system call, which is a purely storage thing.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
ce3a8a4109 sstable: Move remove_temp_dir() to storage
This one is simple, it just accesses _temp_dir thing.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
9027d137d2 sstable: Move create_links_common() to storage
Same as previous patch. This move makes the previously moved
check_create_links_replay() a private method of the storage class.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
990032b988 sstable: Move check_create_links_replay() to storage
It needs a const sstable reference to get the filename(s) from it.
Other than that it's a pure filesystem-accessing method.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
041a8c80ad sstable: Remove one of create_links() overloads
There are two -- one that accepts a generation and one that does
not. The latter is only called by the former, so there's no need to keep both.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
f1558b6988 sstable: Remove create_links_and_mark_for_removal()
There's only one user of it; it can document its "and mark for removal"
intention via a dedicated bool_class argument.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
65f40b28e6 sstable: Indentation fix after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
428adda4a9 sstable: Coroutinize create_links_common()
Looks much shorter and easier-to-patch this way.

The dst_dir argument is changed from a const reference to a value;
the old code copied it with do_with() anyway.

Indentation is deliberately left broken until next patch.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
ab13a99586 sstable: Rename create_links_common()'s "dir" argument
The whole method is going to move onto the newly introduced
filesystem_storage, which already has a field of the same name onboard.
To avoid confusion, rename the argument to dst_dir.

No functional changes, _just_ s/dir/dst_dir/g throughout the method.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
4977c73163 sstable: Make mark_for_removal bool_class
Its meaning is comment-documented anyway. Also, upcoming patches will
remove create_links_and_mark_for_removal(), so callers need a verbose
spelling of this boolean in advance.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:45 +03:00
Pavel Emelyanov
f53d6804a6 sstable, table: Add sstable::snapshot() and use in table::take_snapshot
The replica/ code now "knows" that snapshotting an sstable means
creating a bunch of hard-links on disk. Abstract that via
sstable::snapshot() method.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:44 +03:00
Pavel Emelyanov
2803dcda6d sstable: Move _dir and _temp_dir on filesystem_storage
Those two fields define the way an sstable is stored as a collection of
on-disk files. The first step towards making storage access abstract is
moving the paths onto the embedded filesystem_storage class.

Both are made public for now, the rest of the code is patched to access
them via _storage.<smth>. The rest of the series moves parts of the
sstable:: methods into filesystem_storage, then marks the paths private.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:44 +03:00
Pavel Emelyanov
17c8ba6034 sstable: Use sync_directory() method
The sstable::write_toc() executes sync_directory() by hand. Better to
use the method directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:44 +03:00
Pavel Emelyanov
e934f42402 test, sstable: Use component_basename in test
One case gets the full sstable datafile path to get the basename from it.
There's already a basename helper on the sstable class.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:44 +03:00
Pavel Emelyanov
376915d406 sstables: Move read_{digest|checksum} on sstable
These methods access sstables as files on disk; in order to hide the
"path on filesystem" meaning of sstables::filename(), they should be
made sstable:: methods.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 10:13:44 +03:00
Pavel Emelyanov
d561495f0d Merge 'topology: get rid of pending state' from Benny Halevy
Now, with a44ca06906, is_normal_token_owner, which replaced is_member,
no longer relies on the pending status
of endpoints in topology.

With that we can get rid of this state and just keep all endpoints we know about in the topology.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12294

* github.com:scylladb/scylladb:
  topology: get rid of pending state
  topology: debug log update and remove endpoint
2022-12-14 19:28:35 +03:00
Benny Halevy
bdb6550305 view: row_locker: add latency_stats_tracker
Refactor the existing stats tracking and updating
code into struct latency_stats_tracker and while at it,
count lock_acquisitions only on success.

Decrement operations_currently_waiting_for_lock in the destructor
so it's always balanced with the unconditional increment
in the ctor.

As for updating estimated_waiting_for_lock, it is always
updated in the dtor, on both success and failure, since
the wait for the lock happened whether it
timed out or not.
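The RAII shape described above can be sketched as follows (illustrative names, not Scylla's actual members): the counter is bumped unconditionally in the constructor, acquisitions are counted only on the explicit success path, and the destructor balances the counter and accounts the wait latency on both outcomes.

```cpp
#include <chrono>

// Illustrative stand-in for the stats the tracker updates.
struct lock_stats {
    long lock_acquisitions = 0;                      // bumped only on success
    long operations_currently_waiting_for_lock = 0;
    std::chrono::microseconds estimated_waiting_for_lock{0};
};

class latency_stats_tracker {
    lock_stats& _stats;
    std::chrono::steady_clock::time_point _start;
public:
    explicit latency_stats_tracker(lock_stats& s)
            : _stats(s), _start(std::chrono::steady_clock::now()) {
        ++_stats.operations_currently_waiting_for_lock;  // unconditional
    }
    void lock_acquired() { ++_stats.lock_acquisitions; } // success path only
    ~latency_stats_tracker() {
        // Balanced with the unconditional increment in the ctor; the wait
        // latency is accounted both on success and on timeout.
        --_stats.operations_currently_waiting_for_lock;
        _stats.estimated_waiting_for_lock +=
            std::chrono::duration_cast<std::chrono::microseconds>(
                std::chrono::steady_clock::now() - _start);
    }
};
```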

Fixes #12190

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12225
2022-12-14 17:37:22 +02:00
Avi Kivity
9ee78975b7 Merge 'Fix topology mismatch on read-repair handler creation' from Pavel Emelyanov
The schedule_repair() receives a bunch of endpoint:mutations pairs and tries to create handlers for those. When creating the handlers it re-obtains topology from schema->ks->effective_replication_map chain, but this new topology can be outdated as compared to the list of endpoints at hand.

The fix is to carry the e.r.m. pointer used by read executor reconciliation all the way down to repair handlers creation. This requires some manipulations with mutate_internal() and mutate_prepare() argument lists.

fixes: #12050 (it was the same problem)

Closes #12256

* github.com:scylladb/scylladb:
  proxy: Carry replication map with repair mutation(s)
  proxy: Wrap read repair entries into read_repair_mutation
  proxy: Turn ref to forwardable ref in mutations iterator
2022-12-14 17:33:43 +02:00
Tomasz Grabiec
23e4c83155 position_in_partition: Make after_key() work with non-full keys
This fixes a long-standing bug related to the handling of non-full
clustering keys, issue #1446.

after_key() was creating a position which is after all keys prefixed
by a non-full key, rather than a position which is right after that
key.

This issue will be caught by cql_query_test::test_compact_storage
in debug mode when mutation_partition_v2 merging starts inserting
sentinels at position after_key() on preemption.

It probably already causes problems for such keys.
2022-12-14 14:47:33 +01:00
Botond Dénes
16c50bed5e Merge 'sstables: coroutinize update_info_for_opened_data' from Avi Kivity
A complicated function (in continuation style) that benefits
from this simplification.

Closes #12289

* github.com:scylladb/scylladb:
  sstables: update_info_for_opened_data: reindent
  sstables: update_info_for_opened_data: coroutinize
2022-12-14 15:12:22 +02:00
Nadav Har'El
92d03be37b materialized view: fix bug in some large modifications to base partitions
Sometimes a single modification to a base partition requires updates to
a large number of view rows. A common example is deletion of a base
partition containing many rows. A large BATCH is also possible.

To avoid large allocations, we split the large amount of work into
batch of 100 (max_rows_for_view_updates) rows each. The existing code
assumed an empty result from one of these batches meant that we are
done. But this assumption was incorrect: There are several cases when
a base-table update may not need a view update to be generated (see
can_skip_view_updates()) so if all 100 rows in a batch were skipped,
the view update stopped prematurely. This patch includes two tests
showing when this bug can happen - one test using a partition deletion
with a USING TIMESTAMP causing the deletion to not affect the first
100 rows, and a second test using a specially-crafted large BATCH.
These use cases are fairly esoteric, but in fact hit a user in the
wild, which led to the discovery of this bug.

The fix is fairly simple: to detect when build_some() is done it is no
longer enough to check whether it returned zero view-update rows; rather,
it explicitly returns whether or not it is done as an std::optional.

The patch includes several tests for this bug, which pass on Cassandra,
failed on Scylla before this patch, and pass with this patch.
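The shape of the fix can be sketched with std::optional (illustrative names and types, not Scylla's actual API): each step returns an engaged, possibly empty, batch while work remains, and disengages only when it is genuinely done, so an all-skipped batch no longer terminates the loop early.

```cpp
#include <optional>
#include <vector>

// Hypothetical driver: keeps calling build_some() until it signals "done"
// via std::nullopt, even across batches whose rows were all skipped and
// therefore produced zero view updates.
template <typename BuildSome>
std::vector<int> build_all(BuildSome build_some) {
    std::vector<int> all_updates;
    while (std::optional<std::vector<int>> batch = build_some()) {
        all_updates.insert(all_updates.end(), batch->begin(), batch->end());
    }
    return all_updates;
}
```

The old scheme, which stopped on the first empty result, would have returned nothing for a first batch of 100 skipped rows.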

Fixes #12297.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12305
2022-12-14 14:50:38 +02:00
Botond Dénes
e7d8855675 Merge 'Revert accidental submodule updates' from Benny Halevy
The abseil and tools/java submodules were accidentally updated in
71bc12eecc
(merged to master in 51f867339e)

This series reverts those changes.

Closes #12311

* github.com:scylladb/scylladb:
  Revert accidental update of tools/java submodule
  Revert accidental update of abseil submodule
2022-12-14 13:20:08 +02:00
Benny Halevy
865193f99a Revert accidental update of tools/java submodule
The tools/java submodule was accidentally updated
in 71bc12eecc
Revert this change.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-14 13:06:30 +02:00
Benny Halevy
9911ba195b Revert accidental update of abseil submodule
The abseil module was accidentally updated
in 71bc12eecc
Revert this change.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-14 13:05:04 +02:00
Pavel Emelyanov
ab8fc0e166 proxy: Carry replication map with repair mutation(s)
The create_write_response_handler() for read repair needs the e.r.m.
from the caller, because it effectively accepts list of endpoints from
it.

So this patch equips all read_repair_mutation-s with the e.r.m. pointer
so that the handler creation can use it. It's the same for all
mutations, so it's a waste of space, but it's not bad -- there are
typically few mutations in this range and the entry passed there is
temporary, so even lots of them won't occupy lots of memory for long.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-14 14:03:39 +03:00
Pavel Emelyanov
140f373e15 proxy: Wrap read repair entries into read_repair_mutation
The schedule_repair() operates on a map of endpoint:mutations pairs.
Next patch will need to extend this entry and it's going to be easier if
the entry is wrapped in a helper structure in advance.

This is where the forwardable reference cursor from the previous patch
gets its user. The schedule_repair() produces a range of rvalue
wrappers, but the create_write_response_handler accepting it is OK, it
copies mutations anyway.

The printing operator is added to facilitate mutations logging from
mutate_internal() method.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-14 14:01:12 +03:00
Pavel Emelyanov
014b563ef1 proxy: Turn ref to forwardable ref in mutations iterator
The mutate_prepare() is iterating over a range of mutations with an 'auto&'
cursor, thus accepting only lvalues. This is very restrictive: the caller
of mutate_prepare() may as well provide rvalues if the target
create_write_response_handler() or lambda accepts it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-14 14:00:10 +03:00
Avi Kivity
3fa230fee4 Merge 'cql3: expr: make it possible to prepare and evaluate conjunctions' from Jan Ciołek
This PR implements two things:
* Getting the value of a conjunction of elements separated by `AND` using `expr::evaluate`
* Preparing conjunctions using `prepare_expression`

---

`NULL` is treated as an "unknown value" - maybe `true`, maybe `false`.
`TRUE AND NULL` evaluates to `NULL` because it might be `true` but also might be `false`.
`FALSE AND NULL` evaluates to `FALSE` because no matter what value `NULL` acts as, the result will still be `FALSE`.
Unset and empty values are not allowed.

Usually in CQL the rule is that when `NULL` occurs in an operation the whole expression becomes `NULL`, but here we decided to deviate from this behavior.
Treating `NULL` as an "unknown value" is the standard SQL way of handling `NULLs` in conjunctions.
It works this way in MySQL and Postgres so we do it this way as well.

The evaluation short-circuits. Once `FALSE` is encountered the function returns `FALSE` immediately without evaluating any further elements.
It works this way in Postgres as well, for example:
`SELECT true AND NULL AND 1/0 = 0` will throw a division by zero error,
 but `SELECT false AND 1/0 = 0` will successfully evaluate to `FALSE`.

Closes #12300

* github.com:scylladb/scylladb:
  expr_test: add unit tests for prepare_expression(conjunction)
  cql3: expr: make it possible to prepare conjunctions
  expr_test: add tests for evaluate(conjunction)
  cql3: expr: make it possible to evaluate conjunctions
2022-12-14 09:48:26 +02:00
Botond Dénes
122b267478 Merge 'repair: coroutinize to_repair_rows_list' from Avi Kivity
Simplify a somewhat complicated function.

Closes #12290

* github.com:scylladb/scylladb:
  repair: to_repair_rows_list: reindent
  repair: to_repair_rows_list: coroutinize
2022-12-14 09:39:47 +02:00
Avi Kivity
c09583bcef storage_proxy: coroutinize send_truncate_blocking
Not particularly important, but a small simplification.

Closes #12288
2022-12-14 09:39:33 +02:00
Tomasz Grabiec
132d5d4fa1 messaging: Shutdown on stop() if it wasn't shut down earlier
All rpc::client objects have to be stopped before they are
destroyed. Currently this is done in
messaging_service::shutdown(). The cql_test_env does not call
shutdown() currently. This can lead to use-after-free on the
rpc::client object, manifesting like this:

Segmentation fault on shard 0.
Backtrace:
column_mapping::~column_mapping() at schema.cc:?
db::cql_table_large_data_handler::internal_record_large_cells(sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long) const at ./db/large_data_handler.cc:180
operator() at ./db/large_data_handler.cc:123
 (inlined by) seastar::future<void> std::__invoke_impl<seastar::future<void>, db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long>(std::__invoke_other, db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*&&, column_definition const&, unsigned long&&, unsigned long&&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/invoke.h:61
 (inlined by) std::enable_if<is_invocable_r_v<seastar::future<void>, db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long>, seastar::future<void> >::type std::__invoke_r<seastar::future<void>, db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long>(db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*&&, column_definition const&, unsigned long&&, unsigned long&&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/invoke.h:114
 (inlined by) std::_Function_handler<seastar::future<void> (sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long), db::cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>, utils::updateable_value<unsigned int>)::$_1>::_M_invoke(std::_Any_data const&, sstables::sstable const&, sstables::key const&, clustering_key_prefix const*&&, column_definition const&, unsigned long&&, unsigned long&&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/std_function.h:290
std::function<seastar::future<void> (sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long)>::operator()(sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long) const at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/std_function.h:591
 (inlined by) db::cql_table_large_data_handler::record_large_cells(sstables::sstable const&, sstables::key const&, clustering_key_prefix const*, column_definition const&, unsigned long, unsigned long) const at ./db/large_data_handler.cc:175
seastar::rpc::log_exception(seastar::rpc::connection&, seastar::log_level, char const*, std::__exception_ptr::exception_ptr) at ./build/release/seastar/./seastar/src/rpc/rpc.cc:109
operator() at ./build/release/seastar/./seastar/src/rpc/rpc.cc:788
operator() at ./build/release/seastar/./seastar/include/seastar/core/future.hh:1682
 (inlined by) void seastar::futurize<seastar::future<void> >::satisfy_with_result_of<seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14>(seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&, seastar::future_state<seastar::internal::monostate>&&)#1}::operator()(seastar::internal::promise_base_with_type<void>&&, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&, seastar::future_state<seastar::internal::monostate>&&) const::{lambda()#1}>(seastar::internal::promise_base_with_type<void>&&, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14>(seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&, seastar::future_state<seastar::internal::monostate>&&)#1}::operator()(seastar::internal::promise_base_with_type<void>&&, 
seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&, seastar::future_state<seastar::internal::monostate>&&) const::{lambda()#1}&&) at ./build/release/seastar/./seastar/include/seastar/core/future.hh:2134
 (inlined by) operator() at ./build/release/seastar/./seastar/include/seastar/core/future.hh:1681
 (inlined by) seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14>(seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, seastar::rpc::client::client(seastar::rpc::logger const&, void*, seastar::rpc::client_options, seastar::socket, seastar::socket_address const&, seastar::socket_address const&)::$_14&, seastar::future_state<seastar::internal::monostate>&&)#1}, void>::run_and_dispose() at ./build/release/seastar/./seastar/include/seastar/core/future.hh:781
seastar::reactor::run_tasks(seastar::reactor::task_queue&) at ./build/release/seastar/./seastar/src/core/reactor.cc:2319
 (inlined by) seastar::reactor::run_some_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:2756
seastar::reactor::do_run() at ./build/release/seastar/./seastar/src/core/reactor.cc:2925
seastar::reactor::run() at ./build/release/seastar/./seastar/src/core/reactor.cc:2808
seastar::app_template::run_deprecated(int, char**, std::function<void ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:265
seastar::app_template::run(int, char**, std::function<seastar::future<int> ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:156
operator() at ./build/release/seastar/./seastar/src/testing/test_runner.cc:75
 (inlined by) void std::__invoke_impl<void, seastar::testing::test_runner::start_thread(int, char**)::$_0&>(std::__invoke_other, seastar::testing::test_runner::start_thread(int, char**)::$_0&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/invoke.h:61
 (inlined by) std::enable_if<is_invocable_r_v<void, seastar::testing::test_runner::start_thread(int, char**)::$_0&>, void>::type std::__invoke_r<void, seastar::testing::test_runner::start_thread(int, char**)::$_0&>(seastar::testing::test_runner::start_thread(int, char**)::$_0&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/invoke.h:111
 (inlined by) std::_Function_handler<void (), seastar::testing::test_runner::start_thread(int, char**)::$_0>::_M_invoke(std::_Any_data const&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/std_function.h:290
std::function<void ()>::operator()() const at /usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/bits/std_function.h:591
 (inlined by) seastar::posix_thread::start_routine(void*) at ./build/release/seastar/./seastar/src/core/posix.cc:73

Fix by making sure that shutdown() is called prior to destruction.
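The invariant the fix enforces can be sketched with synchronous, illustrative stand-ins for the asynchronous Seastar originals (names below are hypothetical, not Seastar's API): an object owning background activity must be stopped before destruction, and shutdown falls back to doing it if the owner forgot.

```cpp
struct stop_stats { int stops = 0; };

class connection {                       // stand-in for rpc::client
    stop_stats& _stats;
    bool _stopped = false;
public:
    explicit connection(stop_stats& s) : _stats(s) {}
    void stop() {                        // in Seastar this returns a future<>
        if (!_stopped) {
            _stopped = true;
            ++_stats.stops;
        }
    }
    bool stopped() const { return _stopped; }
};

class messaging_service {
    connection _conn;
    bool _shut_down = false;
public:
    explicit messaging_service(stop_stats& s) : _conn(s) {}
    void shutdown() {
        if (!_shut_down) {
            _conn.stop();
            _shut_down = true;
        }
    }
    // The fix's idea: shut down on destruction if the owner (e.g. a test
    // environment) never called shutdown(), instead of destroying a live
    // connection and risking use-after-free.
    ~messaging_service() { shutdown(); }
};
```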

Fixes #12244

Closes #12276
2022-12-14 10:28:26 +03:00
Tzach Livyatan
7cd613fc08 Docs: Improve wording on the os-supported page v2
Closes #11871
2022-12-14 08:59:26 +02:00
Botond Dénes
31fcfe62e1 Merge 'doc: add the description of AzureSnitch to the documentation' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/11712

Updates added with this PR:
- Added a new section with the description of AzureSnitch (similar to others + examples and language improvements).
- Fixed the headings so that they render properly.
- Replaced "Scylla" with "ScyllaDB".

Closes #12254

* github.com:scylladb/scylladb:
  docs: replace Scylla with ScyllaDB on the Snitches page
  docs: fix the headings on the Snitches page
  doc: add the description of AzureSnitch to the documentation
2022-12-14 08:58:48 +02:00
Lubos Kosco
3f9dca9c60 doc: print out the generated UUID for sending to support
Closes #12176
2022-12-14 08:57:54 +02:00
guy9
a329fcd566 Updated University monitoring lesson link
Closes #11906
2022-12-14 08:50:26 +02:00
Jan Ciolek
9afa9f0e50 expr_test: add unit tests for prepare_expression(conjunction)
Add unit tests which ensure that preparing conjunctions
works as expected.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-12-13 20:23:17 +01:00
Jan Ciolek
dde86a2da6 cql3: expr: make it possible to prepare conjunctions
prepare_expression used to throw an error
when encountering a conjunction.

Now it's possible to use prepare_expression
to prepare an expression that contains
conjunctions.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-12-13 20:23:17 +01:00
Jan Ciolek
5f5b1c4701 expr_test: add tests for evaluate(conjunction)
Add unit tests which ensure that evaluating
a conjunction behaves as expected.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-12-13 20:23:17 +01:00
Jan Ciolek
b3c16f6bc8 cql3: expr: make it possible to evaluate conjunctions
Previously it was impossible to use expr::evaluate()
to get the value of a conjunction of elements
separated by ANDs.

Now it has been implemented.

NULL is treated as an "unknown value" - maybe true, maybe false.
`TRUE AND NULL` evaluates to NULL because it might be true but also might be false.
`FALSE AND NULL` evaluates to FALSE because no matter what value NULL acts as, the result will still be FALSE.
Unset and empty values are not allowed.

Usually in CQL the rule is that when NULL occurs in an operation the whole expression
becomes NULL, but here we decided to deviate from this behavior.
Treating NULL as an "unknown value" is the standard SQL way of handling NULLs in conjunctions.
It works this way in MySQL and Postgres so we do it this way as well.

The evaluation short-circuits. Once FALSE is encountered the function returns FALSE
immediately without evaluating any further elements.
It works this way in Postgres as well, for example:
`SELECT true AND NULL AND 1/0 = 0` will throw a division by zero error
but `SELECT false AND 1/0 = 0` will successfully evaluate to FALSE.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-12-13 20:23:08 +01:00
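The NULL-as-unknown semantics described in the commit above can be sketched as follows (a minimal illustration, not Scylla's actual evaluator; `sql_and` is a hypothetical name):

```cpp
#include <optional>

// Hypothetical sketch of SQL-style three-valued AND, where
// std::nullopt stands for NULL ("unknown": maybe true, maybe false).
inline std::optional<bool> sql_and(std::optional<bool> a, std::optional<bool> b) {
    // FALSE dominates regardless of the other operand; this is what
    // lets the real evaluator short-circuit on the first FALSE.
    if (a == std::optional<bool>(false) || b == std::optional<bool>(false)) {
        return false;
    }
    // No FALSE seen: any NULL makes the whole conjunction unknown.
    if (!a.has_value() || !b.has_value()) {
        return std::nullopt;
    }
    return true; // both operands are TRUE
}
```

Under these rules `TRUE AND NULL` is NULL, while `FALSE AND NULL` is FALSE, matching the behavior the commit describes.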
Benny Halevy
e9e66f3ca7 database: drop_table_on_all_shards: limit truncated_at time
The infinitely high time_point of `db_clock::time_point::max()`
used in ba42852b0e
is too high for some clients that can't represent
that as a date_time string.

Instead, limit it to 9999-12-31T00:00:00+0000,
which is practically sufficient to ensure truncation of
all sstables and should be within the clients' limits.

Fixes #12239

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12273
2022-12-13 16:46:20 +02:00
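The clamping idea from the commit above can be sketched like this (illustrative names and plain epoch arithmetic, not Scylla's actual `db_clock` code):

```cpp
#include <algorithm>
#include <cstdint>

// 9999-12-31T00:00:00+0000 expressed as seconds since the Unix epoch.
// (10000-01-01T00:00:00Z is 253402300800; subtract one day of 86400s.)
constexpr int64_t max_truncated_at_secs = 253402300800 - 86400;

// Hypothetical helper: clamp a truncation timestamp so that clients
// can always render it as a date_time string.
inline int64_t clamp_truncated_at(int64_t secs_since_epoch) {
    return std::min(secs_since_epoch, max_truncated_at_secs);
}
```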
Avi Kivity
919888fe60 Merge 'docs/dev: Add backport instructions for contributors' from Jan Ciołek
Add instructions on how to backport a feature to an older version of Scylla.

It contains detailed step-by-step instructions so that people unfamiliar with the intricacies of Scylla's repository organization can easily get the hang of it.

This is the guide I wish I had when I had to do my first backport.

I put it in backport.md because that looks like the file responsible for this sort of information.
For a moment I thought about `CONTRIBUTING.md`, but this is a really short file with general information, so it doesn't really fit there. Maybe in the future there will be some sort of unification (see #12126)

Closes #12138

* github.com:scylladb/scylladb:
  dev/docs: add additional git pull to backport docs
  docs/dev: add a note about cherry-picking individual commits
  docs/dev: use 'is merged into' instead of 'becomes'
  docs/dev: mention that new backport instructions are for the contributor
  docs/dev: Add backport instructions for contributors
2022-12-13 16:27:04 +02:00
Pavel Emelyanov
fe4cf231bc snitch: Check http response codes to be OK
Several snitch drivers make http requests to get
region/dc/zone/rack/whatever from the cloud provider. They blindly rely
on the response being successful and read the response body to parse the
data they need from it.

That's not nice; add checks that requests finish with HTTP OK statuses.

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12287
2022-12-13 14:49:18 +02:00
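The kind of check being added can be sketched as follows (a hypothetical helper, not the actual snitch code):

```cpp
#include <optional>
#include <string>

// Hypothetical sketch: only hand the body to the parser when the
// cloud-metadata request finished with an HTTP 2xx status.
inline std::optional<std::string> metadata_body_if_ok(int status, std::string body) {
    if (status < 200 || status >= 300) {
        return std::nullopt; // reject non-OK replies instead of parsing garbage
    }
    return body;
}
```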
Benny Halevy
68141d0aac topology: get rid of pending state
Now, with a44ca06906,
is_normal_token_owner, which replaced is_member,
no longer relies on the pending status
of endpoints in topology.

With that we can get rid of this state and just keep
all endpoints we know about in the topology.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-13 14:17:18 +02:00
Benny Halevy
f2753eba30 topology: debug log update and remove endpoint
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-13 14:17:13 +02:00
Avi Kivity
c7cee0da40 Merge 'storage_service: handle_state_normal: always update_topology before update_normal_tokens' from Benny Halevy
update_normal_tokens checks that the endpoint is in topology. Currently we call update_topology on this path only if it's not a normal_token_owner, but there are paths where the endpoint could be a normal token owner but still
be pending in topology, so always update it, just in case.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12080

* github.com:scylladb/scylladb:
  storage_service: handle_state_normal: always update_topology before update_normal_tokens
  storage_service: handle_state_normal: delete outdated comment regarding update pending ranges race
2022-12-13 13:41:10 +02:00
Avi Kivity
75e469193b Merge 'Use Host ID as Raft ID' from Kamil Braun
Thanks to #12250, Host IDs uniquely identify nodes. We can use them as Raft IDs which simplifies the code and makes reasoning about it easier, because Host IDs are always guaranteed to be present (while Raft IDs may be missing during upgrade).

Fixes: https://github.com/scylladb/scylladb/issues/12204

Closes #12275

* github.com:scylladb/scylladb:
  service/raft: raft_group0: take `raft::server_id` parameter in `remove_from_group0`
  gms, service: stop gossiping and storing RAFT_SERVER_ID
  Revert "gms/gossiper: fetch RAFT_SERVER_ID during shadow round"
  service: use HOST_ID instead of RAFT_SERVER_ID during replace
  service/raft: use gossiped HOST_ID instead of RAFT_SERVER_ID to update Raft address map
  main: use Host ID as Raft ID
2022-12-13 13:39:41 +02:00
Andrii Patsula
cd2e786d72 Report a warning when a server's IP cannot be found in ping.
Fixes #12156
Closes #12206
2022-12-13 11:18:59 +01:00
Botond Dénes
51f867339e Merge 'Docs: cleanup add-node-to-cluster' from Benny Halevy
This series improves the add-node-to-cluster document, in particular around the documentation for the associated cleanup procedure, and the prerequisite steps.

It also removes information about outdated releases.

Closes #12210

* github.com:scylladb/scylladb:
  docs: operating-scylla: add-node-to-cluster: deleted instructions for unsupported releases
  docs: operating-scylla: add-node-to-cluster: cleanup: move tips to a note
  docs: operating-scylla: add-node-to-cluster: improve wording of cleanup instructions
  docs: operating-scylla: prerequisites: system_auth is a keyspace, not a table
  docs: operating-scylla: prerequisites: no Authetication status is gathered
  docs: operating-scylla: prerequisites: simplify grep commands
  docs: operating-scylla: add-node-to-cluster: prerequisites: number sub-sections
  docs: operating-scylla: add-node-to-cluster: describe other nodes in plural
2022-12-13 10:54:05 +02:00
Botond Dénes
4122854ae7 Merge 'repair: coroutinize repair_range' from Avi Kivity
Nicer and simpler, but essentially cosmetic.

Closes #12235

* github.com:scylladb/scylladb:
  repair: reindent repair_range
  repair: coroutinize repair_range
2022-12-13 08:16:05 +02:00
Avi Kivity
96890d4120 repair: to_repair_rows_list: reindent 2022-12-12 22:54:07 +02:00
Avi Kivity
e482cb1764 repair: to_repair_rows_list: coroutinize
Simplifying a complicated function. It will also be a
little faster due to fewer allocations, but not significantly.
2022-12-12 22:52:12 +02:00
Avi Kivity
c728de8533 sstables: update_info_for_opened_data: reindent
Recover much-needed indent levels for future use.
2022-12-12 22:38:07 +02:00
Avi Kivity
eace9a226c sstables: update_info_for_opened_data: coroutinize
Nothing special, just simplifying a complicated function.
2022-12-12 22:35:46 +02:00
Michał Jadwiszczak
5985f22841 version: Reverse version increase
Revert version change made by PR #11106, which increased it to `4.0.0`
to enable server-side describe on latest cqlsh.

It turns out that some of our tooling depends on it (e.g. `sstableloader`)
and it breaks dtests.
Reverting only the version allows us to leave the describe code unchanged
and fixes the dtests.

cqlsh 6.0.0 will return a warning when running `DESC ...` commands.

Closes #12272
2022-12-12 18:45:32 +02:00
Kamil Braun
a26f62b37b service/raft: raft_group0: take raft::server_id parameter in remove_from_group0
We no longer need to translate from IP to Raft ID using the address map,
because Raft ID is now equal to the Host ID - which is always available
at the call site of `remove_from_group0`.
2022-12-12 15:23:05 +01:00
Kamil Braun
bf6679906f gms, service: stop gossiping and storing RAFT_SERVER_ID
It is equal to (if present) HOST_ID and no longer used for anything.

The application state was only gossiped if `experimental-features`
contained `raft`, so we can free this slot.

Similarly, `raft_server_id`s were only persisted in `system.peers` if
the `SUPPORTS_RAFT` cluster feature was enabled, which happened only
when `experimental-features` contained `raft`. The `raft_server_id`
field in the schema was also introduced recently in `master` and didn't
get to be in a release yet. Given either of these reasons, we can remove
this field safely.
2022-12-12 15:20:30 +01:00
Kamil Braun
5dbe236339 Revert "gms/gossiper: fetch RAFT_SERVER_ID during shadow round"
This reverts commit 60217d7f50.
We no longer need RAFT_SERVER_ID.
2022-12-12 15:20:20 +01:00
Kamil Braun
3e58da0719 service: use HOST_ID instead of RAFT_SERVER_ID during replace
Makes the code simpler because we can assume that HOST_ID is always
there.
2022-12-12 15:18:56 +01:00
Kamil Braun
32c56920b4 service/raft: use gossiped HOST_ID instead of RAFT_SERVER_ID to update Raft address map
With the earlier commit, if gossiped RAFT_SERVER_ID is not empty then
it's the same as HOST_ID.
2022-12-12 15:16:56 +01:00
Calle Wilund
e99626dc10 config: Change wording of "none" in encryption options to maybe reduce user confusion
Fixes /scylladb/scylla-enterprise/issues#1262

Changes the somewhat ambiguous "none" into "not set" to clarify that "none" is not an
option to be written out, but an absence of a choice (in which case you have also made
a choice).

Closes #12270
2022-12-12 16:14:53 +02:00
Kamil Braun
f3243ff674 main: use Host ID as Raft ID
The Host ID now uniquely identifies a node (we no longer steal it during
node replace) and Raft is still experimental. We can reuse the Host ID
of a node as its Raft ID. This will allow us to remove and simplify a
lot of code.

With this we can already remove some dead code in this commit.
2022-12-12 15:14:51 +01:00
Botond Dénes
d44c5f5548 scripts: add open-coredump.sh
Script for "one-click" opening of coredumps.
It extracts the build-id from the coredump, retrieves metadata for that
build, downloads the binary package, the source code and finally
launches the dbuild container, with everything ready to load the
coredump.
The script is idempotent: running it again after the preparatory steps will
re-use what has already been downloaded.

The script is not trying to provide a debugging environment that caters
to all the different ways and preferences of debugging. Instead, it just
sets up a minimalistic environment for debugging, while providing
opportunities for the user to customize it according to their
preferences.

I'm not entirely sure that coredumps from the master branch will work, but we
can address this later if we confirm they don't.

Example:

    $ ~/ScyllaDB/scylla/worktree0/scripts/open-coredump.sh ./core.scylla.113.bac3650b616f4f09a4d1ab160574b6a5.4349.1669185225000000000000
    Build id: 5009658b834aaf68970135bfc84f964b66ea4dee
    Matching build is scylla-5.0.5 0.20221009.5a97a1060 release-x86_64
    Downloading relocatable package from http://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-5.0/scylla-x86_64-package-5.0.5.0.20221009.5a97a1060.tar.gz
    Extracting package scylla-x86_64-package-5.0.5.0.20221009.5a97a1060.tar.gz
    Cloning scylla.git
    Downloading scylla-gdb.py
    Copying scylla-gdb.py from /home/bdenes/ScyllaDB/storage/11961/open-coredump.sh.dir/scylla.repo
    Launching dbuild container.

    To examine the coredump with gdb:

        $ gdb -x scylla-gdb.py -ex 'set directories /src/scylla' --core ./core.scylla.113.bac3650b616f4f09a4d1ab160574b6a5.4349.1669185225000000000000 /opt/scylladb/libexec/scylla

    See https://github.com/scylladb/scylladb/blob/master/docs/dev/debugging.md for more information on how to debug scylla.

    Good luck!
    [root@fedora workdir]#

Closes #12223
2022-12-12 12:55:28 +02:00
Kamil Braun
dcba652013 Merge 'replacenode: do not inherit host_id' from Benny Halevy
We want to always be able to distinguish between
the replacing node and the replacee by using different,
unique, host identifiers.

This will allow us to use the host_id authoritatively
to identify the node (rather than its endpoint IP address)
for token mapping and node operations.

Also, it will be used in the following patch to never allow the
replaced node to rejoin the cluster, as its host_id should never
be reused.

This change does not affect #5523, the replaced node may still steal back its tokens if restarted.

Refs #9839
Refs #12040

Closes #12250

* github.com:scylladb/scylladb:
  docs: replace-dead-node: update host_id of replacing node
  docs: replace-dead-node: fix alignment
  db: system_keyspace: change set_local_host_id to private set_local_random_host_id
  storage_service: do not inherit the host_id of a replaced node
2022-12-12 11:00:42 +01:00
Benny Halevy
c6f05b30e1 task_manager: task: impl: add virtual destructor
The generic task holds and destroys a task::impl,
but we want the derived class's destructor to be called
when the task is destroyed; otherwise, for example,
members like the abort_source subscription will not be destroyed
(and auto-unlinked).

Fixes #12183

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12266
2022-12-11 22:10:59 +02:00
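The bug class fixed here can be illustrated in miniature (simplified, hypothetical names; the real classes live in the task manager):

```cpp
#include <memory>

// Simplified illustration of the fix: without a virtual destructor on
// the base impl class, deleting a derived impl through a base pointer
// skips the derived destructor, so members such as an abort_source
// subscription would never be destroyed (and auto-unlinked).
struct impl {
    virtual ~impl() = default; // the fix: destruction is polymorphic
};

struct my_task_impl : impl {
    bool* destroyed;
    explicit my_task_impl(bool* d) : destroyed(d) {}
    ~my_task_impl() override { *destroyed = true; }
};

inline bool derived_dtor_runs() {
    bool destroyed = false;
    std::unique_ptr<impl> task = std::make_unique<my_task_impl>(&destroyed);
    task.reset(); // destroy through the base pointer
    return destroyed;
}
```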
Benny Halevy
36a9f62833 repair: repair_module: use mutable capture for func
It is moved into the async thread, so the encapsulating
function should be defined mutable to move the func
rather than copying it.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12267
2022-12-11 22:10:28 +02:00
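The mutable-capture point can be demonstrated with a move-only type (a hypothetical miniature, not the repair_module code):

```cpp
#include <memory>
#include <utility>

// Sketch: a lambda that moves a captured value out must be declared
// mutable; otherwise the capture is const and std::move degenerates to
// a copy - which for a move-only type like unique_ptr doesn't compile.
inline bool capture_moved_out() {
    auto p = std::make_unique<int>(42);
    auto func = [q = std::move(p)]() mutable {
        return std::move(q); // moves rather than copies the capture
    };
    auto out = func();
    return out != nullptr && *out == 42;
}
```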
Nadav Har'El
0c26032e70 test/cql-pytest: translate more Cassandra tests
This patch includes a translation of two more test files from
Cassandra's CQL unit test directory cql3/validation/operations.

All tests included here pass on Cassandra. Several tests fail on Scylla
and are marked "xfail". These failures discovered two previously-unknown
bugs:

    #12243: Setting USING TTL of "null" should be allowed
    #12247: Better error reporting for oversized keys during INSERT

And also added reproducers for two previously-known bugs:

    #3882: Support "ALTER TABLE DROP COMPACT STORAGE"
    #6447: TTL unexpected behavior when setting to 0 on a table with
           default_time_to_live

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12248
2022-12-11 21:42:57 +02:00
Nadav Har'El
09a3c63345 cross-tree: allow std::source_location in clang 14
We recently (commit 6a5d9ff261) started
to use std::source_location instead of std::experimental::source_location.
However, this does not work on clang 14, because libc++ 12's
<source_location> only works if __builtin_source_location is available, and that is
not the case on clang 14.

clang 15 is just three months old, and several relatively-recent
distributions still carry clang 14 so it would be nice to support it
as well.

So this patch adds a trivial compatibility header file which, when
included and compiled with clang 14, aliases the functional
std::experimental::source_location to std::source_location.

It turns out it's enough to include the new header file from three
headers that included <source_location> -  I guess all other uses
of source_location depend on those header files directly or indirectly.
We may later need to include the compatibility header file in additional
places, but for now we don't.

Refs #12259

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12265
2022-12-11 20:28:49 +02:00
Avi Kivity
e6ffc22053 Merge 'cql3: Server-side DESC statement' from Michał Jadwiszczak
This PR adds server-side `DESCRIBE` statement, which is required in latest cqlsh version.

The only change from the user's perspective is that the `DESC ...` statement can be used with cqlsh version >= 6.0. Previously the statement was executed on the client side, but starting with Cassandra 4.0 and cqlsh 6.0, execution of describe was moved to the server side, so the user was unable to do `DESC ...` with Scylla and cqlsh 6.0.

Implemented describe statements:
- `DESC CLUSTER`
- `DESC [FULL] SCHEMA`
- `DESC [ONLY] KEYSPACE`
- `DESC KEYSPACES/TYPES/FUNCTIONS/AGGREGATES/TABLES`
- `DESC TYPE/FUNCTION/AGGREGATE/MATERIALIZED VIEW/INDEX/TABLE`
- `DESC`

[Cassandra's implementation for reference](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java)

Changes in this patch:
- cql3::util: added `single_quote()` function
- added `data_dictionary::keyspace_element` interface
- implemented `data_dictionary::keyspace_element` for:
    - keyspace_metadata,
    - UDT, UDF, UDA
    - schema
- cql3::functions: added `get_user_functions()` and `get_user_aggregates()` to get all UDFs/UDAs in specified keyspace
- data_dictionary::user_types_metadata: added `has_type()` function
- extracted `describe_ring()` from storage_service to standalone helper function in `locator/util.hh`
- storage_proxy: added `describe_ring()` (implemented using helper function mentioned above)
- extended CQL grammar to handle describe statement
- increased version in `version.hh` to 4.0.0, so cqlsh will use server-side describe statement

Referring: https://github.com/scylladb/scylla/issues/9571, https://github.com/scylladb/scylladb/issues/11475

Closes #11106

* github.com:scylladb/scylladb:
  version: Increasing version
  cql-pytest: Add tests for server-side describe statement
  cql-pytest: creating random elements for describe's tests
  cql3: Extend CQL grammar with server-side describe statement
  cql3:statements: server-side describe statement
  data_dictonary: add `get_all_keyspaces()` and `get_user_keyspaces()`
  storage_proxy: add `describe_ring()` method
  storage_service, locator: extract describe_ring()
  data_dictionary:user_types_metadata: add has_type() function
  cql3:functions: `get_user_functions()` and `get_user_aggregates()`
  implement `keyspace_element` interface
  data_dictionary: add `keyspace_element` interface
  cql3: single_quote() util function
  view: row_lock: lock_ck: reindent
  test/topology: enable replace tests
  service/raft: report an error when Raft ID can't be found in `raft_group0::remove_from_group0`
  service: handle replace correctly with Raft enabled
  gms/gossiper: fetch RAFT_SERVER_ID during shadow round
  service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
2022-12-11 18:29:36 +02:00
Michał Jadwiszczak
8d88c9721e version: Increasing version
The `current()` version in version.hh has to be increased to at
least 4.0.0, so server-side describe will be used. Otherwise,
cqlsh returns a warning that client-side describe is not supported.
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
3ddde7c5ad cql-pytest: Add tests for server-side describe statement 2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
f91d05df43 cql-pytest: creating random elements for describe's tests
Add helper functions to create random elements (keyspaces, tables, types)
to increase the coverage of the describe statement's tests.

This commit also adds a `random_seed` fixture. The fixture should
always be used when using random functions. In case of a test failure, the
seed will be present in the test's signature and the case can be easily
recreated.
After the test finishes, the fixture restores the state of `random` to
its before-test state.
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
c563b2133c cql3: Extend CQL grammar with server-side describe statement 2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
e572d5f111 cql3:statements: server-side describe statement
Starting from cqlsh 6.0.0, execution of the describe statement was moved
from the client to the server.

This patch implements server-side describe statement. It's done by
simply fetching all needed keyspace elements (keyspace/table/index/view/UDT/UDF/UDA)
and generating the desired description or list of names of all elements.
The description of any element has to respect CQL restrictions (like
name quoting) to allow quickly recreating the schema by simply copy-pasting the description.
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
673393d88a data_dictonary: add get_all_keyspaces() and get_user_keyspaces()
Adds functions to `data_dictionary::database` in order to obtain names
of all keyspaces/all user keyspaces.
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
360dbf98f1 storage_proxy: add describe_ring() method
In order to execute `DESC CLUSTER`, there has to be a way to describe the
ring. `storage_service` is not available at query execution. This patch
adds `describe_ring()` as a method of `storage_proxy()` (using helper
function from `locator/util.hh`).
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
dd46a92e23 storage_service, locator: extract describe_ring()
`describe_ring()` was implemented as a method of `storage_service`. This
patch extracts it from there to a standalone helper function in
`locator/util.hh`.
2022-12-10 12:51:05 +01:00
Michał Jadwiszczak
51a02e3bd7 data_dictionary:user_types_metadata: add has_type() function
Adds a `has_type()` function to `user_types_metadata`. The function
determines whether a UDT with the given name exists.
2022-12-10 12:50:52 +01:00
Michał Jadwiszczak
06cd03d3cd cql3:functions: get_user_functions() and get_user_aggregates()
Helper functions to obtain UDFs/UDAs for certain keyspace.
2022-12-10 12:36:59 +01:00
Michał Jadwiszczak
29ad5a08a8 implement keyspace_element interface
This patch implements the `data_dictionary::keyspace_element` interface
in: `keyspace_metadata`, `user_type_impl`, `user_function`,
`user_aggregate` and schema.
2022-12-10 12:34:09 +01:00
Michał Jadwiszczak
f30378819d data_dictionary: add keyspace_element interface
A common interface for all keyspace elements, which are:
keyspaces, UDTs, UDFs, UDAs, tables, views, and indexes.
The interface is to have a unified way to describe those elements.
2022-12-10 12:27:38 +01:00
Michał Jadwiszczak
0589116991 cql3: single_quote() util function
`single_quote()` takes a string and transforms it to a string
which can be safely used in CQL commands.
Single quoting involves wrapping the name in single quotes ('). A single-quote
character itself is quoted by doubling it.
Single quoting is necessary for dates, IP addresses or string literals.
2022-12-10 12:27:22 +01:00
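The quoting rule can be sketched as follows (a minimal stand-in for the cql3::util helper, not its actual implementation):

```cpp
#include <string>

// Sketch of CQL single-quoting: wrap the value in single quotes and
// escape any embedded single-quote character by doubling it.
inline std::string single_quote(const std::string& s) {
    std::string out = "'";
    for (char c : s) {
        if (c == '\'') {
            out += "''"; // a quote inside the literal is doubled
        } else {
            out += c;
        }
    }
    out += '\'';
    return out;
}
```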
Benny Halevy
9c2a5a755f view: row_lock: lock_ck: reindent
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-10 12:27:22 +01:00
Kamil Braun
c43e64946a test/topology: enable replace tests
Also add some TODOs for enhancing existing tests.
2022-12-10 12:27:22 +01:00
Kamil Braun
b01cba8206 service/raft: report an error when Raft ID can't be found in raft_group0::remove_from_group0
Also simplify the code and improve logging in general.

The previous code did this: search for the ID in the address map. If it
couldn't be found, perform a read barrier and search again. If it again
couldn't be found, return.

This algorithm depended on the fact that IP addresses were stored in
group 0 configuration. The read barrier was used to obtain the most
recent configuration, and if the IP was not a part of address map after
the read barrier, that meant it's simply not a member of group 0.

This logic no longer applies so we can simplify the code.

Furthermore, when I was fixing the replace operation with Raft enabled,
at some point I had a "working" solution with all tests passing. But I
was suspicious and checked if the replaced node got removed from
group 0. It wasn't. So the replace finished "successfully", but we had
an additional (voting!) member of group 0 which didn't correspond to
a token ring member.

The last version of my fixes ensure that the node gets removed by the
replacing node. But the system is fragile and nothing prevents us from
breaking this again. At least log an error for now. Regression tests
will be added later.
2022-12-10 12:27:22 +01:00
Kamil Braun
c65f4ae875 service: handle replace correctly with Raft enabled
We must place the Raft ID obtained during the shadow round in the
address map. It won't be placed by the regular gossiping route if we're
replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it
is not guaranteed that background gossiping manages to update the
address map before we need it, especially in tests where we set
ring_delay to 0 and disable wait_for_gossip_to_settle. The shadow round,
on the other hand, performs a synchronous request (and if it fails
during bootstrap, bootstrap will fail - because we also won't be able to
obtain the tokens and Host ID of the replaced node).

Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.
2022-12-10 12:27:22 +01:00
Kamil Braun
60217d7f50 gms/gossiper: fetch RAFT_SERVER_ID during shadow round
During the replace operation we need the Raft ID of the replaced node.
The shadow round is used for fetching all necessary information before
the replace operation starts.
2022-12-10 12:27:22 +01:00
Kamil Braun
b424cc40fa service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
Most of the sleeps related to gossiping are based on `ring_delay`,
which is configurable and can be set to lower value e.g. during tests.

But for some reason there was one case where we slept for a hardcoded
value, `service::load_broadcaster::BROADCAST_INTERVAL` - 60 seconds.

Use `2 * get_ring_delay()` instead. With the default value of
`ring_delay` (30 seconds) this will give the same behavior.
2022-12-10 12:27:22 +01:00
Anna Stuchlik
8d1050e834 docs: replace Scylla with ScyllaDB on the Snitches page 2022-12-09 13:34:18 +01:00
Anna Stuchlik
5cb191d5b0 docs: fix the headings on the Snitches page 2022-12-09 13:26:36 +01:00
Anna Stuchlik
a699904374 doc: add the description of AzureSnitch to the documentation 2022-12-09 13:22:01 +01:00
Nadav Har'El
e47794ed98 test/cql-pytest: regression test for index scan with start token
When we have a table with partition key p and an indexed regular column
v, the test included in this patch checks the query

     SELECT p FROM table WHERE v = 1 AND TOKEN(p) > 17

This can work and not require ALLOW FILTERING, because the secondary index
posting-list of "v=1" is ordered in p's token order (to allow SELECT with
and without an index to return the same order - this is explained in
issue #7443). So this test should pass, and indeed it does on both current
Scylla, and Cassandra.

However, it turns out that this was a bug - issue #7043 - in older
versions of Scylla, and only fixed in Scylla 4.6. In older versions,
the SELECT wasn't accepted, claiming it requires ALLOW FILTERING,
and if ALLOW FILTERING was added, the TOKEN(p) > 17 part was silently
ignored.

The fix for issue #7043 actually included regression tests, C++ tests in
test/boost/secondary_index_test.cc. But in this patch we also add a Python
test in test/cql-pytest.

One of the benefits of cql-pytest is that we can (and I did) run the same
test on Cassandra to verify we're not implementing a wrong feature.
Another benefit is that we can run a new test on an old version, and
not even require re-compilation: You can run this new test on any
existing installation of Scylla to check if it still has issue #7043.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12237
2022-12-09 09:33:16 +02:00
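Why no ALLOW FILTERING is needed can be sketched with a token-ordered posting list (hypothetical structures, not Scylla's index internals): because the index entries for v=1 are already sorted by token(p), the `TOKEN(p) > 17` restriction is a range cut rather than a scan-and-filter.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical posting-list entry: for each indexed value, the index
// stores the matching partition keys ordered by their token.
struct posting { int64_t token; int partition_key; };

// "TOKEN(p) > start" on a token-sorted posting list is a binary search
// followed by a linear walk - no per-row filtering is required.
inline std::vector<int> keys_after_token(const std::vector<posting>& plist, int64_t start) {
    auto it = std::upper_bound(plist.begin(), plist.end(), start,
        [](int64_t t, const posting& p) { return t < p.token; });
    std::vector<int> out;
    for (; it != plist.end(); ++it) {
        out.push_back(it->partition_key);
    }
    return out;
}
```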
Benny Halevy
018dedcc0c docs: replace-dead-node: update host_id of replacing node
The replacing node no longer assumes the host_id
of the replacee.  It will continue to use a random,
unique host_id.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-09 08:23:31 +02:00
Benny Halevy
37d75e5a21 docs: replace-dead-node: fix alignment 2022-12-09 08:23:31 +02:00
Benny Halevy
89920d47d6 db: system_keyspace: change set_local_host_id to private set_local_random_host_id
Now that the local host_id is never changed externally
(by the storage_service upon replace-node),
the method can be made private and be used only for initializing the
local host_id to a random one.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-09 08:23:31 +02:00
Benny Halevy
9942c60d93 storage_service: do not inherit the host_id of a replaced node
We want to always be able to distinguish between
the replacing node and the replacee by using different,
unique, host identifiers.

This will allow us to use the host_id authoritatively
to identify the node (rather than its endpoint IP address)
for token mapping and node operations.

Also, it will be used in the following patch to never allow the
replaced node to rejoin the cluster, as its host_id should never
be reused.

Refs #9839
Refs #12040

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-09 08:23:31 +02:00
Pavel Emelyanov
7197757750 broadcast_tables: Forward-declare storage_proxy in lang.hh
Currently the header includes storage_proxy.hh and spreads this over the
code via raft_group0_client.hh -> group0_state_machine.hh -> lang.hh

Forward-declaring the proxy class eliminates ~100 indirect dependencies on
storage_proxy.hh via this chain.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12241
2022-12-09 01:23:51 +02:00
Pavel Emelyanov
6075e01312 test/lib: Remove sstable_utils.hh from simple_schema.hh
The latter is a pretty popular test/lib header that disseminates the
former over a whole lot of unit tests. The former, in turn, naturally
includes sstables.hh, thus making tons of unrelated tests depend on
the sstables class they don't use.

However, simple removal doesn't work, because of the local_shard_only bool
class definition in sstable_utils.hh used in simple_schema.hh. This
thing, in turn, is used in key-making helpers that don't belong to
sstable utils, so these are moved into simple_schema as well.

When done, this affects mutation_source_test.hh, which needs the
local_shard_only bool class (and helps spread sstables.hh
throughout more unrelated tests), and a bunch of .cc test sources that
used sstable_utils.hh to indirectly include various headers they
need.

After patching, sstables.hh touches 2x fewer tests. As a side
effect, sstables_manager.hh is also depended on by 2x fewer
tests.

Continuation of 9bdea110a6

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12240
2022-12-08 15:37:33 +02:00
Tomasz Grabiec
4e7ddb6309 position_in_partition: Introduce before_key(position_in_partition_view) 2022-12-08 13:41:28 +01:00
Tomasz Grabiec
536c0ab194 db: Fix trim_clustering_row_ranges_to() for non-full keys and reverse order
trim_clustering_row_ranges_to() is broken for non-full keys in reverse
mode. It will trim the range to
position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.

Fixes #12180
Refs #1446
2022-12-08 13:41:28 +01:00
Tomasz Grabiec
232ce699ab types: Fix comparison of frozen sets with empty values
A frozen set can be part of the clustering key, and with compact
storage, the corresponding key component can have an empty value.

Comparison was not prepared for this: the iterator attempts to
deserialize the item count and will fail if the value is empty.

Fixes #12242
2022-12-08 13:41:11 +01:00
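The failure mode and the guard can be sketched like this (a simplified serialization format and hypothetical names, not Scylla's exact code):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Simplified sketch: a serialized set is a 32-bit big-endian element
// count followed by 32-bit-length-prefixed elements. Under compact
// storage a clustering-key component holding a frozen set may legally
// be an *empty* value, so guard for that case instead of trying to
// deserialize an element count from zero bytes.
inline std::vector<std::string> read_frozen_set(const std::string& buf) {
    if (buf.empty()) {
        return {}; // empty value: nothing to iterate over
    }
    auto read_be32 = [&](size_t off) -> uint32_t {
        if (off + 4 > buf.size()) {
            throw std::runtime_error("truncated set value");
        }
        return (uint32_t(uint8_t(buf[off])) << 24) | (uint32_t(uint8_t(buf[off + 1])) << 16)
             | (uint32_t(uint8_t(buf[off + 2])) << 8) | uint32_t(uint8_t(buf[off + 3]));
    };
    uint32_t count = read_be32(0);
    size_t off = 4;
    std::vector<std::string> elems;
    for (uint32_t i = 0; i < count; ++i) {
        uint32_t len = read_be32(off);
        off += 4;
        elems.push_back(buf.substr(off, len));
        off += len;
    }
    return elems;
}
```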
Nadav Har'El
4cdaba778d Merge 'Secondary indexes on static columns' from Piotr Dulikowski
This pull request introduces support for global secondary indexes based on static columns.

Local secondary indexes based on static columns are not planned to be supported and are explicitly forbidden. Because there is only one static row per partition and local indexes require the full partition key when querying, such indexes wouldn't be very useful and would only waste resources.

The index table for secondary indexes on static columns, unlike other secondary indexes, does not contain clustering keys from the base table. A static column's value determines a set of full partitions, so the clustering keys would be unnecessary.

The already existing logic for querying using secondary indexes works after introducing minimal notifications. The view update generation path now works on a common representation of static and clustering rows, and the new representation allowed us to keep most of the logic intact.

New cql-pytests are added. All but one of the existing tests for secondary indexes on static columns - ported from Cassandra - now work and have their `xfail` marks lifted; the remaining test requires support for collection indexing, so it will start working only after #2962 is fixed.

Materialized views with static rows as a key are __not__ implemented in this PR.

Fixes: #2963

Closes #11166

* github.com:scylladb/scylladb:
  test_materialized_view: verify that static columns are not allowed
  test_secondary_index: add (currently failing) test for static index paging
  test_secondary_index: add more tests for secondary indexes on static columns
  cassandra_tests: enable existing tests for static columns
  create_index_statement: lift restriction on secondary indexes on static rows
  db/view: fetch and process static rows when building indexes
  gms/feature_service: introduce SECONDARY_INDEXES_ON_STATIC_COLUMNS cluster feature
  create_index_statement: disallow creation of local indexes with static columns
  select_statement: prepare paging for indexes on static columns
  select_statement: do not attempt to fetch clustering columns from secondary index's table
  secondary_index_manager: don't add clustering key columns to index table of static column index
  replica/table: adjust the view read-before-write to return static rows when needed
  db/view: process static rows in view_update_builder::on_results
  db/view: adjust existing view update generation path to use clustering_or_static_row
  column_computation: adjust to use clustering_or_static_row
  db/view: add clustering_or_static_row
  deletable_row: add column_kind parameter to is_live
  view_info: adjust view_column to accept column_kind
  db/view: base_dependent_view_info: split non-pk columns into regular and static
2022-12-08 09:54:05 +02:00
Konstantin Osipov
02c30ab5d6 build: fix link error (abseil) on ubuntu toolchain with clang 15
abseil::hash depends on abseil::city and declares CityHash32
as an external symbol. The city static library, however,
precedes hash in the link list, which apparently makes the linker
simply drop it from the object list, since its symbols are not
used elsewhere.

Fix the linker ordering to help the linker see that CityHash32
is used.

Closes #12231
2022-12-08 09:47:16 +02:00
Avi Kivity
d6457778f1 Merge 'Coroutinize some table functions in preparation to static compaction groups' from Raphael "Raph" Carvalho
Extracted from https://github.com/scylladb/scylladb/pull/12139

Closes #12236

* github.com:scylladb/scylladb:
  replica: table: Fix indentation
  replica: coroutinize table::discard_sstables()
  replica: Coroutinize table::flush()
2022-12-08 09:29:58 +02:00
Piotr Dulikowski
4883e43677 test_materialized_view: verify that static columns are not allowed
Adds a test which verifies that static columns are not allowed in
materialized views. Although we added support for static columns in
secondary indexes, which share a lot of code with materialized views,
static columns in materialized views are not yet ready to use.
2022-12-08 07:41:33 +01:00
Piotr Dulikowski
f864944dcb test_secondary_index: add (currently failing) test for static index paging
Currently, when executing queries accelerated by an index on a static
column, paging is unable to break base table partitions across pages and
is forced to return them in whole. This will cause problems if such a
query must return a very large base table partition because it will have
to be loaded into memory.

Fixing this issue will require a more sophisticated approach than what
was done in the PR. For the time being, an xfailing pytest is added
which should start passing after paging is improved.
2022-12-08 07:41:33 +01:00
Piotr Dulikowski
4f836115fd test_secondary_index: add more tests for secondary indexes on static columns
Adds cql-pytests which test the secondary index on static columns
feature.
2022-12-08 07:41:32 +01:00
Botond Dénes
897b501ba3 Merge 'doc: update the 5.1 upgrade guide with the mode-related information' from Anna Stuchlik
This PR adds the link to the KB article about updating the mode after the upgrade to the 5.1 upgrade guide.
In addition, I have:
- updated the KB article to include the versions affected by that change.
- fixed the broken link to the page about metric updates (it is not related to the KB article, but I fixed it in the same PR to limit the number of PRs that need to be backported).

Related: https://github.com/scylladb/scylladb/pull/11122

Closes #12148

* github.com:scylladb/scylladb:
  doc: update the releases in the KB about updating the mode after upgrade
  doc: fix the broken link in the 5.1 upgrade guide
  doc: add the link to the 5.1-related KB article to the 5.1 upgrade guide
2022-12-08 07:32:10 +02:00
Tomasz Grabiec
992a73a861 row_cache: Destroy coroutine under region's allocator
The reason is an alloc-dealloc mismatch of position_in_partition objects
allocated by cursors inside the coroutine object stored in the update
variable in row_cache::do_update().

It is allocated under the cache region, but in case of an exception it
will be destroyed under the standard allocator. If the update is
successful, it will be cleared under the region allocator, so there is
no problem in the normal case.

Fixes #12068

Closes #12233
2022-12-07 21:44:21 +02:00
Raphael S. Carvalho
9ae0d8ba28 replica: table: Fix indentation
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-07 15:53:22 -03:00
Raphael S. Carvalho
b9a33d5a91 replica: coroutinize table::discard_sstables()
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-07 15:52:36 -03:00
Raphael S. Carvalho
192b64a5ac replica: Coroutinize table::flush()
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-12-07 15:52:27 -03:00
Benny Halevy
a076ceef97 view: row_lock: lock_ck: reindent
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 19:27:30 +02:00
Avi Kivity
909fbfdd2f repair: reindent repair_range 2022-12-07 18:17:21 +02:00
Avi Kivity
796ec5996f repair: coroutinize repair_range 2022-12-07 18:13:10 +02:00
Benny Halevy
78c5961114 docs: operating-scylla: add-node-to-cluster: deleted instructions for unsupported releases
2.3 and 2018.1 reached end of life and are long gone.
There is no need to keep instructions for them in the master version of
this document.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:07:35 +02:00
Benny Halevy
adeb03e60f docs: operating-scylla: add-node-to-cluster: cleanup: move tips to a note
And be more verbose about why the tips are recommended and their
ramifications.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:07:18 +02:00
Benny Halevy
6e324137bd docs: operating-scylla: add-node-to-cluster: improve wording of cleanup instructions
"use `nodetool cleanup` cleanup command" repeats words; change it to
"run the `nodetool cleanup` command".

Also, improve the description of the cleanup action
and how it relates to the bootstrapping process.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:07:08 +02:00
Benny Halevy
eeed330647 docs: operating-scylla: prerequisites: system_auth is a keyspace, not a table
Fix the phrase referring to it as a table accordingly.
Also, do some minor phrasing touch-ups in this area.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:06:54 +02:00
Benny Halevy
5d840d4232 docs: operating-scylla: prerequisites: no Authentication status is gathered
Authentication status isn't gathered from scylla.yaml,
only the authenticator, so change the caption accordingly.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:06:48 +02:00
Benny Halevy
9cb7056d3e docs: operating-scylla: prerequisites: simplify grep commands
Writing `cat X | grep Y` is both inefficient and somewhat
unprofessional.  The grep command works very well on a file argument
so `grep Y X` will do the job perfectly without the need for a pipe.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:06:36 +02:00
Benny Halevy
71bc12eecc docs: operating-scylla: add-node-to-cluster: prerequisites: number sub-sections
To improve their readability.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:06:35 +02:00
Benny Halevy
16db7bea82 docs: operating-scylla: add-node-to-cluster: describe other nodes in plural
Typically data will be streamed from multiple existing nodes
to the new node, not from a single one.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-07 17:03:23 +02:00
Tomasz Grabiec
a46b2e4e4c Merge 'Make node replace procedure work with Raft' from Kamil Braun
We need to obtain the Raft ID of the replaced node during the shadow round and
place it in the address map. It won't be placed by the regular gossiping route
if we're replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it is not
guaranteed that background gossiping manages to update the address map before we
need it, especially in tests where we set ring_delay to 0 and disable
wait_for_gossip_to_settle. The shadow round, on the other hand, performs a
synchronous request (and if it fails during bootstrap, bootstrap will fail -
because we also won't be able to obtain the tokens and Host ID of the replaced
node).

Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.

Also replace the unconditional 60-second sleep in the replace code
with one dependent on ring_delay.

Enable the replace tests.

Modify some code related to removing servers from group 0 which depended on
storing IP addresses in the group 0 configuration.

Closes #12172

* github.com:scylladb/scylladb:
  test/topology: enable replace tests
  service/raft: report an error when Raft ID can't be found in `raft_group0::remove_from_group0`
  service: handle replace correctly with Raft enabled
  gms/gossiper: fetch RAFT_SERVER_ID during shadow round
  service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
2022-12-07 15:30:27 +01:00
Pavel Emelyanov
9bdea110a6 code: Reduce fanout of sstables(_manager)?.hh over headers
This change removes sstables.hh from some other headers, replacing it
with version.hh and shared_sstable.hh. It also drops
sstables_manager.hh from some more headers, because that header
propagates sstables.hh itself. The change is pretty straightforward,
but has a ricochet in database.hh, which needs disk-error-handler.hh.

Without the patch, touching sstables/sstable.hh results in 409 targets
being recompiled; with the patch -- 299 targets.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12222
2022-12-07 14:34:19 +02:00
Botond Dénes
57a4971962 Merge 'dirty_memory_manager: tidy up' from Avi Kivity
Tidy up namespaces, move code to the right file, and
move the whole thing to the replica module where it
belongs.

Closes #12219

* github.com:scylladb/scylladb:
  dirty_memory_manager: move implementation from database.cc
  dirty_memory_manager: move to replica module
  test: dirty_memory_manager_test: disambiguate classes named 'test_region_group'
  dirty_memory_manager: stop using using namespace
2022-12-07 14:25:59 +02:00
Avi Kivity
f7f5700289 dirty_memory_manager: move implementation from database.cc
A few leftover method implementations remained in database.cc
when dirty_memory_manager.cc was created; move them to their
correct place now.
2022-12-06 22:28:54 +02:00
Avi Kivity
444de2831e dirty_memory_manager: move to replica module
It's a replica-side thing, so move it there. The related
flush_permit and sstable_write_permit are moved alongside.
2022-12-06 22:24:17 +02:00
Avi Kivity
a038a35ad6 test: dirty_memory_manager_test: disambiguate classes named 'test_region_group'
There are two similarly named classes: ::test_region_group and
dirty_memory_manager_logalloc::test_region_group. Rename the
former to ::raii_region_group (that's what it's for) and the
latter to ::test_region_group, to reduce confusion.
2022-12-06 22:20:38 +02:00
Avi Kivity
dfdae5ffa9 dirty_memory_manager: stop using using namespace
`using namespace` is pretty bad, especially in a header, as it
pollutes the namespace for everyone. Stop using it and qualify
names instead.
2022-12-06 21:37:38 +02:00
Avi Kivity
47a8fad2a2 Merge 'scylla-types: add serialize action' from Botond Dénes
Serializes a value that is an instance of a type - the opposite of `deserialize` (previously known as `print`).
All other actions operate on serialized values, yet until now we were missing a way to go from human-readable values to serialized ones. This prevented, for example, using `scylla types tokenof $pk` if one only had the human-readable key value.
Example:

```
$ scylla types serialize -t Int32Type -- -1286905132
b34b62d4
$ scylla types serialize --prefix-compound -t TimeUUIDType -t Int32Type -- d0081989-6f6b-11ea-0000-0000001c571b 16
0010d00819896f6b11ea00000000001c571b000400000010
$ scylla types serialize --prefix-compound -t TimeUUIDType -t Int32Type -- d0081989-6f6b-11ea-0000-0000001c571b
0010d00819896f6b11ea00000000001c571b
```

Closes #12029

* github.com:scylladb/scylladb:
  docs: scylla-types.rst: add mention of per-operation --help
  tools/scylla-types: add serialize operation
  tools/scylla-types: prepare for action handlers with string arguments
  tools/scylla-types: s/print/deserialize/ operation
  docs: scylla-types.rst: document tokenof and shardof
  docs: scylla-types.rst: fix typo in compare operation description
2022-12-06 19:27:15 +02:00
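The sample outputs above can be reproduced with a short sketch, assuming Int32Type is a 4-byte big-endian two's-complement value and prefix-compound components carry 2-byte big-endian length prefixes (this mirrors the example output, not the tool's internals):

```python
import struct
import uuid

def serialize_int32(v):
    # Int32Type: 4-byte big-endian two's-complement
    return struct.pack('>i', v)

def serialize_prefix_compound(components):
    # Each component: 2-byte big-endian length prefix, then raw bytes.
    return b''.join(struct.pack('>H', len(c)) + c for c in components)

pk = uuid.UUID('d0081989-6f6b-11ea-0000-0000001c571b').bytes
assert serialize_int32(-1286905132).hex() == 'b34b62d4'
assert serialize_prefix_compound([pk, serialize_int32(16)]).hex() == \
    '0010d00819896f6b11ea00000000001c571b000400000010'
assert serialize_prefix_compound([pk]).hex() == \
    '0010d00819896f6b11ea00000000001c571b'
```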
Nadav Har'El
f275bfd57b Update CODEOWNERS file
Update the CODEOWNERS file with some people who joined different parts
of the project, and one person that left.

Note that despite its name, CODEOWNERS does not list "ownership" in any
strict sense of the word - it is more about who is willing and/or
knowledgeable enough to participate in reviewing changes to particular
files or directories. Github uses this file to automatically suggest
who should review a pull request.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12216
2022-12-06 19:26:03 +02:00
Benny Halevy
5007ded2c1 view: row_lock: lock_ck: serialize partition and row locking
The problematic scenario this patch fixes might happen due to
unfortunate serialization of locks/unlocks between lock_pk and lock_ck,
as follows:

    1. lock_pk acquires an exclusive lock on the partition.
    2.a lock_ck attempts to acquire shared lock on the partition
        and any lock on the row. both cases currently use a fiber
        returning a future<rwlock::holder>.
    2.b since the partition is locked, the lock_partition times out
        returning an exceptional future.  lock_row has no such problem
        and succeeds, returning a future holding a rwlock::holder,
        pointing to the row lock.
    3.a the lock_holder previously returned by lock_pk is destroyed,
        calling `row_locker::unlock`
    3.b row_locker::unlock sees that the partition is not locked
        and erases it, including the row locks it contains.
    4.a when_all_succeeds continuation in lock_ck runs.  Since
        the lock_partition future failed, it destroys both futures.
    4.b the lock_row future is destroyed with the rwlock::holder value.
    4.c ~holder attempts to return the semaphore units to the row rwlock,
        but the latter was already destroyed in 3.b above.

Acquiring the partition lock and row lock in parallel
doesn't help anything, but it complicates error handling
as seen above.

This patch serializes acquiring the row lock in lock_ck
after locking the partition to prevent the above race.

This way, erasing the unlocked partition is never expected
to happen while any of its row locks is held.

Fixes #12168

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12208
2022-12-06 16:29:46 +02:00
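The acquisition order the patch enforces can be sketched with asyncio locks (hypothetical names; the real code uses rwlocks with timeouts, not asyncio):

```python
import asyncio

async def lock_ck(partition_lock, row_lock):
    # Take the partition lock first; only after it succeeds, take the
    # row lock. A failed partition-lock attempt can then never race with
    # a row-lock holder whose underlying lock has already been erased.
    await partition_lock.acquire()
    try:
        await row_lock.acquire()
    except BaseException:
        partition_lock.release()
        raise
    return partition_lock, row_lock

async def demo():
    pl, rl = asyncio.Lock(), asyncio.Lock()
    await lock_ck(pl, rl)
    assert pl.locked() and rl.locked()
    rl.release()
    pl.release()

asyncio.run(demo())
```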
Botond Dénes
f017e9f1c6 docs: document the reader concurrency semaphore diagnostics dump
The diagnostics dumped by the reader concurrency semaphore are a pretty
common sight in logs as soon as a node becomes problematic. The reason
is that the reader concurrency semaphore acts as the canary in the coal
mine: it is the first to start screaming when the node or workload is
unhealthy. This patch adds documentation of the content of the
diagnostics and how to diagnose common problems based on it.

Fixes: #10471

Closes #11970
2022-12-06 16:24:44 +02:00
Botond Dénes
c35cee7e2b docs: scylla-types.rst: add mention of per-operation --help 2022-12-06 14:47:28 +02:00
Botond Dénes
4f9799ce4f tools/scylla-types: add serialize operation
Takes human readable values and converts them to serialized hex encoded
format. Only regular atomic types are supported for now, no
collection/UDT/tuple support, not even in frozen form.
2022-12-06 14:46:53 +02:00
Botond Dénes
7c87655b4b tools/scylla-types: prepare for action handlers with string arguments
Currently all action handlers have bytes arguments, parsed from
hexadecimal string representations. We plan on adding a serialize
command which will require raw string arguments. Prepare the
infrastructure for supporting both types of action handlers.
2022-12-06 14:45:30 +02:00
Botond Dénes
15452730fb tools/scylla-types: s/print/deserialize/ operation
Soon we will have a serialize operation. Rename the current print
operation to deserialize in preparation to that. We want the two
operations (serialize and deserialize) to reflect their relation in
their names too.
2022-12-06 14:45:30 +02:00
Botond Dénes
f98e6552b4 docs: scylla-types.rst: document tokenof and shardof
These new actions were added recently but without the accompanying
documentation change. Make up for this now.
2022-12-06 14:45:30 +02:00
Botond Dénes
30c047cae6 docs: scylla-types.rst: fix typo in compare operation description 2022-12-06 14:45:23 +02:00
Piotr Dulikowski
680423ad9d cassandra_tests: enable existing tests for static columns
Removes the "xfail" marker from the now-passing tests related to
secondary indexes on static columns.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
cc3af3190d create_index_statement: lift restriction on secondary indexes on static rows
Secondary indexes on static columns should work now. This commit lifts
the existing restriction after the cluster is fully upgraded to a
version which supports such indexes.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
86dad30b66 db/view: fetch and process static rows when building indexes
This commit modifies the view builder and its consumer so that static
rows are always fetched and properly processed during view build.

Currently, the view builder will always fetch both static and clustering
rows, regardless of the type of indexes being built. For indexes on
static columns this is wasteful and could be improved so that only the
types of rows relevant to indexes being built are fetched - however,
doing this sounds a bit complicated and I would rather start with
something simpler which has a better chance of working.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
25fec0acce gms/feature_service: introduce SECONDARY_INDEXES_ON_STATIC_COLUMNS cluster feature
The new feature will prevent secondary indexes on static columns from
being created unless the whole cluster is ready to support them.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
9f14f0ac09 create_index_statement: disallow creation of local indexes with static columns
Local indexes on static columns don't make sense because there is only
one static row per partition. It's always better to just run SELECT
DISTINCT on the base table. Allowing for such an index would only make
such queries slower (due to double lookup), would take unnecessary space
and could pose potential consistency problems, so this commit explicitly
forbids them.
2022-12-06 11:21:16 +01:00
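The restriction amounts to a simple validation step; a hedged sketch (names are illustrative, not the actual create_index_statement API):

```python
def validate_index_request(is_local, indexed_column_is_static):
    # With one static row per partition, a local index on a static column
    # degenerates to a slower SELECT DISTINCT, so creation is rejected.
    if is_local and indexed_column_is_static:
        raise ValueError(
            "local secondary indexes on static columns are not supported")

validate_index_request(is_local=False, indexed_column_is_static=True)  # global: allowed
```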
Piotr Dulikowski
8c4cdfc2db select_statement: prepare paging for indexes on static columns
When performing a query on a table which is accelerated by a secondary
index, the paging state returned along with the query contains a
partition key and a clustering key of the secondary index table. The
logic wasn't prepared to handle the case of secondary indexes on static
columns - notably, it tried to put base table's clustering key columns
into the paging state which caused problems in other places.

This commit fixes the paging logic so that the PK and CK of a secondary
index table is calculated correctly. However, this solution has a major
drawback: because it is impossible to encode clustering key of the base
table in the paging state, partitions returned by queries accelerated by
secondary indexes on static columns will _not_ be split by paging. This
can be problematic in case there are large partitions in the base table.

The main advantage of this fix is that it is simple. Moreover, the
problem described above is not unique to static column indexes, but also
happens e.g. in case of some indexes on clustering columns (see case 2
of scylladb/scylla#7432). Fixing this issue will require a more
sophisticated solution and may affect more than only secondary indexes
on static columns, so this is left for a followup.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
ba390072c5 select_statement: do not attempt to fetch clustering columns from secondary index's table
The previous commit made sure that the index table for secondary indexes
on static columns doesn't have columns corresponding to clustering keys in
the base table - therefore, we must make sure that we don't try to fetch
them when querying the index table.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
983b440a81 secondary_index_manager: don't add clustering key columns to index table of static column index
The implementation of secondary indexes on static columns relies on the
fact that the index table only includes partition key columns of the
base table, but not clustering key columns. A static column's value
determines a set of full partitions, so including the clustering key
would only be redundant. It would also generate more work as a single
static column update would require a large portion of the index to be
updated.

This commit makes sure that clustering columns are not included in the
index table for indexes based on a static column.
2022-12-06 11:21:16 +01:00
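The schema difference can be sketched as follows (hypothetical helper over a simplified column model, not the secondary_index_manager API):

```python
def index_table_key_columns(base_pk, base_ck, indexed_column_is_static):
    # A static column's value identifies whole partitions, so the index
    # table for a static-column index keys only on the base table's
    # partition key; other indexes also carry the base clustering key.
    if indexed_column_is_static:
        return list(base_pk)
    return list(base_pk) + list(base_ck)

assert index_table_key_columns(['pk'], ['ck1', 'ck2'], True) == ['pk']
assert index_table_key_columns(['pk'], ['ck1', 'ck2'], False) == ['pk', 'ck1', 'ck2']
```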
Piotr Dulikowski
6ab41d76e6 replica/table: adjust the view read-before-write to return static rows when needed
Adjusts the read-before-write query issued in
`table::do_push_view_replica_updates` so that, when needed, requests
static columns and makes sure that the static row is present.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
18be90b1e6 db/view: process static rows in view_update_builder::on_results
The `view_update_builder::on_results()` function is changed to react to
static rows when comparing read-before-write results with the base table
mutation.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
2dd95d76f1 db/view: adjust existing view update generation path to use clustering_or_static_row
The view update path is modified to use `clustering_or_static_row`
instead of just `clustering_row`.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
b0a31bb7a7 column_computation: adjust to use clustering_or_static_row
Adjusts the column_computation interface so that it is able to accept
both clustering and static rows through the common
db::view::clustering_or_static_row interface.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
986ab6034c db/view: add clustering_or_static_row
Adds `clustering_or_static_row`, a common, immutable
representation of either a static or clustering row. It will allow
handling view update generation based on static or clustering rows in a
uniform way.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
05d4328f02 deletable_row: add column_kind parameter to is_live
While deletable_row is used to hold regular columns of a clustering row,
neither its name nor its implementation suggests that this is a requirement. In
fact, some of its methods already take a column_kind parameter which is
used to interpret the kind of columns held in the row.

This commit removes the assumption about the column kind from the
`deletable_row::is_live` method.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
27c81432cd view_info: adjust view_column to accept column_kind
The `view_info::view_column()` and `view_column` in view.cc allow getting
a view's column definition which corresponds to a given base table
column. They currently assume that the given column id corresponds to a
regular column. In preparation for secondary indexes based on static
columns, this commit adjusts those functions so that they accept other
kinds of columns, including static columns.
2022-12-06 11:21:16 +01:00
Piotr Dulikowski
f7b7724eaf db/view: base_dependent_view_info: split non-pk columns into regular and static
Currently, `base_dependent_view_info::_base_non_pk_columns_in_view_pk`
field keeps a list of non-primary-key columns from the base table which
are a part of the view's primary key. Because the current code does not
allow indexes on static columns yet, the columns kept in the
aforementioned field are always assumed to be regular columns of the
base table and are kept as `column_id`s which do not contain information
about the column kind.

This commit splits the `_base_non_pk_columns_in_view_pk` field into two,
one for regular columns and the other for static columns, so that it is
possible to keep both kinds of columns in `base_dependent_view_info` and
the structure can be used for secondary indexes on static columns.
2022-12-06 11:21:16 +01:00
Botond Dénes
681bd62424 Update tools/java submodule
* tools/java ecab7cf7d6...1c4e1e7a7d (2):
  > Merge "Cqlsh serverless v2" from Karol Baryla
  > Update Java Driver version to 3.11.2.4
2022-12-06 09:06:09 +02:00
Botond Dénes
6a1dbffaaa Merge 'compaction_manager: coroutinize postponed_compactions_reevaluation' from Avi Kivity
Three lambdas were removed, simplifying the code.

Closes #12207

* github.com:scylladb/scylladb:
  compaction_manager: reindent postponed_compactions_reevaluation()
  compaction_manager: coroutinize postponed_compactions_reevaluation()
  compaction_manager: make postponed_compactions_reevaluation() return a future
2022-12-06 08:08:36 +02:00
Avi Kivity
2339a3fa06 database: remove continuation for updating statistics
update_write_metrics() is a continuation added solely for updating
statistics. Fold it into do_update to reduce an allocation in the
write path.

```console
$ ./artifacts/before --write --smp 1  2>&1 | grep insn
189930.77 tps ( 57.2 allocs/op,  13.2 tasks/op,   50994 insns/op,        0 errors)
189954.18 tps ( 57.2 allocs/op,  13.2 tasks/op,   51086 insns/op,        0 errors)
188623.86 tps ( 57.2 allocs/op,  13.2 tasks/op,   51083 insns/op,        0 errors)
190115.01 tps ( 57.2 allocs/op,  13.2 tasks/op,   51092 insns/op,        0 errors)
190173.71 tps ( 57.2 allocs/op,  13.2 tasks/op,   51083 insns/op,        0 errors)
median 189954.18 tps ( 57.2 allocs/op,  13.2 tasks/op,   51086 insns/op,        0 errors)
```

vs

```console
$ ./artifacts/after --write --smp 1  2>&1 | grep insn
190358.38 tps ( 56.2 allocs/op,  12.2 tasks/op,   50754 insns/op,        0 errors)
185222.78 tps ( 56.2 allocs/op,  12.2 tasks/op,   50789 insns/op,        0 errors)
184508.09 tps ( 56.2 allocs/op,  12.2 tasks/op,   50842 insns/op,        0 errors)
142099.47 tps ( 56.2 allocs/op,  12.2 tasks/op,   50825 insns/op,        0 errors)
190447.22 tps ( 56.2 allocs/op,  12.2 tasks/op,   50811 insns/op,        0 errors)
```

One allocation and ~300 instructions per op saved.

update_write_metrics() is still called from other call sites, so it is
not removed.

Closes #12108
2022-12-06 07:04:17 +02:00
Botond Dénes
6daa1e973f Merge 'alternator: fix hangs related to TTL scanning' from Nadav Har'El
The first patch in this small series fixes a hang during shutdown when the expired-item scanning thread can hang in a retry loop instead of quitting.  These hangs were seen in some test runs (issue #12145).

The second patch is a failsafe against additional bugs like those solved by the first patch: if any bug causes the same page fetch to repeatedly time out, stop the attempts after 10 retries instead of retrying forever. When we stop the retries, a warning is printed to the log, and Scylla waits until the next scan period and starts a new scan from scratch - from a random position in the database - instead of hanging potentially forever waiting for the same page.

Closes #12152

* github.com:scylladb/scylladb:
  alternator ttl: in scanning thread, don't retry the same page too many times
  alternator: fix hang during shutdown of expiration-scanning thread
2022-12-06 06:44:22 +02:00
Botond Dénes
c5da96e6f7 Merge 'cql3: batch_statement: coroutinize get_mutations()' from Avi Kivity
As it has a do_with(), coroutinizing it is an automatic win.

Closes #12195

* github.com:scylladb/scylladb:
  cql3: batch_statement: reindent get_mutations()
  cql3: batch_statement: coroutinize get_mutations()
2022-12-06 06:41:44 +02:00
Avi Kivity
d2b1d2f695 compaction_manager: reindent postponed_compactions_reevaluation() 2022-12-05 22:02:27 +02:00
Avi Kivity
1669025736 compaction_manager: coroutinize postponed_compactions_reevaluation()
So much nicer.
2022-12-05 22:01:41 +02:00
Avi Kivity
d2c44cba77 compaction_manager: make postponed_compactions_reevaluation() return a future
postponed_compactions_reevaluation() runs until compaction_manager is
stopped, checking if it needs to launch new compactions.

Make it return a future instead of stashing its completion somewhere.
This makes it easier to convert it to a coroutine.
2022-12-05 21:58:48 +02:00
Avi Kivity
fe4d7fbdf2 Update abseil submodule
* abseil 7f3c0d78...4e5ff155 (125):
  > Add a compilation test for recursive hash map types
  > Add AbslStringify support for enum types in Substitute.
  > Use a c++14-style constexpr initialization if c++14 constexpr is available.
  > Move the vtable into a function to delay instantiation until the function is called. When the variable is a global the compiler is allowed to instantiate it more aggresively and it might happen before the types involved are complete. When it is inside a function the compiler can't instantiate it until after the functions are called.
  > Cosmetic reformatting in a test.
  > Reorder base64 unescape methods to be below the escaping methods.
  > Fixes many compilation issues that come from having no external CI coverage of the accelerated CRC implementation and some differences between the internal and external implementation.
  > Remove static initializer from mutex.h.
  > Import of CCTZ from GitHub.
  > Remove unused iostream include from crc32c.h
  > Fix MSVC builds that reject C-style arrays of size 0
  > Remove deprecated use of absl::ToCrc32c()
  > CRC: Make crc32c_t as a class for explicit control of operators
  > Convert the full parser into constexpr now that Abseil requires C++14, and use this parser for the static checker. This fixes some outstanding bugs where the static checker differed from the dynamic one. Also, fix `%v` to be accepted with POSIX syntax.
  > Write (more) directly into the structured buffer from StringifySink, including for (size_t, char) overload.
  > Avoid using the non-portable type __m128i_u.
  > Reduce flat_hash_{set,map} generated code size.
  > Use ABSL_HAVE_BUILTIN to fix -Wundef __has_builtin warning
  > Add a TODO for the deprecation of absl::aligned_storage_t
  > TSAN: Remove report_atomic_races=0 from CI now that it has been fixed
  > absl: fix Mutex TSan annotations
  > CMake: Remove trailing commas in `AbseilDll.cmake`
  > Fix AMD cpu detection.
  > CRC: Get CPU detection and hardware acceleration working on MSVC x86(_64)
  > Removing trailing period that can confuse a url in str_format.h.
  > Refactor btree iterator generation code into a base class rather than using ifdefs inside btree_iterator.
  > container.h: fix incorrect comments about the location of <numeric> algorithms.
  > Zero encoded_remaining when a string field doesn't fit, so that we don't leave partial data in the buffer (all decoders should ignore it anyway) and to be sure that we don't try to put any subsequent operands in either (there shouldn't be enough space).
  > Improve error messages when comparing btree iterators when generations are enabled.
  > Document the WebSafe* and *WithPadding variants more concisely, as deltas from Base64Encode.
  > Drop outdated comment about LogEntry copyability.
  > Release structured logging.
  > Minor formatting changes in preparation for structured logging...
  > Update absl::make_unique to reflect the C++14 minimum
  > Update Condition to allocate 24 bytes for MSVC platform pointers to methods.
  > Add missing include
  > Refactor "RAW: " prefix formatting into FormatLogPrefix.
  > Minor formatting changes due to internal refactoring
  > Fix typos
  > Add a new API for `extract_and_get_next()` in b-tree that returns both the extracted node and an iterator to the next element in the container.
  > Use AnyInvocable in internal thread_pool
  > Remove absl/time/internal/zoneinfo.inc.  It was used to guarantee availability of a few timezones for "time_test" and "time_benchmark", but (file-based) zoneinfo is now secured via existing Bazel data/env attributes, or new CMake environment settings.
  > Updated documentation on use of %v Also updated documentation around FormatSink and PutPaddedString
  > Use the correct Bazel copts in crc targets
  > Run the //absl/time timezone tests with a data dependency on, and a matching ${TZDIR} setting for, //absl/time/internal/cctz:zoneinfo.
  > Stop unnecessary clearing of fields in ~raw_hash_set.
  > Fix throw_delegate_test when using libc++ with shared libraries
  > CRC: Ensure SupportsArmCRC32PMULL() is defined
  > Improve error messages when comparing btree iterators.
  > Refactor the throw_delegate test into separate test cases
  > Replace std::atomic_flag with std::atomic<bool> to avoid the C++20 deprecation of ATOMIC_FLAG_INIT.
  > Add support for enum types with AbslStringify
  > Release the CRC library
  > Improve error messages when comparing swisstable iterators.
  > Auto increase inlined capacity whenever it does not affect class' size.
  > drop an unused dep
  > Factor out the internal helper AppendTruncated, which is used and redefined in a couple places, plus several more that have yet to be released.
  > Fix some invalid iterator bugs in btree_test.cc for multi{set,map} emplace{_hint} tests.
  > Force a conservative allocation for pointers to methods in Condition objects.
  > Fix a few lint findings in flags' usage.cc
  > Narrow some _MSC_VER checks to not catch clang-cl.
  > Small cleanups in logging test helpers
  > Import of CCTZ from GitHub.
  > Merge pull request abseil/abseil-cpp#1287 from GOGOYAO:patch-1
  > Merge pull request abseil/abseil-cpp#1307 from KindDragon:patch-1
  > Stop disabling some test warnings that have been fixed
  > Support logging of user-defined types that implement `AbslStringify()`
  > Eliminate span_internal::Min in favor of std::min, since Min conflicts with a macro in a third-party library.
  > Fix -Wimplicit-int-conversion.
  > Improve error messages when dereferencing invalid swisstable iterators.
  > Cord: Avoid leaking a node if SetExpectedChecksum() is called on an empty cord twice in a row.
  > Add a warning about extract invalidating iterators (not just the iterator of the element being extracted).
  > CMake: installed artifacts reflect the compiled ABI
  > Import of CCTZ from GitHub.
  > Import of CCTZ from GitHub.
  > Support empty Cords with an expected checksum
  > Move internal details from one source file to another more appropriate source file.
  > Removes `PutPaddedString()` function
  > Return uint8_t from CappedDamerauLevenshteinDistance.
  > Remove the unknown CMAKE_SYSTEM_PROCESSOR warning when configuring ABSL_RANDOM_RANDEN_COPTS
  > Enforce Visual Studio 2017 (MSVC++ 15.0) minimum
  > `absl::InlinedVector::swap` supports non-assignable types.
  > Improve b-tree error messages when dereferencing invalid iterators.
  > Mutex: Fix stall on single-core systems
  > Document Base64Unescape() padding
  > Fix sign conversion warnings in memory_test.cc.
  > Fix a sign conversion warning.
  > Fix a truncation warning on Windows 64-bit.
  > Use btree iterator subtraction instead of std::distance in erase_range() and count().
  > Eliminate use of internal interfaces and make the test portable and expose it to OSS.
  > Fix various warnings for _WIN32.
  > Disables StderrKnobsDefault due to order dependency
  > Implement btree_iterator::operator-, which is faster than std::distance for btree iterators.
  > Merge pull request abseil/abseil-cpp#1298 from rpjohnst:mingw-cmake-build
  > Implement function to calculate Damerau-Levenshtein distance between two strings.
  > Change per_thread_sem_test from size medium to size large.
  > Support stringification of user-defined types in AbslStringify in absl::Substitute.
  > Fix "unsafe narrowing" warnings in absl, 12/12.
  > Revert change to internal 'Rep', this causes issues for gdb
  > Reorganize InlineData into an inner Rep structure.
  > Remove internal `VLOG_xxx` macros
  > Import of CCTZ from GitHub.
  > `absl::InlinedVector` supports move assignment with non-assignable types.
  > Change Cord internal layout, which reduces store-load penalties on ARM
  > Detects accidental multiple invocations of AnyInvocable<R(...)&&>::operator()&& by producing an error in debug mode, and clarifies that the behavior is undefined in the general case.
  > Fix a bug in StrFormat. This issue would have been caught by any compile-time checking but can happen for incorrect formats parsed via ParsedFormat::New. Specifically, if a user were to add length modifiers with 'v', for example the incorrect format string "%hv", the ParsedFormat would incorrectly be allowed.
  > Adds documentation for stringification extension
  > CMake: Remove check_target calls which can be problematic in case of dependency cycle
  > Changes mutex unlock profiling
  > Add static_cast<void*> to the sources for trivial relocations to avoid spurious -Wdynamic-class-memaccess errors in the presence of other compilation errors.
  > Configure ABSL_CACHE_ALIGNED for clang-like and MSVC toolchains.
  > Fix "unsafe narrowing" warnings in absl, 11/n.
  > Eliminate use of internal interfaces
  > Merge pull request abseil/abseil-cpp#1289 from keith:ks/fix-more-clang-deprecated-builtins
  > Merge pull request abseil/abseil-cpp#1285 from jun-sheaf:patch-1
  > Delete LogEntry's copy ctor and assignment operator.
  > Make sinks provided to `AbslStringify()` usable with `absl::Format()`.
  > Cast unused variable to void
  > No changes in OSS.
  > No changes in OSS
  > Replace the kPower10ExponentTable array with a formula.
  > CMake: Mark absl::cord_test_helpers and absl::spy_hash_state PUBLIC
  > Use trivial relocation for transfers in swisstable and b-tree.
  > Merge pull request abseil/abseil-cpp#1284 from t0ny-peng:chore/remove-unused-class-in-variant
  > Removes the legacy spellings of the thread annotation macros/functions by default.
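The first item in the list above -- moving a global with a vtable into a function to delay instantiation -- is the classic function-local-static pattern. A minimal sketch with hypothetical names (this illustrates the technique, not Abseil's actual code):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical registry type; the names here are illustrative only.
struct Registry {
    std::vector<std::string> entries;
    Registry() { entries.push_back("default"); }
};

// A global would be constructed during static initialization, possibly
// before the types it depends on are fully set up in another TU:
// Registry g_registry;  // risky: cross-TU construction order is unspecified

// Wrapping it in a function delays construction until the first call,
// by which point all involved types are complete.
Registry& registry() {
    static Registry instance;  // constructed on first use, thread-safe since C++11
    return instance;
}
```

Every call returns the same instance, so the object is built exactly once and only on demand.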

Closes #12201
2022-12-05 21:07:16 +02:00
Eliran Sinvani
5a5514d052 cql server: Only parallelize relevant cql requests
The cql server uses an execution stage to process and execute queries.
However, an execution stage is best utilized by a recurrent flow that is
invoked repeatedly, since batching better utilizes the instruction cache.
Up until now, every request was sent through the execution stage, but
most request types are not executed repeatedly at high volume.
With this change, only data queries are processed and executed
asynchronously through the execution stage; all other requests are
processed one by one, only continuing once the current request has been
handled end to end.
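The dispatch policy described above can be sketched with a toy, single-threaded model (illustrative names, not Scylla's or Seastar's actual classes): high-volume data queries go through a batching stage, everything else runs inline.

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

// Toy model: requests of a "recurrent" kind are queued and run back to
// back, which keeps the same code hot in the instruction cache -- the
// benefit the commit message refers to. Other requests run immediately.
enum class RequestKind { DataQuery, Auth, Options };

class ExecutionStage {
    std::vector<std::function<void()>> _queue;
public:
    void enqueue(std::function<void()> fn) { _queue.push_back(std::move(fn)); }

    // Runs all queued handlers in one batch; returns how many ran.
    std::size_t flush() {
        std::size_t n = _queue.size();
        for (auto& fn : _queue) fn();
        _queue.clear();
        return n;
    }
};

void dispatch(RequestKind kind, std::function<void()> handler, ExecutionStage& stage) {
    if (kind == RequestKind::DataQuery) {
        stage.enqueue(std::move(handler));  // batched with other data queries
    } else {
        handler();  // processed immediately, end to end
    }
}
```

In the real server the stage is asynchronous; the point here is only the routing decision by request kind.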

Tests:
Unit tests in dev and debug.

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>

Closes #12202
2022-12-05 21:06:58 +02:00
Takuya ASADA
b7851ab1ec docker: fix locale on SSH shell
4ecc08c broke locale settings on the SSH shell, since we dropped "update-locale".
To fix this without installing the locales package, we need to manually specify
LANG=C.UTF-8 in /etc/default/locale.

see https://github.com/scylladb/scylla-cluster-tests/pull/5519

Closes #12197
2022-12-05 20:02:18 +02:00
Avi Kivity
6f2d060d12 Merge 'Make sstable_directory call sstable_manager for sstables' components' from Pavel Emelyanov
This PR achieves two goals of the "object storage" effort:

1. The sstables loader "knows" that sstable components are stored in a Linux directory and uses utils/lister to access it. This is not going to work with sstables over object storage; the loader should be abstracted from the underlying storage.

2. Currently class keyspace and class column_family carry "datadir" and "all_datadirs" on board, which are paths on the local filesystem where sstable files are stored (those usually start with /var/lib/scylla/data). The paths include subdirs like "snapshots", "staging", etc. This is not going to look nice for object storage; the /var/lib/ prefix is excessive and meaningless in this case. Instead, ks and cf should know their "location", and some other component should know the directory in which the files are stored.

That said, this PR prepares distributed_loader and sstable_directory to stop using Linux paths explicitly by making both call sstables_manager to list and open sstable objects. After that it will be possible to teach the manager to list sstables from object storage. This also opens the way to removing paths from the keyspace and column_family classes and replacing them with relative "location"s.

Closes #12128

* github.com:scylladb/scylladb:
  sstable_directory: Get components lister from manager
  sstable_directory: Extract directory lister
  sstable_directory: Remove sstable creation callback
  sstable_directory: Call manager to make sstables
  sstable_directory: Keep error handler generator
  sstable_directory: Keep schema_ptr
  sstable_directory: Use directory semaphore from manager
  sstable_directory: Keep reference on manager
  tests: Use sstables creation helper in some cases
  sstables_manager: Keep directory semaphore reference
  sstables, code: Wrap directory semaphore with concurrency
2022-12-05 18:54:17 +02:00
Gleb Natapov
022a825b33 raft: introduce not_a_member error and return it when non member tries to do add/modify_config
Currently, if a node that is outside of the config tries to add an entry
or modify the config, a transient error is returned and this causes the
node to retry. But the error is not transient: if a node tries to do one
of the operations above, it means it was part of the cluster at some
point, and since a node with the same id should not be added back to a
cluster, if it is not in the cluster now it never will be.

Return a new not_a_member error to the caller instead.
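The distinction above -- a transient error is worth retrying, not_a_member is permanent -- can be sketched as follows (made-up names, not the actual Raft API):

```cpp
#include <cassert>
#include <functional>

// Illustrative retry semantics: retry transient failures, abort
// immediately on the permanent not_a_member error, since a node that is
// no longer in the cluster can never rejoin under the same id.
enum class ModifyResult { Ok, TransientFailure, NotAMember };

// Returns the number of attempts on success, or 0 if we gave up.
int modify_config_with_retry(std::function<ModifyResult()> attempt, int max_retries) {
    for (int tries = 1; tries <= max_retries; ++tries) {
        switch (attempt()) {
            case ModifyResult::Ok:
                return tries;
            case ModifyResult::TransientFailure:
                continue;                  // worth retrying
            case ModifyResult::NotAMember:
                return 0;                  // permanent: never retry
        }
    }
    return 0;
}
```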

Message-Id: <Y42mTOx8bNNrHqpd@scylladb.com>
2022-12-05 17:11:04 +01:00
Benny Halevy
c61083852c storage_service: handle_state_normal: calculate candidates_for_removal when replacing tokens
We currently try to detect a replaced node so as to insert it into
endpoints_to_remove when it has no owned tokens left.
However, for each token we first generate a multimap using
get_endpoint_to_token_map_for_reading().

There are 2 problems with that:

1. unless the replaced node owns a single token, this map will not
   be empty after erasing one token out of it, since the
   token metadata has not changed yet (this is done later with
   update_normal_tokens(owned_tokens, endpoint)).
2. generating this map for each token is inefficient, making this
   algorithm's complexity quadratic in the number of tokens.

This change copies the current token_to_endpoint map
to a temporary map and erases replaced tokens from it,
while maintaining a set of candidates_for_removal.

After traversing all replaced tokens, we check the
`token_to_endpoint_map` again, erasing from `candidates_for_removal`
any endpoint that still owns tokens.
The leftover candidates are endpoints that own no tokens,
and so they are added to `hosts_to_remove`.
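The two-pass algorithm described above can be sketched with simplified types (token = int, endpoint = std::string); this is an illustration, not the actual handle_state_normal code:

```cpp
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

using Token = int;
using Endpoint = std::string;

std::set<Endpoint> find_endpoints_owning_no_tokens(
        std::multimap<Token, Endpoint> token_to_endpoint,  // copied on purpose
        const std::vector<Token>& replaced_tokens,
        const Endpoint& new_owner) {
    std::set<Endpoint> candidates;
    // First pass over the replaced tokens: erase each from the copy and
    // remember its previous owner as a candidate for removal.
    for (Token t : replaced_tokens) {
        auto [lo, hi] = token_to_endpoint.equal_range(t);
        for (auto it = lo; it != hi; ) {
            if (it->second != new_owner) {
                candidates.insert(it->second);
                it = token_to_endpoint.erase(it);
            } else {
                ++it;
            }
        }
    }
    // Second pass over what remains: drop any candidate that still owns
    // at least one token.
    for (const auto& [token, endpoint] : token_to_endpoint) {
        candidates.erase(endpoint);
    }
    return candidates;  // endpoints owning no tokens: safe to remove
}
```

Both passes are linear, avoiding the quadratic rebuild-per-token behavior of the old code.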

Fixes #12082

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12141
2022-12-05 16:17:18 +01:00
Botond Dénes
3d620378d4 Merge 'view: coroutinize maybe_mark_view_as_built' from Avi Kivity
Simplifying it a little.

Closes #12171

* github.com:scylladb/scylladb:
  view: reindent maybe_mark_view_as_built
  view: coroutinize maybe_mark_view_as_built
2022-12-05 13:43:34 +02:00
Kamil Braun
3f8aaeeab9 test/topology: enable replace tests
Also add some TODOs for enhancing existing tests.
2022-12-05 11:50:07 +01:00
Kamil Braun
ee19411783 service/raft: report an error when Raft ID can't be found in raft_group0::remove_from_group0
Also simplify the code and improve logging in general.

The previous code did this: search for the ID in the address map. If it
couldn't be found, perform a read barrier and search again. If it again
couldn't be found, return.

This algorithm depended on the fact that IP addresses were stored in
group 0 configuration. The read barrier was used to obtain the most
recent configuration, and if the IP was not a part of address map after
the read barrier, that meant it's simply not a member of group 0.

This logic no longer applies so we can simplify the code.

Furthermore, when I was fixing the replace operation with Raft enabled,
at some point I had a "working" solution with all tests passing. But I
was suspicious and checked if the replaced node got removed from
group 0. It wasn't. So the replace finished "successfully", but we had
an additional (voting!) member of group 0 which didn't correspond to
a token ring member.

The last version of my fixes ensures that the node gets removed by the
replacing node. But the system is fragile and nothing prevents us from
breaking this again. At least log an error for now. Regression tests
will be added later.
2022-12-05 11:50:07 +01:00
Kamil Braun
4429885543 service: handle replace correctly with Raft enabled
We must place the Raft ID obtained during the shadow round in the
address map. It won't be placed by the regular gossiping route if we're
replacing using the same IP, because we override the application state
of the replaced node. Even if we replace a node with a different IP, it
is not guaranteed that background gossiping manages to update the
address map before we need it, especially in tests where we set
ring_delay to 0 and disable wait_for_gossip_to_settle. The shadow round,
on the other hand, performs a synchronous request (and if it fails
during bootstrap, bootstrap will fail - because we also won't be able to
obtain the tokens and Host ID of the replaced node).

Fetch the Raft ID of the replaced node in `prepare_replacement_info`,
which runs the shadow round. Return it in `replacement_info`. Then
`join_token_ring` passes it to `setup_group0`, which stores it in the
address map. It does that after `join_group0` so the entry is
non-expiring (the replaced node is a member of group 0). Later in the
replace procedure, we call `remove_from_group0` for the replaced node.
`remove_from_group0` will be able to reverse-translate the IP of the
replaced node to its Raft ID using the address map.
2022-12-05 11:50:07 +01:00
Kamil Braun
45bb5bfb52 gms/gossiper: fetch RAFT_SERVER_ID during shadow round
During the replace operation we need the Raft ID of the replaced node.
The shadow round is used for fetching all necessary information before
the replace operation starts.
2022-12-05 11:50:07 +01:00
Kamil Braun
7222c2f9a1 service: storage_service: sleep 2*ring_delay instead of BROADCAST_INTERVAL before replace
Most of the sleeps related to gossiping are based on `ring_delay`,
which is configurable and can be set to lower value e.g. during tests.

But for some reason there was one case where we slept for a hardcoded
value, `service::load_broadcaster::BROADCAST_INTERVAL` - 60 seconds.

Use `2 * get_ring_delay()` instead. With the default value of
`ring_delay` (30 seconds) this will give the same behavior.
2022-12-05 11:50:07 +01:00
Pavel Emelyanov
b5ede873f2 sstable_directory: Get components lister from manager
For now this is almost a no-op because the manager just calls the
sstable_directory code back to create the lister.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
3f9b8c855d sstable_directory: Extract directory lister
Currently the utils/lister.cc code is used to list regular files in a
directory. This patch wraps the lister into a more abstract components
lister class.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
abd3602b10 sstable_directory: Remove sstable creation callback
It's no longer used.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
3d559391df sstable_directory: Call manager to make sstables
Now the directory code has everything it needs to create sstable objects
and can stop using the external lambda.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
db657a8d1c sstable_directory: Keep error handler generator
Yet another continuation of the previous patch -- the IO error handler
generator is also needed to create sstables.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
4281f4af42 sstable_directory: Keep schema_ptr
Continuation of the patch before the previous one. In order to create
sstables without an external lambda, the directory code needs the schema.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
8df1bcb907 sstable_directory: Use directory semaphore from manager
After the previous patch, the sstable_directory code no longer requires a
semaphore argument, because it can get one from the manager. This makes
the directory API shorter and simpler.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
4da941e159 sstable_directory: Keep reference on manager
The sstable_directory code accesses /var/lib/scylla/data in two ways -- it
lists files in it and opens sstables. The latter is abstracted with the
help of lambdas passed around, but the former (listing) is done by using
directory listers from utils.

Listing sstable components with a directory lister won't work for object
storage; the directory code will need to call some abstraction layer
instead. Opening sstables with the help of a lambda is a bit of
overkill; having the sstables manager at hand could make it much simpler.

That said, this patch makes sstable_directory reference sstables_manager
on start.

This change will also simplify directory semaphore usage (next patch).

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
784d78810a tests: Use sstables creation helper in some cases
Several test cases push an sstables creation lambda into the
with_sstables_directory helper. There's a ready-to-use helper class that
does the same. The next patch will make additional use of it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:19 +03:00
Pavel Emelyanov
5e13ce2619 sstables_manager: Keep directory semaphore reference
Preparatory patch. The semaphore will be used by sstable_directory in the
next patches.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 12:03:18 +03:00
Pavel Emelyanov
be8512d7cc sstables, code: Wrap directory semaphore with concurrency
Currently this is a sharded<semaphore> started/stopped in main and
referenced by the database in order to be fed into the sstables code.
This semaphore always comes with a "concurrency" parameter that limits
the parallel_for_each parallelism.

This patch wraps both together into a directory_semaphore class. This
makes its usage simpler and will allow extending it in the future.
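The bundling idea can be sketched synchronously (illustrative names; the real code uses a Seastar semaphore and parallel_for_each): the wrapper carries the concurrency limit and exposes a bounded-parallelism iteration helper, so callers pass one object instead of a semaphore plus a separate constant.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

class directory_semaphore_sketch {
    std::size_t _concurrency;
public:
    explicit directory_semaphore_sketch(std::size_t concurrency)
        : _concurrency(concurrency) {}

    // Processes items in chunks of at most _concurrency -- a stand-in for
    // capping parallel_for_each parallelism in the real, asynchronous code.
    // Returns the largest chunk actually used.
    template <typename T, typename Func>
    std::size_t for_each_limited(const std::vector<T>& items, Func func) {
        std::size_t max_in_flight = 0;
        for (std::size_t i = 0; i < items.size(); i += _concurrency) {
            std::size_t chunk = std::min(_concurrency, items.size() - i);
            max_in_flight = std::max(max_in_flight, chunk);
            for (std::size_t j = 0; j < chunk; ++j) {
                func(items[i + j]);
            }
        }
        return max_in_flight;  // never exceeds the configured concurrency
    }
};
```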

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-05 11:59:30 +03:00
Asias He
c6087cf3a0 repair: Reduce repair reader eviction with diff shard count
When repair master and followers have different shard count, the repair
followers need to create multi-shard readers. Each multi-shard reader
will create one local reader on each shard, N (smp::count) local readers
in total.

There is a hard limit on the number of readers that can work in parallel.
When there are more readers than this limit, the readers start to
evict each other, causing buffers already read from disk to be dropped
and readers to be recreated, which is not very efficient.

To optimize and reduce reader eviction overhead, a global reader permit
is introduced which accounts for the extra local readers that
multi-shard readers create.

With this patch, at any point in time, the number of readers created by
repair will not exceed the reader limit.
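The accounting idea can be sketched as follows (illustrative model only; the real code uses Seastar semaphores): a multi-shard reader fans out into one local reader per shard, so a global permit charges it that many units, keeping the total number of local readers under the limit.

```cpp
#include <cassert>

class global_reader_permit {
    int _limit;
    int _in_use = 0;
public:
    explicit global_reader_permit(int limit) : _limit(limit) {}

    // Returns true if a reader that will create `shards` local readers
    // (one per shard it spans) may start now.
    bool try_acquire(int shards) {
        if (_in_use + shards > _limit) return false;
        _in_use += shards;
        return true;
    }
    void release(int shards) { _in_use -= shards; }
    int in_use() const { return _in_use; }
};
```

Charging per local reader, rather than per top-level reader, is what prevents the eviction storm described above.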

Test Results:

1) with stream sem 10, repair global sem 10, 5 ranges in parallel, n1=2
shards, n2=8 shards, memory wanted =1

1.1)
[asias@hjpc2 mycluster]$ time nodetool -p 7200 repair ks2  (repair on n2)
[2022-11-23 17:45:24,770] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:45:53,869] Repair session 1
[2022-11-23 17:45:53,869] Repair session 1 finished

real    0m30.212s
user    0m1.680s
sys     0m0.222s

1.2)
[asias@hjpc2 mycluster]$ time nodetool  repair ks2  (repair on n1)
[2022-11-23 17:46:07,507] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:46:30,608] Repair session 1
[2022-11-23 17:46:30,608] Repair session 1 finished

real    0m24.241s
user    0m1.731s
sys     0m0.213s

2) with stream sem 10, repair global sem no_limit, 5 ranges in
parallel, n1=2 shards, n2=8 shards, memory wanted =1

2.1)
[asias@hjpc2 mycluster]$ time nodetool -p 7200 repair ks2 (repair on n2)
[2022-11-23 17:49:49,301] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:52:01,414] Repair session 1
[2022-11-23 17:52:01,415] Repair session 1 finished

real    2m13.227s
user    0m1.752s
sys     0m0.218s

2.2)
[asias@hjpc2 mycluster]$ time nodetool  repair ks2 (repair on n1)
[2022-11-23 17:52:19,280] Starting repair command #1, repairing 1
ranges for keyspace ks2 (parallelism=SEQUENTIAL, full=true)
[2022-11-23 17:52:42,387] Repair session 1
[2022-11-23 17:52:42,387] Repair session 1 finished

real    0m24.196s
user    0m1.689s
sys     0m0.184s

Comparing 1.1) and 2.1) shows that eviction played a major role here.
The patch gives a 133s / 30s ≈ 4.4X speed up in this setup.

Comparing 1.1 and 1.2 shows that even if we limit the readers, starting
on the node with the lower shard count is faster, 30s / 24s = 1.25X (the
total number of multi-shard readers is lower).

Fixes #12157

Closes #12158
2022-12-05 10:47:36 +02:00
Botond Dénes
1e20095547 Update tools/java submodule
* tools/java 1c06006447...ecab7cf7d6 (1):
  > Add VSCode files to gitignore
2022-12-05 09:54:51 +02:00
Botond Dénes
c4d72c8dd0 Merge 'cql3: select_statement: split and coroutinize process_results()' from Avi Kivity
Split the simple (and common) case from the complex case,
and coroutinize the latter. Hopefully this generates better
code for the simple case, and it makes the complex case a
little nicer.

Closes #12194

* github.com:scylladb/scylladb:
  cql3: select_statement: reindent process_results_complex()
  cql3: select_statement: coroutinize process_results_complex()
  cql3: select_statement: split process_results() into fast path and complex path
2022-12-05 08:16:22 +02:00
Avi Kivity
a0a4711b74 snapshot: protect list operations against the lambda coroutine fiasco
run_snapshot_list_operation() takes a continuation, so passing it
a lambda coroutine without protection is dangerous.

Protect the coroutine with coroutine::lambda so it doesn't lose its
contents.

Fixes #12192.

Closes #12193
2022-12-05 08:14:39 +02:00
guy9
cb842b2729 Replacing the Docs top bar message from the LIVE event to the community forum announcement
Closes #12189
2022-12-05 08:05:04 +02:00
Avi Kivity
6326be5796 cql3: batch_statement: reindent get_mutations() 2022-12-04 21:47:22 +02:00
Avi Kivity
2d74360de3 cql3: batch_statement: coroutinize get_mutations()
It has a do_with(), so an automatic win.
2022-12-04 21:45:10 +02:00
Avi Kivity
0834bb0365 cql3: select_statement: reindent process_results_complex() 2022-12-04 21:36:17 +02:00
Avi Kivity
a63f98e3fc cql3: select_statement: coroutinize process_results_complex()
Not a huge gain, since it's just a do_with, but still a little better.

Note the inner lambda is not a coroutine, so it isn't susceptible to
the lambda coroutine fiasco.
2022-12-04 21:34:51 +02:00
Avi Kivity
7f29efa0ad cql3: select_statement: split process_results() into fast path and complex path
This will allow us to coroutinize the complex path without adding an
allocation to the fast path.
2022-12-04 21:30:45 +02:00
Avi Kivity
02b66bb31a Merge 'Mark sstable::<directory accessing methods> private' from Pavel Emelyanov
One of the prerequisites to make sstables reside on object storage is not to let the rest of the code "know" the filesystem path they are located on (because sometimes they will not be on any filesystem path). This patch makes the methods that can reveal this path private again, so that later they can be abstracted away.

Closes #12182

* github.com:scylladb/scylladb:
  sstable: Mark some methods private
  test: Don't get sstable dir when known
  test: Use move_to_quarantine() helper
  test: Use sstable::filename() overload without dir name
  sstables: Reimplement batch directory sync after move
  table, tests: Make use of move_to_new_dir() default arg
  sstables: Remove fsync_directory() helper
  table: Simplify take_snapshot()'s collecting sstables names
2022-12-04 17:45:37 +02:00
Kamil Braun
b551cd254c test: test_raft_upgrade: fix test_recover_stuck_raft_upgrade flakiness
The test enables an error injection inside the Raft upgrade procedure
on one of the nodes which will cause the node to throw an exception
before entering `synchronize` state. Then it restarts other nodes with
Raft enabled, waits until they enter `synchronize` state, puts them in
RECOVERY mode, removes the error-injected node and creates a new Raft
group 0.

As soon as the other nodes enter `synchronize`, the test disabled the
error injection (the rest of the test was outside the `async with
inject_error(...)` block). There was a small chance that we disabled the
error injection before the node reached it. In that case the node also
entered `synchronize` and the cluster managed to finish the upgrade
procedure. We encountered this during next promotion.

Eliminate this possibility by extending the scope of the `async with
inject_error(...)` block, so that the RECOVERY mode steps on the other
nodes are performed within that block.

Closes #12162
2022-12-02 21:26:44 +01:00
Avi Kivity
94f18b5580 test: sstable_conforms_to_mutation_source: use do_with_async() where needed
The test clearly needs a thread (it converts a reader to a mutation
without waiting), so give it one.

Closes #12178
2022-12-02 20:48:37 +01:00
Pavel Emelyanov
084522d9eb sstable: Mark some methods private
There are several class sstable methods that reveal the internal
directory path to the caller. This is not object-storage-friendly.
Fortunately, all the callers of those methods have been patched not to
work with full paths, so these can be marked private.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:15:02 +03:00
Pavel Emelyanov
fb63850f2c test: Don't get sstable dir when known
The sstable_move_test creates sstables in its own temp directories and
then requests these dirs' paths back from the sstables. The test can use
the paths it already has at hand; there's no need to ask the sstables
for them.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:13:58 +03:00
Pavel Emelyanov
4c742a658d test: Use move_to_quarantine() helper
Two places in tests move sstable to quarantine subdir by hand. There's
the class sstable method that does the same, so use it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:13:19 +03:00
Pavel Emelyanov
d6244b7408 test: Use sstable::filename() overload without dir name
The dir this place currently uses is the directory where the sstable was
created, so dropping this argument would just render the same path.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:12:21 +03:00
Pavel Emelyanov
a702affd4d sstables: Reimplement batch directory sync after move
There's a table::move_sstables_from_staging() method that gets a bunch
of sstables and moves them from the staging subdir into the table's root
datadir. To avoid flushing the root dir for every sstable move, it asks
sstable::move_to_new_dir() not to flush, but collects the staging dir
names and flushes them and the root dir altogether at the end.

In order to make this more friendly to object storage and to remove one
more caller of sstable::get_dir(), the delayed_commit_changes struct is
introduced. It collects _all_ the affected dir names in an
unordered_set, then allows flushing them. By default move_to_new_dir()
doesn't receive this object and flushes the directories instantly.
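The batching idea can be sketched as follows (illustrative names, not the actual delayed_commit_changes): collect every affected directory in a set, deduplicating repeats, and sync each one exactly once at the end instead of after every individual move.

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <unordered_set>

struct delayed_commit_changes_sketch {
    std::unordered_set<std::string> dirs;

    // Record both directories touched by one sstable move.
    void remember(const std::string& old_dir, const std::string& new_dir) {
        dirs.insert(old_dir);
        dirs.insert(new_dir);
    }

    // Returns the number of directory syncs performed; with N moves
    // between the same staging dir and datadir this is 2, not 2 * N.
    std::size_t commit() {
        std::size_t syncs = 0;
        for (const auto& d : dirs) {
            (void)d;  // a real implementation would sync_directory(d) here
            ++syncs;
        }
        dirs.clear();
        return syncs;
    }
};
```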

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:08:47 +03:00
Pavel Emelyanov
1b42d5fce3 table, tests: Make use of move_to_new_dir() default arg
The method in question accepts a boolean flag indicating whether or not
it should sync directories at the end. It's always true except in one
case, so there's a default value for it. Make use of it.

Anticipating the suggestion to replace bool with bool_class -- next
patch will replace it with something else.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:07:16 +03:00
Pavel Emelyanov
339feb4205 sstables: Remove fsync_directory() helper
This helper effectively wraps the existing seastar sync_directory()
helper in two io_check-s. It's simpler to just call the latter directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:05:43 +03:00
Pavel Emelyanov
80f5d7393f table: Simplify take_snapshot()'s collecting sstables names
The method in question "snapshots" all sstables it can find, then writes
their Datafile names into the manifest file. To get the list of file
names it iterates over the sstables list again and does a clumsy
conversion of each full file path to a file name with the help of the
directory path length.

This can all be made much simpler by collecting the component names
directly at the time the sstable is hardlinked.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-02 21:02:37 +03:00
Raphael S. Carvalho
d61b4f9dfb compaction_manager: Delete compaction_state's move constructor
compaction_state shouldn't be moved once emplaced. Moving it could
theoretically cause a task's gate holder to have a dangling pointer to
compaction_state's gate, but it turns out gate's move ctor will actually
fail under this assertion:
assert(!_count && "gate reassigned with outstanding requests");

This cannot happen today, but let's make it more future proof.
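The future-proofing pattern is simply deleting the move operations of a type that must keep a stable address once emplaced, so an accidental move becomes a compile error instead of a latent dangling-pointer bug. A sketch with a stand-in type (not the real Scylla class):

```cpp
#include <cassert>
#include <type_traits>

struct compaction_state_sketch {
    int gate_count = 0;  // stands in for the gate that running tasks point at
    compaction_state_sketch() = default;
    // Deleting the move operations also suppresses copying.
    compaction_state_sketch(compaction_state_sketch&&) = delete;
    compaction_state_sketch& operator=(compaction_state_sketch&&) = delete;
};

static_assert(!std::is_move_constructible_v<compaction_state_sketch>,
              "moving would invalidate pointers held by running tasks");
static_assert(!std::is_copy_constructible_v<compaction_state_sketch>,
              "copying is suppressed as well");
```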

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12167
2022-12-02 20:56:57 +03:00
Tomasz Grabiec
1a6bf2e9ca Merge 'service/raft: specialized verb for failure detector pinger' from Kamil Braun
We used GOSSIP_ECHO verb to perform failure detection. Now we use
a special verb DIRECT_FD_PING introduced for this purpose.

There are multiple reasons to do so.

One minor reason: we want to use the same connection as other Raft
verbs: if we can't deliver Raft append_entries or vote messages
somewhere, that endpoint should be marked dead; if we can, the
endpoint should be marked alive. So putting pings on the same
connection as the other Raft verbs is important when dealing with
weird situations where some connections are available but others are
not. Observe that in `do_get_rpc_client_idx`, we put the new verb in
the right place.

Another minor reason: we remove the awkward gossiper `echo_pinger`
abstraction which required storing and updating gossiper generation
numbers. This also removes one dependency from Raft service code to
gossiper.

Major reason 1: the gossip echo handler has a weird mechanism where a
replacing node returns errors during the replace operation to some of
the nodes. In Raft however, we want to mark servers as alive when they
are alive, including a server running on a node that's replacing
another node.

Major reason 2, related to the previous one: when server B is
replacing server A with the same IP, the failure detector will try to
ping both servers. Both servers are mapped to the same IP by the
address map, so pings to both servers will reach server B. We want
server B to respond to the pings destined for server B, but not to
pings destined for server A, so the sender can mark B alive but keep A
marked dead.

To do this, we include the destination's Raft ID in our RPCs. The
destination compares the received ID with its own. If it's different,
it returns a `wrong_destination` response, and the failure detector
knows that the ping did not reach the destination (it reached someone
else).

Yet another reason: removes "Not ready to respond gossip echo
message" log spam during replace.

Closes #12107

* github.com:scylladb/scylladb:
  service/raft: specialized verb for failure detector pinger
  db: system_keyspace: de-staticize `{get,set}_raft_server_id`
  service/raft: make this node's Raft ID available early in group registry
2022-12-02 13:54:02 +01:00
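The destination-ID check described in the message above can be illustrated with a minimal sketch (conceptual Python; `wrong_destination` follows the message, everything else is illustrative):

```python
def handle_direct_fd_ping(my_raft_id, ping_dst_raft_id):
    # Each ping carries the Raft ID it is destined for. A node B that is
    # replacing node A at the same IP answers pings destined for its own
    # ID, but reports wrong_destination for A's ID, so the sender can
    # mark B alive while keeping A marked dead.
    if ping_dst_raft_id != my_raft_id:
        return "wrong_destination"
    return "alive"
```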
Pavel Emelyanov
71179ff5ab distributed_loader: Use coroutine::lambda in sleeping coroutine
According to seastar/doc/lambda-coroutine-fiasco.md, a lambda that
co_awaits loses its capture frame after the first suspension. The
distributed_loader code has at least one lambda of that kind.

fixes: #12175

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12170
2022-12-02 13:06:33 +02:00
Pavel Emelyanov
1d91914166 sstables: Drop set_generation() method
The method became unused since 70e5252a (table: no longer accept online
loading of SSTable files in the main directory) and the whole concept of
reshuffling sstables was dropped later by 7351db7c (Reshape upload files
and reshard+reshape at boot).

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12165
2022-12-01 22:17:10 +02:00
Avi Kivity
2978052113 view: reindent maybe_mark_view_as_built
Several indentation levels were harmed during the preparation
of this patch.
2022-12-01 22:09:21 +02:00
Avi Kivity
ac2e2f8883 view: coroutinize maybe_mark_view_as_built
Somewhat simplifies complicated logic.
2022-12-01 22:04:51 +02:00
Kamil Braun
cbdcc944b5 service/raft: specialized verb for failure detector pinger
We used GOSSIP_ECHO verb to perform failure detection. Now we use
a special verb DIRECT_FD_PING introduced for this purpose.

There are multiple reasons to do so.

One minor reason: we want to use the same connection as other Raft
verbs: if we can't deliver Raft append_entries or vote messages
somewhere, that endpoint should be marked dead; if we can, the
endpoint should be marked alive. So putting pings on the same
connection as the other Raft verbs is important when dealing with
weird situations where some connections are available but others are
not. Observe that in `do_get_rpc_client_idx`, we put the new verb in
the right place.

Another minor reason: we remove the awkward gossiper `echo_pinger`
abstraction which required storing and updating gossiper generation
numbers. This also removes one dependency from Raft service code to
gossiper.

Major reason 1: the gossip echo handler has a weird mechanism where a
replacing node returns errors during the replace operation to some of
the nodes. In Raft however, we want to mark servers as alive when they
are alive, including a server running on a node that's replacing
another node.

Major reason 2, related to the previous one: when server B is
replacing server A with the same IP, the failure detector will try to
ping both servers. Both servers are mapped to the same IP by the
address map, so pings to both servers will reach server B. We want
server B to respond to the pings destined for server B, but not to
pings destined for server A, so the sender can mark B alive but keep A
marked dead.

To do this, we include the destination's Raft ID in our RPCs. The
destination compares the received ID with its own. If it's different,
it returns a `wrong_destination` response, and the failure detector
knows that the ping did not reach the destination (it reached someone
else).

Yet another reason: removes "Not ready to respond gossip echo
message" log spam during replace.
2022-12-01 20:54:18 +01:00
Kamil Braun
02c64becdc db: system_keyspace: de-staticize {get,set}_raft_server_id
Part of the anti-globals war.
2022-12-01 20:54:18 +01:00
Kamil Braun
99fe580068 service/raft: make this node's Raft ID available early in group registry
Raft ID was loaded or created late in the boot procedure, in
`storage_service::join_token_ring`.

Create it earlier, as soon as it's possible (when `system_keyspace`
is started), pass it to `raft_group_registry::start` and store it inside
`raft_group_registry`.

We will use this Raft ID stored in group registry in following patches.
Also this reduces the number of disk accesses for this node's Raft ID.
It's now loaded from disk once, stored in `raft_group_registry`, then
obtained from there when needed.

This moves `raft_group_registry::start` a bit later in the startup
procedure - after `system_keyspace` is started - but it doesn't make
a difference.
2022-12-01 20:54:18 +01:00
Nadav Har'El
6fcb5302a6 alternator-test: xfail a flaky test exposing a known bug
In a recent commit 757d2a4, we removed the "xfail" mark from the test
test_manual_requests.py::test_too_large_request_content_length
because it started to pass on more modern versions of Python, with a
urllib3 bug fixed.

Unfortunately, the celebration was premature: It turns out that although
the test now *usually* passes, it sometimes fails. This is caused by
a Seastar bug scylladb/seastar#1325, which I opened #12166 to track
in this project. So unfortunately we need to add the "xfail" mark back
to this test.

Note that although the test will now be marked "xfail", it will actually
pass most of the time, so it will appear as "xpass" to people who run it.
I put a note in the xfail reason string as a reminder why this is
happening.

Fixes #12143
Refs #12166
Refs scylladb/seastar#1325

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12169
2022-12-01 20:00:46 +02:00
Kamil Braun
3cd035d1b9 test/pylib: scylla_cluster: remove ScyllaCluster.decommissioned field
The field was not used for anything. We can keep decommissioned server
in `stopped` field.

In fact it caused us a problem: since recently, we're using
`ScyllaCluster.uninstall` to clean up servers after a test suite finishes
(previously we were using `ScyllaServer.uninstall` directly). But
`ScyllaCluster.uninstall` didn't look into the `decommissioned` field,
so if a server got decommissioned, we wouldn't uninstall it, and it left
behind unnecessary artifacts even for successful tests. This is now
fixed.

Closes #12163
2022-12-01 19:07:26 +02:00
Avi Kivity
a4b77a5691 Merge 'Cleanup sstables::test_env's manager usage' from Pavel Emelyanov
Mainly this PR removes the global db::config and feature service that are used by sstables::test_env as dependencies for the embedded sstables_manager. Other than that -- drop unused methods, remove nested test_env-s and relax a few cases that use two temp dirs at a time for no gain.

Closes #12155

* github.com:scylladb/scylladb:
  test, utils: Use only one tempdir
  sstable_compaction_test: Dont create nested envs
  mutation_reader_test: Remove unused create_sstable() helper
  tests, lib: Move globals onto sstables::test_env
  tests: Use sstables::test_env.db_config() to access config
  features: Mark feature_config_from_db_config const
  sstable_3_x_test: Use env method to create sst
  sstable_3_x_test: Indentation fix after previous patch
  sstable_3_x_test: Use sstable::test_env
  test: Add config to sstable::test_env creation
  config: Add constexpr value for default murmur ignore bits
2022-12-01 17:47:25 +02:00
Pavel Emelyanov
4c6bfc078d code: Use http::re(quest|ply) instead of httpd:: ones
Recent seastar update deprecated those from httpd namespace.

fixes: #12142

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12161
2022-12-01 17:33:35 +02:00
Pavel Emelyanov
adc6ee7ea8 test, utils: Use only one tempdir
There's a do_with_cloned_tmp_directory that makes two temp dirs to toss
sstables between them. Make it go with just one, all the more so it
would then resemble the existing manipulations around the staging/ subdir.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:57 +03:00
Pavel Emelyanov
15a7b9cafa sstable_compaction_test: Dont create nested envs
The "compact" test case runs in sstables::test_env and additionally
wraps it with another instance provided by the do_with_tmp_directory
helper. It's simpler to create the temp dir by hand and use the outer env.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:56 +03:00
Pavel Emelyanov
69fe5fd054 mutation_reader_test: Remove unused create_sstable() helper
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:54 +03:00
Pavel Emelyanov
400bc2c11d tests, lib: Move globals onto sstables::test_env
There's a bunch of objects that are used by test_env as sstables_manager
dependencies. Now that no other code needs those globals, they better sit
on the test_env next to the manager.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:36 +03:00
Pavel Emelyanov
6a294b9ad6 tests: Use sstables::test_env.db_config() to access config
Currently some places use global test config, but it's going to be
removed soon, so switch to using config from environment

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:30 +03:00
Pavel Emelyanov
b4e31ad359 features: Mark feature_config_from_db_config const
It is in fact const. Besides that, the next patch will call it with a
const config at hand and would fail to compile without this fix.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:27 +03:00
Pavel Emelyanov
8178845ef3 sstable_3_x_test: Use env method to create sst
Just to make it shorter and conform to other sst env tests

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:19 +03:00
Pavel Emelyanov
8d5d05012e sstable_3_x_test: Indentation fix after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:39:09 +03:00
Pavel Emelyanov
6628d801f2 sstable_3_x_test: Use sstable::test_env
There are several cases there that construct sstables_manager by hand
with the help of a bunch of global dependencies. It's nicer to use the
existing wrapper.

(indentation left broken until next patch)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:38:46 +03:00
Pavel Emelyanov
1d8c76164f test: Add config to sstable::test_env creation
To make callers (tests) construct it with different options. In
particular, one test will soon want to construct it with a custom large
data handler of its own.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:38:18 +03:00
Pavel Emelyanov
6d0c8fb6e2 config: Add constexpr value for default murmur ignore bits
... and use it in some places of sstable_compaction_test. This will allow
getting rid of the global test_db_config thing later.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-01 13:38:15 +03:00
Botond Dénes
dbd00fd3e9 Merge 'Task manager shard repair tasks' from Aleksandra Martyniuk
The PR introduces shard_repair_task_impl which represents a repair task
that spans over a single shard repair.

repair_info is replaced with shard_repair_task_impl, since both serve
a similar purpose.

Closes #12066

* github.com:scylladb/scylladb:
  repair: reindent
  repair: replace repair_info with shard_repair_task_impl
  repair: move repair_info methods to shard_repair_task_impl
  repair: rename methods of repair_module
  repair: change type of repair_module::_repairs
  repair: keep a reference to shard_repair_task_impl in row_level_repair
  repair: move repair_range method to shard_repair_task_impl
  repair: make do_repair_ranges a method of shard_repair_task_impl
  repair: copy repair_info methods to shard_repair_task_impl
  repair: coroutinize shard task creation
  repair: define run for shard_repair_task_impl
  repair: add shard_repair_task_impl
2022-12-01 10:04:31 +02:00
Nadav Har'El
5eda8ce4fd alternator ttl: in scanning thread, don't retry the same page too many times
Since fixing issue #11737, when the expiration scanner times out reading
a page of data, it retries asking for the same page instead of giving up
on the scan and starting anew later. This retry was infinite - which can
cause problems if we have a bug in the code or several nodes down, which
can lead to getting hung in the same place in the scan for a very long
(potentially infinite) time without making any progress.

An example of such a bug was issue #12145, where we forgot to handle
shutdowns, so on shutdown of the cluster we just hung forever repeating
the same request that will never succeed. It's better in this case to
just give up on the current scan, and start it anew (from a random
position) later.

Refs #12145 (that issue was already fixed, by a different patch which
stops the iteration when shutting down - not waiting for an infinite
number of iterations and not even one more).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2022-11-30 18:42:37 +02:00
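The bounded-retry idea can be sketched like so (conceptual Python, illustrative names; the real scanner retries pages of an expiration scan):

```python
def scan_with_bounded_retries(fetch_page, pages, max_retries_per_page=10):
    # Retry a timed-out page a bounded number of times; once the budget
    # is exhausted, give up on this scan entirely so the caller can
    # restart it later (from a random position) instead of hanging.
    results = []
    for page in pages:
        for _attempt in range(max_retries_per_page):
            try:
                results.append(fetch_page(page))
                break
            except TimeoutError:
                continue
        else:
            raise RuntimeError("scan stuck; giving up, will restart later")
    return results
```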
Nadav Har'El
d08eef5a30 alternator: fix hang during shutdown of expiration-scanning thread
The expiration-scanning thread is a long-running thread which can scan
data for hours, but checks for its abort-source before fetching each
page to allow for timely shutdown. Recently, we added the ability to
retry the page fetching in case of timeout, but forgot to check the
abort source in this new retry loop - which led to an infinitely-long
shutdown in some tests, with the retry loop retrying forever.

In this patch we fix this bug by using sleep_abortable() instead of
sleep(). sleep_abortable() will throw an exception if the abort source
was triggered before or during the sleep - and this exception will
stop the scan immediately.

Fixes #12145

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2022-11-30 18:38:17 +02:00
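The difference between a plain sleep and an abortable sleep in a retry loop can be sketched in Python (conceptual only; Seastar's sleep_abortable takes an abort_source, here modeled with a threading.Event):

```python
import threading

class Aborted(Exception):
    pass

def sleep_abortable(seconds, abort_event):
    # Wakes up early and raises if the abort source is triggered before
    # or during the sleep, so a retry loop cannot outlive shutdown.
    if abort_event.wait(timeout=seconds):
        raise Aborted()

def retry_until_abort(operation, abort_event, pause=0.01):
    while True:
        try:
            return operation()
        except TimeoutError:
            sleep_abortable(pause, abort_event)
```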
Jan Ciolek
05ea0c1d60 dev/docs: add additional git pull to backport docs
Botond noted that an additional git pull
might be needed here:
https://github.com/scylladb/scylladb/pull/12138#discussion_r1035857007

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-30 16:14:02 +01:00
Jan Ciolek
e74873408b docs/dev: add a note about cherry-picking individual commits
Some people prefer to cherry-pick individual commits
so that they have fewer conflicts to resolve at once.

Add a comment about this possibility.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-30 16:06:39 +01:00
Kamil Braun
0f9d0dd86e Merge 'raft: support IP address change' from Konstantin Osipov
This is the core of dynamic IP address support in Raft, moving out the
IP address sourcing from Raft Group 0 configuration to gossip. At start
of Raft, the raft id <-> IP address translation map is hooked into the
gossiper notifications and learns the IP addresses of Raft hosts from them.

The series intentionally doesn't contain the part which speeds up the
initial cluster assembly by persisting the translation cache and using
more sources besides gossip (discovery, RPC) to show correctness of the
approach.

Closes #12035

* github.com:scylladb/scylladb:
  raft: (rpc) do not throw in case of a missing IP address in RPC
  raft: (address map) actively maintain ip <-> raft server id map
2022-11-30 15:40:18 +01:00
Aleksandra Martyniuk
78a6193c01 repair: reindent 2022-11-30 13:53:52 +01:00
Aleksandra Martyniuk
b4ad914fe1 repair: replace repair_info with shard_repair_task_impl
repair_info is deleted and all its attributes are moved to
shard_repair_task_impl.
2022-11-30 13:53:52 +01:00
Aleksandra Martyniuk
f6ec2cec92 repair: move repair_info methods to shard_repair_task_impl 2022-11-30 13:53:18 +01:00
Jan Ciolek
32663e6adb docs/dev: use 'is merged into' instead of 'becomes'
The backport instructions said that after passing
the tests, next `becomes` master, but it's more
exact to say that next `is merged into` master.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-30 13:25:10 +01:00
Jan Ciolek
28cf8a18de docs/dev: mention that new backport instructions are for the contributor
Previously the section was called:
"How to backport a patch", which could be interpreted
as instructions for the maintainer.

The new title clearly states that these instructions
are for the contributor in case the maintainer couldn't
backport the patch by themselves.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-30 13:23:15 +01:00
Takuya ASADA
4ecc08c4fe docker: switch default locale to C.UTF-8
Since we switched the scylla-machine-image locale to C.UTF-8 because
the ubuntu-minimal image does not have en_US.UTF-8 by default, we should
do the same on our docker image to reduce image size.

Verified #9570 does not occur on new image, since it is still UTF-8
locale.

Closes #12122
2022-11-30 13:58:43 +02:00
Anna Stuchlik
15cc3ecf64 doc: update the releases in the KB about updating the mode after upgrade 2022-11-30 12:53:13 +01:00
Anna Stuchlik
242a3916f0 doc: fix the broken link in the 5.1 upgrade guide 2022-11-30 12:49:20 +01:00
Alejo Sanchez
f7aa08ef25 test.py: don't stop cluster's site if not started
The site member is created in ScyllaCluster.start(); on startup failure
it might not be initialized, so check that it's present before stop()ing
it. And delete it, as it's not running and proper initialization should
call ScyllaCluster.start().

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #11939
2022-11-30 13:47:18 +02:00
Anna Stuchlik
1575d96856 doc: add the link to the 5.1-related KB article to the 5.1 upgrade guide 2022-11-30 12:40:49 +01:00
Nadav Har'El
ce347f4b67 test/cql-pytest: add test for meaning of fetch_size with filtering
A question was raised on what fetch_size (the requested page size
in a paged scan) counts when there is a filter: does it count the
rows before filtering (as scanned from disk) or after filtering (as
returned to the client)?

This patch adds a test which demonstrates that Cassandra and Scylla
behave differently in this respect: Cassandra counts post-filtering -
so fetch_size results are actually returned, while Scylla currently
counts pre-filtering.

It is arguable which behavior is the "correct" one - we discuss this in
issue #12102. But we have already had several users (such as #11340)
who complained about Scylla's behavior and expected Cassandra's behavior,
so if we decide to keep Scylla's behavior we should at least explain and
justify this decision in our documentation. Until then, let's have this
test which reminds us of this incompatibility. This test currently passes
on Cassandra and fails (xfail) on Scylla.

Refs #11340
Refs #12102

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12103
2022-11-30 12:27:06 +02:00
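The two interpretations of fetch_size can be contrasted with a small sketch (conceptual Python; not either database's actual paging code):

```python
def page_post_filtering(rows, predicate, fetch_size):
    # Cassandra-style: a page holds fetch_size rows *after* filtering,
    # so the scan keeps going until enough matching rows are found.
    page = []
    scanned = 0
    for row in rows:
        scanned += 1
        if predicate(row):
            page.append(row)
            if len(page) == fetch_size:
                break
    return page, scanned

def page_pre_filtering(rows, predicate, fetch_size):
    # Scylla-style (at the time of this patch): fetch_size bounds the
    # rows *scanned*, so the returned page may hold fewer rows.
    scanned_rows = rows[:fetch_size]
    return [r for r in scanned_rows if predicate(r)], len(scanned_rows)
```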
Nadav Har'El
8bd8ef3d03 test/cql-pytest: add regression test for old issue
This patch adds a regression test for the old issue #65 which is about
a multi-column (tuple) clustering-column relation in a SELECT when one
of these columns has reversed order. It turns out that we didn't notice,
but this issue was already solved - we just didn't have a regression test
for it. So this patch adds one. The test confirms that
Scylla now behaves as was desired when that issue was opened. The test
also passes on Cassandra, confirming that Scylla and Cassandra behave
the same for such requests.

Fixes #65

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12130
2022-11-30 12:22:21 +02:00
Michał Jadwiszczak
8e64e18b80 forward_service: add debug logs
Adds a few debug logs to see what is happening in https://github.com/scylladb/scylladb/issues/11684

Wrapped `forward_result::printer` in `seastar::value_of` to lazily
evaluate the printer.

Closes #12113
2022-11-30 12:15:26 +02:00
Yaniv Kaul
b66ca3407a doc: Typo - then -> than
Fix a typo.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes #12140
2022-11-30 12:03:56 +02:00
Botond Dénes
50aea9884b Merge 'Improve the Raft upgrade procedure' from Kamil Braun
Better logging, less code, a minor fix.

Closes #12135

* github.com:scylladb/scylladb:
  service/raft: raft_group0: less repetitive logging calls
  service/raft: raft_group0: fix sleep_with_exponential_backoff
2022-11-30 11:24:20 +02:00
Avi Kivity
6a5d9ff261 treewide: use non-experimental std::source_location
Now that we use libstdc++ 12, we can use the standardized
source_location.

Closes #12137
2022-11-30 11:06:43 +02:00
Jan Ciolek
56a802c979 docs/dev: Add backport instructions for contributors
Add instructions on how to backport a feature
to an older version of Scylla.

It contains detailed step-by-step instructions
so that people unfamiliar with the intricacies
of Scylla's repository organization can
easily get the hang of it.

This is the guide I wish I had when I had
to do my first backport.

I put it in backport.md because that
looks like the file responsible
for this sort of information.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-29 22:10:27 +01:00
Konstantin Osipov
fbe7886cc0 raft: (rpc) do not throw in case of a missing IP address in RPC
Remove raft_address_map::get_inet_address()

While at it, coroutinize some rpc methods.

To propagate up the event of a missing IP address, use
coroutine::exception() with a proper type (raft::transport_error) and a
proper error message.

This is a building block toward removing
raft_address_map::get_inet_address(), which is too generic, and shifting
the responsibility of handling missing addresses to the address map
clients. E.g. one-way RPC shouldn't throw if an address is missing, but
just drop the message.

PS An attempt to use a single template function turned out to be too
complex:
- some functions require a gate, some don't
- some return void, some future<> and some future<raft::data_type>
2022-11-29 19:55:48 +03:00
Konstantin Osipov
73e5298273 raft: (address map) actively maintain ip <-> raft server id map
1) make address map API flexible

Before this patch:
- having a mapping without an actual IP address was an
  internal error
- not having a mapping for an IP address was an internal
  error
- re-mapping to a new IP address wasn't allowed

After this patch:

- the address map may contain a mapping
  without an actual IP address, and the caller must be prepared for it:
  find() will return a nullopt. This happens when we first add an entry
  to Raft configuration and only later learn its IP address, e.g.  via
  gossip.

- it is allowed to re-map an existing entry to a new address;
2) subscribe to gossip notifications

Learning IP addresses from gossip allows us to adjust
the address map whenever a node IP address changes.
Gossiper is also the only valid source of re-mapping, other sources
(RPC) should not re-map, since otherwise a packet from a removed
server can remap the id to a wrong address and impact liveness of a Raft
cluster.

3) prime address map state with app state

Initialize the raft address map with initial
gossip application state, specifically IPs of members
of the cluster. With this, we no longer need to store
these IPs in Raft configuration (and update them when they change).

The obvious drawback of this approach is that a node
may join Raft config before it propagates its IP address
to the cluster via gossip - so the boot process has to
wait until it happens.

Gossip also doesn't tell us which IPs are members of Raft configuration,
so we subscribe to Group0 configuration changes to mark the
members of Raft config "non-expiring" in the address translation
map.

Thanks to the changes above, Raft configuration no longer
stores IP addresses.

We still keep the 'server_info' column in the raft_config system table,
in case we change our mind or decide to store something else in there.
2022-11-29 19:55:43 +03:00
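The relaxed address-map semantics described above can be sketched as follows (conceptual Python, illustrative names; the real map also handles entry expiry):

```python
class RaftAddressMap:
    # Maps Raft server IDs to IP addresses. An entry may exist without
    # an IP yet (find() returns None), and only gossip is allowed to
    # re-map an existing entry to a new IP, so a stray packet from a
    # removed server cannot hijack a mapping.
    def __init__(self):
        self._map = {}

    def add_entry(self, server_id):
        self._map.setdefault(server_id, None)

    def learn(self, server_id, ip, source):
        current = self._map.get(server_id)
        if current is not None and current != ip and source != "gossip":
            return  # non-gossip sources must not re-map
        self._map[server_id] = ip

    def find(self, server_id):
        return self._map.get(server_id)
```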
Kamil Braun
3dbcff435f service/raft: raft_group0: less repetitive logging calls
Some log messages in retry loops in the Raft upgrade procedure included
a sentence like "sleeping before retrying..."; but not all of them.

With the recently added `sleep_with_exponential_backoff` abstraction we
can put this "sleeping..." message in a single place, and it's also easy
to say how long we're going to sleep.

I also enjoy using this `source_location` thing.
2022-11-29 17:42:43 +01:00
Nadav Har'El
c5121cf273 cql: fix column-name aliases in SELECT JSON
The SELECT JSON statement, just like SELECT, allows the user to rename
selected columns using an "AS" specification. E.g., "SELECT JSON v AS foo".
This specification was not honored: We simply forgot to look at the
alias in SELECT JSON's implementation (we did it correctly in regular
SELECT). So this patch fixes this bug.

We had two tests in cassandra_tests/validation/entities/json_test.py
that reproduced this bug. The checks in those tests now pass, but these
two tests still continue to fail after this patch because of two other
unrelated bugs that were discovered by the same tests. So in this patch
I also add a new test just for this specific issue - to serve as a
regression test.

Fixes #8078

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12123
2022-11-29 18:16:19 +02:00
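The fix amounts to honoring the alias when building JSON keys; a conceptual Python sketch (illustrative names, not Scylla's implementation):

```python
def select_json_row(column_names, aliases, values):
    # A column renamed with AS ("SELECT JSON v AS foo") must use its
    # alias as the JSON key; columns without an alias keep their name.
    keys = [aliases.get(name, name) for name in column_names]
    return dict(zip(keys, values))
```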
Avi Kivity
faf11587fa Update seastar submodule
* seastar 4f4cc00660...3a5db04197 (16):
  > tls: add missing include <map>
  > Merge 'util/process: use then_unpack to help automatically unpack tuple.' from Jianyong Chen
  > HTTP: define formatter for status_type to fix build.
  > fsnotifier: move it into namespace experimental and add docs.
  > Move fsnotify.hh to the 'include' directory for public use.
  > Merge 'reactor: define make_pipe() and use make_pipe() in reactor::spawn()' from Kefu Chai
  > Merge 'Fix: error when compiling http_client_demo' from Amossss
  > util/process: using `data_sink_impl::put`
  > Merge 'dns: serialize UDP sends.' from Calle Wilund
  > build: use correct version when finding liburing
  > Merge 'Add simple http client' from Pavel Emelyanov
  > future: use invoke_result instead of nested requirements
  > Merge 'reactor: use separate calls in reactor and reactor_backend for read/write/sendmsg/recvmsg' from Kefu Chai
  > util, core: add spawn_process() helper
  > parallel utils: add note about shard-local parallelism
  > shared_mutex: return typed exceptional future in with_* error handlers

Closes #12131
2022-11-29 18:10:06 +02:00
Kamil Braun
580bdec875 service/raft: raft_group0: fix sleep_with_exponential_backoff
It was immediately jumping to _max_retry_period.
2022-11-29 16:27:59 +01:00
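The intended behavior can be sketched as follows (conceptual Python; the bug was the retry period jumping straight to the cap instead of doubling up to it):

```python
def backoff_periods(initial, maximum, retries):
    # Exponential backoff: start at `initial` and double on each retry,
    # capped at `maximum` (rather than jumping to `maximum` at once).
    periods = []
    period = initial
    for _ in range(retries):
        periods.append(period)
        period = min(period * 2, maximum)
    return periods
```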
Nadav Har'El
6bc3075bbd test/alternator: increase timeout on TTL tests
Some of the tests in test/alternator/test_ttl.py need an expiration scan
pass to complete and expire items. In development builds on developer
machines, this usually takes less than a second (our scanning period is
set to half a second). However, in debug builds on Jenkins each scan
often takes up to 100 (!) seconds (this is the record we've seen so far).
This is why we set the tests' timeout to 120.

But recently we saw another test run failing. I think the problem is
that in some case, we need not one, but *two* scanning passes to
complete before the timeout: It is possible that the test writes an
item right after the current scan passed it, so it doesn't get expired,
and then we a second scan at a random position, possibly making that
item we mention one of the last items to be considered - so in total
we need to wait for two scanning periods, not one, for the item to
expire.

So this patch increases the timeout from 120 seconds to 240 seconds -
more than twice the highest scanning time we ever saw (100 seconds).

Note that this timeout is just a timeout, it's not the typical test
run time: The test can finish much more quickly, as little as one
second, if items expire quickly on a fast build and machine.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12106
2022-11-29 16:37:54 +03:00
Nadav Har'El
1f8adda4b2 Merge 'treewide: improve compatibility with gcc 12' from Avi Kivity
Fix some issues found with gcc 12. Note we can't fully compile with gcc yet, due to [1].

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98056

Closes #12121

* github.com:scylladb/scylladb:
  utils: observer: qualify seastar::noncopyable_function
  sstables: generation_type: forgo constexpr on hash of generation_type
  logalloc: disambiguate types and non-type members
  task_manager: disambiguate types and non-type members
  direct_failure_detector: don't change meaning of endpoint_liveness
  schema: abort on illegal per column computation kind
  database: abort on illegal per partition rate limit operation
  mutation_fragment: abort on illegal fragment type
  per_partition_rate_limit_options: abort on illegal operation type
  schema: drop unused lambda
  mutation_partition: drop unused lambda
  cql3: create_index_statement: remove unused lambda
  transport: prevent signed and unsigned comparison
  database: don't compare signed and unsigned types
  raft: don't compare signed and unsigned types
  compaction: don't compare signed and unsigned compaction counts
  bytes_ostream: don't take reference to packed variable
2022-11-29 13:57:24 +02:00
Avi Kivity
ea99750de7 test: give tests less-unique identifiers
Test identifiers are globally unique, but this makes them less
useful in the Jenkins Test Result Analyzer view. For example,
counter_test can be counter_test.432 in one run and counter_test.442
in another. Jenkins considers them different, and so we don't see
a trend.

Limit the id uniqueness within a test case, so that we'll have
counter_test.{1, 2, 3} consistently. Those tests will be grouped
together so we can see pass/fail trends.

Closes #11946
2022-11-29 13:14:14 +02:00
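Limiting id uniqueness to a test case can be sketched as (conceptual Python, illustrative names):

```python
from collections import defaultdict

class TestIdAllocator:
    # Allocate ids unique only within a test case, so a given test is
    # consistently e.g. counter_test.1, counter_test.2 across runs, and
    # Jenkins can group results into a trend.
    def __init__(self):
        self._next = defaultdict(int)

    def allocate(self, test_case):
        self._next[test_case] += 1
        return f"{test_case}.{self._next[test_case]}"
```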
Yaniv Kaul
fef8e43163 doc: cluster management: Replace a misplaced period with a bulleted list of items
Signed-Off-By: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes #12125
2022-11-29 12:42:24 +02:00
Botond Dénes
e9fec761a2 Merge 'doc: document the procedure for updating the mode after upgrade' from Anna Stuchlik
Fix https://github.com/scylladb/scylla-docs/issues/4126

Closes #11122

* github.com:scylladb/scylladb:
  doc: add info about the time-consuming step due to resharding
  doc: add the new KB to the toctree
  doc: doc: add a KB about updating the mode in perftune.yaml after upgrade
2022-11-29 12:41:46 +02:00
Avi Kivity
ea901fdb9d cql3: expr: fold null into untyped_constant/constant
Our `null` expression, after the prepare stage, is redundant with a
`constant` expression containing the value NULL.

Remove it. Its role in the unprepared stage is taken over by
untyped_constant, which gains a new type_class enumeration to
represent it.

Some subtleties:
 - Usually, handling of null and untyped_constant, or null and constant
   was the same, so they are just folded into each other
 - LWT "like" operator now has to discriminate between a literal
   string and a literal NULL
 - prepare and test_assignment were folded into the corresponding
   untyped_constant functions. Some care had to be taken to preserve
   error messages.

Closes #12118
2022-11-29 11:02:18 +02:00
Aleksandra Martyniuk
8bc0af9e34 repair: fix double start of data sync repair task
Currently, each data sync repair task is started (and hence run) twice.
Thus, when the two runs finish far enough apart in time, the
following situation may occur:
- the first run finishes
- after some time (ttl) the task is unregistered from the task manager
- the second run finishes and attempts to finish the task which does
  not exist anymore
- memory access causes a segfault.

The second call to start is deleted. A check is added
to the start method to ensure that each task is started at most once.

Fixes: #12089

Closes #12090
2022-11-29 00:00:10 +02:00
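The added guard can be sketched as (conceptual Python; the real check lives in the task manager's start method):

```python
class Task:
    # A task may be started at most once; a second start() is exactly
    # the bug above, where the second run outlived the task's
    # unregistration and touched freed memory.
    def __init__(self):
        self._started = False

    def start(self):
        if self._started:
            raise RuntimeError("task already started")
        self._started = True
```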
Avi Kivity
9765b2e3bc cql3: expr: drop remnants of bool component from expression
In ad3d2ee47d, we replaced `bool` as an expression element
(representing a boolean constant) with `constant`. But a comment
and a concept continue to mention it.

Remove the comment and the concept fragment.

Closes #12119
2022-11-28 23:18:26 +02:00
Pavel Emelyanov
ae79669fd2 topology: Be less restrictive about missing endpoints
Recent changes in topology restricted the get_dc/get_rack calls. Older
code was trying to locate the endpoint in gossiper, then in system
keyspace cache and if the endpoint was not found in both -- returned
"default" location.

New code generates internal error in this case. This approach already
helped to spot several BUGs in code that had been eventually fixed, but
echoes of that change still pop up.

This patch relaxes the "missing endpoint" case by printing a warning in
logs and returning back the "default" location like old code did.

tests: update_cluster_layout_tests.py::*
       hintedhandoff_additional_test.py::TestHintedHandoff::test_hintedhandoff_rebalance
       bootstrap_test.py::TestBootstrap::test_decommissioned_wiped_node_can_join
       bootstrap_test.py::TestBootstrap::test_failed_bootstap_wiped_node_can_join
       materialized_views_test.py::TestMaterializedViews::test_decommission_node_during_mv_insert_4_nodes

refs: #11900
refs: #12054
fixes: #11870

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12067
2022-11-28 22:01:09 +02:00
Avi Kivity
3a6eafa8c6 utils: observer: qualify seastar::noncopyable_function
gcc checks name resolution eagerly, and can't find noncopyable_function
as this header doesn't include "seastarx.hh". Qualify the name
so it finds it.
2022-11-28 21:58:30 +02:00
Avi Kivity
5ae98ab3de sstables: generation_type: forgo constexpr on hash of generation_type
std::hash isn't constexpr, so gcc refuses to make hash of generation_type
constexpr. It's pointless anyway since we never have a compile-time
sstable generation.
2022-11-28 21:58:30 +02:00
Avi Kivity
a2d43bb851 logalloc: disambiguate types and non-type members
logalloc::tracker has some members with the same names as types from
namespace scope. gcc (rightfully) complains that this changes
the meaning of the name. Qualify the types to disambiguate.
2022-11-28 21:58:30 +02:00
Avi Kivity
ed5da87930 task_manager: disambiguate types and non-type members
task_manager has some members with the same names as types from
namespace scope. gcc (rightfully) complains that this changes
the meaning of the name. Qualify the types to disambiguate.
2022-11-28 21:58:30 +02:00
Avi Kivity
27be1670d1 direct_failure_detector: don't change meaning of endpoint_liveness
It's used both as a type and as a member. Qualify the type so they
have different names.
2022-11-28 21:58:30 +02:00
Avi Kivity
735c46cb63 schema: abort on illegal per column computation kind
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
2022-11-28 21:58:30 +02:00
Avi Kivity
f73a51250c database: abort on illegal per partition rate limit operation
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
2022-11-28 21:58:30 +02:00
Avi Kivity
f469885b41 mutation_fragment: abort on illegal fragment type
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
2022-11-28 21:58:30 +02:00
Avi Kivity
a3c89cedbd per_partition_rate_limit_options: abort on illegal operation type
Without memory corruption it's not possible for the switch to
fall through, and the compiler will error if we forget to add
a case. The compiler however is obliged to consider that we might
store some other value in the variable.
2022-11-28 21:58:30 +02:00
Avi Kivity
7ec28a81bf schema: drop unused lambda
get_cell is defined but not used.
2022-11-28 21:58:30 +02:00
Avi Kivity
c493a2379a mutation_partition: drop unused lambda
should_purge_row_tombstone is defined but not used.
2022-11-28 21:58:30 +02:00
Avi Kivity
e25bf62871 cql3: create_index_statement: remove unused lambda
throw_exception is defined but not used.
2022-11-28 21:58:30 +02:00
Avi Kivity
5dedf85288 transport: prevent signed and unsigned comparison
This can lead to undefined behavior. Cast to unsigned, after
we've verified the value is indeed positive.
2022-11-28 21:58:30 +02:00
Avi Kivity
77be69b600 database: don't compare signed and unsigned types
gcc warns it can lead to undefined behavior, though 2G entries
in a list of mutations are unlikely. Use the correct type for iteration.
2022-11-28 21:58:30 +02:00
Avi Kivity
fb6804e7a4 raft: don't compare signed and unsigned types
gcc warns it can lead to undefined behavior, though 2G entries
in a list of mutations are unlikely. Use the correct type for iteration.
2022-11-28 21:58:30 +02:00
Avi Kivity
f565db75ce compaction: don't compare signed and unsigned compaction counts
gcc warns as this can lead to incorrect results. Cast the threshold
to an unsigned type (we know it's positive at this point) to avoid
the warning.
2022-11-28 21:41:56 +02:00
Avi Kivity
23b94ac391 bytes_ostream: don't take reference to packed variable
bytes_ostream is packed, so its _begin member is packed as well.
gcc (correctly) disallows binding an unaligned variable to an
aligned reference, and complains.

Make it happy by open-coding the exchange operation.
2022-11-28 21:40:18 +02:00
Nadav Har'El
5480211061 Merge 'test.py: support node replace operation' from Kamil Braun
The `add_server` function now takes an optional `ReplaceConfig` struct
(implemented using `NamedTuple`), which specifies the ID of the replaced
server and whether to reuse the IP address.

If we want to reuse the IP address, we don't allocate one using the host
registry. This required certain refactors: moving the code responsible
for allocation of IPs outside `ScyllaServer`, into `ScyllaCluster`.

Add two tests, but they are now skipped: one of them is failing (the new
node is unable to join group 0) and both suffer from a hardcoded 60-second
sleep in Scylla.

Closes #12032

* github.com:scylladb/scylladb:
  test/topology: simple node replace tests (currently disabled)
  test/pylib: scylla_cluster: support node replace operation
  test/pylib: scylla_cluster: move members initialization to constructor
  test/pylib: scylla_cluster: (re)lease IP addr outside ScyllaServer
  test/pylib: scylla_cluster: refactor create_server parameters to a struct
  test.py: stop/uninstall clusters instead of servers when cleaning up
  test/pylib: artifact_registry: replace `Awaitable` type with `Coroutine`
  test.py: prepare for adding extra config from test when creating servers
  test/pylib: manager_client: convert `add_server` to use `put_json`
  test/pylib: rest_client: allow returning JSON data from `put_json`
  test/pylib: scylla_cluster: don't import from manager_client
2022-11-28 16:06:39 +02:00
Takuya ASADA
4d8fb569a1 install.sh: drop locale workaround from python3 thunk
Since #7408 does not occur on the current python3 version (3.11.0), let's drop
the workaround.

Closes #12097
2022-11-28 13:07:03 +02:00
Anna Stuchlik
452915cef6 doc: set the documentation version 5.1 as default (latest)
Closes #12105
2022-11-28 12:02:13 +01:00
Avi Kivity
380da0586c Update tools/python3 submodule (drop locale workaround)
* tools/python3 773070e...548e860 (1):
  > install.sh: drop locale workaround from python3 thunk
2022-11-28 12:24:13 +02:00
Avi Kivity
0da66371a5 storage_proxy: coroutinize inner continuation of create_hint_sync_point()
It is part of a coroutine::parallel_for_each(), which is safe for lambda coroutines.

Closes #12057
2022-11-28 11:30:00 +02:00
Avi Kivity
d12d42d1a6 Revert "configure: temporarily disable wasm support for aarch64"
This reverts commit e2fe8559ca. I
ran all the release mode tests on aarch64 with it reverted, and
it passes. So it looks like whatever problems we had with it
were fixed.

Closes #12072
2022-11-28 11:30:00 +02:00
Nadav Har'El
99a72a9676 Merge 'cql3: expr: make it possible to evaluate expr::binary_operator' from Jan Ciołek
As a part of CQL rewrite we want to be able to perform filtering by calling `evaluate()` on an expression and checking if it evaluates to `true`. Currently trying to do that for a binary operator would result in an error.

Right now checking if a binary operation like `col1 = 123` is true is done using `is_satisfied_by`, which is able to check if a binary operation evaluates to true for a small set of predefined cases.

Eventually once the grammar is relaxed we will be able to write expressions like: `(col1 < col2) = (1 > ?)`, which doesn't fit with what `is_satisfied_by` is supposed to do.
Additionally, expressions like `1 = NULL` should evaluate to `NULL`, not `true` or `false`. `is_satisfied_by` is not able to express that properly.

The proper way to go is implementing `evaluate(binary_operator)`, which takes a binary operation and returns what the result of it would be.

Implementing `prepare_expression` for `binary_operator` requires us to be able to evaluate it first. In the next PR I will add support for `prepare_expression`.

Closes #12052

* github.com:scylladb/scylladb:
  cql-pytest: enable two unset value tests that pass now
  cql-pytest: reduce unset value error message
  cql3: expr: change unset value error messages to lowercase
  cql_pytest: ensure that where clauses like token(p) = 0 AND p = 0 are rejected
  cql3: expr: remove needless braces around switch cases
  cql3: move evaluation IS_NOT NULL to a separate function
  expr_test: test evaluating LIKE binary_operator
  expr_test: test evaluating IS_NOT binary_operator
  expr_test: test evaluating CONTAINS_KEY binary_operator
  expr_test: test evaluating CONTAINS binary_operator
  expr_test: test evaluating IN binary_operator
  expr_test: test evaluating GTE binary_operator
  expr_test: test evaluating GT binary_operator
  expr_test: test evaluating LTE binary_operator
  expr_test: test evaluating LT binary_operator
  expr_test: test evaluating NEQ binary_operator
  expr_test: test evaluating EQ binary_operator
  cql3: expr properly handle null in is_one_of()
  cql3: expr properly handle null in like()
  cql3: expr properly handle null in contains_key()
  cql3: expr properly handle null in contains()
  cql3: expr: properly handle null in limits()
  cql3: expr: remove unneeded overload of limits()
  cql3: expr: properly handle null in equality operators
  cql3: expr: remove unneeded overload of equal()
  cql3: expr: use evaluate(binary_operator) in is_satisfied_by
  cql3: expr: handle IS NOT NULL when evaluating binary_operator
  cql3: expr: make it possible to evaluate binary_operator
  cql3: expr: accept expression as lhs argument to like()
  cql3: expr: accept expression as lhs in contains_key
  cql3: expr: accept expression as lhs argument to contains()
2022-11-28 11:30:00 +02:00
Nadav Har'El
1e59c3f9ef alternator: if TTL scan times out, continue immediately
The Alternator TTL expiration scanner scans an entire table using many
small pages. If any of those pages time out for some reason (e.g., an
overload situation), we currently consider the entire scan to have failed
and wait for the next scan period (which by default is 24 hours) when
we start the scan from scratch (at a random position). There is a risk
that if these timeouts are common enough to occur once or more per
scan, the result is that we double or more the effective expiration lag.

A better solution, done in this patch, is to retry from the same position
if a single page timed out - immediately (or almost immediately, we add
a one-second sleep).

Fixes #11737

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12092
2022-11-28 11:30:00 +02:00
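The retry-from-the-same-position idea can be sketched like this. It is a hedged sketch, not the Alternator scanner's actual code; PageTimeout, read_page, and the page list are stand-ins for the real paged scan:

```python
import time

class PageTimeout(Exception):
    """Stand-in for a per-page read timeout."""

def scan_table(pages, read_page, retry_sleep=1.0, sleep=time.sleep):
    """Scan page by page; on a timeout, sleep briefly and retry the
    same position instead of abandoning the whole scan until the
    next scan period."""
    results = []
    i = 0
    while i < len(pages):
        try:
            results.append(read_page(pages[i]))
        except PageTimeout:
            sleep(retry_sleep)  # almost-immediate retry from the same page
            continue            # do not advance i
        i += 1
    return results
```

The key point is that `i` is not advanced on failure, so a transient timeout costs one short sleep rather than a full scan period of extra expiration lag.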
Avi Kivity
45a57bf22d Update tools/java submodule (revert scylla-driver)
scylla-driver causes dtests to fail randomly (likely
due to incorrect handling of the USE statement). Revert
it.

* tools/java 73422ee114...1c06006447 (2):
  > Revert "Add Scylla Cloud serverless support"
  > Revert "Switch cqlsh to use scylla-driver"
2022-11-28 11:29:08 +02:00
Benny Halevy
8f584a9a80 storage_service: handle_state_normal: always update_topology before update_normal_tokens
update_normal_tokens checks that the endpoint is in topology.
Currently we call update_topology on this path only if it's
not a normal_token_owner, but there are paths where the
endpoint could be a normal token owner yet still
be pending in topology, so always update it, just in case.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-28 11:25:36 +02:00
Benny Halevy
6b13fd108a storage_service: handle_state_normal: delete outdated comment regarding update pending ranges race
asias@scylladb.com said:
> This comment was moved up to the wrong place when tmptr->update_topology was added.
> There is no race now since we use the copy-update-replace method to update token_metadata.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-28 11:25:36 +02:00
Kefu Chai
af011aaba1 utils/variant_element: simplify is_variant_element with right fold
for better readability than the recursive approach.

Signed-off-by: Kefu Chai <tchaikov@gmail.com>

Closes #12091
2022-11-27 16:34:34 +02:00
Avi Kivity
78222ea171 Update tools/java submodule (cqlsh system_distributed_everywhere is a system keyspace)
* tools/java 874e2d529b...73422ee114 (1):
  > Mark "system_distributed_everywhere" as system ks
2022-11-27 15:37:57 +02:00
Aleksandra Martyniuk
9a3d114349 tasks: move methods from task_manager to source file
Methods from tasks::task_manager and nested classes are moved
to source file.

Closes #12064
2022-11-27 15:09:28 +02:00
Piotr Dulikowski
22fbf2567c utils/abi: don't use the deprecated std::unexpected_handler
Recently, clang started complaining about std::unexpected_handler being
deprecated:

```
In file included from utils/exceptions.cc:18:
./utils/abi/eh_ia64.hh:26:10: warning: 'unexpected_handler' is deprecated [-Wdeprecated-declarations]
    std::unexpected_handler unexpectedHandler;
         ^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/exception:84:18: note: 'unexpected_handler' has been explicitly marked deprecated here
  typedef void (*_GLIBCXX11_DEPRECATED unexpected_handler) ();
                 ^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/x86_64-redhat-linux/bits/c++config.h:2343:32: note: expanded from macro '_GLIBCXX11_DEPRECATED'
                               ^
/usr/bin/../lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/x86_64-redhat-linux/bits/c++config.h:2334:46: note: expanded from macro '_GLIBCXX_DEPRECATED'
                                             ^
1 warning generated.
```

According to cppreference.com, it was deprecated in C++11 and removed in
C++17 (!).

This commit gets rid of the warning by inlining the
std::unexpected_handler typedef, which is defined as a pointer to a
function taking no arguments and returning void.

Fixes: #12022

Closes #12074
2022-11-27 12:25:20 +02:00
Alejo Sanchez
5ff4b8b5f8 pytest: catch rare exception for random tables test
On rare occasions a SELECT on a DROPped table throws
cassandra.ReadFailure instead of cassandra.InvalidRequest. This could
not be reproduced locally.

Catch both exceptions as the table is not present anyway and it's
correctly marked as a failure.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>

Closes #12027
2022-11-27 10:26:55 +02:00
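Accepting either exception comes down to passing a tuple of exception types to the raises-check. A minimal sketch with stand-in exception classes follows; the real test uses the cassandra driver's `InvalidRequest` and `ReadFailure` with `pytest.raises`, which accepts a tuple the same way `assertRaises` does here:

```python
import unittest

class InvalidRequest(Exception): ...
class ReadFailure(Exception): ...  # stand-ins for the driver's exceptions

def select_from_dropped_table(overloaded):
    # Placeholder: a SELECT on a dropped table usually raises
    # InvalidRequest, but can rarely surface as ReadFailure.
    raise (ReadFailure() if overloaded else InvalidRequest())

class DroppedTableTest(unittest.TestCase):
    def test_select_after_drop(self):
        # Accept either exception: the table is gone either way,
        # and both outcomes correctly mark the query as failed.
        for overloaded in (False, True):
            with self.assertRaises((InvalidRequest, ReadFailure)):
                select_from_dropped_table(overloaded)
```

In pytest style the equivalent is `with pytest.raises((InvalidRequest, ReadFailure)): ...`.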
Michał Chojnowski
a75e4e1b23 db: config: disable global index page caching by default
Global index page caching, as introduced in 4.6
(078a6e422b and 9f957f1cf9) has proven to be misdesigned,
because it poses a risk of catastrophic performance regressions in
common workloads by flooding the cache with useless index entries.
Because of that risk, it should be disabled by default.

Refs #11202
Fixes #11889

Closes #11890
2022-11-26 14:27:26 +02:00
Aleksandra Martyniuk
c2ea3f49e6 repair: rename methods of repair_module
Methods of repair_module connected with repair_module::_repairs
are renamed to match repair_module::_repairs type.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
13dbd75ba8 repair: change type of repair_module::_repairs
As a preparation to replacing repair_info with shard_repair_task_impl,
type of _repairs in repair module is changed from
std::unordered_map<int, lw_shared_ptr<repair_info>> to
std::unordered_map<int, tasks::task_id>.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
55c01a1beb repair: keep a reference to shard_repair_task_impl in row_level_repair
As a part of replacing repair_info with shard_repair_task_impl,
instead of a reference to repair_info, row_level_repair keeps
a reference to shard_repair_task_impl.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
9b664570f0 repair: move repair_range method to shard_repair_task_impl 2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
3ac5ba7b28 repair: make do_repair_ranges a method of shard_repair_task_impl
Function do_repair_ranges is directly connected to shard repair tasks.
Turning it into shard_repair_task_impl method enables an access to tasks'
members with no additional intermediate layers.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
a09dfcdacd repair: copy repair_info methods to shard_repair_task_impl
Methods of repair_info are copied to shard_repair_task_impl. They are
not used yet, it's a preparation for replacing repair_info with
shard_repair_task_impl.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
a4b1bdb56c repair: coroutinize shard task creation 2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
996c0f3476 repair: define run for shard_repair_task_impl
Operations performed as a part of shard repair are moved
to shard_repair_task_impl run method.
2022-11-25 16:41:02 +01:00
Aleksandra Martyniuk
ba9770ea02 repair: add shard_repair_task_impl
Create a task spanning over a repair performed on a given shard.
2022-11-25 16:40:49 +01:00
Anna Stuchlik
d5f676106e doc: remove the LWT page from the index of Enterprise features
Closes #12076
2022-11-24 21:59:05 +02:00
Aleksandra Martyniuk
dcc17037c7 repair: fix bad cast in tasks::task_id parsing
In system_keyspace::get_repair_history, the value of repair_uuid
was read from the row as tasks::task_id.
tasks::task_id is represented by an abstract_type specific
to utils::UUID. Thus, since their typeids differ, bad_cast
was thrown.

Now repair_uuid is read from the row as utils::UUID and then cast.
Since it is no longer needed, data_type_for<tasks::task_id> is deleted.

Fixes: #11966

Closes #12062
2022-11-24 19:37:44 +02:00
Jan Ciolek
77c7d8b8f6 cql-pytest: enable two unset value tests that pass now
While implementing evaluate(binary_operator),
missing checks for unset values were added
to comparisons in the filtering code.

Because of that, some tests for unset values
started passing.

There are still other tests for unset value
that are failing because Scylla doesn't
have all the checks that it should.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-24 17:07:17 +01:00
Jan Ciolek
5bc0bc6531 cql-pytest: reduce unset value error message
When an unset value appears in an invalid place
both Cassandra and Scylla throw an error.

The tests were written with Cassandra
and thus the expected error messages were
exactly the same as produced by Cassandra.

Scylla produces different error messages,
but both databases return messages with
the text 'unset value'.

Reduce the expected message text
from the whole message to something
that contains 'unset value'.

It would be hard to mimic Cassandra's
error messages in Scylla. There is no
point in spending time on that.
Instead it's better to modify the tests
so that they are able to work with
both Cassandra and Scylla.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-24 17:04:07 +01:00
Jan Ciolek
08f40a116d cql3: expr: change unset value error messages to lowercase
The messages used to contain UNSET_VALUE
in capital letters, but the tests
expect messages with 'unset value'.

Change the message so that it can
match the expected error text in tests.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-24 17:02:44 +01:00
Kamil Braun
fda6403b29 test/topology: simple node replace tests (currently disabled)
Add two node replace tests using the freshly added infrastructure.

One test replaces a node while using a different IP. It is disabled
because the replace operation has an unconditional 60-seconds sleep
(it doesn't depend on the ring_delay setting for some reason). The sleep
needs to be fixed before we can enable this test.

The other test replaces while reusing the replaced node's IP.
Additionally to the sleep, the test fails because the node cannot join
group 0; it's stuck in an infinite loop of trying to join:
```
INFO  2022-11-18 15:56:19,933 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found no local group 0. Discovering...
INFO  2022-11-18 15:56:19,933 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader b7047f7e-03e6-4797-a723-24054201f91d
INFO  2022-11-18 15:56:19,934 [shard 0] raft_group0 - Server 8de951fd-a528-4a82-ac54-592ea269537f is starting group 0 with id 25d2b050-6751-11ed-b534-c3c40c275dd3
WARN  2022-11-18 15:56:20,935 [shard 0] raft_group0 - failed to modify config at peer b7047f7e-03e6-4797-a723-24054201f91d: seastar::rpc::timeout_error (rpc call timed out). Retrying.
INFO  2022-11-18 15:56:21,937 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader ee0175ea-6159-4d4c-9d7c-95c934f8a408
WARN  2022-11-18 15:56:22,937 [shard 0] raft_group0 - failed to modify config at peer ee0175ea-6159-4d4c-9d7c-95c934f8a408: seastar::rpc::timeout_error (rpc call timed out). Retrying.
INFO  2022-11-18 15:56:23,938 [shard 0] raft_group0 - server 8de951fd-a528-4a82-ac54-592ea269537f found group 0 with group id 25d2b050-6751-11ed-b534-c3c40c275dd3, leader ee0175ea-6159-4d4c-9d7c-95c934f8a408
WARN  2022-11-18 15:56:24,939 [shard 0] raft_group0 - failed to modify config at peer ee0175ea-6159-4d4c-9d7c-95c934f8a408: seastar::rpc::timeout_error (rpc call timed out). Retrying.
```
and so on.
2022-11-24 16:26:23 +01:00
Kamil Braun
2f60550ff3 test/pylib: scylla_cluster: support node replace operation
The `add_server` function now takes an optional `ReplaceConfig` struct
(implemented using `NamedTuple`), which specifies the ID of the replaced
server and whether to reuse the IP address.

If we want to reuse the IP address, we don't allocate one using the host
registry.

Since now multiple servers can have the same IP, introduce a
`leased_ips` set to `ScyllaCluster` which is used when `uninstall`ing
the cluster - to make sure we don't `release_host` the same host twice.
2022-11-24 16:26:23 +01:00
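The ReplaceConfig mechanism described above can be sketched as follows. The field and function names approximate the pylib code and should be treated as illustrative, not the exact API:

```python
from typing import NamedTuple, Optional

class ReplaceConfig(NamedTuple):
    # Illustrative fields: which server is being replaced, and whether
    # the new server should inherit its IP address.
    replaced_id: str
    reuse_ip_addr: bool

def pick_server_ip(server_ips: dict,
                   lease_ip,
                   replace_cfg: Optional[ReplaceConfig] = None) -> str:
    """Pick an IP for a new server: reuse the replaced server's address
    when requested, otherwise lease a fresh one from the host registry."""
    if replace_cfg is not None and replace_cfg.reuse_ip_addr:
        return server_ips[replace_cfg.replaced_id]  # reuse, don't lease
    return lease_ip()  # stand-in for HostRegistry allocation
```

When the IP is reused, the cluster must also track leased addresses in a set so that uninstalling does not release the same host twice, as the commit notes.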
Kamil Braun
d80247f912 test/pylib: scylla_cluster: move members initialization to constructor
Previously some members had to be initialized in `install` because
that's when we first knew the IP address.

Now we know the IP address during construction, which allows us to make
the code a bit shorter and simpler, and establish invariants: some
members (such as `self.config`) are now valid for the entire lifetime of
the server object.

`install()` is reduced to performing only side effects (creating
directories, writing config files), all calculation is done inside the
constructor.
2022-11-24 16:26:23 +01:00
Kamil Braun
3934eefd20 test/pylib: scylla_cluster: (re)lease IP addr outside ScyllaServer
`ScyllaServer`s were constructed without IP addresses. They leased an IP
address from `HostRegistry` and released them in `uninstall`.

This responsibility was now moved into `ScyllaCluster`, which leases an
IP address for a server before constructing it, and passes it to the
constructor. It releases the addresses of its servers when uninstalling
itself.

This will allow the cluster to reuse the IP address of an existing
server in that cluster when adding a new server which wants to replace
the existing one. Instead of leasing a new address, it will pass
the existing IP address to the new server's constructor.

The refactor is also nice in that it establishes an invariant for
`ScyllaServer`, simplifying reasoning about the class: now it has
an `ip_addr` field at all times.

`host_registry` was moved from `ScyllaServer` to `ScyllaCluster`.
2022-11-24 16:26:23 +01:00
Kamil Braun
9d5e1191da test/pylib: scylla_cluster: refactor create_server parameters to a struct
`ScyllaCluster` constructor takes a function `create_server` which
itself takes 3 parameters now. Soon it will take a 4th. The list of
parameters is repeated at the constructor definition and the call site
of the constructor, with many parameters it begins being tiresome.
Refactor the list of parameters to a `NamedTuple`.
2022-11-24 16:26:23 +01:00
Kamil Braun
d582666293 test.py: stop/uninstall clusters instead of servers when cleaning up
`self.artifacts` was calling `ScyllaServer.stop` and
`ScyllaServer.uninstall`. Now it calls `ScyllaCluster.stop` and
`ScyllaCluster.uninstall`, which underneath stops/uninstalls
servers in this cluster.

We must be a bit more careful now in case installing/starting a
server inside a cluster fails: there are no server cleanup artifacts,
and a server is added to cluster's `running` map only after
`install_and_start` finishes (until that happens,
`ScyllaCluster.stop/uninstall` won't catch this server).
So handle failures explicitly in `install_and_start`.

This commit does not logically change how the tests are running - every
started server belongs to some cluster, so it will be cleaned up
- but it's an important refactor.

It will allow us to move IP address (de)allocation code outside
`ScyllaServer`, into `ScyllaCluster`, which in turn will allow us to
implement node replace operation for the case where we want to reuse
the replaced node's IP.

Also, `ScyllaCluster.uninstall` was unused before this change, now it's
used.
2022-11-24 16:26:17 +01:00
Avi Kivity
29a4b662f8 Merge 'doc: document the Alternator TTL feature as GA' from Anna Stuchlik
Currently, TTL is listed as one of the experimental features: https://docs.scylladb.com/stable/alternator/compatibility.html#experimental-api-features

This PR moves the feature description from the Experimental Features section to a separate section.
I've also added some links and improved the formatting.

@tzach I've relied on your release notes for RC1.

Refs: https://github.com/scylladb/scylladb/issues/5060

Closes #11997

* github.com:scylladb/scylladb:
  Update docs/alternator/compatibility.md
  doc: update the link to Enabling Experimental Features
  doc: remove the note referring to the previous ScyllaDB versions and add the relevant limitation to the paragraph
  doc: update the links to the Enabling Experimental Features section
  doc: add the link to the Enabling Experimental Features section
  doc: move the TTL Alternator feature from the Experimental Features section to the production-ready section
2022-11-24 17:22:05 +02:00
Nadav Har'El
2dedb5ea75 alternator: make Alternator TTL feature no longer "experimental"
Until now, the Alternator TTL feature was considered "experimental",
and had to be manually enabled on all nodes of the cluster to be usable.

This patch removes this requirement and in essence GAs this feature.

Even after this patch, Alternator TTL is still a "cluster feature",
i.e., for this feature to be usable every node in the cluster needs
to support it. If any of the nodes is old and does not yet support this
feature, the UpdateTimeToLive request will not be accepted, so although
the expiration-scanning threads may exist on the newer nodes, they will
not do anything because none of the tables can be marked as having
expiration enabled.

This patch does not contain documentation fixes - the documentation
still suggests that the Alternator TTL feature is experimental.
The documentation patch will come separately.

Fixes #12037

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12049
2022-11-24 17:21:39 +02:00
Tzach Livyatan
e96d31d654 docs: Add Authentication and Authorization as a prerequisite for Auditing.
Closes #12058
2022-11-24 17:21:23 +02:00
Kamil Braun
df731a5b0c test/pylib: artifact_registry: replace Awaitable type with Coroutine
The `cleanup_before_exit` method of `ArtifactRegistry` calls `close()`
on artifacts. mypy complains that `Awaitable` has no such method. In
fact, the `artifact` objects that we pass to `ArtifactRegistry`
(obtained by calling `async def` functions) do have a `close()` method,
and they are a particular case of `Awaitable`s, but in general not
all `Awaitable`s have `close()`.

Replace `Awaitable` with one of its subtypes: `Coroutine`. `Coroutine`s
have a `close()` method, and `async def` functions return objects of
this type. mypy no longer complains.
2022-11-24 16:17:05 +01:00
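The Awaitable/Coroutine distinction can be demonstrated with the stdlib ABCs (a small illustration of the typing fact the commit relies on, not pylib code):

```python
from collections.abc import Awaitable, Coroutine

async def make_artifact() -> None:
    pass

art = make_artifact()
# Objects returned by calling an async-def function are Coroutines,
# a subtype of Awaitable that additionally provides close().
assert isinstance(art, Coroutine) and isinstance(art, Awaitable)
art.close()  # fine: Coroutine defines close()
# The Awaitable ABC itself offers no close(), which is why mypy
# complained when artifacts were typed as Awaitable.
assert hasattr(Coroutine, "close") and not hasattr(Awaitable, "close")
```

So narrowing the annotation from `Awaitable` to `Coroutine` makes the `close()` call type-correct without changing runtime behavior.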
Nadav Har'El
c6bb64ab0e Merge 'Fix LWT insert crash if clustering key is null' from Gusev Petr
[PR](https://github.com/scylladb/scylladb/pull/9314) fixed a similar issue with regular insert statements
but missed the LWT code path.

It's expected behaviour of
`modification_statement::create_clustering_ranges` to return an
empty range in this case, since `possible_lhs_values` it
uses explicitly returns `empty_value_set` if it evaluates `rhs`
to null, and it has a comment about it (All NULL
comparisons fail; no column values match.) On the other hand,
all components of the primary key are required to be set,
this is checked at the prepare phase, in
`modification_statement::process_where_clause`. So the only
problem was `modification_statement::execute_with_condition`
was not expecting an empty `clustering_range` in case of
a null clustering key.

Also this patch contains a fix for the problem with wrong
column name in Scylla error messages. If `INSERT` or `DELETE`
statement is missing a non-last element of
the primary key, the error message generated contains
an invalid column name.

The problem occurs if the query contains a column with the list type,
otherwise
`statement_restrictions::process_clustering_columns_restrictions`
checks that all the components of the key are specified.

Closes #12047

* github.com:scylladb/scylladb:
  cql: refactor, inline modification_statement::validate_primary_key_restrictions
  cql: DELETE with null value for IN parameter should be forbidden
  cql: add column name to the error message in case of null primary key component
  cql: batch statement, inserting a row with a null key column should be forbidden
  cql: wrong column name in error messages
  modification_statement: fix LWT insert crash if clustering key is null
2022-11-24 16:15:27 +02:00
Nadav Har'El
6e9f739f19 Merge 'doc: add the links to the per-partition rate limit extension ' from Anna Stuchlik
Release 5.1 introduced a new CQL extension that applies to the CREATE TABLE and ALTER TABLE statements. The ScyllaDB-specific extensions are described on a separate page, so the CREATE TABLE and ALTER TABLE sections should include links to that page and section.

Note: CQL extensions are described with Markdown, while the Data Definition page is RST. Currently, there's no way to link from an RST page to an MD subsection (using a section heading or anchor), so a URL is used as a temporary solution.

Related: https://github.com/scylladb/scylladb/pull/9810

Closes #12070

* github.com:scylladb/scylladb:
  doc: move the info about per-partition rate limit for the ALTER TABLE statement from the paragraph to the list
  doc: add the links to the per-partition rate limit extension to the CREATE TABLE and ALTER TABLE sections
2022-11-24 16:03:30 +02:00
Anna Stuchlik
8049670772 doc: move the info about per-partition rate limit for the ALTER TABLE statement from the paragraph to the list 2022-11-24 14:42:11 +01:00
Anna Stuchlik
57a58b17a8 doc: enable publishing the documentation for version 5.1
Closes #12059
2022-11-24 13:55:25 +02:00
Benny Halevy
243dc2efce hints: host_filter: check topology::has_endpoint if enabled_selectively
Don't call get_datacenter(ep) without checking
first has_endpoint(ep) since the former may abort
on internal error if the endpoint is not listed
in topology.

Refs #11870

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12054
2022-11-24 14:33:06 +03:00
Anna Stuchlik
f158d31e24 doc: add the links to the per-partition rate limit extension to the CREATE TABLE and ALTER TABLE sections 2022-11-24 11:26:33 +01:00
Petr Gusev
b95305ae2b cql: refactor, inline modification_statement::validate_primary_key_restrictions
The function didn't add much value; it just forwarded to _restrictions.
Removed it and called _restrictions->validate_primary_key directly.
2022-11-23 21:56:12 +04:00
Petr Gusev
f9936bb0cb cql: DELETE with null value for IN parameter should be forbidden
If a DELETE statement contains an IN operator and the
parameter value for it is NULL, this should also trigger
an error. This is in line with how Cassandra
behaves in this case.
2022-11-23 21:39:23 +04:00
Petr Gusev
c123f94110 cql: add column name to the error message in case of null primary key component
It's more user-friendly and the error message
corresponds to what Cassandra provides in this case.
2022-11-23 21:39:23 +04:00
Petr Gusev
7730c4718e cql: batch statement, inserting a row with a null key column should be forbidden
Regular INSERT statements with null values for primary key
components have been rejected by Scylla since #9286 and #9314.
Batch statements missed a similar check; this patch
fixes it.

Fixes: #12060
2022-11-23 21:39:23 +04:00
Petr Gusev
89a5397d7c cql: wrong column name in error messages
If an INSERT or DELETE statement is missing a non-last element of
the primary key, the generated error message contains
an invalid column name.

The problem occurs if the query contains a column with the list type;
otherwise,
statement_restrictions::process_clustering_columns_restrictions
checks that all the components of the key are specified.

Fixes: #12046
2022-11-23 21:39:16 +04:00
Benny Halevy
996eac9569 topology: add get_datacenters
Returns an unordered set of datacenter names
to be used by network_topology_replication_strategy
and for ks_prop_defs.

The set is kept in sync with _dc_endpoints.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12023
2022-11-23 18:39:36 +02:00
Takuya ASADA
9acdd3af23 dist: drop deprecated AMI parameters on setup scripts
Since we moved all IaaS code to scylla-machine-image, we no longer need
the AMI variable in the sysconfig file or the --ami parameter in the setup
scripts, and we never used /etc/scylla/ami_disabled.
So let's drop all of them from the Scylla core.

Related with scylladb/scylla-machine-image#61

Closes #12043
2022-11-23 17:56:13 +02:00
Avi Kivity
7c66fdcad1 Merge 'Simplify sstable_directory configuration' from Pavel Emelyanov
When started, the sstable_directory is constructed with a bunch of booleans that control the way its process_sstable_dir method works. It's shorter and simpler to pass these booleans into the method directly, especially since another flag is already passed like this.

Closes #12005

* github.com:scylladb/scylladb:
  sstable_directory: Move all RAII booleans onto flags
  sstable_directory: Convert sort-sstables argument to flags struct
  sstable_directory: Drop default filter
2022-11-23 16:16:04 +02:00
Avi Kivity
70bfa708f5 storage_proxy: coroutinize change_hints_host_filter()
Trivial straight-line code, no performance implications.

Closes #12056
2022-11-23 15:34:24 +02:00
Jan Ciolek
84501851eb cql_pytest: ensure that where clauses like token(p) = 0 AND p = 0 are rejected
Scylla doesn't support combining restrictions
on token with other restrictions on partition key columns.

Some pieces of code depend on the assumption
that such combinations are rejected.
If they were allowed in the future,
these functions would silently start
returning wrong results, and we would
return invalid rows.

Add a test that will start failing once
this restriction is removed. It will
warn the developer to change the
functions that used to depend
on the assumption.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 13:09:22 +01:00
Botond Dénes
602dfdaf98 Merge 'Task manager top level repair tasks' from Aleksandra Martyniuk
The PR introduces top level repair tasks representing repair and node operations
performed with repair. The actions performed as a part of these operations are
moved to corresponding tasks' run methods.

Also a small change to repair module is added.

Closes #11869

* github.com:scylladb/scylladb:
  repair: define run for data_sync_repair_task_impl
  repair: add data_sync_repair_task_impl
  tasks: repair: add noexcept to task impl constructor
  repair: define run for user_requested_repair_task_impl
  repair: add user_requested_repair_task_impl
  repair: allow direct access to max_repair_memory_per_range
2022-11-23 14:02:30 +02:00
Jan Ciolek
338af848a8 cql3: expr: remove needless braces around switch cases
I originally put braces around the cases because
there were local variables that I didn't want
to be shadowed.

Now there are no variables so the braces
can be removed without any problems.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:30 +01:00
Jan Ciolek
e8a46d34c2 cql3: move evaluation of IS_NOT NULL to a separate function
When evaluating a binary operation with
operators like EQUAL, LESS_THAN, or IN,
the logic of the operation is put
in a separate function to keep things clean.

IS_NOT NULL is the only exception:
its evaluate implementation lives
right in the evaluate(binary_operator)
function.

It would be cleaner to have it in
a separate dedicated function,
so it's moved to one.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:30 +01:00
Jan Ciolek
b6cf6e6777 expr_test: test evaluating LIKE binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:29 +01:00
Jan Ciolek
6774272fd6 expr_test: test evaluating IS_NOT binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:29 +01:00
Jan Ciolek
e6c78bb6c2 expr_test: test evaluating CONTAINS_KEY binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:29 +01:00
Jan Ciolek
4f250609ab expr_test: test evaluating CONTAINS binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:29 +01:00
Jan Ciolek
3ca04cfcc2 expr_test: test evaluating IN binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:28 +01:00
Jan Ciolek
41f452b73f expr_test: test evaluating GTE binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:28 +01:00
Jan Ciolek
1fe9a9ce2a expr_test: test evaluating GT binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:28 +01:00
Jan Ciolek
ef2a77a3e0 expr_test: test evaluating LTE binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:28 +01:00
Jan Ciolek
3cbb2d44e8 expr_test: test evaluating LT binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:27 +01:00
Jan Ciolek
9feee70710 expr_test: test evaluating NEQ binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:27 +01:00
Jan Ciolek
e77dba0b0b expr_test: test evaluating EQ binary_operator
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:27 +01:00
Jan Ciolek
63a89776a1 cql3: expr properly handle null in is_one_of()
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:27 +01:00
Jan Ciolek
214dab9c77 cql3: expr properly handle null in like()
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:26 +01:00
Jan Ciolek
2ce9c95a9d cql3: expr properly handle null in contains_key()
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:26 +01:00
Jan Ciolek
336ad61aa3 cql3: expr properly handle null in contains()
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:26 +01:00
Jan Ciolek
e2223be1ec cql3: expr: properly handle null in limits()
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:26 +01:00
Jan Ciolek
d1abf2e168 cql3: expr: remove unneeded overload of limits()
There is a more general version of limits()
which takes expressions as both the lhs and rhs
arguments.

There is no need for a specialized overload.
This specialized overload takes a tuple_constructor
as lhs, but we call evaluate() on both sides
of a binary operator before checking equality,
so this won't be useful at all.

Having multiple functions increases the risk
that one of them has a bug, while giving
dubious benefit.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:25 +01:00
Jan Ciolek
0609a425e6 cql3: expr: properly handle null in equality operators
Expressions like:
123 = NULL
NULL = 123
NULL = NULL
NULL != 123

should be tolerated, but evaluate to NULL.
The current code assumes that a binary operator
can only evaluate to a boolean - true or false.

Now a binary operator can also evaluate to NULL.
This should happen in cases when one of the
operator's sides is NULL.

A special class is introduced to represent a value
that can be one of three things: true, false or null.
It's better than using std::optional<bool>,
because optional has implicit conversions to bool
that could cause confusion and bugs.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-23 12:44:22 +01:00
Aleksandra Martyniuk
a3016e652f repair: define run for data_sync_repair_task_impl
Operations performed as a part of data sync repair are moved
to data_sync_repair_task_impl run method.
2022-11-23 10:44:19 +01:00
Aleksandra Martyniuk
42239c8fed repair: add data_sync_repair_task_impl
Create a task spanning the whole node operation. Tasks of that type
are stored on shard 0.
2022-11-23 10:19:53 +01:00
Aleksandra Martyniuk
9e108a2490 tasks: repair: add noexcept to task impl constructor
Add noexcept to constructor of tasks::task_manager::task::impl
and inheriting classes.
2022-11-23 10:19:53 +01:00
Aleksandra Martyniuk
4a4e9c12df repair: define run for user_requested_repair_task_impl
Operations performed as a part of user requested repair are
moved to user_requested_repair_task_impl run method.
2022-11-23 10:19:51 +01:00
Aleksandra Martyniuk
3800b771fc repair: add user_requested_repair_task_impl
Create a task spanning the whole user-requested repair.
Tasks of that type are stored on shard 0.
2022-11-23 10:11:09 +01:00
Aleksandra Martyniuk
0256ede089 repair: allow direct access to max_repair_memory_per_range
Access specifier of constexpr value max_repair_memory_per_range
in repair_module is changed to public and its getter is deleted.
2022-11-23 10:11:09 +01:00
Anna Stuchlik
16e2b9acd4 Update docs/alternator/compatibility.md
Co-authored-by: Daniel Lohse <info@asapdesign.de>
2022-11-23 09:51:04 +01:00
Avi Kivity
d7310fd083 gdb: messaging: print tls servers too
Many systems have most traffic on tls servers, so print them.

Closes #12053
2022-11-23 07:59:02 +02:00
Avi Kivity
aec9faddb1 Merge 'storage_proxy: use erm topology' from Benny Halevy
When processing a query, we keep a pointer to an effective_replication_map.
In a couple of places we used the latest topology instead of the one held by the effective_replication_map
that the query uses, which might lead to inconsistencies if, for example, a node is removed from the topology by a decommission that happens concurrently with the query.

This change gets the topology& from the e_r_m in those cases.

Fixes #12050

Closes #12051

* github.com:scylladb/scylladb:
  storage_proxy: pass topology& to sort_endpoints_by_proximity
  storage_proxy: pass topology& to is_worth_merging_for_range_query
2022-11-22 20:04:41 +02:00
Botond Dénes
49ec7caf27 mutation_fragment_stream_validator: avoid allocation when stream is correct
Currently the ctor of said class always allocates, as it copies the
provided name string and creates a new name via format().
We want to avoid this now that the validator is used on the read path.
So defer creating the formatted name to when we actually want to log
something, which is either when the log level is debug or when an error is
found. We don't care about performance in either case, but we do care
about it on the happy path.
Further to the above, provide a constructor for string literal names;
when it is used, don't copy the name string, just save a view to it.

Refs: #11174

Closes #12042
2022-11-22 19:19:18 +02:00
Nadav Har'El
ce7c1a6c52 Merge 'alternator: fix wrong 'where' condition for GSI range key' from Marcin Maliszkiewicz
Contains fixes requested in the issue (and some tiny extras), together with analysis why they don't affect the users (see commit messages).

Fixes [ #11800](https://github.com/scylladb/scylladb/issues/11800)

Closes #11926

* github.com:scylladb/scylladb:
  alternator: add maybe_quote to secondary indexes 'where' condition
  test/alternator: correct xfail reason for test_gsi_backfill_empty_string
  test/alternator: correct indentation in test_lsi_describe
  alternator: fix wrong 'where' condition for GSI range key
2022-11-22 17:46:52 +02:00
Pavel Emelyanov
22133a3949 sstable_directory: Move all RAII booleans onto flags
There's a bunch of booleans that control the behavior of sstable
directory scanning. Currently they are described as verbose
bool_class<>-es and are passed in at sstable_directory construction time.

However, these are not used outside of .process_sstable_dir() method and
moving them onto recently added flags struct makes the code much
shorter (29 insertions(+), 121 deletions(-))

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-22 18:30:00 +03:00
Pavel Emelyanov
7ca5e143d7 sstable_directory: Convert sort-sstables argument to flags struct
The sstable_directory::process_sstable_dir() accepts a boolean to
control its behavior when collecting sstables. Turn this boolean into a
structure of flags. The intention is to extend this flags set in the
future (next patch).

This boolean is true all the time, but one place sets it to true in a
"verbose" manner, like this:

        bool sort_sstables_according_to_owner = true;
        process_sstable_dir(directory, sort_sstables_according_to_owner).get();

where the local variable is not used for anything else. Using designated
initializers solves the verbosity in a nicer manner.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-22 18:19:23 +03:00
Pavel Emelyanov
7c7017d726 sstable_directory: Drop default filter
It's used as default argument for .reshape() method, but callers specify
it explicitly. At the same time the filter is simple enough and is only
used in one place so that the caller can just use explicit lambda.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-22 18:19:23 +03:00
Jan Ciolek
6be142e3a0 cql3: expr: remove unneeded overload of equal()
There is a more general version of equal()
which takes expressions as both the lhs and rhs
arguments.

There is no need for a specialized overload.
This specialized overload takes a tuple_constructor
as lhs, but we call evaluate() on both sides
of a binary operator before checking equality,
so this won't be useful at all.

Having multiple functions increases the risk
that one of them has a bug, while giving
dubious benefit.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-22 14:28:10 +01:00
Benny Halevy
731a74c71f storage_proxy: pass topology& to sort_endpoints_by_proximity
It mustn't use the latest topology that may differ from the
one used by the query as it may be missing nodes
(e.g. after concurrent decommission).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-22 15:02:40 +02:00
Benny Halevy
ab3fc1e069 storage_proxy: pass topology& to is_worth_merging_for_range_query
It mustn't use the latest topology that may differ from the
one used by the query as it may be missing nodes
(e.g. after concurrent decommission).

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-22 15:01:58 +02:00
Petr Gusev
0d443dfd16 modification_statement: fix LWT insert crash if clustering key is null
PR #9314 fixed a similar issue with regular insert statements
but missed the LWT code path.

It's the expected behaviour of
modification_statement::create_clustering_ranges to return an
empty range in this case, since the possible_lhs_values it
uses explicitly returns empty_value_set when it evaluates the rhs
to null, and it has a comment about it ("All NULL
comparisons fail; no column values match."). On the other hand,
all components of the primary key are required to be set;
this is checked at the prepare phase, in
modification_statement::process_where_clause. So the only
problem was that modification_statement::execute_with_condition
was not expecting an empty clustering_range in the case of
a null clustering key.

Fixes: #11954
2022-11-22 16:45:16 +04:00
Marcin Maliszkiewicz
2bf2ffd3ed alternator: add maybe_quote to secondary indexes 'where' condition
This bug doesn't affect anything; the reason is described in the commit
'alternator: fix wrong 'where' condition for GSI range key'.

But it's theoretically correct to escape those key names, and
the difference can be observed via CQL's DESCRIBE TABLE. Before
the patch, the 'where' condition is missing one double quote in the variable
name, making it mismatched with the corresponding column name.
2022-11-22 11:08:23 +01:00
Marcin Maliszkiewicz
4389baf0d9 test/alternator: correct xfail reason for test_gsi_backfill_empty_string
Previously cited issue is closed already.
2022-11-22 11:08:23 +01:00
Marcin Maliszkiewicz
59eca20af1 test/alternator: correct indentation in test_lsi_describe
Otherwise the assert is not executed inside the loop; the lsi variable
was left bound to the last element in lsis, so only that element was checked.
2022-11-22 11:08:23 +01:00
Marcin Maliszkiewicz
d6d20134de alternator: fix wrong 'where' condition for GSI range key
This bug doesn't manifest in a visible way to the user.

Adding an index to an existing table via GlobalSecondaryIndexUpdates is not supported,
so we don't need to consider what could happen for empty values of the index range key.
After the index is added, the only interesting case a user can produce is omitting
the value (null or empty values are not allowed; see test_gsi_empty_value and
test_gsi_null_value).

In practice, regardless of the 'where' condition, the underlying materialized
view code skips row updates with missing keys, as per this comment:
'If one of the key columns is missing, set has_new_row = false
meaning that after the update there will be no view row'.

That's why the added test passes both before and after the patch.
But it's still useful to include it to exercise those code paths.

Fixes #11800
2022-11-22 11:08:23 +01:00
Nadav Har'El
ff617c6950 cql-pytest: translate a few small Cassandra tests
This patch includes a translation of several additional small test files
from Cassandra's CQL unit test directory cql3/validation/operations.

All tests included here pass on both Cassandra and Scylla, so they did
not discover any new Scylla bugs, but can be useful in the future as
regression tests.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12045
2022-11-22 07:54:13 +02:00
Botond Dénes
f3eecb47f6 Merge 'Optimize cleanup compaction get ranges for invalidation' from Benny Halevy
Take advantage of the facts that both the owned ranges
and the initial non_owned_ranges (derived from the set of sstables)
are deoverlapped and sorted by start token to turn
the calculation of the final non_owned_ranges from
quadratic to linear.

Fixes #11922

Closes #11903

* github.com:scylladb/scylladb:
  dht: optimize subtract_ranges
  compaction: refactor dht::subtract_ranges out of get_ranges_for_invalidation
  compaction_manager: needs_cleanup: get first/last tokens from sstable decorated keys
2022-11-22 06:45:01 +02:00
Jan Ciolek
a1407ef576 cql3: expr: use evaluate(binary_operator) in is_satisfied_by
is_satisfied_by has to check if a binary_operator is satisfied
by some values. It used to be impossible to evaluate
a binary_operator, so is_satisfied_by had code to check
whether it is satisfied for a limited number of cases
occurring in filtering queries.

Now evaluate(binary_operator) has been implemented
and is_satisfied_by can use it to check if a binary_operator
evaluates to true.
This is cleaner and reduces code duplication.
Additionally, cql tests will exercise the new evaluate() implementation.

There is one special case with token().
When is_satisfied_by sees a restriction on token
it assumes that it's satisfied because it's
sure that these token restrictions were used
to generate partition ranges.

I had to leave this special case in because it's impossible
to evaluate(token). Once this is implemented I will remove
the special case because it's risky and prone to cause
bugs.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 20:40:06 +01:00
Jan Ciolek
9c4889ecc3 cql3: expr: handle IS NOT NULL when evaluating binary_operator
The code to evaluate binary operators
was copied from is_satisfied_by.
is_satisfied_by wasn't able to evaluate
IS NOT NULL restrictions, so when such a restriction
was encountered it threw an exception.

Implement proper handling for IS NOT NULL binary operators.

The switch ensures that all variants of oper_t are handled,
otherwise there would be a compilation error.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 20:40:00 +01:00
Avi Kivity
bf2e54ff85 Merge 'Move deletion log code to sstable_directory.cc' from Pavel Emelyanov
In order to support different storage kinds for sstable files (e.g. -- s3) it's needed to localize all the places that manipulate files on a POSIX filesystem so that custom storage could implement them in its own way. This set moves the deletion log manipulations to the sstable_directory.cc, which already "knows" that it works over a directory.

Closes #12020

* github.com:scylladb/scylladb:
  sstables: Delete log file in replay_pending_delete_log()
  sstables: Move deletion log manipulations to sstable_directory.cc
  sstables: Open-code delete_sstables() call
  sstables: Use fs::path in replay_pending_delete_log()
  sstables: Indentation fix after previous patch
  sstables: Coroutinize replay_pending_delete_log
  sstables: Read pending delete log with one line helper
  sstables: Dont write pending log with file_writer
2022-11-21 21:22:59 +02:00
Jan Ciolek
b4cc92216b cql3: expr: make it possible to evaluate binary_operator
evaluate() takes an expression and evaluates it
to a constant value. It wasn't possible to evaluate
binary operators before, so this support is added.

The code is based on is_satisfied_by,
which is currently used to check
whether a binary operator evaluates
to true or false.

It looks like is_satisfied_by and evaluate()
do pretty much the same thing; one could be
implemented using the other.
In the future they might get merged
into a single function.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 17:48:23 +01:00
Jan Ciolek
8d81eaa68f cql3: expr: accept expression as lhs argument to like()
like() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 16:33:18 +01:00
Jan Ciolek
b1a12686dc cql3: expr: accept expression as lhs in contains_key
contains_key() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 16:33:02 +01:00
Jan Ciolek
79cd9cd956 cql3: expr: accept expression as lhs argument to contains()
contains() used to only accept column_value as the lhs
to evaluate. Changed it to accept any generic expression.
This will allow evaluating a more diverse set of
binary operators.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-21 16:32:44 +01:00
Benny Halevy
57ff3f240f dht: optimize subtract_ranges
Take advantage of the fact that both ranges and
ranges_to_subtract are deoverlapped and sorted by
start token to reduce the calculation complexity from
quadratic to linear.

Fixes #11922

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-21 15:48:28 +02:00
Benny Halevy
8b81635d95 compaction: refactor dht::subtract_ranges out of get_ranges_for_invalidation
The algorithm is generic and can be used elsewhere.

Add a unit test for the function before it gets
optimized in the following patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-21 15:48:26 +02:00
Benny Halevy
7c6f60ae72 compaction_manager: needs_cleanup: get first/last tokens from sstable decorated keys
Currently, the function is inefficient in two ways:
1. unnecessary copy of first/last keys to automatic variables
2. redecorating the partition keys with the schema passed to
   needs_cleanup.

We can just use the tokens from the sstable first/last decorated keys.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-21 15:44:32 +02:00
Pavel Emelyanov
2f9b7931af sstables: Delete log file in replay_pending_delete_log()
It's natural that the replayer cleans up after itself.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:16:22 +03:00
Pavel Emelyanov
bdc47b7717 sstables: Move deletion log manipulations to sstable_directory.cc
The deletion log concept uses the fact that files are on a POSIX
filesystem. Support for another storage type will have to reimplement
this place, so keep the FS-specific code in _directory.cc file.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:16:21 +03:00
Pavel Emelyanov
865c51c6cf sstables: Open-code delete_sstables() call
It's not used by any other code, and using it requires the caller to
transform TOC file names by prepending the sstable directory to them. Things
get shorter and simpler by merging the helper code into the caller.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:25 +03:00
Pavel Emelyanov
a61c96a627 sstables: Use fs::path in replay_pending_delete_log()
It's called by code that has fs::path at hand and internally uses
helpers that need fs::path too, so there's no need to convert it back and forth.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:25 +03:00
Pavel Emelyanov
f5684bcaf0 sstables: Indentation fix after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:25 +03:00
Pavel Emelyanov
85a73ca9c6 sstables: Coroutinize replay_pending_delete_log
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:25 +03:00
Pavel Emelyanov
6f3fd94162 sstables: Read pending delete log with one line helper
Seastar has recently gained one.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:25 +03:00
Pavel Emelyanov
2dedf4d03a sstables: Dont write pending log with file_writer
It's a wrapper over output_stream with offset tracking, and the tracking
is not needed to generate a log file. As a bonus of switching back, we
get the stream.write(sstring) sugar.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-21 13:15:24 +03:00
Botond Dénes
2d4439a739 Merge 'doc: add a troubleshooting article about the missing configuration files' from Anna Stuchlik
Fix https://github.com/scylladb/scylladb/issues/11598

This PR adds the troubleshooting article submitted by @syuu1228 in the deprecated _scylla-docs_ repo, with https://github.com/scylladb/scylla-docs/pull/4152.
I copied and reorganized the content and rewrote it a little according to the RST guidelines so that the page renders correctly.

@syuu1228 Could you review this PR to make sure that my changes didn't distort the original meaning?

Closes #11626

* github.com:scylladb/scylladb:
  doc: apply the feedback to improve clarity
  doc: add the link to the new Troubleshooting section and replace Scylla with ScyllaDB
  doc: add the new page to the toctree
  doc: add a troubleshooting article about the missing configuration files
2022-11-21 12:02:31 +02:00
Kamil Braun
135eb4a041 test.py: prepare for adding extra config from test when creating servers
We will use this for replace operations to pass the IP of replaced node.
2022-11-21 10:57:03 +01:00
Kamil Braun
ac91e9d8be test/pylib: manager_client: convert add_server to use put_json
We shall soon pass some JSON data into these requests.
2022-11-21 10:57:03 +01:00
Kamil Braun
82eb9af80d test/pylib: rest_client: allow returning JSON data from put_json
We'll use `put_json` for requests which want to pass JSON data into the
call and also return JSON.
2022-11-21 10:57:03 +01:00
Kamil Braun
4fef2d099b test/pylib: scylla_cluster: don't import from manager_client
There's a logical dependency from `manager_client` to `scylla_cluster`
(`ManagerClient` defined in `manager_client` talks to
`ScyllaClusterManager` defined in `scylla_cluster` over RPC). There is
no such dependency in the other way. Do not introduce it accidentally.

We can import these types from the `internal_types` module.
2022-11-21 10:57:03 +01:00
Nadav Har'El
757d2a4c02 test/alternator: un-xfail a test which passes on modern Python
We had an xfailing test that reproduced a case where Alternator tried
to report an error when the request was too long, but the boto library
didn't see this error and threw a "Broken Pipe" error instead. It turns
out that this wasn't a Scylla bug but rather a bug in urllib3, which
overzealously reported a "Broken Pipe" instead of trying to read the
server's response. It turns out this issue was already fixed in
   https://github.com/urllib3/urllib3/pull/1524

and now, on modern installations, the test that used to fail now passes
and reports "XPASS".

So in this patch we remove the "xfail" tag, and skip the test if
running an old version of urllib3.

Fixes #8195

Closes #12038
2022-11-21 08:10:10 +02:00
Botond Dénes
ffc3697f2f Merge 'storage_service api: handle dropped tables' from Benny Halevy
Gracefully skip tables that were removed in the background.

Fixes #12007

Closes #12013

* github.com:scylladb/scylladb:
  api: storage_service: fixup indentation
  api: storage_service: add run_on_existing_tables
  api: storage_service: add parse_table_infos
  api: storage_service: log errors from compaction related handlers
  api: storage_service: coroutinize compaction related handlers
2022-11-21 07:56:27 +02:00
Avi Kivity
994603171b Merge 'Add validator to the mutation compactor' from Botond Dénes
Fragment reordering and fragment dropping bugs have been plaguing us since forever. To fight them we added a validator to the sstable write path to prevent really messed up sstables from being written.
This series adds validation to the mutation compactor. This will cover reads and compaction among others, hopefully ridding us of such bugs on the read path too.
This series fixes some benign-looking issues found by unit tests after the validator was added -- although how benign a producer emitting two partition-ends is depends entirely on how the consumer reacts to it, so no such bug is actually benign.

Fixes: https://github.com/scylladb/scylladb/issues/11174

Closes #11532

* github.com:scylladb/scylladb:
  mutation_compactor: add validator
  mutation_fragment_stream_validator: add a 'none' validation level
  test/boost/mutation_query_test: test_partition_limit: sort input data
  querier: consume_page(): use partition_start as the sentinel value
  treewide: use ::for_partition_end() instead of ::end_of_partition_tag_t{}
  treewide: use ::for_partition_start() instead of ::partition_start_tag_t{}
  position_in_partition: add for_partition_{start,end}()
2022-11-20 20:33:26 +02:00
Avi Kivity
779b01106d Merge 'cql3: expr: add unit tests for prepare_expression' from Jan Ciołek
Adds unit tests for the function `expr::prepare_expression`.

Three minor bugs were found by these tests, all fixed in this PR.
1. When preparing a map, the type for the tuple constructor was taken from an unprepared tuple, which has `nullptr` as its type.
2. Preparing an empty nonfrozen list or set resulted in `null`, but preparing a map didn't. Fixed this inconsistency.
3. Preparing a `bind_variable` with a `nullptr` receiver was allowed. The `bind_variable` ended up with a `nullptr` type, which is incorrect. Changed it to throw an exception.

Closes #11941

* github.com:scylladb/scylladb:
  test preparing expr::usertype_constructor
  expr_test: test that prepare_expression checks style_type of collection_constructor
  expr_test: test preparing expr::collection_constructor for map
  prepare_expr: make preparing nonfrozen empty maps return null
  prepare_expr: fix a bug in map_prepare_expression
  expr_test: test preparing expr::collection_constructor for set
  expr_test: test preparing expr::collection_constructor for list
  expr_test: test preparing expr::tuple_constructor
  expr_test: test preparing expr::untyped_constant
  expr_test_utils: add make_bigint_raw/const
  expr_test_utils: add make_tinyint_raw/const
  expr_test: test preparing expr::bind_variable
  cql3: prepare_expr: forbid preparing bind_variable without a receiver
  expr_test: test preparing expr::null
  expr_test: test preparing expr::cast
  expr_test_utils: add make_receiver
  expr_test_utils: add make_smallint_raw/const
  expr_test: test preparing expr::token
  expr_test: test preparing expr::subscript
  expr_test: test preparing expr::column_value
  expr_test: test preparing expr::unresolved_identifier
  expr_test_utils: mock data_dictionary::database
2022-11-20 20:03:54 +02:00
Nadav Har'El
2ba8b8d625 test/cql-pytest: remove "xfail" from passing test testIndexOnFrozenCollectionOfUDT
We had a test that used to fail because of issue #8745. But this issue
was already fixed, and we forgot to remove the "xfail" marker. The test
now passes, so let's remove the xfail marker.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12039
2022-11-20 19:54:59 +02:00
Avi Kivity
40f61db120 Merge 'docs: describe the Raft upgrade and recovery procedures' from Kamil Braun
Add new guide for upgrading 5.1 to 5.2.

In this new upgrade doc, include additional steps for enabling
Raft using the `consistent_cluster_management` flag. Note that we don't
have this flag yet but it's planned to replace the experimental flag in
5.2.

In the "Raft in ScyllaDB" document, add sections about:
- enabling Raft in existing clusters in Scylla 5.2,
- verifying that the internal Raft upgrade procedure finishes
  successfully,
- recovering from a stuck Raft upgrade procedure or from a majority loss
  situation.

Fix some problems in the documentation, e.g. it is not possible to
enable Raft in an existing cluster in 5.0, but the documentation claimed
that it is.

Follow-up items:
- if we decide for a different name for `consistent_cluster_management`,
  use that name in the docs instead
- update the warnings in Scylla to link to the Raft doc
- mention Enterprise versions once we know the numbers
- update the appropriate upgrade docs for Enterprise versions
  once they exist

Closes #11910

* github.com:scylladb/scylladb:
  docs: describe the Raft upgrade and recovery procedures
  docs: add upgrade guide 5.1 -> 5.2
2022-11-20 19:00:23 +02:00
Avi Kivity
15ee8cfc05 Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to make sure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for either new waiters or inactive readers to be added without the fiber acting on them. Since it still held onto the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, yet still have waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection of the inactive/waiter contradiction.

Fixes: #11923
Refs: #11770

Closes #12026

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
  reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
  reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach
2022-11-20 18:51:34 +02:00
Avi Kivity
895d721d5e Merge 'scylla-sstable: data-dump improvements' from Botond Dénes
This series contains a mixed bag of improvements to `scylla sstable dump-data`. These improvements are mostly aimed at making the json output clearer, getting rid of any ambiguities.

Closes #12030

* github.com:scylladb/scylladb:
  tools/scylla-sstable: traverse sstables in argument order
  tools/scylla-sstable: dump-data docs: s/clustering_fragments/clustering_elements
  tools/scylla-sstable: dump-data/json: use Null instead of "<unknown>"
  tools/scylla-sstable: dump-data/json: use more uniform format for collections
  tools/scylla-sstable: dump-data/json: make cells easier to parse
2022-11-20 17:02:27 +02:00
Avi Kivity
2f9c53fbe4 Merge 'test/pylib: scylla_cluster: use server ID to name workdir and log file, not IP address' from Kamil Braun
Since recently the framework uses a separate set of unique IDs to
identify servers, but the log file and workdir are still named using the
last part of the IP address.

This is confusing: the test logs sometimes don't provide the IP addr
(only the ID), and even if they do, the reader of the test log may not
know that they need to look at the last part of the IP to find the
node's log/workdir.

Also, using the ID will be necessary if we want to reuse IP addresses (e.g.
during node replace, or simply not to run out of IP addresses during
testing).

So use the ID instead to name the workdir and log file.

Also, when starting a test case, print the used cluster. This will make
it easier to map server IDs to their IP addresses when browsing through
the test logs.

Closes #12018

* github.com:scylladb/scylladb:
  test/pylib: manager_client: print used cluster when starting test case
  test/pylib: scylla_cluster: use server ID to name workdir and log file, not IP address
2022-11-20 16:56:19 +02:00
Avi Kivity
14218d82d6 Update tools/java submodule (serverless)
* tools/java caf754f243...874e2d529b (2):
  > Add Scylla Cloud serverless support
  > Switch cqlsh to use scylla-driver
2022-11-20 16:41:36 +02:00
Tomasz Grabiec
c8e983b4aa test: flat_mutation_reader_assertions: Use fatal BOOST_REQUIRE_EQUAL instead of BOOST_CHECK_EQUAL
BOOST_CHECK_EQUAL is a weaker form of assertion, it reports an error
and will cause the test case to fail but continues. This makes the
test harder to debug because there's no obvious way to catch the
failure in GDB and the test output is also flooded with things which
happen after the failed assertion.

Message-Id: <20221119171855.2240225-1-tgrabiec@scylladb.com>
2022-11-20 16:14:26 +02:00
Nadav Har'El
2d2034ea28 Merge 'cql3: don't ignore other restrictions when a multi column restriction is present during filtering' from Jan Ciołek
When filtering with a multi-column restriction present, all other restrictions were ignored.
So a query like:
`SELECT * FROM WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.

This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)

When multi-column restrictions were detected, the code checked whether they were satisfied and returned immediately.
This is fixed by returning early only when these restrictions are not satisfied. When they are satisfied, the other restrictions are checked as well to ensure all of them hold.

This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi-column and regular restrictions, so this approach was correct.

Fixes: #6200
Fixes: #12014

Closes #12031

* github.com:scylladb/scylladb:
  cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
  boost/restrictions-test: uncomment part of the test that passes now
  cql-pytest: enable test for filtering combined multi column and regular column restrictions
  cql3: don't ignore other restrictions when a multi column restriction is present during filtering
2022-11-20 11:50:38 +02:00
Benny Halevy
ec5707a4a8 api: storage_service: fixup indentation 2022-11-20 09:14:45 +02:00
Benny Halevy
cc63719782 api: storage_service: add run_on_existing_tables
Gracefully skip tables that were removed
in the background.

Fixes #12007

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-20 09:14:29 +02:00
Benny Halevy
9ef9b9d1d9 api: storage_service: add parse_table_infos
The table UUIDs are the same on all shards
so we might as well get them on shard 0
(as we already do) and reuse them on other shards.

It is more efficient and accurate to look up the table
later on each shard using its uuid rather than
its name.  If the table was dropped and recreated
using the same name in the background, the new
table will have a new uuid, and so the api function
does not apply to it anymore.

A following change will handle the no_such_column_family
cases.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-20 09:14:21 +02:00
Benny Halevy
9b4a9b2772 api: storage_service: log errors from compaction related handlers
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-20 09:03:25 +02:00
Benny Halevy
a47f96bc05 api: storage_service: coroutinize compaction related handlers
Done before we improve the parsing of table
lists and the handling of no_such_column_family
errors.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-20 09:03:25 +02:00
Jan Ciolek
286f182a8c cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
In issue #12014 a user has encountered an instance of #6200.
When filtering a WHERE clause which contained
both multi-column and regular restrictions,
the regular restrictions were ignored.

Add a test which reproduces the issue
using a reproducer provided by the user.

This problem is tested in another similar test,
but this one reproduces the issue in the exact
way it was found by the user.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-18 15:27:42 +01:00
Jan Ciolek
63fb2612c3 boost/restrictions-test: uncomment part of the test that passes now
A part of the test was commented out due to #6200.
Now #6200 has been fixed and it can be uncommented.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-18 15:14:32 +01:00
Jan Ciolek
99e1032e34 cql-pytest: enable test for filtering combined multi column and regular column restrictions
The test test_multi_column_restrictions_and_filtering was marked as xfail
because issue #6200 wasn't fixed. Now that filtering with
multi-column and other restrictions together has been fixed,
the test passes.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-18 15:14:32 +01:00
Jan Ciolek
b974d4adfb cql3: don't ignore other restrictions when a multi column restriction is present during filtering
When filtering with a multi-column restriction present, all other restrictions were ignored.
So a query like:
`SELECT * FROM WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`

would ignore the restriction `regular_col = 0`.

This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)

When multi-column restrictions were detected,
the code checked whether they were satisfied
and returned immediately.
This is fixed by returning early only when these restrictions
are not satisfied. When they are satisfied, the other
restrictions are checked as well to ensure all
of them hold.

This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi-column
and regular restrictions, so this approach was correct.

Fixes: #6200
Fixes: #12014

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-18 15:14:16 +01:00
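The shape of this fix translates directly into a toy model. Restrictions are modeled as plain Python predicates; the names `row_passes_buggy`/`row_passes_fixed` are illustrative, not the actual CQL filtering code:

```python
# Sketch of the fix: instead of returning as soon as the multi-column
# restrictions are evaluated, only return early when they are NOT
# satisfied, and otherwise fall through to check the remaining
# restrictions too.

def row_passes_buggy(row, multi_column_checks, other_checks):
    if multi_column_checks:
        # Old behavior: the multi-column checks decided everything;
        # other_checks were silently ignored.
        return all(check(row) for check in multi_column_checks)
    return all(check(row) for check in other_checks)

def row_passes_fixed(row, multi_column_checks, other_checks):
    if multi_column_checks and not all(check(row) for check in multi_column_checks):
        return False  # early return only on failure
    return all(check(row) for check in other_checks)

# (ck1, ck2) < (0, 0) AND regular_col = 0
multi = [lambda r: (r["ck1"], r["ck2"]) < (0, 0)]
other = [lambda r: r["regular_col"] == 0]
row = {"ck1": -1, "ck2": 0, "regular_col": 7}  # fails regular_col = 0

buggy = row_passes_buggy(row, multi, other)
fixed = row_passes_fixed(row, multi, other)
```

With the old logic the row is wrongly returned because the multi-column check passes; with the fix it is filtered out by the regular-column restriction.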
Botond Dénes
30597f17ed tools/scylla-sstable: traverse sstables in argument order
In the order the user passed them on the command-line.
2022-11-18 15:58:37 +02:00
Botond Dénes
e337b25aa9 tools/scylla-sstable: dump-data docs: s/clustering_fragments/clustering_elements
The usage of clustering_fragments is a typo; the output contains clustering_elements.
2022-11-18 15:58:36 +02:00
Botond Dénes
c39408b394 tools/scylla-sstable: dump-data/json: use Null instead of "<unknown>"
The currently used "<unknown>" marker for invalid values/types is
indistinguishable from a normal value in some cases. Use the much more
distinct and unique json Null instead.
2022-11-18 15:58:36 +02:00
Botond Dénes
1dfceb5716 tools/scylla-sstable: dump-data/json: use more uniform format for collections
Instead of trying to be clever and switching the output on the type of
collection, use the same format always: a list of objects, where the
object has a key and a value attribute, containing the respective
collection item's key and value. This makes processing much easier for
machines (and humans too since the previous system wasn't working well).
2022-11-18 15:58:36 +02:00
Botond Dénes
f89acc8df7 tools/scylla-sstable: dump-data/json: make cells easier to parse
There are several slightly different cell types in scylla: regular
cells, collection cells (frozen and non-frozen) and counter cells
(update and shards). In C++ code the type of the cell is always
available for code wishing to make out exactly what kind of cell a cell
is. In the JSON output of the dump-data this is currently really hard to
do as there is not enough information to disambiguate all the different
cell types. We wish to make the JSON output self-sufficient so in this
patch we introduce a "type" field which contains one of:
* regular
* counter-update
* counter-shards
* frozen-collection
* collection

Furthermore, we bring the different types closer by also printing the
counter shards under the 'value' key, not under the 'shards' key as
before. The separate 'shards' key is no longer needed to disambiguate.
The documentation and the write operation are also updated to reflect
the changes.
2022-11-18 15:58:36 +02:00
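The disambiguated cell format described above can be illustrated with a minimal sketch. The exact field set here is an assumption for illustration, not the authoritative scylla-sstable output schema:

```python
# Illustrative shape of the disambiguated JSON cell output: every cell
# carries a "type" field, and counter shards are printed under "value"
# rather than a separate "shards" key.

import json

CELL_TYPES = ("regular", "counter-update", "counter-shards",
              "frozen-collection", "collection")

def dump_cell(kind, value):
    assert kind in CELL_TYPES
    return {"type": kind, "value": value}

regular = dump_cell("regular", "42")
counter = dump_cell("counter-shards", [{"id": "a", "value": 1, "clock": 3}])
encoded = json.dumps(counter, sort_keys=True)
```

With a "type" key on every cell, a consumer no longer has to guess the cell kind from which other keys happen to be present.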
Petr Gusev
41629e97de test.py: handle --markers parameter
Some tests may take longer than a few seconds to run. We want to
mark such tests in some way, so that we can run them selectively.
This patch proposes to use pytest markers for this. The markers
from the test.py command line are passed to pytest
as is via the -m parameter.

By default, the marker filter is not applied and all tests
will be run without exception. To exclude e.g. slow tests
you can write --markers 'not slow'.

The --markers parameter is currently only supported
by Python tests, other tests ignore it. We intend to
support this parameter for other types of tests in the future.

Another possible improvement is not to run suites for which
all tests have been filtered out by markers. The markers are
currently handled by pytest, which means that the logic in
test.py (e.g., running a scylla test cluster) will be run
for such suites.

Closes #11713
2022-11-18 12:36:20 +01:00
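On the pytest side this looks like tests carrying markers (e.g. `@pytest.mark.slow`) and a marker expression such as `'not slow'` forwarded to pytest's `-m` option. A minimal evaluator for the simple `X` / `not X` cases — pytest itself supports much richer boolean expressions:

```python
# Toy marker-expression evaluator mirroring how --markers 'not slow'
# selects tests; real pytest -m expressions also support and/or/parens.

def selected(markers, expression):
    """Return True if a test with the given markers matches the expression."""
    expression = expression.strip()
    if not expression:
        return True  # no filter: run everything
    if expression.startswith("not "):
        return expression[4:].strip() not in markers
    return expression in markers

run_all = selected({"slow"}, "")
skip_slow = selected({"slow"}, "not slow")
only_slow = selected({"slow"}, "slow")
```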
Avi Kivity
7da12c64bc Revert "Revert "Merge 'cql3: select_statement: coroutinize indexed_table_select_statement::do_execute_base_query()' from Avi Kivity""
This reverts commit 22f13e7ca3, and reinstates
commit df8e1da8b2 ("Merge 'cql3: select_statement:
coroutinize indexed_table_select_statement::do_execute_base_query()' from
Avi Kivity"). The original commit was reverted due to failures in debug
mode on aarch64, but after commit 224a2877b9
("build: disable -Og in debug mode to avoid coroutine asan breakage"), it
works again.

Closes #12021
2022-11-18 12:44:00 +02:00
Kamil Braun
d7649a86c4 Merge 'Build up to support of dynamic IP address changes in Raft' from Konstantin Osipov
We plan to stop storing IP addresses in Raft configuration, and instead
use the information disseminated through gossip to locate Raft peers.

Implement patches that are building up to that:
* improve Raft API of configuration change notifications
* disseminate raft host id in Gossip
* avoid using Raft addresses from the Raft configuration, and instead
  consistently use the translation layer between raft server id <-> IP
  address

Closes #11953

* github.com:scylladb/scylladb:
  raft: persist the initial raft address map
  raft: (upgrade) do not use IP addresses from Raft config
  raft: (and gossip) begin gossiping raft server ids
  raft: change the API of conf change notifications
2022-11-18 11:38:19 +01:00
Botond Dénes
437fcdeeda Merge 'Make use of enum_set in directory lister' from Pavel Emelyanov
The lister accepts sort of a filter -- what kind of entries to list: regular files, directories, or both. It currently uses unordered_set, but enum_set is shorter and better describes the intent.

Closes #12017

* github.com:scylladb/scylladb:
  lister: Make lister::dir_entry_types an enum_set
  database: Avoid useless local variable
2022-11-18 12:15:26 +02:00
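The enum_set idea transposes naturally to Python's `enum.Flag`, which is used here only as an analogy for the C++ change: a filter over a small fixed set of entry kinds packs into a single integer instead of a hash set, and reads clearly at call sites:

```python
# Analogy for lister::dir_entry_types as an enum_set: a bitmask of
# entry kinds instead of an unordered_set of enum values.

from enum import Flag, auto

class DirEntryType(Flag):
    regular = auto()
    directory = auto()

def matches(entry_type, wanted):
    # True if the entry's kind is among the wanted kinds.
    return bool(entry_type & wanted)

both = DirEntryType.regular | DirEntryType.directory
dirs_only = DirEntryType.directory

match_dir = matches(DirEntryType.directory, both)
match_reg = matches(DirEntryType.regular, dirs_only)
```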
Botond Dénes
b39ca29b3c reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
The semaphore should admit readers as soon as it can. So at any point in
time there should be either no waiters, or the semaphore shouldn't be
able to admit new reads. Otherwise something went wrong. Detect this
when queuing up reads and dump the diagnostics if detected.
Even though tests should ensure this never happens, recently we've
seen a race between eviction and enqueuing produce such situations.
This is very hard to write tests for, so add built-in detection and
protection instead. Detecting this is very cheap anyway.
2022-11-18 11:35:47 +02:00
Botond Dénes
ca7014ddb8 reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
Said method has a protection against concurrent (recursive more like)
calls to itself, by setting a flag `_evicting` and returning early if
this flag is set. The evicting loop however has at least one preemption
point between deciding there is nothing more to evict and resetting said
flag. This window provides an opportunity for new inactive reads or waiters
to be queued without this loop noticing, while also preventing any other
concurrent invocation from reacting to them.
Eliminate this by using repeat() instead of do_until() and setting
`_evicting = false` the moment the loop's run condition becomes false.
2022-11-18 11:35:47 +02:00
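The blind spot and its fix can be modeled with Python generators standing in for Seastar fibers, where `yield` marks a preemption point. This is a toy model of the control flow only, not the semaphore code:

```python
# Toy model: a flag guards against concurrent eviction fibers. The bug
# is a preemption point between "nothing left to evict" and clearing
# the flag; the fix clears the flag the moment the loop decides to stop.

class Semaphore:
    def __init__(self):
        self.evicting = False
        self.pending = 0  # inactive reads awaiting eviction

def evict_buggy(sem):
    if sem.evicting:
        return
    sem.evicting = True
    while sem.pending:
        sem.pending -= 1
    yield  # preemption point BEFORE releasing the flag (the blind spot)
    sem.evicting = False

def evict_fixed(sem):
    if sem.evicting:
        return
    sem.evicting = True
    while sem.pending:
        sem.pending -= 1
    sem.evicting = False  # released the instant we decide there is no work
    yield

def race(evict):
    sem = Semaphore()
    sem.pending = 1
    first = evict(sem)
    next(first, None)       # first fiber runs up to its preemption point
    sem.pending += 1        # new inactive read arrives in that window...
    for _ in evict(sem):    # ...and a second eviction fiber is kicked off
        pass
    for _ in first:         # first fiber finishes
        pass
    return sem.pending      # reads left un-evicted

leftover_buggy = race(evict_buggy)
leftover_fixed = race(evict_fixed)
```

In the buggy version the second fiber bails out because the flag is still set, so the newly arrived read is never evicted; in the fixed version the flag is already clear and the second fiber drains it.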
Botond Dénes
892f52c683 reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach
Currently this method detaches the inactive read from the handle and
notifies the permit, calls the notify handler if any and does some stat
bookkeeping. Extend it to do a complete detach: unlink the entry from
the inactive reads list and also cancel the ttl timer.
After this, all that is left to the caller is to destroy the entry.
This will prevent any recursive eviction from causing assertion failure.
Although recursive eviction shouldn't happen, it shouldn't trigger an
assert.
2022-11-18 11:35:43 +02:00
Pavel Emelyanov
a44ca06906 Merge 'token_metadata: Do not use topology info for is_member check' from Asias He
Since commit a980f94 (token_metadata: impl: keep the set of normal token owners as a member), we have a set, _normal_token_owners, which contains all the nodes in the ring.

We can use _normal_token_owners to check if a node is part of the ring directly instead of going through the _topology indirectly.

Fixes #11935

Closes #11936

* github.com:scylladb/scylladb:
  token_metadata: Rename is_member to is_normal_token_owner
  token_metadata: Add docs for is_member
  token_metadata: Do not use topology info for is_member check
  token_metadata: Check node is part of the topology instead of the ring
2022-11-18 11:54:07 +03:00
Asias He
4571fcf9e7 token_metadata: Rename is_member to is_normal_token_owner
The name is_normal_token_owner is clearer than is_member.
It reflects what the function really checks.
2022-11-18 09:29:20 +08:00
Asias He
965097cde5 token_metadata: Add docs for is_member
Make it clear that is_member checks whether a node is part of the token
ring and checks nothing else.
2022-11-18 09:28:56 +08:00
Asias He
a495b71858 token_metadata: Do not use topology info for is_member check
Since commit a980f94 (token_metadata: impl: keep the set of normal token
owners as a member), we have a set, _normal_token_owners, which contains
all the nodes in the ring.

We can use _normal_token_owners to check if a node is part of the ring
directly instead of going through the _topology indirectly.

Fixes #11935
2022-11-18 09:28:56 +08:00
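The distinction this series draws — ring membership ("normal token owner") versus mere presence in the topology — can be sketched with two sets. Names are illustrative stand-ins for the C++ members:

```python
# Toy model: a joining node is known to the topology but does not yet
# own tokens, so ring membership must consult the ring set directly.

class TokenMetadata:
    def __init__(self):
        self.normal_token_owners = set()  # nodes owning tokens in the ring
        self.topology_endpoints = set()   # nodes known to the topology

    def is_normal_token_owner(self, node):
        # Consult the ring directly, not the topology.
        return node in self.normal_token_owners

    def has_endpoint(self, node):
        return node in self.topology_endpoints

tm = TokenMetadata()
tm.topology_endpoints.add("10.0.0.2")  # joining node: in topology, no tokens yet
in_ring = tm.is_normal_token_owner("10.0.0.2")
in_topology = tm.has_endpoint("10.0.0.2")
```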
Asias He
f2ca790883 token_metadata: Check node is part of the topology instead of the ring
update_normal_tokens is the way to add a new node into the ring. We
should not require a new node to already be in the ring to be able to
add it to the ring. The current code works accidentally because
is_member is checking if a node is in the topology.

We should use _topology.has_endpoint to check if a node is part of the
topology explicitly.
2022-11-18 09:28:56 +08:00
Jan Ciolek
77d68153f1 test preparing expr::usertype_constructor
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:41:10 +01:00
Jan Ciolek
eb92fb4289 expr_test: test that prepare_expression checks style_type of collection_constructor
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:41:10 +01:00
Jan Ciolek
77c63a6b92 expr_test: test preparing expr::collection_constructor for map
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:41:09 +01:00
Jan Ciolek
db67ade778 prepare_expr: make preparing nonfrozen empty maps return null
In Scylla and Cassandra, inserting an empty collection
that is not frozen is interpreted as inserting a null value.

list_prepare_expression and set_prepare_expression
have an if which handles this behavior, but there
wasn't one in map_prepare_expression.

As a result, preparing an empty list or set would result in null,
but preparing an empty map wouldn't. This is inconsistent;
it's better to return null in all cases of empty nonfrozen
collections.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:41:09 +01:00
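The now-consistent rule fits in a few lines of illustrative Python (`prepare_collection` is a hypothetical stand-in, with `None` playing the role of null):

```python
# Sketch: preparing an empty NON-frozen list, set, or map yields null;
# frozen empty collections keep their empty value.

def prepare_collection(value, frozen):
    if not frozen and len(value) == 0:
        return None  # empty non-frozen collection == inserting null
    return value

empty_list = prepare_collection([], frozen=False)
empty_map = prepare_collection({}, frozen=False)       # now matches list/set
frozen_empty_map = prepare_collection({}, frozen=True)
nonempty = prepare_collection({"k": "v"}, frozen=False)
```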
Jan Ciolek
da71f9b50b prepare_expr: fix a bug in map_prepare_expression
map_prepare_expression takes a collection_constructor
of unprepared items and prepares it.

Elements of a map collection_constructor are tuples (key and value).

map_prepare_expression creates a prepared collection_constructor
by preparing each tuple and adding it to the result.

During this preparation it needs to set the type of the tuple.
There was a bug here - it took the type from unprepared
tuple_constructor and assigned it to the prepared one.
An unprepared tuple_constructor doesn't have a type
so it ended up assigning nullptr.

Instead, it should create a tuple_type_impl instance
by looking at the types of the map's keys and values,
and use this tuple_type_impl as the type of the prepared tuples.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:35:04 +01:00
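The bug and fix reduce to where the tuple's type comes from. A toy model, with hypothetical names and tuples of strings standing in for the C++ type objects:

```python
# Sketch: build the prepared (key, value) tuple's type from the map's
# key and value types, instead of copying it from the unprepared
# tuple constructor, which carries no type (None here, nullptr in C++).

def make_tuple_type(key_type, value_type):
    return ("tuple", key_type, value_type)

def prepare_map_entry_buggy(entry, unprepared_tuple_type):
    # Old behavior: take the type from the unprepared tuple (always None).
    return {"value": entry, "type": unprepared_tuple_type}

def prepare_map_entry_fixed(entry, key_type, value_type):
    return {"value": entry, "type": make_tuple_type(key_type, value_type)}

buggy = prepare_map_entry_buggy(("k", 1), None)
fixed = prepare_map_entry_fixed(("k", 1), "text", "int")
```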
Jan Ciolek
a656fdfe9a expr_test: test preparing expr::collection_constructor for set
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:37 +01:00
Jan Ciolek
76f587cfe7 expr_test: test preparing expr::collection_constructor for list
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:37 +01:00
Jan Ciolek
44b55e6caf expr_test: test preparing expr::tuple_constructor
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:37 +01:00
Jan Ciolek
265100a638 expr_test: test preparing expr::untyped_constant
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:37 +01:00
Jan Ciolek
f6b9100cd2 expr_test_utils: add make_bigint_raw/const
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:37 +01:00
Jan Ciolek
f9ff131f86 expr_test_utils: add make_tinyint_raw/const
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:36 +01:00
Jan Ciolek
76b6161386 expr_test: test preparing expr::bind_variable
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:36 +01:00
Jan Ciolek
4882724066 cql3: prepare_expr: forbid preparing bind_variable without a receiver
prepare_expression treats the receiver as an optional argument:
it can be set to nullptr, and the preparation should
still succeed when it's possible to infer the type of an expression.

Preparing a bind_variable, however, requires the receiver to be present,
because it doesn't contain any information about the type
of the bound value.

Added a check that the receiver is present.
Allowing a bind_variable to be prepared without
a receiver present was a bug.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 20:22:36 +01:00
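The check itself is simple; here is an illustrative sketch (`InvalidRequest` and `prepare_bind_variable` are hypothetical names, not the actual cql3 API):

```python
# Sketch: a bind variable carries no type of its own, so a receiver is
# mandatory; other expression kinds may still infer a type without one.

class InvalidRequest(Exception):
    pass

def prepare_bind_variable(receiver):
    if receiver is None:
        raise InvalidRequest("cannot infer type of bind variable without a receiver")
    return {"kind": "bind_variable", "type": receiver["type"]}

prepared = prepare_bind_variable({"name": "v", "type": "int"})
try:
    prepare_bind_variable(None)
    rejected = False
except InvalidRequest:
    rejected = True
```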
Avi Kivity
2779a171fc Merge 'Do not run aborted tasks' from Aleksandra Martyniuk
task_manager::task::impl contains an abort source which can
be used to check whether it is aborted and an abort method
which aborts the task (request_abort on abort_source) and all
its descendants recursively.

When the start method is called after the task was aborted,
then its state is set to failed and the task does not run.

Fixes: #11995

Closes #11996

* github.com:scylladb/scylladb:
  tasks: do not run tasks that are aborted
  tasks: delete unused variable
  tasks: add abort_source to task_manager::task::impl
2022-11-17 19:42:46 +02:00
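The rule — abort-before-start marks the task failed without running it, and abort propagates to descendants — can be sketched as a toy model (not the task_manager API):

```python
# Toy model of "do not run aborted tasks": abort recursively marks
# descendants, and start() on an aborted task fails without running.

class Task:
    def __init__(self):
        self.aborted = False
        self.state = "created"
        self.ran = False
        self.children = []

    def abort(self):
        self.aborted = True
        for child in self.children:
            child.abort()  # recursive abort of descendants

    def start(self):
        if self.aborted:
            self.state = "failed"  # aborted before starting: never run
            return
        self.ran = True
        self.state = "done"

parent = Task()
child = Task()
parent.children.append(child)
parent.abort()
child.start()
```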
Pavel Emelyanov
a396c27efc Merge 'message: messaging_service: fix topology_ignored for pending endpoints in get_rpc_client' from Kamil Braun
`get_rpc_client` calculates a `topology_ignored` field when creating a
client which says whether the client's endpoint had topology information
when this client was created. This is later used to check if that client
needs to be dropped and replaced with a new client which uses the
correct topology information.

The `topology_ignored` field was incorrectly calculated as `true` for
pending endpoints even though we had topology information for them. This
would lead to unnecessary drops of RPC clients later. Fix this.

Remove the default parameter for `with_pending` from
`topology::has_endpoint` to avoid similar bugs in the future.

Apparently this fixes #11780. The verbs used by the decommission operation
use RPC client index 1 (see `do_get_rpc_client_idx` in
message/messaging_service.cc). From local testing with additional
logging I found that by the time this client is created (i.e. the first
verb in this group is used), we already know the topology. The node is
pending at that point - hence the bug would cause us to assume we don't
know the topology, leading us to drop the RPC client later, possibly
in the middle of a decommission operation.

Fixes: #11780

Closes #11942

* github.com:scylladb/scylladb:
  message: messaging_service: check for known topology before calling is_same_dc/rack
  test: reenable test_topology::test_decommission_node_add_column
  test/pylib: util: configurable period in wait_for
  message: messaging_service: fix topology_ignored for pending endpoints in get_rpc_client
  message: messaging_service: topology independent connection settings for GOSSIP verbs
2022-11-17 20:14:32 +03:00
Jan Ciolek
42e01cc67f expr_test: test preparing expr::null
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:05 +01:00
Jan Ciolek
45b3fca71c expr_test: test preparing expr::cast
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:05 +01:00
Jan Ciolek
498c9bfa0d expr_test_utils: add make_receiver
Add a convenience function which creates receivers.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
6873a21fbd expr_test_utils: add make_smallint_raw/const
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
488056acb7 expr_test: test preparing expr::token
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
7958f77a40 expr_test: test preparing expr::subscript
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
569bd61c6c expr_test: test preparing expr::column_value
Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
26174e29c6 expr_test: test preparing expr::unresolved_identifier
It's interesting that prepare_expression
for column identifiers doesn't require a receiver.
I hope this won't break validation in the future.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:04 +01:00
Jan Ciolek
c719a923bb expr_test_utils: mock data_dictionary::database
Add a function which creates a mock instance
of data_dictionary::database.

prepare_expression requires a data_dictionary::database
as an argument, so unit tests for it need something
to pass there. make_data_dictionary_database can
be used to create an instance that is sufficient for tests.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
2022-11-17 17:30:00 +01:00
Kamil Braun
8e8c32befe test/pylib: manager_client: print used cluster when starting test case
It will be easier to map server IDs to their IP addresses when browsing
through the test logs.
2022-11-17 17:14:23 +01:00
Pavel Emelyanov
bc62ca46d4 lister: Make lister::dir_entry_types an enum_set
This type is currently an unordered_set, but only consists of at most
two elements. Making it an enum_set renders it into a size_t variable
and better describes the intention.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-17 19:01:45 +03:00
Pavel Emelyanov
c6021b57a1 database: Avoid useless local variable
It's used to run lister::scan_dir() with directory_entry_type::directory
only, but for that it is copied around in lambda captures. It's simpler
just to use the value directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-17 19:00:49 +03:00
Kamil Braun
b83234d8aa test/pylib: scylla_cluster: use server ID to name workdir and log file, not IP address
Since recently the framework uses a separate set of unique IDs to
identify servers, but the log file and workdir are still named using the
last part of the IP address.

This is confusing: the test logs sometimes don't provide the IP addr
(only the ID), and even if they do, the reader of the test log may not
know that they need to look at the last part of the IP to find the
node's log/workdir.

Also, using the ID will be necessary if we want to reuse IP addresses (e.g.
during node replace, or simply not to run out of IP addresses during
testing).
2022-11-17 16:55:12 +01:00
Anna Stuchlik
f7f03e38ee doc: update the link to Enabling Experimental Features 2022-11-17 15:44:46 +01:00
Anna Stuchlik
02cea98f55 doc: remove the note referring to the previous ScyllaDB versions and add the relevant limitation to the paragraph 2022-11-17 15:05:00 +01:00
Anna Stuchlik
ce88c61785 doc: update the links to the Enabling Experimental Features section 2022-11-17 14:59:34 +01:00
Avi Kivity
76be6402ed Merge 'repair: harden effective replication map' from Benny Halevy
As described in #11993, per-shard repair_info instances get the effective_replication_map on their own with no centralized synchronization.

This series ensures that the effective replication maps used by repair (and other associated structures like the token metadata and topology) are all in sync with the one used to initiate the repair operation.

While at it, the series includes other cleanups in this area in repair and view that are not fixes, as the calls happen in synchronous functions that do not yield.

Fixes #11993

Closes #11994

* github.com:scylladb/scylladb:
  repair: pass erm down to get_hosts_participating_in_repair and get_neighbors
  repair: pass effective_replication_map down to repair_info
  repair: coroutinize sync_data_using_repair
  repair: futurize do_repair_start
  effective_replication_map: add global_effective_replication_map
  shared_token_metadata: get_lock is const
  repair: sync_data_using_repair: require to run on shard 0
  repair: require all node operations to be called on shard 0
  repair: repair_info: keep effective_replication_map
  repair: do_repair_start: use keyspace erm to get keyspace local ranges
  repair: do_repair_start: use keyspace erm for get_primary_ranges
  repair: do_repair_start: use keyspace erm for get_primary_ranges_within_dc
  repair: do_repair_start: check_in_shutdown first
  repair: get_db().local() where needed
  repair: get topology from erm/token_metadata_ptr
  view: get_view_natural_endpoint: get topology from erm
2022-11-17 13:29:02 +02:00
Konstantin Osipov
262566216b raft: persist the initial raft address map 2022-11-17 14:26:36 +03:00
Konstantin Osipov
b35af73fdf raft: (upgrade) do not use IP addresses from Raft config
Always use raft address map to obtain the IP addresses
of upgrade peers. Right now the map is populated
from Raft configuration, so it's an equivalent transformation,
but in the future raft address map will be populated from other sources:
discovery and gossip, hence the logic of upgrade will change as well.

Do not proceed with the upgrade if an address is
missing from the map, since it means we failed to contact a raft member.
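The lookup policy described above can be sketched as follows. This is a minimal illustration with hypothetical names (`resolve_upgrade_peers` and the plain-map address map are stand-ins, not the real raft address map API): resolve every peer through the map only, and refuse to proceed if any address is missing.

```cpp
#include <cassert>
#include <map>
#include <optional>
#include <string>
#include <vector>

using server_id = std::string;
using ip_address = std::string;

// Return the addresses of all upgrade peers, or nullopt if any peer is
// missing from the address map (meaning we failed to contact a raft
// member and the upgrade must not proceed).
std::optional<std::vector<ip_address>>
resolve_upgrade_peers(const std::map<server_id, ip_address>& address_map,
                      const std::vector<server_id>& peers) {
    std::vector<ip_address> out;
    for (const auto& id : peers) {
        auto it = address_map.find(id);
        if (it == address_map.end()) {
            return std::nullopt; // do not proceed with the upgrade
        }
        out.push_back(it->second);
    }
    return out;
}
```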
2022-11-17 14:26:31 +03:00
Pavel Emelyanov
2add9ba292 Merge 'Refactor topology out of token_metadata' from Benny Halevy
This series moves the topology code from locator/token_metadata.{cc,hh} out to localtor/topology.{cc,hh}
and introduces a shared header file: locator/types.hh contains shared, low level definitions, in anticipation of https://github.com/scylladb/scylladb/pull/11987

While at it, the token_metadata functions are turned into coroutines
and topology copy constructor is deleted.  The copy functionality is moved into an async `clone_gently` function that allows yielding while copying the topology.

Closes #12001

* github.com:scylladb/scylladb:
  locator: refactor topology out of token_metadata
  locator: add types.hh
  topology: delete copy constructor
  token_metadata: coroutinize clone functions
2022-11-17 13:55:34 +03:00
Aleksandra Martyniuk
7ead1a7857 compaction: request abort only once in compaction_data::stop
compaction_manager::task (and thus compaction_data) can be stopped
because of many different reasons. Thus, abort can be requested more
than once on compaction_data abort source causing a crash.

To prevent this before each request_abort() we check whether an abort
was requested before.
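The guard described above can be sketched like this. `abort_source` here is a toy stand-in for seastar's class, and `stop_compaction` is a hypothetical name; the point is only that the second stop becomes a no-op instead of a double abort.

```cpp
#include <cassert>

// Toy model of an abort source where aborting twice is a hard failure,
// mirroring the crash the commit fixes.
struct abort_source {
    bool _aborted = false;
    bool abort_requested() const { return _aborted; }
    void request_abort() {
        assert(!_aborted && "abort must not be requested twice");
        _aborted = true;
    }
};

// The fix: check before requesting, so stop() is idempotent.
void stop_compaction(abort_source& as) {
    if (!as.abort_requested()) {
        as.request_abort();
    }
}
```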

Closes #12004
2022-11-17 12:44:59 +02:00
Benny Halevy
1e2741d2fe abstract_replication_strategy: recognized_options: return unordered_set
An unordered_set is more efficient and there is no need
to return an ordered set for this purpose.

This change facilitates a follow-up change of adding
topology::get_datacenters(), returning an unordered_set
of datacenter names.

Refs #11987

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12003
2022-11-17 11:27:05 +02:00
Botond Dénes
e925c41f02 utils/gs/barrett.hh: aarch64: s/brarett/barrett/
Fix a typo introduced by the recent patch fixing the spelling of
Barrett. The patch introduced a typo in the aarch64 version of the code,
which wasn't found by promotion, as that only builds on X86_64.

Closes #12006
2022-11-17 11:09:59 +02:00
Konstantin Osipov
051dceeaff raft: (and gossip) begin gossiping raft server ids
We plan to use gossip data to educate Raft RPC about IP addresses
of raft peers. Add raft server ids to application state, so
that when we get a notification about a gossip peer we can
identify which raft server id this notification is for,
specifically, we can find what IP address stands for this server
id, and, whenever the IP address changes, we can update Raft
address map with the new address.

On the same token, at boot time, we now have to start Gossip
before Raft, since Raft won't be able to send any messages
without gossip data about IP addresses.
2022-11-17 12:07:31 +03:00
Konstantin Osipov
990c7a209f raft: change the API of conf change notifications
Pass a change diff into the notification callback,
rather than add or remove servers one by one, so that
if we need to persist the state, we can do it once per
configuration change, not for every added or removed server.

For now still pass added and removed entries in two separate calls
per a single configuration change. This is done mainly to fulfill the
library contract that it never sends messages to servers
outside the current configuration. The group0 RPC
implementation doesn't need the two calls, since it simply
marks the removed servers as expired: they are not removed immediately
anyway, and messages can still be delivered to them.
However, there may be test/mock implementations of RPC which
could benefit from this contract, so we decided to keep it.
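The diff-based notification can be sketched as below. The names (`config_diff`, `on_configuration_change`) are illustrative, not the real raft API; the point is that the callback sees the whole change at once, so state is persisted once per configuration change rather than once per server.

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

using server_id = std::string;

// The change diff passed into the notification callback.
struct config_diff {
    std::vector<server_id> added;
    std::vector<server_id> removed;
};

struct state_machine {
    int persist_count = 0;
    std::vector<server_id> members;

    void on_configuration_change(const config_diff& diff) {
        for (const auto& s : diff.added) {
            members.push_back(s);
        }
        for (const auto& s : diff.removed) {
            members.erase(std::remove(members.begin(), members.end(), s),
                          members.end());
        }
        ++persist_count; // one persist per change, not per server
    }
};
```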
2022-11-17 12:07:31 +03:00
Benny Halevy
53fdf75cf9 repair: pass erm down to get_hosts_participating_in_repair and get_neighbors
Now that it is available in repair_info.

Fixes #11993

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 08:07:30 +02:00
Benny Halevy
b69be61f41 repair: pass effective_replication_map down to repair_info
And make sure the token_metadata ring version is same as the
reference one (from the erm on shard 0), when starting the
repair on each shard.

Refs #11993

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 08:07:29 +02:00
Benny Halevy
c47d36b53d repair: coroutinize sync_data_using_repair
Prepare for the next patch that will co_await
make_global_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 08:07:04 +02:00
Benny Halevy
58b1c17f5d repair: futurize do_repair_start
Turn it into a coroutine to prepare for the next patch
that will co_await make_global_effective_replication_map.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 08:07:04 +02:00
Benny Halevy
4b9269b7e2 effective_replication_map: add global_effective_replication_map
Class to hold a coherent view of a keyspace
effective replication map on all shards.

To be used in a following patch to pass the sharded
keyspace e_r_m:s to repair.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 08:07:01 +02:00
Avi Kivity
b8b78959fb build: switch to packaged libdeflate rather than a submodule
Now that our toolchain is based on Fedora 37, we can rely on its
libdeflate rather than have to carry our own in a submodule.

Frozen toolchain is regenerated. As a side effect clang is updated
from 15.0.0 to 15.0.4.

Closes #12000
2022-11-17 08:01:00 +02:00
Benny Halevy
2c677e294b shared_token_metadata: get_lock is const
The lock is acquired using a function that
doesn't modify the shared_token_metadata object.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
d6b2124903 repair: sync_data_using_repair: require to run on shard 0
And with that do_sync_data_using_repair can be folded into
sync_data_using_repair.

This will simplify using the effective_replication_map
throughout the operation.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
0c56c75cf8 repair: require all node operations to be called on shard 0
To simplify use of the effective_replication_map / token_metadata_ptr
throughout the operation.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
64b0756adc repair: repair_info: keep effective_replication_map
Sampled when repair info is constructed.
To be used throughout the repair process.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
c7d753cd44 repair: do_repair_start: use keyspace erm to get keyspace local ranges
Rather than calling db.get_keyspace_local_ranges that
looks up the keyspace and its erm again.

We want all the information derived from the erm to
be based on the same source.

The function is synchronous so this change doesn't
fix anything, just cleans up the code.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
aaf74776c2 repair: do_repair_start: use keyspace erm for get_primary_ranges
Ensure that the primary ranges are in sync with the
keyspace erm.

The function is synchronous so this change doesn't fix anything,
it just cleans up the code.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:58:21 +02:00
Benny Halevy
9200e6b005 repair: do_repair_start: use keyspace erm for get_primary_ranges_within_dc
Ensure the erm and topology are in sync.

The function is synchronous so this change doesn't fix
anything, just cleans up the code.

Fix mistake in comment while at it.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:57:56 +02:00
Benny Halevy
59dc2567fd repair: do_repair_start: check_in_shutdown first
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:56:34 +02:00
Benny Halevy
881eb0df83 repair: get_db().local() where needed
In several places we get the sharded database using get_db()
and then we only use db.local().  Simplify the code by keeping
a reference only to the local database upfront.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:56:34 +02:00
Benny Halevy
c22c4c8527 repair: get topology from erm/token_metadata_ptr
We want the topology to be synchronized with the respective
effective_replication_map / token_metadata.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:56:34 +02:00
Benny Halevy
94f2e95a2f view: get_view_natural_endpoint: get topology from erm
Get the topology from the effective replication map rather
than from the storage_proxy to ensure it's synchronized
with the natural endpoints.

Since there's no preemption between the two calls
currently there is no issue, so this is merely a clean up
of the code and not supposed to fix anything.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-17 07:56:34 +02:00
Nadav Har'El
e393639114 test/cql-pytest: reproducer for crash in LWT with null key
This patch adds a reproducer for issue #11954: Attempting an
"IF NOT EXISTS" (LWT) write with a null key crashes Scylla,
instead of producing a simple error message (as happens
without the "IF NOT EXISTS" after #7852 was fixed).

The test passed on Cassandra, but crashes Scylla. Because of this
crash, we can't just mark the test "xfail" and it's temporarily
marked "skip" instead.

Refs #11954.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11982
2022-11-17 07:31:13 +02:00
Benny Halevy
d0bd305d16 locator: refactor topology out of token_metadata
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-16 21:55:54 +02:00
Benny Halevy
297a4de4e4 locator: add types.hh
To export low-level types that are used by other modules
for the locator interfaces.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-16 21:53:05 +02:00
Kamil Braun
0c9cb5c5bf Merge 'raft: wait for the next tick before retrying' from Gusev Petr
When `modify_config` or `add_entry` is forwarded to the leader, it may
reach the node at "inappropriate" time and result in an exception. There
are two reasons for it - the leader is changing and, in case of
`modify_config`, other `modify_config` is currently in progress. In both
cases the command is retried, but before this patch there was no delay
before retrying, which could lead to a tight loop.

The patch adds a new exception type `transient_error`. When the client
receives it, it is obliged to retry the request after some delay.
Previously leader-side exceptions were converted to `not_a_leader`,
which is strange, especially for `conf_change_in_progress`.

Fixes: #11564

Closes #11769

* github.com:scylladb/scylladb:
  raft: refactor: remove duplicate code on retry delays
  raft: use wait_for_next_tick in read_barrier
  raft: wait for the next tick before retrying
2022-11-16 18:20:54 +01:00
Aleksandra Martyniuk
4250bd9458 tasks: do not run tasks that are aborted
Currently, in the start() method, a task is run even if it was already
aborted.

When start() is called on an aborted task, its state is set to
task_manager::task_state::failed and it doesn't run.
2022-11-16 18:09:41 +01:00
Aleksandra Martyniuk
ebffca7ea5 tasks: delete unused variable 2022-11-16 18:07:57 +01:00
Aleksandra Martyniuk
752edc2205 tasks: add abort_source to task_manager::task::impl
task_manager::task can be aborted with impl's abort_source.
By default abort request is propagated to all task's descendants.
2022-11-16 18:07:11 +01:00
Avi Kivity
c4f069c6fc Update seastar submodule
* seastar 153223a188...4f4cc00660 (10):
  > Merge 'Avoid using namespace internal' from Pavel Emelyanov
  > Merge 'De-futurize IO class update calls' from Pavel Emelyanov
  > abort_source: subscribe(): remove noexcept qualifier
  > Merge 'Add Prometheus filtering capabilities by label' from Amnon Heiman
  > fsqual: stop causing memory leak error on LeakSanitizer
  > metrics.cc: Do not merge empty histogram
  > Update tutorial.md
  > README-DPDK.md: document --cflags option
  > build: install liburing.pc using stow
  > core/polymorphic_temporary_buffer: include <seastar/core/memory.hh>

Closes #11991
2022-11-16 17:59:33 +02:00
Avi Kivity
3497891cf9 utils: spell "barrett" correctly
As P. T. Barnum famously said,
correctly". Following that, we correct the spelling of Barrett's name
in the source tree.

Closes #11989
2022-11-16 16:30:38 +02:00
Benny Halevy
0c94ffcc85 topology: delete copy constructor
Topology is copied only from token_metadata_impl::clone_only_token_map
which copies the token_metadata_impl with yielding to prevent reactor
stalls.  This should apply to topology as well, so
add a clone_gently function for cloning the topology
from token_metadata_impl::clone_only_token_map.
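The pattern described above (delete the copy constructor, add an explicit clone that can yield) can be sketched without seastar like this. `maybe_yield()` is a stand-in for `co_await seastar::maybe_yield()` and only counts how often the copy offered to yield; all names besides `clone_gently` are illustrative.

```cpp
#include <cassert>
#include <vector>

inline int g_yields = 0;
inline void maybe_yield() { ++g_yields; } // real code co_awaits a preemption check

struct topology {
    std::vector<int> nodes;

    topology() = default;
    topology(const topology&) = delete;            // no stall-prone bulk copies
    topology& operator=(const topology&) = delete;
    topology(topology&&) = default;

    // Copy element by element so there is a yield point between elements,
    // preventing reactor stalls on large topologies.
    topology clone_gently() const {
        topology t;
        t.nodes.reserve(nodes.size());
        for (int n : nodes) {
            t.nodes.push_back(n);
            maybe_yield();
        }
        return t;
    }
};
```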

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-16 15:27:28 +02:00
Benny Halevy
4f4fc7fe22 token_metadata: coroutinize clone functions
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-16 15:27:28 +02:00
Kamil Braun
a83789160d message: messaging_service: check for known topology before calling is_same_dc/rack
`is_same_dc` and `is_same_rack` assume that the peer's topology is
known. If it's unknown, `on_internal_error` will be called inside
topology.

When these functions are used in `get_rpc_client`, they are already
protected by an earlier check for knowing the peer's topology
(the `has_topology()` lambda).

Another use is in `do_start_listen()`, where we create a filter for RPC
module to check if it should accept incoming connections. If cross-dc or
cross-rack encryption is enabled, we will reject connection attempts to
the regular (non-ssl) port from other dcs/racks using `is_same_dc/rack`.
However, it might happen that something (other Scylla node or otherwise)
tries to contact us on the regular port and we don't know that thing's
topology, which would result in `on_internal_error`. But this is not a
fatal error; we simply want to reject that connection. So protect these
calls as well.

Finally, there's `get_preferred_ip` with an unprotected `is_same_dc`
call which, for a given peer, may return a different IP from preferred IP
cache if the endpoint resides in the same DC. If there is no entry in
the preferred IP cache, we return the original (external) IP of the
peer. We can do the same if we don't know the peer's topology. It's
interesting that we didn't see this particular place blowing up. Perhaps
the preferred IP cache is always populated after we know the topology.
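The guard pattern the commit applies can be sketched as follows. The toy `topology` below models the real class only insofar as calling `is_same_dc` on an unknown endpoint is an internal error; `safe_same_dc` is a hypothetical helper name showing the protected call.

```cpp
#include <cassert>
#include <set>
#include <string>

using endpoint = std::string;

struct topology {
    std::set<endpoint> known;
    std::set<endpoint> local_dc;

    bool has_endpoint(const endpoint& e) const { return known.count(e) > 0; }
    bool is_same_dc(const endpoint& e) const {
        // models on_internal_error for unknown peers
        assert(has_endpoint(e) && "topology must be known");
        return local_dc.count(e) > 0;
    }
};

// The guard: check has_endpoint() first and treat an unknown peer as
// "not same dc", so the connection is rejected instead of crashing.
bool safe_same_dc(const topology& t, const endpoint& e) {
    return t.has_endpoint(e) && t.is_same_dc(e);
}
```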
2022-11-16 14:01:50 +01:00
Kamil Braun
9b2449d3ea test: reenable test_topology::test_decommission_node_add_column
Also improve the test to increase the probability of reproducing #11780
by injecting sleeps in appropriate places.

Without the fix for #11780 from the earlier commit, the test reproduces
the issue in roughly half of all runs in dev build on my laptop.
2022-11-16 14:01:50 +01:00
Kamil Braun
0f49813312 test/pylib: util: configurable period in wait_for 2022-11-16 14:01:50 +01:00
Kamil Braun
1bd2471c19 message: messaging_service: fix topology_ignored for pending endpoints in get_rpc_client
`get_rpc_client` calculates a `topology_ignored` field when creating a
client which says whether the client's endpoint had topology information
when topology was created. This is later used to check if that client
needs to be dropped and replaced with a new client which uses the
correct topology information.

The `topology_ignored` field was incorrectly calculated as `true` for
pending endpoints even though we had topology information for them. This
would lead to unnecessary drops of RPC clients later. Fix this.

Remove the default parameter for `with_pending` from
`topology::has_endpoint` to avoid similar bugs in the future.

Apparently this fixes #11780. The verbs used by decommission operation
use RPC client index 1 (see `do_get_rpc_client_idx` in
message/messaging_service.cc). From local testing with additional
logging I found that by the time this client is created (i.e. the first
verb in this group is used), we already know the topology. The node is
pending at that point - hence the bug would cause us to assume we don't
know the topology, leading us to dropping the RPC client later, possibly
in the middle of a decommission operation.

Fixes: #11780
2022-11-16 14:01:50 +01:00
Kamil Braun
840be34b5f message: messaging_service: topology independent connection settings for GOSSIP verbs
The gossip verbs are used to learn about topology of other nodes.
If inter-dc/rack encryption is enabled, the knowledge of topology is
necessary to decide whether it's safe to send unencrypted messages to
nodes (i.e., whether the destination lies in the same dc/rack).

The logic in `messaging_service::get_rpc_client`, which decided whether
a connection must be encrypted, was this (given that encryption is
enabled): if the topology of the peer is known, and the peer is in the
same dc/rack, don't encrypt. Otherwise encrypt.

However, it may happen that node A knows node B's topology, but B
doesn't know A's topology. A deduces that B is in the same DC and rack
and tries sending B an unencrypted message. As the code currently
stands, this would cause B to call `on_internal_error`. This is what I
encountered when attempting to fix #11780.

To guarantee that it's always possible to deliver gossiper verbs (even
if one or both sides don't know each other's topology), and to simplify
reasoning about the system in general, choose connection settings that
are independent of the topology - for the connection used by gossiper
verbs (other connections are still topology-dependent and use complex
logic to handle the situation of unknown-and-later-known topology).

This connection only contains 'rare' and 'cheap' verbs, so it's not a
performance problem to always encrypt it (given that encryption is
configured). And this is what already was happening in the past; it was
at some point removed during topology knowledge management refactors. We
just bring this logic back.
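The decision rule described above can be condensed into one predicate. This is a simplified sketch with hypothetical names, not the real `get_rpc_client` logic: gossip connections ignore topology entirely, while other connections keep the old same-dc/rack exemption.

```cpp
#include <cassert>

enum class verb_class { gossip, other };

// Returns whether a connection must be encrypted, given that inter-dc/rack
// encryption may be configured.
bool must_encrypt(verb_class v, bool encryption_configured,
                  bool topology_known, bool peer_in_same_dc_rack) {
    if (!encryption_configured) {
        return false;
    }
    if (v == verb_class::gossip) {
        return true; // topology-independent: deliverable even when
                     // neither side knows the other's topology
    }
    return !(topology_known && peer_in_same_dc_rack);
}
```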

Fixes #11992.

Inspired by xemul/scylla@45d48f3d02.
2022-11-16 13:58:07 +01:00
Anna Stuchlik
01c9846bb6 doc: add the link to the Enabling Experimental Features section 2022-11-16 13:24:45 +01:00
Anna Stuchlik
f1b2f44aad doc: move the TTL Alternator feature from the Experimental Features section to the production-ready section 2022-11-16 13:23:07 +01:00
Nadav Har'El
2f2f01b045 materialized views: fix view writes after base table schema change
When we write to a materialized view, we need to know some information
defined in the base table such as the columns in its schema. We have
a "view_info" object that tracks each view and its base.

This view_info object has a couple of mutable attributes which are
used to lazily-calculate and cache the SELECT statement needed to
read from the base table. If the base-table schema ever changes -
and the code calls set_base_info() at that point - we need to forget
this cached statement. If we don't (as before this patch), the SELECT
will use the wrong schema and writes will no longer work.
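The lazy-cache invalidation described above can be sketched like this. The struct below is a toy: the schema is reduced to a version number and the SELECT to a string, and only `set_base_info` matches a name from the commit; the point is that changing the base info resets the cached statement.

```cpp
#include <cassert>
#include <optional>
#include <string>

struct view_info {
    int base_schema_version = 0;
    mutable std::optional<std::string> cached_select;

    // Lazily built on first use, then cached.
    const std::string& select_statement() const {
        if (!cached_select) {
            cached_select = "SELECT built from base schema v" +
                            std::to_string(base_schema_version);
        }
        return *cached_select;
    }

    void set_base_info(int new_version) {
        base_schema_version = new_version;
        cached_select.reset(); // the fix: forget the stale cached statement
    }
};
```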

This patch also includes a reproducing test that failed before this
patch, and passes afterwords. The test creates a base table with a
view that has a non-trivial SELECT (it has a filter on one of the
base-regular columns), makes a benign modification to the base table
(just a silly addition of a comment), and then tries to write to the
view - and before this patch it fails.

Fixes #10026
Fixes #11542
2022-11-16 13:58:21 +02:00
Nadav Har'El
7cbb0b98bb Merge 'doc: document user defined functions (UDFs)' from Anna Stuchlik
This PR is V2 of the[ PR created by @psarna.](https://github.com/scylladb/scylladb/pull/11560).
I have:
- copied the content.
- applied the suggestions left by @nyh.
- made minor improvements, such as replacing "Scylla" with "ScyllaDB", fixing punctuation, and fixing the RST syntax.

Fixes https://github.com/scylladb/scylladb/issues/11378

Closes #11984

* github.com:scylladb/scylladb:
  doc: label user-defined functions as Experimental
  doc: restore the note for the Count function (removed by mistatke)
  doc: document user defined functions (UDFs)
2022-11-16 13:09:47 +02:00
Botond Dénes
cbf9be9715 Merge 'Avoid 0.0.0.0 (and :0) as preferred IP' from Pavel Emelyanov
Although the docs discourage using INADDR_ANY as the listen address, it is not disabled in code. Worse -- some snitch drivers may gossip it around as the INTERNAL_IP state. This series prevents that from happening and also adds a sanity check not to use this value if it somehow sneaks in.

Closes #11846

* github.com:scylladb/scylladb:
  messaging_service: Deny putting INADDR_ANY as preferred ip
  messaging_service: Toss preferred ip cache management
  gossiping_property_file_snitch: Don't gossip INADDR_ANY preferred IP
  gossiping_property_file_snitch: Make _listen_address optional
2022-11-16 08:30:42 +02:00
Avi Kivity
43d3e91e56 tools: toolchain: prepare: use real bash associative array
When we translate from docker/go arch names to the kernel arch
names, we use an associative array hack using computed variable
names "{$!variable_name}". But it turns out bash has real
associative arrays, introduced with "declare -A". Use the to make
the code a little clearer.

Closes #11985
2022-11-16 08:17:47 +02:00
Botond Dénes
e90d0811d0 Merge 'doc: update ScyllaDB requirements - supported CPUs and AWS i4g instances' from Anna Stuchlik
Fix https://github.com/scylladb/scylla-docs/issues/4144

Closes #11226

* github.com:scylladb/scylladb:
  Update docs/getting-started/system-requirements.rst
  doc: specify the recommended AWS instance types
  doc: replace the tables with a generic description of support for Im4gn and Is4gen instances
  doc: add support for AWS i4g instances
  doc: extend the list of supported CPUs
2022-11-16 08:15:00 +02:00
Botond Dénes
bd1fcbc38f Merge 'Introduce reverse vector_deserializer.' from Michał Radwański
As indicated in #11816, we'd like to enable deserializing vectors in reverse.
The forward deserialization is achieved by reading from an input_stream. The
input stream internally is a singly linked list with complicated logic. To allow
going through it in reverse, when creating the reverse vector deserializer we
scan the stream and store substreams for all the places that are a starting
point for the next element. The iterator itself just deserializes elements
from the remembered substreams, this time in reverse.
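The idea can be sketched with a flat byte buffer standing in for seastar's chunked input stream, and offsets standing in for the remembered substreams: one forward scan records where each (here, length-prefixed) element starts, then elements are decoded back to front. All names are illustrative.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Decode length-prefixed big-endian integers from buf, in reverse order.
std::vector<uint32_t> deserialize_reversed(const std::vector<uint8_t>& buf) {
    // Forward scan: record the start offset of every element
    // (the analogue of remembering substreams).
    std::vector<size_t> starts;
    size_t pos = 0;
    while (pos < buf.size()) {
        starts.push_back(pos);
        pos += 1 + buf[pos]; // 1 length byte + payload
    }
    // Decode in reverse from the remembered offsets.
    std::vector<uint32_t> out;
    for (auto it = starts.rbegin(); it != starts.rend(); ++it) {
        size_t p = *it;
        uint32_t v = 0;
        for (size_t i = 0; i < buf[p]; ++i) {
            v = (v << 8) | buf[p + 1 + i];
        }
        out.push_back(v);
    }
    return out;
}
```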

Fixes #11816

Closes #11956

* github.com:scylladb/scylladb:
  test/boost/serialization_test.cc: add test for reverse vector deserializer
  serializer_impl.hh: add reverse vector serializer
  serializer_impl: remove unneeded generic parameter
2022-11-16 07:37:24 +02:00
Anna Stuchlik
cdb6557f23 doc: label user-defined functions as Experimental 2022-11-15 21:22:01 +01:00
Avi Kivity
d85f731478 build: update toolchain to Fedora 37 with clang 15
'cargo' instantiation now overrides internal git client with
cli client due to unbounded memory usage [1].

[1] https://github.com/rust-lang/cargo/issues/10583#issuecomment-1129997984
2022-11-15 16:48:09 +00:00
Anna Stuchlik
1f1d88d04e doc: restore the note for the Count function (removed by mistatke) 2022-11-15 17:41:22 +01:00
Anna Stuchlik
dbb19f55fb doc: document user defined functions (UDFs) 2022-11-15 17:33:05 +01:00
Nadav Har'El
e4dba6a830 test/cql-pytest: add test for when MV requires IS NOT NULL
As noted in issue #11979, Scylla inconsistently (and unlike Cassandra)
requires "IS NOT NULL" on some but not all materialized-view key
columns. Specifically, Scylla does not require "IS NOT NULL" on the
base's partition key, while Cassandra does.

This patch is a test which demonstrates this inconsistency. It currently
passes on Cassandra and fails on Scylla, so is marked xfail.

Refs #11979

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11980
2022-11-15 14:21:48 +01:00
Asias He
16bd9ec8b1 gossip: Improve get_live_token_owners and get_unreachable_token_owners
The get_live_token_owners returns the nodes that are part of the ring
and live.

The get_unreachable_token_owners returns the nodes that are part of the ring
and are not alive.

The token_metadata::get_all_endpoints returns nodes that are part of the
ring.

The patch changes both functions to use the more authoritative source to
get the nodes that are part of the ring and call is_alive to check if
the node is up or down. So that the correctness does not depend on
any derived information.
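The described approach (authoritative ring membership plus a liveness check, instead of cached derived sets) can be sketched as a single split function. Names here are illustrative, not the real gossiper API.

```cpp
#include <cassert>
#include <functional>
#include <set>
#include <string>
#include <utility>

using node = std::string;

// Partition the ring members into live and unreachable token owners using
// the authoritative membership set and an is_alive predicate.
std::pair<std::set<node>, std::set<node>>
split_token_owners(const std::set<node>& ring_members,
                   const std::function<bool(const node&)>& is_alive) {
    std::set<node> live, unreachable;
    for (const auto& n : ring_members) {
        (is_alive(n) ? live : unreachable).insert(n);
    }
    return {live, unreachable};
}
```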

This patch fixes a truncate issue in storage_proxy::truncate_blocking
where it calls get_live_token_owners and get_unreachable_token_owners to
decide the nodes to talk with for truncate operation. The truncate
failed because incorrect nodes were returned.

Fixes #10296
Fixes #11928

Closes #11952
2022-11-15 14:21:48 +01:00
Botond Dénes
21489c9f9c Merge 'doc: add the "Scylladb Enterprise" label to the Enterprise-only features' from Anna Stuchlik
This PR is a follow-up to https://github.com/scylladb/scylladb/pull/11918.

With this PR:
- The "ScyllaDB Enterprise" label is added to all the features that are only available in ScyllaDB Enterprise.
- The previous Enterprise-only note is removed (it was included in multiple files as _/rst_include/enterprise-only-note.rst_ - this file is removed as it is no longer used anywhere in the docs).
- "Scylla Enterprise" was removed from `versionadded `because now it's clear that the feature was added for Enterprise.

Closes #11975

* github.com:scylladb/scylladb:
  doc: remove the enterprise-only-note.rst file, which was replaced by the ScyllaDB Enterprise label and is not used anymore
  doc: add the ScyllaDB Enterprise label to the descriptions of Enterprise-only features
2022-11-15 14:21:48 +01:00
Botond Dénes
34f29c8d67 Merge 'Use with_sstable_directory() helper in tests' from Pavel Emelyanov
The helper is already widely used, one (last) test case can benefit from using it too

Closes #11978

* github.com:scylladb/scylladb:
  test: Indentation fix after previous patch
  test: Use with_sstable_directory() helper
2022-11-15 14:21:48 +01:00
Nadav Har'El
8a4ab87e44 Merge 'utils: crc: generate crc barrett fold tables at compile time' from Avi Kivity
We use Barrett tables (misspelled in the code unfortunately) to fold
crc computations of multiple buffers into a single crc. This is important
because it turns out to be faster to compute crc of three different buffers
in parallel rather than compute the crc of one large buffer, since the crc
instruction has latency 3.

Currently, we have a separate code generation step to compute the
fold tables. The step generates a new C++ source files with the tables.
But modern C++ allows us to do this computation at compile time, avoiding
the code generation step. This simplifies the build.

This series does that. There is some complication in that the code uses
compiler intrinsics for the computation, and these are not constexpr friendly.
So we first introduce constexpr-friendly alternatives and use them.

To prove the transformation is correct, I compared the generated code from
before the series and from just before the last step (where we use constexpr
evaluation but still retain the generated file) and saw no difference in the values.

Note that constexpr is not strictly needed - we could have run the code in the
global variables' initializer. But that would cause a crash if we run on a pre-clmul
machine, and is not as fun.
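The shape of the transformation (a constexpr function filling a `std::array`, evaluated at compile time, instead of a separate code-generation step) can be illustrated with a plain byte-wise CRC32 table; the real series builds Barrett fold tables, which are more involved.

```cpp
#include <array>
#include <cstdint>

// Standard reflected CRC32 (polynomial 0xEDB88320) byte table, computed
// entirely at compile time.
constexpr std::array<uint32_t, 256> make_crc32_table() {
    std::array<uint32_t, 256> t{};
    for (uint32_t i = 0; i < 256; ++i) {
        uint32_t c = i;
        for (int k = 0; k < 8; ++k) {
            c = (c & 1) ? 0xEDB88320u ^ (c >> 1) : (c >> 1);
        }
        t[i] = c;
    }
    return t;
}

// No generated source file, and no runtime initializer that could run
// unsupported instructions on an old machine.
constexpr auto crc32_table = make_crc32_table();
static_assert(crc32_table[0] == 0);
static_assert(crc32_table[1] == 0x77073096u);
```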

Closes #11957

* github.com:scylladb/scylladb:
  test: crc: add unit tests for constexpr clmul and barrett fold
  utils: crc combine table: generate at compile time
  utils: barrett: inline functions in header
  utils: crc combine table: generate tables at compile time
  utils: crc combine table: extract table generation into a constexpr function
  utils: crc combine table: extract "pow table" code into constexpr function
  utils: crc combine table: store tables std::arrray rather than C array
  utils: barrett: make the barrett reduction constexpr friendly
  utils: clmul: add 64-bit constexpr clmul
  utils: barrett: extract barrett reduction constants
  utils: barrett: reorder functions
  utils: make clmul() constexpr
2022-11-15 14:21:48 +01:00
Petr Gusev
ae3e0e3627 raft: refactor: remove duplicate code on retry delays
Introduce a templated function do_on_leader_with_retries,
use it in add_entries/modify_config/read_barrier. The
function implements the basic logic of retries with aborts
and leader changes handling, adds a delay between
iterations to protect against tight loops.
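The shape of such a helper can be sketched as below. The signature is hypothetical and much simpler than the real coroutine-based one; the point is the retry loop with a delay between attempts so transient errors cannot spin in a tight loop.

```cpp
#include <functional>
#include <stdexcept>

// Error meaning "retry after a delay" (cf. the transient_error added
// earlier in this series).
struct transient_error : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Retry op() on transient_error, sleeping between attempts; rethrows
// after max_attempts failures.
template <typename Op, typename Sleep>
int do_on_leader_with_retries(Op op, Sleep sleep_until_next_tick,
                              int max_attempts) {
    for (int attempt = 1;; ++attempt) {
        try {
            return op();
        } catch (const transient_error&) {
            if (attempt == max_attempts) {
                throw;
            }
            sleep_until_next_tick(); // delay before retrying
        }
    }
}
```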
2022-11-15 13:18:53 +04:00
Petr Gusev
15cc1667d0 raft: use wait_for_next_tick in read_barrier
Replaced the yield on transport_error
with wait_for_next_tick. Added delays for retries, similar
to add_entry/modify_config: we postpone the next
call attempt if we haven't received new information
about the current leader.
2022-11-15 12:31:49 +04:00
Petr Gusev
5e15c3c9bd raft: wait for the next tick before retrying
When modify_config or add_entry is forwarded
to the leader, it may reach the node at
"inappropriate" time and result in an exception.
There are two reasons for it - the leader is
changing and, in case of modify_config, other
modify_config is currently in progress. In
both cases the command is retried, but before
this patch there was no delay before retrying,
which could lead to a tight loop.

The patch adds a new exception type transient_error.
When the client node receives it, it is obliged to retry
the request, possibly after some delay. Previously, leader-side
exceptions were converted to not_a_leader exception,
which is strange, especially for conf_change_in_progress.

We add a delay before retrying in modify_config
and add_entry if the client hasn't received any new
information about the leader since the last attempt.
This can happen if the server
responds with a transient_error with an empty leader
and the current node has not yet learned the new leader.
We neglect an excessive delay if the newly elected leader
is the same as the previous one; this is supposed to be rare.

Fixes: #11564
2022-11-15 11:49:26 +04:00
Pavel Emelyanov
8dcd9d98d6 test: Indentation fix after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-14 20:11:01 +03:00
Pavel Emelyanov
c9128e9791 test: Use with_sstable_directory() helper
It's already used everywhere, but one test case wires up the
sstable_directory by hand. Fix it too, but keep in mind, that the caller
fn stops the directory early.

(indentation is deliberately left broken)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-11-14 20:11:01 +03:00
Michał Radwański
32c60b44c5 test/boost/serialization_test.cc: add test for reverse vector deserializer

This test is just a copy-pasted version of the forward serializer test.
2022-11-14 16:06:24 +01:00
Michał Radwański
dce67f42f8 serializer_impl.hh: add reverse vector serializer
Currently, when we want to deserialize a mutation in reverse, we unfreeze
it and consume it from the end. This new reverse vector deserializer
goes through the input stream, remembering the substreams that contain a
given output range member, and, while traversing from the back,
deserializes each substream.
2022-11-14 16:06:24 +01:00
Anna Stuchlik
e36bd208cc doc: remove the enterprise-only-note.rst file, which was replaced by the ScyllaDB Enterprise label and is not used anymore 2022-11-14 15:20:51 +01:00
Anna Stuchlik
36324fe748 doc: add the ScyllaDB Enterprise label to the descriptions of Enterprise-only features 2022-11-14 15:16:51 +01:00
Takuya ASADA
da6c472db9 install.sh: Skip systemd existence check when --without-systemd
When --without-systemd is specified, install.sh should skip the systemd
existence check.

Fixes #11898

Closes #11934
2022-11-14 14:07:46 +02:00
Benny Halevy
ff5527deb1 topology: copy _sort_by_proximity in copy constructor
Fixes #11962

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #11965
2022-11-14 13:59:56 +03:00
Pavel Emelyanov
bd48fdaad5 Merge 'handle_state_normal: do not update topology of removed endpoint' from Benny Halevy
Currently, when replacing a node ip, keeping the old host,
we might end up with the old endpoint in system.peers
if it is inserted back into the topology by `handle_state_normal`
when on_join is called with the old endpoint.

Then, later on, on_change sees that:
```
    if (get_token_metadata().is_member(endpoint)) {
        co_await do_update_system_peers_table(endpoint, state, value);
```

As described in #11925.

Fixes #11925

Closes #11930

* github.com:scylladb/scylladb:
  storage_service, system_keyspace: add debugging around system.peers update
  storage_service: handle_state_normal: update topology and notify_joined endpoint only if not removed
2022-11-14 13:58:28 +03:00
Botond Dénes
8e38551d93 Merge 'Allow each compaction group to have its own compaction backlog tracker' from Raphael "Raph" Carvalho
Today, compaction_backlog_tracker is managed in each compaction_strategy
implementation. So every compaction strategy is managing its own
tracker and providing a reference to it through get_backlog_tracker().

But this prevents each group from having its own tracker, because
there's only a single compaction_strategy instance per table.
To remove this limitation, compaction_strategy impl will no longer
manage trackers but will instead provide an interface for trackers
to be created, such that each compaction_group will be allowed to
create its own tracker and manage it by itself.

Now table's backlog will be the sum of all compaction_group backlogs.
The normalization factor is applied to the sum, so we don't have
to adjust each individual backlog to any factor.
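As a rough sketch of that aggregation (illustrative types only, not the actual ScyllaDB classes):

```cpp
#include <vector>

// Each compaction group tracks its own backlog; the table-level backlog is
// the sum of the group backlogs, with the normalization factor applied once
// to the total rather than to each term.
struct compaction_group { double backlog; };

double table_backlog(const std::vector<compaction_group>& groups,
                     double normalization_factor) {
    double sum = 0.0;
    for (const auto& g : groups) {
        sum += g.backlog;
    }
    return sum * normalization_factor;
}
```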

Closes #11762

* github.com:scylladb/scylladb:
  replica: Allow one compaction_backlog_tracker for each compaction_group
  compaction: Make compaction_state available for compaction tasks being stopped
  compaction: Implement move assignment for compaction_backlog_tracker
  compaction: Fix compaction_backlog_tracker move ctor
  compaction: Use table_state's backlog tracker in compaction_read_monitor_generator
  compaction: kill undefined get_unimplemented_backlog_tracker()
  replica: Refactor table::set_compaction_strategy for multiple groups
  Fix exception safety when transferring ongoing charges to new backlog tracker
  replica: move_sstables_from_staging: Use tracker from group owning the SSTable
  replica: Move table::backlog_tracker_adjust_charges() to compaction_group
  replica: table::discard_sstables: Use compaction_group's backlog tracker
  replica: Disable backlog tracker in compaction_group::stop()
  replica: database_sstable_write_monitor: use compaction_group's backlog tracker
  replica: Move table::do_add_sstable() to compaction_group
  test/sstable_compaction_test: Switch to table_state::get_backlog_tracker()
  compaction/table_state: Introduce get_backlog_tracker()
2022-11-14 07:05:28 +02:00
Avi Kivity
b8cb34b928 test: crc: add unit tests for constexpr clmul and barrett fold
Check that the constexpr variants indeed match the runtime variants.

I verified manually that exactly one computation in each test is
executed at run time (and is compared against a constant).
2022-11-13 16:22:29 +02:00
Avi Kivity
70217b5109 utils: crc combine table: generate at compile time
By now the crc combine tables are generated at compile time,
but still in a separate code generation step. We now eliminate
the code generation step and instead link the global variables
directly into the main executable. The global variables have
been conveniently named exactly as the code generation step
names them, so we don't need to touch any users.
2022-11-12 17:26:45 +02:00
Avi Kivity
164e991181 utils: barrett: inline functions in header
Avoid duplicate definitions if the same header is used from more than
one place, as it will soon be.
2022-11-12 17:26:08 +02:00
Avi Kivity
a4f06773da utils: crc combine table: generate tables at compile time
Move the tables into global constinit variables that are
generated at compile time. Note the code that creates
the generated crc32_combine_table.cc is still called; it
transforms compile-time generated tables into a C++ source
that contains the same values, as literals.

If we generate a diff between gen/utils/gz/crc_combine_table.cc
before this series and after this patch, we see the only change
in the file is the type of the variable (which changed to
std::array), proving our constexpr code is correct.
2022-11-12 17:16:59 +02:00
Avi Kivity
a229fdc41e utils: crc combine table: extract table generation into a constexpr function
Move the code to a constexpr function, so we can later generate the tables at
compile time. Note that although the function is constexpr, it is still
evaluated at runtime, since the calling function (main()) isn't constexpr
itself.
2022-11-12 17:13:52 +02:00
Avi Kivity
d42bec59bb utils: crc combine table: extract "pow table" code into constexpr function
A "pow table" is used to generate the Barrett fold tables. Extract its
code into a constexpr function so we can later generate the fold tables
at compile time.
2022-11-12 17:11:44 +02:00
Avi Kivity
6e34014b64 utils: crc combine table: store tables in std::array rather than C array
C arrays cannot be returned from functions and therefore aren't suitable
for constexpr processing. std::array<> is a regular value and so is
constexpr friendly.
2022-11-12 17:09:02 +02:00
Avi Kivity
1e9252f79a utils: barrett: make the barrett reduction constexpr friendly
Dispatch to intrinsics or constexpr based on evaluation context.
2022-11-12 17:04:44 +02:00
Avi Kivity
0bd90b5465 utils: clmul: add 64-bit constexpr clmul
This is used when generating the Barrett reduction tables, and also when
applying the Barrett reduction at runtime, so we need it to be constexpr
friendly.
2022-11-12 17:04:05 +02:00
Avi Kivity
c376c539b8 utils: barrett: extract barrett reduction constants
The constants are repeated across x86_64 and aarch64, so extract
them into a common definition.
2022-11-12 17:00:17 +02:00
Avi Kivity
2fdf81af7b utils: barrett: reorder functions
Reorder functions in dependency order rather than forward
declaring them. This makes them more constexpr-friendly.
2022-11-12 16:52:41 +02:00
Avi Kivity
8aa59a897e utils: make clmul() constexpr
clmul() is a pure function and so should already be constexpr,
but it uses intrinsics that aren't defined as constexpr and
so the compiler can't really compute it at compile time.

Fix by defining a constexpr variant and dispatching based
on whether we're being constant-evaluated or not.

The implementation is simple, but in any case proof that it
is correct will be provided later on.
2022-11-12 16:49:43 +02:00
Raphael S. Carvalho
b88acffd66 replica: Allow one compaction_backlog_tracker for each compaction_group
Today, compaction_backlog_tracker is managed in each compaction_strategy
implementation. So every compaction strategy is managing its own
tracker and providing a reference to it through get_backlog_tracker().

But this prevents each group from having its own tracker, because
there's only a single compaction_strategy instance per table.
To remove this limitation, compaction_strategy impl will no longer
manage trackers but will instead provide an interface for trackers
to be created, such that each compaction group will be allowed to
have its own tracker, which will be managed by compaction manager.

On compaction strategy change, table will update each group with
the new tracker, which is created using the previously introduced
compaction_group_sstable_set_updater.

Now table's backlog will be the sum of all compaction_group backlogs.
The normalization factor is applied to the sum, so we don't have
to adjust each individual backlog to any factor.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:22:51 -03:00
Raphael S. Carvalho
d862dd815c compaction: Make compaction_state available for compaction tasks being stopped
compaction_backlog_tracker will be managed by compaction_manager, in the
per table state. As compaction tasks can access the tracker throughout
its lifetime, remove() can only deregister the state once we're done
stopping all tasks which map to that state.
remove() extracted the state upfront, then performed the stop, to
prevent new tasks from being registered and left behind. But we can
avoid the leak of new tasks by only closing the gate, which waits
for all tasks (which are stopped a step earlier) and once closed,
prevents new tasks from being registered.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:22:51 -03:00
Raphael S. Carvalho
0a152a2670 compaction: Implement move assignment for compaction_backlog_tracker
That's needed for std::optional to work on its behalf.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:22:49 -03:00
Raphael S. Carvalho
fe305cefd0 compaction: Fix compaction_backlog_tracker move ctor
Luckily it's not used anywhere. Default move ctor was picked but
it won't clear _manager of old object, meaning that its destructor
will incorrectly deregister the tracker from
compaction_backlog_manager.
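The bug class is easy to reproduce in isolation (illustrative types, not the actual tracker):

```cpp
#include <utility>

struct manager {};

struct tracker {
    manager* _manager = nullptr;
    explicit tracker(manager* m) : _manager(m) {}
    // A defaulted move constructor would copy _manager and leave it set in
    // the moved-from object, whose destructor would then deregister the
    // tracker a second time. The fix clears the source's back-pointer.
    tracker(tracker&& o) noexcept
        : _manager(std::exchange(o._manager, nullptr)) {}
};
```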

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
8e1e30842d compaction: Use table_state's backlog tracker in compaction_read_monitor_generator
A step closer towards a separate backlog tracker for each compaction group.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
fedafd76eb compaction: kill undefined get_unimplemented_backlog_tracker()
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
90991bda69 replica: Refactor table::set_compaction_strategy for multiple groups
Refactor the function so it can accommodate multiple compaction
groups.

To still provide strong exception guarantees, preparation and
execution of changes will be separated.

Once multiple groups are supported, each group will be prepared
first, and the noexcept execution will be done as a last step.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
244efddb22 Fix exception safety when transferring ongoing charges to new backlog tracker
When setting a new strategy, the charges of the old tracker are transferred
to the new one.

The problem is that we're not reverting the changes if an exception is
triggered before the new strategy is successfully set.

To fix this exception safety issue, let's copy the charges instead
of moving them. If an exception is triggered, the old tracker is still
the one in use and remains intact.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
d1e2dbc592 replica: move_sstables_from_staging: Use tracker from group owning the SSTable
When moving SSTables from staging directory, we'll conditionally add
them to backlog tracker. As each group has its own tracker, a given
sstable will be added to the tracker of the group that owns it.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:37 -03:00
Raphael S. Carvalho
9031dc3199 replica: Move table::backlog_tracker_adjust_charges() to compaction_group
Procedures that call this function happen to be in compaction_group,
so let's move it to the group. This simplifies the change where the procedure
retrieves the tracker from the group itself.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
116459b69e replica: table::discard_sstables: Use compaction_group's backlog tracker
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
b2d8545b15 replica: Disable backlog tracker in compaction_group::stop()
As we're moving backlog tracker to compaction group, we need to
stop the tracker there too. We're moving it a step earlier in
table::stop(), before sstables are cleared, but that's okay
because it's still done after the group was deregistered
from compaction manager, meaning no compactions are running.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
91b0d772e2 replica: database_sstable_write_monitor: use compaction_group's backlog tracker
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
f37a05b559 replica: Move table::do_add_sstable() to compaction_group
All callers of do_add_sstable() live in compaction_group, so it
should be moved into compaction_group too. It also makes it easier
for the function to retrieve the backlog tracker from the group.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
835927a2ad test/sstable_compaction_test: Switch to table_state::get_backlog_tracker()
Important for decoupling backlog tracker from table's compaction
strategy.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Raphael S. Carvalho
1ec0ef18a5 compaction/table_state: Introduce get_backlog_tracker()
This interface will be helpful for allowing replica::table, unit
tests and sstables::compaction to access the compaction group's tracker
which will be managed by the compaction manager, once we complete
the decoupling work.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-11-11 09:17:36 -03:00
Nadav Har'El
ff87624fb4 test/cql-pytest: add another regression test for reversed-type bug
In commit 544ef2caf3 we fixed a bug where
a reversed clustering-key order caused problems using a secondary index
because of incorrect type comparison. That commit also included a
regression test for this fix.

However, that fix was incomplete, and improved later in commit
c8653d1321. That later fix was labeled
"better safe than sorry", and did not include a test demonstrating
any actual bug, so unsurprisingly we never backported that second
fix to any older branches.

Recently we discovered that missing the second patch does cause real
problems, and this patch includes a test which fails when the first
patch is in, but the second patch isn't (and passes when both patches
are in, and also passes on Cassandra).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11943
2022-11-11 11:01:22 +02:00
Botond Dénes
302917f63d mutation_compactor: add validator
The mutation compactor is used on most read-paths we have, so adding a
validator to it gives us good coverage; in particular, it gives us full
coverage of queries and compaction.
The validator validates mutation token (and mutation fragment kind)
monotonicity, as that is quite cheap while still being enough to catch
the most common problems. As we already have a validator on the
compaction path (in the sstable writer), the validator is disabled when
the mutation compactor is instantiated for compaction.
We should probably make this configurable at some point. The addition
of this validator should prevent the worst of the fragment reordering
bugs from affecting reads.
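The cheap check described here amounts to something like the following sketch; it is not the actual mutation_fragment_stream_validator, just an illustration of token-monotonicity validation.

```cpp
#include <cstdint>
#include <optional>

// Validates only that tokens arrive in non-decreasing order; enough to
// catch the most common fragment-reordering bugs at minimal cost.
class token_monotonicity_validator {
    std::optional<int64_t> _last_token;
public:
    // Returns false when the stream violates token monotonicity.
    bool operator()(int64_t token) {
        if (_last_token && token < *_last_token) {
            return false;
        }
        _last_token = token;
        return true;
    }
};
```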
2022-11-11 10:26:05 +02:00
Botond Dénes
5c245b4a5e mutation_fragment_stream_validator: add a 'none' validation level
Which, as its name suggests, makes the validating filter not validate
anything at all. This validation level can effectively be used to make
it as if the validator was not there at all.
2022-11-11 09:58:44 +02:00
Botond Dénes
a4b58f5261 test/boost/mutation_query_test: test_partition_limit: sort input data
The test's input data is currently out-of-order, violating a fundamental
invariant of data always being sorted. This doesn't cause any problems
right now, but soon it will. Sort the data to avoid that.
2022-11-11 09:58:44 +02:00
Botond Dénes
2c551bb7ce querier: consume_page(): use partition_start as the sentinel value
Said method calls `compact_mutation_state::start_new_page()` which
requires the kind of the next fragment in the reader. When there is no
fragment (reader is at EOS), we use partition-end. This was a poor
choice: if the reader is at EOS, partition-end was the last fragment
kind; if the stream were to continue, the next fragment would be a
partition-start.
2022-11-11 09:58:18 +02:00
Botond Dénes
0bcfc9d522 treewide: use ::for_partition_end() instead of ::end_of_partition_tag_t{}
We just added a convenience static factory method for partition end,
change the present users of the clunky constructor+tag to use it
instead.
2022-11-11 09:58:18 +02:00
Botond Dénes
f1a039fc2b treewide: use ::for_partition_start() instead of ::partition_start_tag_t{}
We just added a convenience static factory method for partition start,
change the present users of the clunky constructor+tag to use it
instead.
2022-11-11 09:58:18 +02:00
Botond Dénes
6a002953e9 position_in_partition: add for_partition_{start,end}() 2022-11-11 09:58:18 +02:00
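The factory-method pattern introduced by these patches can be sketched as follows; this is a much-reduced stand-in for the real position_in_partition, which carries more state.

```cpp
struct position_in_partition {
    enum class kind { partition_start, partition_end };

    struct partition_start_tag_t {};
    struct end_of_partition_tag_t {};

    kind _kind;

    // The old, clunky spellings: constructor + tag type.
    explicit position_in_partition(partition_start_tag_t)
        : _kind(kind::partition_start) {}
    explicit position_in_partition(end_of_partition_tag_t)
        : _kind(kind::partition_end) {}

    // The new convenience factories used treewide instead.
    static position_in_partition for_partition_start() {
        return position_in_partition(partition_start_tag_t{});
    }
    static position_in_partition for_partition_end() {
        return position_in_partition(end_of_partition_tag_t{});
    }
};
```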
Kamil Braun
4a2ec888d5 Merge 'test.py: use internal id to manage servers' from Alecco
Instead of using assigned IP addresses, use a local integer ID for
managing servers. IP address can be reused by a different server.

While there, get host ID (UUID). This can also be reused with `node
replace` so it's not good enough for tracking.

Closes #11747

* github.com:scylladb/scylladb:
  test.py: use internal id to manage servers
  test.py: rename hostname to ip_addr
  test.py: get host id
  test.py: use REST api client in ScyllaCluster
  test.py: remove unnecessary reference to web app
  test.py: requests without aiohttp ClientSession
2022-11-10 17:12:16 +01:00
Kamil Braun
1cc68b262e docs: describe the Raft upgrade and recovery procedures
In the 5.1 -> 5.2 upgrade doc, include additional steps for enabling
Raft using the `consistent_cluster_management` flag. Note that we don't
have this flag yet but it's planned to replace the experimental flag in
5.2.

In the "Raft in ScyllaDB" document, add sections about:
- enabling Raft in existing clusters in Scylla 5.2,
- verifying that the internal Raft upgrade procedure finishes
  successfully,
- recovering from a stuck Raft upgrade procedure or from a majority loss
  situation.

Fix some problems in the documentation, e.g. it is not possible to
enable Raft in an existing cluster in 5.0, but the documentation claimed
that it is.

Follow-up items:
- if we decide for a different name for `consistent_cluster_management`,
  use that name in the docs instead
- update the warnings in Scylla to link to the Raft doc
- mention Enterprise versions once we know the numbers
- update the appropriate upgrade docs for Enterprise versions
  once they exist
2022-11-10 17:08:57 +01:00
Kamil Braun
3dab07ec11 docs: add upgrade guide 5.1 -> 5.2
It's a copy-paste from the 5.0 -> 5.1 guide with substitutions:
s/5.1/5.2,
s/5.0/5.1

The metric update guide is not written, I left a TODO.

Also I didn't include the guide in
docs/upgrade/upgrade-opensource/index.rst, since 5.2 is not released
yet.

The guide can be accessed by manually following the link:
/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/
2022-11-10 16:49:14 +01:00
Alejo Sanchez
700054abee test.py: use internal id to manage servers
Instead of using assigned IP addresses, use an internal server id.

Define types to distinguish local server id, host ID (UUID), and IP
address.

This is needed to test servers changing IP address and for node replace
(host UUID).

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Alejo Sanchez
1e38f5478c test.py: rename hostname to ip_addr
The code explicitly manages an IP address as a string; make that explicit in the
variable name.

Define its type and test whether it is set in the instance instead of using an
empty string as a placeholder.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Alejo Sanchez
f478eb52a3 test.py: get host id
When initializing a ScyllaServer, try to get the host id instead of only
checking the REST API is up.

Use the existing aiohttp session from ScyllaCluster.

In case of HTTP error check the status was not an internal error (500+).

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Alejo Sanchez
78663dda72 test.py: use REST api client in ScyllaCluster
Move the REST api client to ScyllaCluster. This will allow the cluster
to query its own servers.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Alejo Sanchez
75ea345611 test.py: remove unnecessary reference to web app
The aiohttp.web.Application only needs to be passed, so don't store a
reference in ScyllaCluster object.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Alejo Sanchez
a5316b0c6b test.py: requests without aiohttp ClientSession
Simplify REST helper by doing requests without a session.

Reusing an aiohttp.ClientSession causes knock-on effects on
`rest_api/test_task_manager` due to handling exceptions outside of an
async with block.

Requests for cluster management and the Scylla REST API don't need a session,
anyway.

Raise HTTPError with status code, text reason, params, and json.

In ScyllaCluster.install_and_start(), instead of adding one more custom
exception, just catch all exceptions as they will be re-raised later.

While there avoid code duplication and improve sanity, type checking,
and lint score.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
2022-11-10 09:14:37 +01:00
Botond Dénes
21bc37603a Merge 'utils: config_src: add set_value_on_all_shards functions' from Benny Halevy
Currently when we set a single value we need
to call broadcast_to_all_shards to let observers on all
shards get notified of the new value.

However, the latter broadcasts all value to all shards
so it's terribly inefficient.

Instead, add async set_value_on_all_shards functions
to broadcast a value to all shards.

Use those in system_keyspace for db_config_table virtual table
and in task_manager_test to update the task_manager ttl.

Refs #7316

Closes #11893

* github.com:scylladb/scylladb:
  tests: check ttl on different shards
  utils: config_src: add set_value_on_all_shards functions
  utils: config_file: add config_source::API
2022-11-10 07:16:39 +02:00
Botond Dénes
3aff59f189 Merge 'staging sstables: filter tokens for view update generation' from Benny Halevy
This mini-series introduces dht::tokens_filter and uses it for consuming staging sstable in the view_update_generator.

The tokens_filter uses the token ranges owned by the current node, as retrieved by get_keyspace_local_ranges.

Refs #9559

Closes #11932

* github.com:scylladb/scylladb:
  db: view_update_generator: always clean up staging sstables
  compaction: extract incremental_owned_ranges_checker out to dht
2022-11-10 07:00:51 +02:00
Avi Kivity
9b6ab5db4a Update seastar submodule
* seastar e0dabb361f...153223a188 (8):
  > build: compile dpdk with -fpie (position independent executable)
  > Merge 'io_request: remove ctor overloads of io_request and s/io_request/const io_request/' from Kefu Chai
  > iostream: remove unused function
  > smp: destroy_smp_service_group: verify smp_service_group id
  > core/circular_buffer: refactor loop in circular_buffer::erase()
  > Merge 'Outline reactor::add_task() and sanitize reactor::shuffle() methods' from Pavel Emelyanov
  > Add NOLINT for cert-err58-cpp
  > tests: Fix false-positive use-after-free detection

Closes #11940
2022-11-09 23:36:50 +02:00
Aleksandra Martyniuk
b0ed4d1f0f tests: check ttl on different shards
Test checking if ttl is properly set is extended to check
whether the ttl value is changed on non-zero shard.
2022-11-09 16:58:46 +02:00
Botond Dénes
725e5b119d Revert "replica: Pick new generation for SSTables being moved from staging dir"
This reverts commit ba6186a47f.

Said commit violates the widely held assumption that sstables
generations can be used as sstable identity. One known problem caused
this is potential OOO partition emitted when reading from sstables
(#11843). We now also have a better fix for #11789 (the bug this commit
was meant to fix): 4aa0b16852. So we can
revert without regressions.

Fixes: #11843

Closes #11886
2022-11-09 16:35:31 +02:00
Eliran Sinvani
ab7429b77d cql: Fix crash upon use of the word empty for service level name
Wrong access to an uninitialized token instead of the actual
generated string caused the parser to crash. This wasn't
detected by the ANTLR3 compiler because all the temporary
variables defined in the ANTLR3 statements are global in the
generated code. This essentially caused a null dereference.

Tests: 1. The fixed issue scenario from github.
       2. Unit tests in release mode.

Fixes #11774

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190612133151.20609-1-eliransin@scylladb.com>

Closes #11777
2022-11-09 15:58:57 +02:00
Anna Stuchlik
d2e54f7097 Merge branch 'master' into anna-requirements-arm-aws 2022-11-09 14:39:00 +01:00
Anna Stuchlik
8375304d9b Update docs/getting-started/system-requirements.rst
Co-authored-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
2022-11-09 14:37:34 +01:00
Benny Halevy
38d8777d42 storage_service, system_keyspace: add debugging around system.peers update
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 14:45:47 +02:00
Benny Halevy
5401b6055c storage_service: handle_state_normal: update topology and notify_joined endpoint only if not removed
Currently, when replacing a node ip, keeping the old host,
we might end up with the old endpoint in system.peers
if it is inserted back into the topology by `handle_state_normal`
when on_join is called with the old endpoint.

Then, later on, on_change sees that:
```
        if (get_token_metadata().is_member(endpoint)) {
            co_await do_update_system_peers_table(endpoint, state, value);
```

As described in #11925.

Fixes #11925

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 14:45:22 +02:00
Benny Halevy
1a183047c0 utils: config_src: add set_value_on_all_shards functions
Currently when we set a single value we need
to call broadcast_to_all_shards to let observers on all
shards get notified of the new value.

However, the latter broadcasts all value to all shards
so it's terribly inefficient.

Instead, add async set_value_on_all_shards functions
to broadcast a value to all shards.

Use those in system_keyspace for db_config_table virtual table
and in task_manager_test to update the task_manager ttl.
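The efficiency point can be illustrated with a synchronous stand-in; the real code broadcasts across Seastar shards asynchronously, and the names and types here are hypothetical.

```cpp
#include <map>
#include <string>
#include <vector>

using shard_config = std::map<std::string, std::string>;

// Push just the one changed value to each shard's copy of the config,
// instead of re-broadcasting every value the way broadcast_to_all_shards
// effectively does.
void set_value_on_all_shards(std::vector<shard_config>& shards,
                             const std::string& key,
                             const std::string& value) {
    for (auto& shard : shards) {
        shard[key] = value;
    }
}
```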

Refs #7316

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 11:55:14 +02:00
Benny Halevy
e83f42ec70 utils: config_file: add config_source::API
For task_manager test api.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 11:53:20 +02:00
Botond Dénes
94db2123b9 Update tools/java submodule
* tools/java 583261fc0e...caf754f243 (1):
  > build: remove JavaScript snippets in ant build file
2022-11-09 07:59:04 +02:00
Benny Halevy
10f8f13b90 db: view_update_generator: always clean up staging sstables
Since they are currently not cleaned up by cleanup compaction,
filter their tokens, processing only tokens owned by the
current node (based on the keyspace replication strategy).
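The filtering amounts to a containment test against the node's local ranges. A minimal sketch follows, with ranges modeled as half-open intervals over raw tokens; this glosses over real token-range wraparound, and the names are illustrative.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

using token_range = std::pair<int64_t, int64_t>;  // half-open [start, end)

// Keep only tokens that fall in a range owned by the current node, as
// returned by something like get_keyspace_local_ranges().
bool is_locally_owned(int64_t token,
                      const std::vector<token_range>& local_ranges) {
    for (const auto& [start, end] : local_ranges) {
        if (token >= start && token < end) {
            return true;
        }
    }
    return false;
}
```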

Refs #9559

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 07:38:22 +02:00
Benny Halevy
fd3e66b0cc compaction: extract incremental_owned_ranges_checker out to dht
It is currently used by cleanup_compaction partition filter.
Factor it out so it can be used to filter staging sstables in
the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-09 07:32:56 +02:00
Gleb Natapov' via ScyllaDB development
2100a8f4ca service: raft: demote configuration change error to warning since it is retried anyway
Message-Id: <Y2ohbFtljmd5MNw0@scylladb.com>
2022-11-09 00:09:39 +01:00
Avi Kivity
04ecf4ee18 Update tools/java submodule (cassandra-stress fails with node down)
* tools/java 87672be28e...583261fc0e (1):
  > cassandra-stress: pass all hosts straight to the driver
2022-11-08 14:58:14 +02:00
Botond Dénes
7f69cccbdf scylla-gdb.py: $downcast_vptr(): add multiple inheritance support
When a class inherits from multiple virtual base classes, pointers to
instances of this class via one of its base classes might point
somewhere into the object, not at its beginning. Therefore, the simple
method currently employed by $downcast_vptr() of casting the provided
pointer to the type extracted from the vtable name fails. Instead, when
this situation is detected (detectable by observing that the symbol name
of the partial vtable is not at an offset of +16, but larger),
$downcast_vptr() will iterate over the base classes, adjusting the
pointer with their offsets, hoping to find the true start of the object.
In the one instance I tested this with, this method worked well.
At the very least, the method will now yield a null pointer when it
fails, instead of a badly casted object with corrupt content (which the
developer might or might not attribute to the bad cast).

Closes #11892
2022-11-08 14:51:26 +02:00
Michał Chojnowski
3e0c7a6e9f test: sstable_datafile_test: eliminate a use of std::regex to prevent stack overflow
This usage of std::regex overflows the seastar::thread stack size (128 KiB),
causing memory corruption. Fix that.

Closes #11911
2022-11-08 14:41:34 +02:00
Botond Dénes
2037d7f9cd Merge 'doc: add the "ScyllaDB Enterprise" label to highlight the Enterprise-only features' from Anna Stuchlik
This PR adds the "ScyllaDB Enterprise" label to highlight the Enterprise-only features on the following pages:
- Encryption at Rest - the label indicates that the entire page is about an Enterprise-only feature.
- Compaction - the labels indicate the sections that are Enterprise-only.

There are more occurrences across the docs that require a similar update. I'll update them in another PR if this PR is approved.

Closes #11918

* github.com:scylladb/scylladb:
  doc: fix the links to resolve the warnings
  doc: add the Enterprise label on the Compaction page (to a subheading and on a list of strategies) to replace the info box
  doc: add the Enterprise label to the Encryption at Rest page (the entire page) to replace the info box
2022-11-08 09:53:48 +02:00
Raphael S. Carvalho
a57724e711 Make off-strategy compaction wait for view building completion
Prior to off-strategy compaction, streaming / repair would place
staging files into main sstable set, and wait for view building
completion before they could be selected for regular compaction.

The reason for that is that view building relies on table providing
a mutation source without data in staging files. Had regular compaction
mixed staging data with non-staging one, table would have a hard time
providing the required mutation source.

After off-strategy compaction, staging files can be compacted
in parallel to view building. If off-strategy completes first, it
will place the output into the main sstable set. So a parallel view
building (on sstables used for off-strategy) may potentially get a
mutation source containing staging data from the off-strategy output.
That will mislead the view builder, as it won't be able to detect
changes to data in the main directory.

To fix it, we'll do what we did before: filter out staging files
from compaction, and trigger the operation only after we're done
with view building. We're piggybacking on the off-strategy timer so
that off-strategy still runs only at the end of the node operation,
to reduce the number of compaction rounds on the data introduced
by repair / streaming.

Fixes #11882.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #11919
2022-11-08 08:53:58 +02:00
Botond Dénes
243fcb96f0 Update tools/python3 submodule
* tools/python3 bf6e892...773070e (1):
  > create-relocatable-package: harden against missing files
2022-11-08 08:43:30 +02:00
Avi Kivity
46690bcb32 build: harden create-relocatable-package.py against changes in libthread-db.so name
create-relocatable-package.py collects shared libraries used by
executables for packaging. It also adds libthread-db.so to make
debugging possible. However, the name it uses has changed in glibc,
so packaging fails in Fedora 37.

Switch to the version-agnostic name, libthread-db.so. This happens
to be a symlink, so resolve it.
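The symlink-resolution step described above can be sketched in Python; `resolve_symlink` and the library file names below are illustrative, not the actual packaging code:

```python
import os
import tempfile

def resolve_symlink(path):
    # Follow the symlink chain to the actual file on disk, mirroring the
    # idea of packaging the real library file rather than a versioned
    # name that may change between glibc releases.
    return os.path.realpath(path)

# Demo: a versioned library file and a version-agnostic symlink to it.
tmp = tempfile.mkdtemp()
real = os.path.join(tmp, "libthread_db-1.0.so")
open(real, "w").close()
link = os.path.join(tmp, "libthread-db.so")
os.symlink(real, link)
resolved = resolve_symlink(link)
```

Packaging the resolved target keeps the archive correct even when the distribution renames the symlink.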

Closes #11917
2022-11-08 08:41:22 +02:00
Takuya ASADA
acc408c976 scylla_setup: fix incorrect type definition on --online-discard option
The --online-discard option was defined as a string parameter, since it
doesn't specify "action=", yet it has a boolean default (default=True).
This breaks "provisioning in a similar environment", because the code
assumed a boolean default implies "action='store_true'", which is not
the case here.

We should change the type of the option to int, and also specify
"choices=[0, 1]", just like --io-setup does.

Fixes #11700

Closes #11831
2022-11-08 08:40:44 +02:00
Avi Kivity
3d345609d8 config: disable "mc" format sstables for new data
"md" format was introduced in 4.3, in 3530e80ce1, two years ago.
Disable the option to create new sstables with the "mc" format.

Closes #11265
2022-11-08 08:36:27 +02:00
Anna Stuchlik
0eaafced9d doc: fix the links to resolve the warnings 2022-11-07 19:15:21 +01:00
Anna Stuchlik
b57e0cfb7c doc: add the Enterprise label on the Compaction page (to a subheading and on a list of strategies) to replace the info box 2022-11-07 18:54:35 +01:00
Anna Stuchlik
9f3fcb3fa0 doc: add the Enterprise label to the Encryption at Rest page (the entire page) to replace the info box 2022-11-07 18:48:37 +01:00
Tomasz Grabiec
a9063f9582 Merge 'service/raft: failure detector: ping raft::server_ids, not gms::inet_addresses' from Kamil Braun
Whenever a Raft configuration change is performed, `raft::server` calls
`raft_rpc::add_server`/`raft_rpc::remove_server`. Our `raft_rpc`
implementation has a function, `_on_server_update`, passed in the
constructor, which it called in `add_server`/`remove_server`;
that function would update the set of endpoints detected by the
direct failure detector. `_on_server_update` was passed an IP address
and that address was added to / removed from the failure detector set
(there's another translation layer between the IP addresses and internal
failure detector 'endpoint ID's; but we can ignore it for the purposes
of this commit).

Therefore: the failure detector was pinging a certain set of IP
addresses. These IP addresses were updated during Raft configuration
changes.

To implement the `is_alive(raft::server_id)` function (required by
`raft::failure_detector` interface), we would translate the ID using
the Raft address map, which is currently also updated during
configuration changes, to an IP address, and check if that IP address is
alive according to the direct failure detector (which maintained an
`_alive_set` of type `unordered_set<gms::inet_address>`).

This all works well but it assumes that servers can be identified using
IP addresses - it doesn't play well with the fact that servers may
change their IP addresses. The only immutable identifier we have for a
server is `raft::server_id`. In the future, Raft configurations will not
associate IP addresses with Raft servers; instead we will assume that IP
addresses can change at any time, and there will be a different
mechanism that eventually updates the Raft address map with the latest
IP address for each `raft::server_id`.

To prepare us for that future, in this commit we no longer operate in
terms of IP addresses in the failure detector, but in terms of
`raft::server_id`s. Most of the commit is boilerplate, changing
`gms::inet_address` to `raft::server_id` and function/variable names.
The interesting changes are:
- in `is_alive`, we no longer need to translate the `raft::server_id` to
  an IP address, because now the stored `_alive_set` already contains
  `raft::server_id`s instead of `gms::inet_address`es.
- the `ping` function now takes a `raft::server_id` instead of
  `gms::inet_address`. To send the ping message, we need to translate
  this to IP address; we do it by the `raft_address_map` pointer
  introduced in an earlier commit.

Thus, there is still a point where we have to translate between
`raft::server_id` and `gms::inet_address`; but observe we now do it at
the last possible moment - just before sending the message. If we
have no translation, we consider the `ping` to have failed - it's
equivalent to a network failure where no route to a given address was
found.
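The last-moment translation described above can be sketched as a toy Python model (not ScyllaDB's C++ types; `AddressMap`, `Pinger`, and the `send` callback are hypothetical names):

```python
class AddressMap:
    """Mutable server_id -> IP mapping; entries may change at any time."""
    def __init__(self):
        self._map = {}
    def set(self, server_id, addr):
        self._map[server_id] = addr
    def find(self, server_id):
        return self._map.get(server_id)

class Pinger:
    """Pings immutable server IDs; translates to an address only when sending."""
    def __init__(self, address_map, send):
        self._address_map = address_map
        self._send = send  # hypothetical transport callback: addr -> bool
    def ping(self, server_id):
        addr = self._address_map.find(server_id)
        if addr is None:
            # No translation: treated like a network failure
            # (no route to the given address).
            return False
        return self._send(addr)

amap = AddressMap()
amap.set("srv-1", "10.0.0.1")
pinger = Pinger(amap, send=lambda addr: True)
ok = pinger.ping("srv-1")             # translated just before sending
missing = pinger.ping("srv-2")        # unknown ID: ping fails
amap.set("srv-1", "10.0.0.99")        # the IP changes...
ok_after_move = pinger.ping("srv-1")  # ...but the ID still resolves
```

Because the detector stores only IDs, an IP change requires updating the map, not the detector's alive set.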

Closes #11759

* github.com:scylladb/scylladb:
  direct_failure_detector: get rid of complex `endpoint_id` translations
  service/raft: ping `raft::server_id`s, not `gms::inet_address`es
  service/raft: store `raft_address_map` reference in `direct_fd_pinger`
  gms: gossiper: move `direct_fd_pinger` out to a separate service
  gms: gossiper: direct_fd_pinger: extract generation number caching to a separate class
2022-11-07 16:42:35 +01:00
Botond Dénes
2b572d94f5 Merge 'doc: improve the documentation landing page ' from Anna Stuchlik
This PR introduces the following changes to the documentation landing page:

- The "New to ScyllaDB? Start here!" box is added.
- The "Connect your application to Scylla" box is removed.
- Some wording has been improved.
- "Scylla" has been replaced with "ScyllaDB".

Closes #11896

* github.com:scylladb/scylladb:
  Update docs/index.rst
  doc: replace Scylla with ScyllaDB on the landing page
  doc: improve the wording on the landing page
  doc: add the link to the ScyllaDB Basics page to the documentation landing page
2022-11-07 16:18:59 +02:00
Avi Kivity
91f2cd5ac4 test: lib: exception_predicate: use boost::regex instead of std::regex
std::regex was observed to overflow stack on aarch64 in debug mode. Use
boost::regex until the libstdc++ bug[1] is fixed.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582

Closes #11888
2022-11-07 14:03:25 +02:00
Kamil Braun
0c7ff0d2cb docs: a single 5.0 -> 5.1 upgrade guide
There were 4 different pages for upgrading Scylla 5.0 to 5.1 (and the
same is true for other version pairs, but I digress) for different
environments:
- "ScyllaDB Image for EC2, GCP, and Azure"
- Ubuntu
- Debian
- RHEL/CentOS

The Ubuntu and Debian pages used a common template:
```
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
```
with different variable substitutions.

The "Image" page used a similar template, with some extra content in the
middle:
```
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
.. include:: /upgrade/_common/upgrade-image-opensource.rst
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
```

The RHEL/CentOS page used a different template:
```
.. include:: /upgrade/_common/upgrade-guide-v4-rpm.rst
```

This was an unmaintainable mess. Most of the content was "the same" for
each of these options. The only content that must actually be different
is the part with package installation instructions (e.g. calls to `yum`
vs `apt-get`). The rest of the content was logically the same - the
differences were mistakes, typos, and updates/fixes to the text that
were made in some of these docs but not others.

In this commit I prepare a single page that covers the upgrade and
rollback procedures for each of these options. The section dependent on
the system was implemented using Sphinx Tabs.

I also fixed and changed some parts:

- In the "Gracefully stop the node" section:
Ubuntu/Debian/Images pages had:

```rst
.. code:: sh

   sudo service scylla-server stop
```

RHEL/CentOS pages had:
```rst
.. code:: sh

.. include:: /rst_include/scylla-commands-stop-index.rst
```

the stop-index file contained this:
```rst
.. tabs::

   .. group-tab:: Supported OS

      .. code-block:: shell

         sudo systemctl stop scylla-server

   .. group-tab:: Docker

      .. code-block:: shell

         docker exec -it some-scylla supervisorctl stop scylla

      (without stopping *some-scylla* container)
```

So the RHEL/CentOS version had two tabs: one for Scylla installed
directly on the system, one for Scylla running in Docker - which is
interesting, because nothing anywhere else in the upgrade documents
mentions Docker.  Furthermore, the RHEL/CentOS version used `systemctl`
while the ubuntu/debian/images version used `service` to stop/start
scylla-server.  Both work on modern systems.

The Docker option is completely out of place - the rest of the upgrade
procedure does not mention Docker. So I decided it doesn't make sense to
include it. Docker documentation could be added later if we actually
decide to write upgrade documentation when using Docker...  Between
`systemctl` and `service` I went with `service` as it's a bit
higher-level.

- Similar change for "Start the node" section, and corresponding
  stop/start sections in the Rollback procedure.

- To reuse text for Ubuntu and Debian, when referencing "ScyllaDB deb
  repo" in the Debian/Ubuntu tabs, I provide two separate links: to
  Debian and Ubuntu repos.

- the link to the rollback procedure in the RPM guide (in the 'Download
  and install the new release' section) pointed to the rollback procedure
  from the 3.0 to 3.1 guide... Fixed to point to the current page's
  rollback procedure.

- in the rollback procedure steps summary, the RPM version missed the
  "Restore system tables" step.

- in the rollback procedure, the repository links were pointing to the
  new versions, while they should point to the old versions.

There are some other pre-existing problems I noticed that need fixing:

- EC2/GCP/Azure option has no corresponding coverage in the rollback
  section (Download and install the old release) as it has in the
  upgrade section. There is no guide for rolling back 3rd party and OS
  packages, only Scylla. I left a TODO in a comment.
- the repository links assume certain Debian and Ubuntu versions (Debian
  10 and Ubuntu 20), but there are more available options (e.g. Ubuntu
  22). Not sure how to deal with this problem. Maybe a separate section
  with links? Or just a generic link without choice of platform/version?

Closes #11891
2022-11-07 14:02:08 +02:00
Avi Kivity
9fa1783892 Merge 'cleanup compaction: flush memtable' from Benny Halevy
Flush the memtable before cleaning up the table so as not to leave any disowned tokens in it,
as they might otherwise be resurrected.

Fixes #1239

Closes #11902

* github.com:scylladb/scylladb:
  table: perform_cleanup_compaction: flush memtable
  table: add perform_cleanup_compaction
  api: storage_service: add logging for compaction operations et al
2022-11-07 13:18:12 +02:00
Anna Stuchlik
c8455abb71 Update docs/index.rst
Co-authored-by: Tzach Livyatan <tzach.livyatan@gmail.com>
2022-11-07 10:25:24 +01:00
AdamStawarz
6bc455ebea Update tombstones-flush.rst
change syntax:

nodetool compact <keyspace>.<mytable>;
to
nodetool compact <keyspace> <mytable>;

Closes #11904
2022-11-07 11:19:26 +02:00
Avi Kivity
224a2877b9 build: disable -Og in debug mode to avoid coroutine asan breakage
Coroutines and asan don't mix well on aarch64. This was seen in
22f13e7ca3 (" Revert "Merge 'cql3: select_statement: coroutinize
indexed_table_select_statement::do_execute_base_query()' from Avi
Kivity"") where a routine coroutinization was reverted due to failures
on aarch64 debug mode.

In clang 15 this is even worse, the existing code starts failing.
However, if we disable optimization (-O0 rather than -Og), things
begin to work again. In fact we can reinstate the patch reverted
above even with clang 12.

Fix (or rather work around) the problem by avoiding -Og on aarch64
debug mode. There's the lingering fear that release mode is
miscompiled too, but all the tests pass on clang 15 in release mode
so it appears related to asan.

Closes #11894
2022-11-07 10:55:13 +02:00
Benny Halevy
eb3a94e2bc table: perform_cleanup_compaction: flush memtable
We don't explicitly clean up the memtable, while
it might hold tokens disowned by the current node.

Flush the memtable before performing cleanup compaction
to make sure all tokens in the memtable are cleaned up.

Note that non-owned ranges are invalidated in the cache
in compaction_group::update_main_sstable_list_on_compaction_completion
using desc.ranges_for_cache_invalidation.

Fixes #1239

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-06 19:41:40 +02:00
Benny Halevy
fc278be6c4 table: add perform_cleanup_compaction
Move the integration with compaction_manager
from the api layer to the table class so
it can also make sure the memtable is cleaned up in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-06 19:41:33 +02:00
Benny Halevy
85523c45c0 api: storage_service: add logging for compaction operations et al
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-11-06 19:41:31 +02:00
Petr Gusev
44f48bea0f raft: test_remove_node_with_concurrent_ddl
The test runs remove_node command with background ddl workload.
It was written in an attempt to reproduce scylladb#11228 but seems to have
value on its own.

The if_exists parameter has been added to the add_table
and drop_table functions, since the driver could retry
the request sent to a removed node, but that request
might have already been completed.

Function wait_for_host_known waits until the information
about the node reaches the destination node. Since we add
new nodes at each iteration in main, this can take some time.

A number of abort-related options were added to
SCYLLA_CMDLINE_OPTIONS, as they simplify
nailing down problems.

Closes #11734
2022-11-04 17:16:35 +01:00
David Garcia
26bc53771c docs: automatic previews configuration
Closes #11591
2022-11-04 15:44:22 +02:00
Kamil Braun
e086521c1a direct_failure_detector: get rid of complex endpoint_id translations
The direct failure detector operates on abstract `endpoint_id`s for
pinging. The `pinger` interface is responsible for translating these IDs
to 'real' addresses.

Earlier we used two types of addresses: IP addresses in 'production'
code (`gms::gossiper::direct_fd_pinger`) and `raft::server_id`s in test
code (in `randomized_nemesis_test`). For each of these use cases we
would maintain mappings between `endpoint_id`s and the address type.

In recent commits we switched the 'production' code to also operate on
Raft server IDs, which are UUIDs underneath.

In this commit we switch `endpoint_id`s from `unsigned` type to
`utils::UUID`. Because each use case operates in Raft server IDs, we can
perform a simple translation: `raft_id.uuid()` to get an `endpoint_id`
from a Raft ID, `raft::server_id{ep_id}` to obtain a Raft ID from
an `endpoint_id`. We no longer have to maintain complex sharded data
structures to store the mappings.
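The simplified translation can be sketched as a toy Python round-trip (illustrative only; `ServerId` stands in for `raft::server_id`, and the UUID plays the role of `endpoint_id`):

```python
import uuid

class ServerId:
    """Toy stand-in for raft::server_id: a thin wrapper around a UUID."""
    def __init__(self, u):
        self.u = u
    def uuid(self):
        # Analogue of raft_id.uuid(): endpoint_id from a Raft ID.
        return self.u
    def __eq__(self, other):
        return self.u == other.u

raft_id = ServerId(uuid.uuid4())
ep_id = raft_id.uuid()        # endpoint_id obtained from the Raft ID
round_trip = ServerId(ep_id)  # Raft ID recovered from the endpoint_id
```

Since the conversion is a pure round-trip, no sharded lookup table is needed to relate the two ID types.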
2022-11-04 09:38:08 +01:00
Kamil Braun
bdeef77f20 service/raft: ping raft::server_ids, not gms::inet_addresses
Whenever a Raft configuration change is performed, `raft::server` calls
`raft_rpc::add_server`/`raft_rpc::remove_server`. Our `raft_rpc`
implementation has a function, `_on_server_update`, passed in the
constructor, which it called in `add_server`/`remove_server`;
that function would update the set of endpoints detected by the
direct failure detector. `_on_server_update` was passed an IP address
and that address was added to / removed from the failure detector set
(there's another translation layer between the IP addresses and internal
failure detector 'endpoint ID's; but we can ignore it for the purposes
of this commit).

Therefore: the failure detector was pinging a certain set of IP
addresses. These IP addresses were updated during Raft configuration
changes.

To implement the `is_alive(raft::server_id)` function (required by
`raft::failure_detector` interface), we would translate the ID using
the Raft address map, which is currently also updated during
configuration changes, to an IP address, and check if that IP address is
alive according to the direct failure detector (which maintained an
`_alive_set` of type `unordered_set<gms::inet_address>`).

This all works well but it assumes that servers can be identified using
IP addresses - it doesn't play well with the fact that servers may
change their IP addresses. The only immutable identifier we have for a
server is `raft::server_id`. In the future, Raft configurations will not
associate IP addresses with Raft servers; instead we will assume that IP
addresses can change at any time, and there will be a different
mechanism that eventually updates the Raft address map with the latest
IP address for each `raft::server_id`.

To prepare us for that future, in this commit we no longer operate in
terms of IP addresses in the failure detector, but in terms of
`raft::server_id`s. Most of the commit is boilerplate, changing
`gms::inet_address` to `raft::server_id` and function/variable names.
The interesting changes are:
- in `is_alive`, we no longer need to translate the `raft::server_id` to
  an IP address, because now the stored `_alive_set` already contains
  `raft::server_id`s instead of `gms::inet_address`es.
- the `ping` function now takes a `raft::server_id` instead of
  `gms::inet_address`. To send the ping message, we need to translate
  this to IP address; we do it by the `raft_address_map` pointer
  introduced in an earlier commit.

Thus, there is still a point where we have to translate between
`raft::server_id` and `gms::inet_address`; but observe we now do it at
the last possible moment - just before sending the message. If we
have no translation, we consider the `ping` to have failed - it's
equivalent to a network failure where no route to a given address was
found.
2022-11-04 09:38:08 +01:00
Kamil Braun
ac70a05c7e service/raft: store raft_address_map reference in direct_fd_pinger
The pinger will use the map to translate `raft::server_id`s to
`gms::inet_address`es when pinging.
2022-11-04 09:38:08 +01:00
Kamil Braun
2c20f2ab9d gms: gossiper: move direct_fd_pinger out to a separate service
In later commit `direct_fd_pinger` will operate in terms of
`raft::server_id`s. Decouple it from `gossiper` since we don't want to
entangle `gossiper` with Raft-specific stuff.
2022-11-04 09:38:08 +01:00
Kamil Braun
e9a4263e14 gms: gossiper: direct_fd_pinger: extract generation number caching to a separate class
`gms::gossiper::direct_fd_pinger` serves multiple purposes: one of them
is to maintain a mapping between `gms::inet_address`es and
`direct_failure_detector::pinger::endpoint_id`s, another is to cache the
last known gossiper's generation number to use it for sending gossip
echo messages. The latter is the only gossiper-specific thing in this
class.

We want to move `direct_fd_pinger` outside `gossiper`. To do that, split the
gossiper-specific thing -- the generation number management -- into a
smaller class, `echo_pinger`.

`echo_pinger` is a top-level class (not a nested one like
`direct_fd_pinger` was) so we can forward-declare it and pass references
to it without including gms/gossiper.hh header.
2022-11-04 09:38:08 +01:00
Avi Kivity
768d77d31b Update seastar submodule
* seastar f32ed00954...e0dabb361f (12):
  > sstring: define formatter
  > file: Dont violate API layering
  > Add compile_commands.json to gitignore
  > Merge 'Add an allocation failure metric' from Travis Downs
  > Use const test objects
  > Ragel chunk parser: compilation err, unused var
  > build: do not expose Valgrind in SeastarTargets.cmake
  > defer: mark deferred_* with [[nodiscard]]
  > Log selected reactor backend during startup
  > http: mark str with [[maybe_unused]]
  > Merge 'reactor: open fd without O_NONBLOCK when using io_uring backend' from Kefu Chai
  > reactor: add accept and connect to io_uring backend

Closes #11895
2022-11-04 09:27:56 +04:00
Anna Stuchlik
fb01565a15 doc: replace Scylla with ScyllaDB on the landing page 2022-11-03 17:42:49 +01:00
Anna Stuchlik
7410ab0132 doc: improve the wording on the landing page 2022-11-03 17:38:14 +01:00
Anna Stuchlik
ab5e48261b doc: add the link to the ScyllaDB Basics page to the documentation landing page 2022-11-03 17:31:03 +01:00
Pavel Emelyanov
efbfcdb97e Merge 'Replicate raft_address_map non-expiring entries to other shards' from Kamil Braun
Replicating `raft_address_map` entries is needed for the following use
cases:
- the direct failure detector - currently it assumes a static mapping of
  `raft::server_id`s to `gms::inet_address`es, which is obtained on Raft
  group 0 configuration changes. To handle dynamic mappings we need to
  modify the failure detector so it pings `raft::server_id`s and obtains
  the `gms::inet_address` before sending the message from
  `raft_address_map`. The failure detector is sharded, so we need the
  mappings to be available on all shards.
- in the future we'll have multiple Raft groups running on different
  shards. To send messages they'll need `raft_address_map`.

Initially I tried to replicate all entries - expiring and non-expiring.
The implementation turned out to be very complex - we need to handle
dropping expired entries and refreshing expiring entries' timestamps
across shards, and doing this correctly while accounting for possible
races is quite problematic.

Eventually I arrived at the conclusion that replicating only
non-expiring entries, and furthermore allowing non-expiring entries to
be added only on shard 0, is good enough for our use cases:
- The direct failure detector is pinging group 0 members only; group
  0 members correspond exactly to the non-expiring entries.
- Group 0 configuration changes are handled on shard 0, so non-expiring
  entries are added/removed on shard 0.
- When we have multiple Raft groups, we can reuse a single Raft server
  ID for all Raft servers running on a single node belonging to
  different groups; they are 'namespaced' by the group IDs. Furthermore,
  every node has a server that belongs to group 0. Thus for every Raft
  server in every group, it has a corresponding server in group 0 with
  the same ID, which has a non-expiring entry in `raft_address_map`,
  which is replicated to all shards; so every group will be able to
  deliver its messages.

With these assumptions the implementation is short and simple.
We can always complicate it in the future if we find that the
assumptions are too strong.
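The replication rule above can be sketched as a simplified toy model (hypothetical names; real entries would also carry expiry timestamps):

```python
# Non-expiring entries may only be added on shard 0 and are replicated
# to every shard's copy of the map; expiring entries stay local to the
# shard that learned them.

N_SHARDS = 4
shard_maps = [dict() for _ in range(N_SHARDS)]  # per-shard id -> address

def add_non_expiring(shard, server_id, addr):
    assert shard == 0, "non-expiring entries are only added on shard 0"
    for m in shard_maps:  # replicate to all shards
        m[server_id] = addr

def add_expiring(shard, server_id, addr):
    # Local only; a real implementation would drop it after a timeout.
    shard_maps[shard][server_id] = addr

add_non_expiring(0, "group0-member", "10.0.0.1")
add_expiring(2, "transient", "10.0.0.2")

visible_everywhere = all("group0-member" in m for m in shard_maps)
local_only = [i for i, m in enumerate(shard_maps) if "transient" in m]
```

Restricting cross-shard writes to shard 0 sidesteps the races that made replicating expiring entries so complex.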

Closes #11791

* github.com:scylladb/scylladb:
  test/raft: raft_address_map_test: add replication test
  service/raft: raft_address_map: replicate non-expiring entries to other shards
  service/raft: raft_address_map: assert when entry is missing in drop_expired_entries
  service/raft: turn raft_address_map into a service
2022-11-03 18:34:42 +03:00
Avi Kivity
ca2010144e test: loading_cache_test: fix use-after-free in test_loading_cache_remove_leaves_no_old_entries_behind
We capture `key` by reference, but it is in another continuation.

Capture it by value, and avoid the default capture specification.
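A loosely analogous hazard, sketched in Python rather than the C++ code itself: a closure that captures a variable late-bound sees whatever value it holds when the continuation finally runs, much like a C++ lambda holding a reference into another continuation; binding the value at creation time is the equivalent of capturing by value:

```python
# Late-bound capture: every closure shares the loop variable, so by the
# time the "continuations" run, they all observe its final value.
continuations = []
for key in ["a", "b"]:
    continuations.append(lambda: key)
late_bound = [c() for c in continuations]      # all see "b"

# Bind the value at creation time (the "capture by value" fix).
continuations_fixed = []
for key in ["a", "b"]:
    continuations_fixed.append(lambda key=key: key)
by_value = [c() for c in continuations_fixed]  # each sees its own value
```

In C++ the same mistake can outlive the referent entirely, turning the stale read into a use-after-free.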

Found by clang 15 + asan + aarch64.

Closes #11884
2022-11-03 17:23:40 +02:00
Avi Kivity
0c3967cf5e Merge 'scylla-gdb.py: improve scylla-fiber' from Botond Dénes
The main theme of this patchset is improving `scylla-fiber`, with some assorted unrelated improvements tagging along.
In lieu of explicit support from seastar for mapping continuation chains in memory (there is some, but it uses function calls), scylla fiber uses a quite crude method: it scans task objects for outbound references to other task objects to find waiter tasks, and scans inbound references from other tasks to find waited-on tasks. This works well for most objects, but there are some problematic ones:
* `seastar::thread_context`: the waited-on task (`seastar::(anonymous namespace)::thread_wake_task`) is allocated on the thread's stack, which is not part of the object itself. Scylla fiber now scans the stack bottom-up to find this task.
* `seastar::smp_message_queue::async_work_item`: the waited-on task lives on another shard. Scylla fiber now digs out the remote shard from the work item and continues the search on that shard.
* `seastar::when_all_state`: the waited-on task is a member of the same object, tripping loop detection and terminating the search. Scylla fiber now uses the `_continuation` member explicitly to look for the next links.

Other minor improvements were also done, like including the shard of the task in the printout.
Example demonstrating all the new additions:
```
(gdb) scylla fiber 0x000060002d650200
Stopping because loop is detected: task 0x000061c00385fb60 was seen before.
[shard 28] #-13 (task*) 0x000061c00385fba0 0x00000000003b5b00 vtable for seastar::internal::when_all_state_component<seastar::future<void> > + 16
[shard 28] #-12 (task*) 0x000061c00385fb60 0x0000000000417010 vtable for seastar::internal::when_all_state<seastar::internal::identity_futures_tuple<seastar::future<void>, seastar::future<void> >, seastar::future<void>, seastar::future<void> > + 16
[shard 28] #-11 (task*) 0x000061c009f16420 0x0000000000419830 _ZTVN7seastar12continuationINS_8internal22promise_base_with_typeIvEEZNS_6futureISt5tupleIJNS4_IvEES6_EEE14discard_resultEvEUlDpOT_E_ZNS8_14then_impl_nrvoISC_S6_EET0_OT_EUlOS3_RSC_ONS_12future_stateIS7_EEE_S7_EE + 16
[shard 28] #-10 (task*) 0x000061c0098e9e00 0x0000000000447440 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::smp_message_queue::async_work_item<seastar::sharded<cql_transport::cql_server>::stop()::{lambda(unsigned int)#1}::operator()(unsigned int)::{lambda()#1}>::run_and_dispose()::{lambda(auto:1)#1}, seastar::future<void>::then_wrapped_nrvo<void, seastar::smp_message_queue::async_work_item<seastar::sharded<cql_transport::cql_server>::stop()::{lambda(unsigned int)#1}::operator()(unsigned int)::{lambda()#1}> >(seastar::smp_message_queue::async_work_item<seastar::sharded<cql_transport::cql_server>::stop()::{lambda(unsigned int)#1}::operator()(unsigned int)::{lambda()#1}>&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, seastar::smp_message_queue::async_work_item<seastar::sharded<cql_transport::cql_server>::stop()::{lambda(unsigned int)#1}::operator()(unsigned int)::{lambda()#1}>&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #-9 (task*) 0x000060000858dcd0 0x0000000000449d68 vtable for seastar::smp_message_queue::async_work_item<seastar::sharded<cql_transport::cql_server>::stop()::{lambda(unsigned int)#1}::operator()(unsigned int)::{lambda()#1}> + 16
[shard  0] #-8 (task*) 0x0000600050c39f60 0x00000000007abe98 vtable for seastar::parallel_for_each_state + 16
[shard  0] #-7 (task*) 0x000060000a59c1c0 0x0000000000449f60 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::sharded<cql_transport::cql_server>::stop()::{lambda(seastar::future<void>)#2}, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, {lambda(seastar::future<void>)#2}>({lambda(seastar::future<void>)#2}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, {lambda(seastar::future<void>)#2}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #-6 (task*) 0x000060000a59c400 0x0000000000449ea0 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, cql_transport::controller::do_stop_server()::{lambda(std::unique_ptr<seastar::sharded<cql_transport::cql_server>, std::default_delete<seastar::sharded<cql_transport::cql_server> > >&)#1}::operator()(std::unique_ptr<seastar::sharded<cql_transport::cql_server>, std::default_delete<seastar::sharded<cql_transport::cql_server> > >&) const::{lambda()#1}::operator()() const::{lambda()#1}, seastar::future<void>::then_impl_nrvo<{lambda()#1}, {lambda()#1}>({lambda()#1}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, {lambda()#1}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #-5 (task*) 0x0000600009d86cc0 0x0000000000449c00 vtable for seastar::internal::do_with_state<std::tuple<std::unique_ptr<seastar::sharded<cql_transport::cql_server>, std::default_delete<seastar::sharded<cql_transport::cql_server> > > >, seastar::future<void> > + 16
[shard  0] #-4 (task*) 0x00006000019ffe20 0x00000000007ab368 vtable for seastar::(anonymous namespace)::thread_wake_task + 16
[shard  0] #-3 (task*) 0x00006000085ad080 0x0000000000809e18 vtable for seastar::thread_context + 16
[shard  0] #-2 (task*) 0x0000600009c04100 0x00000000006067f8 _ZTVN7seastar12continuationINS_8internal22promise_base_with_typeIvEEZNS_5asyncIZZN7service15storage_service5drainEvENKUlRS6_E_clES7_EUlvE_JEEENS_8futurizeINSt9result_ofIFNSt5decayIT_E4typeEDpNSC_IT0_E4typeEEE4typeEE4typeENS_17thread_attributesEOSD_DpOSG_EUlvE0_ZNS_6futureIvE14then_impl_nrvoIST_SV_EET0_SQ_EUlOS3_RST_ONS_12future_stateINS1_9monostateEEEE_vEE + 16
[shard  0] #-1 (task*) 0x000060000a59c080 0x0000000000606ae8 _ZTVN7seastar12continuationINS_8internal22promise_base_with_typeIvEENS_6futureIvE12finally_bodyIZNS_5asyncIZZN7service15storage_service5drainEvENKUlRS9_E_clESA_EUlvE_JEEENS_8futurizeINSt9result_ofIFNSt5decayIT_E4typeEDpNSF_IT0_E4typeEEE4typeEE4typeENS_17thread_attributesEOSG_DpOSJ_EUlvE1_Lb0EEEZNS5_17then_wrapped_nrvoIS5_SX_EENSD_ISG_E4typeEOT0_EUlOS3_RSX_ONS_12future_stateINS1_9monostateEEEE_vEE + 16
[shard  0] #0  (task*) 0x000060002d650200 0x0000000000606378 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::future<void>::finally_body<service::storage_service::run_with_api_lock<service::storage_service::drain()::{lambda(service::storage_service&)#1}>(seastar::basic_sstring<char, unsigned int, 15u, true>, service::storage_service::drain()::{lambda(service::storage_service&)#1}&&)::{lambda(service::storage_service&)#1}::operator()(service::storage_service&)::{lambda()#1}, false>, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, {lambda(service::storage_service&)#1}>({lambda(service::storage_service&)#1}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, {lambda(service::storage_service&)#1}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #1  (task*) 0x000060000bc40540 0x0000000000606d48 _ZTVN7seastar12continuationINS_8internal22promise_base_with_typeIvEENS_6futureIvE12finally_bodyIZNS_3smp9submit_toIZNS_7shardedIN7service15storage_serviceEE9invoke_onIZNSB_17run_with_api_lockIZNSB_5drainEvEUlRSB_E_EEDaNS_13basic_sstringIcjLj15ELb1EEEOT_EUlSF_E_JES5_EET1_jNS_21smp_submit_to_optionsESK_DpOT0_EUlvE_EENS_8futurizeINSt9result_ofIFSJ_vEE4typeEE4typeEjSN_SK_EUlvE_Lb0EEEZNS5_17then_wrapped_nrvoIS5_S10_EENSS_ISJ_E4typeEOT0_EUlOS3_RS10_ONS_12future_stateINS1_9monostateEEEE_vEE + 16
[shard  0] #2  (task*) 0x000060000332afc0 0x00000000006cb1c8 vtable for seastar::continuation<seastar::internal::promise_base_with_type<seastar::json::json_return_type>, api::set_storage_service(api::http_context&, seastar::httpd::routes&)::{lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)#38}::operator()(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >) const::{lambda()#1}, seastar::future<void>::then_impl_nrvo<{lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)#38}, {lambda()#1}<seastar::json::json_return_type> >({lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)#38}&&)::{lambda(seastar::internal::promise_base_with_type<seastar::json::json_return_type>&&, {lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)#38}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #3  (task*) 0x000060000a1af700 0x0000000000812208 vtable for seastar::continuation<seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >, seastar::httpd::function_handler::function_handler(std::function<seastar::future<seastar::json::json_return_type> (std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)> const&)::{lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}::operator()(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >) const::{lambda(seastar::json::json_return_type&&)#1}, seastar::future<seastar::json::json_return_type>::then_impl_nrvo<seastar::json::json_return_type&&, seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > > >(seastar::json::json_return_type&&)::{lambda(seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&, seastar::json::json_return_type&, seastar::future_state<seastar::json::json_return_type>&&)#1}, seastar::json::json_return_type> + 16
[shard  0] #4  (task*) 0x0000600009d86440 0x0000000000812228 vtable for seastar::continuation<seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >, seastar::httpd::function_handler::handle(seastar::basic_sstring<char, unsigned int, 15u, true> const&, std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)::{lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}, seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >::then_impl_nrvo<{lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}, seastar::future>({lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}&&)::{lambda(seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&, {lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}&, seastar::future_state<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&)#1}, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > > + 16
[shard  0] #5  (task*) 0x0000600009dba0c0 0x0000000000812f48 vtable for seastar::continuation<seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >, seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >::handle_exception<std::function<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > (std::__exception_ptr::exception_ptr)>&>(std::function<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > (std::__exception_ptr::exception_ptr)>&)::{lambda(auto:1&&)#1}, seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >::then_wrapped_nrvo<seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >, {lambda(auto:1&&)#1}>({lambda(auto:1&&)#1}&&)::{lambda(seastar::internal::promise_base_with_type<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&, {lambda(auto:1&&)#1}&, seastar::future_state<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&)#1}, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > > + 16
[shard  0] #6  (task*) 0x0000600026783ae0 0x00000000008118b0 vtable for seastar::continuation<seastar::internal::promise_base_with_type<bool>, seastar::httpd::connection::generate_reply(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)::{lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}, seastar::future<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >::then_impl_nrvo<{lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}, seastar::httpd::connection::generate_reply(std::unique_ptr<seastar::httpd::request, std::default_delete<seastar::httpd::request> >)::{lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}<bool> >({lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}&&)::{lambda(seastar::internal::promise_base_with_type<bool>&&, {lambda(std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> >)#1}&, seastar::future_state<std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > >&&)#1}, std::unique_ptr<seastar::httpd::reply, std::default_delete<seastar::httpd::reply> > > + 16
[shard  0] #7  (task*) 0x000060000a4089c0 0x0000000000811790 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::httpd::connection::read_one()::{lambda()#1}::operator()()::{lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<std::unique_ptr> >)#2}::operator()(std::default_delete<std::unique_ptr>) const::{lambda(std::default_delete<std::unique_ptr>)#1}::operator()(std::default_delete<std::unique_ptr>) const::{lambda(bool)#2}, seastar::future<bool>::then_impl_nrvo<{lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<std::unique_ptr> >)#2}, {lambda(std::default_delete<std::unique_ptr>)#1}<void> >({lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<std::unique_ptr> >)#2}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, {lambda(std::unique_ptr<seastar::httpd::request, std::default_delete<std::unique_ptr> >)#2}&, seastar::future_state<bool>&&)#1}, bool> + 16
[shard  0] #8  (task*) 0x000060000a5b16e0 0x0000000000811430 vtable for seastar::internal::do_until_state<seastar::httpd::connection::read()::{lambda()#1}, seastar::httpd::connection::read()::{lambda()#2}> + 16
[shard  0] #9  (task*) 0x000060000aec1080 0x00000000008116d0 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::httpd::connection::read()::{lambda(seastar::future<void>)#3}, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, {lambda(seastar::future<void>)#3}>({lambda(seastar::future<void>)#3}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, {lambda(seastar::future<void>)#3}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16
[shard  0] #10 (task*) 0x000060000b7d2900 0x0000000000811950 vtable for seastar::continuation<seastar::internal::promise_base_with_type<void>, seastar::future<void>::finally_body<seastar::httpd::connection::read()::{lambda()#4}, true>, seastar::future<void>::then_wrapped_nrvo<seastar::future<void>, seastar::httpd::connection::read()::{lambda()#4}>(seastar::httpd::connection::read()::{lambda()#4}&&)::{lambda(seastar::internal::promise_base_with_type<void>&&, seastar::httpd::connection::read()::{lambda()#4}&, seastar::future_state<seastar::internal::monostate>&&)#1}, void> + 16

Found no further pointers to task objects.
If you think there should be more, run `scylla fiber 0x000060002d650200 --verbose` to learn more.
Note that continuations across user-created seastar::promise<> objects are not detected by scylla-fiber.
```

Closes #11822

* github.com:scylladb/scylladb:
  scylla-gdb.py: collection_element: add support for boost::intrusive::list
  scylla-gdb.py: optional_printer: eliminate infinite loop
  scylla-gdb.py: scylla-fiber: add note about user-instantiated promise objects
  scylla-gdb.py: scylla-fiber: reject self-references when probing pointers
  scylla-gdb.py: scylla-fiber: add starting task to known tasks
  scylla-gdb.py: scylla-fiber: add support for walking over when_all
  scylla-gdb.py: add when_all_state to task type whitelist
  scylla-gdb.py: scylla-fiber: also print shard of tasks
  scylla-gdb.py: scylla-fiber: unify task printing
  scylla-gdb.py: scylla fiber: add support for walking over shards
  scylla-gdb.py: scylla fiber: add support for walking over seastar threads
  scylla-gdb.py: scylla-ptr: keep current thread context
  scylla-gdb.py: improve scylla column_families
  scylla-gdb.py: scylla_sstables.filename(): fix generation formatting
  scylla-gdb.py: improve schema_ptr
  scylla-gdb.py: scylla memory: restore compatibility with <= 5.1
2022-11-03 13:52:31 +02:00
Kamil Braun
2049962e11 Fix version numbers in upgrade page title
Closes #11878
2022-11-03 10:06:25 +02:00
Takuya ASADA
45789004a3 install-dependencies.sh: update node_exporter to 1.4.0
To fix CVE-2022-24675, we need a binary compiled with golang <= 1.18.1.
The only released version compiled with golang <= 1.18.1 is node_exporter
1.4.0, so we need to update to it.

See scylladb/scylla-enterprise#2317

Closes #11400

[avi: regenerated frozen toolchain]

Closes #11879
2022-11-03 10:15:22 +04:00
Yaron Kaikov
20110bdab4 configure.py: remove un-used tar files creation
Starting from https://github.com/scylladb/scylla-pkg/pull/3035, we
stopped uploading the old tar.gz archives to S3 and using them in
downstream jobs.

Hence, there is no point in building those tar.gz files anymore.

Closes #11865
2022-11-02 17:44:09 +02:00
Anna Stuchlik
d1f7cc99bc doc: fix the external links to the ScyllaDB University lesson about TTL
Closes #11876
2022-11-02 15:05:43 +02:00
Nadav Har'El
59fa8fe903 Merge 'doc: add the information about AArch64 support to Requirements' from Anna Stuchlik
Fix https://github.com/scylladb/scylla-doc-issues/issues/864

This PR:
- updates the introduction to add information about AArch64 and rewrite the content.
- replaces "Scylla" with "ScyllaDB".

Closes #11778

* github.com:scylladb/scylladb:
  Update docs/getting-started/system-requirements.rst
  doc: fix the link to the OS Support page
  doc: replace Scylla with ScyllaDB
  doc: update the info about supported architecture and rewrite the introduction
2022-11-02 11:18:20 +02:00
Anna Stuchlik
ea799ad8fd Update docs/getting-started/system-requirements.rst
Co-authored-by: Tzach Livyatan <tzach.livyatan@gmail.com>
2022-11-02 09:56:56 +01:00
Kamil Braun
db6cc035ed test/raft: raft_address_map_test: add replication test 2022-10-31 09:17:12 +01:00
Kamil Braun
7d84007fd5 service/raft: raft_address_map: replicate non-expiring entries to other shards
Replicating `raft_address_map` entries is needed for the following use
cases:
- the direct failure detector - currently it assumes a static mapping of
  `raft::server_id`s to `gms::inet_address`es, which is obtained on Raft
  group 0 configuration changes. To handle dynamic mappings we need to
  modify the failure detector so it pings `raft::server_id`s and obtains
  the `gms::inet_address` before sending the message from
  `raft_address_map`. The failure detector is sharded, so we need the
  mappings to be available on all shards.
- in the future we'll have multiple Raft groups running on different
  shards. To send messages they'll need `raft_address_map`.

Initially I tried to replicate all entries - expiring and non-expiring.
The implementation turned out to be very complex - we need to handle
dropping expired entries and refreshing expiring entries' timestamps
across shards, and doing this correctly while accounting for possible
races is quite problematic.

Eventually I arrived at the conclusion that replicating only
non-expiring entries, and furthermore allowing non-expiring entries to
be added only on shard 0, is good enough for our use cases:
- The direct failure detector is pinging group 0 members only; group
  0 members correspond exactly to the non-expiring entries.
- Group 0 configuration changes are handled on shard 0, so non-expiring
  entries are added/removed on shard 0.
- When we have multiple Raft groups, we can reuse a single Raft server
  ID for all Raft servers running on a single node belonging to
  different groups; they are 'namespaced' by the group IDs. Furthermore,
  every node has a server that belongs to group 0. Thus for every Raft
  server in every group, it has a corresponding server in group 0 with
  the same ID, which has a non-expiring entry in `raft_address_map`,
  which is replicated to all shards; so every group will be able to
  deliver its messages.

With these assumptions the implementation is short and simple.
We can always complicate it in the future if we find that the
assumptions are too strong.
2022-10-31 09:17:12 +01:00
Kamil Braun
acacbad465 service/raft: raft_address_map: assert when entry is missing in drop_expired_entries 2022-10-31 09:17:12 +01:00
Kamil Braun
159bb32309 service/raft: turn raft_address_map into a service 2022-10-31 09:17:10 +01:00
Botond Dénes
63a90cfb6c scylla-gdb.py: collection_element: add support for boost::intrusive::list 2022-10-31 08:18:20 +02:00
Botond Dénes
2fa1864174 scylla-gdb.py: optional_printer: eliminate infinite loop
Currently, to_string() recursively calls itself for engaged optionals,
causing an infinite loop. Eliminate the recursion. Also, use the
std_optional wrapper instead of accessing std::optional internals directly.
2022-10-31 08:18:20 +02:00
Botond Dénes
77b2555a04 scylla-gdb.py: scylla-fiber: add note about user-instantiated promise objects
Scylla fiber uses a crude method of scanning inbound and outbound
references to/from other task objects of recognized type. This method
cannot detect user-instantiated promise<> objects. Add a note about this
to the printout, so users are aware of this.
2022-10-31 08:18:20 +02:00
Botond Dénes
2276565a2e scylla-gdb.py: scylla-fiber: reject self-references when probing pointers
A self-reference is never the pointer we are looking for when looking
for other tasks referencing us. Reject such references when scanning
outright.
2022-10-31 08:18:20 +02:00
Botond Dénes
f4365dd7f5 scylla-gdb.py: scylla-fiber: add starting task to known tasks
We collect already seen tasks in a set to be able to detect perceived
task loops and stop when one is seen. Initialize this set with the
starting task, so if it forms a loop, we won't repeat it in the trace
before cutting the loop.
2022-10-31 08:18:20 +02:00
Botond Dénes
48bbf2e467 scylla-gdb.py: scylla-fiber: add support for walking over when_all 2022-10-31 08:18:20 +02:00
Botond Dénes
cb8f02e24b scylla-gdb.py: add when_all_state to task type whitelist 2022-10-31 08:18:20 +02:00
Botond Dénes
62621abc44 scylla-gdb.py: scylla-fiber: also print shard of tasks
Now that scylla-fiber can cross shards, it is important to display the
shard each task in the chain lives on.
2022-10-31 08:18:19 +02:00
Botond Dénes
c21c80f711 scylla-gdb.py: scylla-fiber: unify task printing
Currently there are two loops and a separate line printing the starting
task, all duplicating the formatting logic. Define a method for it and
use it in all 3 places instead.
2022-10-31 08:18:19 +02:00
Botond Dénes
c103280bfd scylla-gdb.py: scylla fiber: add support for walking over shards
Shard boundaries can be crossed in one direction currently: when looking
for waiters on a task, but not in the other direction (looking for
waited-on tasks). This patch fixes that.
2022-10-31 08:18:19 +02:00
Botond Dénes
437f888ba0 scylla-gdb.py: scylla fiber: add support for walking over seastar threads
Currently seastar threads end any attempt to follow waited-on futures.
Seastar threads need special handling because they allocate the wake-up
task on their stack. This patch adds that special handling.
2022-10-31 08:18:19 +02:00
Botond Dénes
fcc63965ed scylla-gdb.py: scylla-ptr: keep current thread context
scylla_ptr.analyze() switches to the thread the analyzed object lives
on, but forgets to switch back. This was very annoying as any commands
using it (which is a bunch of them) were prone to suddenly and
unexpectedly switching threads.
This patch makes sure that the original thread context is switched back
to after analyzing the pointer.
2022-10-31 08:18:19 +02:00
Botond Dénes
91516c1d68 scylla-gdb.py: improve scylla column_families
Rename it to scylla tables. Less typing and more up-to-date.
By default it now only lists tables from the local shard. Added the -a
flag, which brings back the old behaviour (listing on all shards).
Added the -u (only list user tables) and -k (only list tables of the
provided keyspace) filtering options.
2022-10-31 08:18:19 +02:00
Botond Dénes
1d3d613b76 scylla-gdb.py: scylla_sstables.filename(): fix generation formatting
Generation was recently converted from an integer to an object. Update
the filename formatting, while keeping backward compatibility.
2022-10-31 08:18:19 +02:00
Botond Dénes
c869f54742 scylla-gdb.py: improve schema_ptr
Add __getitem__(), so members can be accessed.
Strip " from ks_name and cf_name.
Add is_system().
2022-10-31 08:18:19 +02:00
Botond Dénes
66832af233 scylla-gdb.py: scylla memory: restore compatibility with <= 5.1
Recent reworks around dirty memory manager broke backward compatibility
of the scylla memory command (and possibly others). This patch restores
it.
2022-10-31 08:18:19 +02:00
Pavel Emelyanov
7b193ab0a5 messaging_service: Deny putting INADDR_ANY as preferred ip
Even though the previous patch makes scylla not gossip this as internal_ip,
an extra sanity check may still be useful. E.g. older versions of scylla
may still do it, or this address may be loaded from system_keyspace.

refs: #11502

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-27 14:25:43 +03:00
Pavel Emelyanov
aa7a759ac9 messaging_service: Toss preferred ip cache management
Make it call cache_preferred_ip() even when the cache is loaded from
system_keyspace and move the connection reset there. This is mainly to
prepare for the next patch, but also makes the code a bit shorter.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-27 14:25:43 +03:00
Pavel Emelyanov
91b460f1c4 gossiping_property_file_snitch: Dont gossip INADDR_ANY preferred IP
Gossiping 0.0.0.0 as the preferred IP may break the peer, as it will
"interpret" this address as <myself>, which is not what the peer expects.
However, g.p.f.s. uses the --listen-address argument as the internal IP,
and it's not prohibited to configure it to be 0.0.0.0.

It's better not to gossip the INTERNAL_IP property at all if the listen
address is 0.0.0.0.

fixes: #11502

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-27 14:25:43 +03:00
Pavel Emelyanov
99579bd186 gossiping_property_file_snitch: Make _listen_address optional
As preparation for the next patch.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-27 14:15:26 +03:00
Michał Radwański
36508bf5e9 serializer_impl: remove unneeded generic parameter
The input stream used in vector_deserializer doesn't need to be generic,
as only one implementation is used.
2022-10-24 17:21:38 +02:00
Anna Stuchlik
9f7536d549 doc: fix the link to the OS Support page 2022-10-13 15:36:51 +02:00
Anna Stuchlik
1fd1ce042a doc: replace Scylla with ScyllaDB 2022-10-13 15:21:46 +02:00
Anna Stuchlik
81ce7a88de doc: update the info about supported architecture and rewrite the introduction 2022-10-13 15:18:29 +02:00
Anna Stuchlik
3950a1cac8 doc: apply the feedback to improve clarity 2022-10-03 11:14:51 +02:00
Anna Stuchlik
46f0e99884 doc: add the link to the new Troubleshooting section and replace Scylla with ScyllaDB 2022-09-23 11:46:15 +02:00
Anna Stuchlik
af2a85b191 doc: add the new page to the toctree 2022-09-23 11:37:38 +02:00
Anna Stuchlik
b034e2856e doc: add a troubleshooting article about the missing configuration files 2022-09-23 11:17:18 +02:00
Anna Stuchlik
260f85643d doc: specify the recommended AWS instance types 2022-08-08 14:35:54 +02:00
Anna Stuchlik
2c69a8f458 doc: replace the tables with a generic description of support for Im4gn and Is4gen instances 2022-08-08 14:17:59 +02:00
Anna Stuchlik
ceaf0c41bd doc: add support for AWS i4g instances 2022-08-05 17:18:44 +02:00
Anna Stuchlik
7711436577 doc: extend the list of supported CPUs 2022-08-05 16:55:40 +02:00
Anna Stuchlik
844c875f15 doc: add info about the time-consuming step due to resharding 2022-07-26 14:52:11 +02:00
Anna Stuchlik
ff5c4a33f5 doc: add the new KB to the toctree 2022-07-25 14:29:33 +02:00
Anna Stuchlik
f1daef4b1b doc: add a KB about updating the mode in perftune.yaml after upgrade 2022-07-25 14:22:02 +02:00
921 changed files with 44309 additions and 20757 deletions

.github/CODEOWNERS

@@ -12,7 +12,7 @@ test/cql/cdc_* @kbr- @elcallio @piodul @jul-stas
test/boost/cdc_* @kbr- @elcallio @piodul @jul-stas
# COMMITLOG / BATCHLOG
db/commitlog/* @elcallio
db/commitlog/* @elcallio @eliransin
db/batch* @elcallio
# COORDINATOR
@@ -25,7 +25,7 @@ compaction/* @raphaelsc @nyh
transport/*
# CQL QUERY LANGUAGE
cql3/* @tgrabiec @psarna @cvybhu
cql3/* @tgrabiec @cvybhu @nyh
# COUNTERS
counters* @jul-stas
@@ -33,7 +33,7 @@ tests/counter_test* @jul-stas
# DOCS
docs/* @annastuchlik @tzach
docs/alternator @annastuchlik @tzach @nyh @psarna
docs/alternator @annastuchlik @tzach @nyh @havaker @nuivall
# GOSSIP
gms/* @tgrabiec @asias
@@ -45,9 +45,9 @@ dist/docker/*
utils/logalloc* @tgrabiec
# MATERIALIZED VIEWS
db/view/* @nyh @psarna
cql3/statements/*view* @nyh @psarna
test/boost/view_* @nyh @psarna
db/view/* @nyh @cvybhu @piodul
cql3/statements/*view* @nyh @cvybhu @piodul
test/boost/view_* @nyh @cvybhu @piodul
# PACKAGING
dist/* @syuu1228
@@ -62,9 +62,9 @@ service/migration* @tgrabiec @nyh
schema* @tgrabiec @nyh
# SECONDARY INDEXES
db/index/* @nyh @psarna
cql3/statements/*index* @nyh @psarna
test/boost/*index* @nyh @psarna
index/* @nyh @cvybhu @piodul
cql3/statements/*index* @nyh @cvybhu @piodul
test/boost/*index* @nyh @cvybhu @piodul
# SSTABLES
sstables/* @tgrabiec @raphaelsc @nyh
@@ -74,11 +74,11 @@ streaming/* @tgrabiec @asias
service/storage_service.* @tgrabiec @asias
# ALTERNATOR
alternator/* @nyh @psarna
test/alternator/* @nyh @psarna
alternator/* @nyh @havaker @nuivall
test/alternator/* @nyh @havaker @nuivall
# HINTED HANDOFF
db/hints/* @piodul @vladzcloudius
db/hints/* @piodul @vladzcloudius @eliransin
# REDIS
redis/* @nyh @syuu1228


@@ -0,0 +1,17 @@
name: "Docs / Amplify enhanced"
on: issue_comment
jobs:
build:
runs-on: ubuntu-latest
if: ${{ github.event.issue.pull_request }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Amplify enhanced
env:
TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: scylladb/sphinx-scylladb-theme/.github/actions/amplify-enhanced@master

.gitmodules

@@ -1,17 +1,11 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui
url = ../scylla-swagger-ui
ignore = dirty
[submodule "libdeflate"]
path = libdeflate
url = ../libdeflate
[submodule "abseil"]
path = abseil
url = ../abseil-cpp
[submodule "scylla-jmx"]
path = tools/jmx
url = ../scylla-jmx


@@ -42,22 +42,13 @@ set(Seastar_CXX_FLAGS ${cxx_coro_flag} ${target_arch_flag} CACHE INTERNAL "" FOR
set(Seastar_CXX_DIALECT gnu++20 CACHE INTERNAL "" FORCE)
add_subdirectory(seastar)
add_subdirectory(abseil)
# Exclude absl::strerror from the default "all" target since it's not
# used in Scylla build and, moreover, makes use of deprecated glibc APIs,
# such as sys_nerr, which are not exposed from "stdio.h" since glibc 2.32,
# which happens to be the case for recent Fedora distribution versions.
#
# Need to use the internal "absl_strerror" target name instead of namespaced
# variant because `set_target_properties` does not understand the latter form,
# unfortunately.
set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)
# System libraries dependencies
find_package(Boost COMPONENTS filesystem program_options system thread regex REQUIRED)
find_package(Lua REQUIRED)
find_package(ZLIB REQUIRED)
find_package(ICU COMPONENTS uc REQUIRED)
find_package(Abseil REQUIRED)
set(scylla_build_dir "${CMAKE_BINARY_DIR}/build/${BUILD_TYPE}")
set(scylla_gen_build_dir "${scylla_build_dir}/gen")
@@ -746,7 +737,6 @@ target_compile_definitions(scylla PRIVATE XXH_PRIVATE_API HAVE_LZ4_COMPRESS_DEFA
target_include_directories(scylla PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}"
libdeflate
abseil
"${scylla_gen_build_dir}")
###


@@ -34,7 +34,7 @@ END
DATE=""
while [[ $# -gt 0 ]]; do
while [ $# -gt 0 ]; do
opt="$1"
case $opt in
-h|--help)
@@ -72,7 +72,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.2.0-dev
VERSION=5.2.19
if test -f version
then

abseil

Submodule abseil deleted from 7f3c0d7811


@@ -141,7 +141,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
service::storage_proxy::coordinator_query_result qr = co_await proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
service::storage_proxy::coordinator_query_options(executor::default_timeout(), empty_service_permit(), client_state));
cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
auto result_set = builder.build();


@@ -23,7 +23,7 @@ namespace alternator {
// api_error into a JSON object, and that is returned to the user.
class api_error final : public std::exception {
public:
using status_type = httpd::reply::status_type;
using status_type = http::reply::status_type;
status_type _http_code;
std::string _type;
std::string _msg;
@@ -77,7 +77,7 @@ public:
return api_error("TableNotFoundException", std::move(msg));
}
static api_error internal(std::string msg) {
return api_error("InternalServerError", std::move(msg), reply::status_type::internal_server_error);
return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error);
}
// Provide the "std::exception" interface, to make it easier to print this


@@ -34,6 +34,7 @@
#include "expressions.hh"
#include "conditions.hh"
#include "cql3/constants.hh"
#include "cql3/util.hh"
#include <optional>
#include "utils/overloaded_functor.hh"
#include <seastar/json/json_elements.hh>
@@ -87,17 +88,20 @@ json::json_return_type make_streamed(rjson::value&& value) {
// move objects to coroutine frame.
auto los = std::move(os);
auto lrs = std::move(rs);
std::exception_ptr ex;
try {
co_await rjson::print(*lrs, los);
co_await los.flush();
co_await los.close();
} catch (...) {
// at this point, we cannot really do anything. HTTP headers and return code are
// already written, and quite potentially a portion of the content data.
// just log + rethrow. It is probably better the HTTP server closes connection
// abruptly or something...
elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
throw;
ex = std::current_exception();
elogger.error("Exception during streaming HTTP response: {}", ex);
}
co_await los.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return;
};
@@ -760,7 +764,6 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
co_return api_error::access_denied("Incorrect resource identifier");
}
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
const rjson::value* tags = rjson::find(request, "Tags");
if (!tags || !tags->IsArray()) {
co_return api_error::validation("Cannot parse tags");
@@ -768,8 +771,9 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
if (tags->Size() < 1) {
co_return api_error::validation("The number of tags must be at least 1") ;
}
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
co_await db::update_tags(_mm, schema, std::move(tags_map));
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
});
co_return json_string("");
}
@@ -787,9 +791,9 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
co_await db::update_tags(_mm, schema, std::move(tags_map));
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
});
co_return json_string("");
}
@@ -927,9 +931,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
}
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -984,9 +989,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
// Note above we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -2302,7 +2308,7 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(*cell, **column_it));
}
} else if (cell) {
auto deserialized = attrs_type()->deserialize(*cell, cql_serialization_format::latest());
auto deserialized = attrs_type()->deserialize(*cell);
auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
for (auto entry : keys_and_values) {
std::string attr_name = value_cast<sstring>(entry.first);
@@ -2337,7 +2343,7 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
const std::optional<attrs_to_get>& attrs_to_get) {
rjson::value item = rjson::empty_object();
cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
cql3::selection::result_set_builder builder(selection, gc_clock::now());
query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
auto result_set = builder.build();
@@ -2355,21 +2361,22 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
return item;
}
std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
const query::partition_slice& slice,
const cql3::selection::selection& selection,
const query::result& query_result,
const std::optional<attrs_to_get>& attrs_to_get) {
cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get) {
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
auto result_set = builder.build();
std::vector<rjson::value> ret;
for (auto& result_row : result_set->rows()) {
rjson::value item = rjson::empty_object();
describe_single_item(selection, result_row, attrs_to_get, item);
describe_single_item(*selection, result_row, *attrs_to_get, item);
ret.push_back(std::move(item));
co_await coroutine::maybe_yield();
}
return ret;
co_return ret;
}
static bool check_needs_read_before_write(const parsed::value& v) {
@@ -3251,8 +3258,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
[schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get));
});
response_futures.push_back(std::move(f));
}
@@ -3508,7 +3514,7 @@ public:
rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it));
}
} else {
auto deserialized = attrs_type()->deserialize(bv, cql_serialization_format::latest());
auto deserialized = attrs_type()->deserialize(bv);
auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
for (auto entry : keys_and_values) {
std::string attr_name = value_cast<sstring>(entry.first);
@@ -3642,7 +3648,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
if (exclusive_start_key) {
partition_key pk = pk_from_json(*exclusive_start_key, schema);
auto pos = position_in_partition(position_in_partition::partition_start_tag_t());
auto pos = position_in_partition::for_partition_start();
if (schema->clustering_key_size() > 0) {
pos = pos_from_json(*exclusive_start_key, schema);
}

View File

@@ -222,11 +222,11 @@ public:
const query::result&,
const std::optional<attrs_to_get>&);
static std::vector<rjson::value> describe_multi_item(schema_ptr schema,
const query::partition_slice& slice,
const cql3::selection::selection& selection,
const query::result& query_result,
const std::optional<attrs_to_get>& attrs_to_get);
static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);
static void describe_single_item(const cql3::selection::selection&,
const std::vector<bytes_opt>&,

View File

@@ -73,7 +73,7 @@ struct from_json_visitor {
}
// default
void operator()(const abstract_type& t) const {
bo.write(from_json_object(t, v, cql_serialization_format::internal()));
bo.write(from_json_object(t, v));
}
};
@@ -279,7 +279,7 @@ position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema)
return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(std::move(ck)) : std::nullopt);
}
if (ck.is_empty()) {
return position_in_partition(position_in_partition::partition_start_tag_t());
return position_in_partition::for_partition_start();
}
return position_in_partition::for_key(std::move(ck));
}

View File

@@ -28,6 +28,8 @@
static logging::logger slogger("alternator-server");
using namespace httpd;
using request = http::request;
using reply = http::reply;
namespace alternator {

View File

@@ -27,7 +27,7 @@ using chunked_content = rjson::chunked_content;
class server {
static constexpr size_t content_length_limit = 16*MB;
using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<http::request>)>;
using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;
http_server _http_server;
@@ -76,8 +76,8 @@ public:
private:
void set_routes(seastar::httpd::routes& r);
// If verification succeeds, returns the authenticated user's username
future<std::string> verify_signature(const seastar::httpd::request&, const chunked_content&);
future<executor::request_return_type> handle_api_request(std::unique_ptr<request> req);
future<std::string> verify_signature(const seastar::http::request&, const chunked_content&);
future<executor::request_return_type> handle_api_request(std::unique_ptr<http::request> req);
};
}

View File

@@ -145,19 +145,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
auto table = find_table(_proxy, request);
auto db = _proxy.data_dictionary();
auto cfs = db.get_tables();
auto i = cfs.begin();
auto e = cfs.end();
if (limit < 1) {
throw api_error::validation("Limit must be 1 or more");
}
// TODO: the unordered_map here is not really well suited for partial
// querying - we're sorting on local hash order, and creating a table
// between queries may or may not miss info. But that should be rare,
// and we can probably expect this to be a single call.
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
// generate duplicates in a paged listing here. Can obviously miss things if they
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
// is to be expected.
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
return t1.schema()->id().uuid() < t2.schema()->id().uuid();
});
auto i = cfs.begin();
auto e = cfs.end();
if (streams_start) {
i = std::find_if(i, e, [&](data_dictionary::table t) {
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
return t.schema()->id().uuid() == streams_start
&& cdc::get_base_table(db.real_database(), *t.schema())
&& is_alternator_keyspace(t.schema()->ks_name())
@@ -883,7 +888,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
[this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {
cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
auto result_set = builder.build();

View File

@@ -8,6 +8,7 @@
#include <chrono>
#include <cstdint>
#include <exception>
#include <optional>
#include <seastar/core/sstring.hh>
#include <seastar/core/coroutine.hh>
@@ -17,6 +18,7 @@
#include <seastar/coroutine/maybe_yield.hh>
#include <boost/multiprecision/cpp_int.hpp>
#include "exceptions/exceptions.hh"
#include "gms/gossiper.hh"
#include "gms/inet_address.hh"
#include "inet_address_vectors.hh"
@@ -92,24 +94,25 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
}
sstring attribute_name(v->GetString(), v->GetStringLength());
std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
if (enabled) {
if (tags_map.contains(TTL_TAG_KEY)) {
co_return api_error::validation("TTL is already enabled");
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
if (enabled) {
if (tags_map.contains(TTL_TAG_KEY)) {
throw api_error::validation("TTL is already enabled");
}
tags_map[TTL_TAG_KEY] = attribute_name;
} else {
auto i = tags_map.find(TTL_TAG_KEY);
if (i == tags_map.end()) {
throw api_error::validation("TTL is already disabled");
} else if (i->second != attribute_name) {
throw api_error::validation(format(
"Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
attribute_name, i->second));
}
tags_map.erase(TTL_TAG_KEY);
}
tags_map[TTL_TAG_KEY] = attribute_name;
} else {
auto i = tags_map.find(TTL_TAG_KEY);
if (i == tags_map.end()) {
co_return api_error::validation("TTL is already disabled");
} else if (i->second != attribute_name) {
co_return api_error::validation(format(
"Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
attribute_name, i->second));
}
tags_map.erase(TTL_TAG_KEY);
}
co_await db::update_tags(_mm, schema, std::move(tags_map));
});
// Prepare the response, which contains a TimeToLiveSpecification
// basically identical to the request's
rjson::value response = rjson::empty_object();
@@ -548,13 +551,34 @@ static future<> scan_table_ranges(
co_return;
}
auto units = co_await get_units(page_sem, 1);
// We don't to limit page size in number of rows because there is a
// builtin limit of the page's size in bytes. Setting this limit to 1
// is useful for debugging the paging code with moderate-size data.
// We don't need to limit page size in number of rows because there is
// a builtin limit of the page's size in bytes. Setting this limit to
// 1 is useful for debugging the paging code with moderate-size data.
uint32_t limit = std::numeric_limits<uint32_t>::max();
// FIXME: which timeout?
// FIXME: if read times out, need to retry it.
std::unique_ptr<cql3::result_set> rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
// Read a page, and if that times out, try again after a small sleep.
// If we didn't catch the timeout exception, the scan would be aborted
// and only restarted at the next scanning period.
// If we retry too many times, give up and restart the scan later.
std::unique_ptr<cql3::result_set> rs;
for (int retries=0; ; retries++) {
try {
// FIXME: which timeout?
rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
break;
} catch(exceptions::read_timeout_exception&) {
tlogger.warn("expiration scanner read timed out, will retry: {}",
std::current_exception());
}
// If we didn't break out of this loop, add a minimal sleep
if (retries >= 10) {
// Don't get stuck forever asking the same page, maybe there's
// a bug or a real problem in several replicas. Give up on
// this scan and retry it from a random position later,
// in the next scan period.
throw std::runtime_error("scanner thread failed after too many timeouts for the same page");
}
co_await sleep_abortable(std::chrono::seconds(1), abort_source);
}
auto rows = rs->rows();
auto meta = rs->get_metadata().get_names();
std::optional<unsigned> expiration_column;

amplify.yml Normal file
View File

@@ -0,0 +1,15 @@
version: 1
applications:
- frontend:
phases:
build:
commands:
- make setupenv
- make dirhtml
artifacts:
baseDirectory: _build/dirhtml
files:
- '**/*'
cache:
paths: []
appRoot: docs

api/api-doc/raft.json Normal file
View File

@@ -0,0 +1,43 @@
{
"apiVersion":"0.0.1",
"swaggerVersion":"1.2",
"basePath":"{{Protocol}}://{{Host}}",
"resourcePath":"/raft",
"produces":[
"application/json"
],
"apis":[
{
"path":"/raft/trigger_snapshot/{group_id}",
"operations":[
{
"method":"POST",
"summary":"Triggers snapshot creation and log truncation for the given Raft group",
"type":"string",
"nickname":"trigger_snapshot",
"produces":[
"application/json"
],
"parameters":[
{
"name":"group_id",
"description":"The ID of the group which should get snapshotted",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"timeout",
"description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
"required":false,
"allowMultiple":false,
"type":"long",
"paramType":"query"
}
]
}
]
}
]
}

View File

@@ -1946,7 +1946,7 @@
"operations":[
{
"method":"POST",
"summary":"Reset local schema",
"summary":"Forces this node to recalculate versions of schema objects.",
"type":"void",
"nickname":"reset_local_schema",
"produces":[

View File

@@ -148,7 +148,34 @@
]
}
]
}
},
{
"path":"/task_manager/task_status_recursive/{task_id}",
"operations":[
{
"method":"GET",
"summary":"Get statuses of the task and all its descendants",
"type":"array",
"items":{
"type":"task_status"
},
"nickname":"get_task_status_recursively",
"produces":[
"application/json"
],
"parameters":[
{
"name":"task_id",
"description":"The uuid of a task to query about",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
}
],
"models":{
"task_stats" :{
@@ -168,6 +195,26 @@
"failed"
],
"description":"The state of a task"
},
"type":{
"type":"string",
"description":"The description of the task"
},
"keyspace":{
"type":"string",
"description":"The keyspace the task is working on (if applicable)"
},
"table":{
"type":"string",
"description":"The table the task is working on (if applicable)"
},
"entity":{
"type":"string",
"description":"Task-specific entity description"
},
"sequence_number":{
"type":"long",
"description":"The running sequence number of the task"
}
}
},
@@ -244,6 +291,13 @@
"progress_completed":{
"type":"double",
"description":"The number of units completed so far"
},
"children_ids":{
"type":"array",
"items":{
"type":"string"
},
"description":"Task IDs of children of this task"
}
}
}

View File

@@ -86,14 +86,6 @@
"type":"string",
"paramType":"query"
},
{
"name":"type",
"description":"The type of the task",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"entity",
"description":"Task-specific entity description",

View File

@@ -31,6 +31,7 @@
#include "api/config.hh"
#include "task_manager.hh"
#include "task_manager_test.hh"
#include "raft.hh"
logging::logger apilog("api");
@@ -277,6 +278,18 @@ future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::confi
#endif
future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) {
auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) {
rb->register_function(r, "raft", "The Raft API");
set_raft(ctx, r, raft_gr);
});
}
future<> unset_server_raft(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
}
void req_params::process(const request& req) {
// Process mandatory parameters
for (auto& [name, ent] : params) {

View File

@@ -14,11 +14,15 @@
#include "tasks/task_manager.hh"
#include "seastarx.hh"
using request = http::request;
using reply = http::reply;
namespace service {
class load_meter;
class storage_proxy;
class storage_service;
class raft_group_registry;
} // namespace service
@@ -113,5 +117,7 @@ future<> set_server_compaction_manager(http_context& ctx);
future<> set_server_done(http_context& ctx);
future<> set_server_task_manager(http_context& ctx);
future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::config> cfg);
future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
future<> unset_server_raft(http_context&);
}

View File

@@ -334,13 +334,13 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](replica::column_family& cf) {
return cf.active_memtable().partition_count();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
}, std::plus<>());
});
cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, uint64_t{0}, [](replica::column_family& cf) {
return cf.active_memtable().partition_count();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
}, std::plus<>());
});
@@ -354,25 +354,33 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return cf.active_memtable().region().occupancy().total_space();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().total_space();
}), uint64_t(0));
}, std::plus<int64_t>());
});
cf::get_all_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.active_memtable().region().occupancy().total_space();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().total_space();
}), uint64_t(0));
}, std::plus<int64_t>());
});
cf::get_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return cf.active_memtable().region().occupancy().used_space();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().used_space();
}), uint64_t(0));
}, std::plus<int64_t>());
});
cf::get_all_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.active_memtable().region().occupancy().used_space();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().used_space();
}), uint64_t(0));
}, std::plus<int64_t>());
});
@@ -410,7 +418,9 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_all_cf_all_memtables_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
warn(unimplemented::cause::INDEXES);
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.active_memtable().region().occupancy().used_space();
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().used_space();
}), uint64_t(0));
}, std::plus<int64_t>());
});
@@ -529,13 +539,13 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
return cf.estimate_pending_compactions();
}, std::plus<int64_t>());
});
cf::get_all_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
return cf.estimate_pending_compactions();
}, std::plus<int64_t>());
});

View File

@@ -41,7 +41,6 @@ static std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_ha
return std::move(a);
}
void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
return ctx.db.map_reduce0([](replica::database& db) {
@@ -68,9 +67,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<request> req) {
return ctx.db.map_reduce0([&ctx](replica::database& db) {
return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&ctx, &db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
replica::table& cf = *i.second.get();
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
return make_ready_future<>();
}).then([&tasks] {
return std::move(tasks);
@@ -119,7 +118,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
auto& cm = db.get_compaction_manager();
return parallel_for_each(table_names, [&db, &cm, &ks_name, type] (sstring& table_name) {
auto& t = db.find_column_family(ks_name, table_name);
return cm.stop_compaction(type, &t.as_table_state());
return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
return cm.stop_compaction(type, &ts);
});
});
});
co_return json_void();
@@ -127,7 +128,7 @@ void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_pending_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
return cf.estimate_pending_compactions();
}, std::plus<int64_t>());
});

View File

@@ -25,7 +25,7 @@ void set_endpoint_snitch(http_context& ctx, routes& r, sharded<locator::snitch_p
httpd::endpoint_snitch_info_json::get_datacenter.set(r, [&ctx](const_req req) {
auto& topology = ctx.shared_token_metadata.local().get()->get_topology();
auto ep = host_or_broadcast(req);
if (!topology.has_endpoint(ep, locator::topology::pending::yes)) {
if (!topology.has_endpoint(ep)) {
// Cannot return error here, nodetool status can race, request
// info about just-left node and not handle it nicely
return sstring(locator::production_snitch_base::default_dc);
@@ -36,7 +36,7 @@ void set_endpoint_snitch(http_context& ctx, routes& r, sharded<locator::snitch_p
httpd::endpoint_snitch_info_json::get_rack.set(r, [&ctx](const_req req) {
auto& topology = ctx.shared_token_metadata.local().get()->get_topology();
auto ep = host_or_broadcast(req);
if (!topology.has_endpoint(ep, locator::topology::pending::yes)) {
if (!topology.has_endpoint(ep)) {
// Cannot return error here, nodetool status can race, request
// info about just-left node and not handle it nicely
return sstring(locator::production_snitch_base::default_rack);

View File

@@ -17,36 +17,42 @@ namespace fd = httpd::failure_detector_json;
void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
std::vector<fd::endpoint_state> res;
for (auto i : g.get_endpoint_states()) {
fd::endpoint_state val;
val.addrs = boost::lexical_cast<std::string>(i.first);
val.is_alive = i.second.is_alive();
val.generation = i.second.get_heart_beat_state().get_generation();
val.version = i.second.get_heart_beat_state().get_heart_beat_version();
val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
for (auto a : i.second.get_application_state_map()) {
fd::version_value version_val;
// We return the enum index and not its name, to stay compatible with Origin:
// the state indexes are static but the names can be changed.
version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
version_val.value = a.second.value;
version_val.version = a.second.version;
val.application_state.push(version_val);
return g.container().invoke_on(0, [] (gms::gossiper& g) {
std::vector<fd::endpoint_state> res;
for (auto i : g.get_endpoint_states()) {
fd::endpoint_state val;
val.addrs = boost::lexical_cast<std::string>(i.first);
val.is_alive = i.second.is_alive();
val.generation = i.second.get_heart_beat_state().get_generation();
val.version = i.second.get_heart_beat_state().get_heart_beat_version();
val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
for (auto a : i.second.get_application_state_map()) {
fd::version_value version_val;
// We return the enum index and not its name, to stay compatible with Origin:
// the state indexes are static but the names can be changed.
version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
version_val.value = a.second.value;
version_val.version = a.second.version;
val.application_state.push(version_val);
}
res.push_back(val);
}
res.push_back(val);
}
return make_ready_future<json::json_return_type>(res);
return make_ready_future<json::json_return_type>(res);
});
});
fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
int res = g.get_up_endpoint_count();
return make_ready_future<json::json_return_type>(res);
return g.container().invoke_on(0, [] (gms::gossiper& g) {
int res = g.get_up_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
});
fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
int res = g.get_down_endpoint_count();
return make_ready_future<json::json_return_type>(res);
return g.container().invoke_on(0, [] (gms::gossiper& g) {
int res = g.get_down_endpoint_count();
return make_ready_future<json::json_return_type>(res);
});
});
fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -54,11 +60,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
});
fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
std::map<sstring, sstring> nodes_status;
for (auto& entry : g.get_endpoint_states()) {
nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
}
return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
return g.container().invoke_on(0, [] (gms::gossiper& g) {
std::map<sstring, sstring> nodes_status;
for (auto& entry : g.get_endpoint_states()) {
nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
}
return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
});
});
fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -67,13 +75,15 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
});
fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
if (!state) {
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
}
std::stringstream ss;
g.append_endpoint_state(ss, *state);
return make_ready_future<json::json_return_type>(sstring(ss.str()));
return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
if (!state) {
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
}
std::stringstream ss;
g.append_endpoint_state(ss, *state);
return make_ready_future<json::json_return_type>(sstring(ss.str()));
});
});
fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {

View File

@@ -6,6 +6,8 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include "gossiper.hh"
#include "api/api-doc/gossiper.json.hh"
#include "gms/gossiper.hh"
@@ -14,19 +16,23 @@ namespace api {
using namespace json;
void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
auto res = g.get_unreachable_members();
return container_to_vec(res);
httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
auto res = co_await g.get_unreachable_members_synchronized();
co_return json::json_return_type(container_to_vec(res));
});
httpd::gossiper_json::get_live_endpoint.set(r, [&g] (const_req req) {
auto res = g.get_live_members();
return container_to_vec(res);
httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) {
return g.get_live_members_synchronized().then([] (auto res) {
return make_ready_future<json::json_return_type>(container_to_vec(res));
});
});
httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
gms::inet_address ep(req.param["addr"]);
return g.get_endpoint_downtime(ep);
httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
gms::inet_address ep(req->param["addr"]);
// synchronize unreachable_members on all shards
co_await g.get_unreachable_members_synchronized();
co_return g.get_endpoint_downtime(ep);
});
httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<request> req) {

api/raft.cc Normal file
View File

@@ -0,0 +1,70 @@
/*
* Copyright (C) 2024-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include "api/api.hh"
#include "api/api-doc/raft.json.hh"
#include "service/raft/raft_group_registry.hh"
using namespace seastar::httpd;
extern logging::logger apilog;
namespace api {
namespace r = httpd::raft_json;
using namespace json;
void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) {
r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
raft::group_id gid{utils::UUID{req->param["group_id"]}};
auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] {
if (timeout_str.empty()) {
return std::chrono::seconds{60};
}
auto dur = std::stoll(timeout_str);
if (dur <= 0) {
throw std::runtime_error{"Timeout must be a positive number."};
}
return std::chrono::seconds{dur};
});
std::atomic<bool> found_srv{false};
co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& raft_gr) -> future<> {
auto* srv = raft_gr.find_server(gid);
if (!srv) {
co_return;
}
found_srv = true;
abort_on_expiry aoe(lowres_clock::now() + timeout_dur);
apilog.info("Triggering Raft group {} snapshot", gid);
auto result = co_await srv->trigger_snapshot(&aoe.abort_source());
if (result) {
apilog.info("New snapshot for Raft group {} created", gid);
} else {
apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
}
});
if (!found_srv) {
throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)};
}
co_return json_void{};
});
}
void unset_raft(http_context&, httpd::routes& r) {
r::trigger_snapshot.unset(r);
}
}

api/raft.hh Normal file
View File

@@ -0,0 +1,18 @@
/*
* Copyright (C) 2023-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include "api_init.hh"
namespace api {
void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr);
void unset_raft(http_context& ctx, httpd::routes& r);
}


@@ -49,6 +49,14 @@
extern logging::logger apilog;
namespace std {
std::ostream& operator<<(std::ostream& os, const api::table_info& ti) {
return os << "table{name=" << ti.name << ", id=" << ti.id << "}";
}
} // namespace std
namespace api {
const locator::token_metadata& http_context::get_token_metadata() {
@@ -100,6 +108,55 @@ std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, con
return parse_tables(ks_name, ctx, it->second);
}
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, sstring value) {
std::vector<table_info> res;
try {
if (value.empty()) {
const auto& cf_meta_data = ctx.db.local().find_keyspace(ks_name).metadata().get()->cf_meta_data();
res.reserve(cf_meta_data.size());
for (const auto& [name, schema] : cf_meta_data) {
res.emplace_back(table_info{name, schema->id()});
}
} else {
std::vector<sstring> names = split(value, ",");
res.reserve(names.size());
const auto& db = ctx.db.local();
for (const auto& table_name : names) {
res.emplace_back(table_info{table_name, db.find_uuid(ks_name, table_name)});
}
}
} catch (const replica::no_such_keyspace& e) {
throw bad_param_exception(e.what());
} catch (const replica::no_such_column_family& e) {
throw bad_param_exception(e.what());
}
return res;
}
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name) {
auto it = query_params.find(param_name);
return parse_table_infos(ks_name, ctx, it != query_params.end() ? it->second : "");
}
// Run on all tables, skipping dropped tables
future<> run_on_existing_tables(sstring op, replica::database& db, std::string_view keyspace, const std::vector<table_info> local_tables, std::function<future<> (replica::table&)> func) {
std::exception_ptr ex;
for (const auto& ti : local_tables) {
apilog.debug("Starting {} on {}.{}", op, keyspace, ti);
try {
co_await func(db.find_column_family(ti.id));
} catch (const replica::no_such_column_family& e) {
apilog.warn("Skipping {} of {}.{}: {}", op, keyspace, ti, e.what());
} catch (...) {
ex = std::current_exception();
apilog.error("Failed {} of {}.{}: {}", op, keyspace, ti, ex);
}
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
}
}
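The skip-dropped-tables loop in `run_on_existing_tables` can be sketched without Seastar. Below, `no_such_table` and `find_table` are hypothetical stand-ins for `replica::no_such_column_family` and `db.find_column_family`: a dropped table is skipped, while any other failure propagates to the caller, mirroring the error handling above:

```cpp
#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in for replica::no_such_column_family.
struct no_such_table : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Stand-in for db.find_column_family(): throws if the table was dropped.
int find_table(const std::map<std::string, int>& tables, const std::string& name) {
    auto it = tables.find(name);
    if (it == tables.end()) {
        throw no_such_table("no such table: " + name);
    }
    return it->second;
}

// Apply `op` to each named table, skipping tables that disappeared mid-loop
// (log-and-skip in the real code); returns how many were skipped. Any other
// exception from `op` propagates immediately, like the rethrow in the loop.
int run_on_existing(const std::map<std::string, int>& tables,
                    const std::vector<std::string>& names,
                    const std::function<void(int)>& op) {
    int skipped = 0;
    for (const auto& name : names) {
        try {
            op(find_table(tables, name));
        } catch (const no_such_table&) {
            ++skipped;  // table was dropped between listing and use
        }
    }
    return skipped;
}
```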
static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
ss::token_range r;
r.start_token = d._start_token;
@@ -118,16 +175,13 @@ static ss::token_range token_range_endpoints_to_json(const dht::token_range_endp
return r;
}
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<table_info>)>;
static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
return f(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
};
}
@@ -189,17 +243,21 @@ future<json::json_return_type> set_tables_autocompaction(http_context& ctx, cons
}
void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
ss::start_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
return smp::submit_to(0, [&] {
return ctl.start_server();
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.start_server();
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
return smp::submit_to(0, [&] {
return ctl.request_stop_server();
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
return ctl.request_stop_server();
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -221,17 +279,21 @@ void unset_transport_controller(http_context& ctx, routes& r) {
}
void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
return smp::submit_to(0, [&] {
return ctl.request_stop_server();
ss::stop_rpc_server.set(r, [&ctx, &ctl] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx, &ctl] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
return ctl.request_stop_server();
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
return smp::submit_to(0, [&] {
return ctl.start_server();
ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx, &ctl] {
return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
return ctl.start_server();
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -609,93 +671,114 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
});
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<table_id>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
return db.find_uuid(keyspace, cf_name);
}));
// major compact smaller tables first, to increase chances of success if low on space.
std::ranges::sort(table_ids, std::less<>(), [&] (const table_id& id) {
return db.find_column_family(id).get_stats().live_disk_space_used;
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, table_infos);
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto local_tables = table_infos;
// major compact smaller tables first, to increase chances of success if low on space.
std::ranges::sort(local_tables, std::less<>(), [&] (const table_info& ti) {
try {
return db.find_column_family(ti.id).get_stats().live_disk_space_used;
} catch (const replica::no_such_column_family& e) {
return int64_t(-1);
}
});
co_await run_on_existing_tables("force_keyspace_compaction", db, keyspace, local_tables, [] (replica::table& t) {
return t.compact_all_sstables();
});
});
// as a table can be dropped during loop below, let's find it before issuing major compaction request.
for (auto& id : table_ids) {
co_await db.find_column_family(id).compact_all_sstables();
}
co_return;
}).then([]{
return make_ready_future<json::json_return_type>(json_void());
});
} catch (...) {
apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json_void();
});
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<request> req) {
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
if (!is_cleanup_allowed) {
return make_exception_future<json::json_return_type>(
std::runtime_error("Can not perform cleanup operation when topology changes"));
}
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<table_id>>(column_families | boost::adaptors::transformed([&] (auto& table_name) {
return db.find_uuid(keyspace, table_name);
}));
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto local_tables = table_infos;
// cleanup smaller tables first, to increase chances of success if low on space.
std::ranges::sort(table_ids, std::less<>(), [&] (const table_id& id) {
return db.find_column_family(id).get_stats().live_disk_space_used;
std::ranges::sort(local_tables, std::less<>(), [&] (const table_info& ti) {
try {
return db.find_column_family(ti.id).get_stats().live_disk_space_used;
} catch (const replica::no_such_column_family& e) {
return int64_t(-1);
}
});
auto& cm = db.get_compaction_manager();
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
// as a table can be dropped during loop below, let's find it before issuing the cleanup request.
for (auto& id : table_ids) {
replica::table& t = db.find_column_family(id);
co_await cm.perform_cleanup(owned_ranges_ptr, t.as_table_state());
}
co_return;
}).then([]{
return make_ready_future<json::json_return_type>(0);
co_await run_on_existing_tables("force_keyspace_cleanup", db, keyspace, local_tables, [&] (replica::table& t) {
return t.perform_cleanup_compaction(owned_ranges_ptr);
});
});
});
} catch (...) {
apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(0);
});
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
bool needed = false;
for (const auto& table : tables) {
auto& t = db.find_column_family(keyspace, table);
needed |= co_await t.perform_offstrategy_compaction();
}
co_return needed;
}, false, std::plus<bool>());
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
bool res = false;
try {
res = co_await ctx.db.map_reduce0([&] (replica::database& db) -> future<bool> {
bool needed = false;
co_await run_on_existing_tables("perform_keyspace_offstrategy_compaction", db, keyspace, table_infos, [&needed] (replica::table& t) -> future<> {
needed |= co_await t.perform_offstrategy_compaction();
});
co_return needed;
}, false, std::plus<bool>());
} catch (...) {
apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(res);
}));
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
return ctx.db.invoke_on_all([=] (replica::database& db) {
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
return do_for_each(column_families, [=, &db](sstring cfname) {
auto& cm = db.get_compaction_manager();
auto& cf = db.find_column_family(keyspace, cfname);
return cm.perform_sstable_upgrade(owned_ranges_ptr, cf.as_table_state(), exclude_current_version);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
co_await run_on_existing_tables("upgrade_sstables", db, keyspace, table_infos, [&] (replica::table& t) {
return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, exclude_current_version);
});
});
});
}).then([]{
return make_ready_future<json::json_return_type>(0);
});
} catch (...) {
apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(0);
}));
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
auto& db = ctx.db;
if (column_families.empty()) {
co_await replica::database::flush_keyspace_on_all_shards(db, keyspace);
@@ -707,6 +790,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("decommission");
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -722,6 +806,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
auto host_id = validate_host_id(req->get_query_param("host_id"));
std::vector<sstring> ignore_nodes_strs = split(req->get_query_param("ignore_nodes"), ",");
apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
for (std::string n : ignore_nodes_strs) {
try {
@@ -797,6 +882,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("drain");
return ss.local().drain().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -820,12 +906,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("stop_gossiping");
return ss.local().stop_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("start_gossiping");
return ss.local().start_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -928,6 +1016,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
auto source_dc = req->get_query_param("source_dc");
apilog.info("rebuild: source_dc={}", source_dc);
return ss.local().rebuild(std::move(source_dc)).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -960,17 +1049,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return make_ready_future<json::json_return_type>(res);
});
ss::reset_local_schema.set(r, [&sys_ks](std::unique_ptr<request> req) {
ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
// FIXME: We should truncate schema tables if more than one node in the cluster.
auto& sp = service::get_storage_proxy();
auto& fs = sp.local().features();
return db::schema_tables::recalculate_schema_version(sys_ks, sp, fs).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
apilog.info("reset_local_schema");
co_await ss.local().reload_schema();
co_return json_void();
});
ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
auto probability = req->get_query_param("probability");
apilog.info("set_trace_probability: probability={}", probability);
return futurize_invoke([probability] {
double real_prob = std::stod(probability.c_str());
return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -1008,6 +1096,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto ttl = req->get_query_param("ttl");
auto threshold = req->get_query_param("threshold");
auto fast = req->get_query_param("fast");
apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
try {
return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
if (threshold != "") {
@@ -1034,6 +1123,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, keyspace, tables, true);
});
@@ -1041,6 +1131,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, keyspace, tables, false);
});
@@ -1366,7 +1457,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
});
ss::scrub.set(r, [&ctx, &snap_ctl] (std::unique_ptr<request> req) {
ss::scrub.set(r, [&ctx, &snap_ctl] (std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto rp = req_params({
{"keyspace", {mandatory::yes}},
{"cf", {""}},
@@ -1402,10 +1494,9 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
}
}
auto f = make_ready_future<>();
if (!req_param<bool>(*req, "disable_snapshot", false)) {
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
co_await coroutine::parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
// We always pass here db::snapshot_ctl::snap_views::no since:
// 1. When scrubbing particular tables, there's no need to auto-snapshot their views.
// 2. When scrubbing the whole keyspace, column_families will contain both base tables and views.
@@ -1434,28 +1525,30 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
return stats;
};
return f.then([&ctx, keyspace, column_families, opts, &reduce_compaction_stats] {
return ctx.db.map_reduce0([=] (replica::database& db) {
return map_reduce(column_families, [=, &db] (sstring cfname) {
try {
auto opt_stats = co_await db.map_reduce0([&] (replica::database& db) {
return map_reduce(column_families, [&] (sstring cfname) -> future<std::optional<sstables::compaction_stats>> {
auto& cm = db.get_compaction_manager();
auto& cf = db.find_column_family(keyspace, cfname);
return cm.perform_sstable_scrub(cf.as_table_state(), opts);
sstables::compaction_stats stats{};
co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
auto r = co_await cm.perform_sstable_scrub(ts, opts);
stats += r.value_or(sstables::compaction_stats{});
});
co_return stats;
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
}).then_wrapped([] (auto f) {
if (f.failed()) {
auto ex = f.get_exception();
if (try_catch<sstables::compaction_aborted_exception>(ex)) {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::aborted));
} else {
return make_exception_future<json::json_return_type>(std::move(ex));
}
} else if (f.get()->validation_errors) {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::validation_errors));
} else {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::successful));
if (opt_stats && opt_stats->validation_errors) {
co_return json::json_return_type(static_cast<int>(scrub_status::validation_errors));
}
});
} catch (const sstables::compaction_aborted_exception&) {
co_return json::json_return_type(static_cast<int>(scrub_status::aborted));
} catch (...) {
apilog.error("scrub keyspace={} tables={} failed: {}", keyspace, column_families, std::current_exception());
throw;
}
co_return json::json_return_type(static_cast<int>(scrub_status::successful));
});
}
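The scrub handler above aggregates per-table-state results where each result is a `std::optional` stats object, with absent results counted as zero (`r.value_or(sstables::compaction_stats{})`). A minimal sketch of that reduction, using a hypothetical `scrub_stats` type in place of `sstables::compaction_stats`:

```cpp
#include <optional>
#include <vector>

// Hypothetical miniature of sstables::compaction_stats with an additive merge.
struct scrub_stats {
    long validation_errors = 0;
    scrub_stats& operator+=(const scrub_stats& o) {
        validation_errors += o.validation_errors;
        return *this;
    }
};

// Reduce per-table optional results; absent stats count as zero, mirroring
// r.value_or(...) in the handler.
scrub_stats reduce_stats(const std::vector<std::optional<scrub_stats>>& per_table) {
    scrub_stats total{};
    for (const auto& s : per_table) {
        total += s.value_or(scrub_stats{});
    }
    return total;
}
```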


@@ -8,6 +8,8 @@
#pragma once
#include <iostream>
#include <seastar/core/sharded.hh>
#include "api.hh"
#include "db/data_listeners.hh"
@@ -41,8 +43,22 @@ sstring validate_keyspace(http_context& ctx, const parameters& param);
// Splits a request parameter assumed to hold a comma-separated list of table names.
// Verifies that the tables exist; otherwise a bad_param_exception is thrown
// containing the description of the respective no_such_column_family error.
// Returns an empty vector if no parameter was found.
// If the parameter is found and empty, returns a list of all table names in the keyspace.
std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
struct table_info {
sstring name;
table_id id;
};
// Splits a request parameter assumed to hold a comma-separated list of table names.
// Verifies that the tables exist; otherwise a bad_param_exception is thrown
// containing the description of the respective no_such_column_family error.
// Returns a vector of all table infos given by the parameter, or
// if the parameter is not found or is empty, returns a list of all table infos in the keyspace.
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ls);
void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>& sst_loader);
void unset_sstables_loader(http_context& ctx, routes& r);
@@ -58,4 +74,10 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
void unset_snapshot(http_context& ctx, routes& r);
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);
}
} // namespace api
namespace std {
std::ostream& operator<<(std::ostream& os, const api::table_info& ti);
} // namespace std


@@ -30,17 +30,32 @@ inline bool filter_tasks(tasks::task_manager::task_ptr task, std::unordered_map<
struct full_task_status {
tasks::task_manager::task::status task_status;
std::string type;
tasks::task_manager::task::progress progress;
std::string module;
tasks::task_id parent_id;
tasks::is_abortable abortable;
std::vector<std::string> children_ids;
};
struct task_stats {
task_stats(tasks::task_manager::task_ptr task) : task_id(task->id().to_sstring()), state(task->get_status().state) {}
task_stats(tasks::task_manager::task_ptr task)
: task_id(task->id().to_sstring())
, state(task->get_status().state)
, type(task->type())
, keyspace(task->get_status().keyspace)
, table(task->get_status().table)
, entity(task->get_status().entity)
, sequence_number(task->get_status().sequence_number)
{ }
sstring task_id;
tasks::task_manager::task_state state;
std::string type;
std::string keyspace;
std::string table;
std::string entity;
uint64_t sequence_number;
};
tm::task_status make_status(full_task_status status) {
@@ -52,7 +67,7 @@ tm::task_status make_status(full_task_status status) {
tm::task_status res{};
res.id = status.task_status.id.to_sstring();
res.type = status.task_status.type;
res.type = status.type;
res.state = status.task_status.state;
res.is_abortable = bool(status.abortable);
res.start_time = st;
@@ -67,22 +82,29 @@ tm::task_status make_status(full_task_status status) {
res.progress_units = status.task_status.progress_units;
res.progress_total = status.progress.total;
res.progress_completed = status.progress.completed;
res.children_ids = std::move(status.children_ids);
return res;
}
future<json::json_return_type> retrieve_status(tasks::task_manager::foreign_task_ptr task) {
future<full_task_status> retrieve_status(const tasks::task_manager::foreign_task_ptr& task) {
if (task.get() == nullptr) {
co_return coroutine::return_exception(httpd::bad_param_exception("Task not found"));
}
auto progress = co_await task->get_progress();
full_task_status s;
s.task_status = task->get_status();
s.type = task->type();
s.parent_id = task->get_parent_id();
s.abortable = task->is_abortable();
s.module = task->get_module_name();
s.progress.completed = progress.completed;
s.progress.total = progress.total;
co_return make_status(s);
std::vector<std::string> ct{task->get_children().size()};
boost::transform(task->get_children(), ct.begin(), [] (const auto& child) {
return child->id().to_sstring();
});
s.children_ids = std::move(ct);
co_return s;
}
void set_task_manager(http_context& ctx, routes& r) {
@@ -134,7 +156,8 @@ void set_task_manager(http_context& ctx, routes& r) {
}
co_return std::move(task);
}));
co_return co_await retrieve_status(std::move(task));
auto s = co_await retrieve_status(task);
co_return make_status(s);
});
tm::abort_task.set(r, [&ctx] (std::unique_ptr<request> req) -> future<json::json_return_type> {
@@ -153,11 +176,55 @@ void set_task_manager(http_context& ctx, routes& r) {
auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
return task->done().then_wrapped([task] (auto f) {
task->unregister_task();
f.get();
// done() is called only because we want the task to be complete before getting its status.
// The future should be ignored here as the result does not matter.
f.ignore_ready_future();
return make_foreign(task);
});
}));
co_return co_await retrieve_status(std::move(task));
auto s = co_await retrieve_status(task);
co_return make_status(s);
});
tm::get_task_status_recursively.set(r, [&ctx] (std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& _ctx = ctx;
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
std::queue<tasks::task_manager::foreign_task_ptr> q;
utils::chunked_vector<full_task_status> res;
// Get requested task.
auto task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
auto state = task->get_status().state;
if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
task->unregister_task();
}
co_return task;
}));
// Push children's statuses in BFS order.
q.push(co_await task.copy()); // Task cannot be moved since we need it to be alive during whole loop execution.
while (!q.empty()) {
auto& current = q.front();
res.push_back(co_await retrieve_status(current));
for (auto& child: current->get_children()) {
q.push(co_await child.copy());
}
q.pop();
}
std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
auto s = std::move(os);
auto res = std::move(r);
co_await s.write("[");
std::string delim = "";
for (auto& status: res) {
co_await s.write(std::exchange(delim, ", "));
co_await formatter::write(s, make_status(status));
}
co_await s.write("]");
co_await s.close();
};
co_return f;
});
}
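The recursive status endpoint above walks the task tree breadth-first: the requested task's status comes first, then its children level by level. A minimal sketch of that traversal order, with `task_node` as a hypothetical stand-in for the cross-shard `foreign_task_ptr`:

```cpp
#include <queue>
#include <string>
#include <vector>

// Hypothetical in-memory task node; the real code copies foreign_task_ptr
// objects across shards, but the visit order is plain BFS as shown here.
struct task_node {
    std::string id;
    std::vector<task_node> children;
};

// Collect ids in BFS order, matching the queue-driven loop in
// get_task_status_recursively.
std::vector<std::string> bfs_ids(const task_node& root) {
    std::vector<std::string> out;
    std::queue<const task_node*> q;
    q.push(&root);
    while (!q.empty()) {
        const task_node* cur = q.front();
        q.pop();
        out.push_back(cur->id);
        for (const auto& child : cur->children) {
            q.push(&child);
        }
    }
    return out;
}
```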


@@ -47,8 +47,6 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
std::string keyspace = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("table");
std::string table = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("type");
std::string type = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("entity");
std::string entity = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("parent_id");
@@ -60,7 +58,7 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
}
auto module = tms.local().find_module("test");
id = co_await module->make_task<tasks::test_task_impl>(shard, id, keyspace, table, type, entity, data);
id = co_await module->make_task<tasks::test_task_impl>(shard, id, keyspace, table, entity, data);
co_await tms.invoke_on(shard, [id] (tasks::task_manager& tm) {
auto it = tm.get_all_tasks().find(id);
if (it != tm.get_all_tasks().end()) {
@@ -99,7 +97,7 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
tmt::get_and_update_ttl.set(r, [&ctx, &cfg] (std::unique_ptr<request> req) -> future<json::json_return_type> {
uint32_t ttl = cfg.task_ttl_seconds();
cfg.task_ttl_seconds.set(boost::lexical_cast<uint32_t>(req->query_parameters["ttl"]));
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
co_return json::json_return_type(ttl);
});
}


@@ -66,36 +66,48 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
set_view(_data);
}
// Based on:
// - org.apache.cassandra.db.AbstractCell#reconcile()
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
// Based on Cassandra's resolveRegular function:
// - https://github.com/apache/cassandra/blob/e4f31b73c21b04966269c5ac2d3bd2562e5f6c63/src/java/org/apache/cassandra/db/rows/Cells.java#L79-L119
//
// Note: the ordering algorithm for cell is the same as for rows,
// except that the cell value is used to break a tie in case all other attributes are equal.
// See compare_row_marker_for_merge.
std::strong_ordering
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
// Largest write timestamp wins.
if (left.timestamp() != right.timestamp()) {
return left.timestamp() <=> right.timestamp();
}
// Tombstones always win reconciliation with live cells of the same timestamp
if (left.is_live() != right.is_live()) {
return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
}
if (left.is_live()) {
auto c = compare_unsigned(left.value(), right.value()) <=> 0;
if (c != 0) {
return c;
}
// Prefer expiring cells (which will become tombstones at some future date) over live cells.
// See https://issues.apache.org/jira/browse/CASSANDRA-14592
if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
// prefer expiring cells.
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
}
// If both are expiring, choose the cell with the latest expiry or derived write time.
if (left.is_live_and_has_ttl()) {
// Prefer cell with latest expiry
if (left.expiry() != right.expiry()) {
return left.expiry() <=> right.expiry();
} else {
// prefer the cell that was written later,
// so it survives longer after it expires, until purged.
} else if (right.ttl() != left.ttl()) {
// The cell write time is derived by (expiry - ttl).
// Prefer the cell that was written later,
// so it survives longer after it expires, until purged,
// as it becomes purgeable gc_grace_seconds after it was written.
//
// Note that this is an extension to Cassandra's algorithm
// which stops at the expiration time, and if equal,
// move forward to compare the cell values.
return right.ttl() <=> left.ttl();
}
}
// The cell with the largest value wins, if all other attributes of the cells are identical.
// This is quite arbitrary, but still required to break the tie in a deterministic way.
return compare_unsigned(left.value(), right.value());
} else {
// Both are deleted


@@ -229,6 +229,8 @@ future<authenticated_user> password_authenticator::authenticate(
std::throw_with_nested(exceptions::authentication_exception(e.what()));
} catch (exceptions::authentication_exception& e) {
std::throw_with_nested(e);
} catch (exceptions::unavailable_exception& e) {
std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
} catch (...) {
std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
}

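The `password_authenticator` hunk above wraps an `unavailable_exception` in an `authentication_exception` via `std::throw_with_nested`, preserving the original error as the nested cause. A minimal sketch of that pattern, using standard exception types in place of Scylla's (the function name and messages are illustrative only):

```cpp
#include <exception>
#include <stdexcept>
#include <string>

// Wrap a lower-level failure in a domain-specific exception, then recover
// both messages by rethrowing the nested cause.
std::string authenticate_and_describe_failure() {
    try {
        try {
            // Stand-in for an unavailable_exception from the auth query.
            throw std::runtime_error("service unavailable");
        } catch (const std::exception& e) {
            // Stand-in for exceptions::authentication_exception.
            std::throw_with_nested(std::invalid_argument(
                std::string("authentication failed: ") + e.what()));
        }
    } catch (const std::invalid_argument& e) {
        try {
            std::rethrow_if_nested(e); // rethrows the original cause, if any
        } catch (const std::runtime_error& cause) {
            return std::string(e.what()) + " (cause: " + cause.what() + ")";
        }
    }
    return "";
}
```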

@@ -55,6 +55,7 @@ future<bool> default_role_row_satisfies(
return qp.execute_internal(
query,
db::consistency_level::ONE,
internal_distributed_query_state(),
{meta::DEFAULT_SUPERUSER_NAME},
cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {


@@ -457,7 +457,9 @@ public:
_begin.ptr->size = _size;
_current = nullptr;
_size = 0;
return managed_bytes(std::exchange(_begin.ptr, {}));
auto begin_ptr = _begin.ptr;
_begin.ptr = nullptr;
return managed_bytes(begin_ptr);
} else {
return managed_bytes();
}

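The hunk above spells out by hand what `std::exchange` did in the original line: hand the old pointer to the caller and null out the member. The two forms are equivalent, as this minimal sketch (with a bare `int*` standing in for `_begin.ptr`) shows:

```cpp
#include <utility>

// Equivalent to take_manually below: returns the old value of p and
// resets p to nullptr in one expression.
int* take_with_exchange(int*& p) {
    return std::exchange(p, nullptr);
}

// The expanded form used by the backported code.
int* take_manually(int*& p) {
    auto old = p;
    p = nullptr;
    return old;
}
```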

@@ -572,7 +572,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
_read_context.cache().on_mispopulate();
return;
}
auto rt_opt = _rt_assembler.flush(*_schema, position_in_partition::after_key(cr.key()));
auto rt_opt = _rt_assembler.flush(*_schema, position_in_partition::after_key(*_schema, cr.key()));
clogger.trace("csm {}: populate({})", fmt::ptr(this), clustering_row::printer(*_schema, cr));
_lsa_manager.run_in_update_section_with_allocator([this, &cr, &rt_opt] {
mutation_partition& mp = _snp->version()->partition();
@@ -634,8 +634,8 @@ inline
void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", fmt::ptr(this), _next_row.position(), _next_row_in_range);
_next_row.touch();
position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
auto upper_bound = _next_row_in_range ? next_lower_bound : _upper_bound;
auto next_lower_bound = position_in_partition_view::after_key(table_schema(), _next_row.position());
auto upper_bound = _next_row_in_range ? next_lower_bound.view : _upper_bound;
if (_snp->range_tombstones(_lower_bound, upper_bound, [&] (range_tombstone rts) {
add_range_tombstone_to_buffer(std::move(rts));
return stop_iteration(_lower_bound_changed && is_buffer_full());
@@ -774,14 +774,14 @@ void cache_flat_mutation_reader::move_to_next_entry() {
}
}
void cache_flat_mutation_reader::flush_tombstones(position_in_partition_view pos, bool end_of_range) {
void cache_flat_mutation_reader::flush_tombstones(position_in_partition_view pos_, bool end_of_range) {
// Ensure position is appropriate for range tombstone bound
pos = position_in_partition_view::after_key(pos);
clogger.trace("csm {}: flush_tombstones({}) end_of_range: {}", fmt::ptr(this), pos, end_of_range);
_rt_gen.flush(pos, [this] (range_tombstone_change&& rtc) {
auto pos = position_in_partition_view::after_key(*_schema, pos_);
clogger.trace("csm {}: flush_tombstones({}) end_of_range: {}", fmt::ptr(this), pos.view, end_of_range);
_rt_gen.flush(pos.view, [this] (range_tombstone_change&& rtc) {
add_to_buffer(std::move(rtc), source::cache);
}, end_of_range);
if (auto rtc_opt = _rt_merger.flush(pos, end_of_range)) {
if (auto rtc_opt = _rt_merger.flush(pos.view, end_of_range)) {
do_add_to_buffer(std::move(*rtc_opt));
}
}
@@ -832,7 +832,7 @@ inline
void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment_v2&& mf) {
clogger.trace("csm {}: add_clustering_row_to_buffer({})", fmt::ptr(this), mutation_fragment_v2::printer(*_schema, mf));
auto& row = mf.as_clustering_row();
auto new_lower_bound = position_in_partition::after_key(row.key());
auto new_lower_bound = position_in_partition::after_key(*_schema, row.key());
push_mutation_fragment(std::move(mf));
_lower_bound = std::move(new_lower_bound);
_lower_bound_changed = true;


@@ -25,6 +25,7 @@
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "utils/UUID_gen.hh"
#include "utils/error_injection.hh"
#include "cdc/generation.hh"
#include "cdc/cdc_options.hh"
@@ -44,8 +45,16 @@ static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const
namespace cdc {
extern const api::timestamp_clock::duration generation_leeway =
std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
api::timestamp_clock::duration get_generation_leeway() {
static thread_local auto generation_leeway =
std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] {
generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5));
});
return generation_leeway;
}
static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
i = net::hton(i);
@@ -160,18 +169,18 @@ bool token_range_description::operator==(const token_range_description& o) const
&& sharding_ignore_msb == o.sharding_ignore_msb;
}
topology_description::topology_description(std::vector<token_range_description> entries)
topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
: _entries(std::move(entries)) {}
bool topology_description::operator==(const topology_description& o) const {
return _entries == o._entries;
}
const std::vector<token_range_description>& topology_description::entries() const& {
const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
return _entries;
}
std::vector<token_range_description>&& topology_description::entries() && {
utils::chunked_vector<token_range_description>&& topology_description::entries() && {
return std::move(_entries);
}
@@ -263,7 +272,7 @@ public:
topology_description generate() const {
const auto tokens = get_tokens();
std::vector<token_range_description> vnode_descriptions;
utils::chunked_vector<token_range_description> vnode_descriptions;
vnode_descriptions.reserve(tokens.size());
vnode_descriptions.push_back(
@@ -331,7 +340,7 @@ future<cdc::generation_id> generation_service::make_new_generation(const std::un
auto new_generation_timestamp = [add_delay, ring_delay = _cfg.ring_delay] {
auto ts = db_clock::now();
if (add_delay && ring_delay != 0ms) {
ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
}
return ts;
};


@@ -46,6 +46,8 @@ namespace gms {
namespace cdc {
api::timestamp_clock::duration get_generation_leeway();
class stream_id final {
bytes _value;
public:
@@ -94,13 +96,13 @@ struct token_range_description {
* in the `_entries` vector. See the comment above `token_range_description` for explanation.
*/
class topology_description {
std::vector<token_range_description> _entries;
utils::chunked_vector<token_range_description> _entries;
public:
topology_description(std::vector<token_range_description> entries);
topology_description(utils::chunked_vector<token_range_description> entries);
bool operator==(const topology_description&) const;
const std::vector<token_range_description>& entries() const&;
std::vector<token_range_description>&& entries() &&;
const utils::chunked_vector<token_range_description>& entries() const&;
utils::chunked_vector<token_range_description>&& entries() &&;
};
/**


@@ -605,7 +605,7 @@ private:
public:
collection_iterator(managed_bytes_view_opt v = {})
: _v(v.value_or(managed_bytes_view{}))
, _rem(_v.empty() ? 0 : read_collection_size(_v, cql_serialization_format::internal()))
, _rem(_v.empty() ? 0 : read_collection_size(_v))
{
if (_rem != 0) {
parse();
@@ -650,8 +650,8 @@ template<>
void collection_iterator<std::pair<managed_bytes_view, managed_bytes_view>>::parse() {
assert(_rem > 0);
_next = _v;
auto k = read_collection_value(_next, cql_serialization_format::internal());
auto v = read_collection_value(_next, cql_serialization_format::internal());
auto k = read_collection_value(_next);
auto v = read_collection_value(_next);
_current = std::make_pair(k, v);
}
@@ -659,7 +659,7 @@ template<>
void collection_iterator<managed_bytes_view>::parse() {
assert(_rem > 0);
_next = _v;
auto k = read_collection_value(_next, cql_serialization_format::internal());
auto k = read_collection_value(_next);
_current = k;
}
@@ -728,7 +728,7 @@ auto make_maybe_back_inserter(Container& c, const abstract_type& type, collectio
static size_t collection_size(const managed_bytes_opt& bo) {
if (bo) {
managed_bytes_view mbv(*bo);
return read_collection_size(mbv, cql_serialization_format::internal());
return read_collection_size(mbv);
}
return 0;
}
@@ -750,7 +750,7 @@ static managed_bytes merge(const collection_type_impl& ctype, const managed_byte
// note order: set_union, when finding doubles, use value from first1 (j here). So
// since this is next, it has prio
std::set_union(j, e, i, e, make_maybe_back_inserter(res, *type, collection_iterator<managed_bytes_view>(deleted)), cmp);
return map_type_impl::serialize_partially_deserialized_form_fragmented(res, cql_serialization_format::internal());
return map_type_impl::serialize_partially_deserialized_form_fragmented(res);
}
static managed_bytes merge(const set_type_impl& ctype, const managed_bytes_opt& prev, const managed_bytes_opt& next, const managed_bytes_opt& deleted) {
std::vector<managed_bytes_view> res;
@@ -761,7 +761,7 @@ static managed_bytes merge(const set_type_impl& ctype, const managed_bytes_opt&
};
collection_iterator<managed_bytes_view> e, i(prev), j(next), d(deleted);
std::set_union(j, e, i, e, make_maybe_back_inserter(res, *type, d), cmp);
return set_type_impl::serialize_partially_deserialized_form_fragmented(res, cql_serialization_format::internal());
return set_type_impl::serialize_partially_deserialized_form_fragmented(res);
}
static managed_bytes merge(const user_type_impl& type, const managed_bytes_opt& prev, const managed_bytes_opt& next, const managed_bytes_opt& deleted) {
std::vector<managed_bytes_view_opt> res(type.size());
@@ -812,15 +812,14 @@ static managed_bytes_opt get_preimage_col_value(const column_definition& cdef, c
// flatten set
[&] (const set_type_impl& type) {
auto v = pirow->get_view(cdef.name_as_text());
auto f = cql_serialization_format::internal();
auto n = read_collection_size(v, f);
auto n = read_collection_size(v);
std::vector<managed_bytes> tmp;
tmp.reserve(n);
while (n--) {
tmp.emplace_back(read_collection_value(v, f)); // key
read_collection_value(v, f); // value. ignore.
tmp.emplace_back(read_collection_value(v)); // key
read_collection_value(v); // value. ignore.
}
return set_type_impl::serialize_partially_deserialized_form_fragmented({tmp.begin(), tmp.end()}, f);
return set_type_impl::serialize_partially_deserialized_form_fragmented({tmp.begin(), tmp.end()});
},
[&] (const abstract_type& o) -> managed_bytes {
return pirow->get_blob_fragmented(cdef.name_as_text());
@@ -1122,7 +1121,7 @@ struct process_row_visitor {
visit_collection(v);
managed_bytes_opt added_keys = v._added_keys.empty() ? std::nullopt :
std::optional{set_type_impl::serialize_partially_deserialized_form_fragmented(v._added_keys, cql_serialization_format::internal())};
std::optional{set_type_impl::serialize_partially_deserialized_form_fragmented(v._added_keys)};
return {
v._is_column_delete,
@@ -1178,7 +1177,7 @@ struct process_row_visitor {
visit_collection(v);
managed_bytes_opt added_cells = v._added_cells.empty() ? std::nullopt :
std::optional{map_type_impl::serialize_partially_deserialized_form_fragmented(v._added_cells, cql_serialization_format::internal())};
std::optional{map_type_impl::serialize_partially_deserialized_form_fragmented(v._added_cells)};
return {
v._is_column_delete,
@@ -1198,7 +1197,7 @@ struct process_row_visitor {
// then we deserialize again when merging images below
managed_bytes_opt deleted_elements = std::nullopt;
if (!deleted_keys.empty()) {
deleted_elements = set_type_impl::serialize_partially_deserialized_form_fragmented(deleted_keys, cql_serialization_format::internal());
deleted_elements = set_type_impl::serialize_partially_deserialized_form_fragmented(deleted_keys);
}
// delta


@@ -15,10 +15,6 @@
extern logging::logger cdc_log;
namespace cdc {
extern const api::timestamp_clock::duration generation_leeway;
} // namespace cdc
static api::timestamp_type to_ts(db_clock::time_point tp) {
// This assumes that timestamp_clock and db_clock have the same epochs.
return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -40,7 +36,7 @@ static cdc::stream_id get_stream(
// non-static for testing
cdc::stream_id get_stream(
const std::vector<cdc::token_range_description>& entries,
const utils::chunked_vector<cdc::token_range_description>& entries,
dht::token tok) {
if (entries.empty()) {
on_internal_error(cdc_log, "get_stream: entries empty");
@@ -73,7 +69,7 @@ bool cdc::metadata::streams_available() const {
cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
auto now = api::new_timestamp();
if (ts > now + generation_leeway.count()) {
if (ts > now + get_generation_leeway().count()) {
throw exceptions::invalid_request_exception(format(
"cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
" With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -86,27 +82,43 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
// Nothing protects us from that until we start using transactions for generation switching.
}
auto it = gen_used_at(now);
if (it == _gens.end()) {
auto it = gen_used_at(now - get_generation_leeway().count());
if (it != _gens.end()) {
// Garbage-collect generations that will no longer be used.
it = _gens.erase(_gens.begin(), it);
}
if (ts <= now - get_generation_leeway().count()) {
// We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
// happens iff one of the following is true:
// - the write is to no generation,
// - the write is to a generation older than the generation under `it`,
// - the write is to the generation under `it` and that generation is not the current generation.
// Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
// but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
// are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
// the generation under `it` because that generation was operating at `now - generation_leeway`.
bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
if (it == _gens.end() || ts < it->first || is_previous_gen) {
throw exceptions::invalid_request_exception(format(
"cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
" With CDC you cannot send writes with timestamps too far into the past, because that would break"
" consistency properties.\n"
"We *do* allow sending writes into the near past, but our ability to do that is limited."
" Are you using client-side timestamps? Make sure your clocks are well-synchronized"
" with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
}
}
it = _gens.begin();
if (it == _gens.end() || ts < it->first) {
throw std::runtime_error(format(
"cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
" Are we in the middle of a cluster upgrade?", format_timestamp(now)));
"cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
" Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
}
// Garbage-collect generations that will no longer be used.
it = _gens.erase(_gens.begin(), it);
if (it->first > ts) {
throw exceptions::invalid_request_exception(format(
"cdc: attempted to get a stream from an earlier generation than the currently used one."
" With CDC you cannot send writes with timestamps too far into the past, because that would break"
" consistency properties (write timestamp: {}, current generation started at: {})",
format_timestamp(ts), format_timestamp(it->first)));
}
// With `generation_leeway` we allow sending writes to the near future. It might happen
// that `ts` doesn't belong to the current generation ("current" according to our clock),
// but to the next generation. Adjust for this case:
// Find the generation operating at `ts`.
{
auto next_it = std::next(it);
while (next_it != _gens.end() && next_it->first <= ts) {
@@ -147,8 +159,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
++it;
}
// Check if some new generation has already superseded this one.
return it != _gens.end() && it->first <= api::new_timestamp();
// Check if the generation is obsolete.
return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
}
bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -157,7 +169,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
}
auto now = api::new_timestamp();
auto it = gen_used_at(now);
auto it = gen_used_at(now - get_generation_leeway().count());
if (it != _gens.end()) {
// Garbage-collect generations that will no longer be used.


@@ -42,7 +42,9 @@ class metadata final {
container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
public:
/* Is a generation with the given timestamp already known or superseded by a newer generation? */
/* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
* it is older than the generation operating at `now - get_generation_leeway()`.
*/
bool known_or_obsolete(db_clock::time_point) const;
/* Are there streams available, i.e. valid for time == now? If this is false, any writes to
@@ -54,8 +56,9 @@ public:
*
* If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
* we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
* yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
* by the `cdc::generation_leeway` constant.
* yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
* the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
* `get_generation_leeway()`.
*/
stream_id get_stream(api::timestamp_type ts, dht::token tok);

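The acceptance window described in the comment above can be sketched as a plain predicate: writes are accepted only within `generation_leeway` of the server's clock, on either side. The function below is a hypothetical simplification; in Scylla the check lives inside `cdc::metadata::get_stream` and additionally allows older timestamps that still fall into the currently operating generation:

```cpp
#include <cstdint>

// Hypothetical sketch of the leeway window. A write timestamp outside
// (now - leeway, now + leeway] is rejected: too far into the future and the
// target generation may not be known yet; too far into the past and
// accepting it would break consistency properties.
bool timestamp_within_leeway(int64_t ts, int64_t now, int64_t leeway) {
    if (ts > now + leeway) {
        return false; // write "from the future"
    }
    if (ts <= now - leeway) {
        return false; // write "from the past"
    }
    return true;
}
```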

@@ -21,8 +21,6 @@ class row_tombstone;
class collection_mutation;
class cql_serialization_format;
// An auxiliary struct used to (de)construct collection_mutations.
// Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
// (tombstone and cells) inside the mutation easily.
@@ -131,4 +129,4 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);
// Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);


@@ -12,11 +12,11 @@
class schema;
class partition_key;
class clustering_row;
struct atomic_cell_view;
struct tombstone;
namespace db::view {
struct clustering_or_static_row;
struct view_key_and_action;
}
@@ -118,7 +118,7 @@ class collection_column_computation final : public column_computation {
using collection_kv = std::pair<bytes_view, atomic_cell_view>;
void operate_on_collection_entries(
std::invocable<collection_kv*, collection_kv*, tombstone> auto&& old_and_new_row_func, const schema& schema,
const partition_key& key, const clustering_row& update, const std::optional<clustering_row>& existing) const;
const partition_key& key, const db::view::clustering_or_static_row& update, const std::optional<db::view::clustering_or_static_row>& existing) const;
public:
static collection_column_computation for_keys(const bytes& collection_name) {
@@ -141,5 +141,6 @@ public:
return true;
}
std::vector<db::view::view_key_and_action> compute_values_with_action(const schema& schema, const partition_key& key, const clustering_row& row, const std::optional<clustering_row>& existing) const;
std::vector<db::view::view_key_and_action> compute_values_with_action(const schema& schema, const partition_key& key,
const db::view::clustering_or_static_row& row, const std::optional<db::view::clustering_or_static_row>& existing) const;
};


@@ -28,6 +28,7 @@
#include <seastar/util/closeable.hh>
#include <seastar/core/shared_ptr.hh>
#include "dht/i_partitioner.hh"
#include "sstables/sstables.hh"
#include "sstables/sstable_writer.hh"
#include "sstables/progress_monitor.hh"
@@ -41,6 +42,7 @@
#include "mutation_compactor.hh"
#include "leveled_manifest.hh"
#include "dht/token.hh"
#include "dht/partition_filter.hh"
#include "mutation_writer/shard_based_splitting_writer.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "mutation_source_metadata.hh"
@@ -166,7 +168,7 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
}
static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
auto timestamp = table_s.min_memtable_timestamp();
std::optional<utils::hashed_key> hk;
for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -177,6 +179,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
}
if (sst->filter_has_key(*hk)) {
bloom_filter_checks++;
timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
}
}
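The `get_max_purgeable_timestamp` hunk above threads a `bloom_filter_checks` counter through the purgeability computation. The underlying idea — a tombstone may only be purged if it is newer than no possible copy of the key elsewhere, so take the minimum `min_timestamp` over all SSTables whose bloom filter reports a possible hit — can be sketched independently. The struct and function below are simplified stand-ins, not Scylla's API:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified stand-in for an SSTable: the result of a bloom-filter
// membership test (which may yield false positives, never false negatives)
// plus the file's minimum write timestamp.
struct sstable_info {
    bool filter_may_contain_key;
    int64_t min_timestamp;
};

// Start from the memtables' minimum timestamp, then lower the bound for
// every SSTable that might still contain the key. The counter mirrors the
// bloom_filter_checks statistic added in the hunk above.
int64_t max_purgeable_timestamp(int64_t min_memtable_ts,
                                const std::vector<sstable_info>& ssts,
                                uint64_t& bloom_filter_checks) {
    int64_t ts = min_memtable_ts;
    for (const auto& sst : ssts) {
        if (sst.filter_may_contain_key) {
            ++bloom_filter_checks;
            ts = std::min(ts, sst.min_timestamp);
        }
    }
    return ts;
}
```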
@@ -220,13 +223,13 @@ public:
~compaction_write_monitor() {
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
_table_s.get_backlog_tracker().revert_charges(_sst);
}
}
virtual void on_write_started(const sstables::writer_offset_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_compaction_strategy().get_backlog_tracker().register_partially_written_sstable(_sst, *this);
_table_s.get_backlog_tracker().register_partially_written_sstable(_sst, *this);
}
virtual void on_data_write_completed() override {
@@ -351,7 +354,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
public:
virtual void on_read_started(const sstables::reader_position_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_compaction_strategy().get_backlog_tracker().register_compacting_sstable(_sst, *this);
_table_s.get_backlog_tracker().register_compacting_sstable(_sst, *this);
}
virtual void on_read_completed() override {
@@ -370,7 +373,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
void remove_sstable() {
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
_table_s.get_backlog_tracker().revert_charges(_sst);
}
_sst = {};
}
@@ -382,7 +385,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
// We failed to finish handling this SSTable, so we have to update the backlog_tracker
// about it.
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
_table_s.get_backlog_tracker().revert_charges(_sst);
}
}
@@ -412,9 +415,12 @@ private:
class formatted_sstables_list {
bool _include_origin = true;
std::vector<sstring> _ssts;
std::vector<std::string> _ssts;
public:
formatted_sstables_list() = default;
void reserve(size_t n) {
_ssts.reserve(n);
}
explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
_ssts.reserve(ssts.size());
for (const auto& sst : ssts) {
@@ -433,9 +439,7 @@ public:
};
std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
os << "[";
os << boost::algorithm::join(lst._ssts, ",");
os << "]";
fmt::print(os, "[{}]", fmt::join(lst._ssts, ","));
return os;
}
@@ -460,6 +464,8 @@ protected:
uint64_t _start_size = 0;
uint64_t _end_size = 0;
uint64_t _estimated_partitions = 0;
double _estimated_droppable_tombstone_ratio = 0;
uint64_t _bloom_filter_checks = 0;
db::replay_position _rp;
encoding_stats_collector _stats_collector;
bool _can_split_large_partition = false;
@@ -516,7 +522,7 @@ protected:
auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
}
void setup_new_sstable(shared_sstable& sst) {
@@ -571,14 +577,15 @@ protected:
return bool(_sstable_set);
}
compaction_writer create_gc_compaction_writer() const {
compaction_writer create_gc_compaction_writer(run_id gc_run) const {
auto sst = _sstable_creator(this_shard_id());
auto&& priority = _io_priority;
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
cfg.run_identifier = _run_identifier;
cfg.run_identifier = gc_run;
cfg.monitor = monitor.get();
uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
}
@@ -598,8 +605,14 @@ protected:
// When compaction finishes, all the temporary sstables generated here will be deleted and removed
// from table's sstable set.
compacted_fragments_writer get_gc_compacted_fragments_writer() {
// because the temporary sstable run can overlap with the non-gc sstables run created by
// get_compacted_fragments_writer(), we have to use a different run_id. the gc_run_id is
// created here as:
// 1. it can be shared across all sstables created by this writer
// 2. it is optional, as gc writer is not always used
auto gc_run = run_id::create_random_id();
return compacted_fragments_writer(*this,
[this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
[this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
[this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
_stop_request_observable);
}
@@ -616,8 +629,8 @@ protected:
return _used_garbage_collected_sstables;
}
bool enable_garbage_collected_sstable_writer() const noexcept {
return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
virtual bool enable_garbage_collected_sstable_writer() const noexcept {
return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
}
public:
compaction& operator=(const compaction&) = delete;
@@ -639,9 +652,11 @@ private:
future<> setup() {
auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
formatted_sstables_list formatted_msg;
formatted_msg.reserve(_sstables.size());
auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
min_max_tracker<api::timestamp_type> timestamp_tracker;
double sum_of_estimated_droppable_tombstone_ratio = 0;
_input_sstable_generations.reserve(_sstables.size());
for (auto& sst : _sstables) {
co_await coroutine::maybe_yield();
@@ -676,12 +691,16 @@ private:
// this is kind of ok, esp. since we will hopefully not be trying to recover based on
// compacted sstables anyway (CL should be clean by then).
_rp = std::max(_rp, sst_stats.position);
auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state());
sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
}
log_info("{} {}", report_start_desc(), formatted_msg);
if (ssts->all()->size() < _sstables.size()) {
log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
_sstables.size() - ssts->all()->size(), _sstables.size());
}
// _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
_estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->all()->size());
_compacting = std::move(ssts);
@@ -755,6 +774,7 @@ protected:
.ended_at = ended_at,
.start_size = _start_size,
.end_size = _end_size,
.bloom_filter_checks = _bloom_filter_checks,
},
};
@@ -774,7 +794,7 @@ protected:
log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
report_finish_desc(),
_input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_start_size, duration),
_cdata.total_partitions, _cdata.total_keys_written);
return ret;
@@ -795,7 +815,7 @@ private:
};
}
return [this] (const dht::decorated_key& dk) {
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
};
}
@@ -905,7 +925,7 @@ void compacted_fragments_writer::split_large_partition() {
// will result in current fragment storing an inclusive end bound for last pos, and the
// next fragment storing an exclusive start bound for last pos. This is very important
// for not losing information on the range tombstone.
auto after_last_pos = position_in_partition::after_key(_current_partition.last_pos.key());
auto after_last_pos = position_in_partition::after_key(*_c.schema(), _current_partition.last_pos.key());
if (_current_partition.current_emitted_tombstone) {
auto rtc = range_tombstone_change(after_last_pos, tombstone{});
_c.log_debug("Closing active tombstone {} with {} for partition {}", _current_partition.current_emitted_tombstone, rtc, *_current_partition.dk);
@@ -948,7 +968,7 @@ void compacted_fragments_writer::consume_new_partition(const dht::decorated_key&
.dk = dk,
.tombstone = tombstone(),
.current_emitted_tombstone = tombstone(),
.last_pos = position_in_partition(position_in_partition::partition_start_tag_t()),
.last_pos = position_in_partition::for_partition_start(),
.is_splitting_partition = false
};
do_consume_new_partition(dk);
@@ -995,51 +1015,6 @@ void compacted_fragments_writer::consume_end_of_stream() {
}
}
class reshape_compaction : public compaction {
public:
reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: compaction(table_s, std::move(descriptor), cdata) {
}
virtual sstables::sstable_set make_sstable_set_for_input() const override {
return sstables::make_partitioned_sstable_set(_schema, false);
}
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
default_read_monitor_generator());
}
std::string_view report_start_desc() const override {
return "Reshaping";
}
std::string_view report_finish_desc() const override {
return "Reshaped";
}
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (writer) {
finish_new_sstable(writer);
}
}
};
class regular_compaction : public compaction {
// keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
mutable compaction_read_monitor_generator _monitor_generator;
@@ -1149,12 +1124,13 @@ private:
}
void update_pending_ranges() {
if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
return;
}
// Releases references to sstables compacted by this compaction or another, both of which belong
// to the same column family
for (auto& pending_replacement : _cdata.pending_replacements) {
for (auto& pending_replacement : pending_replacements) {
for (auto& sst : pending_replacement.removed) {
// Set may not contain sstable to be removed because this compaction may have started
// before the creation of that sstable.
@@ -1168,35 +1144,76 @@ private:
}
}
_selector.emplace(_sstable_set->make_incremental_selector());
_cdata.pending_replacements.clear();
}
};
class reshape_compaction : public regular_compaction {
private:
bool has_sstable_replacer() const noexcept {
return bool(_replacer);
}
public:
reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: regular_compaction(table_s, std::move(descriptor), cdata) {
}
virtual sstables::sstable_set make_sstable_set_for_input() const override {
return sstables::make_partitioned_sstable_set(_schema, false);
}
// Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
}
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
default_read_monitor_generator());
}
std::string_view report_start_desc() const override {
return "Reshaping";
}
std::string_view report_finish_desc() const override {
return "Reshaped";
}
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (writer) {
if (has_sstable_replacer()) {
regular_compaction::stop_sstable_writer(writer);
} else {
finish_new_sstable(writer);
}
}
}
virtual void on_end_of_compaction() override {
if (has_sstable_replacer()) {
regular_compaction::on_end_of_compaction();
}
}
};
class cleanup_compaction final : public regular_compaction {
class incremental_owned_ranges_checker {
const dht::token_range_vector& _sorted_owned_ranges;
mutable dht::token_range_vector::const_iterator _it;
public:
incremental_owned_ranges_checker(const dht::token_range_vector& sorted_owned_ranges)
: _sorted_owned_ranges(sorted_owned_ranges)
, _it(_sorted_owned_ranges.begin()) {
}
// Must be called with increasing token values.
bool belongs_to_current_node(const dht::token& t) const {
// While token T is after a range Rn, advance the iterator.
// The iterator stops at a range that either overlaps with T (if T belongs to this node)
// or at a range that is after T (if T doesn't belong to this node).
while (_it != _sorted_owned_ranges.end() && _it->after(t, dht::token_comparator())) {
_it++;
}
return _it != _sorted_owned_ranges.end() && _it->contains(t, dht::token_comparator());
}
};
owned_ranges_ptr _owned_ranges;
incremental_owned_ranges_checker _owned_ranges_checker;
mutable dht::incremental_owned_ranges_checker _owned_ranges_checker;
private:
// Called in a seastar thread
dht::partition_range_vector
@@ -1209,21 +1226,8 @@ private:
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
}));
// optimize set of potentially overlapping ranges by deoverlapping them.
non_owned_ranges = dht::partition_range::deoverlap(std::move(non_owned_ranges), dht::ring_position_comparator(*_schema));
// subtract *each* owned range from the partition range of *each* sstable*,
// such that we'll be left only with a set of non-owned ranges.
for (auto& owned_range : owned_ranges) {
dht::partition_range_vector new_non_owned_ranges;
for (auto& non_owned_range : non_owned_ranges) {
auto ret = non_owned_range.subtract(owned_range, dht::ring_position_comparator(*_schema));
new_non_owned_ranges.insert(new_non_owned_ranges.end(), ret.begin(), ret.end());
seastar::thread::maybe_yield();
}
non_owned_ranges = std::move(new_non_owned_ranges);
}
return non_owned_ranges;
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
}
protected:
virtual compaction_completion_desc
@@ -1623,7 +1627,7 @@ private:
uint64_t partitions_per_sstable(shard_id s) const {
uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
}
public:
resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)


@@ -80,8 +80,10 @@ struct compaction_data {
}
void stop(sstring reason) {
stop_requested = std::move(reason);
abort.request_abort();
if (!abort.abort_requested()) {
stop_requested = std::move(reason);
abort.request_abort();
}
}
};
@@ -90,12 +92,15 @@ struct compaction_stats {
uint64_t start_size = 0;
uint64_t end_size = 0;
uint64_t validation_errors = 0;
// Bloom filter checks during max purgeable calculation
uint64_t bloom_filter_checks = 0;
compaction_stats& operator+=(const compaction_stats& r) {
ended_at = std::max(ended_at, r.ended_at);
start_size += r.start_size;
end_size += r.end_size;
validation_errors += r.validation_errors;
bloom_filter_checks += r.bloom_filter_checks;
return *this;
}
friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {


@@ -66,7 +66,8 @@ public:
};
compaction_backlog_tracker(std::unique_ptr<impl> impl) : _impl(std::move(impl)) {}
compaction_backlog_tracker(compaction_backlog_tracker&&) = default;
compaction_backlog_tracker(compaction_backlog_tracker&&);
compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) noexcept;
compaction_backlog_tracker(const compaction_backlog_tracker&) = delete;
~compaction_backlog_tracker();
@@ -74,7 +75,7 @@ public:
void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts);
void register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp);
void register_compacting_sstable(sstables::shared_sstable sst, backlog_read_progress_manager& rp);
void transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true);
void copy_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true) const;
void revert_charges(sstables::shared_sstable sst);
void disable() {


@@ -7,15 +7,19 @@
*/
#include "compaction_manager.hh"
#include "compaction_descriptor.hh"
#include "compaction_strategy.hh"
#include "compaction_backlog_manager.hh"
#include "sstables/sstables.hh"
#include "sstables/sstables_manager.hh"
#include <memory>
#include <seastar/core/metrics.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/switch_to.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "sstables/exceptions.hh"
#include "sstables/sstable_directory.hh"
#include "locator/abstract_replication_strategy.hh"
#include "utils/fb_utilities.hh"
#include "utils/UUID_gen.hh"
@@ -76,6 +80,23 @@ public:
_compacting.erase(sst);
}
}
class update_me : public compaction_manager::task::on_replacement {
compacting_sstable_registration& _registration;
public:
update_me(compacting_sstable_registration& registration)
: _registration{registration} {}
void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
_registration.release_compacting(sstables);
}
void on_addition(const std::vector<sstables::shared_sstable>& sstables) override {
_registration.register_compacting(sstables);
}
};
auto update_on_sstable_replacement() {
return update_me(*this);
}
};
sstables::compaction_data compaction_manager::create_compaction_data() {
@@ -277,7 +298,7 @@ compaction_manager::task::task(compaction_manager& mgr, compaction::table_state*
, _description(std::move(desc))
{}
future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task) {
future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task, throw_if_stopping do_throw_if_stopping) {
_tasks.push_back(task);
auto unregister_task = defer([this, task] {
_tasks.remove(task);
@@ -290,6 +311,9 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
co_return res;
} catch (sstables::compaction_stopped_exception& e) {
cmlog.info("{}: stopped, reason: {}", *task, e.what());
if (do_throw_if_stopping) {
throw;
}
} catch (sstables::compaction_aborted_exception& e) {
cmlog.error("{}: aborted, reason: {}", *task, e.what());
_stats.errors++;
@@ -308,14 +332,14 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
co_return std::nullopt;
}
future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge) {
if (!descriptor.sstables.size()) {
// if there is nothing to compact, just return.
co_return sstables::compaction_result{};
}
bool should_update_history = this->should_update_history(descriptor.options.type());
sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, std::move(release_exhausted), std::move(can_purge));
sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, on_replace, std::move(can_purge));
if (should_update_history) {
co_await update_history(*_compacting_table, res, cdata);
@@ -323,8 +347,11 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables_a
co_return res;
}
future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge,
sstables::offstrategy offstrategy) {
compaction::table_state& t = *_compacting_table;
if (can_purge) {
descriptor.enable_garbage_collection(t.main_sstable_set());
}
@@ -332,15 +359,26 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables(s
auto sst = t.make_sstable();
return sst;
};
descriptor.replacer = [this, &t, release_exhausted] (sstables::compaction_completion_desc desc) {
descriptor.replacer = [this, &t, &on_replace, offstrategy] (sstables::compaction_completion_desc desc) {
t.get_compaction_strategy().notify_completion(desc.old_sstables, desc.new_sstables);
_cm.propagate_replacement(t, desc.old_sstables, desc.new_sstables);
// on_replace updates the compacting registration with the old and new
// sstables. while on_compaction_completion() removes the old sstables
// from the table's sstable set, and adds the new ones to the sstable
// set.
// since the regular compactions exclude the sstables in the sstable
// set which are currently being compacted, if we want to ensure the
// exclusive access of compactions to an sstable we should guard it
// with the registration when adding/removing it to/from the sstable
// set. otherwise, the regular compaction would pick it up in the time
// window, where the sstables:
// - are still in the main set
// - are not being compacted.
on_replace.on_addition(desc.new_sstables);
auto old_sstables = desc.old_sstables;
t.on_compaction_completion(std::move(desc), sstables::offstrategy::no).get();
// Calls compaction manager's task for this compaction to release reference to exhausted SSTables.
if (release_exhausted) {
release_exhausted(old_sstables);
}
t.on_compaction_completion(std::move(desc), offstrategy).get();
on_replace.on_removal(old_sstables);
};
co_return co_await sstables::compact_sstables(std::move(descriptor), cdata, t);
@@ -385,9 +423,7 @@ protected:
sstables::compaction_strategy cs = t->get_compaction_strategy();
sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, _cm.get_candidates(*t));
auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
compacting.release_compacting(exhausted_sstables);
};
auto on_replace = compacting.update_on_sstable_replacement();
setup_new_compaction(descriptor.run_identifier);
cmlog.info0("User initiated compaction started on behalf of {}.{}", t->schema()->ks_name(), t->schema()->cf_name());
@@ -399,7 +435,7 @@ protected:
// the exclusive lock can be freed to let regular compaction run in parallel to major
lock_holder.return_all();
co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted));
co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace);
finish_compaction();
@@ -446,12 +482,12 @@ protected:
}
};
future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job) {
future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping) {
if (_state != state::enabled) {
return make_ready_future<>();
}
return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job))).discard_result();
return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job)), do_throw_if_stopping).discard_result();
}
future<> compaction_manager::update_static_shares(float static_shares) {
@@ -645,6 +681,7 @@ sstables::compaction_stopped_exception compaction_manager::task::make_compaction
compaction_manager::compaction_manager(config cfg, abort_source& as)
: _cfg(std::move(cfg))
, _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
, _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), [this] () -> float {
_last_backlog = backlog();
auto b = _last_backlog / available_memory();
@@ -679,6 +716,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as)
compaction_manager::compaction_manager()
: _cfg(config{ .available_memory = 1 })
, _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
, _compaction_controller(make_compaction_controller(compaction_sg(), 1, [] () -> float { return 1.0; }))
, _backlog_manager(_compaction_controller)
, _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
@@ -736,38 +774,46 @@ void compaction_manager::register_metrics() {
void compaction_manager::enable() {
assert(_state == state::none || _state == state::disabled);
_state = state::enabled;
_compaction_submission_timer.arm(periodic_compaction_submission_interval());
postponed_compactions_reevaluation();
_compaction_submission_timer.arm_periodic(periodic_compaction_submission_interval());
_waiting_reevalution = postponed_compactions_reevaluation();
}
std::function<void()> compaction_manager::compaction_submission_callback() {
return [this] () mutable {
for (auto& e: _compaction_state) {
submit(*e.first);
postpone_compaction_for_table(e.first);
}
reevaluate_postponed_compactions();
};
}
void compaction_manager::postponed_compactions_reevaluation() {
_waiting_reevalution = repeat([this] {
return _postponed_reevaluation.wait().then([this] {
if (_state != state::enabled) {
_postponed.clear();
return stop_iteration::yes;
}
auto postponed = std::move(_postponed);
try {
for (auto& t : postponed) {
auto s = t->schema();
cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
submit(*t);
future<> compaction_manager::postponed_compactions_reevaluation() {
while (true) {
co_await _postponed_reevaluation.when();
if (_state != state::enabled) {
_postponed.clear();
co_return;
}
// A table_state being reevaluated can re-insert itself into the postponed list, which is why
// the list to be processed is moved into a local.
auto postponed = std::exchange(_postponed, {});
try {
for (auto it = postponed.begin(); it != postponed.end();) {
compaction::table_state* t = *it;
it = postponed.erase(it);
// skip reevaluation of a table_state that became invalid post its removal
if (!_compaction_state.contains(t)) {
continue;
}
} catch (...) {
_postponed = std::move(postponed);
auto s = t->schema();
cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
submit(*t);
co_await coroutine::maybe_yield();
}
return stop_iteration::no;
});
});
} catch (...) {
_postponed.insert(postponed.begin(), postponed.end());
}
}
}
void compaction_manager::reevaluate_postponed_compactions() noexcept {
@@ -972,9 +1018,7 @@ protected:
}
auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
auto weight_r = compaction_weight_registration(&_cm, weight);
auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
compacting.release_compacting(exhausted_sstables);
};
auto on_replace = compacting.update_on_sstable_replacement();
cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}.{}",
fmt::ptr(this), descriptor.sstables.size(), weight, t.schema()->ks_name(), t.schema()->cf_name());
@@ -983,7 +1027,7 @@ protected:
try {
bool should_update_history = this->should_update_history(descriptor.options.type());
sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, std::move(release_exhausted));
sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
finish_compaction();
if (should_update_history) {
// update_history can take a long time compared to
@@ -1024,7 +1068,7 @@ void compaction_manager::submit(compaction::table_state& t) {
// OK to drop future.
// waited via task->stop()
(void)perform_task(make_shared<regular_compaction_task>(*this, t));
(void)perform_task(make_shared<regular_compaction_task>(*this, t)).then_wrapped([] (auto f) { f.ignore_ready_future(); });
}
bool compaction_manager::can_perform_regular_compaction(compaction::table_state& t) {
@@ -1045,7 +1089,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction::
desc.sstables
| boost::adaptors::transformed(std::mem_fn(&sstables::sstable::run_identifier))).size();
};
const auto threshold = std::max(schema->max_compaction_threshold(), 32);
const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
auto count = num_runs_for_compaction();
if (count <= threshold) {
cmlog.trace("No need to wait for sstable count reduction in {}.{}: {} <= {}",
@@ -1083,49 +1127,40 @@ public:
}
private:
future<> run_offstrategy_compaction(sstables::compaction_data& cdata) {
// This procedure will reshape sstables in maintenance set until it's ready for
// integration into main set.
// It may require N reshape rounds before the set satisfies the strategy invariant.
// This procedure also only updates maintenance set at the end, on success.
// Otherwise, some overlapping could be introduced in the set after each reshape
// round, progressively degrading read amplification until integration happens.
// The drawback of this approach is the 2x space requirement as the old sstables
// will only be deleted at the end. The impact of this space requirement is reduced
// by the fact that off-strategy is serialized across all tables, meaning that the
// actual requirement is the size of the largest table's maintenance set.
// Incrementally reshape the SSTables in maintenance set. The output of each reshape
// round is merged into the main set. The common case is that off-strategy input
// is mostly disjoint, e.g. repair-based node ops, then all the input will be
// reshaped in a single round. The incremental approach allows us to be space
// efficient (avoiding a 100% overhead) as we will incrementally replace input
// SSTables from maintenance set by output ones into main set.
compaction::table_state& t = *_compacting_table;
const auto& maintenance_sstables = t.maintenance_sstable_set();
const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all());
std::vector<sstables::shared_sstable> reshape_candidates = old_sstables;
std::vector<sstables::shared_sstable> sstables_to_remove;
std::unordered_set<sstables::shared_sstable> new_unused_sstables;
auto cleanup_new_unused_sstables_on_failure = defer([&new_unused_sstables] {
for (auto& sst : new_unused_sstables) {
sst->mark_for_deletion();
}
});
// Filter out sstables that require view building, to avoid a race between off-strategy
// and view building. Refs: #11882
auto get_reshape_candidates = [&t] () {
auto maintenance_ssts = t.maintenance_sstable_set().all();
return boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_ssts
| boost::adaptors::filtered([](const sstables::shared_sstable& sst) {
return !sst->requires_view_building();
}));
};
auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
auto& iop = service::get_local_streaming_priority(); // run reshape in maintenance mode
auto desc = t.get_compaction_strategy().get_reshaping_job(reshape_candidates, t.schema(), iop, sstables::reshape_mode::strict);
auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), iop, sstables::reshape_mode::strict);
return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
};
std::exception_ptr err;
while (auto desc = get_next_job()) {
desc->creator = [this, &new_unused_sstables, &t] (shard_id dummy) {
auto sst = t.make_sstable();
new_unused_sstables.insert(sst);
return sst;
};
auto input = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(desc->sstables);
auto compacting = compacting_sstable_registration(_cm, desc->sstables);
auto on_replace = compacting.update_on_sstable_replacement();
sstables::compaction_result ret;
try {
ret = co_await sstables::compact_sstables(std::move(*desc), cdata, t);
sstables::compaction_result _ = co_await compact_sstables(std::move(*desc), _compaction_data, on_replace,
compaction_manager::can_purge_tombstones::no,
sstables::offstrategy::yes);
} catch (sstables::compaction_stopped_exception&) {
// If off-strategy compaction stopped on user request, let's not discard the partial work.
// Therefore, both un-reshaped and reshaped data will be integrated into main set, allowing
@@ -1134,41 +1169,20 @@ private:
break;
}
_performed = true;
// update list of reshape candidates without input but with output added to it
auto it = boost::remove_if(reshape_candidates, [&] (auto& s) { return input.contains(s); });
reshape_candidates.erase(it, reshape_candidates.end());
std::move(ret.new_sstables.begin(), ret.new_sstables.end(), std::back_inserter(reshape_candidates));
// If compaction strategy is unable to reshape input data in a single round, it may happen that a SSTable A
// created in round 1 will be compacted in a next round producing SSTable B. As SSTable A is no longer needed,
// it can be removed immediately. Let's remove all such SSTables immediately to reduce off-strategy space requirement.
// Input SSTables from maintenance set can only be removed later, as SSTable sets are only updated on completion.
auto can_remove_now = [&] (const sstables::shared_sstable& s) { return new_unused_sstables.contains(s); };
for (auto&& sst : input) {
if (can_remove_now(sst)) {
co_await sst->unlink();
new_unused_sstables.erase(std::move(sst));
} else {
sstables_to_remove.push_back(std::move(sst));
}
}
}
// at this moment reshape_candidates contains a set of sstables ready for integration into main set
auto completion_desc = sstables::compaction_completion_desc{
.old_sstables = std::move(old_sstables),
.new_sstables = std::move(reshape_candidates)
};
co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
cleanup_new_unused_sstables_on_failure.cancel();
// By marking input sstables for deletion instead, the ones which require view building will stay in the staging
// directory until they're moved to the main dir when the time comes. Also, that allows view building to resume
// on restart if there's a crash midway.
for (auto& sst : sstables_to_remove) {
sst->mark_for_deletion();
// There might be some remaining sstables in maintenance set that didn't require reshape, or the
// user has aborted off-strategy. So we can only integrate them into the main set, such that
// they become candidates for regular compaction. We cannot hold them forever in maintenance set,
// as that causes read and space amplification issues.
if (auto sstables = get_reshape_candidates(); sstables.size()) {
auto completion_desc = sstables::compaction_completion_desc{
.old_sstables = sstables, // removes from maintenance set.
.new_sstables = sstables, // adds into main set.
};
co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
}
if (err) {
co_await coroutine::return_exception_ptr(std::move(err));
}
@@ -1191,9 +1205,11 @@ protected:
std::exception_ptr ex;
try {
compaction::table_state& t = *_compacting_table;
auto maintenance_sstables = t.maintenance_sstable_set().all();
cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
{
auto maintenance_sstables = t.maintenance_sstable_set().all();
cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
}
co_await run_offstrategy_compaction(_compaction_data);
finish_compaction();
cmlog.info("Done with off-strategy compaction for {}.{}", t.schema()->ks_name(), t.schema()->cf_name());
@@ -1266,9 +1282,7 @@ private:
sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, _options);
// Releases reference to cleaned sstable such that respective used disk space can be freed.
auto release_exhausted = [this] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
_compacting.release_compacting(exhausted_sstables);
};
auto on_replace = _compacting.update_on_sstable_replacement();
setup_new_compaction(descriptor.run_identifier);
@@ -1277,7 +1291,7 @@ private:
std::exception_ptr ex;
try {
sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted), _can_purge);
sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace, _can_purge);
finish_compaction();
_cm.reevaluate_postponed_compactions();
co_return res; // done with current sstable
@@ -1434,14 +1448,26 @@ protected:
co_return std::nullopt;
}
private:
// Releases reference to cleaned files such that respective used disk space can be freed.
void release_exhausted(std::vector<sstables::shared_sstable> exhausted_sstables) {
_compacting.release_compacting(exhausted_sstables);
}
future<> run_cleanup_job(sstables::compaction_descriptor descriptor) {
co_await coroutine::switch_to(_cm.compaction_sg().cpu);
// Releases reference to cleaned files such that respective used disk space can be freed.
using update_registration = compacting_sstable_registration::update_me;
class release_exhausted : public update_registration {
sstables::compaction_descriptor& _desc;
public:
release_exhausted(compacting_sstable_registration& registration, sstables::compaction_descriptor& desc)
: update_registration{registration}
, _desc{desc} {}
void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
auto exhausted = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(sstables);
std::erase_if(_desc.sstables, [&] (const sstables::shared_sstable& sst) {
return exhausted.contains(sst);
});
update_registration::on_removal(sstables);
}
};
release_exhausted on_replace{_compacting, descriptor};
for (;;) {
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
_cm.register_backlog_tracker(user_initiated);
@@ -1449,8 +1475,7 @@ private:
std::exception_ptr ex;
try {
setup_new_compaction(descriptor.run_identifier);
co_await compact_sstables_and_update_history(descriptor, _compaction_data,
std::bind(&cleanup_sstables_compaction_task::release_exhausted, this, std::placeholders::_1));
co_await compact_sstables_and_update_history(descriptor, _compaction_data, on_replace);
finish_compaction();
_cm.reevaluate_postponed_compactions();
co_return; // done with current job
@@ -1470,10 +1495,8 @@ private:
bool needs_cleanup(const sstables::shared_sstable& sst,
const dht::token_range_vector& sorted_owned_ranges,
schema_ptr s) {
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
auto first_token = dht::get_token(*s, first);
auto last_token = dht::get_token(*s, last);
auto first_token = sst->get_first_decorated_key().token();
auto last_token = sst->get_last_decorated_key().token();
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
auto r = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), first_token,
@@ -1573,8 +1596,13 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
}, can_purge_tombstones::no);
}
compaction_manager::compaction_state::compaction_state(table_state& t)
: backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
{
}
void compaction_manager::add(compaction::table_state& t) {
auto [_, inserted] = _compaction_state.insert({&t, compaction_state{}});
auto [_, inserted] = _compaction_state.try_emplace(&t, t);
if (!inserted) {
auto s = t.schema();
on_internal_error(cmlog, format("compaction_state for table {}.{} [{}] already exists", s->ks_name(), s->cf_name(), fmt::ptr(&t)));
@@ -1582,22 +1610,21 @@ void compaction_manager::add(compaction::table_state& t) {
}
future<> compaction_manager::remove(compaction::table_state& t) noexcept {
auto handle = _compaction_state.extract(&t);
auto& c_state = get_compaction_state(&t);
if (!handle.empty()) {
auto& c_state = handle.mapped();
// We need to guarantee that a task being stopped will not retry to compact
// a table being removed.
// The requirement above is provided by stop_ongoing_compactions().
_postponed.erase(&t);
// We need to guarantee that a task being stopped will not retry to compact
// a table being removed.
// The requirement above is provided by stop_ongoing_compactions().
_postponed.erase(&t);
// Wait for all compaction tasks running under gate to terminate
// and prevent new tasks from entering the gate.
co_await seastar::when_all_succeed(stop_ongoing_compactions("table removal", &t), c_state.gate.close()).discard_result();
// Wait for the termination of an ongoing compaction on table T, if any.
co_await stop_ongoing_compactions("table removal", &t);
c_state.backlog_tracker.disable();
_compaction_state.erase(&t);
// Wait for all functions running under gate to terminate.
co_await c_state.gate.close();
}
#ifdef DEBUG
auto found = false;
sstring msg;
@@ -1756,7 +1783,7 @@ void compaction_backlog_tracker::register_compacting_sstable(sstables::shared_ss
}
}
void compaction_backlog_tracker::transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges) {
void compaction_backlog_tracker::copy_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges) const {
for (auto&& w : _ongoing_writes) {
new_bt.register_partially_written_sstable(w.first, *w.second);
}
@@ -1766,8 +1793,6 @@ void compaction_backlog_tracker::transfer_ongoing_charges(compaction_backlog_tra
new_bt.register_compacting_sstable(w.first, *w.second);
}
}
_ongoing_writes = {};
_ongoing_compactions = {};
}
void compaction_backlog_tracker::revert_charges(sstables::shared_sstable sst) {
@@ -1775,6 +1800,26 @@ void compaction_backlog_tracker::revert_charges(sstables::shared_sstable sst) {
_ongoing_compactions.erase(sst);
}
compaction_backlog_tracker::compaction_backlog_tracker(compaction_backlog_tracker&& other)
: _impl(std::move(other._impl))
, _ongoing_writes(std::move(other._ongoing_writes))
, _ongoing_compactions(std::move(other._ongoing_compactions))
, _manager(std::exchange(other._manager, nullptr)) {
}
compaction_backlog_tracker&
compaction_backlog_tracker::operator=(compaction_backlog_tracker&& x) noexcept {
if (this != &x) {
if (auto manager = std::exchange(_manager, x._manager)) {
manager->remove_backlog_tracker(this);
}
_impl = std::move(x._impl);
_ongoing_writes = std::move(x._ongoing_writes);
_ongoing_compactions = std::move(x._ongoing_compactions);
}
return *this;
}
compaction_backlog_tracker::~compaction_backlog_tracker() {
if (_manager) {
_manager->remove_backlog_tracker(this);
@@ -1812,3 +1857,14 @@ compaction_backlog_manager::~compaction_backlog_manager() {
tracker->_manager = nullptr;
}
}
void compaction_manager::register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker) {
auto& cs = get_compaction_state(&t);
cs.backlog_tracker = std::move(new_backlog_tracker);
register_backlog_tracker(cs.backlog_tracker);
}
compaction_backlog_tracker& compaction_manager::get_backlog_tracker(compaction::table_state& t) {
auto& cs = get_compaction_state(&t);
return cs.backlog_tracker;
}


@@ -32,6 +32,7 @@
#include "compaction.hh"
#include "compaction_weight_registration.hh"
#include "compaction_backlog_manager.hh"
#include "compaction/compaction_descriptor.hh"
#include "strategy_control.hh"
#include "backlog_controller.hh"
#include "seastarx.hh"
@@ -49,6 +50,8 @@ public:
boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
};
using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
// Compaction manager provides facilities to submit and track compaction jobs on
// behalf of existing tables.
class compaction_manager {
@@ -83,8 +86,10 @@ private:
// Signaled whenever a compaction task completes.
condition_variable compaction_done;
compaction_state() = default;
compaction_state(compaction_state&&) = default;
compaction_backlog_tracker backlog_tracker;
explicit compaction_state(table_state& t);
compaction_state(compaction_state&&) = delete;
~compaction_state();
bool compaction_disabled() const noexcept {
@@ -135,11 +140,20 @@ public:
virtual ~task();
// called when a compaction replaces the exhausted sstables with the new set
struct on_replacement {
virtual ~on_replacement() {}
// called after the replacement completes
// @param sstables the old sstables which are replaced in this replacement
virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
// called before the replacement happens
// @param sstables the new sstables to be added to the table's sstable set
virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
};
protected:
virtual future<compaction_stats_opt> do_run() = 0;
using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
state switch_state(state new_state);
future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -156,12 +170,10 @@ public:
// otherwise, returns stop_iteration::no after sleep for exponential retry.
future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);
// Compacts set of SSTables according to the descriptor.
using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
can_purge_tombstones can_purge = can_purge_tombstones::yes);
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
can_purge_tombstones can_purge = can_purge_tombstones::yes);
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
can_purge_tombstones can_purge = can_purge_tombstones::yes, sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> update_history(compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
bool should_update_history(sstables::compaction_type ct) {
return ct == sstables::compaction_type::Compaction;
@@ -294,10 +306,10 @@ private:
std::function<void()> compaction_submission_callback();
// all registered tables are reevaluated at a constant interval.
// Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
timer<lowres_clock> _compaction_submission_timer = timer<lowres_clock>(compaction_submission_callback());
static constexpr std::chrono::seconds periodic_compaction_submission_interval() { return std::chrono::seconds(3600); }
config _cfg;
timer<lowres_clock> _compaction_submission_timer;
compaction_controller _compaction_controller;
compaction_backlog_manager _backlog_manager;
optimized_optional<abort_source::subscription> _early_abort_subscription;
@@ -313,7 +325,7 @@ private:
per_table_history_maps _repair_history_maps;
tombstone_gc_state _tombstone_gc_state;
private:
future<compaction_stats_opt> perform_task(shared_ptr<task>);
future<compaction_stats_opt> perform_task(shared_ptr<task>, throw_if_stopping do_throw_if_stopping = throw_if_stopping::no);
future<> stop_tasks(std::vector<shared_ptr<task>> tasks, sstring reason);
future<> update_throughput(uint32_t value_mbs);
@@ -348,7 +360,7 @@ private:
// table still exists and compaction is not disabled for the table.
inline bool can_proceed(compaction::table_state* t) const;
void postponed_compactions_reevaluation();
future<> postponed_compactions_reevaluation();
void reevaluate_postponed_compactions() noexcept;
// Postpone compaction for a table that couldn't be executed due to ongoing
// similar-sized compaction.
@@ -458,7 +470,7 @@ public:
// parameter type is the compaction type the operation can most closely be
// associated with, use compaction_type::Compaction, if none apply.
// parameter job is a function that will carry the operation
future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);
future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping);
class compaction_reenabler {
compaction_manager& _cm;
@@ -524,6 +536,9 @@ public:
void register_backlog_tracker(compaction_backlog_tracker& backlog_tracker) {
_backlog_manager.register_backlog_tracker(backlog_tracker);
}
void register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker);
compaction_backlog_tracker& get_backlog_tracker(compaction::table_state& t);
static sstables::compaction_data create_compaction_data();


@@ -65,7 +65,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
}
uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
return partition_estimate;
}
@@ -409,7 +409,9 @@ public:
l0_old_ssts.push_back(std::move(sst));
}
}
_l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
if (l0_old_ssts.size() || l0_new_ssts.size()) {
_l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
}
}
};
@@ -427,14 +429,6 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
};
// Just so that if we have more than one CF with NullStrategy, we don't create a lot
// of objects to iterate over for no reason
// Still thread local because of make_unique. But this will disappear soon
static thread_local compaction_backlog_tracker null_backlog_tracker(std::make_unique<null_backlog_tracker>());
compaction_backlog_tracker& get_null_backlog_tracker() {
return null_backlog_tracker;
}
//
// Null compaction strategy is the default compaction strategy.
// As the name implies, it does nothing.
@@ -453,8 +447,8 @@ public:
return compaction_strategy_type::null;
}
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return get_null_backlog_tracker();
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override {
return std::make_unique<null_backlog_tracker>();
}
};
@@ -462,11 +456,14 @@ leveled_compaction_strategy::leveled_compaction_strategy(const std::map<sstring,
: compaction_strategy_impl(options)
, _max_sstable_size_in_mb(calculate_max_sstable_size_in_mb(compaction_strategy_impl::get_value(options, SSTABLE_SIZE_OPTION)))
, _stcs_options(options)
, _backlog_tracker(std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options))
{
_compaction_counter.resize(leveled_manifest::MAX_LEVELS);
}
std::unique_ptr<compaction_backlog_tracker::impl> leveled_compaction_strategy::make_backlog_tracker() {
return std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options);
}
int32_t
leveled_compaction_strategy::calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const {
using namespace cql3::statements;
@@ -486,7 +483,6 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
: compaction_strategy_impl(options)
, _options(options)
, _stcs_options(options)
, _backlog_tracker(std::make_unique<time_window_backlog_tracker>(_options, _stcs_options))
{
if (!options.contains(TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.contains(TOMBSTONE_THRESHOLD_OPTION)) {
_disable_tombstone_compaction = true;
@@ -497,6 +493,10 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
_use_clustering_key_filter = true;
}
std::unique_ptr<compaction_backlog_tracker::impl> time_window_compaction_strategy::make_backlog_tracker() {
return std::make_unique<time_window_backlog_tracker>(_options, _stcs_options);
}
} // namespace sstables
std::vector<sstables::shared_sstable>
@@ -640,7 +640,6 @@ namespace sstables {
date_tiered_compaction_strategy::date_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
: compaction_strategy_impl(options)
, _manifest(options)
, _backlog_tracker(std::make_unique<unimplemented_backlog_tracker>())
{
clogger.warn("DateTieredCompactionStrategy is deprecated. Usually cases for which it is used are better handled by TimeWindowCompactionStrategy."
" Please change your compaction strategy to TWCS as DTCS will be retired in the near future");
@@ -685,17 +684,23 @@ compaction_descriptor date_tiered_compaction_strategy::get_sstables_for_compacti
return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
}
std::unique_ptr<compaction_backlog_tracker::impl> date_tiered_compaction_strategy::make_backlog_tracker() {
return std::make_unique<unimplemented_backlog_tracker>();
}
size_tiered_compaction_strategy::size_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
: compaction_strategy_impl(options)
, _options(options)
, _backlog_tracker(std::make_unique<size_tiered_backlog_tracker>(_options))
{}
size_tiered_compaction_strategy::size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options)
: _options(options)
, _backlog_tracker(std::make_unique<size_tiered_backlog_tracker>(_options))
{}
std::unique_ptr<compaction_backlog_tracker::impl> size_tiered_compaction_strategy::make_backlog_tracker() {
return std::make_unique<size_tiered_backlog_tracker>(_options);
}
compaction_strategy::compaction_strategy(::shared_ptr<compaction_strategy_impl> impl)
: _compaction_strategy_impl(std::move(impl)) {}
compaction_strategy::compaction_strategy() = default;
@@ -736,8 +741,8 @@ bool compaction_strategy::use_clustering_key_filter() const {
return _compaction_strategy_impl->use_clustering_key_filter();
}
compaction_backlog_tracker& compaction_strategy::get_backlog_tracker() {
return _compaction_strategy_impl->get_backlog_tracker();
compaction_backlog_tracker compaction_strategy::make_backlog_tracker() {
return compaction_backlog_tracker(_compaction_strategy_impl->make_backlog_tracker());
}
sstables::compaction_descriptor
@@ -745,8 +750,8 @@ compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema
return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
}
uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
}
reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) {


@@ -106,9 +106,9 @@ public:
sstable_set make_sstable_set(schema_ptr schema) const;
compaction_backlog_tracker& get_backlog_tracker();
compaction_backlog_tracker make_backlog_tracker();
uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr);
reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);


@@ -22,8 +22,6 @@ class strategy_control;
namespace sstables {
compaction_backlog_tracker& get_unimplemented_backlog_tracker();
class sstable_set_impl;
class resharding_descriptor;
@@ -70,9 +68,9 @@ public:
// droppable tombstone histogram and gc_before.
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state);
virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema);
virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);


@@ -259,7 +259,6 @@ namespace sstables {
class date_tiered_compaction_strategy : public compaction_strategy_impl {
date_tiered_manifest _manifest;
compaction_backlog_tracker _backlog_tracker;
public:
date_tiered_compaction_strategy(const std::map<sstring, sstring>& options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;
@@ -272,9 +271,7 @@ public:
return compaction_strategy_type::date_tiered;
}
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
};
}


@@ -144,6 +144,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;
leveled_manifest::logger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
for (auto& sst : input) {
auto sst_level = sst->get_sstable_level();
if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -200,10 +202,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
auto [disjoint, overlapping_sstables] = is_disjoint(level_info[level], tolerance(level));
if (!disjoint) {
auto ideal_level = ideal_level_for_input(input, max_sstable_size_in_bytes);
leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so compacting everything on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
// Unfortunately no good limit to limit input size to max_sstables for LCS major
compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
compaction_descriptor desc(std::move(level_info[level]), iop, level, max_sstable_size_in_bytes);
desc.options = compaction_type_options::make_reshape();
return desc;
}
@@ -229,6 +229,9 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
}
unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
if (!max_sstable_size) {
return 1;
}
auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
double inv_log_fanout = 1.0f / std::log(fanout);
return log(x) * inv_log_fanout;


@@ -35,7 +35,6 @@ class leveled_compaction_strategy : public compaction_strategy_impl {
std::optional<std::vector<std::optional<dht::decorated_key>>> _last_compacted_keys;
std::vector<int> _compaction_counter;
size_tiered_compaction_strategy_options _stcs_options;
compaction_backlog_tracker _backlog_tracker;
int32_t calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const;
public:
static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
@@ -64,9 +63,7 @@ public:
}
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) override;
};


@@ -6,6 +6,7 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include "sstables/sstables.hh"
#include "size_tiered_compaction_strategy.hh"
#include <boost/range/adaptor/transformed.hpp>


@@ -10,7 +10,7 @@
#include "compaction_strategy_impl.hh"
#include "compaction.hh"
#include "sstables/sstables.hh"
#include "sstables/shared_sstable.hh"
#include <boost/algorithm/cxx11/any_of.hpp>
class size_tiered_backlog_tracker;
@@ -82,7 +82,6 @@ public:
class size_tiered_compaction_strategy : public compaction_strategy_impl {
size_tiered_compaction_strategy_options _options;
compaction_backlog_tracker _backlog_tracker;
// Return a list of pair of shared_sstable and its respective size.
static std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables);
@@ -128,9 +127,7 @@ public:
most_interesting_bucket(const std::vector<sstables::shared_sstable>& candidates, int min_threshold, int max_threshold,
size_tiered_compaction_strategy_options options = {});
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) override;


@@ -10,14 +10,15 @@
#pragma once
#include "schema_fwd.hh"
#include "sstables/sstable_set.hh"
#include "sstables/sstables_manager.hh"
#include "compaction_descriptor.hh"
class reader_permit;
class compaction_backlog_tracker;
namespace sstables {
class sstable_set;
class compaction_strategy;
class sstables_manager;
struct sstable_writer_config;
}
@@ -43,6 +44,7 @@ public:
virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
};
}


@@ -100,16 +100,27 @@ public:
};
};
uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
// Not enough information, we assume the worst
return partition_estimate / max_data_segregation_window_count;
}
const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
const auto window_size = get_window_size(_options);
uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) {
// If not enough information, we assume the worst
auto estimated_window_count = max_data_segregation_window_count;
auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
const auto window_size = get_window_size(_options);
return (max_window + (window_size - 1) - min_window) / window_size;
};
auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
if (!min_and_max_ts_available && default_ttl.count()) {
auto min_window = get_window_for(_options, timestamp_type(0));
auto max_window = get_window_for(_options, timestamp_type(default_ttl.count()));
estimated_window_count = estimate_window_count(min_window, max_window);
} else if (min_and_max_ts_available) {
auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
estimated_window_count = estimate_window_count(min_window, max_window);
}
return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
}


@@ -15,7 +15,7 @@
#include "size_tiered_compaction_strategy.hh"
#include "timestamp.hh"
#include "exceptions/exceptions.hh"
#include "sstables/sstables.hh"
#include "sstables/shared_sstable.hh"
#include "service/priority_manager.hh"
namespace sstables {
@@ -73,7 +73,6 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
// Keep track of all recent active windows that still need to be compacted into a single SSTable
std::unordered_set<timestamp_type> _recent_active_windows;
size_tiered_compaction_strategy_options _stcs_options;
compaction_backlog_tracker _backlog_tracker;
public:
// The maximum amount of buckets we segregate data into when writing into sstables.
// To prevent an explosion in the number of sstables we cap it.
@@ -156,11 +155,9 @@ public:
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) override;
virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) override;


@@ -16,7 +16,6 @@
#include <boost/range/adaptor/transformed.hpp>
#include "utils/serialization.hh"
#include <seastar/util/backtrace.hh>
#include "cql_serialization_format.hh"
enum class allow_prefixes { no, yes };
@@ -280,7 +279,7 @@ public:
}
for (size_t i = 0; i != values.size(); ++i) {
//FIXME: is it safe to assume internal serialization-format format?
_types[i]->validate(values[i], cql_serialization_format::internal());
_types[i]->validate(values[i]);
}
}
bool equal(managed_bytes_view v1, managed_bytes_view v2) const {


@@ -560,7 +560,7 @@ public:
auto marker = it->second;
++it;
if (it != e && marker != composite::eoc::none) {
throw runtime_exception(format("non-zero component divider found ({:d}) mid", format("0x{:02x}", composite::eoc_type(marker) & 0xff)));
throw runtime_exception(format("non-zero component divider found ({:#02x}) mid", composite::eoc_type(marker) & 0xff));
}
}
return ret;


@@ -117,6 +117,8 @@ struct date_type_impl final : public concrete_type<db_clock::time_point> {
using timestamp_date_base_class = concrete_type<db_clock::time_point>;
sstring timestamp_to_json_string(const timestamp_date_base_class& t, const bytes_view& bv);
struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
timeuuid_type_impl();
static utils::UUID from_sstring(sstring_view s);


@@ -65,6 +65,13 @@ commitlog_sync_period_in_ms: 10000
# is reasonable.
commitlog_segment_size_in_mb: 32
# The size of the individual schema commitlog file segments.
# The segment size puts a limit on the mutation size that can be
# written at once, and some schema mutation writes are much larger
# than average.
schema_commitlog_segment_size_in_mb: 32
# seed_provider class_name is saved for future use.
# A seed address is mandatory.
seed_provider:
@@ -448,20 +455,20 @@ commitlog_total_space_in_mb: -1
# internode_encryption: none
# certificate: conf/scylla.crt
# keyfile: conf/scylla.key
# truststore: <none, use system trust>
# certficate_revocation_list: <none>
# truststore: <not set, use system trust>
# certficate_revocation_list: <not set>
# require_client_auth: False
# priority_string: <none, use default>
# priority_string: <not set, use default>
# enable or disable client/server encryption.
# client_encryption_options:
# enabled: false
# certificate: conf/scylla.crt
# keyfile: conf/scylla.key
# truststore: <none, use system trust>
# certficate_revocation_list: <none>
# truststore: <not set, use system trust>
# certficate_revocation_list: <not set>
# require_client_auth: False
# priority_string: <none, use default>
# priority_string: <not set, use default>
# internode_compression controls whether traffic between nodes is
# compressed.
@@ -553,4 +560,16 @@ murmur3_partitioner_ignore_msb_bits: 12
# WARNING: It's unsafe to set this to false if the node previously booted
# with the schema commit log enabled. In such case, some schema changes
# may be lost if the node was not cleanly stopped.
force_schema_commit_log: true
force_schema_commit_log: true
# Use Raft to consistently manage schema information in the cluster.
# Refer to https://docs.scylladb.com/master/architecture/raft.html for more details.
# The 'Handling Failures' section is especially important.
#
# Once enabled in a cluster, this cannot be turned off.
# If you want to bootstrap a new cluster without Raft, make sure to set this to `false`
# before starting your nodes for the first time.
#
# A cluster not using Raft can be 'upgraded' to use Raft. Refer to the aforementioned
# documentation, section 'Enabling Raft in ScyllaDB 5.2 and further', for the procedure.
consistent_cluster_management: true
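The comment block above boils down to a single irreversible switch. A hedged scylla.yaml sketch for the one case where overriding the new default makes sense (a brand-new cluster that must not use Raft; existing clusters should follow the linked upgrade procedure instead):

```yaml
# Must be set before the nodes' very first start; once a cluster has run
# with Raft-managed schema, this cannot be turned back off.
consistent_cluster_management: false
```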


@@ -44,16 +44,12 @@ distro_extra_cflags = ''
distro_extra_ldflags = ''
distro_extra_cmake_args = []
employ_ld_trickery = True
has_wasmtime = False
use_wasmtime_as_library = False
# distro-specific setup
def distro_setup_nix():
global os_ids, employ_ld_trickery, has_wasmtime, use_wasmtime_as_library
global os_ids, employ_ld_trickery
os_ids = ['linux']
employ_ld_trickery = False
has_wasmtime = True
use_wasmtime_as_library = True
if os.environ.get('NIX_CC'):
distro_setup_nix()
@@ -200,7 +196,7 @@ def linker_flags(compiler):
def maybe_static(flag, libs):
if flag and not args.static:
if flag:
libs = '-Wl,-Bstatic {} -Wl,-Bdynamic'.format(libs)
return libs
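The hunk above drops the `not args.static` guard, so static bracketing now depends only on the per-library flag. The resulting helper, reproduced standalone:

```python
def maybe_static(flag, libs):
    # When `flag` is truthy, bracket the library flags so the linker resolves
    # them against static archives, then restore dynamic linking for the rest
    # of the command line.
    if flag:
        libs = '-Wl,-Bstatic {} -Wl,-Bdynamic'.format(libs)
    return libs
```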
@@ -289,7 +285,8 @@ modes = {
'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
'cxx_ld_flags': '',
'stack-usage-threshold': 1024*40,
'optimization-level': 'g',
# -fasan -Og breaks some coroutines on aarch64, use -O0 instead
'optimization-level': ('0' if platform.machine() == 'aarch64' else 'g'),
'per_src_extra_cxxflags': {},
'cmake_build_type': 'Debug',
'can_have_debug_info': True,
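The debug-mode hunk above picks the optimization level per architecture, because AddressSanitizer combined with `-Og` breaks some coroutines on aarch64. A standalone sketch of that selection (the function name is ours, not configure.py's):

```python
import platform

def debug_optimization_level(machine=None):
    # ASan + -Og miscompiles some coroutines on aarch64, so debug builds
    # there fall back to -O0; other architectures keep -Og.
    machine = machine or platform.machine()
    return '0' if machine == 'aarch64' else 'g'
```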
@@ -412,6 +409,7 @@ scylla_tests = set([
'test/boost/limiting_data_source_test',
'test/boost/linearizing_input_stream_test',
'test/boost/loading_cache_test',
'test/boost/locator_topology_test',
'test/boost/log_heap_test',
'test/boost/estimated_histogram_test',
'test/boost/summary_test',
@@ -482,6 +480,8 @@ scylla_tests = set([
'test/boost/virtual_reader_test',
'test/boost/virtual_table_mutation_source_test',
'test/boost/virtual_table_test',
'test/boost/wasm_test',
'test/boost/wasm_alloc_test',
'test/boost/bptree_test',
'test/boost/btree_test',
'test/boost/radix_tree_test',
@@ -573,13 +573,6 @@ all_artifacts = apps | tests | other
arg_parser = argparse.ArgumentParser('Configure scylla')
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
help='Output build-file name (by default build.ninja)')
arg_parser.add_argument('--static', dest='static', action='store_const', default='',
const='-static',
help='Static link (useful for running on hosts outside the build environment)')
arg_parser.add_argument('--pie', dest='pie', action='store_true',
help='Build position-independent executable (PIE)')
arg_parser.add_argument('--so', dest='so', action='store_true',
help='Build shared object (SO) instead of executable')
arg_parser.add_argument('--mode', action='append', choices=list(modes.keys()), dest='selected_modes',
help="Build modes to generate ninja files for. The available build modes are:\n{}".format("; ".join(["{} - {}".format(m, cfg['description']) for m, cfg in modes.items()])))
arg_parser.add_argument('--with', dest='artifacts', action='append', default=[],
@@ -670,7 +663,7 @@ scylla_core = (['message/messaging_service.cc',
'replica/distributed_loader.cc',
'replica/memtable.cc',
'replica/exceptions.cc',
'dirty_memory_manager.cc',
'replica/dirty_memory_manager.cc',
'absl-flat_hash_map.cc',
'atomic_cell.cc',
'caching_options.cc',
@@ -705,6 +698,7 @@ scylla_core = (['message/messaging_service.cc',
'mutation_partition.cc',
'mutation_partition_view.cc',
'mutation_partition_serializer.cc',
'utils/on_internal_error.cc',
'converting_mutation_partition_applier.cc',
'readers/combined.cc',
'readers/multishard.cc',
@@ -824,6 +818,7 @@ scylla_core = (['message/messaging_service.cc',
'cql3/statements/detach_service_level_statement.cc',
'cql3/statements/list_service_level_statement.cc',
'cql3/statements/list_service_level_attachments_statement.cc',
'cql3/statements/describe_statement.cc',
'cql3/update_parameters.cc',
'cql3/util.cc',
'cql3/ut_name.cc',
@@ -909,6 +904,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/config_file.cc',
'utils/multiprecision_int.cc',
'utils/gz/crc_combine.cc',
'utils/gz/crc_combine_table.cc',
'gms/version_generator.cc',
'gms/versioned_value.cc',
'gms/gossiper.cc',
@@ -943,6 +939,8 @@ scylla_core = (['message/messaging_service.cc',
'locator/ec2_snitch.cc',
'locator/ec2_multi_region_snitch.cc',
'locator/gce_snitch.cc',
'locator/topology.cc',
'locator/util.cc',
'service/client_state.cc',
'service/storage_service.cc',
'service/misc_services.cc',
@@ -972,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/lister.cc',
'repair/repair.cc',
'repair/row_level.cc',
'repair/table_check.cc',
'exceptions/exceptions.cc',
'auth/allow_all_authenticator.cc',
'auth/allow_all_authorizer.cc',
@@ -1033,6 +1032,7 @@ scylla_core = (['message/messaging_service.cc',
'service/raft/raft_group0_client.cc',
'service/broadcast_tables/experimental/lang.cc',
'tasks/task_manager.cc',
'rust/wasmtime_bindings/src/lib.rs',
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')] \
+ scylla_raft_core
)
@@ -1079,6 +1079,8 @@ api = ['api/api.cc',
Json2Code('api/api-doc/error_injection.json'),
'api/authorization_cache.cc',
Json2Code('api/api-doc/authorization_cache.json'),
'api/raft.cc',
Json2Code('api/api-doc/raft.json'),
]
alternator = [
@@ -1150,10 +1152,6 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/experimental/broadcast_tables_lang.idl.hh',
]
rusts = [
'rust/inc/src/lib.rs',
]
headers = find_headers('.', excluded_dirs=['idl', 'build', 'seastar', '.git'])
scylla_tests_generic_dependencies = [
@@ -1177,7 +1175,7 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci
scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc']
scylla_tools = ['tools/scylla-types.cc', 'tools/scylla-sstable.cc', 'tools/schema_loader.cc', 'tools/utils.cc']
scylla_tools = ['tools/scylla-types.cc', 'tools/scylla-sstable.cc', 'tools/schema_loader.cc', 'tools/utils.cc', 'tools/lua_sstable_consumer.cc']
deps = {
'scylla': idls + ['main.cc'] + scylla_core + api + alternator + redis + scylla_tools,
@@ -1275,7 +1273,7 @@ deps['test/boost/bytes_ostream_test'] = [
"test/lib/log.cc",
]
deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc', 'utils/on_internal_error.cc']
deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
@@ -1306,7 +1304,7 @@ deps['test/boost/exceptions_fallback_test'] = ['test/boost/exceptions_fallback_t
deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
deps['test/boost/schema_loader_test'] += ['tools/schema_loader.cc']
deps['test/boost/rust_test'] += rusts
deps['test/boost/rust_test'] += ['rust/inc/src/lib.rs']
deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
@@ -1323,8 +1321,6 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
'test/lib/log.cc',
'service/raft/discovery.cc'] + scylla_raft_dependencies
deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']
warnings = [
'-Wall',
@@ -1374,7 +1370,7 @@ warnings = [w
warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])
def clang_inline_threshold():
def get_clang_inline_threshold():
if args.clang_inline_threshold != -1:
return args.clang_inline_threshold
elif platform.machine() == 'aarch64':
@@ -1395,7 +1391,7 @@ for mode in modes:
optimization_flags = [
'--param inline-unit-growth=300', # gcc
f'-mllvm -inline-threshold={clang_inline_threshold()}', # clang
f'-mllvm -inline-threshold={get_clang_inline_threshold()}', # clang
# clang generates 16-byte loads that break store-to-load forwarding
# gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
'-fno-slp-vectorize',
@@ -1409,19 +1405,6 @@ if flag_supported(flag='-Wstack-usage=4096', compiler=args.cxx):
for mode in modes:
modes[mode]['cxxflags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='
if not has_wasmtime:
has_wasmtime = os.path.isfile('/usr/lib64/libwasmtime.a') and os.path.isdir('/usr/local/include/wasmtime')
if has_wasmtime:
if platform.machine() == 'aarch64':
print("wasmtime is temporarily not supported on aarch64. Ref: issue #9387")
has_wasmtime = False
else:
for mode in modes:
modes[mode]['cxxflags'] += ' -DSCYLLA_ENABLE_WASMTIME'
else:
print("wasmtime not found - WASM support will not be enabled in this build")
linker_flags = linker_flags(compiler=args.cxx)
dbgflag = '-g -gz' if args.debuginfo else ''
@@ -1432,16 +1415,6 @@ perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
# debug info from the libraries we static link with
regular_link_rule = 'link' if args.debuginfo else 'link_stripped'
if args.so:
args.pie = '-shared'
args.fpie = '-fpic'
elif args.pie:
args.pie = '-pie'
args.fpie = '-fpie'
else:
args.pie = ''
args.fpie = ''
# a list element means a list of alternative packages to consider
# the first element becomes the HAVE_pkg define
# a string element is a package name with no alternatives
@@ -1598,13 +1571,14 @@ args.user_ldflags = forced_ldflags + ' ' + args.user_ldflags
args.user_cflags += f" -ffile-prefix-map={curdir}=."
seastar_cflags = args.user_cflags
if args.target != '':
seastar_cflags += ' -march=' + args.target
seastar_ldflags = args.user_ldflags
args.user_cflags += ' -march=' + args.target
libdeflate_cflags = seastar_cflags
for mode in modes:
# Those flags are passed not only to Scylla objects, but also to libraries
# that we compile ourselves.
modes[mode]['lib_cflags'] = args.user_cflags
modes[mode]['lib_ldflags'] = args.user_ldflags + linker_flags
# cmake likes to separate things with semicolons
def semicolon_separated(*flags):
@@ -1624,8 +1598,8 @@ def configure_seastar(build_dir, mode, mode_config):
'-DCMAKE_C_COMPILER={}'.format(args.cc),
'-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
'-DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON',
'-DSeastar_CXX_FLAGS={}'.format((seastar_cflags).replace(' ', ';')),
'-DSeastar_LD_FLAGS={}'.format(semicolon_separated(seastar_ldflags, modes[mode]['cxx_ld_flags'])),
'-DSeastar_CXX_FLAGS=SHELL:{}'.format(mode_config['lib_cflags']),
'-DSeastar_LD_FLAGS={}'.format(semicolon_separated(mode_config['lib_ldflags'], mode_config['cxx_ld_flags'])),
'-DSeastar_CXX_DIALECT=gnu++20',
'-DSeastar_API_LEVEL=6',
'-DSeastar_UNUSED_RESULT_ERROR=ON',
@@ -1686,52 +1660,16 @@ for mode in build_modes:
seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
modes[mode]['seastar_cflags'] = seastar_pc_cflags
modes[mode]['seastar_libs'] = seastar_pc_libs
modes[mode]['seastar_testing_libs'] = pkg_config(pc[mode].replace('seastar.pc', 'seastar-testing.pc'), '--libs', '--static')
def configure_abseil(build_dir, mode, mode_config):
abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
abseil_pkgs = [
'absl_raw_hash_set',
'absl_hash',
]
abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
cmake_mode = mode_config['cmake_build_type']
abseil_cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
'-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
'-DCMAKE_C_COMPILER={}'.format(args.cc),
'-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
'-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
'-DCMAKE_EXPORT_COMPILE_COMMANDS=ON',
'-DCMAKE_CXX_STANDARD=20',
'-DABSL_PROPAGATE_CXX_STD=ON',
] + distro_extra_cmake_args
abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + abseil_cmake_args
os.makedirs(abseil_build_dir, exist_ok=True)
subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
abseil_libs = ['absl/' + lib for lib in [
'container/libabsl_hashtablez_sampler.a',
'container/libabsl_raw_hash_set.a',
'synchronization/libabsl_synchronization.a',
'synchronization/libabsl_graphcycles_internal.a',
'debugging/libabsl_stacktrace.a',
'debugging/libabsl_symbolize.a',
'debugging/libabsl_debugging_internal.a',
'debugging/libabsl_demangle_internal.a',
'time/libabsl_time.a',
'time/libabsl_time_zone.a',
'numeric/libabsl_int128.a',
'hash/libabsl_city.a',
'hash/libabsl_hash.a',
'hash/libabsl_low_level_hash.a',
'base/libabsl_malloc_internal.a',
'base/libabsl_spinlock_wait.a',
'base/libabsl_base.a',
'base/libabsl_raw_logging_internal.a',
'profiling/libabsl_exponential_biased.a',
'base/libabsl_throw_delegate.a']]
pkgs += abseil_pkgs
args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
args.user_cflags += ' -march=' + args.target
libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
' -lstdc++fs', ' -lcrypt', ' -lcryptopp', ' -lpthread',
# Must link with static version of libzstd, since
@@ -1739,11 +1677,8 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
maybe_static(True, '-lzstd'),
maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc -licui18n'),
'-lxxhash',
'-ldeflate',
])
if has_wasmtime:
print("Found wasmtime dependency, linking with libwasmtime")
if use_wasmtime_as_library:
libs += " -lwasmtime"
if not args.staticboost:
args.user_cflags += ' -DBOOST_TEST_DYN_LINK'
@@ -1762,7 +1697,6 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
libs += ' ' + pkg_config(pkg, '--libs')
args.user_cflags += ' -isystem abseil'
user_cflags = args.user_cflags + ' -fvisibility=hidden'
user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
if args.staticcxx:
@@ -1784,10 +1718,6 @@ if args.ragel_exec:
else:
ragel_exec = "ragel"
if not args.dist_only:
for mode, mode_config in build_modes.items():
configure_abseil(outdir, mode, mode_config)
with open(buildfile, 'w') as f:
f.write(textwrap.dedent('''\
configure_args = {configure_args}
@@ -1840,18 +1770,24 @@ with open(buildfile, 'w') as f:
rule unified
command = unified/build_unified.sh --mode $mode --unified-pkg $out
rule rust_header
command = cxxbridge $in > $out
command = cxxbridge --include rust/cxx.h --header $in > $out
description = RUST_HEADER $out
rule rust_source
command = cxxbridge --include rust/cxx.h $in > $out
description = RUST_SOURCE $out
rule cxxbridge_header
command = cxxbridge --header > $out
''').format(**globals()))
for mode in build_modes:
modeval = modes[mode]
fmt_lib = 'fmt'
f.write(textwrap.dedent('''\
cxx_ld_flags_{mode} = {cxx_ld_flags}
ld_flags_{mode} = $cxx_ld_flags_{mode}
cxxflags_{mode} = $cxx_ld_flags_{mode} {cxxflags} -iquote. -iquote $builddir/{mode}/gen
ld_flags_{mode} = $cxx_ld_flags_{mode} {lib_ldflags}
cxxflags_{mode} = $cxx_ld_flags_{mode} {lib_cflags} {cxxflags} -iquote. -iquote $builddir/{mode}/gen
libs_{mode} = -l{fmt_lib}
seastar_libs_{mode} = {seastar_libs}
seastar_testing_libs_{mode} = {seastar_testing_libs}
rule cxx.{mode}
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in
description = CXX $out
@@ -1901,7 +1837,8 @@ with open(buildfile, 'w') as f:
pool = console
description = TEST {mode}
rule rust_lib.{mode}
command = CARGO_HOME=build/{mode}/rust/.cargo cargo build --release --manifest-path=rust/Cargo.toml --target-dir=build/{mode}/rust -p ${{pkg}}
command = CARGO_BUILD_DEP_INFO_BASEDIR='.' cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
&& touch $out
description = RUST_LIB $out
''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, test_repeat=test_repeat, test_timeout=test_timeout, **modeval))
f.write(
@@ -1920,7 +1857,6 @@ with open(buildfile, 'w') as f:
ragels = {}
antlr3_grammars = set()
rust_headers = {}
rust_libs = {}
seastar_dep = '$builddir/{}/seastar/libseastar.a'.format(mode)
seastar_testing_dep = '$builddir/{}/seastar/libseastar_testing.a'.format(mode)
for binary in sorted(build_artifacts):
@@ -1931,9 +1867,8 @@ with open(buildfile, 'w') as f:
for src in srcs
if src.endswith('.cc')]
objs.append('$builddir/../utils/arch/powerpc/crc32-vpmsum/crc32.S')
if has_wasmtime and not use_wasmtime_as_library:
objs.append('/usr/lib64/libwasmtime.a')
has_thrift = False
has_rust = False
for dep in deps[binary]:
if isinstance(dep, Thrift):
has_thrift = True
@@ -1942,43 +1877,36 @@ with open(buildfile, 'w') as f:
objs += dep.objects('$builddir/' + mode + '/gen')
if isinstance(dep, Json2Code):
objs += dep.objects('$builddir/' + mode + '/gen')
if dep.endswith('/src/lib.rs'):
lib = dep.replace('/src/lib.rs', '.a').replace('rust/','lib')
objs.append('$builddir/' + mode + '/rust/release/' + lib)
if binary.endswith('.a'):
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
if dep.endswith('.rs'):
has_rust = True
idx = dep.rindex('/src/')
obj = dep[:idx].replace('rust/','') + '.o'
objs.append('$builddir/' + mode + '/gen/rust/' + obj)
if has_rust:
objs.append('$builddir/' + mode +'/rust-' + mode + '/librust_combined.a')
local_libs = '$seastar_libs_{} $libs'.format(mode)
if has_thrift:
local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
if binary in tests:
if binary in pure_boost_tests:
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
if binary not in tests_not_using_seastar_test_framework:
local_libs += ' ' + "$seastar_testing_libs_{}".format(mode)
# Our code's debugging information is huge, and multiplied
# by many tests yields ridiculous amounts of disk space.
# So we strip the tests by default; The user can very
# quickly re-link the test unstripped by adding a "_g"
# to the test name, e.g., "ninja build/release/testname_g"
link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
else:
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
'libdeflate/libdeflate.a',
] + [
'abseil/' + x for x in abseil_libs
]])
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
if binary in tests:
local_libs = '$seastar_libs_{} $libs'.format(mode)
if binary in pure_boost_tests:
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
if binary not in tests_not_using_seastar_test_framework:
pc_path = pc[mode].replace('seastar.pc', 'seastar-testing.pc')
local_libs += ' ' + pkg_config(pc_path, '--libs', '--static')
if has_thrift:
local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
# Our code's debugging information is huge, and multiplied
# by many tests yields ridiculous amounts of disk space.
# So we strip the tests by default; The user can very
# quickly re-link the test unstripped by adding a "_g"
# to the test name, e.g., "ninja build/release/testname_g"
link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
else:
f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep))
if has_thrift:
f.write(' libs = {} {} $seastar_libs_{} $libs\n'.format(thrift_libs, maybe_static(args.staticboost, '-lboost_system'), mode))
f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
f.write(f'build $builddir/{mode}/{binary}.debug: phony $builddir/{mode}/{binary}.stripped\n')
f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep))
f.write(' libs = {}\n'.format(local_libs))
f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
f.write(f'build $builddir/{mode}/{binary}.debug: phony $builddir/{mode}/{binary}.stripped\n')
for src in srcs:
if src.endswith('.cc'):
obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
@@ -1995,19 +1923,12 @@ with open(buildfile, 'w') as f:
thrifts.add(src)
elif src.endswith('.g'):
antlr3_grammars.add(src)
elif src.endswith('/src/lib.rs'):
hh = '$builddir/' + mode + '/gen/' + src.replace('/src/lib.rs', '.hh')
elif src.endswith('.rs'):
idx = src.rindex('/src/')
hh = '$builddir/' + mode + '/gen/' + src[:idx] + '.hh'
rust_headers[hh] = src
staticlib = src.replace('rust/', '$builddir/' + mode + '/rust/release/lib').replace('/src/lib.rs', '.a')
rust_libs[staticlib] = src
else:
raise Exception('No rule for ' + src)
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
f.write('build {}: link_build.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
f.write(' libs = $seastar_libs_{}\n'.format(mode))
f.write(
'build {mode}-objects: phony {objs}\n'.format(
@@ -2045,6 +1966,7 @@ with open(buildfile, 'w') as f:
gen_headers += list(serializers.keys())
gen_headers += list(ragels.keys())
gen_headers += list(rust_headers.keys())
gen_headers.append('$builddir/{}/gen/rust/cxx.h'.format(mode))
gen_headers_dep = ' '.join(gen_headers)
for obj in compiles:
@@ -2068,10 +1990,13 @@ with open(buildfile, 'w') as f:
for hh in rust_headers:
src = rust_headers[hh]
f.write('build {}: rust_header {}\n'.format(hh, src))
for lib in rust_libs:
src = rust_libs[lib]
package = src.replace('/src/lib.rs', '').replace('rust/','')
f.write('build {}: rust_lib.{} {}\n pkg = {}\n'.format(lib, mode, src, package))
cc = hh.replace('.hh', '.cc')
f.write('build {}: rust_source {}\n'.format(cc, src))
obj = cc.replace('.cc', '.o')
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, gen_headers_dep))
f.write('build {}: cxxbridge_header\n'.format('$builddir/{}/gen/rust/cxx.h'.format(mode)))
librust = '$builddir/{}/rust-{}/librust_combined'.format(mode, mode)
f.write('build {}.a: rust_lib.{} rust/Cargo.lock\n depfile={}.d\n'.format(librust, mode, librust))
for thrift in thrifts:
outs = ' '.join(thrift.generated('$builddir/{}/gen'.format(mode)))
f.write('build {}: thrift.{} {}\n'.format(outs, mode, thrift.source))
@@ -2087,7 +2012,8 @@ with open(buildfile, 'w') as f:
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
if cc.endswith('Parser.cpp'):
# Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
flags = '-O1'
flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
if has_sanitize_address_use_after_scope:
flags += ' -fno-sanitize-address-use-after-scope'
f.write(' obj_cxxflags = %s\n' % flags)
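The parser-flags hunk above now applies `-O1` only when the build mode itself is unoptimized (`-O0`, `-Og`, or `-Os`), since optimized modes already keep the generated parsers' stack usage in check. A standalone sketch of the new selection (the function name is ours):

```python
def antlr_parser_flags(mode_optimization_level):
    # Unoptimized ANTLR-generated parsers use huge amounts of stack and can
    # overflow it, so bump them to -O1; leave optimized modes untouched.
    return '-O1' if mode_optimization_level in ('0', 'g', 's') else ''
```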
@@ -2139,30 +2065,16 @@ with open(buildfile, 'w') as f:
f.write(f' mode = {mode}\n')
f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/dist/tar/{scylla_product}-unstripped-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f' mode = {mode}\n')
f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian dist-server-compat-{mode} dist-server-compat-arch-{mode}\n')
f.write(f'build dist-server-compat-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
f.write(f'build dist-server-compat-arch-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-{arch}-package.tar.gz\n')
f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian\n')
f.write(f'build dist-server-debuginfo-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-debuginfo-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz dist-jmx-rpm dist-jmx-deb dist-jmx-compat\n')
f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz dist-tools-rpm dist-tools-deb dist-tools-compat\n')
f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb dist-python3-compat dist-python3-compat-arch\n')
f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz dist-unified-compat-{mode} dist-unified-compat-arch-{mode}\n')
f.write(f'build dist-unified-compat-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz\n')
f.write(f'build dist-unified-compat-arch-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz\n')
f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz dist-jmx-rpm dist-jmx-deb\n')
f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz dist-tools-rpm dist-tools-deb\n')
f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb\n')
f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz: unified $builddir/{mode}/dist/tar/{scylla_product}-{scylla_version}-{scylla_release}.{arch}.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz | always\n')
f.write(f' mode = {mode}\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write('rule libdeflate.{mode}\n'.format(**locals()))
f.write(' command = make -C libdeflate BUILD_DIR=../$builddir/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../$builddir/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
f.write('build $builddir/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
f.write(' pool = submodule_pool\n')
for lib in abseil_libs:
f.write('build $builddir/{mode}/abseil/{lib}: ninja $builddir/{mode}/abseil/build.ninja\n'.format(**locals()))
f.write(' pool = submodule_pool\n')
f.write(' subdir = $builddir/{mode}/abseil\n'.format(**locals()))
f.write(' target = {lib}\n'.format(**locals()))
checkheaders_mode = 'dev' if 'dev' in modes else list(modes)[0]
f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(checkheaders_mode, hh) for hh in headers])))
@@ -2179,17 +2091,13 @@ with open(buildfile, 'w') as f:
f.write(textwrap.dedent(f'''\
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
build dist-unified-compat: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz' for mode in default_modes])}
build dist-unified-compat-arch: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar dist-unified-compat dist-unified-compat-arch
build dist-unified: phony dist-unified-tar
build dist-server-deb: phony {' '.join(['$builddir/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
build dist-server-rpm: phony {' '.join(['$builddir/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
build dist-server-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-server-debuginfo: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-debuginfo-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-server-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-server-compat-arch: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-{arch}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-server: phony dist-server-tar dist-server-debuginfo dist-server-compat dist-server-compat-arch dist-server-rpm dist-server-deb
build dist-server: phony dist-server-tar dist-server-debuginfo dist-server-rpm dist-server-deb
rule build-submodule-reloc
command = cd $reloc_dir && ./reloc/build_reloc.sh --version $$(<../../build/SCYLLA-PRODUCT-FILE)-$$(sed 's/-/~/' <../../build/SCYLLA-VERSION-FILE)-$$(<../../build/SCYLLA-RELEASE-FILE) --nodeps $args
@@ -2207,8 +2115,7 @@ with open(buildfile, 'w') as f:
dir = tools/jmx
artifact = $builddir/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz
build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz'.format(mode=mode, scylla_product=scylla_product, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-jmx-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-jmx: phony dist-jmx-tar dist-jmx-compat dist-jmx-rpm dist-jmx-deb
build dist-jmx: phony dist-jmx-tar dist-jmx-rpm dist-jmx-deb
build tools/java/build/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz: build-submodule-reloc | build/SCYLLA-PRODUCT-FILE build/SCYLLA-VERSION-FILE build/SCYLLA-RELEASE-FILE
reloc_dir = tools/java
@@ -2219,8 +2126,7 @@ with open(buildfile, 'w') as f:
dir = tools/java
artifact = $builddir/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz
build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz'.format(mode=mode, scylla_product=scylla_product, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-tools-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-tools: phony dist-tools-tar dist-tools-compat dist-tools-rpm dist-tools-deb
build dist-tools: phony dist-tools-tar dist-tools-rpm dist-tools-deb
build tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: build-submodule-reloc | build/SCYLLA-PRODUCT-FILE build/SCYLLA-VERSION-FILE build/SCYLLA-RELEASE-FILE
reloc_dir = tools/python3
@@ -2232,14 +2138,10 @@ with open(buildfile, 'w') as f:
dir = tools/python3
artifact = $builddir/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-python3-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-python3-compat-arch: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-{arch}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-python3: phony dist-python3-tar dist-python3-compat dist-python3-compat-arch dist-python3-rpm dist-python3-deb
build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb
build dist-deb: phony dist-server-deb dist-python3-deb dist-jmx-deb dist-tools-deb
build dist-rpm: phony dist-server-rpm dist-python3-rpm dist-jmx-rpm dist-tools-rpm
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-jmx-tar dist-tools-tar
build dist-compat: phony dist-unified-compat dist-server-compat dist-python3-compat
build dist-compat-arch: phony dist-unified-compat-arch dist-server-compat-arch dist-python3-compat-arch
build dist: phony dist-unified dist-server dist-python3 dist-jmx dist-tools
'''))
@@ -2284,7 +2186,7 @@ with open(buildfile, 'w') as f:
description = List configured modes
build mode_list: mode_list
default {modes_list}
''').format(modes_list=' '.join(default_modes), build_ninja_list=' '.join([f'build/{mode}/{dir}/build.ninja' for mode in build_modes for dir in ['seastar', 'abseil']]), **globals()))
''').format(modes_list=' '.join(default_modes), build_ninja_list=' '.join([f'build/{mode}/{dir}/build.ninja' for mode in build_modes for dir in ['seastar']]), **globals()))
unit_test_list = set(test for test in build_artifacts if test in set(tests))
f.write(textwrap.dedent('''\
rule unit_test_list
@@ -2313,7 +2215,7 @@ with open(buildfile, 'w') as f:
compdb = 'compile_commands.json'
# per-mode compdbs are built by taking the relevant entries from the
# output of "ninja -t compdb" and combining them with the CMake-made
# compdbs for Seastar and Abseil in the relevant mode.
# compdbs for Seastar in the relevant mode.
#
# "ninja -t compdb" output has to be filtered because
# - it contains rules for all selected modes, and several entries for
@@ -2328,7 +2230,7 @@ with tempfile.NamedTemporaryFile() as ninja_compdb:
# build mode-specific compdbs
for mode in selected_modes:
mode_out = outdir + '/' + mode
submodule_compdbs = [mode_out + '/' + submodule + '/' + compdb for submodule in ['abseil', 'seastar']]
submodule_compdbs = [mode_out + '/' + submodule + '/' + compdb for submodule in ['seastar']]
with open(mode_out + '/' + compdb, 'w+b') as combined_mode_specific_compdb:
subprocess.run(['./scripts/merge-compdb.py', 'build/' + mode,
ninja_compdb.name] + submodule_compdbs, stdout=combined_mode_specific_compdb)
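The per-mode compdb merging described in the comments above can be sketched outside the build system. This is a minimal illustration, assuming each input is a `compile_commands.json` document; the real `./scripts/merge-compdb.py` invoked here also rewrites paths, which this sketch skips:

```python
import json

def merge_compdbs(mode_prefix, compdb_texts):
    """Combine several compile_commands.json documents into one list.

    Entries that name an output file are kept only when that output lives
    under the given mode's build prefix (filtering out other modes from the
    "ninja -t compdb" output); entries without an "output" key, as in
    CMake-generated submodule compdbs, are kept unconditionally.
    """
    merged = []
    for text in compdb_texts:
        for entry in json.loads(text):
            if entry.get("output", mode_prefix).startswith(mode_prefix):
                merged.append(entry)
    return merged
```

In the code above, this corresponds to combining the filtered `ninja -t compdb` output with the Seastar compdb for each selected mode.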


@@ -51,6 +51,7 @@ options {
#include "cql3/statements/index_prop_defs.hh"
#include "cql3/statements/raw/use_statement.hh"
#include "cql3/statements/raw/batch_statement.hh"
#include "cql3/statements/raw/describe_statement.hh"
#include "cql3/statements/list_users_statement.hh"
#include "cql3/statements/grant_statement.hh"
#include "cql3/statements/revoke_statement.hh"
@@ -358,6 +359,7 @@ cqlStatement returns [std::unique_ptr<raw::parsed_statement> stmt]
| st46=listServiceLevelStatement { $stmt = std::move(st46); }
| st47=listServiceLevelAttachStatement { $stmt = std::move(st47); }
| st48=pruneMaterializedViewStatement { $stmt = std::move(st48); }
| st49=describeStatement { $stmt = std::move(st49); }
;
/*
@@ -461,8 +463,7 @@ orderByClause[raw::select_statement::parameters::orderings_type& orderings]
;
jsonValue returns [expression value]
:
| s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
: s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
| m=marker { $value = std::move(m); }
;
@@ -1368,6 +1369,59 @@ listServiceLevelAttachStatement returns [std::unique_ptr<list_service_level_atta
{ $stmt = std::make_unique<list_service_level_attachments_statement>(); }
;
/**
* (DESCRIBE | DESC) (
* CLUSTER
* [FULL] SCHEMA
* KEYSPACES
* [ONLY] KEYSPACE <name>?
* TABLES
* TABLE <name>
* TYPES
* TYPE <name>
* FUNCTIONS
* FUNCTION <name>
* AGGREGATES
* AGGREGATE <name>
* ) (WITH INTERNALS)?
*/
describeStatement returns [std::unique_ptr<cql3::statements::raw::describe_statement> stmt]
@init {
bool fullSchema = false;
bool pending = false;
bool config = false;
bool only = false;
std::optional<sstring> keyspace;
sstring generic_name = "";
}
: ( K_DESCRIBE | K_DESC )
( (K_CLUSTER) => K_CLUSTER { $stmt = cql3::statements::raw::describe_statement::cluster(); }
| (K_FULL { fullSchema=true; })? K_SCHEMA { $stmt = cql3::statements::raw::describe_statement::schema(fullSchema); }
| (K_KEYSPACES) => K_KEYSPACES { $stmt = cql3::statements::raw::describe_statement::keyspaces(); }
| (K_ONLY { only=true; })? K_KEYSPACE ( ks=keyspaceName { keyspace = ks; })?
{ $stmt = cql3::statements::raw::describe_statement::keyspace(keyspace, only); }
| (K_TABLES) => K_TABLES { $stmt = cql3::statements::raw::describe_statement::tables(); }
| K_COLUMNFAMILY cf=columnFamilyName { $stmt = cql3::statements::raw::describe_statement::table(cf); }
| K_INDEX idx=columnFamilyName { $stmt = cql3::statements::raw::describe_statement::index(idx); }
| K_MATERIALIZED K_VIEW view=columnFamilyName { $stmt = cql3::statements::raw::describe_statement::view(view); }
| (K_TYPES) => K_TYPES { $stmt = cql3::statements::raw::describe_statement::types(); }
| K_TYPE tn=userTypeName { $stmt = cql3::statements::raw::describe_statement::type(tn); }
| (K_FUNCTIONS) => K_FUNCTIONS { $stmt = cql3::statements::raw::describe_statement::functions(); }
| K_FUNCTION fn=functionName { $stmt = cql3::statements::raw::describe_statement::function(fn); }
| (K_AGGREGATES) => K_AGGREGATES { $stmt = cql3::statements::raw::describe_statement::aggregates(); }
| K_AGGREGATE ag=functionName { $stmt = cql3::statements::raw::describe_statement::aggregate(ag); }
| ( ( ksT=IDENT { keyspace = sstring{$ksT.text}; }
| ksT=QUOTED_NAME { keyspace = sstring{$ksT.text}; }
| ksK=unreserved_keyword { keyspace = ksK; } )
'.' )?
( tT=IDENT { generic_name = sstring{$tT.text}; }
| tT=QUOTED_NAME { generic_name = sstring{$tT.text}; }
| tK=unreserved_keyword { generic_name = tK; } )
{ $stmt = cql3::statements::raw::describe_statement::generic(keyspace, generic_name); }
)
( K_WITH K_INTERNALS { $stmt->with_internals_details(); } )?
;
/** DEFINITIONS **/
// Column Identifiers. These need to be treated differently from other
@@ -1419,7 +1473,7 @@ serviceLevelOrRoleName returns [sstring name]
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
| t=STRING_LITERAL { $name = sstring($t.text); }
| t=QUOTED_NAME { $name = sstring($t.text); }
| k=unreserved_keyword { $name = sstring($t.text);
| k=unreserved_keyword { $name = k;
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
;
@@ -1513,7 +1567,7 @@ value returns [expression value]
| l=collectionLiteral { $value = std::move(l); }
| u=usertypeLiteral { $value = std::move(u); }
| t=tupleLiteral { $value = std::move(t); }
| K_NULL { $value = null(); }
| K_NULL { $value = make_untyped_null(); }
| e=marker { $value = std::move(e); }
;
@@ -1523,8 +1577,7 @@ marker returns [expression value]
;
intValue returns [expression value]
:
| t=INTEGER { $value = untyped_constant{untyped_constant::integer, $t.text}; }
: t=INTEGER { $value = untyped_constant{untyped_constant::integer, $t.text}; }
| e=marker { $value = std::move(e); }
;
@@ -1678,7 +1731,7 @@ relation returns [expression e]
| K_TOKEN l=tupleOfIdentifiers type=relationType t=term
{ $e = binary_operator(token{std::move(l.elements)}, type, std::move(t)); }
| name=cident K_IS K_NOT K_NULL {
$e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IS_NOT, null()); }
$e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IS_NOT, make_untyped_null()); }
| name=cident K_IN marker1=marker
{ $e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IN, std::move(marker1)); }
| name=cident K_IN in_values=singleColumnInValues
@@ -1897,10 +1950,13 @@ unreserved_function_keyword returns [sstring str]
basic_unreserved_keyword returns [sstring str]
: k=( K_KEYS
| K_AS
| K_CLUSTER
| K_CLUSTERING
| K_COMPACT
| K_STORAGE
| K_TABLES
| K_TYPE
| K_TYPES
| K_VALUES
| K_MAP
| K_LIST
@@ -1924,11 +1980,14 @@ basic_unreserved_keyword returns [sstring str]
| K_TRIGGER
| K_DISTINCT
| K_CONTAINS
| K_INTERNALS
| K_STATIC
| K_FROZEN
| K_TUPLE
| K_FUNCTION
| K_FUNCTIONS
| K_AGGREGATE
| K_AGGREGATES
| K_SFUNC
| K_STYPE
| K_REDUCEFUNC
@@ -1956,6 +2015,9 @@ basic_unreserved_keyword returns [sstring str]
| K_LEVEL
| K_LEVELS
| K_PRUNE
| K_ONLY
| K_DESCRIBE
| K_DESC
) { $str = $k.text; }
;
@@ -2013,11 +2075,14 @@ K_TRUNCATE: T R U N C A T E;
K_DELETE: D E L E T E;
K_IN: I N;
K_CREATE: C R E A T E;
K_SCHEMA: S C H E M A;
K_KEYSPACE: ( K E Y S P A C E
| S C H E M A );
| K_SCHEMA );
K_KEYSPACES: K E Y S P A C E S;
K_COLUMNFAMILY:( C O L U M N F A M I L Y
| T A B L E );
K_TABLES: ( C O L U M N F A M I L I E S
| T A B L E S );
K_MATERIALIZED:M A T E R I A L I Z E D;
K_VIEW: V I E W;
K_INDEX: I N D E X;
@@ -2034,6 +2099,7 @@ K_ALTER: A L T E R;
K_RENAME: R E N A M E;
K_ADD: A D D;
K_TYPE: T Y P E;
K_TYPES: T Y P E S;
K_COMPACT: C O M P A C T;
K_STORAGE: S T O R A G E;
K_ORDER: O R D E R;
@@ -2045,6 +2111,8 @@ K_FILTERING: F I L T E R I N G;
K_IF: I F;
K_IS: I S;
K_CONTAINS: C O N T A I N S;
K_INTERNALS: I N T E R N A L S;
K_ONLY: O N L Y;
K_GRANT: G R A N T;
K_ALL: A L L;
@@ -2068,6 +2136,7 @@ K_LOGIN: L O G I N;
K_NOLOGIN: N O L O G I N;
K_OPTIONS: O P T I O N S;
K_CLUSTER: C L U S T E R;
K_CLUSTERING: C L U S T E R I N G;
K_ASCII: A S C I I;
K_BIGINT: B I G I N T;
@@ -2107,7 +2176,9 @@ K_STATIC: S T A T I C;
K_FROZEN: F R O Z E N;
K_FUNCTION: F U N C T I O N;
K_FUNCTIONS: F U N C T I O N S;
K_AGGREGATE: A G G R E G A T E;
K_AGGREGATES: A G G R E G A T E S;
K_SFUNC: S F U N C;
K_STYPE: S T Y P E;
K_REDUCEFUNC: R E D U C E F U N C;


@@ -10,6 +10,7 @@
#include "cql3/attributes.hh"
#include "cql3/column_identifier.hh"
#include <optional>
namespace cql3 {
@@ -20,7 +21,9 @@ std::unique_ptr<attributes> attributes::none() {
attributes::attributes(std::optional<cql3::expr::expression>&& timestamp,
std::optional<cql3::expr::expression>&& time_to_live,
std::optional<cql3::expr::expression>&& timeout)
: _timestamp{std::move(timestamp)}
: _timestamp_unset_guard(timestamp)
, _timestamp{std::move(timestamp)}
, _time_to_live_unset_guard(time_to_live)
, _time_to_live{std::move(time_to_live)}
, _timeout{std::move(timeout)}
{ }
@@ -38,7 +41,7 @@ bool attributes::is_timeout_set() const {
}
int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
if (!_timestamp.has_value()) {
if (!_timestamp.has_value() || _timestamp_unset_guard.is_unset(options)) {
return now;
}
@@ -46,31 +49,25 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
if (tval.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value of timestamp");
}
if (tval.is_unset_value()) {
return now;
}
try {
return tval.view().validate_and_deserialize<int64_t>(*long_type, cql_serialization_format::internal());
return tval.view().validate_and_deserialize<int64_t>(*long_type);
} catch (marshal_exception& e) {
throw exceptions::invalid_request_exception("Invalid timestamp value");
}
}
int32_t attributes::get_time_to_live(const query_options& options) {
if (!_time_to_live.has_value())
return 0;
std::optional<int32_t> attributes::get_time_to_live(const query_options& options) {
if (!_time_to_live.has_value() || _time_to_live_unset_guard.is_unset(options))
return std::nullopt;
cql3::raw_value tval = expr::evaluate(*_time_to_live, options);
if (tval.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value of TTL");
}
if (tval.is_unset_value()) {
return 0;
}
int32_t ttl;
try {
ttl = tval.view().validate_and_deserialize<int32_t>(*int32_type, cql_serialization_format::internal());
ttl = tval.view().validate_and_deserialize<int32_t>(*int32_type);
}
catch (marshal_exception& e) {
throw exceptions::invalid_request_exception("Invalid TTL value");
@@ -91,8 +88,8 @@ int32_t attributes::get_time_to_live(const query_options& options) {
db::timeout_clock::duration attributes::get_timeout(const query_options& options) const {
cql3::raw_value timeout = expr::evaluate(*_timeout, options);
if (timeout.is_null() || timeout.is_unset_value()) {
throw exceptions::invalid_request_exception("Timeout value cannot be unset/null");
if (timeout.is_null()) {
throw exceptions::invalid_request_exception("Timeout value cannot be null");
}
cql_duration duration = timeout.view().deserialize<cql_duration>(*duration_type);
if (duration.months || duration.days) {


@@ -11,6 +11,7 @@
#pragma once
#include "cql3/expr/expression.hh"
#include "cql3/expr/unset.hh"
#include "db/timeout_clock.hh"
namespace cql3 {
@@ -24,7 +25,9 @@ class prepare_context;
*/
class attributes final {
private:
expr::unset_bind_variable_guard _timestamp_unset_guard;
std::optional<cql3::expr::expression> _timestamp;
expr::unset_bind_variable_guard _time_to_live_unset_guard;
std::optional<cql3::expr::expression> _time_to_live;
std::optional<cql3::expr::expression> _timeout;
public:
@@ -42,7 +45,7 @@ public:
int64_t get_timestamp(int64_t now, const query_options& options);
int32_t get_time_to_live(const query_options& options);
std::optional<int32_t> get_time_to_live(const query_options& options);
db::timeout_clock::duration get_timeout(const query_options& options) const;


@@ -139,10 +139,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
cql3::raw_value key_constant = expr::evaluate(*_collection_element, options);
cql3::raw_value_view key = key_constant.view();
if (key.is_unset_value()) {
throw exceptions::invalid_request_exception(
format("Invalid 'unset' value in {} element access", cell_type.cql3_type_name()));
}
if (key.is_null()) {
throw exceptions::invalid_request_exception(
format("Invalid null value for {} element access", cell_type.cql3_type_name()));
@@ -196,9 +192,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
// <, >, >=, <=, !=
cql3::raw_value param = expr::evaluate(*_value, options);
if (param.is_unset_value()) {
throw exceptions::invalid_request_exception("Invalid 'unset' value in condition");
}
if (param.is_null()) {
if (_op == expr::oper_t::EQ) {
return cell_value == nullptr;
@@ -224,9 +217,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
return (*_matcher)(bytes_view(cell_value->serialize_nonnull()));
} else {
auto param = expr::evaluate(*_value, options); // LIKE pattern
if (param.is_unset_value()) {
throw exceptions::invalid_request_exception("Invalid 'unset' value in LIKE pattern");
}
if (param.is_null()) {
throw exceptions::invalid_request_exception("Invalid NULL value in LIKE pattern");
}
@@ -309,7 +299,7 @@ column_condition::raw::prepare(data_dictionary::database db, const sstring& keys
if (_op == expr::oper_t::LIKE) {
auto literal_term = expr::as_if<expr::untyped_constant>(&*_value);
if (literal_term) {
if (literal_term && literal_term->partial_type != expr::untyped_constant::type_class::null) {
// Pass matcher object
const sstring& pattern = literal_term->raw_text;
return column_condition::condition(receiver, std::move(collection_element_expression),


@@ -33,9 +33,9 @@ public:
private static final Logger logger = LoggerFactory.getLogger(Constants.class);
#endif
public:
class setter : public operation {
class setter : public operation_skip_if_unset {
public:
using operation::operation;
using operation_skip_if_unset::operation_skip_if_unset;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = expr::evaluate(*_e, params._options);
@@ -53,30 +53,26 @@ public:
virtual void prepare_for_broadcast_tables(statements::broadcast_tables::prepared_update& query) const override;
};
struct adder final : operation {
using operation::operation;
struct adder final : operation_skip_if_unset {
using operation_skip_if_unset::operation_skip_if_unset;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = expr::evaluate(*_e, params._options);
if (value.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment");
} else if (value.is_unset_value()) {
return;
}
auto increment = value.view().deserialize<int64_t>(*long_type);
m.set_cell(prefix, column, params.make_counter_update_cell(increment));
}
};
struct subtracter final : operation {
using operation::operation;
struct subtracter final : operation_skip_if_unset {
using operation_skip_if_unset::operation_skip_if_unset;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = expr::evaluate(*_e, params._options);
if (value.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment");
} else if (value.is_unset_value()) {
return;
}
auto increment = value.view().deserialize<int64_t>(*long_type);
if (increment == std::numeric_limits<int64_t>::min()) {
@@ -86,10 +82,10 @@ public:
}
};
class deleter : public operation {
class deleter : public operation_no_unset_support {
public:
deleter(const column_definition& column)
: operation(column, std::nullopt)
: operation_no_unset_support(column, std::nullopt)
{ }
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;


@@ -473,27 +473,40 @@ sstring maybe_quote(const sstring& identifier) {
return result;
}
sstring quote(const sstring& identifier) {
template <char C>
static sstring quote_with(const sstring& str) {
static const std::string quote_str{C};
// quote empty string
if (identifier.empty()) {
return "\"\"";
if (str.empty()) {
return make_sstring(quote_str, quote_str);
}
size_t num_quotes = 0;
for (char c : identifier) {
num_quotes += (c == '"');
for (char c : str) {
num_quotes += (c == C);
}
if (num_quotes == 0) {
return make_sstring("\"", identifier, "\"");
return make_sstring(quote_str, str, quote_str);
}
static const std::regex double_quote_re("\"");
static const std::string double_quote_str{C, C};
static const std::regex quote_re(std::string{C});
std::string result;
result.reserve(2 + identifier.size() + num_quotes);
result.push_back('"');
std::regex_replace(std::back_inserter(result), identifier.begin(), identifier.end(), double_quote_re, "\"\"");
result.push_back('"');
result.reserve(2 + str.size() + num_quotes);
result.push_back(C);
std::regex_replace(std::back_inserter(result), str.begin(), str.end(), quote_re, double_quote_str);
result.push_back(C);
return result;
}
sstring quote(const sstring& identifier) {
return quote_with<'"'>(identifier);
}
sstring single_quote(const sstring& str) {
return quote_with<'\''>(str);
}
}
}
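The escaping rule the `quote_with<C>` template factors out is simply "wrap in C and double every embedded C". A minimal sketch of the same rule (the C++ version additionally pre-counts quote characters to size the output buffer, which this sketch omits):

```python
def quote_with(c, s):
    """Wrap s in the quote character c, doubling embedded occurrences of c.

    With c='"' this matches CQL identifier quoting (quote()); with c="'"
    it matches string-literal quoting (single_quote()).
    """
    return c + s.replace(c, c + c) + c
```

For example, `quote_with('"', 'weird"name')` produces `"weird""name"`.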

File diff suppressed because it is too large


@@ -76,7 +76,6 @@ struct column_mutation_attribute;
struct function_call;
struct cast;
struct field_selection;
struct null;
struct bind_variable;
struct untyped_constant;
struct constant;
@@ -96,7 +95,6 @@ concept ExpressionElement
|| std::same_as<T, function_call>
|| std::same_as<T, cast>
|| std::same_as<T, field_selection>
|| std::same_as<T, null>
|| std::same_as<T, bind_variable>
|| std::same_as<T, untyped_constant>
|| std::same_as<T, constant>
@@ -117,7 +115,6 @@ concept invocable_on_expression
&& std::invocable<Func, function_call>
&& std::invocable<Func, cast>
&& std::invocable<Func, field_selection>
&& std::invocable<Func, null>
&& std::invocable<Func, bind_variable>
&& std::invocable<Func, untyped_constant>
&& std::invocable<Func, constant>
@@ -138,7 +135,6 @@ concept invocable_on_expression_ref
&& std::invocable<Func, function_call&>
&& std::invocable<Func, cast&>
&& std::invocable<Func, field_selection&>
&& std::invocable<Func, null&>
&& std::invocable<Func, bind_variable&>
&& std::invocable<Func, untyped_constant&>
&& std::invocable<Func, constant&>
@@ -147,7 +143,7 @@ concept invocable_on_expression_ref
&& std::invocable<Func, usertype_constructor&>
;
/// A CQL expression -- union of all possible expression types. bool means a Boolean constant.
/// A CQL expression -- union of all possible expression types.
class expression final {
// 'impl' holds a variant of all expression types, but since
// variants of incomplete types are not allowed, we forward declare it
@@ -198,9 +194,7 @@ bool operator==(const expression& e1, const expression& e2);
// An expression that doesn't contain subexpressions
template <typename E>
concept LeafExpression
= std::same_as<bool, E>
|| std::same_as<unresolved_identifier, E>
|| std::same_as<null, E>
= std::same_as<unresolved_identifier, E>
|| std::same_as<bind_variable, E>
|| std::same_as<untyped_constant, E>
|| std::same_as<constant, E>
@@ -346,12 +340,6 @@ struct field_selection {
friend bool operator==(const field_selection&, const field_selection&) = default;
};
struct null {
data_type type; // may be null before prepare
friend bool operator==(const null&, const null&) = default;
};
struct bind_variable {
int32_t bind_index;
@@ -365,17 +353,18 @@ struct bind_variable {
// A constant which does not yet have a data type. It is partially typed
// (we know if it's floating or int) but not sized.
struct untyped_constant {
enum type_class { integer, floating_point, string, boolean, duration, uuid, hex };
enum type_class { integer, floating_point, string, boolean, duration, uuid, hex, null };
type_class partial_type;
sstring raw_text;
friend bool operator==(const untyped_constant&, const untyped_constant&) = default;
};
untyped_constant make_untyped_null();
// Represents a constant value with known value and type
// For null and unset the type can sometimes be set to empty_type
struct constant {
// A value serialized using the internal (latest) cql_serialization_format
cql3::raw_value value;
// Never nullptr, for NULL and UNSET might be empty_type
@@ -383,7 +372,6 @@ struct constant {
constant(cql3::raw_value value, data_type type);
static constant make_null(data_type val_type = empty_type);
static constant make_unset_value(data_type val_type = empty_type);
static constant make_bool(bool bool_val);
bool is_null() const;
@@ -436,7 +424,7 @@ struct usertype_constructor {
struct expression::impl final {
using variant_type = std::variant<
conjunction, binary_operator, column_value, token, unresolved_identifier,
column_mutation_attribute, function_call, cast, field_selection, null,
column_mutation_attribute, function_call, cast, field_selection,
bind_variable, untyped_constant, constant, tuple_constructor, collection_constructor,
usertype_constructor, subscript>;
variant_type v;


@@ -78,7 +78,7 @@ static
void
usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
if (!receiver.type->is_user_type()) {
throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
}
auto ut = static_pointer_cast<const user_type_impl>(receiver.type);
@@ -90,7 +90,7 @@ usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_
const expression& value = u.elements.at(field);
auto&& field_spec = usertype_field_spec_of(receiver, i);
if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *field_spec))) {
throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
}
}
}
@@ -123,7 +123,7 @@ usertype_constructor_prepare_expression(const usertype_constructor& u, data_dict
auto iraw = u.elements.find(field);
expression raw;
if (iraw == u.elements.end()) {
raw = expr::null();
raw = expr::make_untyped_null();
} else {
raw = iraw->second;
++found_values;
@@ -246,6 +246,21 @@ map_prepare_expression(const collection_constructor& c, data_dictionary::databas
auto key_spec = maps::key_spec_of(*receiver);
auto value_spec = maps::value_spec_of(*receiver);
const map_type_impl* map_type = dynamic_cast<const map_type_impl*>(&receiver->type->without_reversed());
if (map_type == nullptr) {
on_internal_error(expr_logger,
format("map_prepare_expression bad non-map receiver type: {}", receiver->type->name()));
}
data_type map_element_tuple_type = tuple_type_impl::get_instance({map_type->get_keys_type(), map_type->get_values_type()});
// In Cassandra, an empty (unfrozen) map/set/list is equivalent to the column being null. In
// other words a non-frozen collection only exists if it has elements. Return nullptr right
// away to simplify predicate evaluation. See also
// https://issues.apache.org/jira/browse/CASSANDRA-5141
if (map_type->is_multi_cell() && c.elements.empty()) {
return constant::make_null(receiver->type);
}
std::vector<expression> values;
values.reserve(c.elements.size());
bool all_terminal = true;
@@ -264,7 +279,7 @@ map_prepare_expression(const collection_constructor& c, data_dictionary::databas
values.emplace_back(tuple_constructor {
.elements = {std::move(k), std::move(v)},
.type = entry_tuple.type
.type = map_element_tuple_type
});
}
@@ -298,7 +313,7 @@ set_validate_assignable_to(const collection_constructor& c, data_dictionary::dat
return;
}
throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
}
auto&& value_spec = set_value_spec_of(receiver);
@@ -486,18 +501,18 @@ void
tuple_constructor_validate_assignable_to(const tuple_constructor& tc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver.type->underlying_type());
if (!tt) {
throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
}
for (size_t i = 0; i < tc.elements.size(); ++i) {
if (i >= tt->size()) {
throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: too many elements. Type {} expects {:d} but got {:d}",
receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
*receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
}
auto&& value = tc.elements[i];
auto&& spec = component_spec_of(receiver, i);
if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *spec))) {
throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));
}
}
}
@@ -567,6 +582,7 @@ operator<<(std::ostream&out, untyped_constant::type_class t)
case untyped_constant::type_class::boolean: return out << "BOOLEAN";
case untyped_constant::type_class::hex: return out << "HEX";
case untyped_constant::type_class::duration: return out << "DURATION";
case untyped_constant::type_class::null: return out << "NULL";
}
abort();
}
@@ -594,8 +610,9 @@ static
assignment_testable::test_result
untyped_constant_test_assignment(const untyped_constant& uc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver)
{
bool uc_is_null = uc.partial_type == untyped_constant::type_class::null;
auto receiver_type = receiver.type->as_cql3_type();
if (receiver_type.is_collection() || receiver_type.is_user_type()) {
if ((receiver_type.is_collection() || receiver_type.is_user_type()) && !uc_is_null) {
return assignment_testable::test_result::NOT_ASSIGNABLE;
}
if (!receiver_type.is_native()) {
@@ -660,6 +677,10 @@ untyped_constant_test_assignment(const untyped_constant& uc, data_dictionary::da
return assignment_testable::test_result::EXACT_MATCH;
}
break;
case untyped_constant::type_class::null:
return receiver.type->is_counter()
? assignment_testable::test_result::NOT_ASSIGNABLE
: assignment_testable::test_result::WEAKLY_ASSIGNABLE;
}
return assignment_testable::test_result::NOT_ASSIGNABLE;
}
@@ -673,9 +694,18 @@ untyped_constant_prepare_expression(const untyped_constant& uc, data_dictionary:
return std::nullopt;
}
if (!is_assignable(untyped_constant_test_assignment(uc, db, keyspace, *receiver))) {
if (uc.partial_type != untyped_constant::type_class::null) {
throw exceptions::invalid_request_exception(format("Invalid {} constant ({}) for \"{}\" of type {}",
uc.partial_type, uc.raw_text, *receiver->name, receiver->type->as_cql3_type().to_string()));
} else {
throw exceptions::invalid_request_exception("Invalid null value for counter increment/decrement");
}
}
if (uc.partial_type == untyped_constant::type_class::null) {
return constant::make_null(receiver->type);
}
raw_value raw_val = cql3::raw_value::make_value(untyped_constant_parsed_value(uc, receiver->type));
return constant(std::move(raw_val), receiver->type);
}
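With null folded into `untyped_constant`, preparing a null literal reduces to the checks above: reject NULL for counter receivers, otherwise materialize a typed null constant. A compact model of that decision (the dict is illustrative, not the real `constant` C++ type):

```python
def prepare_untyped_null(receiver_type, receiver_is_counter):
    # Mirrors the null branch of untyped_constant_prepare_expression():
    # counters cannot be set to NULL; any other receiver gets a null
    # constant carrying the receiver's type.
    if receiver_is_counter:
        raise ValueError("Invalid null value for counter increment/decrement")
    return {"kind": "constant", "value": None, "type": receiver_type}
```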
@@ -687,38 +717,19 @@ bind_variable_test_assignment(const bind_variable& bv, data_dictionary::database
}
static
bind_variable
std::optional<bind_variable>
bind_variable_prepare_expression(const bind_variable& bv, data_dictionary::database db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver)
{
if (!receiver) {
return std::nullopt;
}
return bind_variable {
.bind_index = bv.bind_index,
.receiver = receiver
};
}
static
assignment_testable::test_result
null_test_assignment(data_dictionary::database db,
const sstring& keyspace,
const column_specification& receiver) {
return receiver.type->is_counter()
? assignment_testable::test_result::NOT_ASSIGNABLE
: assignment_testable::test_result::WEAKLY_ASSIGNABLE;
}
static
std::optional<expression>
null_prepare_expression(data_dictionary::database db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) {
if (!receiver) {
// TODO: It is not possible to infer the type of NULL, but perhaps we can have a matching null_type that can be cast to anything
return std::nullopt;
}
if (!is_assignable(null_test_assignment(db, keyspace, *receiver))) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment/decrement");
}
return constant::make_null(receiver->type);
}
static
sstring
cast_display_name(const cast& c) {
@@ -864,6 +875,53 @@ test_assignment_function_call(const cql3::expr::function_call& fc, data_dictiona
}
}
std::optional<expression> prepare_conjunction(const conjunction& conj,
data_dictionary::database db,
const sstring& keyspace,
const schema* schema_opt,
lw_shared_ptr<column_specification> receiver) {
if (receiver.get() != nullptr && receiver->type->without_reversed().get_kind() != abstract_type::kind::boolean) {
throw exceptions::invalid_request_exception(
format("AND conjunction produces a boolean value, which doesn't match the type: {} of {}",
receiver->type->name(), receiver->name->text()));
}
lw_shared_ptr<column_specification> child_receiver;
if (receiver.get() != nullptr) {
::shared_ptr<column_identifier> child_receiver_name =
::make_shared<column_identifier>(format("AND_element({})", receiver->name->text()), true);
child_receiver = make_lw_shared<column_specification>(receiver->ks_name, receiver->cf_name,
std::move(child_receiver_name), boolean_type);
} else {
::shared_ptr<column_identifier> child_receiver_name =
::make_shared<column_identifier>("AND_element(unknown)", true);
sstring cf_name = schema_opt ? schema_opt->cf_name() : "unknown_cf";
child_receiver = make_lw_shared<column_specification>(keyspace, std::move(cf_name),
std::move(child_receiver_name), boolean_type);
}
std::vector<expression> prepared_children;
bool all_terminal = true;
for (const expression& child : conj.children) {
std::optional<expression> prepared_child =
try_prepare_expression(child, db, keyspace, schema_opt, child_receiver);
if (!prepared_child.has_value()) {
throw exceptions::invalid_request_exception(fmt::format("Could not infer type of {}", child));
}
if (!is<constant>(*prepared_child)) {
all_terminal = false;
}
prepared_children.push_back(std::move(*prepared_child));
}
conjunction result = conjunction{std::move(prepared_children)};
if (all_terminal) {
return constant(evaluate(result, evaluation_inputs{}), boolean_type);
}
return result;
}
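The all_terminal fast path above folds a conjunction of constants at prepare time instead of deferring to execution. A minimal standalone sketch of that folding idea (plain bools in place of Scylla's constant/raw_value machinery; names here are illustrative, not Scylla's):

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Fold an AND of child values when every child is already a known constant.
// Returns std::nullopt when any child is still unknown (non-terminal),
// mirroring the all_terminal check in prepare_conjunction.
std::optional<bool> fold_and(const std::vector<std::optional<bool>>& children) {
    bool result = true;
    for (const auto& c : children) {
        if (!c.has_value()) {
            return std::nullopt; // a non-constant child: cannot fold at prepare time
        }
        result = result && *c;
    }
    return result;
}
```

As in the diff, folding only happens when every child prepared down to a constant; a single non-terminal child keeps the whole conjunction unfolded.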
std::optional<expression>
try_prepare_expression(const expression& expr, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
return expr::visit(overloaded_functor{
@@ -873,8 +931,8 @@ try_prepare_expression(const expression& expr, data_dictionary::database db, con
[&] (const binary_operator&) -> std::optional<expression> {
on_internal_error(expr_logger, "binary_operators are not yet reachable via prepare_expression()");
},
[&] (const conjunction&) -> std::optional<expression> {
on_internal_error(expr_logger, "conjunctions are not yet reachable via prepare_expression()");
[&] (const conjunction& conj) -> std::optional<expression> {
return prepare_conjunction(conj, db, keyspace, schema_opt, receiver);
},
[] (const column_value& cv) -> std::optional<expression> {
return cv;
@@ -945,9 +1003,6 @@ try_prepare_expression(const expression& expr, data_dictionary::database db, con
[&] (const field_selection&) -> std::optional<expression> {
on_internal_error(expr_logger, "field_selections are not yet reachable via prepare_expression()");
},
[&] (const null&) -> std::optional<expression> {
return null_prepare_expression(db, keyspace, receiver);
},
[&] (const bind_variable& bv) -> std::optional<expression> {
return bind_variable_prepare_expression(bv, db, keyspace, receiver);
},
@@ -1009,9 +1064,6 @@ test_assignment(const expression& expr, data_dictionary::database db, const sstr
[&] (const field_selection&) -> test_result {
on_internal_error(expr_logger, "field_selections are not yet reachable via test_assignment()");
},
[&] (const null&) -> test_result {
return null_test_assignment(db, keyspace, receiver);
},
[&] (const bind_variable& bv) -> test_result {
return bind_variable_test_assignment(bv, db, keyspace, receiver);
},
@@ -1138,7 +1190,7 @@ static lw_shared_ptr<column_specification> get_lhs_receiver(const expression& pr
// Given type of LHS and the operation finds the expected type of RHS.
// The type will be the same as LHS for simple operations like =, but it will be different for more complex ones like IN or CONTAINS.
static lw_shared_ptr<column_specification> get_rhs_receiver(lw_shared_ptr<column_specification>& lhs_receiver, oper_t oper) {
const data_type& lhs_type = lhs_receiver->type->underlying_type();
const data_type lhs_type = lhs_receiver->type->underlying_type();
if (oper == oper_t::IN) {
data_type rhs_receiver_type = list_type_impl::get_instance(std::move(lhs_type), false);
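The IN branch above hands the RHS a list-of-LHS receiver type, since `x IN (1, 2, 3)` binds a list literal rather than a single value. A toy illustration of that shape rule (type names as strings, not Scylla's data_type):

```cpp
#include <cassert>
#include <string>

// For '=' the RHS receiver keeps the LHS type; for IN it becomes a
// list of the LHS type, matching get_rhs_receiver's use of
// list_type_impl::get_instance on the LHS underlying type.
enum class oper { eq, in };

std::string rhs_receiver_type(const std::string& lhs_type, oper op) {
    if (op == oper::in) {
        return "list<" + lhs_type + ">";
    }
    return lhs_type;
}
```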


@@ -144,7 +144,7 @@ void preliminary_binop_vaidation_checks(const binary_operator& binop) {
}
if (binop.op == oper_t::IS_NOT) {
bool rhs_is_null = is<null>(binop.rhs)
bool rhs_is_null = (is<untyped_constant>(binop.rhs) && as<untyped_constant>(binop.rhs).partial_type == untyped_constant::type_class::null)
|| (is<constant>(binop.rhs) && as<constant>(binop.rhs).is_null());
if (!rhs_is_null) {
throw exceptions::invalid_request_exception(format("Unsupported \"IS NOT\" relation: {}", pretty_binop_printer));

cql3/expr/unset.hh (new file, 30 lines)

@@ -0,0 +1,30 @@
// Copyright (C) 2023-present ScyllaDB
// SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
#pragma once
#include <optional>
#include "expression.hh"
namespace cql3 {
class query_options;
}
namespace cql3::expr {
// Some expression users can behave differently if the expression is a bind variable
// and if that bind variable is unset. unset_bind_variable_guard encapsulates the two
// conditions.
class unset_bind_variable_guard {
// Disengaged if the operand is not exactly a single bind variable.
std::optional<bind_variable> _var;
public:
explicit unset_bind_variable_guard(const expr::expression& operand);
explicit unset_bind_variable_guard(std::nullopt_t) {}
explicit unset_bind_variable_guard(const std::optional<expr::expression>& operand);
bool is_unset(const query_options& qo) const;
};
}
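A standalone analogue of the guard above, sketching the two encapsulated conditions (a plain optional index and a vector of flags stand in for the bind variable and query_options; all names here are hypothetical):

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Toy stand-in for unset_bind_variable_guard: engaged only when the
// operand was exactly a single bind variable (here: its index), and
// reporting "unset" by consulting per-query state that plays the
// role of query_options.
class toy_unset_guard {
    std::optional<int> _bind_index; // disengaged: operand was not a lone bind variable
public:
    explicit toy_unset_guard(std::optional<int> bind_index) : _bind_index(bind_index) {}

    bool is_unset(const std::vector<bool>& unset_flags) const {
        return _bind_index.has_value() && unset_flags.at(*_bind_index);
    }
};
```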


@@ -12,7 +12,7 @@
#include "types.hh"
#include "types/tuple.hh"
#include "cql3/functions/scalar_function.hh"
#include "cql_serialization_format.hh"
#include "cql3/util.hh"
#include "utils/big_decimal.hh"
#include "aggregate_fcts.hh"
#include "user_aggregate.hh"
@@ -40,10 +40,10 @@ public:
virtual void reset() override {
_count = 0;
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
return long_type->decompose(_count);
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
++_count;
}
virtual void set_accumulator(const opt_bytes& acc) override {
@@ -56,7 +56,7 @@ public:
virtual opt_bytes get_accumulator() const override {
return long_type->decompose(_count);
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc) {
auto other = value_cast<int64_t>(long_type->deserialize(bytes_view(*acc)));
_count += other;
@@ -189,13 +189,13 @@ public:
virtual void reset() override {
_acc = _initcond;
}
virtual opt_bytes compute(cql_serialization_format sf) override {
return _finalfunc ? _finalfunc->execute(sf, std::vector<bytes_opt>{_acc}) : _acc;
virtual opt_bytes compute() override {
return _finalfunc ? _finalfunc->execute(std::vector<bytes_opt>{_acc}) : _acc;
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
std::vector<bytes_opt> args{_acc};
args.insert(args.end(), values.begin(), values.end());
_acc = _sfunc->execute(sf, args);
_acc = _sfunc->execute(args);
}
virtual void set_accumulator(const opt_bytes& acc) override {
_acc = acc;
@@ -203,9 +203,9 @@ public:
virtual opt_bytes get_accumulator() const override {
return _acc;
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
std::vector<bytes_opt> args{_acc, acc};
_acc = _rfunc->execute(sf, args);
_acc = _rfunc->execute(args);
}
};
@@ -218,10 +218,10 @@ public:
virtual void reset() override {
_sum = {};
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
return data_type_for<Type>()->decompose(accumulator_for<Type>::narrow(_sum));
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -237,7 +237,7 @@ public:
virtual opt_bytes get_accumulator() const override {
return accumulator_for<Type>::decompose(_sum);
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc) {
auto other = accumulator_for<Type>::deserialize(acc);
_sum += other;
@@ -248,7 +248,7 @@ public:
template <typename Type>
class impl_reducible_sum_function final : public impl_sum_function_for<Type> {
public:
virtual bytes_opt compute(cql_serialization_format sf) override {
virtual bytes_opt compute() override {
return this->get_accumulator();
}
};
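The reducible variant above overrides compute() to return the raw accumulator, so per-shard partial states can still be merged with reduce() before any finalization. A minimal sketch of that split for a sum (assumed shard layout, not Scylla's actual scheduling):

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Each "shard" produces a partial accumulator; reduce merges them, and
// only the merged state is finalized. For sum the finalize step is the
// identity, which is why impl_reducible_sum_function's compute() can
// simply return get_accumulator().
int64_t partial_sum(const std::vector<int64_t>& shard_values) {
    return std::accumulate(shard_values.begin(), shard_values.end(), int64_t{0});
}

int64_t reduce_partials(const std::vector<int64_t>& partials) {
    return std::accumulate(partials.begin(), partials.end(), int64_t{0});
}
```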
@@ -316,14 +316,14 @@ public:
_sum = {};
_count = 0;
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
Type ret{};
if (_count) {
ret = impl_div_for_avg<Type>::div(_sum, _count);
}
return data_type_for<Type>()->decompose(ret);
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -348,7 +348,7 @@ public:
);
return tuple_val.serialize();
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc) {
data_type tuple_type = tuple_type_impl::get_instance({accumulator_for<Type>::data_type(), long_type});
auto tuple = value_cast<tuple_type_impl::native_type>(tuple_type->deserialize(bytes_view(*acc)));
@@ -362,7 +362,7 @@ public:
template <typename Type>
class impl_reducible_avg_function : public impl_avg_function_for<Type> {
public:
virtual bytes_opt compute(cql_serialization_format sf) override {
virtual bytes_opt compute() override {
return this->get_accumulator();
}
};
@@ -457,13 +457,13 @@ public:
virtual void reset() override {
_max = {};
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
if (!_max) {
return {};
}
return data_type_for<Type>()->decompose(data_value(Type{*_max}));
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -487,8 +487,8 @@ public:
}
return {};
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
return add_input(sf, {acc});
virtual void reduce(const opt_bytes& acc) override {
return add_input({acc});
}
};
@@ -502,10 +502,10 @@ public:
virtual void reset() override {
_max = {};
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
return _max.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (values.empty() || !values[0]) {
return;
}
@@ -519,11 +519,11 @@ public:
virtual opt_bytes get_accumulator() const override {
return _max;
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc && !acc->length()) {
return;
}
return add_input(sf, {acc});
return add_input({acc});
}
};
@@ -598,13 +598,13 @@ public:
virtual void reset() override {
_min = {};
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
if (!_min) {
return {};
}
return data_type_for<Type>()->decompose(data_value(Type{*_min}));
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -628,8 +628,8 @@ public:
}
return {};
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
return add_input(sf, {acc});
virtual void reduce(const opt_bytes& acc) override {
return add_input({acc});
}
};
@@ -643,10 +643,10 @@ public:
virtual void reset() override {
_min = {};
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
return _min.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (values.empty() || !values[0]) {
return;
}
@@ -660,11 +660,11 @@ public:
virtual opt_bytes get_accumulator() const override {
return _min;
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc && !acc->length()) {
return;
}
return add_input(sf, {acc});
return add_input({acc});
}
};
@@ -720,10 +720,10 @@ public:
virtual void reset() override {
_count = 0;
}
virtual opt_bytes compute(cql_serialization_format sf) override {
virtual opt_bytes compute() override {
return long_type->decompose(_count);
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -739,7 +739,7 @@ public:
virtual opt_bytes get_accumulator() const override {
return long_type->decompose(_count);
}
virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
virtual void reduce(const opt_bytes& acc) override {
if (acc) {
auto other = value_cast<int64_t>(long_type->deserialize(bytes_view(*acc)));
_count += other;
@@ -814,6 +814,35 @@ bool user_aggregate::is_reducible() const { return _reducefunc != nullptr; }
bool user_aggregate::requires_thread() const { return _sfunc->requires_thread() || (_finalfunc && _finalfunc->requires_thread()); }
bool user_aggregate::has_finalfunc() const { return _finalfunc != nullptr; }
std::ostream& user_aggregate::describe(std::ostream& os) const {
auto ks = cql3::util::maybe_quote(name().keyspace);
auto na = cql3::util::maybe_quote(name().name);
os << "CREATE AGGREGATE " << ks << "." << na << "(";
for (size_t i = 0; i < _arg_types.size(); i++) {
if (i > 0) {
os << ", ";
}
os << _arg_types[i]->cql3_type_name();
}
os << ")\n";
os << "SFUNC " << cql3::util::maybe_quote(_sfunc->name().name) << "\n"
<< "STYPE " << _sfunc->return_type()->cql3_type_name();
if (is_reducible()) {
os << "\n" << "REDUCEFUNC " << cql3::util::maybe_quote(_reducefunc->name().name);
}
if (has_finalfunc()) {
os << "\n" << "FINALFUNC " << cql3::util::maybe_quote(_finalfunc->name().name);
}
if (_initcond) {
os << "\n" << "INITCOND " << _sfunc->return_type()->deserialize(bytes_view(*_initcond)).to_parsable_string();
}
os << ";";
return os;
}
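describe() relies on cql3::util::maybe_quote to keep the emitted CREATE AGGREGATE statement re-parsable. A sketch of the usual CQL quoting rule (lowercase unreserved identifiers pass through; everything else is wrapped in double quotes with embedded quotes doubled) — Scylla's exact reserved-word handling may differ, and this helper name is illustrative:

```cpp
#include <cassert>
#include <cctype>
#include <string>

// Quote an identifier unless it already matches the unquoted CQL
// identifier shape [a-z][a-z0-9_]*; embedded '"' characters are doubled.
std::string maybe_quote_sketch(const std::string& name) {
    bool plain = !name.empty() && std::islower(static_cast<unsigned char>(name[0]));
    for (char c : name) {
        unsigned char uc = static_cast<unsigned char>(c);
        if (!(std::islower(uc) || std::isdigit(uc) || c == '_')) {
            plain = false;
        }
    }
    if (plain) {
        return name;
    }
    std::string out = "\"";
    for (char c : name) {
        if (c == '"') {
            out += "\"\"";
        } else {
            out += c;
        }
    }
    out += '"';
    return out;
}
```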
shared_ptr<aggregate_function>
aggregate_fcts::make_count_rows_function() {
return make_shared<count_rows_function>();


@@ -18,7 +18,6 @@
#include "bytes_ostream.hh"
#include "types.hh"
#include "cql_serialization_format.hh"
#include <boost/algorithm/cxx11/any_of.hpp>
@@ -47,7 +46,7 @@ public:
virtual bool requires_thread() const override;
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
bytes_ostream encoded_row;
encoded_row.write("{", 1);
for (size_t i = 0; i < _selector_names.size(); ++i) {


@@ -14,7 +14,6 @@
#include "exceptions/exceptions.hh"
#include <seastar/core/print.hh>
#include "cql3/cql3_type.hh"
#include "cql_serialization_format.hh"
namespace cql3 {
@@ -28,7 +27,7 @@ shared_ptr<function>
make_to_blob_function(data_type from_type) {
auto name = from_type->as_cql3_type().to_string() + "asblob";
return make_native_scalar_function<true>(name, bytes_type, { from_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
[] (const std::vector<bytes_opt>& parameters) {
return parameters[0];
});
}
@@ -38,13 +37,13 @@ shared_ptr<function>
make_from_blob_function(data_type to_type) {
sstring name = sstring("blobas") + to_type->as_cql3_type().to_string();
return make_native_scalar_function<true>(name, to_type, { bytes_type },
[name, to_type] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[name, to_type] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
auto&& val = parameters[0];
if (!val) {
return val;
}
try {
to_type->validate(*val, sf);
to_type->validate(*val);
return val;
} catch (marshal_exception& e) {
using namespace exceptions;
@@ -58,7 +57,7 @@ inline
shared_ptr<function>
make_varchar_as_blob_fct() {
return make_native_scalar_function<true>("varcharasblob", bytes_type, { utf8_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
return parameters[0];
});
}
@@ -67,7 +66,7 @@ inline
shared_ptr<function>
make_blob_as_varchar_fct() {
return make_native_scalar_function<true>("blobasvarchar", utf8_type, { bytes_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
return parameters[0];
});
}


@@ -35,7 +35,7 @@ public:
virtual void print(std::ostream& os) const override {
os << "cast(" << _arg_types[0]->name() << " as " << _return_type->name() << ")";
}
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
auto from_type = arg_types()[0];
auto to_type = return_type();
@@ -165,8 +165,6 @@ static data_value castas_fctn_from_dv_to_string(data_value from) {
return from.type()->to_string_impl(from);
}
// FIXME: Add conversions for counters, after they are fully implemented...
static constexpr unsigned next_power_of_2(unsigned val) {
unsigned ret = 1;
while (ret <= val) {
@@ -370,6 +368,26 @@ castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
return castas_fctn_from_dv_to_string;
case cast_switch_case_val(kind::utf8, kind::ascii):
return castas_fctn_simple<sstring, sstring>;
case cast_switch_case_val(kind::byte, kind::counter):
return castas_fctn_simple<int8_t, int64_t>;
case cast_switch_case_val(kind::short_kind, kind::counter):
return castas_fctn_simple<int16_t, int64_t>;
case cast_switch_case_val(kind::int32, kind::counter):
return castas_fctn_simple<int32_t, int64_t>;
case cast_switch_case_val(kind::long_kind, kind::counter):
return castas_fctn_simple<int64_t, int64_t>;
case cast_switch_case_val(kind::float_kind, kind::counter):
return castas_fctn_simple<float, int64_t>;
case cast_switch_case_val(kind::double_kind, kind::counter):
return castas_fctn_simple<double, int64_t>;
case cast_switch_case_val(kind::varint, kind::counter):
return castas_fctn_simple<utils::multiprecision_int, int64_t>;
case cast_switch_case_val(kind::decimal, kind::counter):
return castas_fctn_from_integer_to_decimal<int64_t>;
case cast_switch_case_val(kind::ascii, kind::counter):
case cast_switch_case_val(kind::utf8, kind::counter):
return castas_fctn_to_string<int64_t>;
}
throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
}


@@ -40,8 +40,8 @@ public:
return Pure;
}
bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
return _func(sf, parameters);
bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
return _func(parameters);
}
};
@@ -61,7 +61,7 @@ make_failure_injection_function(sstring name,
shared_ptr<function> make_enable_injection_function() {
return make_failure_injection_function<false>("enable_injection", empty_type, { ascii_type, ascii_type },
[] (cql_serialization_format, const std::vector<bytes_opt>& parameters) {
[] (const std::vector<bytes_opt>& parameters) {
sstring injection_name = ascii_type->get_string(parameters[0].value());
const bool one_shot = ascii_type->get_string(parameters[1].value()) == "true";
smp::invoke_on_all([injection_name, one_shot] () mutable {
@@ -73,7 +73,7 @@ shared_ptr<function> make_enable_injection_function() {
shared_ptr<function> make_disable_injection_function() {
return make_failure_injection_function<false>("disable_injection", empty_type, { ascii_type },
[] (cql_serialization_format, const std::vector<bytes_opt>& parameters) {
[] (const std::vector<bytes_opt>& parameters) {
sstring injection_name = ascii_type->get_string(parameters[0].value());
smp::invoke_on_all([injection_name] () mutable {
utils::get_local_injector().disable(injection_name);
@@ -85,7 +85,7 @@ shared_ptr<function> make_disable_injection_function() {
shared_ptr<function> make_enabled_injections_function() {
const auto list_type_inst = list_type_impl::get_instance(ascii_type, false);
return make_failure_injection_function<true>("enabled_injections", list_type_inst, {},
[list_type_inst] (cql_serialization_format, const std::vector<bytes_opt>&) -> bytes {
[list_type_inst] (const std::vector<bytes_opt>&) -> bytes {
return seastar::map_reduce(smp::all_cpus(), [] (unsigned) {
return make_ready_future<std::vector<sstring>>(utils::get_local_injector().enabled_injections());
}, std::vector<data_value>(),


@@ -13,7 +13,10 @@
#include "cql3/lists.hh"
#include "cql3/constants.hh"
#include "cql3/user_types.hh"
#include "cql3/ut_name.hh"
#include "cql3/type_json.hh"
#include "cql3/functions/user_function.hh"
#include "cql3/functions/user_aggregate.hh"
#include "data_dictionary/data_dictionary.hh"
#include "types/map.hh"
#include "types/set.hh"
@@ -24,6 +27,7 @@
#include "cql3/prepare_context.hh"
#include "user_aggregate.hh"
#include "cql3/expr/expression.hh"
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptors.hpp>
#include "error_injection_fcts.hh"
@@ -49,6 +53,13 @@ bool abstract_function::requires_thread() const { return false; }
bool as_json_function::requires_thread() const { return false; }
static bool same_signature(const shared_ptr<function>& f1, const shared_ptr<function>& f2) {
if (f1 == nullptr || f2 == nullptr) {
return false;
}
return f1->name() == f2->name() && f1->arg_types() == f2->arg_types();
}
thread_local std::unordered_multimap<function_name, shared_ptr<function>> functions::_declared = init();
void functions::clear_functions() noexcept {
@@ -94,11 +105,6 @@ functions::init() noexcept {
if (type == cql3_type::blob) {
continue;
}
// counters are not supported yet
if (type.is_counter()) {
warn(unimplemented::cause::COUNTERS);
continue;
}
declare(make_to_blob_function(type.get_type()));
declare(make_from_blob_function(type.get_type()));
@@ -140,22 +146,56 @@ void functions::replace_function(shared_ptr<function> func) {
with_udf_iter(func->name(), func->arg_types(), [func] (functions::declared_t::iterator i) {
i->second = std::move(func);
});
auto scalar_func = dynamic_pointer_cast<scalar_function>(func);
if (!scalar_func) {
return;
}
for (auto& fit : _declared) {
auto aggregate = dynamic_pointer_cast<user_aggregate>(fit.second);
if (aggregate && (same_signature(aggregate->sfunc(), scalar_func)
|| (same_signature(aggregate->finalfunc(), scalar_func))
|| (same_signature(aggregate->reducefunc(), scalar_func))))
{
// we need to replace at least one underlying function
shared_ptr<scalar_function> sfunc = same_signature(aggregate->sfunc(), scalar_func) ? scalar_func : aggregate->sfunc();
shared_ptr<scalar_function> finalfunc = same_signature(aggregate->finalfunc(), scalar_func) ? scalar_func : aggregate->finalfunc();
shared_ptr<scalar_function> reducefunc = same_signature(aggregate->reducefunc(), scalar_func) ? scalar_func : aggregate->reducefunc();
fit.second = ::make_shared<user_aggregate>(aggregate->name(), aggregate->initcond(), sfunc, reducefunc, finalfunc);
}
}
}
void functions::remove_function(const function_name& name, const std::vector<data_type>& arg_types) {
with_udf_iter(name, arg_types, [] (functions::declared_t::iterator i) { _declared.erase(i); });
}
std::optional<function_name> functions::used_by_user_aggregate(const function_name& name) {
std::optional<function_name> functions::used_by_user_aggregate(shared_ptr<user_function> func) {
for (const shared_ptr<function>& fptr : _declared | boost::adaptors::map_values) {
auto aggregate = dynamic_pointer_cast<user_aggregate>(fptr);
if (aggregate && (aggregate->sfunc().name() == name || (aggregate->has_finalfunc() && aggregate->finalfunc().name() == name))) {
if (aggregate && (same_signature(aggregate->sfunc(), func)
|| (same_signature(aggregate->finalfunc(), func))
|| (same_signature(aggregate->reducefunc(), func))))
{
return aggregate->name();
}
}
return {};
}
std::optional<function_name> functions::used_by_user_function(const ut_name& user_type) {
for (const shared_ptr<function>& fptr : _declared | boost::adaptors::map_values) {
for (auto& arg_type : fptr->arg_types()) {
if (arg_type->references_user_type(user_type.get_keyspace(), user_type.get_user_type_name())) {
return fptr->name();
}
}
if (fptr->return_type()->references_user_type(user_type.get_keyspace(), user_type.get_user_type_name())) {
return fptr->name();
}
}
return {};
}
lw_shared_ptr<column_specification>
functions::make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
const function& fun, size_t i) {
@@ -171,7 +211,7 @@ inline
shared_ptr<function>
make_to_json_function(data_type t) {
return make_native_scalar_function<true>("tojson", utf8_type, {t},
[t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[t](const std::vector<bytes_opt>& parameters) -> bytes_opt {
return utf8_type->decompose(to_json_string(*t, parameters[0]));
});
}
@@ -180,12 +220,12 @@ inline
shared_ptr<function>
make_from_json_function(data_dictionary::database db, const sstring& keyspace, data_type t) {
return make_native_scalar_function<true>("fromjson", t, {utf8_type},
[&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[&db, keyspace, t](const std::vector<bytes_opt>& parameters) -> bytes_opt {
try {
rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
bytes_opt parsed_json_value;
if (!json_value.IsNull()) {
parsed_json_value.emplace(from_json_object(*t, json_value, sf));
parsed_json_value.emplace(from_json_object(*t, json_value));
}
return parsed_json_value;
} catch(rjson::error& e) {
@@ -382,6 +422,32 @@ functions::get(data_dictionary::database db,
return std::move(compatibles[0]);
}
template<typename F>
std::vector<shared_ptr<F>> functions::get_filtered_transformed(const sstring& keyspace) {
auto filter = [&] (const std::pair<const function_name, shared_ptr<function>>& d) -> bool {
return d.first.keyspace == keyspace && dynamic_cast<F*>(d.second.get());
};
auto transformer = [] (const std::pair<const function_name, shared_ptr<function>>& d) -> shared_ptr<F> {
return dynamic_pointer_cast<F>(d.second);
};
return boost::copy_range<std::vector<shared_ptr<F>>>(
_declared
| boost::adaptors::filtered(filter)
| boost::adaptors::transformed(transformer)
);
}
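get_filtered_transformed above combines a keyspace filter with a dynamic_pointer_cast transform over the declared multimap. A self-contained sketch of the same shape using only the standard library (toy function hierarchy in place of Scylla's; names here are illustrative):

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct fn_base { virtual ~fn_base() = default; std::string keyspace; };
struct user_fn : fn_base {};
struct agg_fn : fn_base {};

// Keep only entries in the requested keyspace whose value is really a T,
// mirroring the boost filtered | transformed pipeline in the diff.
template <typename T>
std::vector<std::shared_ptr<T>> filtered_cast(
        const std::unordered_multimap<std::string, std::shared_ptr<fn_base>>& declared,
        const std::string& ks) {
    std::vector<std::shared_ptr<T>> out;
    for (const auto& [name, fn] : declared) {
        if (fn->keyspace == ks) {
            if (auto typed = std::dynamic_pointer_cast<T>(fn)) {
                out.push_back(std::move(typed));
            }
        }
    }
    return out;
}
```

The dynamic_pointer_cast doubles as both the type filter and the transform, exactly as in the original: entries of the wrong dynamic type yield a null pointer and are skipped.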
std::vector<shared_ptr<user_function>>
functions::get_user_functions(const sstring& keyspace) {
return get_filtered_transformed<user_function>(keyspace);
}
std::vector<shared_ptr<user_aggregate>>
functions::get_user_aggregates(const sstring& keyspace) {
return get_filtered_transformed<user_aggregate>(keyspace);
}
boost::iterator_range<functions::declared_t::iterator>
functions::find(const function_name& name) {
assert(name.has_keyspace()); // : "function name not fully qualified";


@@ -27,6 +27,10 @@
namespace cql3 {
namespace functions {
//forward declarations
class user_function;
class user_aggregate;
using declared_t = std::unordered_multimap<function_name, shared_ptr<function>>;
void add_agg_functions(declared_t& funcs);
@@ -57,6 +61,8 @@ public:
const std::vector<shared_ptr<assignment_testable>> args(std::begin(provided_args), std::end(provided_args));
return get(db, keyspace, name, args, receiver_ks, receiver_cf, receiver);
}
static std::vector<shared_ptr<user_function>> get_user_functions(const sstring& keyspace);
static std::vector<shared_ptr<user_aggregate>> get_user_aggregates(const sstring& keyspace);
static boost::iterator_range<declared_t::iterator> find(const function_name& name);
static declared_t::iterator find_iter(const function_name& name, const std::vector<data_type>& arg_types);
static shared_ptr<function> find(const function_name& name, const std::vector<data_type>& arg_types);
@@ -65,11 +71,15 @@ public:
static void add_function(shared_ptr<function>);
static void replace_function(shared_ptr<function>);
static void remove_function(const function_name& name, const std::vector<data_type>& arg_types);
static std::optional<function_name> used_by_user_aggregate(const function_name& name);
static std::optional<function_name> used_by_user_aggregate(shared_ptr<user_function>);
static std::optional<function_name> used_by_user_function(const ut_name& user_type);
private:
template <typename F>
static void with_udf_iter(const function_name& name, const std::vector<data_type>& arg_types, F&& f);
template <typename F>
static std::vector<shared_ptr<F>> get_filtered_transformed(const sstring& keyspace);
// This method and matchArguments are somewhat duplicate, but this method allows us to provide more precise errors in the common
// case where there is no override for a given function. This is thus probably worth the minor code duplication.
static void validate_types(data_dictionary::database db,


@@ -12,7 +12,6 @@
#include "native_function.hh"
#include "scalar_function.hh"
#include "cql_serialization_format.hh"
#include "log.hh"
#include <seastar/core/shared_ptr.hh>
@@ -48,9 +47,9 @@ public:
virtual bool is_pure() const override {
return Pure;
}
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
try {
return _func(sf, parameters);
return _func(parameters);
} catch(exceptions::cassandra_exception&) {
// If the function's code took the time to produce an official
// cassandra_exception, pass it through. Otherwise, below we will


@@ -23,12 +23,11 @@ public:
/**
* Applies this function to the specified parameter.
*
* @param protocolVersion protocol version used for parameters and return value
* @param parameters the input parameters
* @return the result of applying this function to the parameter
* @throws InvalidRequestException if this function cannot not be applied to the parameter
*/
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) = 0;
virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) = 0;
};

View File

@@ -24,7 +24,7 @@ inline
shared_ptr<function>
make_now_fct() {
return make_native_scalar_function<false>("now", timeuuid_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
return {to_bytes(utils::UUID_gen::get_time_UUID())};
});
}
@@ -42,7 +42,7 @@ inline
shared_ptr<function>
make_min_timeuuid_fct() {
return make_native_scalar_function<true>("mintimeuuid", timeuuid_type, { timestamp_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
auto& bb = values[0];
if (!bb) {
return {};
@@ -60,7 +60,7 @@ inline
shared_ptr<function>
make_max_timeuuid_fct() {
return make_native_scalar_function<true>("maxtimeuuid", timeuuid_type, { timestamp_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
auto& bb = values[0];
if (!bb) {
return {};
@@ -89,7 +89,7 @@ inline
shared_ptr<function>
make_date_of_fct() {
return make_native_scalar_function<true>("dateof", timestamp_type, { timeuuid_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -104,7 +104,7 @@ inline
shared_ptr<function>
make_unix_timestamp_of_fct() {
return make_native_scalar_function<true>("unixtimestampof", long_type, { timeuuid_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -117,7 +117,7 @@ make_unix_timestamp_of_fct() {
inline shared_ptr<function>
make_currenttimestamp_fct() {
return make_native_scalar_function<false>("currenttimestamp", timestamp_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
return {timestamp_type->decompose(db_clock::now())};
});
}
@@ -125,7 +125,7 @@ make_currenttimestamp_fct() {
inline shared_ptr<function>
make_currenttime_fct() {
return make_native_scalar_function<false>("currenttime", time_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
constexpr int64_t milliseconds_in_day = 3600 * 24 * 1000;
int64_t milliseconds_since_epoch = std::chrono::duration_cast<std::chrono::milliseconds>(db_clock::now().time_since_epoch()).count();
int64_t nanoseconds_today = (milliseconds_since_epoch % milliseconds_in_day) * 1000 * 1000;
@@ -136,7 +136,7 @@ make_currenttime_fct() {
inline shared_ptr<function>
make_currentdate_fct() {
return make_native_scalar_function<false>("currentdate", simple_date_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
auto to_simple_date = get_castas_fctn(simple_date_type, timestamp_type);
return {simple_date_type->decompose(to_simple_date(db_clock::now()))};
});
@@ -146,7 +146,7 @@ inline
shared_ptr<function>
make_currenttimeuuid_fct() {
return make_native_scalar_function<false>("currenttimeuuid", timeuuid_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
return {timeuuid_type->decompose(timeuuid_native_type{utils::UUID_gen::get_time_UUID()})};
});
}
@@ -155,7 +155,7 @@ inline
shared_ptr<function>
make_timeuuidtodate_fct() {
return make_native_scalar_function<true>("todate", simple_date_type, { timeuuid_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -171,7 +171,7 @@ inline
shared_ptr<function>
make_timestamptodate_fct() {
return make_native_scalar_function<true>("todate", simple_date_type, { timestamp_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -190,7 +190,7 @@ inline
shared_ptr<function>
make_timeuuidtotimestamp_fct() {
return make_native_scalar_function<true>("totimestamp", timestamp_type, { timeuuid_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -205,7 +205,7 @@ inline
shared_ptr<function>
make_datetotimestamp_fct() {
return make_native_scalar_function<true>("totimestamp", timestamp_type, { simple_date_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -224,7 +224,7 @@ inline
shared_ptr<function>
make_timeuuidtounixtimestamp_fct() {
return make_native_scalar_function<true>("tounixtimestamp", long_type, { timeuuid_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -242,7 +242,7 @@ inline
shared_ptr<function>
make_timestamptounixtimestamp_fct() {
return make_native_scalar_function<true>("tounixtimestamp", long_type, { timestamp_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -260,7 +260,7 @@ inline
shared_ptr<function>
make_datetounixtimestamp_fct() {
return make_native_scalar_function<true>("tounixtimestamp", long_type, { simple_date_type },
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {

View File

@@ -31,7 +31,7 @@ public:
, _schema(s) {
}
bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
if (std::any_of(parameters.cbegin(), parameters.cend(), [](const auto& param){ return !param; })) {
return std::nullopt;
}

View File

@@ -11,11 +11,12 @@
#include "abstract_function.hh"
#include "scalar_function.hh"
#include "aggregate_function.hh"
#include "data_dictionary/keyspace_element.hh"
namespace cql3 {
namespace functions {
class user_aggregate : public abstract_function, public aggregate_function{
class user_aggregate : public abstract_function, public aggregate_function, public data_dictionary::keyspace_element {
bytes_opt _initcond;
::shared_ptr<scalar_function> _sfunc;
::shared_ptr<scalar_function> _reducefunc;
@@ -31,14 +32,19 @@ public:
virtual bool requires_thread() const override;
bool has_finalfunc() const;
const scalar_function& sfunc() const {
return *_sfunc;
virtual sstring keypace_name() const override { return name().keyspace; }
virtual sstring element_name() const override { return name().name; }
virtual sstring element_type() const override { return "aggregate"; }
virtual std::ostream& describe(std::ostream& os) const override;
seastar::shared_ptr<scalar_function> sfunc() const {
return _sfunc;
}
const scalar_function& reducefunc() const {
return *_reducefunc;
seastar::shared_ptr<scalar_function> reducefunc() const {
return _reducefunc;
}
const scalar_function& finalfunc() const {
return *_finalfunc;
seastar::shared_ptr<scalar_function> finalfunc() const {
return _finalfunc;
}
const bytes_opt& initcond() const {
return _initcond;

View File

@@ -7,8 +7,8 @@
*/
#include "user_function.hh"
#include "cql3/util.hh"
#include "log.hh"
#include "cql_serialization_format.hh"
#include "lang/wasm.hh"
#include <seastar/core/thread.hh>
@@ -32,7 +32,7 @@ bool user_function::is_aggregate() const { return false; }
bool user_function::requires_thread() const { return true; }
bytes_opt user_function::execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
bytes_opt user_function::execute(const std::vector<bytes_opt>& parameters) {
const auto& types = arg_types();
if (parameters.size() != types.size()) {
throw std::logic_error("Wrong number of parameters");
@@ -66,5 +66,33 @@ bytes_opt user_function::execute(cql_serialization_format sf, const std::vector<
});
}
std::ostream& user_function::describe(std::ostream& os) const {
auto ks = cql3::util::maybe_quote(name().keyspace);
auto na = cql3::util::maybe_quote(name().name);
os << "CREATE FUNCTION " << ks << "." << na << "(";
for (size_t i = 0; i < _arg_names.size(); i++) {
if (i > 0) {
os << ", ";
}
os << _arg_names[i] << " " << _arg_types[i]->cql3_type_name();
}
os << ")\n";
if (_called_on_null_input) {
os << "CALLED";
} else {
os << "RETURNS NULL";
}
os << " ON NULL INPUT\n"
<< "RETURNS " << _return_type->cql3_type_name() << "\n"
<< "LANGUAGE " << _language << "\n"
<< "AS $$\n"
<< _body << "\n"
<< "$$;";
return os;
}
}
}

View File

@@ -14,18 +14,19 @@
#include "scalar_function.hh"
#include "lang/lua.hh"
#include "lang/wasm.hh"
#include "data_dictionary/keyspace_element.hh"
namespace cql3 {
namespace functions {
class user_function final : public abstract_function, public scalar_function {
class user_function final : public abstract_function, public scalar_function, public data_dictionary::keyspace_element {
public:
struct lua_context {
sstring bitcode;
// FIXME: We should not need a copy in each function. It is here
// because user_function::execute is only passed the
// cql_serialization_format and the runtime arguments. We could
// the runtime arguments. We could
// avoid it by having a runtime->execute(user_function) instead,
// but that is a large refactoring. We could also store a
// lua_runtime in a thread_local variable, but that is one extra
@@ -58,7 +59,12 @@ public:
virtual bool is_native() const override;
virtual bool is_aggregate() const override;
virtual bool requires_thread() const override;
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override;
virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override;
virtual sstring keypace_name() const override { return name().keyspace; }
virtual sstring element_name() const override { return name().name; }
virtual sstring element_type() const override { return "function"; }
virtual std::ostream& describe(std::ostream& os) const override;
};
}

View File

@@ -22,7 +22,7 @@ inline
shared_ptr<function>
make_uuid_fct() {
return make_native_scalar_function<false>("uuid", uuid_type, {},
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
return {uuid_type->decompose(utils::make_random_uuid())};
});
}

View File

@@ -37,9 +37,6 @@ lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const u
void
lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const cql3::raw_value& value) {
if (value.is_unset_value()) {
return;
}
if (column.type->is_multi_cell()) {
// Delete all cells first, then append new ones
collection_mutation_view_description mut;
@@ -70,13 +67,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
if (index.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for list index");
}
if (index.is_unset_value()) {
throw exceptions::invalid_request_exception("Invalid unset value for list index");
}
auto value = expr::evaluate(*_e, params._options);
if (value.is_unset_value()) {
return;
}
auto idx = index.view().deserialize<int32_t>(*int32_type);
auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
@@ -122,10 +113,6 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
throw exceptions::invalid_request_exception("Invalid null value for list index");
}
if (index.is_unset_value()) {
throw exceptions::invalid_request_exception("Invalid unset value for list index");
}
auto ltype = static_cast<const list_type_impl*>(column.type.get());
collection_mutation_description mut;
@@ -145,9 +132,6 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
void
lists::appender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
const cql3::raw_value value = expr::evaluate(*_e, params._options);
if (value.is_unset_value()) {
return;
}
assert(column.type->is_multi_cell()); // "Attempted to append to a frozen list";
do_append(value, m, prefix, column, params);
}
@@ -161,7 +145,7 @@ lists::do_append(const cql3::raw_value& list_value,
if (column.type->is_multi_cell()) {
// If we append null, do nothing. Note that for Setter, we've
// already removed the previous value so we're good here too
if (list_value.is_null_or_unset()) {
if (list_value.is_null()) {
return;
}
@@ -199,7 +183,7 @@ void
lists::prepender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to prepend to a frozen list";
cql3::raw_value lvalue = expr::evaluate(*_e, params._options);
if (lvalue.is_null_or_unset()) {
if (lvalue.is_null()) {
return;
}
@@ -265,7 +249,7 @@ lists::discarder::execute(mutation& m, const clustering_key_prefix& prefix, cons
return;
}
if (lvalue.is_null_or_unset()) {
if (lvalue.is_null()) {
return;
}
@@ -304,9 +288,6 @@ lists::discarder_by_index::execute(mutation& m, const clustering_key_prefix& pre
if (index.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for list index");
}
if (index.is_unset_value()) {
return;
}
auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
int32_t idx = index.view().deserialize<int32_t>(*int32_type);

View File

@@ -27,21 +27,21 @@ public:
static lw_shared_ptr<column_specification> value_spec_of(const column_specification&);
static lw_shared_ptr<column_specification> uuid_index_spec_of(const column_specification&);
public:
class setter : public operation {
class setter : public operation_skip_if_unset {
public:
setter(const column_definition& column, expr::expression e)
: operation(column, std::move(e)) {
: operation_skip_if_unset(column, std::move(e)) {
}
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
static void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const cql3::raw_value& value);
};
class setter_by_index : public operation {
class setter_by_index : public operation_skip_if_unset {
protected:
expr::expression _idx;
public:
setter_by_index(const column_definition& column, expr::expression idx, expr::expression e)
: operation(column, std::move(e)), _idx(std::move(idx)) {
: operation_skip_if_unset(column, std::move(e)), _idx(std::move(idx)) {
}
virtual bool requires_read() const override;
virtual void fill_prepare_context(prepare_context& ctx) override;
@@ -57,9 +57,9 @@ public:
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class appender : public operation {
class appender : public operation_skip_if_unset {
public:
using operation::operation;
using operation_skip_if_unset::operation_skip_if_unset;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
@@ -69,25 +69,25 @@ public:
const column_definition& column,
const update_parameters& params);
class prepender : public operation {
class prepender : public operation_skip_if_unset {
public:
using operation::operation;
using operation_skip_if_unset::operation_skip_if_unset;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class discarder : public operation {
class discarder : public operation_skip_if_unset {
public:
discarder(const column_definition& column, expr::expression e)
: operation(column, std::move(e)) {
: operation_skip_if_unset(column, std::move(e)) {
}
virtual bool requires_read() const override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class discarder_by_index : public operation {
class discarder_by_index : public operation_skip_if_unset {
public:
discarder_by_index(const column_definition& column, expr::expression idx)
: operation(column, std::move(idx)) {
: operation_skip_if_unset(column, std::move(idx)) {
}
virtual bool requires_read() const override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;

View File

@@ -26,9 +26,6 @@ maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const u
void
maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params, const column_definition& column, const cql3::raw_value& value) {
if (value.is_unset_value()) {
return;
}
if (column.type->is_multi_cell()) {
// Delete all cells first, then put new ones
collection_mutation_description mut;
@@ -50,12 +47,6 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
auto key = expr::evaluate(_k, params._options);
auto value = expr::evaluate(*_e, params._options);
if (value.is_unset_value()) {
return;
}
if (key.is_unset_value()) {
throw invalid_request_exception("Invalid unset map key");
}
if (key.is_null()) {
throw invalid_request_exception("Invalid null map key");
}
@@ -73,9 +64,7 @@ void
maps::putter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to add items to a frozen map";
cql3::raw_value value = expr::evaluate(*_e, params._options);
if (!value.is_unset_value()) {
do_put(m, prefix, params, value, column);
}
do_put(m, prefix, params, value, column);
}
void
@@ -111,9 +100,6 @@ maps::discarder_by_key::execute(mutation& m, const clustering_key_prefix& prefix
if (key.is_null()) {
throw exceptions::invalid_request_exception("Invalid null map key");
}
if (key.is_unset_value()) {
throw exceptions::invalid_request_exception("Invalid unset map key");
}
collection_mutation_description mut;
mut.cells.emplace_back(std::move(key).to_bytes(), params.make_dead_cell());

View File

@@ -27,30 +27,30 @@ public:
static lw_shared_ptr<column_specification> key_spec_of(const column_specification& column);
static lw_shared_ptr<column_specification> value_spec_of(const column_specification& column);
class setter : public operation {
class setter : public operation_skip_if_unset {
public:
setter(const column_definition& column, expr::expression e)
: operation(column, std::move(e)) {
: operation_skip_if_unset(column, std::move(e)) {
}
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
static void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params, const column_definition& column, const cql3::raw_value& value);
};
class setter_by_key : public operation {
class setter_by_key : public operation_skip_if_unset {
expr::expression _k;
public:
setter_by_key(const column_definition& column, expr::expression k, expr::expression e)
: operation(column, std::move(e)), _k(std::move(k)) {
: operation_skip_if_unset(column, std::move(e)), _k(std::move(k)) {
}
virtual void fill_prepare_context(prepare_context& ctx) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class putter : public operation {
class putter : public operation_skip_if_unset {
public:
putter(const column_definition& column, expr::expression e)
: operation(column, std::move(e)) {
: operation_skip_if_unset(column, std::move(e)) {
}
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
@@ -58,10 +58,10 @@ public:
static void do_put(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params,
const cql3::raw_value& value, const column_definition& column);
class discarder_by_key : public operation {
class discarder_by_key : public operation_no_unset_support {
public:
discarder_by_key(const column_definition& column, expr::expression k)
: operation(column, std::move(k)) {
: operation_no_unset_support(column, std::move(k)) {
}
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};

View File

@@ -32,9 +32,9 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
using exceptions::invalid_request_exception;
auto rtype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
if (!rtype) {
throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name()));
throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name_as_text()));
} else if (!rtype->is_multi_cell()) {
throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
}
if (rtype->get_kind() == abstract_type::kind::list) {
@@ -47,7 +47,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
return make_shared<lists::setter_by_index>(receiver, std::move(idx), std::move(lval));
}
} else if (rtype->get_kind() == abstract_type::kind::set) {
throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name()));
throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name_as_text()));
} else if (rtype->get_kind() == abstract_type::kind::map) {
auto key = prepare_expression(_selector, db, keyspace, nullptr, maps::key_spec_of(*receiver.column_specification));
auto mval = prepare_expression(_value, db, keyspace, nullptr, maps::value_spec_of(*receiver.column_specification));
@@ -136,11 +136,11 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
if (!ctype) {
if (!receiver.is_counter()) {
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
}
return make_shared<constants::adder>(receiver, std::move(v));
} else if (!ctype->is_multi_cell()) {
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
}
if (ctype->get_kind() == abstract_type::kind::list) {
@@ -169,14 +169,14 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
if (!ctype) {
if (!receiver.is_counter()) {
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
}
auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
return make_shared<constants::subtracter>(receiver, std::move(v));
}
if (!ctype->is_multi_cell()) {
throw exceptions::invalid_request_exception(
format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
}
if (ctype->get_kind() == abstract_type::kind::list) {
@@ -211,9 +211,9 @@ operation::prepend::prepare(data_dictionary::database db, const sstring& keyspac
auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
if (!dynamic_cast<const list_type_impl*>(receiver.type.get())) {
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name_as_text()));
} else if (!receiver.type->is_multi_cell()) {
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name_as_text()));
}
return make_shared<lists::prepender>(receiver, std::move(v));
@@ -268,9 +268,9 @@ operation::set_counter_value_from_tuple_list::prepare(data_dictionary::database
auto v = prepare_expression(_value, db, keyspace, nullptr, spec);
// Will not be used elsewhere, so make it local.
class counter_setter : public operation {
class counter_setter : public operation_no_unset_support {
public:
using operation::operation;
using operation_no_unset_support::operation_no_unset_support;
bool is_raw_counter_shard_write() const override {
return true;
@@ -340,9 +340,9 @@ operation::element_deletion::affected_column() const {
shared_ptr<operation>
operation::element_deletion::prepare(data_dictionary::database db, const sstring& keyspace, const column_definition& receiver) const {
if (!receiver.type->is_collection()) {
throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name_as_text()));
} else if (!receiver.type->is_multi_cell()) {
throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name()));
throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name_as_text()));
}
auto ctype = static_pointer_cast<const collection_type_impl>(receiver.type);
if (ctype->get_kind() == abstract_type::kind::list) {

View File

@@ -17,6 +17,7 @@
#include "update_parameters.hh"
#include "cql3/column_identifier.hh"
#include "cql3/expr/expression.hh"
#include "cql3/expr/unset.hh"
#include <optional>
@@ -54,10 +55,13 @@ protected:
// may require none of more than one expression, but most need 1 so it simplify things a bit.
std::optional<expr::expression> _e;
// A guard to check if the operation should be skipped due to unset operand.
expr::unset_bind_variable_guard _unset_guard;
public:
operation(const column_definition& column_, std::optional<expr::expression> e)
operation(const column_definition& column_, std::optional<expr::expression> e, expr::unset_bind_variable_guard ubvg)
: column{column_}
, _e(std::move(e))
, _unset_guard(std::move(ubvg))
{ }
virtual ~operation() {}
@@ -87,10 +91,14 @@ public:
}
/**
* Execute the operation.
* Execute the operation. Check should_skip_operation() first.
*/
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) = 0;
bool should_skip_operation(const query_options& qo) const {
return _unset_guard.is_unset(qo);
}
virtual void prepare_for_broadcast_tables(statements::broadcast_tables::prepared_update&) const;
/**
@@ -265,4 +273,18 @@ public:
};
};
class operation_skip_if_unset : public operation {
public:
operation_skip_if_unset(const column_definition& column, expr::expression e)
: operation(column, e, expr::unset_bind_variable_guard(e)) {
}
};
class operation_no_unset_support : public operation {
public:
operation_no_unset_support(const column_definition& column, std::optional<expr::expression> e)
: operation(column, std::move(e), expr::unset_bind_variable_guard(std::nullopt)) {
}
};
}

View File

@@ -23,42 +23,43 @@ thread_local const query_options::specific_options query_options::specific_optio
thread_local query_options query_options::DEFAULT{default_cql_config,
db::consistency_level::ONE, std::nullopt,
std::vector<cql3::raw_value_view>(), false, query_options::specific_options::DEFAULT, cql_serialization_format::latest()};
std::vector<cql3::raw_value_view>(), false, query_options::specific_options::DEFAULT};
query_options::query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value> values,
std::vector<cql3::raw_value_view> value_views,
cql3::unset_bind_variable_vector unset,
bool skip_metadata,
specific_options options,
cql_serialization_format sf)
specific_options options
)
: _cql_config(cfg)
, _consistency(consistency)
, _names(std::move(names))
, _values(std::move(values))
, _value_views(value_views)
, _unset(unset)
, _skip_metadata(skip_metadata)
, _options(std::move(options))
, _cql_serialization_format(sf)
{
}
query_options::query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value> values,
cql3::raw_value_vector_with_unset values,
bool skip_metadata,
specific_options options,
cql_serialization_format sf)
specific_options options
)
: _cql_config(cfg)
, _consistency(consistency)
, _names(std::move(names))
, _values(std::move(values))
, _values(std::move(values.values))
, _value_views()
, _unset(std::move(values.unset))
, _skip_metadata(skip_metadata)
, _options(std::move(options))
, _cql_serialization_format(sf)
{
fill_value_views();
}
@@ -66,22 +67,22 @@ query_options::query_options(const cql_config& cfg,
query_options::query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value_view> value_views,
cql3::raw_value_view_vector_with_unset value_views,
bool skip_metadata,
specific_options options,
cql_serialization_format sf)
specific_options options
)
: _cql_config(cfg)
, _consistency(consistency)
, _names(std::move(names))
, _values()
, _value_views(std::move(value_views))
, _value_views(std::move(value_views.values))
, _unset(std::move(value_views.unset))
, _skip_metadata(skip_metadata)
, _options(std::move(options))
, _cql_serialization_format(sf)
{
}
query_options::query_options(db::consistency_level cl, std::vector<cql3::raw_value> values,
query_options::query_options(db::consistency_level cl, cql3::raw_value_vector_with_unset values,
specific_options options)
: query_options(
default_cql_config,
@@ -89,8 +90,7 @@ query_options::query_options(db::consistency_level cl, std::vector<cql3::raw_val
{},
std::move(values),
false,
std::move(options),
cql_serialization_format::latest()
std::move(options)
)
{
}
@@ -101,9 +101,9 @@ query_options::query_options(std::unique_ptr<query_options> qo, lw_shared_ptr<se
std::move(qo->_names),
std::move(qo->_values),
std::move(qo->_value_views),
std::move(qo->_unset),
qo->_skip_metadata,
query_options::specific_options{qo->_options.page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp},
qo->_cql_serialization_format) {
query_options::specific_options{qo->_options.page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}) {
}
@@ -113,13 +113,13 @@ query_options::query_options(std::unique_ptr<query_options> qo, lw_shared_ptr<se
std::move(qo->_names),
std::move(qo->_values),
std::move(qo->_value_views),
std::move(qo->_unset),
qo->_skip_metadata,
query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp},
qo->_cql_serialization_format) {
query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}) {
}
query_options::query_options(std::vector<cql3::raw_value> values)
query_options::query_options(cql3::raw_value_vector_with_unset values)
: query_options(
db::consistency_level::ONE, std::move(values))
{}
@@ -135,12 +135,21 @@ void query_options::prepare(const std::vector<lw_shared_ptr<column_specification
ordered_values.reserve(specs.size());
for (auto&& spec : specs) {
auto& spec_name = spec->name->text();
bool found_value_for_name = false;
for (size_t j = 0; j < names.size(); j++) {
if (names[j] == spec_name) {
ordered_values.emplace_back(_value_views[j]);
found_value_for_name = true;
break;
}
}
// No bound value was found with the name `spec_name`.
// This means that the user forgot to include a bound value with that name.
if (!found_value_for_name) {
throw exceptions::invalid_request_exception(
format("Missing value for bind marker with name: {}", spec_name));
}
}
_value_views = std::move(ordered_values);
}


@@ -11,13 +11,14 @@
#pragma once
#include <concepts>
#include <initializer_list>
#include "timestamp.hh"
#include "bytes.hh"
#include "db/consistency_level_type.hh"
#include "service/query_state.hh"
#include "service/pager/paging_state.hh"
#include "cql3/values.hh"
#include "cql_serialization_format.hh"
#include "utils/small_vector.hh"
namespace cql3 {
@@ -28,6 +29,38 @@ class column_specification;
using computed_function_values = std::unordered_map<uint8_t, bytes_opt>;
using unset_bind_variable_vector = utils::small_vector<bool, 16>;
// Matches a raw_value_view with an unset vector to support CQL binary protocol
// "unset" values.
struct raw_value_view_vector_with_unset {
std::vector<raw_value_view> values;
unset_bind_variable_vector unset;
raw_value_view_vector_with_unset(std::vector<raw_value_view> values_, unset_bind_variable_vector unset_) : values(std::move(values_)), unset(std::move(unset_)) {}
// Constructor with no unset support, for tests and internal queries
raw_value_view_vector_with_unset(std::vector<raw_value_view> values_) : values(std::move(values_)) {
unset.resize(values.size());
}
raw_value_view_vector_with_unset() = default;
};
// Matches a raw_value with an unset vector to support CQL binary protocol
// "unset" values.
struct raw_value_vector_with_unset {
std::vector<raw_value> values;
unset_bind_variable_vector unset;
raw_value_vector_with_unset(std::vector<raw_value> values_, unset_bind_variable_vector unset_) : values(std::move(values_)), unset(std::move(unset_)) {}
// Constructor with no unset support, for tests and internal queries
raw_value_vector_with_unset(std::vector<raw_value> values_) : values(std::move(values_)) {
unset.resize(values.size());
}
// Mostly for testing.
raw_value_vector_with_unset(std::initializer_list<raw_value> values_) : raw_value_vector_with_unset(std::vector(values_)) {}
raw_value_vector_with_unset() = default;
};
/**
* Options for a query.
*/
@@ -48,9 +81,9 @@ private:
const std::optional<std::vector<sstring_view>> _names;
std::vector<cql3::raw_value> _values;
std::vector<cql3::raw_value_view> _value_views;
unset_bind_variable_vector _unset;
const bool _skip_metadata;
const specific_options _options;
cql_serialization_format _cql_serialization_format;
std::optional<std::vector<query_options>> _batch_options;
// We must use the same microsecond-precision timestamp for
// all cells created by an LWT statement or when a statement
@@ -83,23 +116,10 @@ private:
// evaluation sites and we only have a const reference to `query_options`.
mutable computed_function_values _cached_pk_fn_calls;
private:
/**
* @brief Batch query_options constructor.
*
* Requirements:
* - @tparam OneMutationDataRange has a begin() and end() iterators.
* - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
*
* @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
* @param values_ranges a vector of values ranges for each statement in the batch.
*/
template<typename OneMutationDataRange>
requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
explicit query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges);
// Batch constructor.
template <typename Values>
requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
explicit query_options(query_options&& o, std::vector<Values> values_ranges);
public:
query_options(query_options&&) = default;
@@ -108,43 +128,30 @@ public:
explicit query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value> values,
raw_value_vector_with_unset values,
bool skip_metadata,
specific_options options,
cql_serialization_format sf);
specific_options options
);
explicit query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value> values,
std::vector<cql3::raw_value_view> value_views,
unset_bind_variable_vector unset,
bool skip_metadata,
specific_options options,
cql_serialization_format sf);
specific_options options
);
explicit query_options(const cql_config& cfg,
db::consistency_level consistency,
std::optional<std::vector<sstring_view>> names,
std::vector<cql3::raw_value_view> value_views,
raw_value_view_vector_with_unset value_views,
bool skip_metadata,
specific_options options,
cql_serialization_format sf);
specific_options options
);
/**
* @brief Batch query_options factory.
*
* Requirements:
* - @tparam OneMutationDataRange has a begin() and end() iterators.
* - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
*
* @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
* @param values_ranges a vector of values ranges for each statement in the batch.
*/
template<typename OneMutationDataRange>
requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
static query_options make_batch_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges) {
template <typename Values>
requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
static query_options make_batch_options(query_options&& o, std::vector<Values> values_ranges) {
return query_options(std::move(o), std::move(values_ranges));
}
@@ -152,8 +159,8 @@ public:
static thread_local query_options DEFAULT;
// forInternalUse
explicit query_options(std::vector<cql3::raw_value> values);
explicit query_options(db::consistency_level, std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
explicit query_options(raw_value_vector_with_unset values);
explicit query_options(db::consistency_level, raw_value_vector_with_unset values, specific_options options = specific_options::DEFAULT);
explicit query_options(std::unique_ptr<query_options>, lw_shared_ptr<service::pager::paging_state> paging_state);
explicit query_options(std::unique_ptr<query_options>, lw_shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);
@@ -162,7 +169,14 @@ public:
}
cql3::raw_value_view get_value_at(size_t idx) const {
return _value_views.at(idx);
if (_unset.at(idx)) {
throw exceptions::invalid_request_exception(fmt::format("Unexpected unset value for bind variable {}", idx));
}
return _value_views[idx];
}
bool is_unset(size_t idx) const {
return _unset.at(idx);
}
size_t get_values_count() const {
@@ -195,18 +209,6 @@ public:
return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
}
/**
* The protocol version for the query. Will be 3 if the object doesn't come from
* a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
*/
int get_protocol_version() const {
return _cql_serialization_format.protocol_version();
}
cql_serialization_format get_cql_serialization_format() const {
return _cql_serialization_format;
}
const query_options::specific_options& get_specific_options() const {
return _options;
}
@@ -278,19 +280,15 @@ private:
void fill_value_views();
};
template<typename OneMutationDataRange>
requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
query_options::query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges)
template <typename Values>
requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
query_options::query_options(query_options&& o, std::vector<Values> values_ranges)
: query_options(std::move(o))
{
std::vector<query_options> tmp;
tmp.reserve(values_ranges.size());
std::transform(values_ranges.begin(), values_ranges.end(), std::back_inserter(tmp), [this](auto& values_range) {
return query_options(_cql_config, _consistency, {}, std::move(values_range), _skip_metadata, _options, _cql_serialization_format);
return query_options(_cql_config, _consistency, {}, std::move(values_range), _skip_metadata, _options);
});
_batch_options = std::move(tmp);
}
