doc: remove wrong image upgrade info (5.2-to-2023.1)

This commit removes the information that described updating ScyllaDB and OS packages in one step as the recommended way of upgrading ScyllaDB images. This upgrade procedure is not supported (it was implemented, but then reverted). Refs https://github.com/scylladb/scylladb/issues/15733 Closes scylladb/scylladb#21876 Fixes https://github.com/scylladb/scylla-enterprise/issues/5041 Fixes https://github.com/scylladb/scylladb/issues/21898 (cherry picked from commit 98860905d8)
db/config.cc: increment components_memory_reclaim_threshold config default
2024-12-12 15:28:20 +02:00 · 2024-06-04 07:13:28 +03:00 · 2024-05-30 11:11:39 +03:00 · 2024-05-30 11:10:49 +03:00 · 2024-05-27 08:52:06 +03:00 · 2024-05-26 16:30:06 +03:00
834 changed files with 36708 additions and 18216 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -12,7 +12,7 @@ test/cql/cdc_* @kbr- @elcallio @piodul @jul-stas
 test/boost/cdc_* @kbr- @elcallio @piodul @jul-stas

 # COMMITLOG / BATCHLOG
-db/commitlog/* @elcallio
+db/commitlog/* @elcallio @eliransin
 db/batch* @elcallio

 # COORDINATOR
@@ -25,7 +25,7 @@ compaction/* @raphaelsc @nyh
 transport/*

 # CQL QUERY LANGUAGE
-cql3/* @tgrabiec @psarna @cvybhu
+cql3/* @tgrabiec @cvybhu @nyh

 # COUNTERS
 counters* @jul-stas
@@ -33,7 +33,7 @@ tests/counter_test* @jul-stas

 # DOCS
 docs/* @annastuchlik @tzach
-docs/alternator @annastuchlik @tzach @nyh @psarna
+docs/alternator @annastuchlik @tzach @nyh @havaker @nuivall

 # GOSSIP
 gms/* @tgrabiec @asias
@@ -45,9 +45,9 @@ dist/docker/*
 utils/logalloc* @tgrabiec

 # MATERIALIZED VIEWS
-db/view/* @nyh @psarna
-cql3/statements/*view* @nyh @psarna
-test/boost/view_* @nyh @psarna
+db/view/* @nyh @cvybhu @piodul
+cql3/statements/*view* @nyh @cvybhu @piodul
+test/boost/view_* @nyh @cvybhu @piodul

 # PACKAGING
 dist/* @syuu1228
@@ -62,9 +62,9 @@ service/migration* @tgrabiec @nyh
 schema* @tgrabiec @nyh

 # SECONDARY INDEXES
-db/index/* @nyh @psarna
-cql3/statements/*index* @nyh @psarna
-test/boost/*index* @nyh @psarna
+index/* @nyh @cvybhu @piodul
+cql3/statements/*index* @nyh @cvybhu @piodul
+test/boost/*index* @nyh @cvybhu @piodul

 # SSTABLES
 sstables/* @tgrabiec @raphaelsc @nyh
@@ -74,11 +74,11 @@ streaming/* @tgrabiec @asias
 service/storage_service.* @tgrabiec @asias

 # ALTERNATOR
-alternator/* @nyh @psarna
-test/alternator/* @nyh @psarna
+alternator/* @nyh @havaker @nuivall
+test/alternator/* @nyh @havaker @nuivall

 # HINTED HANDOFF
-db/hints/* @piodul @vladzcloudius
+db/hints/* @piodul @vladzcloudius @eliransin

 # REDIS
 redis/* @nyh @syuu1228
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,14 +1,11 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
 	url = ../scylla-swagger-ui
 	ignore = dirty
-[submodule "abseil"]
-	path = abseil
-	url = ../abseil-cpp
 [submodule "scylla-jmx"]
 	path = tools/jmx
 	url = ../scylla-jmx
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,22 +42,13 @@ set(Seastar_CXX_FLAGS ${cxx_coro_flag} ${target_arch_flag} CACHE INTERNAL "" FOR
 set(Seastar_CXX_DIALECT gnu++20 CACHE INTERNAL "" FORCE)

 add_subdirectory(seastar)
-add_subdirectory(abseil)
-# Exclude absl::strerror from the default "all" target since it's not
-# used in Scylla build and, moreover, makes use of deprecated glibc APIs,
-# such as sys_nerr, which are not exposed from "stdio.h" since glibc 2.32,
-# which happens to be the case for recent Fedora distribution versions.
-#
-# Need to use the internal "absl_strerror" target name instead of namespaced
-# variant because `set_target_properties` does not understand the latter form,
-# unfortunately.
-set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)

 # System libraries dependencies
 find_package(Boost COMPONENTS filesystem program_options system thread regex REQUIRED)
 find_package(Lua REQUIRED)
 find_package(ZLIB REQUIRED)
 find_package(ICU COMPONENTS uc REQUIRED)
+find_package(Abseil REQUIRED)

 set(scylla_build_dir "${CMAKE_BINARY_DIR}/build/${BUILD_TYPE}")
 set(scylla_gen_build_dir "${scylla_build_dir}/gen")
@@ -746,7 +737,6 @@ target_compile_definitions(scylla PRIVATE XXH_PRIVATE_API HAVE_LZ4_COMPRESS_DEFA
 target_include_directories(scylla PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    libdeflate
-    abseil
    "${scylla_gen_build_dir}")

 ###
--- a/4
+++ b/4
@@ -34,7 +34,7 @@ END

 DATE=""

-while [[ $# -gt 0 ]]; do
+while [ $# -gt 0 ]; do
 	opt="$1"
 	case $opt in
 		-h|--help)
@@ -72,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.2.0-dev
+VERSION=5.2.19

 if test -f version
 then
--- a/1
+++ b/1
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -141,7 +141,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
    service::storage_proxy::coordinator_query_result qr = co_await proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), empty_service_permit(), client_state));

-    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

    auto result_set = builder.build();
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -23,7 +23,7 @@ namespace alternator {
 // api_error into a JSON object, and that is returned to the user.
 class api_error final : public std::exception {
 public:
-    using status_type = httpd::reply::status_type;
+    using status_type = http::reply::status_type;
    status_type _http_code;
    std::string _type;
    std::string _msg;
@@ -77,7 +77,7 @@ public:
        return api_error("TableNotFoundException", std::move(msg));
    }
    static api_error internal(std::string msg) {
-        return api_error("InternalServerError", std::move(msg), reply::status_type::internal_server_error);
+        return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error);
    }

    // Provide the "std::exception" interface, to make it easier to print this
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -88,17 +88,20 @@ json::json_return_type make_streamed(rjson::value&& value) {
        // move objects to coroutine frame.
        auto los = std::move(os);
        auto lrs = std::move(rs);
+        std::exception_ptr ex;
        try {
            co_await rjson::print(*lrs, los);
-            co_await los.flush();
-            co_await los.close();
        } catch (...) {
            // at this point, we cannot really do anything. HTTP headers and return code are
            // already written, and quite potentially a portion of the content data.
            // just log + rethrow. It is probably better the HTTP server closes connection
            // abruptly or something...
-            elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
-            throw;
+            ex = std::current_exception();
+            elogger.error("Exception during streaming HTTP response: {}", ex);
+        }
+        co_await los.close();
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
        }
        co_return;
    };
@@ -761,7 +764,6 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
    const rjson::value* tags = rjson::find(request, "Tags");
    if (!tags || !tags->IsArray()) {
        co_return api_error::validation("Cannot parse tags");
@@ -769,8 +771,9 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
    if (tags->Size() < 1) {
        co_return api_error::validation("The number of tags must be at least 1") ;
    }
-    update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
+    });
    co_return json_string("");
 }

@@ -788,9 +791,9 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+    });
    co_return json_string("");
 }

@@ -2305,7 +2308,7 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
                rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(*cell, **column_it));
            }
        } else if (cell) {
-            auto deserialized = attrs_type()->deserialize(*cell, cql_serialization_format::latest());
+            auto deserialized = attrs_type()->deserialize(*cell);
            auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
            for (auto entry : keys_and_values) {
                std::string attr_name = value_cast<sstring>(entry.first);
@@ -2340,7 +2343,7 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
        const std::optional<attrs_to_get>& attrs_to_get) {
    rjson::value item = rjson::empty_object();

-    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
+    cql3::selection::result_set_builder builder(selection, gc_clock::now());
    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));

    auto result_set = builder.build();
@@ -2358,21 +2361,22 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
    return item;
 }

-std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get) {
-    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
-    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
+future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get) {
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
    auto result_set = builder.build();
    std::vector<rjson::value> ret;
    for (auto& result_row : result_set->rows()) {
        rjson::value item = rjson::empty_object();
-        describe_single_item(selection, result_row, attrs_to_get, item);
+        describe_single_item(*selection, result_row, *attrs_to_get, item);
        ret.push_back(std::move(item));
+        co_await coroutine::maybe_yield();
    }
-    return ret;
+    co_return ret;
 }

 static bool check_needs_read_before_write(const parsed::value& v) {
@@ -3254,8 +3258,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
-                std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
-                return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
+                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get));
            });
            response_futures.push_back(std::move(f));
        }
@@ -3511,7 +3514,7 @@ public:
                    rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it));
                }
            } else {
-                auto deserialized = attrs_type()->deserialize(bv, cql_serialization_format::latest());
+                auto deserialized = attrs_type()->deserialize(bv);
                auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
                for (auto entry : keys_and_values) {
                    std::string attr_name = value_cast<sstring>(entry.first);
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -222,11 +222,11 @@ public:
        const query::result&,
        const std::optional<attrs_to_get>&);

-    static std::vector<rjson::value> describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get);
+    static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<bytes_opt>&,
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -73,7 +73,7 @@ struct from_json_visitor {
    }
    // default
    void operator()(const abstract_type& t) const {
-        bo.write(from_json_object(t, v, cql_serialization_format::internal()));
+        bo.write(from_json_object(t, v));
    }
 };

--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -28,6 +28,8 @@
 static logging::logger slogger("alternator-server");

 using namespace httpd;
+using request = http::request;
+using reply = http::reply;

 namespace alternator {

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -27,7 +27,7 @@ using chunked_content = rjson::chunked_content;
 class server {
    static constexpr size_t content_length_limit = 16*MB;
    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
-            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
+            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<http::request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

    http_server _http_server;
@@ -76,8 +76,8 @@ public:
 private:
    void set_routes(seastar::httpd::routes& r);
    // If verification succeeds, returns the authenticated user's username
-    future<std::string> verify_signature(const seastar::httpd::request&, const chunked_content&);
-    future<executor::request_return_type> handle_api_request(std::unique_ptr<request> req);
+    future<std::string> verify_signature(const seastar::http::request&, const chunked_content&);
+    future<executor::request_return_type> handle_api_request(std::unique_ptr<http::request> req);
 };

 }
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -145,19 +145,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
    auto table = find_table(_proxy, request);
    auto db = _proxy.data_dictionary();
    auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();

    if (limit < 1) {
        throw api_error::validation("Limit must be 1 or more");
    }

-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they 
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that 
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id().uuid() < t2.schema()->id().uuid();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();
+
    if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
            return t.schema()->id().uuid() == streams_start
                && cdc::get_base_table(db.real_database(), *t.schema())
                && is_alternator_keyspace(t.schema()->ks_name())
@@ -883,7 +888,7 @@ future<executor::request_return_type> executor::get_records(client_state& client

    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
-        cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
+        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

        auto result_set = builder.build();
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -94,24 +94,25 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    }
    sstring attribute_name(v->GetString(), v->GetStringLength());

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    if (enabled) {
-        if (tags_map.contains(TTL_TAG_KEY)) {
-            co_return api_error::validation("TTL is already enabled");
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
+        if (enabled) {
+            if (tags_map.contains(TTL_TAG_KEY)) {
+                throw api_error::validation("TTL is already enabled");
+            }
+            tags_map[TTL_TAG_KEY] = attribute_name;
+        } else {
+            auto i = tags_map.find(TTL_TAG_KEY);
+            if (i == tags_map.end()) {
+                throw api_error::validation("TTL is already disabled");
+            } else if (i->second != attribute_name) {
+                throw api_error::validation(format(
+                    "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
+                    attribute_name, i->second));
+            }
+            tags_map.erase(TTL_TAG_KEY);
        }
-        tags_map[TTL_TAG_KEY] = attribute_name;
-    } else {
-        auto i = tags_map.find(TTL_TAG_KEY);
-        if (i == tags_map.end()) {
-            co_return api_error::validation("TTL is already disabled");
-        } else if (i->second != attribute_name) {
-            co_return api_error::validation(format(
-                "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
-                attribute_name, i->second));
-        }
-        tags_map.erase(TTL_TAG_KEY);
-    }
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    });
+
    // Prepare the response, which contains a TimeToLiveSpecification
    // basically identical to the request's
    rjson::value response = rjson::empty_object();
@@ -557,8 +558,9 @@ static future<> scan_table_ranges(
        // Read a page, and if that times out, try again after a small sleep.
        // If we didn't catch the timeout exception, it would cause the scan
        // be aborted and only be restarted at the next scanning period.
+        // If we retry too many times, give up and restart the scan later.
        std::unique_ptr<cql3::result_set> rs;
-        for (;;) {
+        for (int retries=0; ; retries++) {
            try {
                // FIXME: which timeout?
                rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
@@ -568,7 +570,14 @@ static future<> scan_table_ranges(
                    std::current_exception());
            }
            // If we didn't break out of this loop, add a minimal sleep
-            co_await seastar::sleep(1s);
+            if (retries >= 10) {
+                // Don't get stuck forever asking the same page, maybe there's
+                // a bug or a real problem in several replicas. Give up on
+                // this scan and retry the scan from a random position later,
+                // in the next scan period.
+                throw runtime_exception("scanner thread failed after too many timeouts for the same page");
+            }
+            co_await sleep_abortable(std::chrono::seconds(1), abort_source);
        }
        auto rows = rs->rows();
        auto meta = rs->get_metadata().get_names();
--- a/api/api-doc/raft.json
+++ b/api/api-doc/raft.json
@@ -0,0 +1,43 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/raft",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/raft/trigger_snapshot/{group_id}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Triggers snapshot creation and log truncation for the given Raft group",
+               "type":"string",
+               "nickname":"trigger_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"group_id",
+                     "description":"The ID of the group which should get snapshotted",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1946,7 +1946,7 @@
         "operations":[
            {
               "method":"POST",
-               "summary":"Reset local schema",
+               "summary":"Forces this node to recalculate versions of schema objects.",
               "type":"void",
               "nickname":"reset_local_schema",
               "produces":[
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -148,7 +148,34 @@
              ]
           }
        ]
-     }
+     },
+     {
+      "path":"/task_manager/task_status_recursive/{task_id}",
+      "operations":[
+         {
+            "method":"GET",
+            "summary":"Get statuses of the task and all its descendants",
+            "type":"array",
+            "items":{
+               "type":"task_status"
+            },
+            "nickname":"get_task_status_recursively",
+            "produces":[
+               "application/json"
+            ],
+            "parameters":[
+                {
+                    "name":"task_id",
+                    "description":"The uuid of a task to query about",
+                    "required":true,
+                    "allowMultiple":false,
+                    "type":"string",
+                    "paramType":"path"
+                }
+            ]
+         }
+      ]
+    }
    ],
    "models":{
       "task_stats" :{
@@ -168,6 +195,26 @@
                  "failed"
                ],
                "description":"The state of a task"
+             },
+             "type":{
+                "type":"string",
+                "description":"The description of the task"
+             },
+             "keyspace":{
+                "type":"string",
+                "description":"The keyspace the task is working on (if applicable)"
+             },
+             "table":{
+                "type":"string",
+                "description":"The table the task is working on (if applicable)"
+             },
+             "entity":{
+                "type":"string",
+                "description":"Task-specific entity description"
+             },
+             "sequence_number":{
+                "type":"long",
+                "description":"The running sequence number of the task"
             }
           }
       },
@@ -244,6 +291,13 @@
            "progress_completed":{
               "type":"double",
               "description":"The number of units completed so far"
+            },
+            "children_ids":{
+               "type":"array",
+                "items":{
+                    "type":"string"
+                },
+               "description":"Task IDs of children of this task"
            }
          }
       }
--- a/api/api-doc/task_manager_test.json
+++ b/api/api-doc/task_manager_test.json
@@ -86,14 +86,6 @@
                        "type":"string",
                        "paramType":"query"
                    },
-                    {
-                        "name":"type",
-                        "description":"The type of the task",
-                        "required":false,
-                        "allowMultiple":false,
-                        "type":"string",
-                        "paramType":"query"
-                    },
                    {
                        "name":"entity",
                        "description":"Task-specific entity description",
--- a/api/api.cc
+++ b/api/api.cc
@@ -31,6 +31,7 @@
 #include "api/config.hh"
 #include "task_manager.hh"
 #include "task_manager_test.hh"
+#include "raft.hh"

 logging::logger apilog("api");

@@ -277,6 +278,18 @@ future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::confi

 #endif

+future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) {
+    auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
+    return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) {
+        rb->register_function(r, "raft", "The Raft API");
+        set_raft(ctx, r, raft_gr);
+    });
+}
+
+future<> unset_server_raft(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
+}
+
 void req_params::process(const request& req) {
    // Process mandatory parameters
    for (auto& [name, ent] : params) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -14,11 +14,15 @@
 #include "tasks/task_manager.hh"
 #include "seastarx.hh"

+using request = http::request;
+using reply = http::reply;
+
 namespace service {

 class load_meter;
 class storage_proxy;
 class storage_service;
+class raft_group_registry;

 } // namespace service

@@ -113,5 +117,7 @@ future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx);
 future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::config> cfg);
+future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
+future<> unset_server_raft(http_context&);

 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -334,13 +334,13 @@ void set_column_family(http_context& ctx, routes& r) {

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](replica::column_family& cf) {
-            return cf.active_memtable().partition_count();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t{0}, [](replica::column_family& cf) {
-            return cf.active_memtable().partition_count();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
        }, std::plus<>());
    });

@@ -354,25 +354,33 @@ void set_column_family(http_context& ctx, routes& r) {

    cf::get_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
-            return cf.active_memtable().region().occupancy().total_space();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
+                return active_memtable->region().occupancy().total_space();
+            }), uint64_t(0));
        }, std::plus<int64_t>());
    });

    cf::get_all_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return cf.active_memtable().region().occupancy().total_space();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
+                return active_memtable->region().occupancy().total_space();
+            }), uint64_t(0));
        }, std::plus<int64_t>());
    });

    cf::get_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
-            return cf.active_memtable().region().occupancy().used_space();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
+                return active_memtable->region().occupancy().used_space();
+            }), uint64_t(0));
        }, std::plus<int64_t>());
    });

    cf::get_all_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return cf.active_memtable().region().occupancy().used_space();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
+                return active_memtable->region().occupancy().used_space();
+            }), uint64_t(0));
        }, std::plus<int64_t>());
    });

@@ -410,7 +418,9 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_cf_all_memtables_live_data_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        warn(unimplemented::cause::INDEXES);
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return cf.active_memtable().region().occupancy().used_space();
+            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
+                return active_memtable->region().occupancy().used_space();
+            }), uint64_t(0));
        }, std::plus<int64_t>());
    });

@@ -529,13 +539,13 @@ void set_column_family(http_context& ctx, routes& r) {

    cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
-            return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
+            return cf.estimate_pending_compactions();
        }, std::plus<int64_t>());
    });

    cf::get_all_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
+            return cf.estimate_pending_compactions();
        }, std::plus<int64_t>());
    });

--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -41,7 +41,6 @@ static std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_ha
    return std::move(a);
 }

-
 void set_compaction_manager(http_context& ctx, routes& r) {
    cm::get_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
        return ctx.db.map_reduce0([](replica::database& db) {
@@ -68,9 +67,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<request> req) {
        return ctx.db.map_reduce0([&ctx](replica::database& db) {
            return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&ctx, &db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
-                return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
+                return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
                    replica::table& cf = *i.second.get();
-                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
+                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
                    return make_ready_future<>();
                }).then([&tasks] {
                    return std::move(tasks);
@@ -119,7 +118,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
            auto& cm = db.get_compaction_manager();
            return parallel_for_each(table_names, [&db, &cm, &ks_name, type] (sstring& table_name) {
                auto& t = db.find_column_family(ks_name, table_name);
-                return cm.stop_compaction(type, &t.as_table_state());
+                return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
+                    return cm.stop_compaction(type, &ts);
+                });
            });
        });
        co_return json_void();
@@ -127,7 +128,7 @@ void set_compaction_manager(http_context& ctx, routes& r) {

    cm::get_pending_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return cf.get_compaction_strategy().estimated_pending_compactions(cf.as_table_state());
+            return cf.estimate_pending_compactions();
        }, std::plus<int64_t>());
    });

--- a/api/endpoint_snitch.cc
+++ b/api/endpoint_snitch.cc
@@ -25,7 +25,7 @@ void set_endpoint_snitch(http_context& ctx, routes& r, sharded<locator::snitch_p
    httpd::endpoint_snitch_info_json::get_datacenter.set(r, [&ctx](const_req req) {
        auto& topology = ctx.shared_token_metadata.local().get()->get_topology();
        auto ep = host_or_broadcast(req);
-        if (!topology.has_endpoint(ep, locator::topology::pending::yes)) {
+        if (!topology.has_endpoint(ep)) {
            // Cannot return error here, nodetool status can race, request
            // info about just-left node and not handle it nicely
            return sstring(locator::production_snitch_base::default_dc);
@@ -36,7 +36,7 @@ void set_endpoint_snitch(http_context& ctx, routes& r, sharded<locator::snitch_p
    httpd::endpoint_snitch_info_json::get_rack.set(r, [&ctx](const_req req) {
        auto& topology = ctx.shared_token_metadata.local().get()->get_topology();
        auto ep = host_or_broadcast(req);
-        if (!topology.has_endpoint(ep, locator::topology::pending::yes)) {
+        if (!topology.has_endpoint(ep)) {
            // Cannot return error here, nodetool status can race, request
            // info about just-left node and not handle it nicely
            return sstring(locator::production_snitch_base::default_rack);
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -17,36 +17,42 @@ namespace fd = httpd::failure_detector_json;

 void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
-        std::vector<fd::endpoint_state> res;
-        for (auto i : g.get_endpoint_states()) {
-            fd::endpoint_state val;
-            val.addrs = boost::lexical_cast<std::string>(i.first);
-            val.is_alive = i.second.is_alive();
-            val.generation = i.second.get_heart_beat_state().get_generation();
-            val.version = i.second.get_heart_beat_state().get_heart_beat_version();
-            val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
-            for (auto a : i.second.get_application_state_map()) {
-                fd::version_value version_val;
-                // We return the enum index and not it's name to stay compatible to origin
-                // method that the state index are static but the name can be changed.
-                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
-                version_val.value = a.second.value;
-                version_val.version = a.second.version;
-                val.application_state.push(version_val);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::vector<fd::endpoint_state> res;
+            for (auto i : g.get_endpoint_states()) {
+                fd::endpoint_state val;
+                val.addrs = boost::lexical_cast<std::string>(i.first);
+                val.is_alive = i.second.is_alive();
+                val.generation = i.second.get_heart_beat_state().get_generation();
+                val.version = i.second.get_heart_beat_state().get_heart_beat_version();
+                val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
+                for (auto a : i.second.get_application_state_map()) {
+                    fd::version_value version_val;
+                    // We return the enum index and not its name to stay compatible with the origin
+                    // method, where the state indices are static but the names can be changed.
+                    version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
+                    version_val.value = a.second.value;
+                    version_val.version = a.second.version;
+                    val.application_state.push(version_val);
+                }
+                res.push_back(val);
            }
-            res.push_back(val);
-        }
-        return make_ready_future<json::json_return_type>(res);
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_up_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_up_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_down_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_down_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -54,11 +60,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
-        std::map<sstring, sstring> nodes_status;
-        for (auto& entry : g.get_endpoint_states()) {
-            nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
-        }
-        return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::map<sstring, sstring> nodes_status;
+            for (auto& entry : g.get_endpoint_states()) {
+                nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
+            }
+            return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        });
    });

    fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -67,13 +75,15 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
-        auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
-        if (!state) {
-            return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
-        }
-        std::stringstream ss;
-        g.append_endpoint_state(ss, *state);
-        return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
+            auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
+            if (!state) {
+                return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
+            }
+            std::stringstream ss;
+            g.append_endpoint_state(ss, *state);
+            return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        });
    });

    fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -6,6 +6,8 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include <seastar/core/coroutine.hh>
+
 #include "gossiper.hh"
 #include "api/api-doc/gossiper.json.hh"
 #include "gms/gossiper.hh"
@@ -14,19 +16,23 @@ namespace api {
 using namespace json;

 void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
-    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
-        auto res = g.get_unreachable_members();
-        return container_to_vec(res);
+    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        auto res = co_await g.get_unreachable_members_synchronized();
+        co_return json::json_return_type(container_to_vec(res));
    });

-    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (const_req req) {
-        auto res = g.get_live_members();
-        return container_to_vec(res);
+
+    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) {
+        return g.get_live_members_synchronized().then([] (auto res) {
+            return make_ready_future<json::json_return_type>(container_to_vec(res));
+        });
    });

-    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
-        gms::inet_address ep(req.param["addr"]);
-        return g.get_endpoint_downtime(ep);
+    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        gms::inet_address ep(req->param["addr"]);
+        // synchronize unreachable_members on all shards
+        co_await g.get_unreachable_members_synchronized();
+        co_return g.get_endpoint_downtime(ep);
    });

    httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<request> req) {
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include <seastar/core/coroutine.hh>
+
+#include "api/api.hh"
+#include "api/api-doc/raft.json.hh"
+
+#include "service/raft/raft_group_registry.hh"
+
+using namespace seastar::httpd;
+
+extern logging::logger apilog;
+
+namespace api {
+
+namespace r = httpd::raft_json;
+using namespace json;
+
+void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) {
+    // Route handler: trigger a snapshot of the Raft group named by the "group_id" request parameter.
+    r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
+        raft::group_id gid{utils::UUID{req->param["group_id"]}};
+        // "timeout" query parameter, in seconds; an empty value selects the 60s default.
+        std::chrono::seconds timeout_dur{60};
+        if (const auto timeout_str = req->get_query_param("timeout"); !timeout_str.empty()) {
+            const auto secs = std::stoll(timeout_str);
+            if (secs <= 0) {
+                throw std::runtime_error{"Timeout must be a positive number."};
+            }
+            timeout_dur = std::chrono::seconds{secs};
+        }
+
+        // Flipped to true by any invocation in which find_server() locates the group.
+        std::atomic<bool> found_srv{false};
+        co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& local_gr) -> future<> {
+            auto* const srv = local_gr.find_server(gid);
+            if (srv == nullptr) {
+                co_return;
+            }
+
+            found_srv.store(true);
+            abort_on_expiry deadline(lowres_clock::now() + timeout_dur);
+            apilog.info("Triggering Raft group {} snapshot", gid);
+            const auto created = co_await srv->trigger_snapshot(&deadline.abort_source());
+            if (!created) {
+                apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
+            } else {
+                apilog.info("New snapshot for Raft group {} created", gid);
+            }
+        });
+
+        if (!found_srv.load()) {
+            throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)};
+        }
+        co_return json_void{};
+    });
+}
+
+void unset_raft(http_context&, httpd::routes& r) { // counterpart of set_raft(); the http_context argument is unused
+    r::trigger_snapshot.unset(r); // `r::` names the raft_json namespace alias, the argument `r` is the routes object
+}
+
+}
+
--- a/api/raft.hh
+++ b/api/raft.hh
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2023-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "api_init.hh"
+
+namespace api {
+
+void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr); // installs the trigger_snapshot handler on `r`
+void unset_raft(http_context& ctx, httpd::routes& r); // removes the trigger_snapshot handler from `r`
+
+}
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -243,17 +243,21 @@ future<json::json_return_type> set_tables_autocompaction(http_context& ctx, cons
 }

 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
-    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -275,17 +279,21 @@ void unset_transport_controller(http_context& ctx, routes& r) {
 }

 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
-    ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+    ss::stop_rpc_server.set(r, [&ctx, &ctl] (std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+    ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -754,7 +762,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
            co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
                auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
                co_await run_on_existing_tables("upgrade_sstables", db, keyspace, table_infos, [&] (replica::table& t) {
-                    return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, t.as_table_state(), exclude_current_version);
+                    return t.parallel_foreach_table_state([&] (compaction::table_state& ts) {
+                        return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, ts, exclude_current_version);
+                    });
                });
            });
        } catch (...) {
@@ -1039,14 +1049,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::reset_local_schema.set(r, [&sys_ks](std::unique_ptr<request> req) {
+    ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        // FIXME: We should truncate schema tables if more than one node in the cluster.
-        auto& sp = service::get_storage_proxy();
-        auto& fs = sp.local().features();
        apilog.info("reset_local_schema");
-        return db::schema_tables::recalculate_schema_version(sys_ks, sp, fs).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        co_await ss.local().reload_schema();
+        co_return json_void();
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
@@ -1520,10 +1527,15 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_

        try {
            auto opt_stats = co_await db.map_reduce0([&] (replica::database& db) {
-                return map_reduce(column_families, [&] (sstring cfname) {
+                return map_reduce(column_families, [&] (sstring cfname) -> future<std::optional<sstables::compaction_stats>> {
                    auto& cm = db.get_compaction_manager();
                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(cf.as_table_state(), opts);
+                    sstables::compaction_stats stats{};
+                    co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
+                        auto r = co_await cm.perform_sstable_scrub(ts, opts);
+                        stats += r.value_or(sstables::compaction_stats{});
+                    });
+                    co_return stats;
                }, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
            }, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
            if (opt_stats && opt_stats->validation_errors) {
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -30,17 +30,32 @@ inline bool filter_tasks(tasks::task_manager::task_ptr task, std::unordered_map<

 struct full_task_status {
     tasks::task_manager::task::status task_status;
+    std::string type; // filled from task->type() by retrieve_status(); make_status() reports it as res.type
     tasks::task_manager::task::progress progress;
     std::string module;
     tasks::task_id parent_id;
     tasks::is_abortable abortable;
+    std::vector<std::string> children_ids; // string ids of task->get_children(), collected by retrieve_status()
 };

 struct task_stats {
-    task_stats(tasks::task_manager::task_ptr task) : task_id(task->id().to_sstring()), state(task->get_status().state) {}
+    task_stats(tasks::task_manager::task_ptr task) // copies the members below out of `task` at construction time
+        : task_id(task->id().to_sstring())
+        , state(task->get_status().state)
+        , type(task->type()) // read from the task itself, not from its status
+        , keyspace(task->get_status().keyspace)
+        , table(task->get_status().table)
+        , entity(task->get_status().entity)
+        , sequence_number(task->get_status().sequence_number)
+    { }

     sstring task_id;
     tasks::task_manager::task_state state;
+    std::string type; // task->type()
+    std::string keyspace; // task->get_status().keyspace
+    std::string table; // task->get_status().table
+    std::string entity; // task->get_status().entity
+    uint64_t sequence_number; // task->get_status().sequence_number
 };

 tm::task_status make_status(full_task_status status) {
@@ -52,7 +67,7 @@ tm::task_status make_status(full_task_status status) {

    tm::task_status res{};
    res.id = status.task_status.id.to_sstring();
-    res.type = status.task_status.type;
+    res.type = status.type;
    res.state = status.task_status.state;
    res.is_abortable = bool(status.abortable);
    res.start_time = st;
@@ -67,22 +82,29 @@ tm::task_status make_status(full_task_status status) {
    res.progress_units = status.task_status.progress_units;
    res.progress_total = status.progress.total;
    res.progress_completed = status.progress.completed;
+    res.children_ids = std::move(status.children_ids);
    return res;
 }

-future<json::json_return_type> retrieve_status(tasks::task_manager::foreign_task_ptr task) {
+future<full_task_status> retrieve_status(const tasks::task_manager::foreign_task_ptr& task) { // collects a full_task_status for `task`; a null task yields bad_param_exception("Task not found")
     if (task.get() == nullptr) {
         co_return coroutine::return_exception(httpd::bad_param_exception("Task not found"));
     }
     auto progress = co_await task->get_progress();
     full_task_status s;
     s.task_status = task->get_status();
+    s.type = task->type(); // the type is read from the task, not from its status
     s.parent_id = task->get_parent_id();
     s.abortable = task->is_abortable();
     s.module = task->get_module_name();
     s.progress.completed = progress.completed;
     s.progress.total = progress.total;
-    co_return make_status(s);
+    std::vector<std::string> ct(task->get_children().size()); // parentheses select the size constructor explicitly: one empty slot per child
+    boost::transform(task->get_children(), ct.begin(), [] (const auto& child) { // overwrite each slot with that child's id
+        return child->id().to_sstring();
+    });
+    s.children_ids = std::move(ct);
+    co_return s; // callers turn this into the JSON object with make_status()
 }

 void set_task_manager(http_context& ctx, routes& r) {
@@ -134,7 +156,8 @@ void set_task_manager(http_context& ctx, routes& r) {
            }
            co_return std::move(task);
        }));
-        co_return co_await retrieve_status(std::move(task));
+        auto s = co_await retrieve_status(task);
+        co_return make_status(s);
    });

    tm::abort_task.set(r, [&ctx] (std::unique_ptr<request> req) -> future<json::json_return_type> {
@@ -153,11 +176,55 @@ void set_task_manager(http_context& ctx, routes& r) {
        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
            return task->done().then_wrapped([task] (auto f) {
                task->unregister_task();
-                f.get();
+                // done() is called only because we want the task to be complete before getting its status.
+                // The future should be ignored here as the result does not matter.
+                f.ignore_ready_future();
                return make_foreign(task);
            });
        }));
-        co_return co_await retrieve_status(std::move(task));
+        auto s = co_await retrieve_status(task);
+        co_return make_status(s);
+    });
+
+    tm::get_task_status_recursively.set(r, [&ctx] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        auto& _ctx = ctx;
+        auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
+        std::queue<tasks::task_manager::foreign_task_ptr> q;
+        utils::chunked_vector<full_task_status> res;
+
+        // Get requested task.
+        auto task = co_await tasks::task_manager::invoke_on_task(_ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
+            auto state = task->get_status().state;
+            if (state == tasks::task_manager::task_state::done || state == tasks::task_manager::task_state::failed) {
+                task->unregister_task();
+            }
+            co_return task;
+        }));
+
+        // Push children's statuses in BFS order.
+        q.push(co_await task.copy());   // Task cannot be moved since we need it to be alive during whole loop execution.
+        while (!q.empty()) {
+            auto& current = q.front();
+            res.push_back(co_await retrieve_status(current));
+            for (auto& child: current->get_children()) {
+                q.push(co_await child.copy());
+            }
+            q.pop();
+        }
+
+        std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
+            auto s = std::move(os);
+            auto res = std::move(r);
+            co_await s.write("[");
+            std::string delim = "";
+            for (auto& status: res) {
+                co_await s.write(std::exchange(delim, ", "));
+                co_await formatter::write(s, make_status(status));
+            }
+            co_await s.write("]");
+            co_await s.close();
+        };
+        co_return f;
    });
 }

--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -47,8 +47,6 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
        std::string keyspace = it != req->query_parameters.end() ? it->second : "";
        it = req->query_parameters.find("table");
        std::string table = it != req->query_parameters.end() ? it->second : "";
-        it = req->query_parameters.find("type");
-        std::string type = it != req->query_parameters.end() ? it->second : "";
        it = req->query_parameters.find("entity");
        std::string entity = it != req->query_parameters.end() ? it->second : "";
        it = req->query_parameters.find("parent_id");
@@ -60,7 +58,7 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
        }

        auto module = tms.local().find_module("test");
-        id = co_await module->make_task<tasks::test_task_impl>(shard, id, keyspace, table, type, entity, data);
+        id = co_await module->make_task<tasks::test_task_impl>(shard, id, keyspace, table, entity, data);
        co_await tms.invoke_on(shard, [id] (tasks::task_manager& tm) {
            auto it = tm.get_all_tasks().find(id);
            if (it != tm.get_all_tasks().end()) {
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -66,36 +66,48 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
    set_view(_data);
 }

-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+// Based on Cassandra's resolveRegular function:
+//  - https://github.com/apache/cassandra/blob/e4f31b73c21b04966269c5ac2d3bd2562e5f6c63/src/java/org/apache/cassandra/db/rows/Cells.java#L79-L119
+//
+// Note: the ordering algorithm for cells is the same as for rows,
+// except that the cell value is used to break a tie in case all other attributes are equal.
+// See compare_row_marker_for_merge.
 std::strong_ordering
 compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    // Largest write timestamp wins.
    if (left.timestamp() != right.timestamp()) {
        return left.timestamp() <=> right.timestamp();
    }
+    // Tombstones always win reconciliation with live cells of the same timestamp
    if (left.is_live() != right.is_live()) {
        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
    }
    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
+        // Prefer expiring cells (which will become tombstones at some future date) over live cells.
+        // See https://issues.apache.org/jira/browse/CASSANDRA-14592
        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
        }
+        // If both are expiring, choose the cell with the latest expiry or derived write time.
        if (left.is_live_and_has_ttl()) {
+            // Prefer cell with latest expiry
            if (left.expiry() != right.expiry()) {
                return left.expiry() <=> right.expiry();
-            } else {
-                // prefer the cell that was written later,
-                // so it survives longer after it expires, until purged.
+            } else if (right.ttl() != left.ttl()) {
+                // The cell write time is derived by (expiry - ttl).
+                // Prefer the cell that was written later,
+                // so it survives longer after it expires, until purged,
+                // as it becomes purgeable gc_grace_seconds after it was written.
+                //
+                // Note that this is an extension to Cassandra's algorithm
+                // which stops at the expiration time and, if equal,
+                // moves on to compare the cell values.
                return right.ttl() <=> left.ttl();
            }
        }
+        // The cell with the largest value wins, if all other attributes of the cells are identical.
+        // This is quite arbitrary, but still required to break the tie in a deterministic way.
+        return compare_unsigned(left.value(), right.value());
    } else {
        // Both are deleted

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -229,6 +229,8 @@ future<authenticated_user> password_authenticator::authenticate(
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
        } catch (exceptions::authentication_exception& e) {
            std::throw_with_nested(e);
+        } catch (exceptions::unavailable_exception& e) {
+            std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
        } catch (...) {
            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -55,6 +55,7 @@ future<bool> default_role_row_satisfies(
        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
+                internal_distributed_query_state(),
                {meta::DEFAULT_SUPERUSER_NAME},
                cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -457,7 +457,9 @@ public:
            _begin.ptr->size = _size;
            _current = nullptr;
            _size = 0;
-            return managed_bytes(std::exchange(_begin.ptr, {}));
+            auto begin_ptr = _begin.ptr;
+            _begin.ptr = nullptr;
+            return managed_bytes(begin_ptr);
        } else {
            return managed_bytes();
        }
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -572,7 +572,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
        _read_context.cache().on_mispopulate();
        return;
    }
-    auto rt_opt = _rt_assembler.flush(*_schema, position_in_partition::after_key(cr.key()));
+    auto rt_opt = _rt_assembler.flush(*_schema, position_in_partition::after_key(*_schema, cr.key()));
    clogger.trace("csm {}: populate({})", fmt::ptr(this), clustering_row::printer(*_schema, cr));
    _lsa_manager.run_in_update_section_with_allocator([this, &cr, &rt_opt] {
        mutation_partition& mp = _snp->version()->partition();
@@ -634,8 +634,8 @@ inline
 void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", fmt::ptr(this), _next_row.position(), _next_row_in_range);
    _next_row.touch();
-    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
-    auto upper_bound = _next_row_in_range ? next_lower_bound : _upper_bound;
+    auto next_lower_bound = position_in_partition_view::after_key(table_schema(), _next_row.position());
+    auto upper_bound = _next_row_in_range ? next_lower_bound.view : _upper_bound;
    if (_snp->range_tombstones(_lower_bound, upper_bound, [&] (range_tombstone rts) {
        add_range_tombstone_to_buffer(std::move(rts));
        return stop_iteration(_lower_bound_changed && is_buffer_full());
@@ -774,14 +774,14 @@ void cache_flat_mutation_reader::move_to_next_entry() {
    }
 }

-void cache_flat_mutation_reader::flush_tombstones(position_in_partition_view pos, bool end_of_range) {
+void cache_flat_mutation_reader::flush_tombstones(position_in_partition_view pos_, bool end_of_range) {
    // Ensure position is appropriate for range tombstone bound
-    pos = position_in_partition_view::after_key(pos);
-    clogger.trace("csm {}: flush_tombstones({}) end_of_range: {}", fmt::ptr(this), pos, end_of_range);
-    _rt_gen.flush(pos, [this] (range_tombstone_change&& rtc) {
+    auto pos = position_in_partition_view::after_key(*_schema, pos_);
+    clogger.trace("csm {}: flush_tombstones({}) end_of_range: {}", fmt::ptr(this), pos.view, end_of_range);
+    _rt_gen.flush(pos.view, [this] (range_tombstone_change&& rtc) {
        add_to_buffer(std::move(rtc), source::cache);
    }, end_of_range);
-    if (auto rtc_opt = _rt_merger.flush(pos, end_of_range)) {
+    if (auto rtc_opt = _rt_merger.flush(pos.view, end_of_range)) {
        do_add_to_buffer(std::move(*rtc_opt));
    }
 }
@@ -832,7 +832,7 @@ inline
 void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment_v2&& mf) {
    clogger.trace("csm {}: add_clustering_row_to_buffer({})", fmt::ptr(this), mutation_fragment_v2::printer(*_schema, mf));
    auto& row = mf.as_clustering_row();
-    auto new_lower_bound = position_in_partition::after_key(row.key());
+    auto new_lower_bound = position_in_partition::after_key(*_schema, row.key());
    push_mutation_fragment(std::move(mf));
    _lower_bound = std::move(new_lower_bound);
    _lower_bound_changed = true;
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -25,6 +25,7 @@
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
 #include "utils/UUID_gen.hh"
+#include "utils/error_injection.hh"

 #include "cdc/generation.hh"
 #include "cdc/cdc_options.hh"
@@ -44,8 +45,16 @@ static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const

 namespace cdc {

-extern const api::timestamp_clock::duration generation_leeway =
-    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+api::timestamp_clock::duration get_generation_leeway() {
+    static thread_local auto generation_leeway =
+            std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+    utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] {
+        generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5));
+    });
+
+    return generation_leeway;
+}

 static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    i = net::hton(i);
@@ -160,18 +169,18 @@ bool token_range_description::operator==(const token_range_description& o) const
        && sharding_ignore_msb == o.sharding_ignore_msb;
 }

-topology_description::topology_description(std::vector<token_range_description> entries)
+topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
    : _entries(std::move(entries)) {}

 bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const& {
+const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-std::vector<token_range_description>&& topology_description::entries() && {
+utils::chunked_vector<token_range_description>&& topology_description::entries() && {
    return std::move(_entries);
 }

@@ -263,7 +272,7 @@ public:
    topology_description generate() const {
        const auto tokens = get_tokens();

-        std::vector<token_range_description> vnode_descriptions;
+        utils::chunked_vector<token_range_description> vnode_descriptions;
        vnode_descriptions.reserve(tokens.size());

        vnode_descriptions.push_back(
@@ -331,7 +340,7 @@ future<cdc::generation_id> generation_service::make_new_generation(const std::un
    auto new_generation_timestamp = [add_delay, ring_delay = _cfg.ring_delay] {
        auto ts = db_clock::now();
        if (add_delay && ring_delay != 0ms) {
-            ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
+            ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
        }
        return ts;
    };
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -46,6 +46,8 @@ namespace gms {

 namespace cdc {

+api::timestamp_clock::duration get_generation_leeway();
+
 class stream_id final {
    bytes _value;
 public:
@@ -94,13 +96,13 @@ struct token_range_description {
 * in the `_entries` vector. See the comment above `token_range_description` for explanation.
 */
 class topology_description {
-    std::vector<token_range_description> _entries;
+    utils::chunked_vector<token_range_description> _entries;
 public:
-    topology_description(std::vector<token_range_description> entries);
+    topology_description(utils::chunked_vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const&;
-    std::vector<token_range_description>&& entries() &&;
+    const utils::chunked_vector<token_range_description>& entries() const&;
+    utils::chunked_vector<token_range_description>&& entries() &&;
 };

 /**
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -605,7 +605,7 @@ private:
 public:
    collection_iterator(managed_bytes_view_opt v = {})
        : _v(v.value_or(managed_bytes_view{}))
-        , _rem(_v.empty() ? 0 : read_collection_size(_v, cql_serialization_format::internal()))
+        , _rem(_v.empty() ? 0 : read_collection_size(_v))
    {
        if (_rem != 0) {
            parse();
@@ -650,8 +650,8 @@ template<>
 void collection_iterator<std::pair<managed_bytes_view, managed_bytes_view>>::parse() {
    assert(_rem > 0);
    _next = _v;
-    auto k = read_collection_value(_next, cql_serialization_format::internal());
-    auto v = read_collection_value(_next, cql_serialization_format::internal());
+    auto k = read_collection_value(_next);
+    auto v = read_collection_value(_next);
    _current = std::make_pair(k, v);
 }

@@ -659,7 +659,7 @@ template<>
 void collection_iterator<managed_bytes_view>::parse() {
    assert(_rem > 0);
    _next = _v;
-    auto k = read_collection_value(_next, cql_serialization_format::internal());
+    auto k = read_collection_value(_next);
    _current = k;
 }

@@ -728,7 +728,7 @@ auto make_maybe_back_inserter(Container& c, const abstract_type& type, collectio
 static size_t collection_size(const managed_bytes_opt& bo) {
    if (bo) {
        managed_bytes_view mbv(*bo);
-        return read_collection_size(mbv, cql_serialization_format::internal());
+        return read_collection_size(mbv);
    }
    return 0;
 }
@@ -750,7 +750,7 @@ static managed_bytes merge(const collection_type_impl& ctype, const managed_byte
    // note order: set_union, when finding doubles, use value from first1 (j here). So
    // since this is next, it has prio
    std::set_union(j, e, i, e, make_maybe_back_inserter(res, *type, collection_iterator<managed_bytes_view>(deleted)), cmp);
-    return map_type_impl::serialize_partially_deserialized_form_fragmented(res, cql_serialization_format::internal());
+    return map_type_impl::serialize_partially_deserialized_form_fragmented(res);
 }
 static managed_bytes merge(const set_type_impl& ctype, const managed_bytes_opt& prev, const managed_bytes_opt& next, const managed_bytes_opt& deleted) {
    std::vector<managed_bytes_view> res;
@@ -761,7 +761,7 @@ static managed_bytes merge(const set_type_impl& ctype, const managed_bytes_opt&
    };
    collection_iterator<managed_bytes_view> e, i(prev), j(next), d(deleted);
    std::set_union(j, e, i, e, make_maybe_back_inserter(res, *type, d), cmp);
-    return set_type_impl::serialize_partially_deserialized_form_fragmented(res, cql_serialization_format::internal());
+    return set_type_impl::serialize_partially_deserialized_form_fragmented(res);
 }
 static managed_bytes merge(const user_type_impl& type, const managed_bytes_opt& prev, const managed_bytes_opt& next, const managed_bytes_opt& deleted) {
    std::vector<managed_bytes_view_opt> res(type.size());
@@ -812,15 +812,14 @@ static managed_bytes_opt get_preimage_col_value(const column_definition& cdef, c
            // flatten set
            [&] (const set_type_impl& type) {
                auto v = pirow->get_view(cdef.name_as_text());
-                auto f = cql_serialization_format::internal();
-                auto n = read_collection_size(v, f);
+                auto n = read_collection_size(v);
                std::vector<managed_bytes> tmp;
                tmp.reserve(n);
                while (n--) {
-                    tmp.emplace_back(read_collection_value(v, f)); // key
-                    read_collection_value(v, f); // value. ignore.
+                    tmp.emplace_back(read_collection_value(v)); // key
+                    read_collection_value(v); // value. ignore.
                }
-                return set_type_impl::serialize_partially_deserialized_form_fragmented({tmp.begin(), tmp.end()}, f);
+                return set_type_impl::serialize_partially_deserialized_form_fragmented({tmp.begin(), tmp.end()});
            },
            [&] (const abstract_type& o) -> managed_bytes {
                return pirow->get_blob_fragmented(cdef.name_as_text());
@@ -1122,7 +1121,7 @@ struct process_row_visitor {
                visit_collection(v);

                managed_bytes_opt added_keys = v._added_keys.empty() ? std::nullopt :
-                    std::optional{set_type_impl::serialize_partially_deserialized_form_fragmented(v._added_keys, cql_serialization_format::internal())};
+                    std::optional{set_type_impl::serialize_partially_deserialized_form_fragmented(v._added_keys)};

                return {
                    v._is_column_delete,
@@ -1178,7 +1177,7 @@ struct process_row_visitor {
                visit_collection(v);

                managed_bytes_opt added_cells = v._added_cells.empty() ? std::nullopt :
-                    std::optional{map_type_impl::serialize_partially_deserialized_form_fragmented(v._added_cells, cql_serialization_format::internal())};
+                    std::optional{map_type_impl::serialize_partially_deserialized_form_fragmented(v._added_cells)};

                return {
                    v._is_column_delete,
@@ -1198,7 +1197,7 @@ struct process_row_visitor {
        // then we deserialize again when merging images below
        managed_bytes_opt deleted_elements = std::nullopt;
        if (!deleted_keys.empty()) {
-            deleted_elements = set_type_impl::serialize_partially_deserialized_form_fragmented(deleted_keys, cql_serialization_format::internal());
+            deleted_elements = set_type_impl::serialize_partially_deserialized_form_fragmented(deleted_keys);
        }

        // delta
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -15,10 +15,6 @@

 extern logging::logger cdc_log;

-namespace cdc {
-    extern const api::timestamp_clock::duration generation_leeway;
-} // namespace cdc
-
 static api::timestamp_type to_ts(db_clock::time_point tp) {
    // This assumes that timestamp_clock and db_clock have the same epochs.
    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -40,7 +36,7 @@ static cdc::stream_id get_stream(

 // non-static for testing
 cdc::stream_id get_stream(
-        const std::vector<cdc::token_range_description>& entries,
+        const utils::chunked_vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
        on_internal_error(cdc_log, "get_stream: entries empty");
@@ -73,7 +69,7 @@ bool cdc::metadata::streams_available() const {

 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
-    if (ts > now + generation_leeway.count()) {
+    if (ts > now + get_generation_leeway().count()) {
        throw exceptions::invalid_request_exception(format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -86,27 +82,43 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // Nothing protects us from that until we start using transactions for generation switching.
    }

-    auto it = gen_used_at(now);
-    if (it == _gens.end()) {
+    auto it = gen_used_at(now - get_generation_leeway().count());
+
+    if (it != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        it = _gens.erase(_gens.begin(), it);
+    }
+
+    if (ts <= now - get_generation_leeway().count()) {
+        // We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
+        // happens iff one of the following is true:
+        // - the write is to no generation,
+        // - the write is to a generation older than the generation under `it`,
+        // - the write is to the generation under `it` and that generation is not the current generation.
+        // Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
+        // but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
+        // are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
+        // the generation under `it` because that generation was operating at `now - generation_leeway`.
+        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
+        if (it == _gens.end() || ts < it->first || is_previous_gen) {
+            throw exceptions::invalid_request_exception(format(
+                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
+                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                    " consistency properties.\n"
+                    "We *do* allow sending writes into the near past, but our ability to do that is limited."
+                    " Are you using client-side timestamps? Make sure your clocks are well-synchronized"
+                    " with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        }
+    }
+
+    it = _gens.begin();
+    if (it == _gens.end() || ts < it->first) {
        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
    }

-    // Garbage-collect generations that will no longer be used.
-    it = _gens.erase(_gens.begin(), it);
-
-    if (it->first > ts) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream from an earlier generation than the currently used one."
-                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                " consistency properties (write timestamp: {}, current generation started at: {})",
-                format_timestamp(ts), format_timestamp(it->first)));
-    }
-
-    // With `generation_leeway` we allow sending writes to the near future. It might happen
-    // that `ts` doesn't belong to the current generation ("current" according to our clock),
-    // but to the next generation. Adjust for this case:
+    // Find the generation operating at `ts`.
    {
        auto next_it = std::next(it);
        while (next_it != _gens.end() && next_it->first <= ts) {
@@ -147,8 +159,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
        ++it;
    }

-    // Check if some new generation has already superseded this one.
-    return it != _gens.end() && it->first <= api::new_timestamp();
+    // Check if the generation is obsolete.
+    return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
 }

 bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -157,7 +169,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
    }

    auto now = api::new_timestamp();
-    auto it = gen_used_at(now);
+    auto it = gen_used_at(now - get_generation_leeway().count());

    if (it != _gens.end()) {
        // Garbage-collect generations that will no longer be used.
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -42,7 +42,9 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
 public:
-    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
+     * it is older than the generation operating at `now - get_generation_leeway()`.
+     */
    bool known_or_obsolete(db_clock::time_point) const;

    /* Are there streams available. I.e. valid for time == now. If this is false, any writes to 
@@ -54,8 +56,9 @@ public:
     *
     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
-     * by the `cdc::generation_leeway` constant.
+     * yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
+     * the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
+     * `get_generation_leeway()`.
     */
    stream_id get_stream(api::timestamp_type ts, dht::token tok);

--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -21,8 +21,6 @@ class row_tombstone;

 class collection_mutation;

-class cql_serialization_format;
-
 // An auxiliary struct used to (de)construct collection_mutations.
 // Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
 // (tombstone and cells) inside the mutation easily.
@@ -131,4 +129,4 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
-bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
+bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -12,11 +12,11 @@

 class schema;
 class partition_key;
-class clustering_row;
 struct atomic_cell_view;
 struct tombstone;

 namespace db::view {
+struct clustering_or_static_row;
 struct view_key_and_action;
 }

@@ -118,7 +118,7 @@ class collection_column_computation final : public column_computation {
    using collection_kv = std::pair<bytes_view, atomic_cell_view>;
    void operate_on_collection_entries(
            std::invocable<collection_kv*, collection_kv*, tombstone> auto&& old_and_new_row_func, const schema& schema,
-            const partition_key& key, const clustering_row& update, const std::optional<clustering_row>& existing) const;
+            const partition_key& key, const db::view::clustering_or_static_row& update, const std::optional<db::view::clustering_or_static_row>& existing) const;

 public:
    static collection_column_computation for_keys(const bytes& collection_name) {
@@ -141,5 +141,6 @@ public:
        return true;
    }

-    std::vector<db::view::view_key_and_action> compute_values_with_action(const schema& schema, const partition_key& key, const clustering_row& row, const std::optional<clustering_row>& existing) const;
+    std::vector<db::view::view_key_and_action> compute_values_with_action(const schema& schema, const partition_key& key,
+            const db::view::clustering_or_static_row& row, const std::optional<db::view::clustering_or_static_row>& existing) const;
 };
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -168,7 +168,7 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
 }

 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
-        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
+        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
    auto timestamp = table_s.min_memtable_timestamp();
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -179,6 +179,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
            hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
        }
        if (sst->filter_has_key(*hk)) {
+            bloom_filter_checks++;
            timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
        }
    }
@@ -414,9 +415,12 @@ private:

 class formatted_sstables_list {
    bool _include_origin = true;
-    std::vector<sstring> _ssts;
+    std::vector<std::string> _ssts;
 public:
    formatted_sstables_list() = default;
+    void reserve(size_t n) {
+        _ssts.reserve(n);
+    }
    explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
        _ssts.reserve(ssts.size());
        for (const auto& sst : ssts) {
@@ -435,9 +439,7 @@ public:
 };

 std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
-    os << "[";
-    os << boost::algorithm::join(lst._ssts, ",");
-    os << "]";
+    fmt::print(os, "[{}]", fmt::join(lst._ssts, ","));
    return os;
 }

@@ -462,6 +464,8 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    double _estimated_droppable_tombstone_ratio = 0;
+    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
    bool _can_split_large_partition = false;
@@ -518,7 +522,7 @@ protected:
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
-                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
+                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
    }

    void setup_new_sstable(shared_sstable& sst) {
@@ -573,14 +577,15 @@ protected:
        return bool(_sstable_set);
    }

-    compaction_writer create_gc_compaction_writer() const {
+    compaction_writer create_gc_compaction_writer(run_id gc_run) const {
        auto sst = _sstable_creator(this_shard_id());

        auto&& priority = _io_priority;
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
-        cfg.run_identifier = _run_identifier;
+        cfg.run_identifier = gc_run;
        cfg.monitor = monitor.get();
+        uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
-        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
+        auto writer = sst->get_writer(*schema(), estimated_partitions, cfg, get_encoding_stats(), priority); // use the tombstone-ratio-scaled estimate computed above
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }
@@ -600,8 +605,14 @@ protected:
    // When compaction finishes, all the temporary sstables generated here will be deleted and removed
    // from table's sstable set.
    compacted_fragments_writer get_gc_compacted_fragments_writer() {
+        // Because the temporary sstable run can overlap with the non-gc sstable run created by
+        // get_compacted_fragments_writer(), we have to use a different run_id. The gc run id
+        // (gc_run) is created here because:
+        // 1. it can be shared across all sstables created by this writer
+        // 2. it is optional, as the gc writer is not always used
+        auto gc_run = run_id::create_random_id();
        return compacted_fragments_writer(*this,
-             [this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
+             [this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
             [this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
             _stop_request_observable);
    }
@@ -618,8 +629,8 @@ protected:
        return _used_garbage_collected_sstables;
    }

-    bool enable_garbage_collected_sstable_writer() const noexcept {
-        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept {
+        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
    }
 public:
    compaction& operator=(const compaction&) = delete;
@@ -641,9 +652,11 @@ private:
    future<> setup() {
        auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
        formatted_sstables_list formatted_msg;
+        formatted_msg.reserve(_sstables.size());
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

+        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
@@ -678,12 +691,16 @@ private:
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            _rp = std::max(_rp, sst_stats.position);
+            auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state());
+            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
        }
        log_info("{} {}", report_start_desc(), formatted_msg);
        if (ssts->all()->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->all()->size(), _sstables.size());
        }
+        // _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
+        _estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->all()->size());

        _compacting = std::move(ssts);

@@ -757,6 +774,7 @@ protected:
                .ended_at = ended_at,
                .start_size = _start_size,
                .end_size = _end_size,
+                .bloom_filter_checks = _bloom_filter_checks,
            },
        };

@@ -776,7 +794,7 @@ protected:
        log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(),
                _input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
-                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
+                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_start_size, duration),
                _cdata.total_partitions, _cdata.total_keys_written);

        return ret;
@@ -797,7 +815,7 @@ private:
            };
        }
        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
        };
    }

@@ -907,7 +925,7 @@ void compacted_fragments_writer::split_large_partition() {
    // will result in current fragment storing an inclusive end bound for last pos, and the
    // next fragment storing an exclusive start bound for last pos. This is very important
    // for not losing information on the range tombstone.
-    auto after_last_pos = position_in_partition::after_key(_current_partition.last_pos.key());
+    auto after_last_pos = position_in_partition::after_key(*_c.schema(), _current_partition.last_pos.key());
    if (_current_partition.current_emitted_tombstone) {
        auto rtc = range_tombstone_change(after_last_pos, tombstone{});
        _c.log_debug("Closing active tombstone {} with {} for partition {}", _current_partition.current_emitted_tombstone, rtc, *_current_partition.dk);
@@ -997,51 +1015,6 @@ void compacted_fragments_writer::consume_end_of_stream() {
    }
 }

-class reshape_compaction : public compaction {
-public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
-        : compaction(table_s, std::move(descriptor), cdata) {
-    }
-
-    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
-    }
-
-    flat_mutation_reader_v2 make_sstable_reader() const override {
-        return _compacting->make_local_shard_sstable_reader(_schema,
-                _permit,
-                query::full_partition_range,
-                _schema->full_slice(),
-                _io_priority,
-                tracing::trace_state_ptr(),
-                ::streamed_mutation::forwarding::no,
-                ::mutation_reader::forwarding::no,
-                default_read_monitor_generator());
-    }
-
-    std::string_view report_start_desc() const override {
-        return "Reshaping";
-    }
-
-    std::string_view report_finish_desc() const override {
-        return "Reshaped";
-    }
-
-    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto sst = _sstable_creator(this_shard_id());
-        setup_new_sstable(sst);
-
-        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
-    }
-
-    virtual void stop_sstable_writer(compaction_writer* writer) override {
-        if (writer) {
-            finish_new_sstable(writer);
-        }
-    }
-};
-
 class regular_compaction : public compaction {
    // keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
    mutable compaction_read_monitor_generator _monitor_generator;
@@ -1151,12 +1124,13 @@ private:
    }

    void update_pending_ranges() {
-        if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
+        auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
+        if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
            return;
        }
        // Releases reference to sstables compacted by this compaction or another, both of which belongs
        // to the same column family
-        for (auto& pending_replacement : _cdata.pending_replacements) {
+        for (auto& pending_replacement : pending_replacements) {
            for (auto& sst : pending_replacement.removed) {
                // Set may not contain sstable to be removed because this compaction may have started
                // before the creation of that sstable.
@@ -1170,7 +1144,70 @@ private:
            }
        }
        _selector.emplace(_sstable_set->make_incremental_selector());
-        _cdata.pending_replacements.clear();
+    }
+};
+
+class reshape_compaction : public regular_compaction {
+private:
+    bool has_sstable_replacer() const noexcept {
+        return bool(_replacer);
+    }
+public:
+    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+            : regular_compaction(table_s, std::move(descriptor), cdata) {
+    }
+
+    virtual sstables::sstable_set make_sstable_set_for_input() const override {
+        return sstables::make_partitioned_sstable_set(_schema, false);
+    }
+
+    // Enable incremental compaction whenever the strategy specifies a max output size (e.g. LCS), provided an sstable replacer is available.
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
+        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
+    }
+
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
+                default_read_monitor_generator());
+    }
+
+    std::string_view report_start_desc() const override {
+        return "Reshaping";
+    }
+
+    std::string_view report_finish_desc() const override {
+        return "Reshaped";
+    }
+
+    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
+        auto sst = _sstable_creator(this_shard_id());
+        setup_new_sstable(sst);
+
+        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
+    }
+
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (writer) {
+            if (has_sstable_replacer()) {
+                regular_compaction::stop_sstable_writer(writer);
+            } else {
+                finish_new_sstable(writer);
+            }
+        }
+    }
+
+    virtual void on_end_of_compaction() override {
+        if (has_sstable_replacer()) {
+            regular_compaction::on_end_of_compaction();
+        }
    }
 };

@@ -1590,7 +1627,7 @@ private:
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
+                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
    }
 public:
    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -92,12 +92,15 @@ struct compaction_stats {
    uint64_t start_size = 0;
    uint64_t end_size = 0;
    uint64_t validation_errors = 0;
+    // Bloom filter checks during max purgeable calculation
+    uint64_t bloom_filter_checks = 0;

    compaction_stats& operator+=(const compaction_stats& r) {
        ended_at = std::max(ended_at, r.ended_at);
        start_size += r.start_size;
        end_size += r.end_size;
        validation_errors += r.validation_errors;
+        bloom_filter_checks += r.bloom_filter_checks;
        return *this;
    }
    friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -7,15 +7,19 @@
 */

 #include "compaction_manager.hh"
+#include "compaction_descriptor.hh"
 #include "compaction_strategy.hh"
 #include "compaction_backlog_manager.hh"
 #include "sstables/sstables.hh"
 #include "sstables/sstables_manager.hh"
+#include <memory>
 #include <seastar/core/metrics.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/switch_to.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include "sstables/exceptions.hh"
+#include "sstables/sstable_directory.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "utils/fb_utilities.hh"
 #include "utils/UUID_gen.hh"
@@ -76,6 +80,23 @@ public:
            _compacting.erase(sst);
        }
    }
+
+    class update_me : public compaction_manager::task::on_replacement {
+        compacting_sstable_registration& _registration;
+        public:
+            update_me(compacting_sstable_registration& registration)
+                : _registration{registration} {}
+            void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
+                _registration.release_compacting(sstables);
+            }
+            void on_addition(const std::vector<sstables::shared_sstable>& sstables) override {
+                _registration.register_compacting(sstables);
+            }
+    };
+
+    auto update_on_sstable_replacement() {
+        return update_me(*this);
+    }
 };

 sstables::compaction_data compaction_manager::create_compaction_data() {
@@ -277,7 +298,7 @@ compaction_manager::task::task(compaction_manager& mgr, compaction::table_state*
    , _description(std::move(desc))
 {}

-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task, throw_if_stopping do_throw_if_stopping) {
    _tasks.push_back(task);
    auto unregister_task = defer([this, task] {
        _tasks.remove(task);
@@ -290,6 +311,9 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
        co_return res;
    } catch (sstables::compaction_stopped_exception& e) {
        cmlog.info("{}: stopped, reason: {}", *task, e.what());
+        if (do_throw_if_stopping) {
+            throw;
+        }
    } catch (sstables::compaction_aborted_exception& e) {
        cmlog.error("{}: aborted, reason: {}", *task, e.what());
        _stats.errors++;
@@ -308,14 +332,14 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
    co_return std::nullopt;
 }

-future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
+future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge) {
    if (!descriptor.sstables.size()) {
        // if there is nothing to compact, just return.
        co_return sstables::compaction_result{};
    }

    bool should_update_history = this->should_update_history(descriptor.options.type());
-    sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, std::move(release_exhausted), std::move(can_purge));
+    sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, on_replace, std::move(can_purge));

    if (should_update_history) {
        co_await update_history(*_compacting_table, res, cdata);
@@ -323,8 +347,11 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables_a

    co_return res;
 }
-future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
+
+future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge,
+                                                                               sstables::offstrategy offstrategy) {
    compaction::table_state& t = *_compacting_table;
+
    if (can_purge) {
        descriptor.enable_garbage_collection(t.main_sstable_set());
    }
@@ -332,15 +359,26 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables(s
        auto sst = t.make_sstable();
        return sst;
    };
-    descriptor.replacer = [this, &t, release_exhausted] (sstables::compaction_completion_desc desc) {
+
+    descriptor.replacer = [this, &t, &on_replace, offstrategy] (sstables::compaction_completion_desc desc) {
        t.get_compaction_strategy().notify_completion(desc.old_sstables, desc.new_sstables);
        _cm.propagate_replacement(t, desc.old_sstables, desc.new_sstables);
+        // on_replace updates the compacting registration with the old and new
+        // sstables, while on_compaction_completion() removes the old sstables
+        // from the table's sstable set and adds the new ones to it.
+        //
+        // Regular compaction excludes sstables in the sstable set that are
+        // currently being compacted. So, to ensure a compaction has exclusive
+        // access to an sstable, the sstable must be covered by the
+        // registration while it is added to / removed from the sstable set.
+        // Otherwise, regular compaction could pick it up in the time window
+        // where the sstable:
+        // - is still in the main set, and
+        // - is not registered as being compacted.
+        on_replace.on_addition(desc.new_sstables);
        auto old_sstables = desc.old_sstables;
-        t.on_compaction_completion(std::move(desc), sstables::offstrategy::no).get();
-        // Calls compaction manager's task for this compaction to release reference to exhausted SSTables.
-        if (release_exhausted) {
-            release_exhausted(old_sstables);
-        }
+        t.on_compaction_completion(std::move(desc), offstrategy).get();
+        on_replace.on_removal(old_sstables);
    };

    co_return co_await sstables::compact_sstables(std::move(descriptor), cdata, t);
@@ -385,9 +423,7 @@ protected:
        sstables::compaction_strategy cs = t->get_compaction_strategy();
        sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, _cm.get_candidates(*t));
        auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
-        auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-            compacting.release_compacting(exhausted_sstables);
-        };
+        auto on_replace = compacting.update_on_sstable_replacement();
        setup_new_compaction(descriptor.run_identifier);

        cmlog.info0("User initiated compaction started on behalf of {}.{}", t->schema()->ks_name(), t->schema()->cf_name());
@@ -399,7 +435,7 @@ protected:
        // the exclusive lock can be freed to let regular compaction run in parallel to major
        lock_holder.return_all();

-        co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted));
+        co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace);

        finish_compaction();

@@ -446,12 +482,12 @@ protected:
    }
 };

-future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job) {
+future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping) {
    if (_state != state::enabled) {
        return make_ready_future<>();
    }

-    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job))).discard_result();
+    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job)), do_throw_if_stopping).discard_result();
 }

 future<> compaction_manager::update_static_shares(float static_shares) {
@@ -645,6 +681,7 @@ sstables::compaction_stopped_exception compaction_manager::task::make_compaction

 compaction_manager::compaction_manager(config cfg, abort_source& as)
    : _cfg(std::move(cfg))
+    , _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), [this] () -> float {
        _last_backlog = backlog();
        auto b = _last_backlog / available_memory();
@@ -679,6 +716,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as)

 compaction_manager::compaction_manager()
    : _cfg(config{ .available_memory = 1 })
+    , _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, [] () -> float { return 1.0; }))
    , _backlog_manager(_compaction_controller)
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
@@ -736,38 +774,46 @@ void compaction_manager::register_metrics() {
 void compaction_manager::enable() {
    assert(_state == state::none || _state == state::disabled);
    _state = state::enabled;
-    _compaction_submission_timer.arm(periodic_compaction_submission_interval());
-    postponed_compactions_reevaluation();
+    _compaction_submission_timer.arm_periodic(periodic_compaction_submission_interval());
+    _waiting_reevalution = postponed_compactions_reevaluation();
 }

 std::function<void()> compaction_manager::compaction_submission_callback() {
    return [this] () mutable {
        for (auto& e: _compaction_state) {
-            submit(*e.first);
+            postpone_compaction_for_table(e.first);
        }
+        reevaluate_postponed_compactions();
    };
 }

-void compaction_manager::postponed_compactions_reevaluation() {
-    _waiting_reevalution = repeat([this] {
-        return _postponed_reevaluation.wait().then([this] {
-            if (_state != state::enabled) {
-                _postponed.clear();
-                return stop_iteration::yes;
-            }
-            auto postponed = std::move(_postponed);
-            try {
-                for (auto& t : postponed) {
-                    auto s = t->schema();
-                    cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
-                    submit(*t);
+future<> compaction_manager::postponed_compactions_reevaluation() {
+     while (true) {
+        co_await _postponed_reevaluation.when();
+        if (_state != state::enabled) {
+            _postponed.clear();
+            co_return;
+        }
+        // A table_state being reevaluated can re-insert itself into the postponed list, which is why
+        // the list to be processed is first moved into a local.
+        auto postponed = std::exchange(_postponed, {});
+        try {
+            for (auto it = postponed.begin(); it != postponed.end();) {
+                compaction::table_state* t = *it;
+                it = postponed.erase(it);
+                // Skip reevaluation of a table_state that became invalid because it was removed after being postponed.
+                if (!_compaction_state.contains(t)) {
+                    continue;
                }
-            } catch (...) {
-                _postponed = std::move(postponed);
+                auto s = t->schema();
+                cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
+                submit(*t);
+                co_await coroutine::maybe_yield();
            }
-            return stop_iteration::no;
-        });
-    });
+        } catch (...) {
+            _postponed.insert(postponed.begin(), postponed.end());
+        }
+    }
 }

 void compaction_manager::reevaluate_postponed_compactions() noexcept {
@@ -972,9 +1018,7 @@ protected:
            }
            auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
            auto weight_r = compaction_weight_registration(&_cm, weight);
-            auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-                compacting.release_compacting(exhausted_sstables);
-            };
+            auto on_replace = compacting.update_on_sstable_replacement();
            cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}.{}",
                fmt::ptr(this), descriptor.sstables.size(), weight, t.schema()->ks_name(), t.schema()->cf_name());

@@ -983,7 +1027,7 @@ protected:

            try {
                bool should_update_history = this->should_update_history(descriptor.options.type());
-                sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, std::move(release_exhausted));
+                sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
                finish_compaction();
                if (should_update_history) {
                    // update_history can take a long time compared to
@@ -1024,7 +1068,7 @@ void compaction_manager::submit(compaction::table_state& t) {

    // OK to drop future.
    // waited via task->stop()
-    (void)perform_task(make_shared<regular_compaction_task>(*this, t));
+    (void)perform_task(make_shared<regular_compaction_task>(*this, t)).then_wrapped([] (auto f) { f.ignore_ready_future(); });
 }

 bool compaction_manager::can_perform_regular_compaction(compaction::table_state& t) {
@@ -1045,7 +1089,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction::
            desc.sstables
            | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::run_identifier))).size();
    };
-    const auto threshold = std::max(schema->max_compaction_threshold(), 32);
+    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
    auto count = num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}.{}: {} <= {}",
@@ -1083,54 +1127,40 @@ public:
    }
 private:
    future<> run_offstrategy_compaction(sstables::compaction_data& cdata) {
-        // This procedure will reshape sstables in maintenance set until it's ready for
-        // integration into main set.
-        // It may require N reshape rounds before the set satisfies the strategy invariant.
-        // This procedure also only updates maintenance set at the end, on success.
-        // Otherwise, some overlapping could be introduced in the set after each reshape
-        // round, progressively degrading read amplification until integration happens.
-        // The drawback of this approach is the 2x space requirement as the old sstables
-        // will only be deleted at the end. The impact of this space requirement is reduced
-        // by the fact that off-strategy is serialized across all tables, meaning that the
-        // actual requirement is the size of the largest table's maintenance set.
+        // Incrementally reshape the SSTables in the maintenance set. The output of each
+        // reshape round is merged into the main set. In the common case the off-strategy
+        // input is mostly disjoint (e.g. repair-based node ops), so all the input is
+        // reshaped in a single round. The incremental approach is space efficient
+        // (avoiding a 100% overhead), as input SSTables in the maintenance set are
+        // incrementally replaced by output SSTables in the main set.

        compaction::table_state& t = *_compacting_table;
-        const auto& maintenance_sstables = t.maintenance_sstable_set();

        // Filter out sstables that require view building, to avoid a race between off-strategy
        // and view building. Refs: #11882
-        const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all()
-                | boost::adaptors::filtered([] (const sstables::shared_sstable& sst) {
-            return !sst->requires_view_building();
-        }));
-        std::vector<sstables::shared_sstable> reshape_candidates = old_sstables;
-        std::vector<sstables::shared_sstable> sstables_to_remove;
-        std::unordered_set<sstables::shared_sstable> new_unused_sstables;
-
-        auto cleanup_new_unused_sstables_on_failure = defer([&new_unused_sstables] {
-            for (auto& sst : new_unused_sstables) {
-                sst->mark_for_deletion();
-            }
-        });
+        auto get_reshape_candidates = [&t] () {
+            auto maintenance_ssts = t.maintenance_sstable_set().all();
+            return boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_ssts
+                | boost::adaptors::filtered([](const sstables::shared_sstable& sst) {
+                        return !sst->requires_view_building();
+                }));
+        };

        auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
            auto& iop = service::get_local_streaming_priority(); // run reshape in maintenance mode
-            auto desc = t.get_compaction_strategy().get_reshaping_job(reshape_candidates, t.schema(), iop, sstables::reshape_mode::strict);
+            auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), iop, sstables::reshape_mode::strict);
            return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
        };

        std::exception_ptr err;
        while (auto desc = get_next_job()) {
-            desc->creator = [this, &new_unused_sstables, &t] (shard_id dummy) {
-                auto sst = t.make_sstable();
-                new_unused_sstables.insert(sst);
-                return sst;
-            };
-            auto input = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(desc->sstables);
+            auto compacting = compacting_sstable_registration(_cm, desc->sstables);
+            auto on_replace = compacting.update_on_sstable_replacement();

-            sstables::compaction_result ret;
            try {
-                ret = co_await sstables::compact_sstables(std::move(*desc), cdata, t);
+                sstables::compaction_result _ = co_await compact_sstables(std::move(*desc), _compaction_data, on_replace,
+                                                                          compaction_manager::can_purge_tombstones::no,
+                                                                          sstables::offstrategy::yes);
            } catch (sstables::compaction_stopped_exception&) {
                // If off-strategy compaction stopped on user request, let's not discard the partial work.
                // Therefore, both un-reshaped and reshaped data will be integrated into main set, allowing
@@ -1139,41 +1169,20 @@ private:
                break;
            }
            _performed = true;
-
-            // update list of reshape candidates without input but with output added to it
-            auto it = boost::remove_if(reshape_candidates, [&] (auto& s) { return input.contains(s); });
-            reshape_candidates.erase(it, reshape_candidates.end());
-            std::move(ret.new_sstables.begin(), ret.new_sstables.end(), std::back_inserter(reshape_candidates));
-
-            // If compaction strategy is unable to reshape input data in a single round, it may happen that a SSTable A
-            // created in round 1 will be compacted in a next round producing SSTable B. As SSTable A is no longer needed,
-            // it can be removed immediately. Let's remove all such SSTables immediately to reduce off-strategy space requirement.
-            // Input SSTables from maintenance set can only be removed later, as SSTable sets are only updated on completion.
-            auto can_remove_now = [&] (const sstables::shared_sstable& s) { return new_unused_sstables.contains(s); };
-            for (auto&& sst : input) {
-                if (can_remove_now(sst)) {
-                    co_await sst->unlink();
-                    new_unused_sstables.erase(std::move(sst));
-                } else {
-                    sstables_to_remove.push_back(std::move(sst));
-                }
-            }
        }

-        // at this moment reshape_candidates contains a set of sstables ready for integration into main set
-        auto completion_desc = sstables::compaction_completion_desc{
-            .old_sstables = std::move(old_sstables),
-            .new_sstables = std::move(reshape_candidates)
-        };
-        co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
-
-        cleanup_new_unused_sstables_on_failure.cancel();
-        // By marking input sstables for deletion instead, the ones which require view building will stay in the staging
-        // directory until they're moved to the main dir when the time comes. Also, that allows view building to resume
-        // on restart if there's a crash midway.
-        for (auto& sst : sstables_to_remove) {
-            sst->mark_for_deletion();
+        // Some sstables may remain in the maintenance set, either because they didn't require reshape
+        // or because the user aborted off-strategy. Integrate them into the main set as-is, so that
+        // they become candidates for regular compaction. We cannot hold them in the maintenance set
+        // forever, as that causes read and space amplification issues.
+        if (auto sstables = get_reshape_candidates(); sstables.size()) {
+            auto completion_desc = sstables::compaction_completion_desc{
+                .old_sstables = sstables, // removes from maintenance set.
+                .new_sstables = sstables, // adds into main set.
+            };
+            co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
        }
+
        if (err) {
            co_await coroutine::return_exception_ptr(std::move(err));
        }
@@ -1196,9 +1205,11 @@ protected:
            std::exception_ptr ex;
            try {
                compaction::table_state& t = *_compacting_table;
-                auto maintenance_sstables = t.maintenance_sstable_set().all();
-                cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
-                        t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                {
+                    auto maintenance_sstables = t.maintenance_sstable_set().all();
+                    cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
+                               t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                }
                co_await run_offstrategy_compaction(_compaction_data);
                finish_compaction();
                cmlog.info("Done with off-strategy compaction for {}.{}", t.schema()->ks_name(), t.schema()->cf_name());
@@ -1271,9 +1282,7 @@ private:
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, _options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
-            auto release_exhausted = [this] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-                _compacting.release_compacting(exhausted_sstables);
-            };
+            auto on_replace = _compacting.update_on_sstable_replacement();

            setup_new_compaction(descriptor.run_identifier);

@@ -1282,7 +1291,7 @@ private:

            std::exception_ptr ex;
            try {
-                sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted), _can_purge);
+                sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace, _can_purge);
                finish_compaction();
                _cm.reevaluate_postponed_compactions();
                co_return res;  // done with current sstable
@@ -1439,14 +1448,26 @@ protected:
        co_return std::nullopt;
    }
 private:
-    // Releases reference to cleaned files such that respective used disk space can be freed.
-    void release_exhausted(std::vector<sstables::shared_sstable> exhausted_sstables) {
-        _compacting.release_compacting(exhausted_sstables);
-    }
-
    future<> run_cleanup_job(sstables::compaction_descriptor descriptor) {
        co_await coroutine::switch_to(_cm.compaction_sg().cpu);

+        // On removal, drops the exhausted sstables from the job descriptor and then forwards to the base registration update, so that references to cleaned files are released and their disk space can be freed.
+        using update_registration = compacting_sstable_registration::update_me;
+        class release_exhausted : public update_registration {
+            sstables::compaction_descriptor& _desc;
+        public:
+            release_exhausted(compacting_sstable_registration& registration, sstables::compaction_descriptor& desc)
+                : update_registration{registration}
+                , _desc{desc} {}
+            void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
+                auto exhausted = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(sstables);
+                std::erase_if(_desc.sstables, [&] (const sstables::shared_sstable& sst) {
+                    return exhausted.contains(sst);
+                });
+                update_registration::on_removal(sstables);
+            }
+        };
+        release_exhausted on_replace{_compacting, descriptor};
        for (;;) {
            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
            _cm.register_backlog_tracker(user_initiated);
@@ -1454,8 +1475,7 @@ private:
            std::exception_ptr ex;
            try {
                setup_new_compaction(descriptor.run_identifier);
-                co_await compact_sstables_and_update_history(descriptor, _compaction_data,
-                                          std::bind(&cleanup_sstables_compaction_task::release_exhausted, this, std::placeholders::_1));
+                co_await compact_sstables_and_update_history(descriptor, _compaction_data, on_replace);
                finish_compaction();
                _cm.reevaluate_postponed_compactions();
                co_return;  // done with current job
@@ -1582,7 +1602,7 @@ compaction_manager::compaction_state::compaction_state(table_state& t)
 }

 void compaction_manager::add(compaction::table_state& t) {
-    auto [_, inserted] = _compaction_state.insert({&t, compaction_state(t)});
+    auto [_, inserted] = _compaction_state.try_emplace(&t, t);
    if (!inserted) {
        auto s = t.schema();
        on_internal_error(cmlog, format("compaction_state for table {}.{} [{}] already exists", s->ks_name(), s->cf_name(), fmt::ptr(&t)));
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -32,6 +32,7 @@
 #include "compaction.hh"
 #include "compaction_weight_registration.hh"
 #include "compaction_backlog_manager.hh"
+#include "compaction/compaction_descriptor.hh"
 #include "strategy_control.hh"
 #include "backlog_controller.hh"
 #include "seastarx.hh"
@@ -49,6 +50,8 @@ public:
    boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
 };

+using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
+
 // Compaction manager provides facilities to submit and track compaction jobs on
 // behalf of existing tables.
 class compaction_manager {
@@ -86,7 +89,7 @@ private:
        compaction_backlog_tracker backlog_tracker;

        explicit compaction_state(table_state& t);
-        compaction_state(compaction_state&&) = default;
+        compaction_state(compaction_state&&) = delete;
        ~compaction_state();

        bool compaction_disabled() const noexcept {
@@ -137,11 +140,20 @@ public:

        virtual ~task();

+        // Callback interface notified when a compaction replaces exhausted sstables with the new set.
+        struct on_replacement {
+            virtual ~on_replacement() {}
+            // Called after the replacement completes.
+            // @param sstables the old sstables that were replaced in this replacement
+            virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
+            // Called before the replacement happens.
+            // @param sstables the new sstables to be added to the table's sstable set
+            virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
+        };
+
    protected:
        virtual future<compaction_stats_opt> do_run() = 0;

-        using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
-
        state switch_state(state new_state);

        future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -158,12 +170,10 @@ public:
        // otherwise, returns stop_iteration::no after sleep for exponential retry.
        future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);

-        // Compacts set of SSTables according to the descriptor.
-        using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
-        future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
-                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
-        future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
+        future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
+        future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
+                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstables::offstrategy offstrategy = sstables::offstrategy::no);
        future<> update_history(compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
        bool should_update_history(sstables::compaction_type ct) {
            return ct == sstables::compaction_type::Compaction;
@@ -296,10 +306,10 @@ private:
    std::function<void()> compaction_submission_callback();
    // all registered tables are reevaluated at a constant interval.
    // Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
-    timer<lowres_clock> _compaction_submission_timer = timer<lowres_clock>(compaction_submission_callback());
    static constexpr std::chrono::seconds periodic_compaction_submission_interval() { return std::chrono::seconds(3600); }

    config _cfg;
+    timer<lowres_clock> _compaction_submission_timer;
    compaction_controller _compaction_controller;
    compaction_backlog_manager _backlog_manager;
    optimized_optional<abort_source::subscription> _early_abort_subscription;
@@ -315,7 +325,7 @@ private:
    per_table_history_maps _repair_history_maps;
    tombstone_gc_state _tombstone_gc_state;
 private:
-    future<compaction_stats_opt> perform_task(shared_ptr<task>);
+    future<compaction_stats_opt> perform_task(shared_ptr<task>, throw_if_stopping do_throw_if_stopping = throw_if_stopping::no);

    future<> stop_tasks(std::vector<shared_ptr<task>> tasks, sstring reason);
    future<> update_throughput(uint32_t value_mbs);
@@ -350,7 +360,7 @@ private:
    // table still exists and compaction is not disabled for the table.
    inline bool can_proceed(compaction::table_state* t) const;

-    void postponed_compactions_reevaluation();
+    future<> postponed_compactions_reevaluation();
    void reevaluate_postponed_compactions() noexcept;
    // Postpone compaction for a table that couldn't be executed due to ongoing
    // similar-sized compaction.
@@ -460,7 +470,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping);

    class compaction_reenabler {
        compaction_manager& _cm;
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -65,7 +65,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

-uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
+uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
    return partition_estimate;
 }

@@ -409,7 +409,9 @@ public:
                l0_old_ssts.push_back(std::move(sst));
            }
        }
-        _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
+        if (l0_old_ssts.size() || l0_new_ssts.size()) {
+            _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
+        }
    }
 };

@@ -748,8 +750,8 @@ compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema
    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

-uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
+uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
+    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
 }

 reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) {
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -108,7 +108,7 @@ public:

    compaction_backlog_tracker make_backlog_tracker();

-    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr);

    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -70,7 +70,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema);

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -144,6 +144,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;

+    leveled_manifest::logger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
+
    for (auto& sst : input) {
        auto sst_level = sst->get_sstable_level();
        if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -200,10 +202,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

        auto [disjoint, overlapping_sstables] = is_disjoint(level_info[level], tolerance(level));
        if (!disjoint) {
-            auto ideal_level = ideal_level_for_input(input, max_sstable_size_in_bytes);
-            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so compacting everything on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
-            // Unfortunately no good limit to limit input size to max_sstables for LCS major
-            compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
+            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
+            compaction_descriptor desc(std::move(level_info[level]), iop, level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -229,6 +229,9 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
 }

 unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
+    if (!max_sstable_size) {
+        return 1;
+    }
    auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
        double inv_log_fanout = 1.0f / std::log(fanout);
        return log(x) * inv_log_fanout;
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -6,6 +6,7 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include "sstables/sstables.hh"
 #include "size_tiered_compaction_strategy.hh"

 #include <boost/range/adaptor/transformed.hpp>
--- a/compaction/size_tiered_compaction_strategy.hh
+++ b/compaction/size_tiered_compaction_strategy.hh
@@ -10,7 +10,7 @@

 #include "compaction_strategy_impl.hh"
 #include "compaction.hh"
-#include "sstables/sstables.hh"
+#include "sstables/shared_sstable.hh"
 #include <boost/algorithm/cxx11/any_of.hpp>

 class size_tiered_backlog_tracker;
--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -10,15 +10,15 @@
 #pragma once

 #include "schema_fwd.hh"
-#include "sstables/sstable_set.hh"
-#include "sstables/sstables_manager.hh"
 #include "compaction_descriptor.hh"

 class reader_permit;
 class compaction_backlog_tracker;

 namespace sstables {
+class sstable_set;
 class compaction_strategy;
+class sstables_manager;
 struct sstable_writer_config;
 }

--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -100,16 +100,27 @@ public:
    };
 };

-uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
-        // Not enough information, we assume the worst
-        return partition_estimate / max_data_segregation_window_count;
-    }
-    const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
-    const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
-    const auto window_size = get_window_size(_options);
+uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) {
+    // If not enough information, we assume the worst
+    auto estimated_window_count = max_data_segregation_window_count;
+    auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
+    bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
+    auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
+        const auto window_size = get_window_size(_options);
+        return (max_window + (window_size - 1) - min_window) / window_size;
+    };

-    auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
+    if (!min_and_max_ts_available && default_ttl.count()) {
+        auto min_window = get_window_for(_options, timestamp_type(0));
+        auto max_window = get_window_for(_options, timestamp_type(default_ttl.count()));
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    } else if (min_and_max_ts_available) {
+        auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
+        auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    }

    return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
 }
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -15,7 +15,7 @@
 #include "size_tiered_compaction_strategy.hh"
 #include "timestamp.hh"
 #include "exceptions/exceptions.hh"
-#include "sstables/sstables.hh"
+#include "sstables/shared_sstable.hh"
 #include "service/priority_manager.hh"

 namespace sstables {
@@ -157,7 +157,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) override;

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) override;

--- a/compound.hh
+++ b/compound.hh
@@ -16,7 +16,6 @@
 #include <boost/range/adaptor/transformed.hpp>
 #include "utils/serialization.hh"
 #include <seastar/util/backtrace.hh>
-#include "cql_serialization_format.hh"

 enum class allow_prefixes { no, yes };

@@ -280,7 +279,7 @@ public:
        }
        for (size_t i = 0; i != values.size(); ++i) {
            //FIXME: is it safe to assume internal serialization-format format?
-            _types[i]->validate(values[i], cql_serialization_format::internal());
+            _types[i]->validate(values[i]);
        }
    }
    bool equal(managed_bytes_view v1, managed_bytes_view v2) const {
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -560,7 +560,7 @@ public:
            auto marker = it->second;
            ++it;
            if (it != e && marker != composite::eoc::none) {
-                throw runtime_exception(format("non-zero component divider found ({:d}) mid", format("0x{:02x}", composite::eoc_type(marker) & 0xff)));
+                throw runtime_exception(format("non-zero component divider found ({:#02x}) mid", composite::eoc_type(marker) & 0xff));
            }
        }
        return ret;
--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -117,6 +117,8 @@ struct date_type_impl final : public concrete_type<db_clock::time_point> {

 using timestamp_date_base_class = concrete_type<db_clock::time_point>;

+sstring timestamp_to_json_string(const timestamp_date_base_class& t, const bytes_view& bv);
+
 struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    timeuuid_type_impl();
    static utils::UUID from_sstring(sstring_view s);
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -65,6 +65,13 @@ commitlog_sync_period_in_ms: 10000
 # is reasonable.
 commitlog_segment_size_in_mb: 32

+# The size of the individual schema commitlog file segments.
+#
+# The segment size puts a limit on the mutation size that can be
+# written at once, and some schema mutation writes are much larger
+# than average.
+schema_commitlog_segment_size_in_mb: 32
+
 # seed_provider class_name is saved for future use.
 # A seed address is mandatory.
 seed_provider:
@@ -448,20 +455,20 @@ commitlog_total_space_in_mb: -1
 #    internode_encryption: none
 #    certificate: conf/scylla.crt
 #    keyfile: conf/scylla.key
-#    truststore: <none, use system trust>
-#    certficate_revocation_list: <none>
+#    truststore: <not set, use system trust>
+#    certficate_revocation_list: <not set>
 #    require_client_auth: False
-#    priority_string: <none, use default>
+#    priority_string: <not set, use default>

 # enable or disable client/server encryption.
 # client_encryption_options:
 #    enabled: false
 #    certificate: conf/scylla.crt
 #    keyfile: conf/scylla.key
-#    truststore: <none, use system trust>
-#    certficate_revocation_list: <none>
+#    truststore: <not set, use system trust>
+#    certficate_revocation_list: <not set>
 #    require_client_auth: False
-#    priority_string: <none, use default>
+#    priority_string: <not set, use default>

 # internode_compression controls whether traffic between nodes is
 # compressed.
@@ -553,4 +560,16 @@ murmur3_partitioner_ignore_msb_bits: 12
 # WARNING: It's unsafe to set this to false if the node previously booted
 # with the schema commit log enabled. In such case, some schema changes
 # may be lost if the node was not cleanly stopped.
-force_schema_commit_log: true
+force_schema_commit_log: true
+
+# Use Raft to consistently manage schema information in the cluster.
+# Refer to https://docs.scylladb.com/master/architecture/raft.html for more details.
+# The 'Handling Failures' section is especially important.
+#
+# Once enabled in a cluster, this cannot be turned off.
+# If you want to bootstrap a new cluster without Raft, make sure to set this to `false`
+# before starting your nodes for the first time.
+#
+# A cluster not using Raft can be 'upgraded' to use Raft. Refer to the aforementioned
+# documentation, section 'Enabling Raft in ScyllaDB 5.2 and further', for the procedure.
+consistent_cluster_management: true
--- a/configure.py
+++ b/configure.py
@@ -44,16 +44,12 @@ distro_extra_cflags = ''
 distro_extra_ldflags = ''
 distro_extra_cmake_args = []
 employ_ld_trickery = True
-has_wasmtime = False
-use_wasmtime_as_library = False

 # distro-specific setup
 def distro_setup_nix():
-    global os_ids, employ_ld_trickery, has_wasmtime, use_wasmtime_as_library
+    global os_ids, employ_ld_trickery
    os_ids = ['linux']
    employ_ld_trickery = False
-    has_wasmtime = True
-    use_wasmtime_as_library = True

 if os.environ.get('NIX_CC'):
        distro_setup_nix()
@@ -200,7 +196,7 @@ def linker_flags(compiler):


 def maybe_static(flag, libs):
-    if flag and not args.static:
+    if flag:
        libs = '-Wl,-Bstatic {} -Wl,-Bdynamic'.format(libs)
    return libs

@@ -413,6 +409,7 @@ scylla_tests = set([
    'test/boost/limiting_data_source_test',
    'test/boost/linearizing_input_stream_test',
    'test/boost/loading_cache_test',
+    'test/boost/locator_topology_test',
    'test/boost/log_heap_test',
    'test/boost/estimated_histogram_test',
    'test/boost/summary_test',
@@ -483,6 +480,8 @@ scylla_tests = set([
    'test/boost/virtual_reader_test',
    'test/boost/virtual_table_mutation_source_test',
    'test/boost/virtual_table_test',
+    'test/boost/wasm_test',
+    'test/boost/wasm_alloc_test',
    'test/boost/bptree_test',
    'test/boost/btree_test',
    'test/boost/radix_tree_test',
@@ -574,13 +573,6 @@ all_artifacts = apps | tests | other
 arg_parser = argparse.ArgumentParser('Configure scylla')
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
                        help='Output build-file name (by default build.ninja)')
-arg_parser.add_argument('--static', dest='static', action='store_const', default='',
-                        const='-static',
-                        help='Static link (useful for running on hosts outside the build environment')
-arg_parser.add_argument('--pie', dest='pie', action='store_true',
-                        help='Build position-independent executable (PIE)')
-arg_parser.add_argument('--so', dest='so', action='store_true',
-                        help='Build shared object (SO) instead of executable')
 arg_parser.add_argument('--mode', action='append', choices=list(modes.keys()), dest='selected_modes',
                        help="Build modes to generate ninja files for. The available build modes are:\n{}".format("; ".join(["{} - {}".format(m, cfg['description']) for m, cfg in modes.items()])))
 arg_parser.add_argument('--with', dest='artifacts', action='append', default=[],
@@ -671,7 +663,7 @@ scylla_core = (['message/messaging_service.cc',
                'replica/distributed_loader.cc',
                'replica/memtable.cc',
                'replica/exceptions.cc',
-                'dirty_memory_manager.cc',
+                'replica/dirty_memory_manager.cc',
                'absl-flat_hash_map.cc',
                'atomic_cell.cc',
                'caching_options.cc',
@@ -706,6 +698,7 @@ scylla_core = (['message/messaging_service.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
+                'utils/on_internal_error.cc',
                'converting_mutation_partition_applier.cc',
                'readers/combined.cc',
                'readers/multishard.cc',
@@ -825,6 +818,7 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/statements/detach_service_level_statement.cc',
                'cql3/statements/list_service_level_statement.cc',
                'cql3/statements/list_service_level_attachments_statement.cc',
+                'cql3/statements/describe_statement.cc',
                'cql3/update_parameters.cc',
                'cql3/util.cc',
                'cql3/ut_name.cc',
@@ -946,6 +940,7 @@ scylla_core = (['message/messaging_service.cc',
                'locator/ec2_multi_region_snitch.cc',
                'locator/gce_snitch.cc',
                'locator/topology.cc',
+                'locator/util.cc',
                'service/client_state.cc',
                'service/storage_service.cc',
                'service/misc_services.cc',
@@ -975,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
+                'repair/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1036,6 +1032,7 @@ scylla_core = (['message/messaging_service.cc',
                'service/raft/raft_group0_client.cc',
                'service/broadcast_tables/experimental/lang.cc',
                'tasks/task_manager.cc',
+                'rust/wasmtime_bindings/src/lib.rs',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')] \
                  + scylla_raft_core
               )
@@ -1082,6 +1079,8 @@ api = ['api/api.cc',
       Json2Code('api/api-doc/error_injection.json'),
       'api/authorization_cache.cc',
       Json2Code('api/api-doc/authorization_cache.json'),
+       'api/raft.cc',
+       Json2Code('api/api-doc/raft.json'),
       ]

 alternator = [
@@ -1153,10 +1152,6 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/experimental/broadcast_tables_lang.idl.hh',
        ]

-rusts = [
-    'rust/inc/src/lib.rs',
-]
-
 headers = find_headers('.', excluded_dirs=['idl', 'build', 'seastar', '.git'])

 scylla_tests_generic_dependencies = [
@@ -1180,7 +1175,7 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci

 scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc']

-scylla_tools = ['tools/scylla-types.cc', 'tools/scylla-sstable.cc', 'tools/schema_loader.cc', 'tools/utils.cc']
+scylla_tools = ['tools/scylla-types.cc', 'tools/scylla-sstable.cc', 'tools/schema_loader.cc', 'tools/utils.cc', 'tools/lua_sstable_consumer.cc']

 deps = {
    'scylla': idls + ['main.cc'] + scylla_core + api + alternator + redis + scylla_tools,
@@ -1278,7 +1273,7 @@ deps['test/boost/bytes_ostream_test'] = [
    "test/lib/log.cc",
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
-deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
+deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc', 'utils/on_internal_error.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
@@ -1309,7 +1304,7 @@ deps['test/boost/exceptions_fallback_test'] = ['test/boost/exceptions_fallback_t

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
 deps['test/boost/schema_loader_test'] += ['tools/schema_loader.cc']
-deps['test/boost/rust_test'] += rusts
+deps['test/boost/rust_test'] += ['rust/inc/src/lib.rs']

 deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
 deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
@@ -1375,7 +1370,7 @@ warnings = [w

 warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])

-def clang_inline_threshold():
+def get_clang_inline_threshold():
    if args.clang_inline_threshold != -1:
        return args.clang_inline_threshold
    elif platform.machine() == 'aarch64':
@@ -1396,7 +1391,7 @@ for mode in modes:

 optimization_flags = [
    '--param inline-unit-growth=300', # gcc
-    f'-mllvm -inline-threshold={clang_inline_threshold()}',  # clang
+    f'-mllvm -inline-threshold={get_clang_inline_threshold()}',  # clang
    # clang generates 16-byte loads that break store-to-load forwarding
    # gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
    '-fno-slp-vectorize',
@@ -1410,15 +1405,6 @@ if flag_supported(flag='-Wstack-usage=4096', compiler=args.cxx):
    for mode in modes:
        modes[mode]['cxxflags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='

-if not has_wasmtime:
-    has_wasmtime = os.path.isfile('/usr/lib64/libwasmtime.a') and os.path.isdir('/usr/local/include/wasmtime')
-
-if has_wasmtime:
-    for mode in modes:
-        modes[mode]['cxxflags'] += ' -DSCYLLA_ENABLE_WASMTIME'
-else:
-    print("wasmtime not found - WASM support will not be enabled in this build")
-
 linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
@@ -1429,16 +1415,6 @@ perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
 # debug info from the libraries we static link with
 regular_link_rule = 'link' if args.debuginfo else 'link_stripped'

-if args.so:
-    args.pie = '-shared'
-    args.fpie = '-fpic'
-elif args.pie:
-    args.pie = '-pie'
-    args.fpie = '-fpie'
-else:
-    args.pie = ''
-    args.fpie = ''
-
 # a list element means a list of alternative packages to consider
 # the first element becomes the HAVE_pkg define
 # a string element is a package name with no alternatives
@@ -1595,11 +1571,14 @@ args.user_ldflags = forced_ldflags + ' ' + args.user_ldflags

 args.user_cflags += f" -ffile-prefix-map={curdir}=."

-seastar_cflags = args.user_cflags
-
 if args.target != '':
-    seastar_cflags += ' -march=' + args.target
-seastar_ldflags = args.user_ldflags
+    args.user_cflags += ' -march=' + args.target
+
+for mode in modes:
+    # Those flags are passed not only to Scylla objects, but also to libraries
+    # that we compile ourselves.
+    modes[mode]['lib_cflags'] = args.user_cflags
+    modes[mode]['lib_ldflags'] = args.user_ldflags + linker_flags

 # cmake likes to separate things with semicolons
 def semicolon_separated(*flags):
@@ -1619,8 +1598,8 @@ def configure_seastar(build_dir, mode, mode_config):
        '-DCMAKE_C_COMPILER={}'.format(args.cc),
        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
        '-DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON',
-        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags).replace(' ', ';')),
-        '-DSeastar_LD_FLAGS={}'.format(semicolon_separated(seastar_ldflags, modes[mode]['cxx_ld_flags'])),
+        '-DSeastar_CXX_FLAGS=SHELL:{}'.format(mode_config['lib_cflags']),
+        '-DSeastar_LD_FLAGS={}'.format(semicolon_separated(mode_config['lib_ldflags'], mode_config['cxx_ld_flags'])),
        '-DSeastar_CXX_DIALECT=gnu++20',
        '-DSeastar_API_LEVEL=6',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
@@ -1681,52 +1660,16 @@ for mode in build_modes:
    seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
    modes[mode]['seastar_cflags'] = seastar_pc_cflags
    modes[mode]['seastar_libs'] = seastar_pc_libs
+    modes[mode]['seastar_testing_libs'] = pkg_config(pc[mode].replace('seastar.pc', 'seastar-testing.pc'), '--libs', '--static')

-def configure_abseil(build_dir, mode, mode_config):
-    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
+abseil_pkgs = [
+    'absl_raw_hash_set',
+    'absl_hash',
+]

-    abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
-    cmake_mode = mode_config['cmake_build_type']
-    abseil_cmake_args = [
-        '-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
-        '-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
-        '-DCMAKE_C_COMPILER={}'.format(args.cc),
-        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
-        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
-        '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON',
-        '-DCMAKE_CXX_STANDARD=20',
-        '-DABSL_PROPAGATE_CXX_STD=ON',
-    ] + distro_extra_cmake_args
-
-    abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + abseil_cmake_args
-
-    os.makedirs(abseil_build_dir, exist_ok=True)
-    subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
-
-abseil_libs = ['absl/' + lib for lib in [
-    'container/libabsl_hashtablez_sampler.a',
-    'container/libabsl_raw_hash_set.a',
-    'synchronization/libabsl_synchronization.a',
-    'synchronization/libabsl_graphcycles_internal.a',
-    'debugging/libabsl_stacktrace.a',
-    'debugging/libabsl_symbolize.a',
-    'debugging/libabsl_debugging_internal.a',
-    'debugging/libabsl_demangle_internal.a',
-    'time/libabsl_time.a',
-    'time/libabsl_time_zone.a',
-    'numeric/libabsl_int128.a',
-    'hash/libabsl_city.a',
-    'hash/libabsl_hash.a',
-    'hash/libabsl_low_level_hash.a',
-    'base/libabsl_malloc_internal.a',
-    'base/libabsl_spinlock_wait.a',
-    'base/libabsl_base.a',
-    'base/libabsl_raw_logging_internal.a',
-    'profiling/libabsl_exponential_biased.a',
-    'base/libabsl_throw_delegate.a']]
+pkgs += abseil_pkgs

 args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
-args.user_cflags += ' -march=' + args.target
 libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
                 ' -lstdc++fs', ' -lcrypt', ' -lcryptopp', ' -lpthread',
                 # Must link with static version of libzstd, since
@@ -1736,10 +1679,6 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
                 '-lxxhash',
                 '-ldeflate',
                ])
-if has_wasmtime:
-    print("Found wasmtime dependency, linking with libwasmtime")
-    if use_wasmtime_as_library:
-        libs += " -lwasmtime"

 if not args.staticboost:
    args.user_cflags += ' -DBOOST_TEST_DYN_LINK'
@@ -1758,7 +1697,6 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
 for pkg in pkgs:
    args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
    libs += ' ' + pkg_config(pkg, '--libs')
-args.user_cflags += ' -isystem abseil'
 user_cflags = args.user_cflags + ' -fvisibility=hidden'
 user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
 if args.staticcxx:
@@ -1780,10 +1718,6 @@ if args.ragel_exec:
 else:
    ragel_exec = "ragel"

-if not args.dist_only:
-    for mode, mode_config in build_modes.items():
-        configure_abseil(outdir, mode, mode_config)
-
 with open(buildfile, 'w') as f:
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
@@ -1836,18 +1770,24 @@ with open(buildfile, 'w') as f:
        rule unified
            command = unified/build_unified.sh --mode $mode --unified-pkg $out
        rule rust_header
-            command = cxxbridge $in > $out
+            command = cxxbridge --include rust/cxx.h --header $in > $out
            description = RUST_HEADER $out
+        rule rust_source
+            command = cxxbridge --include rust/cxx.h $in > $out
+            description = RUST_SOURCE $out
+        rule cxxbridge_header
+            command = cxxbridge --header > $out
        ''').format(**globals()))
    for mode in build_modes:
        modeval = modes[mode]
        fmt_lib = 'fmt'
        f.write(textwrap.dedent('''\
            cxx_ld_flags_{mode} = {cxx_ld_flags}
-            ld_flags_{mode} = $cxx_ld_flags_{mode}
-            cxxflags_{mode} = $cxx_ld_flags_{mode} {cxxflags} -iquote. -iquote $builddir/{mode}/gen
+            ld_flags_{mode} = $cxx_ld_flags_{mode} {lib_ldflags}
+            cxxflags_{mode} = $cxx_ld_flags_{mode} {lib_cflags} {cxxflags} -iquote. -iquote $builddir/{mode}/gen
            libs_{mode} = -l{fmt_lib}
            seastar_libs_{mode} = {seastar_libs}
+            seastar_testing_libs_{mode} = {seastar_testing_libs}
            rule cxx.{mode}
              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in
              description = CXX $out
@@ -1897,7 +1837,8 @@ with open(buildfile, 'w') as f:
              pool = console
              description = TEST {mode}
            rule rust_lib.{mode}
-              command = CARGO_HOME=build/{mode}/rust/.cargo cargo build --release --manifest-path=rust/Cargo.toml --target-dir=build/{mode}/rust -p ${{pkg}}
+              command = CARGO_BUILD_DEP_INFO_BASEDIR='.' cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
+                        && touch $out
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, test_repeat=test_repeat, test_timeout=test_timeout, **modeval))
        f.write(
@@ -1916,7 +1857,6 @@ with open(buildfile, 'w') as f:
        ragels = {}
        antlr3_grammars = set()
        rust_headers = {}
-        rust_libs = {}
        seastar_dep = '$builddir/{}/seastar/libseastar.a'.format(mode)
        seastar_testing_dep = '$builddir/{}/seastar/libseastar_testing.a'.format(mode)
        for binary in sorted(build_artifacts):
@@ -1927,9 +1867,8 @@ with open(buildfile, 'w') as f:
                    for src in srcs
                    if src.endswith('.cc')]
            objs.append('$builddir/../utils/arch/powerpc/crc32-vpmsum/crc32.S')
-            if has_wasmtime and not use_wasmtime_as_library:
-                objs.append('/usr/lib64/libwasmtime.a')
            has_thrift = False
+            has_rust = False
            for dep in deps[binary]:
                if isinstance(dep, Thrift):
                    has_thrift = True
@@ -1938,40 +1877,36 @@ with open(buildfile, 'w') as f:
                    objs += dep.objects('$builddir/' + mode + '/gen')
                if isinstance(dep, Json2Code):
                    objs += dep.objects('$builddir/' + mode + '/gen')
-                if dep.endswith('/src/lib.rs'):
-                    lib = dep.replace('/src/lib.rs', '.a').replace('rust/','lib')
-                    objs.append('$builddir/' + mode + '/rust/release/' + lib)
-            if binary.endswith('.a'):
-                f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
+                if dep.endswith('.rs'):
+                    has_rust = True
+                    idx = dep.rindex('/src/')
+                    obj = dep[:idx].replace('rust/','') + '.o'
+                    objs.append('$builddir/' + mode + '/gen/rust/' + obj)
+            if has_rust:
+                objs.append('$builddir/' + mode +'/rust-' + mode + '/librust_combined.a')
+            local_libs = '$seastar_libs_{} $libs'.format(mode)
+            if has_thrift:
+                local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
+            if binary in tests:
+                if binary in pure_boost_tests:
+                    local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
+                if binary not in tests_not_using_seastar_test_framework:
+                    local_libs += ' ' + "$seastar_testing_libs_{}".format(mode)
+                # Our code's debugging information is huge, and multiplied
+                # by many tests yields ridiculous amounts of disk space.
+                # So we strip the tests by default; The user can very
+                # quickly re-link the test unstripped by adding a "_g"
+                # to the test name, e.g., "ninja build/release/testname_g"
+                link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
+                f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+                f.write('   libs = {}\n'.format(local_libs))
+                f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+                f.write('   libs = {}\n'.format(local_libs))
            else:
-                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
-                    'abseil/' + x for x in abseil_libs
-                ]])
-                if binary in tests:
-                    local_libs = '$seastar_libs_{} $libs'.format(mode)
-                    if binary in pure_boost_tests:
-                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
-                    if binary not in tests_not_using_seastar_test_framework:
-                        pc_path = pc[mode].replace('seastar.pc', 'seastar-testing.pc')
-                        local_libs += ' ' + pkg_config(pc_path, '--libs', '--static')
-                    if has_thrift:
-                        local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
-                    # Our code's debugging information is huge, and multiplied
-                    # by many tests yields ridiculous amounts of disk space.
-                    # So we strip the tests by default; The user can very
-                    # quickly re-link the test unstripped by adding a "_g"
-                    # to the test name, e.g., "ninja build/release/testname_g"
-                    link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
-                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
-                    f.write('   libs = {}\n'.format(local_libs))
-                    f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
-                    f.write('   libs = {}\n'.format(local_libs))
-                else:
-                    f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep))
-                    if has_thrift:
-                        f.write('   libs =  {} {} $seastar_libs_{} $libs\n'.format(thrift_libs, maybe_static(args.staticboost, '-lboost_system'), mode))
-                    f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
-                    f.write(f'build $builddir/{mode}/{binary}.debug: phony $builddir/{mode}/{binary}.stripped\n')
+                f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep))
+                f.write('   libs = {}\n'.format(local_libs))
+                f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
+                f.write(f'build $builddir/{mode}/{binary}.debug: phony $builddir/{mode}/{binary}.stripped\n')
            for src in srcs:
                if src.endswith('.cc'):
                    obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
@@ -1988,11 +1923,10 @@ with open(buildfile, 'w') as f:
                    thrifts.add(src)
                elif src.endswith('.g'):
                    antlr3_grammars.add(src)
-                elif src.endswith('/src/lib.rs'):
-                    hh = '$builddir/' + mode + '/gen/' + src.replace('/src/lib.rs', '.hh')
+                elif src.endswith('.rs'):
+                    idx = src.rindex('/src/')
+                    hh = '$builddir/' + mode + '/gen/' + src[:idx] + '.hh'
                    rust_headers[hh] = src
-                    staticlib = src.replace('rust/', '$builddir/' + mode + '/rust/release/lib').replace('/src/lib.rs', '.a')
-                    rust_libs[staticlib] = src
                else:
                    raise Exception('No rule for ' + src)
        f.write('   libs = $seastar_libs_{}\n'.format(mode))
@@ -2032,6 +1966,7 @@ with open(buildfile, 'w') as f:
        gen_headers += list(serializers.keys())
        gen_headers += list(ragels.keys())
        gen_headers += list(rust_headers.keys())
+        gen_headers.append('$builddir/{}/gen/rust/cxx.h'.format(mode))
        gen_headers_dep = ' '.join(gen_headers)

        for obj in compiles:
@@ -2055,10 +1990,13 @@ with open(buildfile, 'w') as f:
        for hh in rust_headers:
            src = rust_headers[hh]
            f.write('build {}: rust_header {}\n'.format(hh, src))
-        for lib in rust_libs:
-            src = rust_libs[lib]
-            package = src.replace('/src/lib.rs', '').replace('rust/','')
-            f.write('build {}: rust_lib.{} {}\n  pkg = {}\n'.format(lib, mode, src, package))
+            cc = hh.replace('.hh', '.cc')
+            f.write('build {}: rust_source {}\n'.format(cc, src))
+            obj = cc.replace('.cc', '.o')
+            f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, gen_headers_dep))
+        f.write('build {}: cxxbridge_header\n'.format('$builddir/{}/gen/rust/cxx.h'.format(mode)))
+        librust = '$builddir/{}/rust-{}/librust_combined'.format(mode, mode)
+        f.write('build {}.a: rust_lib.{} rust/Cargo.lock\n  depfile={}.d\n'.format(librust, mode, librust))
        for thrift in thrifts:
            outs = ' '.join(thrift.generated('$builddir/{}/gen'.format(mode)))
            f.write('build {}: thrift.{} {}\n'.format(outs, mode, thrift.source))
@@ -2074,7 +2012,8 @@ with open(buildfile, 'w') as f:
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
                if cc.endswith('Parser.cpp'):
                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1'
+                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
@@ -2137,12 +2076,6 @@ with open(buildfile, 'w') as f:
        f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
        f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')

-        for lib in abseil_libs:
-            f.write('build $builddir/{mode}/abseil/{lib}: ninja $builddir/{mode}/abseil/build.ninja\n'.format(**locals()))
-            f.write('  pool = submodule_pool\n')
-            f.write('  subdir = $builddir/{mode}/abseil\n'.format(**locals()))
-            f.write('  target = {lib}\n'.format(**locals()))
-
    checkheaders_mode = 'dev' if 'dev' in modes else modes.keys()[0]
    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(checkheaders_mode, hh) for hh in headers])))

@@ -2253,7 +2186,7 @@ with open(buildfile, 'w') as f:
            description = List configured modes
        build mode_list: mode_list
        default {modes_list}
-        ''').format(modes_list=' '.join(default_modes), build_ninja_list=' '.join([f'build/{mode}/{dir}/build.ninja' for mode in build_modes for dir in ['seastar', 'abseil']]), **globals()))
+        ''').format(modes_list=' '.join(default_modes), build_ninja_list=' '.join([f'build/{mode}/{dir}/build.ninja' for mode in build_modes for dir in ['seastar']]), **globals()))
    unit_test_list = set(test for test in build_artifacts if test in set(tests))
    f.write(textwrap.dedent('''\
        rule unit_test_list
@@ -2282,7 +2215,7 @@ with open(buildfile, 'w') as f:
 compdb = 'compile_commands.json'
 # per-mode compdbs are built by taking the relevant entries from the
 # output of "ninja -t compdb" and combining them with the CMake-made
-# compdbs for Seastar and Abseil in the relevant mode.
+# compdbs for Seastar in the relevant mode.
 #
 # "ninja -t compdb" output has to be filtered because
 # - it contains rules for all selected modes, and several entries for
@@ -2297,7 +2230,7 @@ with tempfile.NamedTemporaryFile() as ninja_compdb:
    # build mode-specific compdbs
    for mode in selected_modes:
        mode_out = outdir + '/' + mode
-        submodule_compdbs = [mode_out + '/' + submodule + '/' + compdb for submodule in ['abseil', 'seastar']]
+        submodule_compdbs = [mode_out + '/' + submodule + '/' + compdb for submodule in ['seastar']]
        with open(mode_out + '/' + compdb, 'w+b') as combined_mode_specific_compdb:
            subprocess.run(['./scripts/merge-compdb.py', 'build/' + mode,
                            ninja_compdb.name] + submodule_compdbs, stdout=combined_mode_specific_compdb)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -51,6 +51,7 @@ options {
 #include "cql3/statements/index_prop_defs.hh"
 #include "cql3/statements/raw/use_statement.hh"
 #include "cql3/statements/raw/batch_statement.hh"
+#include "cql3/statements/raw/describe_statement.hh"
 #include "cql3/statements/list_users_statement.hh"
 #include "cql3/statements/grant_statement.hh"
 #include "cql3/statements/revoke_statement.hh"
@@ -358,6 +359,7 @@ cqlStatement returns [std::unique_ptr<raw::parsed_statement> stmt]
    | st46=listServiceLevelStatement { $stmt = std::move(st46); }
    | st47=listServiceLevelAttachStatement { $stmt = std::move(st47); }
    | st48=pruneMaterializedViewStatement  { $stmt = std::move(st48); }
+    | st49=describeStatement           { $stmt = std::move(st49); }
    ;

 /*
@@ -461,8 +463,7 @@ orderByClause[raw::select_statement::parameters::orderings_type& orderings]
    ;

 jsonValue returns [expression value]
-    :
-    | s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
+    : s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
    | m=marker         { $value = std::move(m); }
    ;

@@ -1368,6 +1369,59 @@ listServiceLevelAttachStatement returns [std::unique_ptr<list_service_level_atta
      { $stmt = std::make_unique<list_service_level_attachments_statement>(); }
    ;

+/**
+ * (DESCRIBE | DESC) (
+ *    CLUSTER
+ *    [FULL] SCHEMA
+ *    KEYSPACES
+ *    [ONLY] KEYSPACE <name>?
+ *    TABLES
+ *    TABLE <name>
+ *    TYPES
+ *    TYPE <name>
+ *    FUNCTIONS
+ *    FUNCTION <name>
+ *    AGGREGATES
+ *    AGGREGATE <name>
+ * ) (WITH INTERNALS)?
+ */
+describeStatement returns [std::unique_ptr<cql3::statements::raw::describe_statement> stmt]
+    @init {
+        bool fullSchema = false;
+        bool pending = false;
+        bool config = false;
+        bool only = false;
+        std::optional<sstring> keyspace;
+        sstring generic_name = "";
+    }
+    : ( K_DESCRIBE | K_DESC )
+    ( (K_CLUSTER) => K_CLUSTER                      { $stmt = cql3::statements::raw::describe_statement::cluster();                }
+    | (K_FULL { fullSchema=true; })? K_SCHEMA       { $stmt = cql3::statements::raw::describe_statement::schema(fullSchema);       }
+    | (K_KEYSPACES) => K_KEYSPACES                  { $stmt = cql3::statements::raw::describe_statement::keyspaces();              }
+    | (K_ONLY { only=true; })? K_KEYSPACE ( ks=keyspaceName { keyspace = ks; })?
+                                                    { $stmt = cql3::statements::raw::describe_statement::keyspace(keyspace, only); }
+    | (K_TABLES) => K_TABLES                        { $stmt = cql3::statements::raw::describe_statement::tables();                 }
+    | K_COLUMNFAMILY cf=columnFamilyName            { $stmt = cql3::statements::raw::describe_statement::table(cf);                }
+    | K_INDEX idx=columnFamilyName                  { $stmt = cql3::statements::raw::describe_statement::index(idx);               }
+    | K_MATERIALIZED K_VIEW view=columnFamilyName   { $stmt = cql3::statements::raw::describe_statement::view(view);               }
+    | (K_TYPES) => K_TYPES                          { $stmt = cql3::statements::raw::describe_statement::types();                  }
+    | K_TYPE tn=userTypeName                        { $stmt = cql3::statements::raw::describe_statement::type(tn);                 }
+    | (K_FUNCTIONS) => K_FUNCTIONS                  { $stmt = cql3::statements::raw::describe_statement::functions();              }
+    | K_FUNCTION fn=functionName                    { $stmt = cql3::statements::raw::describe_statement::function(fn);             }
+    | (K_AGGREGATES) => K_AGGREGATES                { $stmt = cql3::statements::raw::describe_statement::aggregates();             }
+    | K_AGGREGATE ag=functionName                   { $stmt = cql3::statements::raw::describe_statement::aggregate(ag);            }
+    | ( ( ksT=IDENT                                 { keyspace = sstring{$ksT.text}; }
+        | ksT=QUOTED_NAME                           { keyspace = sstring{$ksT.text}; }
+        | ksK=unreserved_keyword                    { keyspace = ksK; } ) 
+        '.' )?
+        ( tT=IDENT                                  { generic_name = sstring{$tT.text}; }
+        | tT=QUOTED_NAME                            { generic_name = sstring{$tT.text}; }
+        | tK=unreserved_keyword                     { generic_name = tK; } )
+                                                    { $stmt = cql3::statements::raw::describe_statement::generic(keyspace, generic_name); }
+    )
+    ( K_WITH K_INTERNALS { $stmt->with_internals_details(); } )?
+    ;
+
 /** DEFINITIONS **/

 // Column Identifiers.  These need to be treated differently from other
@@ -1513,7 +1567,7 @@ value returns [expression value]
    | l=collectionLiteral  { $value = std::move(l); }
    | u=usertypeLiteral    { $value = std::move(u); }
    | t=tupleLiteral       { $value = std::move(t); }
-    | K_NULL               { $value = null(); }
+    | K_NULL               { $value = make_untyped_null(); }
    | e=marker             { $value = std::move(e); }
    ;

@@ -1523,8 +1577,7 @@ marker returns [expression value]
    ;

 intValue returns [expression value]
-    :
-    | t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
+    : t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
    | e=marker      { $value = std::move(e); }
    ;

@@ -1678,7 +1731,7 @@ relation returns [expression e]
    | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
        { $e = binary_operator(token{std::move(l.elements)}, type, std::move(t)); }
    | name=cident K_IS K_NOT K_NULL {
-          $e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IS_NOT, null()); }
+          $e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IS_NOT, make_untyped_null()); }
    | name=cident K_IN marker1=marker
        { $e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IN, std::move(marker1)); }
    | name=cident K_IN in_values=singleColumnInValues
@@ -1897,10 +1950,13 @@ unreserved_function_keyword returns [sstring str]
 basic_unreserved_keyword returns [sstring str]
    : k=( K_KEYS
        | K_AS
+        | K_CLUSTER
        | K_CLUSTERING
        | K_COMPACT
        | K_STORAGE
+        | K_TABLES
        | K_TYPE
+        | K_TYPES
        | K_VALUES
        | K_MAP
        | K_LIST
@@ -1924,11 +1980,14 @@ basic_unreserved_keyword returns [sstring str]
        | K_TRIGGER
        | K_DISTINCT
        | K_CONTAINS
+        | K_INTERNALS
        | K_STATIC
        | K_FROZEN
        | K_TUPLE
        | K_FUNCTION
+        | K_FUNCTIONS
        | K_AGGREGATE
+        | K_AGGREGATES
        | K_SFUNC
        | K_STYPE
        | K_REDUCEFUNC
@@ -1956,6 +2015,9 @@ basic_unreserved_keyword returns [sstring str]
        | K_LEVEL
        | K_LEVELS
        | K_PRUNE
+        | K_ONLY
+        | K_DESCRIBE
+        | K_DESC
        ) { $str = $k.text; }
    ;

@@ -2013,11 +2075,14 @@ K_TRUNCATE:    T R U N C A T E;
 K_DELETE:      D E L E T E;
 K_IN:          I N;
 K_CREATE:      C R E A T E;
+K_SCHEMA:      S C H E M A;
 K_KEYSPACE:    ( K E Y S P A C E
-                 | S C H E M A );
+                 | K_SCHEMA );
 K_KEYSPACES:   K E Y S P A C E S;
 K_COLUMNFAMILY:( C O L U M N F A M I L Y
                 | T A B L E );
+K_TABLES:      ( C O L U M N F A M I L I E S
+                 | T A B L E S );
 K_MATERIALIZED:M A T E R I A L I Z E D;
 K_VIEW:        V I E W;
 K_INDEX:       I N D E X;
@@ -2034,6 +2099,7 @@ K_ALTER:       A L T E R;
 K_RENAME:      R E N A M E;
 K_ADD:         A D D;
 K_TYPE:        T Y P E;
+K_TYPES:       T Y P E S;
 K_COMPACT:     C O M P A C T;
 K_STORAGE:     S T O R A G E;
 K_ORDER:       O R D E R;
@@ -2045,6 +2111,8 @@ K_FILTERING:   F I L T E R I N G;
 K_IF:          I F;
 K_IS:          I S;
 K_CONTAINS:    C O N T A I N S;
+K_INTERNALS:   I N T E R N A L S;
+K_ONLY:        O N L Y;

 K_GRANT:       G R A N T;
 K_ALL:         A L L;
@@ -2068,6 +2136,7 @@ K_LOGIN:       L O G I N;
 K_NOLOGIN:     N O L O G I N;
 K_OPTIONS:     O P T I O N S;

+K_CLUSTER:     C L U S T E R;
 K_CLUSTERING:  C L U S T E R I N G;
 K_ASCII:       A S C I I;
 K_BIGINT:      B I G I N T;
@@ -2107,7 +2176,9 @@ K_STATIC:      S T A T I C;
 K_FROZEN:      F R O Z E N;

 K_FUNCTION:    F U N C T I O N;
+K_FUNCTIONS:   F U N C T I O N S;
 K_AGGREGATE:   A G G R E G A T E;
+K_AGGREGATES:  A G G R E G A T E S;
 K_SFUNC:       S F U N C;
 K_STYPE:       S T Y P E;
 K_REDUCEFUNC:  R E D U C E F U N C;
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -10,6 +10,7 @@

 #include "cql3/attributes.hh"
 #include "cql3/column_identifier.hh"
+#include <optional>

 namespace cql3 {

@@ -20,7 +21,9 @@ std::unique_ptr<attributes> attributes::none() {
 attributes::attributes(std::optional<cql3::expr::expression>&& timestamp,
                       std::optional<cql3::expr::expression>&& time_to_live,
                       std::optional<cql3::expr::expression>&& timeout)
-    : _timestamp{std::move(timestamp)}
+    : _timestamp_unset_guard(timestamp)
+    , _timestamp{std::move(timestamp)}
+    , _time_to_live_unset_guard(time_to_live)
    , _time_to_live{std::move(time_to_live)}
    , _timeout{std::move(timeout)}
 { }
@@ -38,7 +41,7 @@ bool attributes::is_timeout_set() const {
 }

 int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
-    if (!_timestamp.has_value()) {
+    if (!_timestamp.has_value() || _timestamp_unset_guard.is_unset(options)) {
        return now;
    }

@@ -46,31 +49,25 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value of timestamp");
    }
-    if (tval.is_unset_value()) {
-        return now;
-    }
    try {
-        return tval.view().validate_and_deserialize<int64_t>(*long_type, cql_serialization_format::internal());
+        return tval.view().validate_and_deserialize<int64_t>(*long_type);
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
 }

-int32_t attributes::get_time_to_live(const query_options& options) {
-    if (!_time_to_live.has_value())
-        return 0;
+std::optional<int32_t> attributes::get_time_to_live(const query_options& options) {
+    if (!_time_to_live.has_value() || _time_to_live_unset_guard.is_unset(options))
+        return std::nullopt;

    cql3::raw_value tval = expr::evaluate(*_time_to_live, options);
    if (tval.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value of TTL");
    }
-    if (tval.is_unset_value()) {
-        return 0;
-    }

    int32_t ttl;
    try {
-        ttl = tval.view().validate_and_deserialize<int32_t>(*int32_type, cql_serialization_format::internal());
+        ttl = tval.view().validate_and_deserialize<int32_t>(*int32_type);
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
@@ -91,8 +88,8 @@ int32_t attributes::get_time_to_live(const query_options& options) {

 db::timeout_clock::duration attributes::get_timeout(const query_options& options) const {
    cql3::raw_value timeout = expr::evaluate(*_timeout, options);
-    if (timeout.is_null() || timeout.is_unset_value()) {
-        throw exceptions::invalid_request_exception("Timeout value cannot be unset/null");
+    if (timeout.is_null()) {
+        throw exceptions::invalid_request_exception("Timeout value cannot be null");
    }
    cql_duration duration = timeout.view().deserialize<cql_duration>(*duration_type);
    if (duration.months || duration.days) {
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -11,6 +11,7 @@
 #pragma once

 #include "cql3/expr/expression.hh"
+#include "cql3/expr/unset.hh"
 #include "db/timeout_clock.hh"

 namespace cql3 {
@@ -24,7 +25,9 @@ class prepare_context;
 */
 class attributes final {
 private:
+    expr::unset_bind_variable_guard _timestamp_unset_guard;
    std::optional<cql3::expr::expression> _timestamp;
+    expr::unset_bind_variable_guard _time_to_live_unset_guard;
    std::optional<cql3::expr::expression> _time_to_live;
    std::optional<cql3::expr::expression> _timeout;
 public:
@@ -42,7 +45,7 @@ public:

    int64_t get_timestamp(int64_t now, const query_options& options);

-    int32_t get_time_to_live(const query_options& options);
+    std::optional<int32_t> get_time_to_live(const query_options& options);

    db::timeout_clock::duration get_timeout(const query_options& options) const;

--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -139,10 +139,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti

        cql3::raw_value key_constant = expr::evaluate(*_collection_element, options);
        cql3::raw_value_view key = key_constant.view();
-        if (key.is_unset_value()) {
-            throw exceptions::invalid_request_exception(
-                    format("Invalid 'unset' value in {} element access", cell_type.cql3_type_name()));
-        }
        if (key.is_null()) {
            throw exceptions::invalid_request_exception(
                    format("Invalid null value for {} element access", cell_type.cql3_type_name()));
@@ -196,9 +192,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        // <, >, >=, <=, !=
        cql3::raw_value param = expr::evaluate(*_value, options);

-        if (param.is_unset_value()) {
-            throw exceptions::invalid_request_exception("Invalid 'unset' value in condition");
-        }
        if (param.is_null()) {
            if (_op == expr::oper_t::EQ) {
                return cell_value == nullptr;
@@ -224,9 +217,6 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
            return (*_matcher)(bytes_view(cell_value->serialize_nonnull()));
        } else {
            auto param = expr::evaluate(*_value, options);  // LIKE pattern
-            if (param.is_unset_value()) {
-                throw exceptions::invalid_request_exception("Invalid 'unset' value in LIKE pattern");
-            }
            if (param.is_null()) {
                throw exceptions::invalid_request_exception("Invalid NULL value in LIKE pattern");
            }
@@ -309,7 +299,7 @@ column_condition::raw::prepare(data_dictionary::database db, const sstring& keys

    if (_op == expr::oper_t::LIKE) {
        auto literal_term = expr::as_if<expr::untyped_constant>(&*_value);
-        if (literal_term) {
+        if (literal_term && literal_term->partial_type != expr::untyped_constant::type_class::null) {
            // Pass matcher object
            const sstring& pattern = literal_term->raw_text;
            return column_condition::condition(receiver, std::move(collection_element_expression),
--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -33,9 +33,9 @@ public:
    private static final Logger logger = LoggerFactory.getLogger(Constants.class);
 #endif
 public:
-    class setter : public operation {
+    class setter : public operation_skip_if_unset {
    public:
-        using operation::operation;
+        using operation_skip_if_unset::operation_skip_if_unset;

        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
            auto value = expr::evaluate(*_e, params._options);
@@ -53,30 +53,26 @@ public:
        virtual void prepare_for_broadcast_tables(statements::broadcast_tables::prepared_update& query) const override;
    };

-    struct adder final : operation {
-        using operation::operation;
+    struct adder final : operation_skip_if_unset {
+        using operation_skip_if_unset::operation_skip_if_unset;

        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
            auto value = expr::evaluate(*_e, params._options);
            if (value.is_null()) {
                throw exceptions::invalid_request_exception("Invalid null value for counter increment");
-            } else if (value.is_unset_value()) {
-                return;
            }
            auto increment = value.view().deserialize<int64_t>(*long_type);
            m.set_cell(prefix, column, params.make_counter_update_cell(increment));
        }
    };

-    struct subtracter final : operation {
-        using operation::operation;
+    struct subtracter final : operation_skip_if_unset {
+        using operation_skip_if_unset::operation_skip_if_unset;

        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
            auto value = expr::evaluate(*_e, params._options);
            if (value.is_null()) {
                throw exceptions::invalid_request_exception("Invalid null value for counter increment");
-            } else if (value.is_unset_value()) {
-                return;
            }
            auto increment = value.view().deserialize<int64_t>(*long_type);
            if (increment == std::numeric_limits<int64_t>::min()) {
@@ -86,10 +82,10 @@ public:
        }
    };

-    class deleter : public operation {
+    class deleter : public operation_no_unset_support {
    public:
        deleter(const column_definition& column)
-            : operation(column, std::nullopt)
+            : operation_no_unset_support(column, std::nullopt)
        { }

        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -473,27 +473,40 @@ sstring maybe_quote(const sstring& identifier) {
    return result;
 }

-sstring quote(const sstring& identifier) {
+template <char C>
+static sstring quote_with(const sstring& str) {
+    static const std::string quote_str{C};
+
    // quote empty string
-    if (identifier.empty()) {
-        return "\"\"";
+    if (str.empty()) {
+        return make_sstring(quote_str, quote_str);
    }
    size_t num_quotes = 0;
-    for (char c : identifier) {
-        num_quotes += (c == '"');
+    for (char c : str) {
+        num_quotes += (c == C);
    }
    if (num_quotes == 0) {
-        return make_sstring("\"", identifier, "\"");
+        return make_sstring(quote_str, str, quote_str);
    }
-    static const std::regex double_quote_re("\"");
+
+    static const std::string double_quote_str{C, C};
+    static const std::regex quote_re(std::string{C});
    std::string result;
-    result.reserve(2 + identifier.size() + num_quotes);
-    result.push_back('"');
-    std::regex_replace(std::back_inserter(result), identifier.begin(), identifier.end(), double_quote_re, "\"\"");
-    result.push_back('"');
+    result.reserve(2 + str.size() + num_quotes);
+    result.push_back(C);
+    std::regex_replace(std::back_inserter(result), str.begin(), str.end(), quote_re, double_quote_str);
+    result.push_back(C);
    return result;
 }

+sstring quote(const sstring& identifier) {
+    return quote_with<'"'>(identifier);
+}
+
+sstring single_quote(const sstring& str) {
+    return quote_with<'\''>(str);
+}
+
 }

 }
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -180,18 +180,6 @@ get_value(const subscript& s, const evaluation_inputs& inputs) {
        // not an error.
        return std::nullopt;
    }
-    if (key.is_unset_value()) {
-        // An m[?] with ? bound to UNSET_VALUE is a invalid query.
-        // We could have detected it earlier while binding, but since
-        // we currently don't, we must protect the following code
-        // which can't work with an UNSET_VALUE. Note that the
-        // placement of this check here means that in an empty table,
-        // where we never need to evaluate the filter expression, this
-        // error will not be detected.
-        throw exceptions::invalid_request_exception(
-            format("Unsupported unset map key for column {}",
-                cdef->name_as_text()));
-    }
    if (col_type->is_map()) {
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
@@ -251,9 +239,6 @@ public:
 /// True iff lhs's value equals rhs.
 bool_or_null equal(const expression& lhs, const managed_bytes_opt& rhs_bytes, const evaluation_inputs& inputs) {
    raw_value lhs_value = evaluate(lhs, inputs);
-    if (lhs_value.is_unset_value()) {
-        throw exceptions::invalid_request_exception("unset value found on left-hand side of an equality operator");
-    }
    if (lhs_value.is_null() || !rhs_bytes.has_value()) {
        return bool_or_null::null();
    }
@@ -269,14 +254,6 @@ static std::optional<std::pair<managed_bytes, managed_bytes>> evaluate_binop_sid
    raw_value lhs_value = evaluate(lhs, inputs);
    raw_value rhs_value = evaluate(rhs, inputs);

-    if (lhs_value.is_unset_value()) {
-        throw exceptions::invalid_request_exception(
-            format("unset value found on left-hand side of a binary operator with operation {}", op));
-    }
-    if (rhs_value.is_unset_value()) {
-        throw exceptions::invalid_request_exception(
-            format("unset value found on right-hand side of a binary operator with operation {}", op));
-    }
    if (lhs_value.is_null() || rhs_value.is_null()) {
        return std::nullopt;
    }
@@ -492,14 +469,7 @@ bool_or_null is_one_of(const expression& lhs, const expression& rhs, const evalu

 bool is_not_null(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
    cql3::raw_value lhs_val = evaluate(lhs, inputs);
-    if (lhs_val.is_unset_value()) {
-        throw exceptions::invalid_request_exception("unset value found on left hand side of IS NOT operator");
-    }
-
    cql3::raw_value rhs_val = evaluate(rhs, inputs);
-    if (rhs_val.is_unset_value()) {
-        throw exceptions::invalid_request_exception("unset value found on right hand side of IS NOT operator");
-    }
    if (!rhs_val.is_null()) {
        throw exceptions::invalid_request_exception("IS NOT operator accepts only NULL as its right side");
    }
@@ -554,9 +524,6 @@ bool is_satisfied_by(const binary_operator& opr, const evaluation_inputs& inputs
    if (binop_eval_result.is_null()) {
        return false;
    }
-    if (binop_eval_result.is_unset_value()) {
-        on_internal_error(expr_logger, format("is_satisfied_by: binary operator evaluated to unset value: {}", opr));
-    }
    if (binop_eval_result.is_empty_value()) {
        on_internal_error(expr_logger, format("is_satisfied_by: binary operator evaluated to EMPTY_VALUE: {}", opr));
    }
@@ -607,9 +574,6 @@ bool is_satisfied_by(const expression& restr, const evaluation_inputs& inputs) {
            [] (const field_selection&) -> bool {
                on_internal_error(expr_logger, "is_satisfied_by: a field selection cannot serve as a restriction by itself");
            },
-            [] (const null&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: NULL cannot serve as a restriction by itself");
-            },
            [] (const bind_variable&) -> bool {
                on_internal_error(expr_logger, "is_satisfied_by: a bind variable cannot serve as a restriction by itself");
            },
@@ -647,9 +611,6 @@ value_list get_IN_values(
        const expression& e, const query_options& options, const serialized_compare& comparator,
        sstring_view column_name) {
    const cql3::raw_value in_list = evaluate(e, options);
-    if (in_list.is_unset_value()) {
-        throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
-    }
    if (in_list.is_null()) {
        return value_list();
    }
@@ -698,6 +659,14 @@ expression make_conjunction(expression a, expression b) {
    return conjunction{std::move(children)};
 }

+untyped_constant
+make_untyped_null() {
+    return {
+        .partial_type = untyped_constant::type_class::null,
+        .raw_text = "null",
+    };
+}
+
 std::vector<expression>
 boolean_factors(expression e) {
    std::vector<expression> ret;
@@ -888,9 +857,6 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                        [] (const field_selection&) -> value_set {
                            on_internal_error(expr_logger, "possible_lhs_values: field selections are not supported as the LHS of a binary expression");
                        },
-                        [] (const null&) -> value_set {
-                            on_internal_error(expr_logger, "possible_lhs_values: nulls are not supported as the LHS of a binary expression");
-                        },
                        [] (const bind_variable&) -> value_set {
                            on_internal_error(expr_logger, "possible_lhs_values: bind variables are not supported as the LHS of a binary expression");
                        },
@@ -929,9 +895,6 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
            [] (const field_selection&) -> value_set {
                on_internal_error(expr_logger, "possible_lhs_values: a field selection cannot serve as a restriction by itself");
            },
-            [] (const null&) -> value_set {
-                on_internal_error(expr_logger, "possible_lhs_values: a NULL cannot serve as a restriction by itself");
-            },
            [] (const bind_variable&) -> value_set {
                on_internal_error(expr_logger, "possible_lhs_values: a bind variable cannot serve as a restriction by itself");
            },
@@ -1026,9 +989,6 @@ secondary_index::index::supports_expression_v is_supported_by_helper(const expre
                        [&] (const field_selection&) -> ret_t {
                            on_internal_error(expr_logger, "is_supported_by: field selections are not supported as the LHS of a binary expression");
                        },
-                        [&] (const null&) -> ret_t {
-                            on_internal_error(expr_logger, "is_supported_by: nulls are not supported as the LHS of a binary expression");
-                        },
                        [&] (const bind_variable&) -> ret_t {
                            on_internal_error(expr_logger, "is_supported_by: bind variables are not supported as the LHS of a binary expression");
                        },
@@ -1111,8 +1071,6 @@ std::ostream& operator<<(std::ostream& os, const expression::printer& pr) {
                } else {
                    if (v.value.is_null()) {
                        os << "null";
-                    } else if (v.value.is_unset_value()) {
-                        os << "unset";
                    } else {
                        v.value.view().with_value([&](const FragmentedView auto& bytes_view) {
                            data_value deser_val = v.type->deserialize(bytes_view);
@@ -1189,10 +1147,6 @@ std::ostream& operator<<(std::ostream& os, const expression::printer& pr) {
            [&] (const field_selection& fs)  {
                fmt::print(os, "({}.{})", to_printer(fs.structure), fs.field);
            },
-            [&] (const null&) {
-                // FIXME: adjust tests and change to NULL
-                fmt::print(os, "null");
-            },
            [&] (const bind_variable&) {
                // FIXME: store and present bind variable name
                fmt::print(os, "?");
@@ -1462,7 +1416,7 @@ expression search_and_replace(const expression& e,
                    };
                },
                [&] (const binary_operator& oper) -> expression {
-                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
                },
                [&] (const column_mutation_attribute& cma) -> expression {
                    return column_mutation_attribute{cma.kind, recurse(cma.column)};
@@ -1599,7 +1553,6 @@ std::vector<expression> extract_single_column_restrictions_for_column(const expr
        void operator()(const function_call&) {}
        void operator()(const cast&) {}
        void operator()(const field_selection&) {}
-        void operator()(const null&) {}
        void operator()(const bind_variable&) {}
        void operator()(const untyped_constant&) {}
        void operator()(const tuple_constructor&) {}
@@ -1627,10 +1580,6 @@ constant constant::make_null(data_type val_type) {
    return constant(cql3::raw_value::make_null(), std::move(val_type));
 }

-constant constant::make_unset_value(data_type val_type) {
-    return constant(cql3::raw_value::make_unset_value(), std::move(val_type));
-}
-
 constant constant::make_bool(bool bool_val) {
    return constant(raw_value::make_value(boolean_type->decompose(bool_val)), boolean_type);
 }
@@ -1639,22 +1588,14 @@ bool constant::is_null() const {
    return value.is_null();
 }

-bool constant::is_unset_value() const {
-    return value.is_unset_value();
-}
-
 bool constant::has_empty_value_bytes() const {
-    if (is_null_or_unset()) {
+    if (is_null()) {
        return false;
    }

    return value.view().size_bytes() == 0;
 }

-bool constant::is_null_or_unset() const {
-    return is_null() || is_unset_value();
-}
-
 cql3::raw_value_view constant::view() const {
    return value.view();
 }
@@ -1664,7 +1605,7 @@ std::optional<bool> get_bool_value(const constant& constant_val) {
        return std::nullopt;
    }

-    if (constant_val.is_null_or_unset()) {
+    if (constant_val.is_null()) {
        return std::nullopt;
    }

@@ -1719,13 +1660,56 @@ cql3::raw_value evaluate(const binary_operator& binop, const evaluation_inputs&
    return raw_value::make_value(boolean_type->decompose(binop_result.get_value()));
 }

+// Evaluate a conjunction of elements separated by AND.
+// NULL is treated as an "unknown value" - maybe true maybe false.
+// `TRUE AND NULL` evaluates to NULL because it might be true but also might be false.
+// `FALSE AND NULL` evaluates to FALSE because no matter what value NULL acts as, the result will still be FALSE.
+// Empty values are not allowed.
+//
+// Usually in CQL the rule is that when NULL occurs in an operation the whole expression
+// becomes NULL, but here we decided to deviate from this behavior.
+// Treating NULL as an "unknown value" is the standard SQL way of handling NULLs in conjunctions.
+// It works this way in MySQL and Postgres so we do it this way as well.
+//
+// The evaluation short-circuits. Once FALSE is encountered the function returns FALSE
+// immediately without evaluating any further elements.
+// It works this way in Postgres as well, for example:
+// `SELECT true AND NULL AND 1/0 = 0` will throw a division by zero error
+// but `SELECT false AND 1/0 = 0` will successfully evaluate to FALSE.
+cql3::raw_value evaluate(const conjunction& conj, const evaluation_inputs& inputs) {
+    bool has_null = false;
+
+    for (const expression& element : conj.children) {
+        cql3::raw_value element_val = evaluate(element, inputs);
+        if (element_val.is_null()) {
+            has_null = true;
+            continue;
+        }
+        if (element_val.is_empty_value()) {
+            throw exceptions::invalid_request_exception("empty value found inside AND conjunction");
+        }
+        bool element_val_bool = element_val.view().deserialize<bool>(*boolean_type);
+        if (element_val_bool == false) {
+            // The conjunction contains a false value, so the result must be false.
+            // Don't evaluate other elements, short-circuit and return immediately.
+            return raw_value::make_value(boolean_type->decompose(false));
+        }
+    }
+
+    if (has_null) {
+        return raw_value::make_null();
+    }
+
+    return raw_value::make_value(boolean_type->decompose(true));
+}
+
 cql3::raw_value evaluate(const expression& e, const evaluation_inputs& inputs) {
    return expr::visit(overloaded_functor {
        [&](const binary_operator& binop) -> cql3::raw_value {
            return evaluate(binop, inputs);
        },
-        [](const conjunction&) -> cql3::raw_value {
-            on_internal_error(expr_logger, "Can't evaluate a conjunction");
+        [&](const conjunction& conj) -> cql3::raw_value {
+            return evaluate(conj, inputs);
        },
        [](const token&) -> cql3::raw_value {
            on_internal_error(expr_logger, "Can't evaluate token");
@@ -1758,7 +1742,6 @@ cql3::raw_value evaluate(const expression& e, const evaluation_inputs& inputs) {
            on_internal_error(expr_logger, "Can't evaluate a untyped_constant ");
        },

-        [](const null&) { return cql3::raw_value::make_null(); },
        [](const constant& c) { return c.value; },
        [&](const bind_variable& bind_var) { return evaluate(bind_var, inputs); },
        [&](const tuple_constructor& tup) { return evaluate(tup, inputs); },
@@ -1775,32 +1758,31 @@ cql3::raw_value evaluate(const expression& e, const query_options& options) {
 // Takes a value and reserializes it where needs_to_be_reserialized() says it's needed
 template <FragmentedView View>
 static managed_bytes reserialize_value(View value_bytes,
-                                       const abstract_type& type,
-                                       const cql_serialization_format& sf) {
+                                       const abstract_type& type) {
    if (type.is_list()) {
-        utils::chunked_vector<managed_bytes> elements = partially_deserialize_listlike(value_bytes, sf);
+        utils::chunked_vector<managed_bytes> elements = partially_deserialize_listlike(value_bytes);

        const abstract_type& element_type = dynamic_cast<const list_type_impl&>(type).get_elements_type()->without_reversed();
-        if (element_type.bound_value_needs_to_be_reserialized(sf)) {
+        if (element_type.bound_value_needs_to_be_reserialized()) {
            for (managed_bytes& element : elements) {
-                element = reserialize_value(managed_bytes_view(element), element_type, sf);
+                element = reserialize_value(managed_bytes_view(element), element_type);
            }
        }

        return collection_type_impl::pack_fragmented(
            elements.begin(),
            elements.end(),
-            elements.size(), cql_serialization_format::internal()
+            elements.size()
        );
    }

    if (type.is_set()) {
-        utils::chunked_vector<managed_bytes> elements = partially_deserialize_listlike(value_bytes, sf);
+        utils::chunked_vector<managed_bytes> elements = partially_deserialize_listlike(value_bytes);

        const abstract_type& element_type = dynamic_cast<const set_type_impl&>(type).get_elements_type()->without_reversed();
-        if (element_type.bound_value_needs_to_be_reserialized(sf)) {
+        if (element_type.bound_value_needs_to_be_reserialized()) {
            for (managed_bytes& element : elements) {
-                element = reserialize_value(managed_bytes_view(element), element_type, sf);
+                element = reserialize_value(managed_bytes_view(element), element_type);
            }
        }

@@ -1812,26 +1794,26 @@ static managed_bytes reserialize_value(View value_bytes,
        return collection_type_impl::pack_fragmented(
            values_set.begin(),
            values_set.end(),
-            values_set.size(), cql_serialization_format::internal()
+            values_set.size()
        );
    }

    if (type.is_map()) {
-        std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes, sf);
+        std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes);

        const map_type_impl mapt = dynamic_cast<const map_type_impl&>(type);
        const abstract_type& key_type = mapt.get_keys_type()->without_reversed();
        const abstract_type& value_type = mapt.get_values_type()->without_reversed();

-        if (key_type.bound_value_needs_to_be_reserialized(sf)) {
+        if (key_type.bound_value_needs_to_be_reserialized()) {
            for (std::pair<managed_bytes, managed_bytes>& element : elements) {
-                element.first = reserialize_value(managed_bytes_view(element.first), key_type, sf);
+                element.first = reserialize_value(managed_bytes_view(element.first), key_type);
            }
        }

-        if (value_type.bound_value_needs_to_be_reserialized(sf)) {
+        if (value_type.bound_value_needs_to_be_reserialized()) {
            for (std::pair<managed_bytes, managed_bytes>& element : elements) {
-                element.second = reserialize_value(managed_bytes_view(element.second), value_type, sf);
+                element.second = reserialize_value(managed_bytes_view(element.second), value_type);
            }
        }

@@ -1849,8 +1831,8 @@ static managed_bytes reserialize_value(View value_bytes,

        for (std::size_t i = 0; i < elements.size(); i++) {
            const abstract_type& element_type = ttype.all_types().at(i)->without_reversed();
-            if (elements[i].has_value() && element_type.bound_value_needs_to_be_reserialized(sf)) {
-                elements[i] = reserialize_value(managed_bytes_view(*elements[i]), element_type, sf);
+            if (elements[i].has_value() && element_type.bound_value_needs_to_be_reserialized()) {
+                elements[i] = reserialize_value(managed_bytes_view(*elements[i]), element_type);
            }
        }

@@ -1873,21 +1855,17 @@ static cql3::raw_value evaluate(const bind_variable& bind_var, const evaluation_
        return cql3::raw_value::make_null();
    }

-    if (value.is_unset_value()) {
-        return cql3::raw_value::make_unset_value();
-    }
-
    const abstract_type& value_type = bind_var.receiver->type->without_reversed();
    try {
-        value.validate(value_type, inputs.options->get_cql_serialization_format());
+        value.validate(value_type);
    } catch (const marshal_exception& e) {
        throw exceptions::invalid_request_exception(format("Exception while binding column {:s}: {:s}",
                                                           bind_var.receiver->name->to_cql_string(), e.what()));
    }

-    if (value_type.bound_value_needs_to_be_reserialized(inputs.options->get_cql_serialization_format())) {
+    if (value_type.bound_value_needs_to_be_reserialized()) {
        managed_bytes new_value = value.with_value([&] (const FragmentedView auto& value_bytes) {
-            return reserialize_value(value_bytes, value_type, inputs.options->get_cql_serialization_format());
+            return reserialize_value(value_bytes, value_type);
        });

        return raw_value::make_value(std::move(new_value));
@@ -1907,10 +1885,6 @@ static cql3::raw_value evaluate(const tuple_constructor& tuple, const evaluation

    for (size_t i = 0; i < tuple.elements.size(); i++) {
        cql3::raw_value elem_val = evaluate(tuple.elements[i], inputs);
-        if (elem_val.is_unset_value()) {
-            throw exceptions::invalid_request_exception(format("Invalid unset value for tuple field number {:d}", i));
-        }
-
        tuple_elements.emplace_back(std::move(elem_val).to_managed_bytes_opt());
    }

@@ -1939,8 +1913,7 @@ static managed_bytes serialize_listlike(const Range& elements, const char* colle
    return collection_type_impl::pack_fragmented(
        elements.begin(),
        elements.end(),
-        elements.size(),
-        cql_serialization_format::internal()
+        elements.size()
    );
 }

@@ -1953,10 +1926,6 @@ static cql3::raw_value evaluate_list(const collection_constructor& collection,
    for (const expression& element : collection.elements) {
        cql3::raw_value evaluated_element = evaluate(element, inputs);

-        if (evaluated_element.is_unset_value()) {
-            throw exceptions::invalid_request_exception("unset value is not supported inside collections");
-        }
-
        if (evaluated_element.is_null()) {
            if (skip_null) {
                continue;
@@ -1983,10 +1952,6 @@ static cql3::raw_value evaluate_set(const collection_constructor& collection, co
            throw exceptions::invalid_request_exception("null is not supported inside collections");
        }

-        if (evaluated_element.is_unset_value()) {
-            throw exceptions::invalid_request_exception("unset value is not supported inside collections");
-        }
-
        if (evaluated_element.view().size_bytes() > std::numeric_limits<uint16_t>::max()) {
            // TODO: Behaviour copied from sets::delayed_value::bind(), but this seems incorrect
            // The original reasoning is:
@@ -2017,10 +1982,6 @@ static cql3::raw_value evaluate_map(const collection_constructor& collection, co
                throw exceptions::invalid_request_exception("null is not supported inside collections");
            }

-            if (key.is_unset_value() || value.is_unset_value()) {
-                throw exceptions::invalid_request_exception("unset value is not supported inside collections");
-            }
-
            if (key.view().size_bytes() > std::numeric_limits<uint16_t>::max()) {
                // TODO: Behaviour copied from maps::delayed_value::bind(), but this seems incorrect
                // The original reasoning is:
@@ -2094,10 +2055,6 @@ static cql3::raw_value evaluate(const usertype_constructor& user_val, const eval
        }

        cql3::raw_value field_val = evaluate(cur_field->second, inputs);
-        if (field_val.is_unset_value()) {
-            throw exceptions::invalid_request_exception(format(
-                "Invalid unset value for field '{}' of user defined type ", utype.field_name_as_string(i)));
-        }

        field_values.emplace_back(std::move(field_val).to_managed_bytes_opt());
    }
@@ -2123,8 +2080,8 @@ static cql3::raw_value evaluate(const function_call& fun_call, const evaluation_

    for (const expression& arg : fun_call.args) {
        cql3::raw_value arg_val = evaluate(arg, inputs);
-        if (arg_val.is_null_or_unset()) {
-            throw exceptions::invalid_request_exception(format("Invalid null or unset value for argument to {}", *scalar_fun));
+        if (arg_val.is_null()) {
+            throw exceptions::invalid_request_exception(format("Invalid null value for argument to {}", *scalar_fun));
        }

        arguments.emplace_back(to_bytes_opt(std::move(arg_val)));
@@ -2139,7 +2096,7 @@ static cql3::raw_value evaluate(const function_call& fun_call, const evaluation_
        }
    }

-    bytes_opt result = scalar_fun->execute(cql_serialization_format::internal(), arguments);
+    bytes_opt result = scalar_fun->execute(arguments);

    if (has_cache_id) {
        inputs.options->cache_pk_function_call(**fun_call.lwt_cache_id, result);
@@ -2150,7 +2107,7 @@ static cql3::raw_value evaluate(const function_call& fun_call, const evaluation_
    }

    try {
-        scalar_fun->return_type()->validate(*result, cql_serialization_format::internal());
+        scalar_fun->return_type()->validate(*result);
    } catch (marshal_exception&) {
        throw runtime_exception(format("Return of function {} ({}) is not a valid value for its declared return type {}",
                                       *scalar_fun, to_hex(result),
@@ -2166,17 +2123,13 @@ static void ensure_can_get_value_elements(const cql3::raw_value& val,
    if (val.is_null()) {
        on_internal_error(expr_logger, fmt::format("{} called with null value", caller_name));
    }
-
-    if (val.is_unset_value()) {
-        on_internal_error(expr_logger, fmt::format("{} called with unset value", caller_name));
-    }
 }

 utils::chunked_vector<managed_bytes> get_list_elements(const cql3::raw_value& val) {
    ensure_can_get_value_elements(val, "expr::get_list_elements");

    return val.view().with_value([](const FragmentedView auto& value_bytes) {
-        return partially_deserialize_listlike(value_bytes, cql_serialization_format::internal());
+        return partially_deserialize_listlike(value_bytes);
    });
 }

@@ -2184,7 +2137,7 @@ utils::chunked_vector<managed_bytes> get_set_elements(const cql3::raw_value& val
    ensure_can_get_value_elements(val, "expr::get_set_elements");

    return val.view().with_value([](const FragmentedView auto& value_bytes) {
-        return partially_deserialize_listlike(value_bytes, cql_serialization_format::internal());
+        return partially_deserialize_listlike(value_bytes);
    });
 }

@@ -2192,7 +2145,7 @@ std::vector<std::pair<managed_bytes, managed_bytes>> get_map_elements(const cql3
    ensure_can_get_value_elements(val, "expr::get_map_elements");

    return val.view().with_value([](const FragmentedView auto& value_bytes) {
-        return partially_deserialize_map(value_bytes, cql_serialization_format::internal());
+        return partially_deserialize_map(value_bytes);
    });
 }

@@ -2316,7 +2269,6 @@ void fill_prepare_context(expression& e, prepare_context& ctx) {
            fill_prepare_context(s.sub, ctx);
        },
        [](untyped_constant&) {},
-        [](null&) {},
        [](constant&) {},
    }, e);
 }
@@ -2568,5 +2520,29 @@ bool has_only_eq_binops(const expression& e) {

    return non_eq_binop == nullptr;
 }
+
+unset_bind_variable_guard::unset_bind_variable_guard(const expr::expression& e) { // Guards e only if e is itself a bind variable.
+    if (auto bv = expr::as_if<expr::bind_variable>(&e)) { // presumably null when e holds any other alternative -- TODO confirm as_if contract
+        _var = *bv; // _var is std::optional<bind_variable>, so this keeps a copy rather than a pointer into e
+    }
+}
+
+unset_bind_variable_guard::unset_bind_variable_guard(const std::optional<expr::expression>& e) { // Like the expression overload, but tolerates an absent operand.
+    if (!e) { // no operand at all: leave _var disengaged
+        return;
+    }
+    if (auto bv = expr::as_if<expr::bind_variable>(&*e)) { // only the operand itself is inspected, not its subexpressions
+        _var = *bv;
+    }
+}
+
+bool
+unset_bind_variable_guard::is_unset(const query_options& qo) const { // Reports whether the guarded bind variable is unset in qo.
+    if (!_var) { // nothing was captured at construction, so there is nothing that could be unset
+        return false;
+    }
+    return qo.is_unset(_var->bind_index); // delegate to query_options, keyed by the variable's bind index
+}
+
 } // namespace expr
 } // namespace cql3
--- a/cql3/expr/expression.hh
+++ b/cql3/expr/expression.hh
@@ -76,7 +76,6 @@ struct column_mutation_attribute;
 struct function_call;
 struct cast;
 struct field_selection;
-struct null;
 struct bind_variable;
 struct untyped_constant;
 struct constant;
@@ -96,7 +95,6 @@ concept ExpressionElement
        || std::same_as<T, function_call>
        || std::same_as<T, cast>
        || std::same_as<T, field_selection>
-        || std::same_as<T, null>
        || std::same_as<T, bind_variable>
        || std::same_as<T, untyped_constant>
        || std::same_as<T, constant>
@@ -117,7 +115,6 @@ concept invocable_on_expression
        && std::invocable<Func, function_call>
        && std::invocable<Func, cast>
        && std::invocable<Func, field_selection>
-        && std::invocable<Func, null>
        && std::invocable<Func, bind_variable>
        && std::invocable<Func, untyped_constant>
        && std::invocable<Func, constant>
@@ -138,7 +135,6 @@ concept invocable_on_expression_ref
        && std::invocable<Func, function_call&>
        && std::invocable<Func, cast&>
        && std::invocable<Func, field_selection&>
-        && std::invocable<Func, null&>
        && std::invocable<Func, bind_variable&>
        && std::invocable<Func, untyped_constant&>
        && std::invocable<Func, constant&>
@@ -147,7 +143,7 @@ concept invocable_on_expression_ref
        && std::invocable<Func, usertype_constructor&>
        ;

-/// A CQL expression -- union of all possible expression types.  bool means a Boolean constant.
+/// A CQL expression -- union of all possible expression types.
 class expression final {
    // 'impl' holds a variant of all expression types, but since 
    // variants of incomplete types are not allowed, we forward declare it
@@ -198,9 +194,7 @@ bool operator==(const expression& e1, const expression& e2);
 // An expression that doesn't contain subexpressions
 template <typename E>
 concept LeafExpression
-        = std::same_as<bool, E>
-        || std::same_as<unresolved_identifier, E> 
-        || std::same_as<null, E> 
+        = std::same_as<unresolved_identifier, E>
        || std::same_as<bind_variable, E> 
        || std::same_as<untyped_constant, E> 
        || std::same_as<constant, E>
@@ -346,12 +340,6 @@ struct field_selection {
    friend bool operator==(const field_selection&, const field_selection&) = default;
 };

-struct null {
-    data_type type; // may be null before prepare
-
-    friend bool operator==(const null&, const null&) = default;
-};
-
 struct bind_variable {
    int32_t bind_index;

@@ -365,17 +353,18 @@ struct bind_variable {
 // A constant which does not yet have a date type. It is partially typed
 // (we know if it's floating or int) but not sized.
 struct untyped_constant {
-    enum type_class { integer, floating_point, string, boolean, duration, uuid, hex };
+    enum type_class { integer, floating_point, string, boolean, duration, uuid, hex, null };
    type_class partial_type;
    sstring raw_text;

    friend bool operator==(const untyped_constant&, const untyped_constant&) = default;
 };

+untyped_constant make_untyped_null();
+
 // Represents a constant value with known value and type
 // For null and unset the type can sometimes be set to empty_type
 struct constant {
-    // A value serialized using the internal (latest) cql_serialization_format
    cql3::raw_value value;

    // Never nullptr, for NULL and UNSET might be empty_type
@@ -383,7 +372,6 @@ struct constant {

    constant(cql3::raw_value value, data_type type);
    static constant make_null(data_type val_type = empty_type);
-    static constant make_unset_value(data_type val_type = empty_type);
    static constant make_bool(bool bool_val);

    bool is_null() const;
@@ -436,7 +424,7 @@ struct usertype_constructor {
 struct expression::impl final {
    using variant_type = std::variant<
            conjunction, binary_operator, column_value, token, unresolved_identifier,
-            column_mutation_attribute, function_call, cast, field_selection, null,
+            column_mutation_attribute, function_call, cast, field_selection,
            bind_variable, untyped_constant, constant, tuple_constructor, collection_constructor,
            usertype_constructor, subscript>;
    variant_type v;
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -78,7 +78,7 @@ static
 void
 usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    if (!receiver.type->is_user_type()) {
-        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto ut = static_pointer_cast<const user_type_impl>(receiver.type);
@@ -90,7 +90,7 @@ usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_
        const expression& value = u.elements.at(field);
        auto&& field_spec = usertype_field_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *field_spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
        }
    }
 }
@@ -123,7 +123,7 @@ usertype_constructor_prepare_expression(const usertype_constructor& u, data_dict
        auto iraw = u.elements.find(field);
        expression raw;
        if (iraw == u.elements.end()) {
-            raw = expr::null();
+            raw = expr::make_untyped_null();
        } else {
            raw = iraw->second;
            ++found_values;
@@ -313,7 +313,7 @@ set_validate_assignable_to(const collection_constructor& c, data_dictionary::dat
            return;
        }

-        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto&& value_spec = set_value_spec_of(receiver);
@@ -501,18 +501,18 @@ void
 tuple_constructor_validate_assignable_to(const tuple_constructor& tc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver.type->underlying_type());
    if (!tt) {
-        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    for (size_t i = 0; i < tc.elements.size(); ++i) {
        if (i >= tt->size()) {
            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: too many elements. Type {} expects {:d} but got {:d}",
-                                                            receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
+                                                            *receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
        }

        auto&& value = tc.elements[i];
        auto&& spec = component_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));
        }
    }
 }
@@ -582,6 +582,7 @@ operator<<(std::ostream&out, untyped_constant::type_class t)
        case untyped_constant::type_class::boolean:  return out << "BOOLEAN";
        case untyped_constant::type_class::hex:      return out << "HEX";
        case untyped_constant::type_class::duration: return out << "DURATION";
+        case untyped_constant::type_class::null:     return out << "NULL";
    }
    abort();
 }
@@ -609,8 +610,9 @@ static
 assignment_testable::test_result
 untyped_constant_test_assignment(const untyped_constant& uc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver)
 {
+    bool uc_is_null = uc.partial_type == untyped_constant::type_class::null;
    auto receiver_type = receiver.type->as_cql3_type();
-    if (receiver_type.is_collection() || receiver_type.is_user_type()) {
+    if ((receiver_type.is_collection() || receiver_type.is_user_type()) && !uc_is_null) {
        return assignment_testable::test_result::NOT_ASSIGNABLE;
    }
    if (!receiver_type.is_native()) {
@@ -675,6 +677,10 @@ untyped_constant_test_assignment(const untyped_constant& uc, data_dictionary::da
                return assignment_testable::test_result::EXACT_MATCH;
            }
            break;
+        case untyped_constant::type_class::null:
+            return receiver.type->is_counter()
+                ? assignment_testable::test_result::NOT_ASSIGNABLE
+                : assignment_testable::test_result::WEAKLY_ASSIGNABLE;
    }
    return assignment_testable::test_result::NOT_ASSIGNABLE;
 }
@@ -688,9 +694,18 @@ untyped_constant_prepare_expression(const untyped_constant& uc, data_dictionary:
        return std::nullopt;
    }
    if (!is_assignable(untyped_constant_test_assignment(uc, db, keyspace, *receiver))) {
+      if (uc.partial_type != untyped_constant::type_class::null) {
        throw exceptions::invalid_request_exception(format("Invalid {} constant ({}) for \"{}\" of type {}",
            uc.partial_type, uc.raw_text, *receiver->name, receiver->type->as_cql3_type().to_string()));
+      } else {
+        throw exceptions::invalid_request_exception("Invalid null value for counter increment/decrement");
+      }
    }
+
+    if (uc.partial_type == untyped_constant::type_class::null) {
+        return constant::make_null(receiver->type);
+    }
+
    raw_value raw_val = cql3::raw_value::make_value(untyped_constant_parsed_value(uc, receiver->type));
    return constant(std::move(raw_val), receiver->type);
 }
@@ -715,29 +730,6 @@ bind_variable_prepare_expression(const bind_variable& bv, data_dictionary::datab
    };
 }

-static
-assignment_testable::test_result
-null_test_assignment(data_dictionary::database db,
-        const sstring& keyspace,
-        const column_specification& receiver) {
-    return receiver.type->is_counter()
-        ? assignment_testable::test_result::NOT_ASSIGNABLE
-        : assignment_testable::test_result::WEAKLY_ASSIGNABLE;
-}
-
-static
-std::optional<expression>
-null_prepare_expression(data_dictionary::database db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) {
-    if (!receiver) {
-        // TODO: It is not possible to infer the type of NULL, but perhaps we can have a matcing null_type that can be cast to anything
-        return std::nullopt;
-    }
-    if (!is_assignable(null_test_assignment(db, keyspace, *receiver))) {
-        throw exceptions::invalid_request_exception("Invalid null value for counter increment/decrement");
-    }
-    return constant::make_null(receiver->type);
-}
-
 static
 sstring
 cast_display_name(const cast& c) {
@@ -883,6 +875,53 @@ test_assignment_function_call(const cql3::expr::function_call& fc, data_dictiona
    }
 }

+std::optional<expression> prepare_conjunction(const conjunction& conj,           // AND expression whose children are prepared one by one
+                                              data_dictionary::database db,      // passed through to try_prepare_expression
+                                              const sstring& keyspace,           // passed through; also names the fallback child receiver's keyspace
+                                              const schema* schema_opt,          // may be null; then the fallback receiver uses "unknown_cf"
+                                              lw_shared_ptr<column_specification> receiver) { // may be null; otherwise its type must be boolean
+    if (receiver.get() != nullptr && receiver->type->without_reversed().get_kind() != abstract_type::kind::boolean) { // kind is checked on without_reversed()
+        throw exceptions::invalid_request_exception(
+            format("AND conjunction produces a boolean value, which doesn't match the type: {} of {}",
+                   receiver->type->name(), receiver->name->text()));
+    }
+    // Build the receiver each child is prepared against: always boolean_type, named after the parent receiver when there is one.
+    lw_shared_ptr<column_specification> child_receiver;
+    if (receiver.get() != nullptr) {
+        ::shared_ptr<column_identifier> child_receiver_name =
+            ::make_shared<column_identifier>(format("AND_element({})", receiver->name->text()), true);
+        child_receiver = make_lw_shared<column_specification>(receiver->ks_name, receiver->cf_name,
+                                                              std::move(child_receiver_name), boolean_type);
+    } else { // no receiver given: synthesize one from the keyspace argument and the schema, if any
+        ::shared_ptr<column_identifier> child_receiver_name =
+            ::make_shared<column_identifier>("AND_element(unknown)", true);
+        sstring cf_name = schema_opt ? schema_opt->cf_name() : "unknown_cf";
+        child_receiver = make_lw_shared<column_specification>(keyspace, std::move(cf_name),
+                                                              std::move(child_receiver_name), boolean_type);
+    }
+    // Prepare every child; all_terminal stays true only while each prepared child is a constant.
+    std::vector<expression> prepared_children;
+
+    bool all_terminal = true;
+    for (const expression& child : conj.children) {
+        std::optional<expression> prepared_child =
+            try_prepare_expression(child, db, keyspace, schema_opt, child_receiver);
+        if (!prepared_child.has_value()) { // try_prepare_expression returned nullopt for this child; reported as a type-inference failure
+            throw exceptions::invalid_request_exception(fmt::format("Could not infer type of {}", child));
+        }
+        if (!is<constant>(*prepared_child)) {
+            all_terminal = false;
+        }
+        prepared_children.push_back(std::move(*prepared_child));
+    }
+    // If every child is a constant, evaluate the conjunction now (with empty evaluation_inputs) and return a boolean constant.
+    conjunction result = conjunction{std::move(prepared_children)};
+    if (all_terminal) {
+        return constant(evaluate(result, evaluation_inputs{}), boolean_type);
+    }
+    return result; // otherwise return the conjunction holding the prepared children
+}
+
 std::optional<expression>
 try_prepare_expression(const expression& expr, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    return expr::visit(overloaded_functor{
@@ -892,8 +931,8 @@ try_prepare_expression(const expression& expr, data_dictionary::database db, con
        [&] (const binary_operator&) -> std::optional<expression> {
            on_internal_error(expr_logger, "binary_operators are not yet reachable via prepare_expression()");
        },
-        [&] (const conjunction&) -> std::optional<expression> {
-            on_internal_error(expr_logger, "conjunctions are not yet reachable via prepare_expression()");
+        [&] (const conjunction& conj) -> std::optional<expression> {
+            return prepare_conjunction(conj, db, keyspace, schema_opt, receiver);
        },
        [] (const column_value& cv) -> std::optional<expression> {
            return cv;
@@ -964,9 +1003,6 @@ try_prepare_expression(const expression& expr, data_dictionary::database db, con
        [&] (const field_selection&) -> std::optional<expression> {
            on_internal_error(expr_logger, "field_selections are not yet reachable via prepare_expression()");
        },
-        [&] (const null&) -> std::optional<expression> {
-            return null_prepare_expression(db, keyspace, receiver);
-        },
        [&] (const bind_variable& bv) -> std::optional<expression> {
            return bind_variable_prepare_expression(bv, db, keyspace, receiver);
        },
@@ -1028,9 +1064,6 @@ test_assignment(const expression& expr, data_dictionary::database db, const sstr
        [&] (const field_selection&) -> test_result {
            on_internal_error(expr_logger, "field_selections are not yet reachable via test_assignment()");
        },
-        [&] (const null&) -> test_result {
-            return null_test_assignment(db, keyspace, receiver);
-        },
        [&] (const bind_variable& bv) -> test_result {
            return bind_variable_test_assignment(bv, db, keyspace, receiver);
        },
@@ -1157,7 +1190,7 @@ static lw_shared_ptr<column_specification> get_lhs_receiver(const expression& pr
 // Given type of LHS and the operation finds the expected type of RHS.
 // The type will be the same as LHS for simple operations like =, but it will be different for more complex ones like IN or CONTAINS.
 static lw_shared_ptr<column_specification> get_rhs_receiver(lw_shared_ptr<column_specification>& lhs_receiver, oper_t oper) {
-    const data_type& lhs_type = lhs_receiver->type->underlying_type();
+    const data_type lhs_type = lhs_receiver->type->underlying_type();

    if (oper == oper_t::IN) {
        data_type rhs_receiver_type = list_type_impl::get_instance(std::move(lhs_type), false);
--- a/cql3/expr/restrictions.cc
+++ b/cql3/expr/restrictions.cc
@@ -144,7 +144,7 @@ void preliminary_binop_vaidation_checks(const binary_operator& binop) {
    }

    if (binop.op == oper_t::IS_NOT) {
-        bool rhs_is_null = is<null>(binop.rhs)
+        bool rhs_is_null = (is<untyped_constant>(binop.rhs) && as<untyped_constant>(binop.rhs).partial_type == untyped_constant::type_class::null)
                           || (is<constant>(binop.rhs) && as<constant>(binop.rhs).is_null());
        if (!rhs_is_null) {
            throw exceptions::invalid_request_exception(format("Unsupported \"IS NOT\" relation: {}", pretty_binop_printer));
--- a/cql3/expr/unset.hh
+++ b/cql3/expr/unset.hh
@@ -0,0 +1,30 @@
+// Copyright (C) 2023-present ScyllaDB
+// SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
+
+#pragma once
+
+#include <optional>
+#include "expression.hh"
+
+namespace cql3 {
+
+class query_options;
+
+}
+
+namespace cql3::expr {
+
+// Some expression users behave differently when the expression is exactly a bind
+// variable and that bind variable is unset in the query options.
+// unset_bind_variable_guard captures the first condition and checks the second on demand.
+class unset_bind_variable_guard {
+    // Disengaged if the operand is absent or is not exactly a single bind variable.
+    std::optional<bind_variable> _var;
+public:
+    explicit unset_bind_variable_guard(const expr::expression& operand); // declared here, defined out of line
+    explicit unset_bind_variable_guard(std::nullopt_t) {} // no operand: _var stays disengaged
+    explicit unset_bind_variable_guard(const std::optional<expr::expression>& operand); // declared here, defined out of line
+    bool is_unset(const query_options& qo) const; // declared here, defined out of line
+};
+
+}
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -12,7 +12,7 @@
 #include "types.hh"
 #include "types/tuple.hh"
 #include "cql3/functions/scalar_function.hh"
-#include "cql_serialization_format.hh"
+#include "cql3/util.hh"
 #include "utils/big_decimal.hh"
 #include "aggregate_fcts.hh"
 #include "user_aggregate.hh"
@@ -40,10 +40,10 @@ public:
    virtual void reset() override {
        _count = 0;
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        return long_type->decompose(_count);
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        ++_count;
    }
    virtual void set_accumulator(const opt_bytes& acc) override {
@@ -56,7 +56,7 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return long_type->decompose(_count);
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc) {
            auto other = value_cast<int64_t>(long_type->deserialize(bytes_view(*acc)));
            _count += other;
@@ -189,13 +189,13 @@ public:
    virtual void reset() override {
        _acc = _initcond;
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        return _finalfunc ? _finalfunc->execute(sf, std::vector<bytes_opt>{_acc}) : _acc;
+    virtual opt_bytes compute() override {
+        return _finalfunc ? _finalfunc->execute(std::vector<bytes_opt>{_acc}) : _acc;
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        std::vector<bytes_opt> args{_acc};
        args.insert(args.end(), values.begin(), values.end());
-        _acc = _sfunc->execute(sf, args);
+        _acc = _sfunc->execute(args);
    }
    virtual void set_accumulator(const opt_bytes& acc) override {
        _acc = acc;
@@ -203,9 +203,9 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return _acc;
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        std::vector<bytes_opt> args{_acc, acc};
-        _acc = _rfunc->execute(sf, args);
+        _acc = _rfunc->execute(args);
    }
 };

@@ -218,10 +218,10 @@ public:
    virtual void reset() override {
        _sum = {};
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        return data_type_for<Type>()->decompose(accumulator_for<Type>::narrow(_sum));
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (!values[0]) {
            return;
        }
@@ -237,7 +237,7 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return accumulator_for<Type>::decompose(_sum);
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc) {
            auto other = accumulator_for<Type>::deserialize(acc);
            _sum += other;
@@ -248,7 +248,7 @@ public:
 template <typename Type>
 class impl_reducible_sum_function final : public impl_sum_function_for<Type> {
 public:
-    virtual bytes_opt compute(cql_serialization_format sf) override {
+    virtual bytes_opt compute() override {
        return this->get_accumulator();
    }
 };
@@ -316,14 +316,14 @@ public:
        _sum = {};
        _count = 0;
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        Type ret{};
        if (_count) {
            ret = impl_div_for_avg<Type>::div(_sum, _count);
        }
        return data_type_for<Type>()->decompose(ret);
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (!values[0]) {
            return;
        }
@@ -348,7 +348,7 @@ public:
        );
        return tuple_val.serialize();
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc) {
            data_type tuple_type = tuple_type_impl::get_instance({accumulator_for<Type>::data_type(), long_type});
            auto tuple = value_cast<tuple_type_impl::native_type>(tuple_type->deserialize(bytes_view(*acc)));
@@ -362,7 +362,7 @@ public:
 template <typename Type>
 class impl_reducible_avg_function : public impl_avg_function_for<Type> {
 public:
-    virtual bytes_opt compute(cql_serialization_format sf) override {
+    virtual bytes_opt compute() override {
        return this->get_accumulator();
    }
 };
@@ -457,13 +457,13 @@ public:
    virtual void reset() override {
        _max = {};
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        if (!_max) {
            return {};
        }
        return data_type_for<Type>()->decompose(data_value(Type{*_max}));
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (!values[0]) {
            return;
        }
@@ -487,8 +487,8 @@ public:
        }
        return {};
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
-        return add_input(sf, {acc});
+    virtual void reduce(const opt_bytes& acc) override {
+        return add_input({acc});
    }
 };

@@ -502,10 +502,10 @@ public:
    virtual void reset() override {
        _max = {};
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        return _max.value_or(bytes{});
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (values.empty() || !values[0]) {
            return;
        }
@@ -519,11 +519,11 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return _max;
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc && !acc->length()) {
            return;
        }
-        return add_input(sf, {acc});
+        return add_input({acc});
    }
 };

@@ -598,13 +598,13 @@ public:
    virtual void reset() override {
        _min = {};
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        if (!_min) {
            return {};
        }
        return data_type_for<Type>()->decompose(data_value(Type{*_min}));
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (!values[0]) {
            return;
        }
@@ -628,8 +628,8 @@ public:
        }
        return {};
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
-        return add_input(sf, {acc});
+    virtual void reduce(const opt_bytes& acc) override {
+        return add_input({acc});
    }
 };

@@ -643,10 +643,10 @@ public:
    virtual void reset() override {
        _min = {};
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        return _min.value_or(bytes{});
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (values.empty() || !values[0]) {
            return;
        }
@@ -660,11 +660,11 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return _min;
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc && !acc->length()) {
            return;
        }
-        return add_input(sf, {acc});
+        return add_input({acc});
    }
 };

@@ -720,10 +720,10 @@ public:
    virtual void reset() override {
        _count = 0;
    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
+    virtual opt_bytes compute() override {
        return long_type->decompose(_count);
    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+    virtual void add_input(const std::vector<opt_bytes>& values) override {
        if (!values[0]) {
            return;
        }
@@ -739,7 +739,7 @@ public:
    virtual opt_bytes get_accumulator() const override {
        return long_type->decompose(_count);
    }
-    virtual void reduce(cql_serialization_format sf, const opt_bytes& acc) override {
+    virtual void reduce(const opt_bytes& acc) override {
        if (acc) {
            auto other = value_cast<int64_t>(long_type->deserialize(bytes_view(*acc)));
            _count += other;
@@ -814,6 +814,35 @@ bool user_aggregate::is_reducible() const { return _reducefunc != nullptr; }
 bool user_aggregate::requires_thread() const { return _sfunc->requires_thread() || (_finalfunc && _finalfunc->requires_thread()); }
 bool user_aggregate::has_finalfunc() const { return _finalfunc != nullptr; }

+// Writes a CQL "CREATE AGGREGATE" statement describing this aggregate to `os` and returns `os`.
+std::ostream& user_aggregate::describe(std::ostream& os) const {
+    const auto& self = name();
+    os << "CREATE AGGREGATE " << cql3::util::maybe_quote(self.keyspace)
+       << "." << cql3::util::maybe_quote(self.name) << "(";
+
+    // Comma-separated list of argument types.
+    const char* separator = "";
+    for (const auto& arg_type : _arg_types) {
+        os << separator << arg_type->cql3_type_name();
+        separator = ", ";
+    }
+    os << ")\n";
+
+    const auto& state_type = _sfunc->return_type();
+    os << "SFUNC " << cql3::util::maybe_quote(_sfunc->name().name) << "\n";
+    os << "STYPE " << state_type->cql3_type_name();
+    if (is_reducible()) {
+        os << "\nREDUCEFUNC " << cql3::util::maybe_quote(_reducefunc->name().name);
+    }
+    if (has_finalfunc()) {
+        os << "\nFINALFUNC " << cql3::util::maybe_quote(_finalfunc->name().name);
+    }
+    if (_initcond) {
+        os << "\nINITCOND " << state_type->deserialize(bytes_view(*_initcond)).to_parsable_string();
+    }
+    return os << ";";
+}
+
 shared_ptr<aggregate_function>
 aggregate_fcts::make_count_rows_function() {
    return make_shared<count_rows_function>();
--- a/cql3/functions/as_json_function.hh
+++ b/cql3/functions/as_json_function.hh
@@ -18,7 +18,6 @@

 #include "bytes_ostream.hh"
 #include "types.hh"
-#include "cql_serialization_format.hh"

 #include <boost/algorithm/cxx11/any_of.hpp>

@@ -47,7 +46,7 @@ public:

    virtual bool requires_thread() const override;

-    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
+    virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
        bytes_ostream encoded_row;
        encoded_row.write("{", 1);
        for (size_t i = 0; i < _selector_names.size(); ++i) {
--- a/cql3/functions/bytes_conversion_fcts.hh
+++ b/cql3/functions/bytes_conversion_fcts.hh
@@ -14,7 +14,6 @@
 #include "exceptions/exceptions.hh"
 #include <seastar/core/print.hh>
 #include "cql3/cql3_type.hh"
-#include "cql_serialization_format.hh"

 namespace cql3 {

@@ -28,7 +27,7 @@ shared_ptr<function>
 make_to_blob_function(data_type from_type) {
    auto name = from_type->as_cql3_type().to_string() + "asblob";
    return make_native_scalar_function<true>(name, bytes_type, { from_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
+            [] (const std::vector<bytes_opt>& parameters) {
        return parameters[0];
    });
 }
@@ -38,13 +37,13 @@ shared_ptr<function>
 make_from_blob_function(data_type to_type) {
    sstring name = sstring("blobas") + to_type->as_cql3_type().to_string();
    return make_native_scalar_function<true>(name, to_type, { bytes_type },
-            [name, to_type] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [name, to_type] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
        auto&& val = parameters[0];
        if (!val) {
            return val;
        }
        try {
-            to_type->validate(*val, sf);
+            to_type->validate(*val);
            return val;
        } catch (marshal_exception& e) {
            using namespace exceptions;
@@ -58,7 +57,7 @@ inline
 shared_ptr<function>
 make_varchar_as_blob_fct() {
    return make_native_scalar_function<true>("varcharasblob", bytes_type, { utf8_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
        return parameters[0];
    });
 }
@@ -67,7 +66,7 @@ inline
 shared_ptr<function>
 make_blob_as_varchar_fct() {
    return make_native_scalar_function<true>("blobasvarchar", utf8_type, { bytes_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
        return parameters[0];
    });
 }
--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -35,7 +35,7 @@ public:
    virtual void print(std::ostream& os) const override {
        os << "cast(" << _arg_types[0]->name() << " as " << _return_type->name() << ")";
    }
-    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
+    virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
        auto from_type = arg_types()[0];
        auto to_type = return_type();

@@ -165,8 +165,6 @@ static data_value castas_fctn_from_dv_to_string(data_value from) {
    return from.type()->to_string_impl(from);
 }

-// FIXME: Add conversions for counters, after they are fully implemented...
-
 static constexpr unsigned next_power_of_2(unsigned val) {
    unsigned ret = 1;
    while (ret <= val) {
@@ -370,6 +368,26 @@ castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
        return castas_fctn_from_dv_to_string;
    case cast_switch_case_val(kind::utf8, kind::ascii):
        return castas_fctn_simple<sstring, sstring>;
+
+    case cast_switch_case_val(kind::byte, kind::counter):
+        return castas_fctn_simple<int8_t, int64_t>;
+    case cast_switch_case_val(kind::short_kind, kind::counter):
+        return castas_fctn_simple<int16_t, int64_t>;
+    case cast_switch_case_val(kind::int32, kind::counter):
+        return castas_fctn_simple<int32_t, int64_t>;
+    case cast_switch_case_val(kind::long_kind, kind::counter):
+        return castas_fctn_simple<int64_t, int64_t>;
+    case cast_switch_case_val(kind::float_kind, kind::counter):
+        return castas_fctn_simple<float, int64_t>;
+    case cast_switch_case_val(kind::double_kind, kind::counter):
+        return castas_fctn_simple<double, int64_t>;
+    case cast_switch_case_val(kind::varint, kind::counter):
+        return castas_fctn_simple<utils::multiprecision_int, int64_t>;
+    case cast_switch_case_val(kind::decimal, kind::counter):
+        return castas_fctn_from_integer_to_decimal<int64_t>;
+    case cast_switch_case_val(kind::ascii, kind::counter):
+    case cast_switch_case_val(kind::utf8, kind::counter):
+        return castas_fctn_to_string<int64_t>;
    }
    throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
 }
--- a/cql3/functions/error_injection_fcts.cc
+++ b/cql3/functions/error_injection_fcts.cc
@@ -40,8 +40,8 @@ public:
        return Pure;
    }

-    bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
-        return _func(sf, parameters);
+    bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
+        return _func(parameters);
    }
 };

@@ -61,7 +61,7 @@ make_failure_injection_function(sstring name,

 shared_ptr<function> make_enable_injection_function() {
    return make_failure_injection_function<false>("enable_injection", empty_type, { ascii_type, ascii_type },
-            [] (cql_serialization_format, const std::vector<bytes_opt>& parameters) {
+            [] (const std::vector<bytes_opt>& parameters) {
        sstring injection_name = ascii_type->get_string(parameters[0].value());
        const bool one_shot = ascii_type->get_string(parameters[1].value()) == "true";
        smp::invoke_on_all([injection_name, one_shot] () mutable {
@@ -73,7 +73,7 @@ shared_ptr<function> make_enable_injection_function() {

 shared_ptr<function> make_disable_injection_function() {
    return make_failure_injection_function<false>("disable_injection", empty_type, { ascii_type },
-            [] (cql_serialization_format, const std::vector<bytes_opt>& parameters) {
+            [] (const std::vector<bytes_opt>& parameters) {
        sstring injection_name = ascii_type->get_string(parameters[0].value());
        smp::invoke_on_all([injection_name] () mutable {
            utils::get_local_injector().disable(injection_name);
@@ -85,7 +85,7 @@ shared_ptr<function> make_disable_injection_function() {
 shared_ptr<function> make_enabled_injections_function() {
    const auto list_type_inst = list_type_impl::get_instance(ascii_type, false);
    return make_failure_injection_function<true>("enabled_injections", list_type_inst, {},
-        [list_type_inst] (cql_serialization_format, const std::vector<bytes_opt>&) -> bytes {
+        [list_type_inst] (const std::vector<bytes_opt>&) -> bytes {
            return seastar::map_reduce(smp::all_cpus(), [] (unsigned) {
                return make_ready_future<std::vector<sstring>>(utils::get_local_injector().enabled_injections());
            }, std::vector<data_value>(),
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -13,7 +13,10 @@
 #include "cql3/lists.hh"
 #include "cql3/constants.hh"
 #include "cql3/user_types.hh"
+#include "cql3/ut_name.hh"
 #include "cql3/type_json.hh"
+#include "cql3/functions/user_function.hh"
+#include "cql3/functions/user_aggregate.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "types/map.hh"
 #include "types/set.hh"
@@ -24,6 +27,7 @@
 #include "cql3/prepare_context.hh"
 #include "user_aggregate.hh"
 #include "cql3/expr/expression.hh"
+#include <boost/range/adaptor/transformed.hpp>
 #include <boost/range/adaptors.hpp>

 #include "error_injection_fcts.hh"
@@ -49,6 +53,13 @@ bool abstract_function::requires_thread() const { return false; }

 bool as_json_function::requires_thread() const { return false; }

+// True only when both functions are non-null and agree on name and argument types.
+static bool same_signature(const shared_ptr<function>& f1, const shared_ptr<function>& f2) {
+    const bool both_present = f1 && f2;
+    return both_present
+        && f1->name() == f2->name() && f1->arg_types() == f2->arg_types();
+}
+
 thread_local std::unordered_multimap<function_name, shared_ptr<function>> functions::_declared = init();

 void functions::clear_functions() noexcept {
@@ -94,11 +105,6 @@ functions::init() noexcept {
        if (type == cql3_type::blob) {
            continue;
        }
-        // counters are not supported yet
-        if (type.is_counter()) {
-            warn(unimplemented::cause::COUNTERS);
-            continue;
-        }

        declare(make_to_blob_function(type.get_type()));
        declare(make_from_blob_function(type.get_type()));
@@ -140,22 +146,56 @@ void functions::replace_function(shared_ptr<function> func) {
    with_udf_iter(func->name(), func->arg_types(), [func] (functions::declared_t::iterator i) {
        i->second = std::move(func);
    });
+    auto scalar_func = dynamic_pointer_cast<scalar_function>(func);
+    if (!scalar_func) {
+        return;
+    }
+    for (auto& fit : _declared) {
+        auto aggregate = dynamic_pointer_cast<user_aggregate>(fit.second);
+        if (aggregate && (same_signature(aggregate->sfunc(), scalar_func)
+            || (same_signature(aggregate->finalfunc(), scalar_func))
+            || (same_signature(aggregate->reducefunc(), scalar_func))))
+        {
+            // we need to replace at least one underlying function
+            shared_ptr<scalar_function> sfunc = same_signature(aggregate->sfunc(), scalar_func) ? scalar_func : aggregate->sfunc();
+            shared_ptr<scalar_function> finalfunc = same_signature(aggregate->finalfunc(), scalar_func) ? scalar_func : aggregate->finalfunc();
+            shared_ptr<scalar_function> reducefunc = same_signature(aggregate->reducefunc(), scalar_func) ? scalar_func : aggregate->reducefunc();
+            fit.second = ::make_shared<user_aggregate>(aggregate->name(), aggregate->initcond(), sfunc, reducefunc, finalfunc);
+        }
+    }
 }

 void functions::remove_function(const function_name& name, const std::vector<data_type>& arg_types) {
    with_udf_iter(name, arg_types, [] (functions::declared_t::iterator i) { _declared.erase(i); });
 }

-std::optional<function_name> functions::used_by_user_aggregate(const function_name& name) {
+std::optional<function_name> functions::used_by_user_aggregate(shared_ptr<user_function> func) {
    for (const shared_ptr<function>& fptr : _declared | boost::adaptors::map_values) {
        auto aggregate = dynamic_pointer_cast<user_aggregate>(fptr);
-        if (aggregate && (aggregate->sfunc().name() == name || (aggregate->has_finalfunc() && aggregate->finalfunc().name() == name))) {
+        if (aggregate && (same_signature(aggregate->sfunc(), func)
+            || (same_signature(aggregate->finalfunc(), func))
+            || (same_signature(aggregate->reducefunc(), func))))
+        {
            return aggregate->name();
        }
    }
    return {};
 }

+std::optional<function_name> functions::used_by_user_function(const ut_name& user_type) {
+    // A function uses the type when any argument type, or its return type, references it.
+    auto mentions_udt = [&user_type] (const data_type& t) {
+        return t->references_user_type(user_type.get_keyspace(), user_type.get_user_type_name());
+    };
+    for (const shared_ptr<function>& candidate : _declared | boost::adaptors::map_values) {
+        for (const auto& arg_type : candidate->arg_types()) {
+            if (mentions_udt(arg_type)) { return candidate->name(); }
+        }
+        if (mentions_udt(candidate->return_type())) { return candidate->name(); }
+    }
+    return {};
+}
+
 lw_shared_ptr<column_specification>
 functions::make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
        const function& fun, size_t i) {
@@ -171,7 +211,7 @@ inline
 shared_ptr<function>
 make_to_json_function(data_type t) {
    return make_native_scalar_function<true>("tojson", utf8_type, {t},
-            [t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [t](const std::vector<bytes_opt>& parameters) -> bytes_opt {
        return utf8_type->decompose(to_json_string(*t, parameters[0]));
    });
 }
@@ -180,12 +220,12 @@ inline
 shared_ptr<function>
 make_from_json_function(data_dictionary::database db, const sstring& keyspace, data_type t) {
    return make_native_scalar_function<true>("fromjson", t, {utf8_type},
-            [&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [&db, keyspace, t](const std::vector<bytes_opt>& parameters) -> bytes_opt {
        try {
            rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
            bytes_opt parsed_json_value;
            if (!json_value.IsNull()) {
-                parsed_json_value.emplace(from_json_object(*t, json_value, sf));
+                parsed_json_value.emplace(from_json_object(*t, json_value));
            }
            return parsed_json_value;
        } catch(rjson::error& e) {
@@ -382,6 +422,32 @@ functions::get(data_dictionary::database db,
    return std::move(compatibles[0]);
 }

+template<typename F>
+std::vector<shared_ptr<F>> functions::get_filtered_transformed(const sstring& keyspace) {
+    // Collects every declared function in `keyspace` whose dynamic type is F.
+    std::vector<shared_ptr<F>> matches;
+    for (const auto& [fname, fptr] : _declared) {
+        if (fname.keyspace != keyspace) {
+            continue;
+        }
+        // One dynamic cast serves as both the type filter and the conversion.
+        if (auto typed = dynamic_pointer_cast<F>(fptr)) {
+            matches.push_back(std::move(typed));
+        }
+    }
+    return matches;
+}
+
+std::vector<shared_ptr<user_function>>
+functions::get_user_functions(const sstring& keyspace) {
+    return get_filtered_transformed<user_function>(keyspace);
+}
+
+std::vector<shared_ptr<user_aggregate>>
+functions::get_user_aggregates(const sstring& keyspace) {
+    return get_filtered_transformed<user_aggregate>(keyspace);
+}
+
 boost::iterator_range<functions::declared_t::iterator>
 functions::find(const function_name& name) {
    assert(name.has_keyspace()); // : "function name not fully qualified";
--- a/cql3/functions/functions.hh
+++ b/cql3/functions/functions.hh
@@ -27,6 +27,10 @@
 namespace cql3 {

 namespace functions {
+//forward declarations
+    class user_function;
+    class user_aggregate;
+
    using declared_t = std::unordered_multimap<function_name, shared_ptr<function>>;
    void add_agg_functions(declared_t& funcs);

@@ -57,6 +61,8 @@ public:
        const std::vector<shared_ptr<assignment_testable>> args(std::begin(provided_args), std::end(provided_args));
        return get(db, keyspace, name, args, receiver_ks, receiver_cf, receiver);
    }
+    static std::vector<shared_ptr<user_function>> get_user_functions(const sstring& keyspace);
+    static std::vector<shared_ptr<user_aggregate>> get_user_aggregates(const sstring& keyspace);
    static boost::iterator_range<declared_t::iterator> find(const function_name& name);
    static declared_t::iterator find_iter(const function_name& name, const std::vector<data_type>& arg_types);
    static shared_ptr<function> find(const function_name& name, const std::vector<data_type>& arg_types);
@@ -65,11 +71,15 @@ public:
    static void add_function(shared_ptr<function>);
    static void replace_function(shared_ptr<function>);
    static void remove_function(const function_name& name, const std::vector<data_type>& arg_types);
-    static std::optional<function_name> used_by_user_aggregate(const function_name& name);
+    static std::optional<function_name> used_by_user_aggregate(shared_ptr<user_function>);
+    static std::optional<function_name> used_by_user_function(const ut_name& user_type);
 private:
    template <typename F>
    static void with_udf_iter(const function_name& name, const std::vector<data_type>& arg_types, F&& f);

+    template <typename F>
+    static std::vector<shared_ptr<F>> get_filtered_transformed(const sstring& keyspace);
+
    // This method and matchArguments are somewhat duplicate, but this method allows us to provide more precise errors in the common
    // case where there is no override for a given function. This is thus probably worth the minor code duplication.
    static void validate_types(data_dictionary::database db,
--- a/cql3/functions/native_scalar_function.hh
+++ b/cql3/functions/native_scalar_function.hh
@@ -12,7 +12,6 @@

 #include "native_function.hh"
 #include "scalar_function.hh"
-#include "cql_serialization_format.hh"
 #include "log.hh"
 #include <seastar/core/shared_ptr.hh>

@@ -48,9 +47,9 @@ public:
    virtual bool is_pure() const override {
        return Pure;
    }
-    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
+    virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
        try {
-            return _func(sf, parameters);
+            return _func(parameters);
        } catch(exceptions::cassandra_exception&) {
            // If the function's code took the time to produce an official
            // cassandra_exception, pass it through. Otherwise, below we will
--- a/cql3/functions/scalar_function.hh
+++ b/cql3/functions/scalar_function.hh
@@ -23,12 +23,11 @@ public:
    /**
     * Applies this function to the specified parameter.
     *
-     * @param protocolVersion protocol version used for parameters and return value
     * @param parameters the input parameters
     * @return the result of applying this function to the parameter
     * @throws InvalidRequestException if this function cannot not be applied to the parameter
     */
-    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) = 0;
+    virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) = 0;
 };


--- a/cql3/functions/time_uuid_fcts.hh
+++ b/cql3/functions/time_uuid_fcts.hh
@@ -24,7 +24,7 @@ inline
 shared_ptr<function>
 make_now_fct() {
    return make_native_scalar_function<false>("now", timeuuid_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        return {to_bytes(utils::UUID_gen::get_time_UUID())};
    });
 }
@@ -42,7 +42,7 @@ inline
 shared_ptr<function>
 make_min_timeuuid_fct() {
    return make_native_scalar_function<true>("mintimeuuid", timeuuid_type, { timestamp_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        auto& bb = values[0];
        if (!bb) {
            return {};
@@ -60,7 +60,7 @@ inline
 shared_ptr<function>
 make_max_timeuuid_fct() {
    return make_native_scalar_function<true>("maxtimeuuid", timeuuid_type, { timestamp_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        auto& bb = values[0];
        if (!bb) {
            return {};
@@ -89,7 +89,7 @@ inline
 shared_ptr<function>
 make_date_of_fct() {
    return make_native_scalar_function<true>("dateof", timestamp_type, { timeuuid_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -104,7 +104,7 @@ inline
 shared_ptr<function>
 make_unix_timestamp_of_fct() {
    return make_native_scalar_function<true>("unixtimestampof", long_type, { timeuuid_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -117,7 +117,7 @@ make_unix_timestamp_of_fct() {
 inline shared_ptr<function>
 make_currenttimestamp_fct() {
    return make_native_scalar_function<false>("currenttimestamp", timestamp_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        return {timestamp_type->decompose(db_clock::now())};
    });
 }
@@ -125,7 +125,7 @@ make_currenttimestamp_fct() {
 inline shared_ptr<function>
 make_currenttime_fct() {
    return make_native_scalar_function<false>("currenttime", time_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        constexpr int64_t milliseconds_in_day = 3600 * 24 * 1000;
        int64_t milliseconds_since_epoch = std::chrono::duration_cast<std::chrono::milliseconds>(db_clock::now().time_since_epoch()).count();
        int64_t nanoseconds_today = (milliseconds_since_epoch % milliseconds_in_day) * 1000 * 1000;
@@ -136,7 +136,7 @@ make_currenttime_fct() {
 inline shared_ptr<function>
 make_currentdate_fct() {
    return make_native_scalar_function<false>("currentdate", simple_date_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        auto to_simple_date = get_castas_fctn(simple_date_type, timestamp_type);
        return {simple_date_type->decompose(to_simple_date(db_clock::now()))};
    });
@@ -146,7 +146,7 @@ inline
 shared_ptr<function>
 make_currenttimeuuid_fct() {
    return make_native_scalar_function<false>("currenttimeuuid", timeuuid_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        return {timeuuid_type->decompose(timeuuid_native_type{utils::UUID_gen::get_time_UUID()})};
    });
 }
@@ -155,7 +155,7 @@ inline
 shared_ptr<function>
 make_timeuuidtodate_fct() {
    return make_native_scalar_function<true>("todate", simple_date_type, { timeuuid_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -171,7 +171,7 @@ inline
 shared_ptr<function>
 make_timestamptodate_fct() {
    return make_native_scalar_function<true>("todate", simple_date_type, { timestamp_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -190,7 +190,7 @@ inline
 shared_ptr<function>
 make_timeuuidtotimestamp_fct() {
    return make_native_scalar_function<true>("totimestamp", timestamp_type, { timeuuid_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -205,7 +205,7 @@ inline
 shared_ptr<function>
 make_datetotimestamp_fct() {
    return make_native_scalar_function<true>("totimestamp", timestamp_type, { simple_date_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -224,7 +224,7 @@ inline
 shared_ptr<function>
 make_timeuuidtounixtimestamp_fct() {
    return make_native_scalar_function<true>("tounixtimestamp", long_type, { timeuuid_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -242,7 +242,7 @@ inline
 shared_ptr<function>
 make_timestamptounixtimestamp_fct() {
    return make_native_scalar_function<true>("tounixtimestamp", long_type, { timestamp_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
@@ -260,7 +260,7 @@ inline
 shared_ptr<function>
 make_datetounixtimestamp_fct() {
    return make_native_scalar_function<true>("tounixtimestamp", long_type, { simple_date_type },
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& values) -> bytes_opt {
        using namespace utils;
        auto& bb = values[0];
        if (!bb) {
--- a/cql3/functions/token_fct.hh
+++ b/cql3/functions/token_fct.hh
@@ -31,7 +31,7 @@ public:
                    , _schema(s) {
    }

-    bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
+    bytes_opt execute(const std::vector<bytes_opt>& parameters) override {
        if (std::any_of(parameters.cbegin(), parameters.cend(), [](const auto& param){ return !param; })) {
            return std::nullopt;
        }
--- a/cql3/functions/user_aggregate.hh
+++ b/cql3/functions/user_aggregate.hh
@@ -11,11 +11,12 @@
 #include "abstract_function.hh"
 #include "scalar_function.hh"
 #include "aggregate_function.hh"
+#include "data_dictionary/keyspace_element.hh"

 namespace cql3 {
 namespace functions {

-class user_aggregate : public abstract_function, public aggregate_function{
+class user_aggregate : public abstract_function, public aggregate_function, public data_dictionary::keyspace_element {
    bytes_opt _initcond;
    ::shared_ptr<scalar_function> _sfunc;
    ::shared_ptr<scalar_function> _reducefunc;
@@ -31,14 +32,19 @@ public:
    virtual bool requires_thread() const override;
    bool has_finalfunc() const;

-    const scalar_function& sfunc() const {
-        return *_sfunc;
+    virtual sstring keypace_name() const override { return name().keyspace; }
+    virtual sstring element_name() const override { return name().name; }
+    virtual sstring element_type() const override { return "aggregate"; }
+    virtual std::ostream& describe(std::ostream& os) const override;
+
+    seastar::shared_ptr<scalar_function> sfunc() const {
+        return _sfunc;
    }
-    const scalar_function& reducefunc() const {
-        return *_reducefunc;
+    seastar::shared_ptr<scalar_function> reducefunc() const {
+        return _reducefunc;
    }
-    const scalar_function& finalfunc() const {
-        return *_finalfunc;
+    seastar::shared_ptr<scalar_function> finalfunc() const {
+        return _finalfunc;
    }
    const bytes_opt& initcond() const {
        return _initcond;
--- a/cql3/functions/user_function.cc
+++ b/cql3/functions/user_function.cc
@@ -7,8 +7,8 @@
 */

 #include "user_function.hh"
+#include "cql3/util.hh"
 #include "log.hh"
-#include "cql_serialization_format.hh"
 #include "lang/wasm.hh"

 #include <seastar/core/thread.hh>
@@ -32,7 +32,7 @@ bool user_function::is_aggregate() const { return false; }

 bool user_function::requires_thread() const { return true; }

-bytes_opt user_function::execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
+bytes_opt user_function::execute(const std::vector<bytes_opt>& parameters) {
    const auto& types = arg_types();
    if (parameters.size() != types.size()) {
        throw std::logic_error("Wrong number of parameters");
@@ -66,5 +66,33 @@ bytes_opt user_function::execute(cql_serialization_format sf, const std::vector<
        });
 }

+// Writes a CQL "CREATE FUNCTION" statement describing this function to `os` and returns `os`.
+std::ostream& user_function::describe(std::ostream& os) const {
+    const auto& self = name();
+    os << "CREATE FUNCTION " << cql3::util::maybe_quote(self.keyspace)
+       << "." << cql3::util::maybe_quote(self.name) << "(";
+
+    // Each parameter is emitted as "<name> <type>", comma separated.
+    const char* separator = "";
+    for (size_t idx = 0; idx < _arg_names.size(); ++idx) {
+        os << separator << _arg_names[idx] << " " << _arg_types[idx]->cql3_type_name();
+        separator = ", ";
+    }
+    os << ")\n";
+
+    // Null-handling clause, followed by the return type and the language.
+    const char* null_clause = _called_on_null_input ? "CALLED" : "RETURNS NULL";
+    os << null_clause << " ON NULL INPUT\n";
+    os << "RETURNS " << _return_type->cql3_type_name() << "\n";
+    os << "LANGUAGE " << _language << "\n";
+
+    // The function body is placed between $$ delimiters.
+    os << "AS $$\n";
+    os << _body << "\n";
+    os << "$$;";
+
+    return os;
+}
+
 }
 }
--- a/cql3/functions/user_function.hh
+++ b/cql3/functions/user_function.hh
@@ -14,18 +14,19 @@
 #include "scalar_function.hh"
 #include "lang/lua.hh"
 #include "lang/wasm.hh"
+#include "data_dictionary/keyspace_element.hh"

 namespace cql3 {
 namespace functions {


-class user_function final : public abstract_function, public scalar_function {
+class user_function final : public abstract_function, public scalar_function, public data_dictionary::keyspace_element {
 public:
    struct lua_context {
        sstring bitcode;
        // FIXME: We should not need a copy in each function. It is here
        // because user_function::execute is only passed the
-        // cql_serialization_format and the runtime arguments.  We could
+        // runtime arguments.  We could
        // avoid it by having a runtime->execute(user_function) instead,
        // but that is a large refactoring. We could also store a
        // lua_runtime in a thread_local variable, but that is one extra
@@ -58,7 +59,12 @@ public:
    virtual bool is_native() const override;
    virtual bool is_aggregate() const override;
    virtual bool requires_thread() const override;
-    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override;
+    virtual bytes_opt execute(const std::vector<bytes_opt>& parameters) override;
+
+    virtual sstring keypace_name() const override { return name().keyspace; }
+    virtual sstring element_name() const override { return name().name; }
+    virtual sstring element_type() const override { return "function"; }
+    virtual std::ostream& describe(std::ostream& os) const override;
 };

 }
--- a/cql3/functions/uuid_fcts.hh
+++ b/cql3/functions/uuid_fcts.hh
@@ -22,7 +22,7 @@ inline
 shared_ptr<function>
 make_uuid_fct() {
    return make_native_scalar_function<false>("uuid", uuid_type, {},
-            [] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+            [] (const std::vector<bytes_opt>& parameters) -> bytes_opt {
        return {uuid_type->decompose(utils::make_random_uuid())};
    });
 }
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -37,9 +37,6 @@ lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const u

 void
 lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const cql3::raw_value& value) {
-    if (value.is_unset_value()) {
-        return;
-    }
    if (column.type->is_multi_cell()) {
        // Delete all cells first, then append new ones
        collection_mutation_view_description mut;
@@ -70,13 +67,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
    if (index.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value for list index");
    }
-    if (index.is_unset_value()) {
-        throw exceptions::invalid_request_exception("Invalid unset value for list index");
-    }
    auto value = expr::evaluate(*_e, params._options);
-    if (value.is_unset_value()) {
-        return;
-    }

    auto idx = index.view().deserialize<int32_t>(*int32_type);
    auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
@@ -122,10 +113,6 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
        throw exceptions::invalid_request_exception("Invalid null value for list index");
    }

-    if (index.is_unset_value()) {
-        throw exceptions::invalid_request_exception("Invalid unset value for list index");
-    }
-
    auto ltype = static_cast<const list_type_impl*>(column.type.get());

    collection_mutation_description mut;
@@ -145,9 +132,6 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
 void
 lists::appender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
    const cql3::raw_value value = expr::evaluate(*_e, params._options);
-    if (value.is_unset_value()) {
-        return;
-    }
    assert(column.type->is_multi_cell()); // "Attempted to append to a frozen list";
    do_append(value, m, prefix, column, params);
 }
@@ -161,7 +145,7 @@ lists::do_append(const cql3::raw_value& list_value,
    if (column.type->is_multi_cell()) {
        // If we append null, do nothing. Note that for Setter, we've
        // already removed the previous value so we're good here too
-        if (list_value.is_null_or_unset()) {
+        if (list_value.is_null()) {
            return;
        }

@@ -199,7 +183,7 @@ void
 lists::prepender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
    assert(column.type->is_multi_cell()); // "Attempted to prepend to a frozen list";
    cql3::raw_value lvalue = expr::evaluate(*_e, params._options);
-    if (lvalue.is_null_or_unset()) {
+    if (lvalue.is_null()) {
        return;
    }

@@ -265,7 +249,7 @@ lists::discarder::execute(mutation& m, const clustering_key_prefix& prefix, cons
        return;
    }

-    if (lvalue.is_null_or_unset()) {
+    if (lvalue.is_null()) {
        return;
    }

@@ -304,9 +288,6 @@ lists::discarder_by_index::execute(mutation& m, const clustering_key_prefix& pre
    if (index.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value for list index");
    }
-    if (index.is_unset_value()) {
-        return;
-    }

    auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
    int32_t idx = index.view().deserialize<int32_t>(*int32_type);
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -27,21 +27,21 @@ public:
    static lw_shared_ptr<column_specification> value_spec_of(const column_specification&);
    static lw_shared_ptr<column_specification> uuid_index_spec_of(const column_specification&);
 public:
-    class setter : public operation {
+    class setter : public operation_skip_if_unset {
    public:
        setter(const column_definition& column, expr::expression e)
-                : operation(column, std::move(e)) {
+                : operation_skip_if_unset(column, std::move(e)) {
        }
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
        static void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const cql3::raw_value& value);
    };

-    class setter_by_index : public operation {
+    class setter_by_index : public operation_skip_if_unset {
    protected:
        expr::expression _idx;
    public:
        setter_by_index(const column_definition& column, expr::expression idx, expr::expression e)
-            : operation(column, std::move(e)), _idx(std::move(idx)) {
+            : operation_skip_if_unset(column, std::move(e)), _idx(std::move(idx)) {
        }
        virtual bool requires_read() const override;
        virtual void fill_prepare_context(prepare_context& ctx) override;
@@ -57,9 +57,9 @@ public:
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

-    class appender : public operation {
+    class appender : public operation_skip_if_unset {
    public:
-        using operation::operation;
+        using operation_skip_if_unset::operation_skip_if_unset;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

@@ -69,25 +69,25 @@ public:
            const column_definition& column,
            const update_parameters& params);

-    class prepender : public operation {
+    class prepender : public operation_skip_if_unset {
    public:
-        using operation::operation;
+        using operation_skip_if_unset::operation_skip_if_unset;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

-    class discarder : public operation {
+    class discarder : public operation_skip_if_unset {
    public:
        discarder(const column_definition& column, expr::expression e)
-                : operation(column, std::move(e)) {
+                : operation_skip_if_unset(column, std::move(e)) {
        }
        virtual bool requires_read() const override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

-    class discarder_by_index : public operation {
+    class discarder_by_index : public operation_skip_if_unset {
    public:
        discarder_by_index(const column_definition& column, expr::expression idx)
-                : operation(column, std::move(idx)) {
+                : operation_skip_if_unset(column, std::move(idx)) {
        }
        virtual bool requires_read() const override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -26,9 +26,6 @@ maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const u

 void
 maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params, const column_definition& column, const cql3::raw_value& value) {
-    if (value.is_unset_value()) {
-        return;
-    }
    if (column.type->is_multi_cell()) {
        // Delete all cells first, then put new ones
        collection_mutation_description mut;
@@ -50,12 +47,6 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = expr::evaluate(_k, params._options);
    auto value = expr::evaluate(*_e, params._options);
-    if (value.is_unset_value()) {
-        return;
-    }
-    if (key.is_unset_value()) {
-        throw invalid_request_exception("Invalid unset map key");
-    }
    if (key.is_null()) {
        throw invalid_request_exception("Invalid null map key");
    }
@@ -73,9 +64,7 @@ void
 maps::putter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
    assert(column.type->is_multi_cell()); // "Attempted to add items to a frozen map";
    cql3::raw_value value = expr::evaluate(*_e, params._options);
-    if (!value.is_unset_value()) {
-        do_put(m, prefix, params, value, column);
-    }
+    do_put(m, prefix, params, value, column);
 }

 void
@@ -111,9 +100,6 @@ maps::discarder_by_key::execute(mutation& m, const clustering_key_prefix& prefix
    if (key.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null map key");
    }
-    if (key.is_unset_value()) {
-        throw exceptions::invalid_request_exception("Invalid unset map key");
-    }
    collection_mutation_description mut;
    mut.cells.emplace_back(std::move(key).to_bytes(), params.make_dead_cell());

--- a/cql3/maps.hh
+++ b/cql3/maps.hh
@@ -27,30 +27,30 @@ public:
    static lw_shared_ptr<column_specification> key_spec_of(const column_specification& column);
    static lw_shared_ptr<column_specification> value_spec_of(const column_specification& column);

-    class setter : public operation {
+    class setter : public operation_skip_if_unset {
    public:
        setter(const column_definition& column, expr::expression e)
-                : operation(column, std::move(e)) {
+                : operation_skip_if_unset(column, std::move(e)) {
        }

        virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
        static void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params, const column_definition& column, const cql3::raw_value& value);
    };

-    class setter_by_key : public operation {
+    class setter_by_key : public operation_skip_if_unset {
        expr::expression _k;
    public:
        setter_by_key(const column_definition& column, expr::expression k, expr::expression e)
-            : operation(column, std::move(e)), _k(std::move(k)) {
+            : operation_skip_if_unset(column, std::move(e)), _k(std::move(k)) {
        }
        virtual void fill_prepare_context(prepare_context& ctx) override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

-    class putter : public operation {
+    class putter : public operation_skip_if_unset {
    public:
        putter(const column_definition& column, expr::expression e)
-            : operation(column, std::move(e)) {
+            : operation_skip_if_unset(column, std::move(e)) {
        }
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };
@@ -58,10 +58,10 @@ public:
    static void do_put(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params,
            const cql3::raw_value& value, const column_definition& column);

-    class discarder_by_key : public operation {
+    class discarder_by_key : public operation_no_unset_support {
    public:
        discarder_by_key(const column_definition& column, expr::expression k)
-                : operation(column, std::move(k)) {
+                : operation_no_unset_support(column, std::move(k)) {
        }
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };
--- a/cql3/operation.cc
+++ b/cql3/operation.cc
@@ -32,9 +32,9 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
    using exceptions::invalid_request_exception;
    auto rtype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!rtype) {
-        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!rtype->is_multi_cell()) {
-        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (rtype->get_kind() == abstract_type::kind::list) {
@@ -47,7 +47,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
            return make_shared<lists::setter_by_index>(receiver, std::move(idx), std::move(lval));
        }
    } else if (rtype->get_kind() == abstract_type::kind::set) {
-        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name_as_text()));
    } else if (rtype->get_kind() == abstract_type::kind::map) {
        auto key = prepare_expression(_selector, db, keyspace, nullptr, maps::key_spec_of(*receiver.column_specification));
        auto mval = prepare_expression(_value, db, keyspace, nullptr, maps::value_spec_of(*receiver.column_specification));
@@ -136,11 +136,11 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        return make_shared<constants::adder>(receiver, std::move(v));
    } else if (!ctype->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -169,14 +169,14 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
        return make_shared<constants::subtracter>(receiver, std::move(v));
    }
    if (!ctype->is_multi_cell()) {
        throw exceptions::invalid_request_exception(
-                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -211,9 +211,9 @@ operation::prepend::prepare(data_dictionary::database db, const sstring& keyspac
    auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);

    if (!dynamic_cast<const list_type_impl*>(receiver.type.get())) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name_as_text()));
    }

    return make_shared<lists::prepender>(receiver, std::move(v));
@@ -268,9 +268,9 @@ operation::set_counter_value_from_tuple_list::prepare(data_dictionary::database
    auto v = prepare_expression(_value, db, keyspace, nullptr, spec);

    // Will not be used elsewhere, so make it local.
-    class counter_setter : public operation {
+    class counter_setter : public operation_no_unset_support {
    public:
-        using operation::operation;
+        using operation_no_unset_support::operation_no_unset_support;

        bool is_raw_counter_shard_write() const override {
            return true;
@@ -340,9 +340,9 @@ operation::element_deletion::affected_column() const {
 shared_ptr<operation>
 operation::element_deletion::prepare(data_dictionary::database db, const sstring& keyspace, const column_definition& receiver) const {
    if (!receiver.type->is_collection()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name_as_text()));
    }
    auto ctype = static_pointer_cast<const collection_type_impl>(receiver.type);
    if (ctype->get_kind() == abstract_type::kind::list) {
--- a/cql3/operation.hh
+++ b/cql3/operation.hh
@@ -17,6 +17,7 @@
 #include "update_parameters.hh"
 #include "cql3/column_identifier.hh"
 #include "cql3/expr/expression.hh"
+#include "cql3/expr/unset.hh"

 #include <optional>

@@ -54,10 +55,13 @@ protected:
    // may require none of more than one expression, but most need 1 so it simplify things a bit.
    std::optional<expr::expression> _e;

+    // A guard to check if the operation should be skipped due to unset operand.
+    expr::unset_bind_variable_guard _unset_guard;
 public:
-    operation(const column_definition& column_, std::optional<expr::expression> e)
+    operation(const column_definition& column_, std::optional<expr::expression> e, expr::unset_bind_variable_guard ubvg)
        : column{column_}
        , _e(std::move(e))
+        , _unset_guard(std::move(ubvg))
    { }

    virtual ~operation() {}
@@ -87,10 +91,14 @@ public:
    }

    /**
-     * Execute the operation.
+     * Execute the operation. Check should_skip_operation() first.
     */
    virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) = 0;
-    
+
+    bool should_skip_operation(const query_options& qo) const {
+        return _unset_guard.is_unset(qo);
+    }
+
    virtual void prepare_for_broadcast_tables(statements::broadcast_tables::prepared_update&) const;

    /**
@@ -265,4 +273,18 @@ public:
    };
 };

+class operation_skip_if_unset : public operation { // operation whose unset guard is built from its own expression `e`
+public:
+    operation_skip_if_unset(const column_definition& column, expr::expression e)
+            : operation(column, e, expr::unset_bind_variable_guard(e)) { // `e` is copied, not moved: both arguments read it
+    }
+};
+
+class operation_no_unset_support : public operation { // operation whose unset guard is built from std::nullopt, not from `e`
+public:
+    operation_no_unset_support(const column_definition& column, std::optional<expr::expression> e)
+            : operation(column, std::move(e), expr::unset_bind_variable_guard(std::nullopt)) { // presumably never reports "unset" - confirm in cql3/expr/unset.hh
+    }
+};
+
 }
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -23,42 +23,43 @@ thread_local const query_options::specific_options query_options::specific_optio

 thread_local query_options query_options::DEFAULT{default_cql_config,
    db::consistency_level::ONE, std::nullopt,
-    std::vector<cql3::raw_value_view>(), false, query_options::specific_options::DEFAULT, cql_serialization_format::latest()};
+    std::vector<cql3::raw_value_view>(), false, query_options::specific_options::DEFAULT};

 query_options::query_options(const cql_config& cfg,
                           db::consistency_level consistency,
                           std::optional<std::vector<sstring_view>> names,
                           std::vector<cql3::raw_value> values,
                           std::vector<cql3::raw_value_view> value_views,
+                           cql3::unset_bind_variable_vector unset,
                           bool skip_metadata,
-                           specific_options options,
-                           cql_serialization_format sf)
+                           specific_options options
+                           )
   : _cql_config(cfg)
   , _consistency(consistency)
   , _names(std::move(names))
   , _values(std::move(values))
   , _value_views(value_views)
+   , _unset(std::move(unset)) // by-value parameter, not used afterwards: move instead of copying
   , _skip_metadata(skip_metadata)
   , _options(std::move(options))
-   , _cql_serialization_format(sf)
 {
 }

 query_options::query_options(const cql_config& cfg,
                             db::consistency_level consistency,
                             std::optional<std::vector<sstring_view>> names,
-                             std::vector<cql3::raw_value> values,
+                             cql3::raw_value_vector_with_unset values,
                             bool skip_metadata,
-                             specific_options options,
-                             cql_serialization_format sf)
+                             specific_options options
+                             )
    : _cql_config(cfg)
    , _consistency(consistency)
    , _names(std::move(names))
-    , _values(std::move(values))
+    , _values(std::move(values.values))
    , _value_views()
+    , _unset(std::move(values.unset))
    , _skip_metadata(skip_metadata)
    , _options(std::move(options))
-    , _cql_serialization_format(sf)
 {
    fill_value_views();
 }
@@ -66,22 +67,22 @@ query_options::query_options(const cql_config& cfg,
 query_options::query_options(const cql_config& cfg,
                             db::consistency_level consistency,
                             std::optional<std::vector<sstring_view>> names,
-                             std::vector<cql3::raw_value_view> value_views,
+                             cql3::raw_value_view_vector_with_unset value_views,
                             bool skip_metadata,
-                             specific_options options,
-                             cql_serialization_format sf)
+                             specific_options options
+                             )
    : _cql_config(cfg)
    , _consistency(consistency)
    , _names(std::move(names))
    , _values()
-    , _value_views(std::move(value_views))
+    , _value_views(std::move(value_views.values))
+    , _unset(std::move(value_views.unset))
    , _skip_metadata(skip_metadata)
    , _options(std::move(options))
-    , _cql_serialization_format(sf)
 {
 }

-query_options::query_options(db::consistency_level cl, std::vector<cql3::raw_value> values,
+query_options::query_options(db::consistency_level cl, cql3::raw_value_vector_with_unset values,
        specific_options options)
    : query_options(
          default_cql_config,
@@ -89,8 +90,7 @@ query_options::query_options(db::consistency_level cl, std::vector<cql3::raw_val
          {},
          std::move(values),
          false,
-          std::move(options),
-          cql_serialization_format::latest()
+          std::move(options)
      )
 {
 }
@@ -101,9 +101,9 @@ query_options::query_options(std::unique_ptr<query_options> qo, lw_shared_ptr<se
        std::move(qo->_names),
        std::move(qo->_values),
        std::move(qo->_value_views),
+        std::move(qo->_unset),
        qo->_skip_metadata,
-        query_options::specific_options{qo->_options.page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp},
-        qo->_cql_serialization_format) {
+        query_options::specific_options{qo->_options.page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}) {

 }

@@ -113,13 +113,13 @@ query_options::query_options(std::unique_ptr<query_options> qo, lw_shared_ptr<se
        std::move(qo->_names),
        std::move(qo->_values),
        std::move(qo->_value_views),
+        std::move(qo->_unset),
        qo->_skip_metadata,
-        query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp},
-        qo->_cql_serialization_format) {
+        query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}) {

 }

-query_options::query_options(std::vector<cql3::raw_value> values)
+query_options::query_options(cql3::raw_value_vector_with_unset values)
    : query_options(
          db::consistency_level::ONE, std::move(values))
 {}
@@ -135,12 +135,21 @@ void query_options::prepare(const std::vector<lw_shared_ptr<column_specification
    ordered_values.reserve(specs.size());
    for (auto&& spec : specs) {
        auto& spec_name = spec->name->text();
+        bool found_value_for_name = false;
        for (size_t j = 0; j < names.size(); j++) {
            if (names[j] == spec_name) {
                ordered_values.emplace_back(_value_views[j]);
+                found_value_for_name = true;
                break;
            }
        }
+
+        // No bound value was found with the name `spec_name`.
+        // This means that the user forgot to include a bound value with such name.
+        if (!found_value_for_name) {
+            throw exceptions::invalid_request_exception(
+                format("Missing value for bind marker with name: {}", spec_name));
+        }
    }
    _value_views = std::move(ordered_values);
 }
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -11,13 +11,14 @@
 #pragma once

 #include <concepts>
+#include <initializer_list>
 #include "timestamp.hh"
 #include "bytes.hh"
 #include "db/consistency_level_type.hh"
 #include "service/query_state.hh"
 #include "service/pager/paging_state.hh"
 #include "cql3/values.hh"
-#include "cql_serialization_format.hh"
+#include "utils/small_vector.hh"

 namespace cql3 {

@@ -28,6 +29,38 @@ class column_specification;

 using computed_function_values = std::unordered_map<uint8_t, bytes_opt>;

+using unset_bind_variable_vector = utils::small_vector<bool, 16>;
+
+// Matches a raw_value_view with an unset vector to support CQL binary protocol
+// "unset" values.
+struct raw_value_view_vector_with_unset {
+    std::vector<raw_value_view> values; // the bound value views
+    unset_bind_variable_vector unset; // "unset" flags; the single-argument constructor sizes this to match `values`
+
+    raw_value_view_vector_with_unset(std::vector<raw_value_view> values_, unset_bind_variable_vector unset_) : values(std::move(values_)), unset(std::move(unset_)) {}
+    // Constructor with no unset support, for tests and internal queries
+    raw_value_view_vector_with_unset(std::vector<raw_value_view> values_) : values(std::move(values_)) {
+        unset.resize(values.size()); // one flag per value; presumably all false (nothing unset) - confirm small_vector::resize value-initializes
+    }
+    raw_value_view_vector_with_unset() = default;
+};
+
+// Matches a raw_value with an unset vector to support CQL binary protocol
+// "unset" values.
+struct raw_value_vector_with_unset {
+    std::vector<raw_value> values; // the bound values
+    unset_bind_variable_vector unset; // "unset" flags; the single-argument constructor sizes this to match `values`
+
+    raw_value_vector_with_unset(std::vector<raw_value> values_, unset_bind_variable_vector unset_) : values(std::move(values_)), unset(std::move(unset_)) {}
+    // Constructor with no unset support, for tests and internal queries
+    raw_value_vector_with_unset(std::vector<raw_value> values_) : values(std::move(values_)) {
+        unset.resize(values.size()); // one flag per value; presumably all false (nothing unset) - confirm small_vector::resize value-initializes
+    }
+    // Mostly for testing.
+    raw_value_vector_with_unset(std::initializer_list<raw_value> values_) : raw_value_vector_with_unset(std::vector(values_)) {} // delegates to the std::vector overload above
+    raw_value_vector_with_unset() = default;
+};
+
 /**
 * Options for a query.
 */
@@ -48,9 +81,9 @@ private:
    const std::optional<std::vector<sstring_view>> _names;
    std::vector<cql3::raw_value> _values;
    std::vector<cql3::raw_value_view> _value_views;
+    unset_bind_variable_vector _unset;
    const bool _skip_metadata;
    const specific_options _options;
-    cql_serialization_format _cql_serialization_format;
    std::optional<std::vector<query_options>> _batch_options;
    // We must use the same microsecond-precision timestamp for
    // all cells created by an LWT statement or when a statement
@@ -83,23 +116,10 @@ private:
    // evaluation sites and we only have a const reference to `query_options`.
    mutable computed_function_values _cached_pk_fn_calls;
 private:
-    /**
-     * @brief Batch query_options constructor.
-     *
-     * Requirements:
-     *   - @tparam OneMutationDataRange has a begin() and end() iterators.
-     *   - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
-     *
-     * @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
-     * @param values_ranges a vector of values ranges for each statement in the batch.
-     */
-    template<typename OneMutationDataRange>
-    requires requires (OneMutationDataRange range) {
-         std::begin(range);
-         std::end(range);
-    } && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
-           requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
-    explicit query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges);
+    // Batch constructor.
+    template <typename Values>
+    requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
+    explicit query_options(query_options&& o, std::vector<Values> values_ranges);

 public:
    query_options(query_options&&) = default;
@@ -108,43 +128,30 @@ public:
    explicit query_options(const cql_config& cfg,
                           db::consistency_level consistency,
                           std::optional<std::vector<sstring_view>> names,
-                           std::vector<cql3::raw_value> values,
+                           raw_value_vector_with_unset values,
                           bool skip_metadata,
-                           specific_options options,
-                           cql_serialization_format sf);
+                           specific_options options
+                           );
    explicit query_options(const cql_config& cfg,
                           db::consistency_level consistency,
                           std::optional<std::vector<sstring_view>> names,
                           std::vector<cql3::raw_value> values,
                           std::vector<cql3::raw_value_view> value_views,
+                           unset_bind_variable_vector unset,
                           bool skip_metadata,
-                           specific_options options,
-                           cql_serialization_format sf);
+                           specific_options options
+                           );
    explicit query_options(const cql_config& cfg,
                           db::consistency_level consistency,
                           std::optional<std::vector<sstring_view>> names,
-                           std::vector<cql3::raw_value_view> value_views,
+                           raw_value_view_vector_with_unset value_views,
                           bool skip_metadata,
-                           specific_options options,
-                           cql_serialization_format sf);
+                           specific_options options
+                           );

-    /**
-     * @brief Batch query_options factory.
-     *
-     * Requirements:
-     *   - @tparam OneMutationDataRange has a begin() and end() iterators.
-     *   - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
-     *
-     * @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
-     * @param values_ranges a vector of values ranges for each statement in the batch.
-     */
-    template<typename OneMutationDataRange>
-    requires requires (OneMutationDataRange range) {
-         std::begin(range);
-         std::end(range);
-    } && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
-           requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
-    static query_options make_batch_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges) {
+    template <typename Values>
+    requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
+    static query_options make_batch_options(query_options&& o, std::vector<Values> values_ranges) {
        return query_options(std::move(o), std::move(values_ranges));
    }

@@ -152,8 +159,8 @@ public:
    static thread_local query_options DEFAULT;

    // forInternalUse
-    explicit query_options(std::vector<cql3::raw_value> values);
-    explicit query_options(db::consistency_level, std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
+    explicit query_options(raw_value_vector_with_unset values);
+    explicit query_options(db::consistency_level, raw_value_vector_with_unset values, specific_options options = specific_options::DEFAULT);
    explicit query_options(std::unique_ptr<query_options>, lw_shared_ptr<service::pager::paging_state> paging_state);
    explicit query_options(std::unique_ptr<query_options>, lw_shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);

@@ -162,7 +169,14 @@ public:
    }

    cql3::raw_value_view get_value_at(size_t idx) const {
-        return _value_views.at(idx);
+        if (_unset.at(idx)) {
+            throw exceptions::invalid_request_exception(fmt::format("Unexpected unset value for bind variable {}", idx));
+        }
+        return _value_views[idx];
+    }
+
+    bool is_unset(size_t idx) const {
+        return _unset.at(idx);
    }

    size_t get_values_count() const {
@@ -195,18 +209,6 @@ public:
        return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
    }

-    /**
-     * The protocol version for the query. Will be 3 if the object don't come from
-     * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
-     */
-    int get_protocol_version() const {
-        return _cql_serialization_format.protocol_version();
-    }
-
-    cql_serialization_format get_cql_serialization_format() const {
-        return _cql_serialization_format;
-    }
-
    const query_options::specific_options& get_specific_options() const {
        return _options;
    }
@@ -278,19 +280,15 @@ private:
    void fill_value_views();
 };

-template<typename OneMutationDataRange>
-requires requires (OneMutationDataRange range) {
-     std::begin(range);
-     std::end(range);
-} && ( requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value_view>; } ||
-       requires (OneMutationDataRange range) { { *range.begin() } -> std::convertible_to<raw_value>; } )
-query_options::query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges)
+template <typename Values>
+requires std::same_as<Values, raw_value_vector_with_unset> || std::same_as<Values, raw_value_view_vector_with_unset>
+query_options::query_options(query_options&& o, std::vector<Values> values_ranges)
    : query_options(std::move(o))
 {
    std::vector<query_options> tmp;
    tmp.reserve(values_ranges.size());
    std::transform(values_ranges.begin(), values_ranges.end(), std::back_inserter(tmp), [this](auto& values_range) {
-        return query_options(_cql_config, _consistency, {}, std::move(values_range), _skip_metadata, _options, _cql_serialization_format);
+        return query_options(_cql_config, _consistency, {}, std::move(values_range), _skip_metadata, _options);
    });
    _batch_options = std::move(tmp);
 }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -22,6 +22,7 @@
 #include "db/config.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "hashers.hh"
+#include "utils/error_injection.hh"

 namespace cql3 {

@@ -600,6 +601,14 @@ query_processor::get_statement(const sstring_view& query, const service::client_
 std::unique_ptr<raw::parsed_statement>
 query_processor::parse_statement(const sstring_view& query) {
    try {
+        {
+            const char* error_injection_key = "query_processor-parse_statement-test_failure";
+            utils::get_local_injector().inject(error_injection_key, [&]() {
+                if (query.find(error_injection_key) != sstring_view::npos) {
+                    throw std::runtime_error(error_injection_key);
+                }
+            });
+        }
        auto statement = util::do_with_parser(query,  std::mem_fn(&cql3_parser::CqlParser::query));
        if (!statement) {
            throw exceptions::syntax_exception("Parsing failed");
@@ -923,6 +932,9 @@ void query_processor::migration_subscriber::on_update_aggregate(const sstring& k
 void query_processor::migration_subscriber::on_update_view(
        const sstring& ks_name,
        const sstring& view_name, bool columns_changed) {
+    // scylladb/scylladb#16392 - Materialized views are also tables, so we need to at least handle
+    // them as such when changed.
+    on_update_column_family(ks_name, view_name, columns_changed);
 }

 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -153,10 +153,6 @@ static std::vector<expr::expression> extract_partition_range(
            on_internal_error(rlogger, "extract_partition_range(field_selection)");
        }

-        void operator()(const null&) {
-            on_internal_error(rlogger, "extract_partition_range(null)");
-        }
-
        void operator()(const bind_variable&) {
            on_internal_error(rlogger, "extract_partition_range(bind_variable)");
        }
@@ -278,10 +274,6 @@ static std::vector<expr::expression> extract_clustering_prefix_restrictions(
            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(field_selection)");
        }

-        void operator()(const null&) {
-            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(null)");
-        }
-
        void operator()(const bind_variable&) {
            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(bind_variable)");
        }
@@ -1239,10 +1231,6 @@ struct multi_column_range_accumulator {
        on_internal_error(rlogger, "field selection encountered outside binary operator");
    }

-    void operator()(const null&) {
-        on_internal_error(rlogger, "null encountered outside binary operator");
-    }
-
    void operator()(const bind_variable&) {
        on_internal_error(rlogger, "bind variable encountered outside binary operator");
    }
@@ -1800,7 +1788,7 @@ void statement_restrictions::prepare_indexed_global(const schema& idx_tbl_schema
            oper_t::EQ,
            // TODO: This should be a unique marker whose value we set at execution time.  There is currently no
            // handy mechanism for doing that in query_options.
-            expr::constant::make_unset_value(token_column->type));
+            expr::constant::make_null(token_column->type));
 }

 void statement_restrictions::prepare_indexed_local(const schema& idx_tbl_schema) {
--- a/cql3/selection/aggregate_function_selector.hh
+++ b/cql3/selection/aggregate_function_selector.hh
@@ -9,7 +9,6 @@

 #include "abstract_function_selector.hh"
 #include "cql3/functions/aggregate_function.hh"
-#include "cql_serialization_format.hh"

 #pragma once

@@ -24,20 +23,20 @@ public:
        return true;
    }

-    virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
+    virtual void add_input(result_set_builder& rs) override {
        // Aggregation of aggregation is not supported
        size_t m = _arg_selectors.size();
        for (size_t i = 0; i < m; ++i) {
            auto&& s = _arg_selectors[i];
-            s->add_input(sf, rs);
-            _args[i] = s->get_output(sf);
+            s->add_input(rs);
+            _args[i] = s->get_output();
            s->reset();
        }
-        _aggregate->add_input(sf, _args);
+        _aggregate->add_input(_args);
    }

-    virtual bytes_opt get_output(cql_serialization_format sf) override {
-        return _aggregate->compute(sf);
+    virtual bytes_opt get_output() override {
+        return _aggregate->compute();
    }

    virtual void reset() override {
--- a/cql3/selection/field_selector.hh
+++ b/cql3/selection/field_selector.hh
@@ -13,7 +13,6 @@
 #include "selector.hh"
 #include "types.hh"
 #include "types/user.hh"
-#include "cql_serialization_format.hh"

 namespace cql3 {

@@ -59,12 +58,12 @@ public:
        return false;
    }

-    virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
-        _selected->add_input(sf, rs);
+    virtual void add_input(result_set_builder& rs) override {
+        _selected->add_input(rs);
    }

-    virtual bytes_opt get_output(cql_serialization_format sf) override {
-        auto&& value = _selected->get_output(sf);
+    virtual bytes_opt get_output() override {
+        auto&& value = _selected->get_output();
        if (!value) {
            return std::nullopt;
        }
@@ -81,7 +80,7 @@ public:

    virtual sstring assignment_testable_source_context() const override {
        auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
        return format("{}.{}", _selected, sname);
    }

--- a/cql3/selection/scalar_function_selector.hh
+++ b/cql3/selection/scalar_function_selector.hh
@@ -11,7 +11,6 @@

 #include "abstract_function_selector.hh"
 #include "cql3/functions/scalar_function.hh"
-#include "cql_serialization_format.hh"

 namespace cql3 {

@@ -28,25 +27,25 @@ public:
        return _arg_selectors[0]->is_aggregate();
    }

-    virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
+    virtual void add_input(result_set_builder& rs) override {
        size_t m = _arg_selectors.size();
        for (size_t i = 0; i < m; ++i) {
            auto&& s = _arg_selectors[i];
-            s->add_input(sf, rs);
+            s->add_input(rs);
        }
    }

    virtual void reset() override {
    }

-    virtual bytes_opt get_output(cql_serialization_format sf) override {
+    virtual bytes_opt get_output() override {
        size_t m = _arg_selectors.size();
        for (size_t i = 0; i < m; ++i) {
            auto&& s = _arg_selectors[i];
-            _args[i] = s->get_output(sf);
+            _args[i] = s->get_output();
            s->reset();
        }
-        return fun()->execute(sf, _args);
+        return fun()->execute(_args);
    }

    virtual bool requires_thread() const override;
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -222,9 +222,6 @@ prepare_selectable(const schema& s, const expr::expression& raw_selectable) {
            return make_shared<selectable::with_field_selection>(prepare_selectable(s, fs.structure),
                    fs.field->prepare(s));
        },
-        [&] (const expr::null&) -> shared_ptr<selectable> {
-            on_internal_error(slogger, "null found its way to selector context");
-        },
        [&] (const expr::bind_variable&) -> shared_ptr<selectable> {
            on_internal_error(slogger, "bind_variable found its way to selector context");
        },
@@ -283,9 +280,6 @@ selectable_processes_selection(const expr::expression& raw_selectable) {
        [&] (const expr::field_selection& fs) -> bool {
            return true;
        },
-        [&] (const expr::null&) -> bool {
-            on_internal_error(slogger, "null found its way to selector context");
-        },
        [&] (const expr::bind_variable&) -> bool {
            on_internal_error(slogger, "bind_variable found its way to selector context");
        },
--- a/Show More
+++ b/Show More