Update seastar submodule

* seastar e45cef9c...1b299004 (3): > rpc: Abort server connection streams on stop > rpc: Do not register stream to dying parent > rpc: Fix client-side stream registration race refs: #13100 Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
scylla_fstrim_setup: start scylla-fstrim.timer on setup
2023-09-06 12:35:37 +03:00 · 2023-07-18 16:03:53 +03:00 · 2023-07-14 18:18:05 +03:00 · 2023-07-14 15:48:28 +03:00 · 2023-07-13 22:48:36 +03:00 · 2023-07-13 22:48:30 +03:00
539 changed files with 17043 additions and 15792 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
 set(CMAKE_CXX_VISIBILITY_PRESET hidden)

 set(Seastar_TESTING ON CACHE BOOL "" FORCE)
+set(Seastar_API_LEVEL 6 CACHE STRING "" FORCE)
 add_subdirectory(seastar)

 # System libraries dependencies
@@ -183,12 +184,25 @@ target_link_libraries(scylla PRIVATE
 # Force SHA1 build-id generation
 set(default_linker_flags "-Wl,--build-id=sha1")
 include(CheckLinkerFlag)
-foreach(linker "lld" "gold")
+set(Scylla_USE_LINKER
+    ""
+    CACHE
+    STRING
+    "Use specified linker instead of the default one")
+if(Scylla_USE_LINKER)
+    set(linkers "${Scylla_USE_LINKER}")
+else()
+    set(linkers "lld" "gold")
+endif()
+
+foreach(linker ${linkers})
    set(linker_flag "-fuse-ld=${linker}")
    check_linker_flag(CXX ${linker_flag} "CXX_LINKER_HAVE_${linker}")
    if(CXX_LINKER_HAVE_${linker})
        string(APPEND default_linker_flags " ${linker_flag}")
        break()
+    elseif(Scylla_USE_LINKER)
+        message(FATAL_ERROR "${Scylla_USE_LINKER} is not supported.")
    endif()
 endforeach()

--- a/2
+++ b/2
@@ -72,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.3.0-dev
+VERSION=5.3.0-rc1

 if test -f version
 then
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -53,7 +53,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
    if (result_set->empty()) {
        co_await coroutine::return_exception(api_error::unrecognized_client(format("User not found: {}", username)));
    }
-    const bytes_opt& salted_hash = result_set->rows().front().front(); // We only asked for 1 row and 1 column
+    const managed_bytes_opt& salted_hash = result_set->rows().front().front(); // We only asked for 1 row and 1 column
    if (!salted_hash) {
        co_await coroutine::return_exception(api_error::unrecognized_client(format("No password found for user: {}", username)));
    }
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -76,13 +76,16 @@ future<> controller::start_server() {
        _ssg = create_smp_service_group(c).get0();

        rmw_operation::set_default_write_isolation(_config.alternator_write_isolation());
-        executor::set_default_timeout(std::chrono::milliseconds(_config.alternator_timeout_in_ms()));

        net::inet_address addr = utils::resolve(_config.alternator_address, family).get0();

        auto get_cdc_metadata = [] (cdc::generation_service& svc) { return std::ref(svc.get_cdc_metadata()); };
-
-        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
+        auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
+            return cfg.alternator_timeout_in_ms;
+        };
+        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
+                        sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
+                        sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
        _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
        // Note: from this point on, if start_server() throws for any reason,
        // it must first call stop_server() to stop the executor and server
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -6,8 +6,6 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

-#include <regex>
-
 #include "utils/base64.hh"

 #include <seastar/core/sleep.hh>
@@ -90,17 +88,20 @@ json::json_return_type make_streamed(rjson::value&& value) {
        // move objects to coroutine frame.
        auto los = std::move(os);
        auto lrs = std::move(rs);
+        std::exception_ptr ex;
        try {
            co_await rjson::print(*lrs, los);
-            co_await los.flush();
-            co_await los.close();
        } catch (...) {
            // at this point, we cannot really do anything. HTTP headers and return code are
            // already written, and quite potentially a portion of the content data.
            // just log + rethrow. It is probably better the HTTP server closes connection
            // abruptly or something...
-            elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
-            throw;
+            ex = std::current_exception();
+            elogger.error("Exception during streaming HTTP response: {}", ex);
+        }
+        co_await los.close();
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
        }
        co_return;
    };
@@ -535,7 +536,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
        }

        auto m = co_await mm.prepare_column_family_drop_announcement(keyspace_name, table_name, group0_guard.write_timestamp(), service::migration_manager::drop_views::yes);
-        auto m2 = mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());
+        auto m2 = co_await mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());

        std::move(m2.begin(), m2.end(), std::back_inserter(m));

@@ -1365,14 +1366,11 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) co

 // The DynamoDB API doesn't let the client control the server's timeout, so
 // we have a global default_timeout() for Alternator requests. The value of
-// s_default_timeout is overwritten in alternator::controller::start_server()
+// s_default_timeout_in_ms is overwritten in alternator::controller::start_server()
 // based on the "alternator_timeout_in_ms" configuration parameter.
-db::timeout_clock::duration executor::s_default_timeout = 10s;
-void executor::set_default_timeout(db::timeout_clock::duration timeout) {
-    s_default_timeout = timeout;
-}
+thread_local utils::updateable_value<uint32_t> executor::s_default_timeout_in_ms{10'000};
 db::timeout_clock::time_point executor::default_timeout() {
-    return db::timeout_clock::now() + s_default_timeout;
+    return db::timeout_clock::now() + std::chrono::milliseconds(s_default_timeout_in_ms);
 }
        
 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -2300,14 +2298,14 @@ static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& re
 * as before.
 */ 
 void executor::describe_single_item(const cql3::selection::selection& selection,
-    const std::vector<bytes_opt>& result_row,
+    const std::vector<managed_bytes_opt>& result_row,
    const std::optional<attrs_to_get>& attrs_to_get,
    rjson::value& item,
    bool include_all_embedded_attributes) 
 {
    const auto& columns = selection.get_columns();
    auto column_it = columns.begin();
-    for (const bytes_opt& cell : result_row) {
+    for (const managed_bytes_opt& cell : result_row) {
        std::string column_name = (*column_it)->name_as_text();
        if (cell && column_name != executor::ATTRS_COLUMN_NAME) {
            if (!attrs_to_get || attrs_to_get->contains(column_name)) {
@@ -2315,7 +2313,9 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
                // so add() makes sense
                rjson::add_with_string_name(item, column_name, rjson::empty_object());
                rjson::value& field = item[column_name.c_str()];
-                rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(*cell, **column_it));
+                cell->with_linearized([&] (bytes_view linearized_cell) {
+                    rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it));
+                });
            }
        } else if (cell) {
            auto deserialized = attrs_type()->deserialize(*cell);
@@ -2371,21 +2371,22 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
    return item;
 }

-std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get) {
-    cql3::selection::result_set_builder builder(selection, gc_clock::now());
-    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
+future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get) {
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
    auto result_set = builder.build();
    std::vector<rjson::value> ret;
    for (auto& result_row : result_set->rows()) {
        rjson::value item = rjson::empty_object();
-        describe_single_item(selection, result_row, attrs_to_get, item);
+        describe_single_item(*selection, result_row, *attrs_to_get, item);
        ret.push_back(std::move(item));
+        co_await coroutine::maybe_yield();
    }
-    return ret;
+    co_return ret;
 }

 static bool check_needs_read_before_write(const parsed::value& v) {
@@ -3257,8 +3258,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
-                std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
-                return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
+                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get));
            });
            response_futures.push_back(std::move(f));
        }
@@ -3498,7 +3498,7 @@ public:
        _column_it = _columns.begin();
    }

-    void accept_value(const std::optional<query::result_bytes_view>& result_bytes_view) {
+    void accept_value(managed_bytes_view_opt result_bytes_view) {
        if (!result_bytes_view) {
            ++_column_it;
            return;
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -22,6 +22,7 @@
 #include "alternator/error.hh"
 #include "stats.hh"
 #include "utils/rjson.hh"
+#include "utils/updateable_value.hh"

 namespace db {
    class system_distributed_keyspace;
@@ -170,8 +171,16 @@ public:
    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";

-    executor(gms::gossiper& gossiper, service::storage_proxy& proxy, service::migration_manager& mm, db::system_distributed_keyspace& sdks, cdc::metadata& cdc_metadata, smp_service_group ssg)
-        : _gossiper(gossiper), _proxy(proxy), _mm(mm), _sdks(sdks), _cdc_metadata(cdc_metadata), _ssg(ssg) {}
+    executor(gms::gossiper& gossiper,
+             service::storage_proxy& proxy,
+             service::migration_manager& mm,
+             db::system_distributed_keyspace& sdks,
+             cdc::metadata& cdc_metadata,
+             smp_service_group ssg,
+             utils::updateable_value<uint32_t> default_timeout_in_ms)
+        : _gossiper(gossiper), _proxy(proxy), _mm(mm), _sdks(sdks), _cdc_metadata(cdc_metadata), _ssg(ssg) {
+        s_default_timeout_in_ms = std::move(default_timeout_in_ms);
+    }

    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
@@ -199,13 +208,16 @@ public:
    future<request_return_type> describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request);

    future<> start();
-    future<> stop() { return make_ready_future<>(); }
+    future<> stop() {
+        // disconnect from the value source, but keep the value unchanged.
+        s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
+        return make_ready_future<>();
+    }

    static sstring table_name(const schema&);
    static db::timeout_clock::time_point default_timeout();
-    static void set_default_timeout(db::timeout_clock::duration timeout);
 private:
-    static db::timeout_clock::duration s_default_timeout;
+    static thread_local utils::updateable_value<uint32_t> s_default_timeout_in_ms;
 public:
    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);

@@ -222,14 +234,14 @@ public:
        const query::result&,
        const std::optional<attrs_to_get>&);

-    static std::vector<rjson::value> describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get);
+    static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);

    static void describe_single_item(const cql3::selection::selection&,
-        const std::vector<bytes_opt>&,
+        const std::vector<managed_bytes_opt>&,
        const std::optional<attrs_to_get>&,
        rjson::value&,
        bool = false);
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -50,6 +50,115 @@ type_representation represent_type(alternator_type atype) {
    return it->second;
 }

+// Get the magnitude and precision of a big_decimal - as these concepts are
+// defined by DynamoDB - to allow us to enforce limits on those as explained
+// in issue #6794. The "magnitude" of 9e123 is 123 and of -9e-123 is -123,
+// the "precision" of 12.34e56 is the number of significant digits - 4.
+//
+// Unfortunately it turned out to be quite difficult to take a big_decimal and
+// calculate its magnitude and precision from its scale() and unscaled_value().
+// So in the following ugly implementation we calculate them from the string
+// representation instead. We assume the number was already parsed
+// successfully to a big_decimal so it follows its syntax rules.
+//
+// FIXME: rewrite this function to take a big_decimal, not a string.
+// Maybe a snippet like this can help:
+// boost::multiprecision::cpp_int digits = boost::multiprecision::log10(num.unscaled_value().convert_to<boost::multiprecision::mpf_float_50>()).convert_to<boost::multiprecision::cpp_int>() + 1;
+
+
+internal::magnitude_and_precision internal::get_magnitude_and_precision(std::string_view s) {
+    size_t e_or_end = s.find_first_of("eE");
+    std::string_view base = s.substr(0, e_or_end);
+    if (s[0]=='-' || s[0]=='+') {
+        base = base.substr(1);
+    }
+    int magnitude = 0;
+    int precision = 0;
+    size_t dot_or_end = base.find_first_of(".");
+    size_t nonzero = base.find_first_not_of("0");
+    if (dot_or_end != std::string_view::npos) {
+        if (nonzero == dot_or_end) {
+            // 0.000031 => magnitude = -5 (like 3.1e-5), precision = 2.
+            std::string_view fraction = base.substr(dot_or_end + 1);
+            size_t nonzero2 = fraction.find_first_not_of("0");
+            if (nonzero2 != std::string_view::npos) {
+                magnitude = -nonzero2 - 1;
+                precision = fraction.size() - nonzero2;
+            }
+        } else {
+            // 000123.45678 => magnitude = 2, precision = 8.
+            magnitude = dot_or_end - nonzero - 1;
+            precision = base.size() - nonzero - 1;
+        }
+        // trailing zeros don't count to precision, e.g., precision
+        // of 1000.0, 1.0 or 1.0000 are just 1.
+        size_t last_significant = base.find_last_not_of(".0");
+        if (last_significant == std::string_view::npos) {
+            precision = 0;
+        } else if (last_significant < dot_or_end) {
+            // e.g., 1000.00 reduce 5 = 7 - (0+1) - 1 from precision
+            precision -= base.size() - last_significant - 2;
+        } else {
+            // e.g., 1235.60 reduce 1 = 7 - (5+1) from precision
+            precision -= base.size() - last_significant - 1;
+        }
+    } else if (nonzero == std::string_view::npos) {
+        // all-zero integer 000000
+        magnitude = 0;
+        precision = 0;
+    } else {
+        magnitude = base.size() - 1 - nonzero;
+        precision = base.size() - nonzero;
+        // trailing zeros don't count to precision, e.g., precision
+        // of 1000 is just 1.
+        size_t last_significant = base.find_last_not_of("0");
+        if (last_significant == std::string_view::npos) {
+            precision = 0;
+        } else {
+            // e.g., 1000 reduce 3 = 4 - (0+1)
+            precision -= base.size() - last_significant - 1;
+        }
+    }
+    if (precision && e_or_end != std::string_view::npos) {
+        std::string_view exponent = s.substr(e_or_end + 1);
+        if (exponent.size() > 4) {
+            // don't even bother atoi(), exponent is too large
+            magnitude = exponent[0]=='-' ? -9999 : 9999;
+        } else {
+            try {
+                magnitude += boost::lexical_cast<int32_t>(exponent);
+            } catch (...) {
+                magnitude = 9999;
+            }
+        }
+    }
+    return magnitude_and_precision {magnitude, precision};
+}
+
+// Parse a number read from user input, validating that it has a valid
+// numeric format and also in the allowed magnitude and precision ranges
+// (see issue #6794). Throws an api_error::validation if the validation
+// failed.
+static big_decimal parse_and_validate_number(std::string_view s) {
+    try {
+        big_decimal ret(s);
+        auto [magnitude, precision] = internal::get_magnitude_and_precision(s);
+        if (magnitude > 125) {
+            throw api_error::validation(format("Number overflow: {}. Attempting to store a number with magnitude larger than supported range.", s));
+        }
+        if (magnitude < -130) {
+            throw api_error::validation(format("Number underflow: {}. Attempting to store a number with magnitude lower than supported range.", s));
+        }
+        if (precision > 38) {
+            throw api_error::validation(format("Number too precise: {}. Attempting to store a number with more significant digits than supported.", s));
+        }
+        return ret;
+    } catch (const marshal_exception& e) {
+        throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", s));
+    }
+
+}
+
 struct from_json_visitor {
    const rjson::value& v;
    bytes_ostream& bo;
@@ -67,11 +176,7 @@ struct from_json_visitor {
        bo.write(boolean_type->decompose(v.GetBool()));
    }
    void operator()(const decimal_type_impl& t) const {
-        try {
-            bo.write(t.from_string(rjson::to_string_view(v)));
-        } catch (const marshal_exception& e) {
-            throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", v));
-        }
+        bo.write(decimal_type->decompose(parse_and_validate_number(rjson::to_string_view(v))));
    }
    // default
    void operator()(const abstract_type& t) const {
@@ -203,6 +308,8 @@ bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column
        // FIXME: it's difficult at this point to get information if value was provided
        // in request or comes from the storage, for now we assume it's user's fault.
        return *unwrap_bytes(value, true);
+    } else if (column.type == decimal_type) {
+        return decimal_type->decompose(parse_and_validate_number(rjson::to_string_view(value)));
    } else {
        return column.type->from_string(value_view);
    }
@@ -295,16 +402,13 @@ big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    if (it->name != "N") {
        throw api_error::validation(format("{}: expected number, found type '{}'", diagnostic, it->name));
    }
-    try {
-        if (!it->value.IsString()) {
-            // We shouldn't reach here. Callers normally validate their input
-            // earlier with validate_value().
-            throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
-        }
-        return big_decimal(rjson::to_string_view(it->value));
-    } catch (const marshal_exception& e) {
-        throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", it->value));
+    if (!it->value.IsString()) {
+        // We shouldn't reach here. Callers normally validate their input
+        // earlier with validate_value().
+        throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
    }
+    big_decimal ret = parse_and_validate_number(rjson::to_string_view(it->value));
+    return ret;
 }

 std::optional<big_decimal> try_unwrap_number(const rjson::value& v) {
@@ -316,8 +420,8 @@ std::optional<big_decimal> try_unwrap_number(const rjson::value& v) {
        return std::nullopt;
    }
    try {
-        return big_decimal(rjson::to_string_view(it->value));
-    } catch (const marshal_exception& e) {
+        return parse_and_validate_number(rjson::to_string_view(it->value));
+    } catch (api_error&) {
        return std::nullopt;
    }
 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -94,5 +94,12 @@ std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value&
 // Returns a null value if one of the arguments is not actually a list.
 rjson::value list_concatenate(const rjson::value& v1, const rjson::value& v2);

+namespace internal {
+struct magnitude_and_precision {
+    int magnitude;
+    int precision;
+};
+magnitude_and_precision get_magnitude_and_precision(std::string_view);
+}

 }
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -241,7 +241,7 @@ static bool is_expired(const rjson::value& expiration_time, gc_clock::time_point
 // understands it is an expiration event - not a user-initiated deletion.
 static future<> expire_item(service::storage_proxy& proxy,
                            const service::query_state& qs,
-                            const std::vector<bytes_opt>& row,
+                            const std::vector<managed_bytes_opt>& row,
                            schema_ptr schema,
                            api::timestamp_type ts) {
    // Prepare the row key to delete
@@ -260,7 +260,7 @@ static future<> expire_item(service::storage_proxy& proxy,
            // FIXME: log or increment a metric if this happens.
            return make_ready_future<>();
        }
-        exploded_pk.push_back(*row_c);
+        exploded_pk.push_back(to_bytes(*row_c));
    }
    auto pk = partition_key::from_exploded(exploded_pk);
    mutation m(schema, pk);
@@ -280,7 +280,7 @@ static future<> expire_item(service::storage_proxy& proxy,
                // FIXME: log or increment a metric if this happens.
                return make_ready_future<>();
            }
-            exploded_ck.push_back(*row_c);
+            exploded_ck.push_back(to_bytes(*row_c));
        }
        auto ck = clustering_key::from_exploded(exploded_ck);
        m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
@@ -387,7 +387,7 @@ class token_ranges_owned_by_this_shard {
    class ranges_holder_primary {
        const dht::token_range_vector _token_ranges;
     public:
-        ranges_holder_primary(const locator::effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
+        ranges_holder_primary(const locator::vnode_effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
            : _token_ranges(erm->get_primary_ranges(ep)) {}
        std::size_t size() const { return _token_ranges.size(); }
        const dht::token_range& operator[](std::size_t i) const {
@@ -593,7 +593,7 @@ static future<> scan_table_ranges(
            continue;
        }
        for (const auto& row : rows) {
-            const bytes_opt& cell = row[*expiration_column];
+            const managed_bytes_opt& cell = row[*expiration_column];
            if (!cell) {
                continue;
            }
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -437,6 +437,68 @@
            }
         ]
      },
+      {
+         "path":"/column_family/tombstone_gc/{name}",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Check if tombstone GC is enabled for a given table",
+               "type":"boolean",
+               "nickname":"get_tombstone_gc",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"POST",
+               "summary":"Enable tombstone GC for a given table",
+               "type":"void",
+               "nickname":"enable_tombstone_gc",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Disable tombstone GC for a given table",
+               "type":"void",
+               "nickname":"disable_tombstone_gc",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/column_family/estimate_keys/{name}",
         "operations":[
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2110,6 +2110,65 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/tombstone_gc/{keyspace}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Enable tombstone GC",
+               "type":"void",
+               "nickname":"enable_tombstone_gc",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"keyspace",
+                     "description":"The keyspace",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"Comma-separated column family names",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Disable tombstone GC",
+               "type":"void",
+               "nickname":"disable_tombstone_gc",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"keyspace",
+                     "description":"The keyspace",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"Comma-separated column family names",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/deliver_hints",
         "operations":[
@@ -2631,7 +2690,7 @@
                "description":"File creation time"
            },
            "generation":{
-                "type":"long",
+                "type":"string",
                "description":"SSTable generation"
            },
            "level":{
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -871,6 +871,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        apilog.info("column_family/enable_auto_compaction: name={}", req->param["name"]);
        return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
            auto g = replica::database::autocompaction_toggle_guard(db);
            return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
@@ -882,6 +883,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    });

    cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        apilog.info("column_family/disable_auto_compaction: name={}", req->param["name"]);
        return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
            auto g = replica::database::autocompaction_toggle_guard(db);
            return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
@@ -892,6 +894,30 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        });
    });

+    cf::get_tombstone_gc.set(r, [&ctx] (const_req req) {
+        auto uuid = get_uuid(req.param["name"], ctx.db.local());
+        replica::table& t = ctx.db.local().find_column_family(uuid);
+        return t.tombstone_gc_enabled();
+    });
+
+    cf::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        apilog.info("column_family/enable_tombstone_gc: name={}", req->param["name"]);
+        return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
+            t.set_tombstone_gc_enabled(true);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    cf::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        apilog.info("column_family/disable_tombstone_gc: name={}", req->param["name"]);
+        return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
+            t.set_tombstone_gc_enabled(false);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
    cf::get_built_indexes.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
        auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
        auto&& ks = std::get<0>(ks_cf);
@@ -955,6 +981,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<http::request> req) {
        sstring strategy = req->get_query_param("class_name");
+        apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->param["name"], strategy);
        return foreach_column_family(ctx, req->param["name"], [strategy](replica::column_family& cf) {
            cf.set_compaction_strategy(sstables::compaction_strategy::type(strategy));
        }).then([] {
@@ -1023,6 +1050,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
            fail(unimplemented::cause::API);
        }

+        apilog.info("column_family/force_major_compaction: name={}", req->param["name"]);
        auto [ks, cf] = parse_fully_qualified_cf_name(req->param["name"]);
        auto keyspace = validate_keyspace(ctx, ks);
        std::vector<table_id> table_infos = {ctx.db.local().find_uuid(ks, cf)};
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -220,32 +220,47 @@ seastar::future<json::json_return_type> run_toppartitions_query(db::toppartition
    });
 }

-future<json::json_return_type> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
+static future<json::json_return_type> set_tables(http_context& ctx, const sstring& keyspace, std::vector<sstring> tables, std::function<future<>(replica::table&)> set) {
    if (tables.empty()) {
        tables = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
    }

-    apilog.info("set_tables_autocompaction: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
-    return do_with(keyspace, std::move(tables), [&ctx, enabled] (const sstring &keyspace, const std::vector<sstring>& tables) {
-        return ctx.db.invoke_on(0, [&ctx, &keyspace, &tables, enabled] (replica::database& db) {
-            auto g = replica::database::autocompaction_toggle_guard(db);
-            return ctx.db.invoke_on_all([&keyspace, &tables, enabled] (replica::database& db) {
-                return parallel_for_each(tables, [&db, &keyspace, enabled] (const sstring& table) {
-                    replica::column_family& cf = db.find_column_family(keyspace, table);
-                    if (enabled) {
-                        cf.enable_auto_compaction();
-                    } else {
-                        return cf.disable_auto_compaction();
-                    }
-                    return make_ready_future<>();
-                });
-            }).finally([g = std::move(g)] {});
+    return do_with(keyspace, std::move(tables), [&ctx, set] (const sstring& keyspace, const std::vector<sstring>& tables) {
+        return ctx.db.invoke_on_all([&keyspace, &tables, set] (replica::database& db) {
+            return parallel_for_each(tables, [&db, &keyspace, set] (const sstring& table) {
+                replica::table& t = db.find_column_family(keyspace, table);
+                return set(t);
+            });
        });
    }).then([] {
        return make_ready_future<json::json_return_type>(json_void());
    });
 }

+future<json::json_return_type> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
+    apilog.info("set_tables_autocompaction: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
+
+    return ctx.db.invoke_on(0, [&ctx, keyspace, tables = std::move(tables), enabled] (replica::database& db) {
+        auto g = replica::database::autocompaction_toggle_guard(db);
+        return set_tables(ctx, keyspace, tables, [enabled] (replica::table& cf) {
+            if (enabled) {
+                cf.enable_auto_compaction();
+            } else {
+                return cf.disable_auto_compaction();
+            }
+            return make_ready_future<>();
+        }).finally([g = std::move(g)] {});
+    });
+}
+
+future<json::json_return_type> set_tables_tombstone_gc(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
+    apilog.info("set_tables_tombstone_gc: enabled={} keyspace={} tables={}", enabled, keyspace, tables);
+    return set_tables(ctx, keyspace, std::move(tables), [enabled] (replica::table& t) {
+        t.set_tombstone_gc_enabled(enabled);
+        return make_ready_future<>();
+    });
+}
+
 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<http::request> req) {
        return smp::submit_to(0, [&] {
@@ -619,7 +634,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::describe_any_ring.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) {
        // Find an arbitrary non-system keyspace.
-        auto keyspaces = ctx.db.local().get_non_local_strategy_keyspaces();
+        auto keyspaces = ctx.db.local().get_non_local_vnode_based_strategy_keyspaces();
        if (keyspaces.empty()) {
            throw std::runtime_error("No keyspace provided and no non system kespace exist");
        }
@@ -1111,6 +1126,22 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return set_tables_autocompaction(ctx, keyspace, tables, false);
    });

+    ss::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+
+        apilog.info("enable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
+        return set_tables_tombstone_gc(ctx, keyspace, tables, true);
+    });
+
+    ss::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+
+        apilog.info("disable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
+        return set_tables_tombstone_gc(ctx, keyspace, tables, false);
+    });
+
    ss::deliver_hints.set(r, [](std::unique_ptr<http::request> req) {
        //TBD
        unimplemented();
@@ -1257,7 +1288,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                            ss::sstable info;

                            info.timestamp = t;
-                            info.generation = sstables::generation_value(sstable->generation());
+                            info.generation = fmt::to_string(sstable->generation());
                            info.level = sstable->get_sstable_level();
                            info.size = sstable->bytes_on_disk();
                            info.data_size = sstable->ondisk_data_size();
@@ -1494,27 +1525,12 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
            throw httpd::bad_param_exception(fmt::format("Unknown argument for 'quarantine_mode' parameter: {}", quarantine_mode_str));
        }

-        const auto& reduce_compaction_stats = [] (const compaction_manager::compaction_stats_opt& lhs, const compaction_manager::compaction_stats_opt& rhs) {
-            sstables::compaction_stats stats{};
-            stats += lhs.value();
-            stats += rhs.value();
-            return stats;
-        };
-
+        sstables::compaction_stats stats;
+        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+        auto task = co_await compaction_module.make_and_start_task<scrub_sstables_compaction_task_impl>({}, std::move(keyspace), db, column_families, opts, stats);
        try {
-            auto opt_stats = co_await db.map_reduce0([&] (replica::database& db) {
-                return map_reduce(column_families, [&] (sstring cfname) -> future<std::optional<sstables::compaction_stats>> {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    sstables::compaction_stats stats{};
-                    co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
-                        auto r = co_await cm.perform_sstable_scrub(ts, opts);
-                        stats += r.value_or(sstables::compaction_stats{});
-                    });
-                    co_return stats;
-                }, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
-            }, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
-            if (opt_stats && opt_stats->validation_errors) {
+            co_await task->done();
+            if (stats.validation_errors) {
                co_return json::json_return_type(static_cast<int>(scrub_status::validation_errors));
            }
        } catch (const sstables::compaction_aborted_exception&) {
--- a/auth/authenticated_user.hh
+++ b/auth/authenticated_user.hh
@@ -35,16 +35,9 @@ public:
    ///
    authenticated_user() = default;
    explicit authenticated_user(std::string_view name);
+    friend bool operator==(const authenticated_user&, const authenticated_user&) noexcept = default;
 };

-inline bool operator==(const authenticated_user& u1, const authenticated_user& u2) noexcept {
-    return u1.name == u2.name;
-}
-
-inline bool operator!=(const authenticated_user& u1, const authenticated_user& u2) noexcept {
-    return !(u1 == u2);
-}
-
 const authenticated_user& anonymous_user() noexcept;

 inline bool is_anonymous(const authenticated_user& u) noexcept {
--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -39,10 +39,6 @@ inline bool operator==(const permission_details& pd1, const permission_details&
            == std::forward_as_tuple(pd2.role_name, pd2.resource, pd2.permissions.mask());
 }

-inline bool operator!=(const permission_details& pd1, const permission_details& pd2) {
-    return !(pd1 == pd2);
-}
-
 inline bool operator<(const permission_details& pd1, const permission_details& pd2) {
    return std::forward_as_tuple(pd1.role_name, pd1.resource, pd1.permissions)
            < std::forward_as_tuple(pd2.role_name, pd2.resource, pd2.permissions);
--- a/auth/resource.cc
+++ b/auth/resource.cc
@@ -79,6 +79,13 @@ static permission_set applicable_permissions(const service_level_resource_view &
 }

 static permission_set applicable_permissions(const functions_resource_view& fv) {
+    if (fv.function_name() || fv.function_signature()) {
+        return permission_set::of<
+                permission::ALTER,
+                permission::DROP,
+                permission::AUTHORIZE,
+                permission::EXECUTE>();
+    }
    return permission_set::of<
            permission::CREATE,
            permission::ALTER,
@@ -292,7 +299,7 @@ std::optional<std::vector<std::string_view>> functions_resource_view::function_a

    std::vector<std::string_view> parts;
    if (_resource._parts[3] == "") {
-        return {};
+        return parts;
    }
    for (size_t i = 3; i < _resource._parts.size(); i++) {
        parts.push_back(_resource._parts[i]);
--- a/auth/resource.hh
+++ b/auth/resource.hh
@@ -117,20 +117,12 @@ private:
    friend class functions_resource_view;

    friend bool operator<(const resource&, const resource&);
-    friend bool operator==(const resource&, const resource&);
+    friend bool operator==(const resource&, const resource&) = default;
    friend resource parse_resource(std::string_view);
 };

 bool operator<(const resource&, const resource&);

-inline bool operator==(const resource& r1, const resource& r2) {
-    return (r1._kind == r2._kind) && (r1._parts == r2._parts);
-}
-
-inline bool operator!=(const resource& r1, const resource& r2) {
-    return !(r1 == r2);
-}
-
 std::ostream& operator<<(std::ostream&, const resource&);

 class resource_kind_mismatch : public std::invalid_argument {
--- a/auth/role_or_anonymous.cc
+++ b/auth/role_or_anonymous.cc
@@ -17,10 +17,6 @@ std::ostream& operator<<(std::ostream& os, const role_or_anonymous& mr) {
    return os;
 }

-bool operator==(const role_or_anonymous& mr1, const role_or_anonymous& mr2) noexcept {
-    return mr1.name == mr2.name;
-}
-
 bool is_anonymous(const role_or_anonymous& mr) noexcept {
    return !mr.name.has_value();
 }
--- a/auth/role_or_anonymous.hh
+++ b/auth/role_or_anonymous.hh
@@ -26,16 +26,11 @@ public:
    role_or_anonymous() = default;
    role_or_anonymous(std::string_view name) : name(name) {
    }
+    friend bool operator==(const role_or_anonymous&, const role_or_anonymous&) noexcept = default;
 };

 std::ostream& operator<<(std::ostream&, const role_or_anonymous&);

-bool operator==(const role_or_anonymous&, const role_or_anonymous&) noexcept;
-
-inline bool operator!=(const role_or_anonymous& mr1, const role_or_anonymous& mr2) noexcept {
-    return !(mr1 == mr2);
-}
-
 bool is_anonymous(const role_or_anonymous&) noexcept;

 }
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -55,6 +55,7 @@ future<bool> default_role_row_satisfies(
        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
+                internal_distributed_query_state(),
                {meta::DEFAULT_SUPERUSER_NAME},
                cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -7,6 +7,7 @@
 */

 #include <seastar/core/coroutine.hh>
+#include "auth/resource.hh"
 #include "auth/service.hh"

 #include <algorithm>
@@ -20,6 +21,7 @@
 #include "auth/allow_all_authorizer.hh"
 #include "auth/common.hh"
 #include "auth/role_or_anonymous.hh"
+#include "cql3/functions/function_name.hh"
 #include "cql3/functions/functions.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
@@ -66,6 +68,7 @@ private:
    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
+    void on_update_tablet_metadata() override {}

    void on_drop_keyspace(const sstring& ks_name) override {
        // Do it in the background.
@@ -75,6 +78,12 @@ private:
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
        });
+        (void)_authorizer.revoke_all(
+            auth::make_functions_resource(ks_name)).handle_exception_type([](const unsupported_authorization_operation&) {
+            // Nothing.
+        }).handle_exception([] (std::exception_ptr e) {
+            log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
+        });
    }

    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
@@ -89,8 +98,22 @@ private:
    }

    void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
+    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {
+        (void)_authorizer.revoke_all(
+            auth::make_functions_resource(ks_name, function_name)).handle_exception_type([](const unsupported_authorization_operation&) {
+            // Nothing.
+        }).handle_exception([] (std::exception_ptr e) {
+            log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
+        });
+    }
+    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {
+        (void)_authorizer.revoke_all(
+            auth::make_functions_resource(ks_name, aggregate_name)).handle_exception_type([](const unsupported_authorization_operation&) {
+            // Nothing.
+        }).handle_exception([] (std::exception_ptr e) {
+            log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
+        });
+    }
    void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
 };

--- a/bytes.hh
+++ b/bytes.hh
@@ -17,7 +17,7 @@
 #include <functional>
 #include <compare>
 #include "utils/mutable_view.hh"
-#include <xxhash.h>
+#include "utils/simple_hashers.hh"

 using bytes = basic_sstring<int8_t, uint32_t, 31, false>;
 using bytes_view = std::basic_string_view<int8_t>;
@@ -160,18 +160,7 @@ struct appending_hash<bytes_view> {
    }
 };

-struct bytes_view_hasher : public hasher {
-    XXH64_state_t _state;
-    bytes_view_hasher(uint64_t seed = 0) noexcept {
-        XXH64_reset(&_state, seed);
-    }
-    void update(const char* ptr, size_t length) noexcept {
-        XXH64_update(&_state, ptr, length);
-    }
-    size_t finalize() {
-        return static_cast<size_t>(XXH64_digest(&_state));
-    }
-};
+using bytes_view_hasher = simple_xx_hasher;

 namespace std {
 template <>
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -53,6 +53,10 @@ public:
        using difference_type = std::ptrdiff_t;
        using pointer = bytes_view*;
        using reference = bytes_view&;
+
+        struct implementation {
+            blob_storage* current_chunk;
+        };
    private:
        chunk* _current = nullptr;
    public:
@@ -75,11 +79,11 @@ public:
            ++(*this);
            return tmp;
        }
-        bool operator==(const fragment_iterator& other) const {
-            return _current == other._current;
-        }
-        bool operator!=(const fragment_iterator& other) const {
-            return _current != other._current;
+        bool operator==(const fragment_iterator&) const = default;
+        implementation extract_implementation() const {
+            return implementation {
+                .current_chunk = _current,
+            };
        }
    };
    using const_iterator = fragment_iterator;
@@ -432,10 +436,6 @@ public:
        return true;
    }

-    bool operator!=(const bytes_ostream& other) const {
-        return !(*this == other);
-    }
-
    // Makes this instance empty.
    //
    // The first buffer is not deallocated, so callers may rely on the
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -68,7 +68,6 @@ public:
            _pos = -1;
        }
        bool operator==(const iterator& o) const { return _pos == o._pos; }
-        bool operator!=(const iterator& o) const { return _pos != o._pos; }
    };
 public:
    cartesian_product(const std::vector<std::vector<T>>& vec_of_vecs) : _vec_of_vecs(vec_of_vecs) {}
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -65,7 +65,6 @@ public:
    void ttl(int v) { _ttl = v; }

    bool operator==(const options& o) const;
-    bool operator!=(const options& o) const;
 };

 } // namespace cdc
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -1090,19 +1090,8 @@ shared_ptr<db::system_distributed_keyspace> generation_service::get_sys_dist_ks(
    return _sys_dist_ks.local_shared();
 }

-std::ostream& operator<<(std::ostream& os, const generation_id& gen_id) {
-    std::visit(make_visitor(
-    [&os] (const generation_id_v1& id) { os << id.ts; },
-    [&os] (const generation_id_v2& id) { os << "(" << id.ts << ", " << id.id << ")"; }
-    ), gen_id);
-    return os;
-}
-
 db_clock::time_point get_ts(const generation_id& gen_id) {
-    return std::visit(make_visitor(
-    [] (const generation_id_v1& id) { return id.ts; },
-    [] (const generation_id_v2& id) { return id.ts; }
-    ), gen_id);
+    return std::visit([] (auto& id) { return id.ts; }, gen_id);
 }

 } // namespace cdc
--- a/cdc/generation_id.hh
+++ b/cdc/generation_id.hh
@@ -28,7 +28,35 @@ struct generation_id_v2 {

 using generation_id = std::variant<generation_id_v1, generation_id_v2>;

-std::ostream& operator<<(std::ostream&, const generation_id&);
 db_clock::time_point get_ts(const generation_id&);

 } // namespace cdc
+
+template <>
+struct fmt::formatter<cdc::generation_id_v1> {
+    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+    template <typename FormatContext>
+    auto format(const cdc::generation_id_v1& gen_id, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "{}", gen_id.ts);
+    }
+};
+
+template <>
+struct fmt::formatter<cdc::generation_id_v2> {
+    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+    template <typename FormatContext>
+    auto format(const cdc::generation_id_v2& gen_id, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "({}, {})", gen_id.ts, gen_id.id);
+    }
+};
+
+template <>
+struct fmt::formatter<cdc::generation_id> {
+    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+    template <typename FormatContext>
+    auto format(const cdc::generation_id& gen_id, FormatContext& ctx) const {
+        return std::visit([&ctx] (auto& id) {
+            return fmt::format_to(ctx.out(), "{}", id);
+        }, gen_id);
+    }
+};
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -395,9 +395,6 @@ bool cdc::options::operator==(const options& o) const {
    return enabled() == o.enabled() && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl
            && _delta_mode == o._delta_mode;
 }
-bool cdc::options::operator!=(const options& o) const {
-    return !(*this == o);
-}

 namespace cdc {

@@ -635,9 +632,6 @@ public:
    bool operator==(const collection_iterator& x) const {
        return _v == x._v;
    }
-    bool operator!=(const collection_iterator& x) const {
-        return !(*this == x);
-    }
 private:
    void next() {
        --_rem;
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -389,7 +389,7 @@ struct extract_changes_visitor {
    }

    void partition_delete(const tombstone& t) {
-        _result[t.timestamp].partition_deletions = {t};
+        _result[t.timestamp].partition_deletions = partition_deletion{t};
    }

    constexpr bool finished() const { return false; }
--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -93,9 +93,6 @@ public:
        bool operator==(const iterator& other) const {
            return _position == other._position;
        }
-        bool operator!=(const iterator& other) const {
-            return !(*this == other);
-        }
    };
 public:
    explicit partition_cells_range(const mutation_partition& mp) : _mp(mp) { }
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -15,12 +15,6 @@

 std::atomic<int64_t> clocks_offset;

-std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
-    auto t = db_clock::to_time_t(tp);
-    ::tm t_buf;
-    return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
-}
-
 std::string format_timestamp(api::timestamp_type ts) {
    auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
    ::tm t_buf;
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -75,8 +75,7 @@ public:
            const interval::interval_type& iv = *_i;
            return position_range{iv.lower().position(), iv.upper().position()};
        }
-        bool operator==(const position_range_iterator& other) const { return _i == other._i; }
-        bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
+        bool operator==(const position_range_iterator& other) const = default;
        position_range_iterator& operator++() {
            ++_i;
            return *this;
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -1,9 +1,7 @@
 set(disabled_warnings
  c++11-narrowing
  mismatched-tags
-  missing-braces
  overloaded-virtual
-  parentheses-equality
  unsupported-friend)
 include(CheckCXXCompilerFlag)
 foreach(warning ${disabled_warnings})
@@ -13,7 +11,11 @@ foreach(warning ${disabled_warnings})
  endif()
 endforeach()
 list(TRANSFORM _supported_warnings PREPEND "-Wno-")
-string(JOIN " " CMAKE_CXX_FLAGS "-Wall" "-Werror" ${_supported_warnings})
+string(JOIN " " CMAKE_CXX_FLAGS
+  "-Wall"
+  "-Werror"
+  "-Wno-error=deprecated-declarations"
+  ${_supported_warnings})

 function(default_target_arch arch)
  set(x86_instruction_sets i386 i686 x86_64)
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -168,7 +168,11 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
 }

 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
-        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
+        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
+    if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
+        return api::min_timestamp;
+    }
+
    auto timestamp = table_s.min_memtable_timestamp();
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -179,6 +183,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
            hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
        }
        if (sst->filter_has_key(*hk)) {
+            bloom_filter_checks++;
            timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
        }
    }
@@ -414,9 +419,12 @@ private:

 class formatted_sstables_list {
    bool _include_origin = true;
-    std::vector<sstring> _ssts;
+    std::vector<std::string> _ssts;
 public:
    formatted_sstables_list() = default;
+    void reserve(size_t n) {
+        _ssts.reserve(n);
+    }
    explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
        _ssts.reserve(ssts.size());
        for (const auto& sst : ssts) {
@@ -431,9 +439,7 @@ public:
 };

 std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
-    os << "[";
-    os << boost::algorithm::join(lst._ssts, ",");
-    os << "]";
+    fmt::print(os, "[{}]", fmt::join(lst._ssts, ","));
    return os;
 }

@@ -458,6 +464,7 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
    bool _can_split_large_partition = false;
@@ -571,7 +578,7 @@ protected:
    // Tombstone expiration is enabled based on the presence of sstable set.
    // If it's not present, we cannot purge tombstones without the risk of resurrecting data.
    bool tombstone_expiration_enabled() const {
-        return bool(_sstable_set);
+        return bool(_sstable_set) && _table_s.tombstone_gc_enabled();
    }

    compaction_writer create_gc_compaction_writer() const {
@@ -625,11 +632,6 @@ protected:

    flat_mutation_reader_v2::filter make_partition_filter() const {
        return [this] (const dht::decorated_key& dk) {
-#ifdef SEASTAR_DEBUG
-            // sstables should never be shared with other shards at this point.
-            assert(dht::shard_of(*_schema, dk.token()) == this_shard_id());
-#endif
-
            if (!_owned_ranges_checker->belongs_to_current_node(dk.token())) {
                log_trace("Token {} does not belong to this node, skipping", dk.token());
                return false;
@@ -668,6 +670,7 @@ private:
    future<> setup() {
        auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
        formatted_sstables_list formatted_msg;
+        formatted_msg.reserve(_sstables.size());
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

@@ -784,6 +787,7 @@ protected:
                .ended_at = ended_at,
                .start_size = _start_size,
                .end_size = _end_size,
+                .bloom_filter_checks = _bloom_filter_checks,
            },
        };

@@ -824,7 +828,7 @@ private:
            };
        }
        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
        };
    }

@@ -1241,62 +1245,8 @@ public:

 class scrub_compaction final : public regular_compaction {
 public:
-    static void report_invalid_partition(compaction_type type, mutation_fragment_stream_validator& validator, const dht::decorated_key& new_key,
-            std::string_view action = "") {
-        const auto& schema = validator.schema();
-        const auto& current_key = validator.previous_partition_key();
-        clogger.error("[{} compaction {}.{}] Invalid partition {} ({}), partition is out-of-order compared to previous partition {} ({}){}{}",
-                type,
-                schema.ks_name(),
-                schema.cf_name(),
-                new_key.key().with_schema(schema),
-                new_key,
-                current_key.key().with_schema(schema),
-                current_key,
-                action.empty() ? "" : "; ",
-                action);
-    }
-    static void report_invalid_partition_start(compaction_type type, mutation_fragment_stream_validator& validator, const dht::decorated_key& new_key,
-            std::string_view action = "") {
-        const auto& schema = validator.schema();
-        const auto& current_key = validator.previous_partition_key();
-        clogger.error("[{} compaction {}.{}] Invalid partition start for partition {} ({}), previous partition {} ({}) didn't end with a partition-end fragment{}{}",
-                type,
-                schema.ks_name(),
-                schema.cf_name(),
-                new_key.key().with_schema(schema),
-                new_key,
-                current_key.key().with_schema(schema),
-                current_key,
-                action.empty() ? "" : "; ",
-                action);
-    }
-    static void report_invalid_mutation_fragment(compaction_type type, mutation_fragment_stream_validator& validator, const mutation_fragment_v2& mf,
-            std::string_view action = "") {
-        const auto& schema = validator.schema();
-        const auto& key = validator.previous_partition_key();
-        const auto prev_pos = validator.previous_position();
-        clogger.error("[{} compaction {}.{}] Invalid {} fragment{} ({}) in partition {} ({}),"
-                " fragment is out-of-order compared to previous {} fragment{} ({}){}{}",
-                type,
-                schema.ks_name(),
-                schema.cf_name(),
-                mf.mutation_fragment_kind(),
-                mf.has_key() ? format(" with key {}", mf.key().with_schema(schema)) : "",
-                mf.position(),
-                key.key().with_schema(schema),
-                key,
-                prev_pos.region(),
-                prev_pos.has_key() ? format(" with key {}", prev_pos.key().with_schema(schema)) : "",
-                prev_pos,
-                action.empty() ? "" : "; ",
-                action);
-    }
-    static void report_invalid_end_of_stream(compaction_type type, mutation_fragment_stream_validator& validator, std::string_view action = "") {
-        const auto& schema = validator.schema();
-        const auto& key = validator.previous_partition_key();
-        clogger.error("[{} compaction {}.{}] Invalid end-of-stream, last partition {} ({}) didn't end with a partition-end fragment{}{}",
-                type, schema.ks_name(), schema.cf_name(), key.key().with_schema(schema), key, action.empty() ? "" : "; ", action);
+    static void report_validation_error(compaction_type type, const ::schema& schema, sstring what, std::string_view action = "") {
+        clogger.error("[{} compaction {}.{}] {}{}{}", type, schema.ks_name(), schema.cf_name(), what, action.empty() ? "" : "; ", action);
    }

 private:
@@ -1319,9 +1269,9 @@ private:
            ++_validation_errors;
        }

-        void on_unexpected_partition_start(const mutation_fragment_v2& ps) {
-            auto report_fn = [this, &ps] (std::string_view action = "") {
-                report_invalid_partition_start(compaction_type::Scrub, _validator, ps.as_partition_start().key(), action);
+        void on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
+            auto report_fn = [this, error] (std::string_view action = "") {
+                report_validation_error(compaction_type::Scrub, *_schema, error, action);
            };
            maybe_abort_scrub(report_fn);
            report_fn("Rectifying by adding assumed missing partition-end");
@@ -1343,9 +1293,9 @@ private:
            }
        }

-        skip on_invalid_partition(const dht::decorated_key& new_key) {
-            auto report_fn = [this, &new_key] (std::string_view action = "") {
-                report_invalid_partition(compaction_type::Scrub, _validator, new_key, action);
+        skip on_invalid_partition(const dht::decorated_key& new_key, sstring error) {
+            auto report_fn = [this, error] (std::string_view action = "") {
+                report_validation_error(compaction_type::Scrub, *_schema, error, action);
            };
            maybe_abort_scrub(report_fn);
            if (_scrub_mode == compaction_type_options::scrub::mode::segregate) {
@@ -1359,9 +1309,9 @@ private:
            return skip::yes;
        }

-        skip on_invalid_mutation_fragment(const mutation_fragment_v2& mf) {
-            auto report_fn = [this, &mf] (std::string_view action = "") {
-                report_invalid_mutation_fragment(compaction_type::Scrub, _validator, mf, "");
+        skip on_invalid_mutation_fragment(const mutation_fragment_v2& mf, sstring error) {
+            auto report_fn = [this, error] (std::string_view action = "") {
+                report_validation_error(compaction_type::Scrub, *_schema, error, action);
            };
            maybe_abort_scrub(report_fn);

@@ -1396,9 +1346,9 @@ private:
            return skip::yes;
        }

-        void on_invalid_end_of_stream() {
-            auto report_fn = [this] (std::string_view action = "") {
-                report_invalid_end_of_stream(compaction_type::Scrub, _validator, action);
+        void on_invalid_end_of_stream(sstring error) {
+            auto report_fn = [this, error] (std::string_view action = "") {
+                report_validation_error(compaction_type::Scrub, *_schema, error, action);
            };
            maybe_abort_scrub(report_fn);
            // Handle missing partition_end
@@ -1417,21 +1367,27 @@ private:
                    // and shouldn't be verified. We know the last fragment the
                    // validator saw is a partition-start, passing it another one
                    // will confuse it.
-                    if (!_skip_to_next_partition && !_validator(mf)) {
-                        on_unexpected_partition_start(mf);
+                    if (!_skip_to_next_partition) {
+                        if (auto res = _validator(mf); !res) {
+                            on_unexpected_partition_start(mf, res.what());
+                        }
                        // Continue processing this partition start.
                    }
                    _skip_to_next_partition = false;
                    // Then check that the partition monotonicity stands.
                    const auto& dk = mf.as_partition_start().key();
-                    if (!_validator(dk) && on_invalid_partition(dk) == skip::yes) {
-                        continue;
+                    if (auto res = _validator(dk); !res) {
+                        if (on_invalid_partition(dk, res.what()) == skip::yes) {
+                            continue;
+                        }
                    }
                } else if (_skip_to_next_partition) {
                    continue;
                } else {
-                    if (!_validator(mf) && on_invalid_mutation_fragment(mf) == skip::yes) {
-                        continue;
+                    if (auto res = _validator(mf); !res) {
+                        if (on_invalid_mutation_fragment(mf, res.what()) == skip::yes) {
+                            continue;
+                        }
                    }
                }
                push_mutation_fragment(std::move(mf));
@@ -1440,8 +1396,8 @@ private:
            _end_of_stream = _reader.is_end_of_stream() && _reader.is_buffer_empty();

            if (_end_of_stream) {
-                if (!_validator.on_end_of_stream()) {
-                    on_invalid_end_of_stream();
+                if (auto res = _validator.on_end_of_stream(); !res) {
+                    on_invalid_end_of_stream(res.what());
                }
            }
        }
@@ -1722,81 +1678,29 @@ static std::unique_ptr<compaction> make_compaction(table_state& table_s, sstable
    return descriptor.options.visit(visitor_factory);
 }

-future<uint64_t> scrub_validate_mode_validate_reader(flat_mutation_reader_v2 reader, const compaction_data& cdata) {
-    auto schema = reader.schema();
-
-    uint64_t errors = 0;
-    std::exception_ptr ex;
-
-    try {
-        auto validator = mutation_fragment_stream_validator(*schema);
-
-        while (auto mf_opt = co_await reader()) {
-            if (cdata.is_stop_requested()) [[unlikely]] {
-                // Compaction manager will catch this exception and re-schedule the compaction.
-                throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
-            }
-
-            const auto& mf = *mf_opt;
-
-            if (mf.is_partition_start()) {
-                const auto& ps = mf.as_partition_start();
-                if (!validator(mf)) {
-                    scrub_compaction::report_invalid_partition_start(compaction_type::Scrub, validator, ps.key());
-                    validator.reset(mf);
-                    ++errors;
-                }
-                if (!validator(ps.key())) {
-                    scrub_compaction::report_invalid_partition(compaction_type::Scrub, validator, ps.key());
-                    validator.reset(ps.key());
-                    ++errors;
-                }
-            } else {
-                if (!validator(mf)) {
-                    scrub_compaction::report_invalid_mutation_fragment(compaction_type::Scrub, validator, mf);
-                    validator.reset(mf);
-                    ++errors;
-                }
-            }
-        }
-        if (!validator.on_end_of_stream()) {
-            scrub_compaction::report_invalid_end_of_stream(compaction_type::Scrub, validator);
-            ++errors;
-        }
-    } catch (...) {
-        ex = std::current_exception();
-    }
-
-    co_await reader.close();
-
-    if (ex) {
-        co_return coroutine::exception(std::move(ex));
-    }
-
-    co_return errors;
-}
-
 static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, table_state& table_s) {
    auto schema = table_s.schema();
+    auto permit = table_s.make_compaction_reader_permit();
+
+    uint64_t validation_errors = 0;

-    formatted_sstables_list sstables_list_msg;
-    auto sstables = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(schema, false));
    for (const auto& sst : descriptor.sstables) {
-        sstables_list_msg += sst;
-        sstables->insert(sst);
+        clogger.info("Scrubbing in validate mode {}", sst->get_filename());
+
+        validation_errors += co_await sst->validate(permit, descriptor.io_priority, cdata.abort, [&schema] (sstring what) {
+            scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
+        });
+        // Did validation finish early because it was aborted?
+        if (cdata.is_stop_requested()) {
+            // Compaction manager will catch this exception and re-schedule the compaction.
+            throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
+        }
+
+        clogger.info("Finished scrubbing in validate mode {} - sstable is {}", sst->get_filename(), validation_errors == 0 ? "valid" : "invalid");
    }

-    clogger.info("Scrubbing in validate mode {}", sstables_list_msg);
-
-    auto permit = table_s.make_compaction_reader_permit();
-    auto reader = sstables->make_crawling_reader(schema, permit, descriptor.io_priority, nullptr);
-
-    const auto validation_errors = co_await scrub_validate_mode_validate_reader(std::move(reader), cdata);
-
-    clogger.info("Finished scrubbing in validate mode {} - sstable(s) are {}", sstables_list_msg, validation_errors == 0 ? "valid" : "invalid");
-
    if (validation_errors != 0) {
-        for (auto& sst : *sstables->all()) {
+        for (auto& sst : descriptor.sstables) {
            co_await sst->change_state(sstables::quarantine_dir);
        }
    }
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -92,12 +92,15 @@ struct compaction_stats {
    uint64_t start_size = 0;
    uint64_t end_size = 0;
    uint64_t validation_errors = 0;
+    // Bloom filter checks during max purgeable calculation
+    uint64_t bloom_filter_checks = 0;

    compaction_stats& operator+=(const compaction_stats& r) {
        ended_at = std::max(ended_at, r.ended_at);
        start_size += r.start_size;
        end_size += r.end_size;
        validation_errors += r.validation_errors;
+        bloom_filter_checks += r.bloom_filter_checks;
        return *this;
    }
    friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
@@ -130,7 +133,4 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
 // For tests, can drop after we virtualize sstables.
 flat_mutation_reader_v2 make_scrubbing_reader(flat_mutation_reader_v2 rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);

-// For tests, can drop after we virtualize sstables.
-future<uint64_t> scrub_validate_mode_validate_reader(flat_mutation_reader_v2 rd, const compaction_data& info);
-
 }
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -453,7 +453,7 @@ protected:
        };
        setup_new_compaction(descriptor.run_identifier);

-        cmlog.info0("User initiated compaction started on behalf of {}.{}", t->schema()->ks_name(), t->schema()->cf_name());
+        cmlog.info0("User initiated compaction started on behalf of {}", *t);

        // Now that the sstables for major compaction are registered
        // and the user_initiated_backlog_tracker is set up
@@ -533,8 +533,8 @@ compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manage
    , _holder(_compaction_state.gate.hold())
 {
    _compaction_state.compaction_disabled_counter++;
-    cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
-            _table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
+    cmlog.debug("Temporarily disabled compaction for {}. compaction_disabled_counter={}",
+            t, _compaction_state.compaction_disabled_counter);
 }

 compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
@@ -547,13 +547,12 @@ compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenab
 compaction_manager::compaction_reenabler::~compaction_reenabler() {
    // submit compaction request if we're the last holder of the gate which is still opened.
    if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
-        cmlog.debug("Reenabling compaction for {}.{}",
-                _table->schema()->ks_name(), _table->schema()->cf_name());
+        cmlog.debug("Reenabling compaction for {}", *_table);
        try {
            _cm.submit(*_table);
        } catch (...) {
-            cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
-                    _table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
+            cmlog.warn("compaction_reenabler could not reenable compaction for {}: {}",
+                    *_table, std::current_exception());
        }
    }
 }
@@ -606,8 +605,7 @@ compaction::compaction_state::~compaction_state() {

 std::string compaction_task_executor::describe() const {
    auto* t = _compacting_table;
-    auto s = t->schema();
-    return fmt::format("{} task {} for table {}.{} [{}]", _description, fmt::ptr(this), s->ks_name(), s->cf_name(), fmt::ptr(t));
+    return fmt::format("{} task {} for table {} [{}]", _description, fmt::ptr(this), *t, fmt::ptr(t));
 }

 compaction_task_executor::~compaction_task_executor() {
@@ -844,8 +842,7 @@ future<> compaction_manager::postponed_compactions_reevaluation() {
                if (!_compaction_state.contains(t)) {
                    continue;
                }
-                auto s = t->schema();
-                cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
+                cmlog.debug("resubmitting postponed compaction for table {} [{}]", *t, fmt::ptr(t));
                submit(*t);
                co_await coroutine::maybe_yield();
            }
@@ -894,7 +891,7 @@ future<> compaction_manager::stop_ongoing_compactions(sstring reason, table_stat
        if (cmlog.is_enabled(level)) {
            std::string scope = "";
            if (t) {
-                scope = fmt::format(" for table {}.{}", t->schema()->ks_name(), t->schema()->cf_name());
+                scope = fmt::format(" for table {}", *t);
            }
            if (type_opt) {
                scope += fmt::format(" {} type={}", scope.size() ? "and" : "for", *type_opt);
@@ -1037,8 +1034,8 @@ protected:
                co_return std::nullopt;
            }
            if (!_cm.can_register_compaction(t, weight, descriptor.fan_in())) {
-                cmlog.debug("Refused compaction job ({} sstable(s)) of weight {} for {}.{}, postponing it...",
-                    descriptor.sstables.size(), weight, t.schema()->ks_name(), t.schema()->cf_name());
+                cmlog.debug("Refused compaction job ({} sstable(s)) of weight {} for {}, postponing it...",
+                    descriptor.sstables.size(), weight, t);
                switch_state(state::postponed);
                _cm.postpone_compaction_for_table(&t);
                co_return std::nullopt;
@@ -1048,8 +1045,8 @@ protected:
            auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting.release_compacting(exhausted_sstables);
            };
-            cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}.{}",
-                fmt::ptr(this), descriptor.sstables.size(), weight, t.schema()->ks_name(), t.schema()->cf_name());
+            cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}",
+                fmt::ptr(this), descriptor.sstables.size(), weight, t);

            setup_new_compaction(descriptor.run_identifier);
            std::exception_ptr ex;
@@ -1109,8 +1106,7 @@ bool compaction_manager::can_perform_regular_compaction(table_state& t) {
 future<> compaction_manager::maybe_wait_for_sstable_count_reduction(table_state& t) {
    auto schema = t.schema();
    if (!can_perform_regular_compaction(t)) {
-        cmlog.trace("maybe_wait_for_sstable_count_reduction in {}.{}: cannot perform regular compaction",
-                schema->ks_name(), schema->cf_name());
+        cmlog.trace("maybe_wait_for_sstable_count_reduction in {}: cannot perform regular compaction", t);
        co_return;
    }
    auto num_runs_for_compaction = [&, this] {
@@ -1123,8 +1119,8 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(table_state&
    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
    auto count = num_runs_for_compaction();
    if (count <= threshold) {
-        cmlog.trace("No need to wait for sstable count reduction in {}.{}: {} <= {}",
-                schema->ks_name(), schema->cf_name(), count, threshold);
+        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
+                t, count, threshold);
        co_return;
    }
    // Reduce the chances of falling into an endless wait, if compaction
@@ -1142,8 +1138,8 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(table_state&
    }
    auto end = db_clock::now();
    auto elapsed_ms = (end - start) / 1ms;
-    cmlog.warn("Waited {}ms for compaction of {}.{} to catch up on {} sstable runs",
-            elapsed_ms, schema->ks_name(), schema->cf_name(), count);
+    cmlog.warn("Waited {}ms for compaction of {} to catch up on {} sstable runs",
+            elapsed_ms, t, count);
 }

 namespace compaction {
@@ -1264,12 +1260,16 @@ protected:
            std::exception_ptr ex;
            try {
                table_state& t = *_compacting_table;
-                auto maintenance_sstables = t.maintenance_sstable_set().all();
-                cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
-                        t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                auto size = t.maintenance_sstable_set().size();
+                if (!size) {
+                    cmlog.debug("Skipping off-strategy compaction for {}, No candidates were found", t);
+                    finish_compaction();
+                    co_return std::nullopt;
+                }
+                cmlog.info("Starting off-strategy compaction for {}, {} candidates were found", t, size);
                co_await run_offstrategy_compaction(_compaction_data);
                finish_compaction();
-                cmlog.info("Done with off-strategy compaction for {}.{}", t.schema()->ks_name(), t.schema()->cf_name());
+                cmlog.info("Done with off-strategy compaction for {}", t);
                co_return std::nullopt;
            } catch (...) {
                ex = std::current_exception();
@@ -1524,14 +1524,18 @@ protected:
        co_return std::nullopt;
    }
 private:
-    // Releases reference to cleaned files such that respective used disk space can be freed.
-    void release_exhausted(std::vector<sstables::shared_sstable> exhausted_sstables) {
-        _compacting.release_compacting(exhausted_sstables);
-    }
-
    future<> run_cleanup_job(sstables::compaction_descriptor descriptor) {
        co_await coroutine::switch_to(_cm.compaction_sg().cpu);

+        // Releases reference to cleaned files such that respective used disk space can be freed.
+        auto release_exhausted = [this, &descriptor] (std::vector<sstables::shared_sstable> exhausted_sstables) mutable {
+            auto exhausted = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(exhausted_sstables);
+            std::erase_if(descriptor.sstables, [&] (const sstables::shared_sstable& sst) {
+                return exhausted.contains(sst);
+            });
+            _compacting.release_compacting(exhausted_sstables);
+        };
+
        for (;;) {
            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
            _cm.register_backlog_tracker(user_initiated);
@@ -1539,8 +1543,7 @@ private:
            std::exception_ptr ex;
            try {
                setup_new_compaction(descriptor.run_identifier);
-                co_await compact_sstables_and_update_history(descriptor, _compaction_data,
-                                          std::bind(&cleanup_sstables_compaction_task_executor::release_exhausted, this, std::placeholders::_1));
+                co_await compact_sstables_and_update_history(descriptor, _compaction_data, release_exhausted);
                finish_compaction();
                _cm.reevaluate_postponed_compactions();
                co_return;  // done with current job
@@ -1561,6 +1564,11 @@ private:

 bool needs_cleanup(const sstables::shared_sstable& sst,
                   const dht::token_range_vector& sorted_owned_ranges) {
+    // Finish early if the keyspace has no owned token ranges (in this data center)
+    if (sorted_owned_ranges.empty()) {
+        return true;
+    }
+
    auto first_token = sst->get_first_decorated_key().token();
    auto last_token = sst->get_last_decorated_key().token();
    dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
@@ -1580,9 +1588,13 @@ bool needs_cleanup(const sstables::shared_sstable& sst,
    return true;
 }

-bool compaction_manager::update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, owned_ranges_ptr owned_ranges_ptr) {
+bool compaction_manager::update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges) {
    auto& cs = get_compaction_state(&t);
-    if (owned_ranges_ptr && needs_cleanup(sst, *owned_ranges_ptr)) {
+    if (sst->is_shared()) {
+        throw std::runtime_error(format("Shared SSTable {} cannot be marked as requiring cleanup, as it can only be processed by resharding",
+                                        sst->get_filename()));
+    }
+    if (needs_cleanup(sst, sorted_owned_ranges)) {
        cs.sstables_requiring_cleanup.insert(sst);
        return true;
    } else {
@@ -1591,46 +1603,97 @@ bool compaction_manager::update_sstable_cleanup_state(table_state& t, const ssta
    }
 }

+bool compaction_manager::erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst) {
+    auto& cs = get_compaction_state(&t);
+    return cs.sstables_requiring_cleanup.erase(sst);
+}
+
 bool compaction_manager::requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const {
    const auto& cs = get_compaction_state(&t);
    return cs.sstables_requiring_cleanup.contains(sst);
 }

 future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t) {
+    constexpr auto sleep_duration = std::chrono::seconds(10);
+    constexpr auto max_idle_duration = std::chrono::seconds(300);
+    auto& cs = get_compaction_state(&t);
+
+    co_await try_perform_cleanup(sorted_owned_ranges, t);
+    auto last_idle = seastar::lowres_clock::now();
+
+    while (!cs.sstables_requiring_cleanup.empty()) {
+        auto idle = seastar::lowres_clock::now() - last_idle;
+        if (idle >= max_idle_duration) {
+            auto msg = ::format("Cleanup timed out after {} seconds of no progress", std::chrono::duration_cast<std::chrono::seconds>(idle).count());
+            cmlog.warn("{}", msg);
+            co_await coroutine::return_exception(std::runtime_error(msg));
+        }
+
+        auto has_sstables_eligible_for_compaction = [&] {
+            for (auto& sst : cs.sstables_requiring_cleanup) {
+                if (sstables::is_eligible_for_compaction(sst)) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        cmlog.debug("perform_cleanup: waiting for sstables to become eligible for cleanup");
+        co_await t.get_staging_done_condition().when(sleep_duration, [&] { return has_sstables_eligible_for_compaction(); });
+
+        if (!has_sstables_eligible_for_compaction()) {
+            continue;
+        }
+        co_await try_perform_cleanup(sorted_owned_ranges, t);
+        last_idle = seastar::lowres_clock::now();
+    }
+}
+
+future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t) {
    auto check_for_cleanup = [this, &t] {
        return boost::algorithm::any_of(_tasks, [&t] (auto& task) {
            return task->compacting_table() == &t && task->type() == sstables::compaction_type::Cleanup;
        });
    };
    if (check_for_cleanup()) {
-        throw std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
-            t.schema()->ks_name(), t.schema()->cf_name()));
+        throw std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}", t));
    }

-    if (sorted_owned_ranges->empty()) {
-        throw std::runtime_error("cleanup request failed: sorted_owned_ranges is empty");
+    co_await run_with_compaction_disabled(t, [&] () -> future<> {
+        auto update_sstables_cleanup_state = [&] (const sstables::sstable_set& set) -> future<> {
+            // Hold on to the sstable set since it may be overwritten
+            // while we yield in this loop.
+            auto set_holder = set.shared_from_this();
+            co_await set.for_each_sstable_gently([&] (const sstables::shared_sstable& sst) {
+                update_sstable_cleanup_state(t, sst, *sorted_owned_ranges);
+            });
+        };
+        co_await update_sstables_cleanup_state(t.main_sstable_set());
+        co_await update_sstables_cleanup_state(t.maintenance_sstable_set());
+    });
+
+    auto& cs = get_compaction_state(&t);
+    if (cs.sstables_requiring_cleanup.empty()) {
+        cmlog.debug("perform_cleanup for {} found no sstables requiring cleanup", t);
+        co_return;
+    }
+
+    // Some sstables may remain in sstables_requiring_cleanup
+    // for later processing if they can't be cleaned up right now.
+    // They are erased from sstables_requiring_cleanup by compacting.release_compacting
+    cs.owned_ranges_ptr = std::move(sorted_owned_ranges);
+
+    auto found_maintenance_sstables = bool(t.maintenance_sstable_set().for_each_sstable_until([this, &t] (const sstables::shared_sstable& sst) {
+        return stop_iteration(requires_cleanup(t, sst));
+    }));
+    if (found_maintenance_sstables) {
+        co_await perform_offstrategy(t);
    }

    // Called with compaction_disabled
-    auto get_sstables = [this, &t, sorted_owned_ranges] () -> future<std::vector<sstables::shared_sstable>> {
-        return seastar::async([this, &t, sorted_owned_ranges = std::move(sorted_owned_ranges)] {
-            auto update_sstables_cleanup_state = [&] (const sstables::sstable_set& set) {
-                set.for_each_sstable([&] (const sstables::shared_sstable& sst) {
-                    update_sstable_cleanup_state(t, sst, sorted_owned_ranges);
-                    seastar::thread::maybe_yield();
-                });
-            };
-            update_sstables_cleanup_state(t.main_sstable_set());
-            update_sstables_cleanup_state(t.maintenance_sstable_set());
-            // Some sstables may remain in sstables_requiring_cleanup
-            // for later processing if they can't be cleaned up right now.
-            // They are erased from sstables_requiring_cleanup by compacting.release_compacting
-            auto& cs = get_compaction_state(&t);
-            if (!cs.sstables_requiring_cleanup.empty()) {
-                cs.owned_ranges_ptr = std::move(sorted_owned_ranges);
-            }
-            return get_candidates(t, cs.sstables_requiring_cleanup);
-        });
+    auto get_sstables = [this, &t] () -> future<std::vector<sstables::shared_sstable>> {
+        auto& cs = get_compaction_state(&t);
+        co_return get_candidates(t, cs.sstables_requiring_cleanup);
    };

    co_await perform_task_on_all_files<cleanup_sstables_compaction_task_executor>(t, sstables::compaction_type_options::make_cleanup(), std::move(sorted_owned_ranges),
@@ -1701,8 +1764,7 @@ compaction::compaction_state::compaction_state(table_state& t)
 void compaction_manager::add(table_state& t) {
    auto [_, inserted] = _compaction_state.try_emplace(&t, t);
    if (!inserted) {
-        auto s = t.schema();
-        on_internal_error(cmlog, format("compaction_state for table {}.{} [{}] already exists", s->ks_name(), s->cf_name(), fmt::ptr(&t)));
+        on_internal_error(cmlog, format("compaction_state for table {} [{}] already exists", t, fmt::ptr(&t)));
    }
 }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -304,7 +304,12 @@ public:
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);
+private:
+    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t);

+    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
+    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
+public:
    // Submit a table to be upgraded and wait for its termination.
    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version);

@@ -404,8 +409,9 @@ public:
        return _tombstone_gc_state;
    };

-    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
-    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, owned_ranges_ptr owned_ranges_ptr);
+    // Unconditionally erase sst from `sstables_requiring_cleanup`.
+    // Returns true iff sst was found and erased.
+    bool erase_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst);

    // checks if the sstable is in the respective compaction_state.sstables_requiring_cleanup set.
    bool requires_cleanup(table_state& t, const sstables::shared_sstable& sst) const;
--- a/compaction/compaction_state.hh
+++ b/compaction/compaction_state.hh
@@ -35,7 +35,7 @@ struct compaction_state {
    compaction_backlog_tracker backlog_tracker;

    std::unordered_set<sstables::shared_sstable> sstables_requiring_cleanup;
-    owned_ranges_ptr owned_ranges_ptr;
+    compaction::owned_ranges_ptr owned_ranges_ptr;

    explicit compaction_state(table_state& t);
    compaction_state(compaction_state&&) = delete;
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -37,6 +37,10 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
        return candidate;
    }

+    if (!table_s.tombstone_gc_enabled()) {
+        return compaction_descriptor();
+    }
+
    // if there is no sstable to compact in standard way, try compacting based on droppable tombstone ratio
    // unlike stcs, lcs can look for sstable with highest droppable tombstone ratio, so as not to choose
    // a sstable which droppable data shadow data in older sstable, by starting from highest levels which
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -164,6 +164,10 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
        return sstables::compaction_descriptor(std::move(most_interesting), service::get_local_compaction_priority());
    }

+    if (!table_s.tombstone_gc_enabled()) {
+        return compaction_descriptor();
+    }
+
    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
    // ratio is greater than threshold.
    // prefer oldest sstables from biggest size tiers because they will be easier to satisfy conditions for
--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -9,6 +9,8 @@

 #pragma once

+#include <seastar/core/condition-variable.hh>
+
 #include "schema/schema_fwd.hh"
 #include "compaction_descriptor.hh"

@@ -48,9 +50,24 @@ public:
    virtual api::timestamp_type min_memtable_timestamp() const = 0;
    virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
    virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
+    virtual bool tombstone_gc_enabled() const noexcept = 0;
    virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
    virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
+    virtual const std::string& get_group_id() const noexcept = 0;
+    virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
 };

-}
+} // namespace compaction

+namespace fmt {
+
+template <>
+struct formatter<compaction::table_state> : formatter<std::string_view> {
+    template <typename FormatContext>
+    auto format(const compaction::table_state& t, FormatContext& ctx) const {
+        auto s = t.schema();
+        return fmt::format_to(ctx.out(), "{}.{} compaction_group={}", s->ks_name(), s->cf_name(), t.get_group_id());
+    }
+};
+
+} // namespace fmt
--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -128,4 +128,44 @@ future<> shard_upgrade_sstables_compaction_task_impl::run() {
    });
 }

+future<> scrub_sstables_compaction_task_impl::run() {
+    _stats = co_await _db.map_reduce0([&] (replica::database& db) -> future<sstables::compaction_stats> {
+        sstables::compaction_stats stats;
+        tasks::task_info parent_info{_status.id, _status.shard};
+        auto& compaction_module = db.get_compaction_manager().get_task_manager_module();
+        auto task = co_await compaction_module.make_and_start_task<shard_scrub_sstables_compaction_task_impl>(parent_info, _status.keyspace, _status.id, db, _column_families, _opts, stats);
+        co_await task->done();
+        co_return stats;
+    }, sstables::compaction_stats{}, std::plus<sstables::compaction_stats>());
+}
+
+tasks::is_internal shard_scrub_sstables_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
+}
+
+future<> shard_scrub_sstables_compaction_task_impl::run() {
+    _stats = co_await map_reduce(_column_families, [&] (sstring cfname) -> future<sstables::compaction_stats> {
+        sstables::compaction_stats stats{};
+        tasks::task_info parent_info{_status.id, _status.shard};
+        auto& compaction_module = _db.get_compaction_manager().get_task_manager_module();
+        auto task = co_await compaction_module.make_and_start_task<table_scrub_sstables_compaction_task_impl>(parent_info, _status.keyspace, cfname, _status.id, _db, _opts, stats);
+        co_await task->done();
+        co_return stats;
+    }, sstables::compaction_stats{}, std::plus<sstables::compaction_stats>());
+}
+
+tasks::is_internal table_scrub_sstables_compaction_task_impl::is_internal() const noexcept {
+    return tasks::is_internal::yes;
+}
+
+future<> table_scrub_sstables_compaction_task_impl::run() {
+    auto& cm = _db.get_compaction_manager();
+    auto& cf = _db.find_column_family(_status.keyspace, _status.table);
+    co_await cf.parallel_foreach_table_state([&] (compaction::table_state& ts) mutable -> future<> {
+        auto r = co_await cm.perform_sstable_scrub(ts, _opts);
+        _stats += r.value_or(sstables::compaction_stats{});
+    });
+}
+
+
 }
--- a/compaction/task_manager_module.hh
+++ b/compaction/task_manager_module.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include "compaction/compaction.hh"
 #include "replica/database_fwd.hh"
 #include "schema/schema_fwd.hh"
 #include "tasks/task_manager.hh"
@@ -213,9 +214,9 @@ protected:
    virtual future<> run() override;
 };

-class rewrite_sstables_compaction_task_impl : public compaction_task_impl {
+class sstables_compaction_task_impl : public compaction_task_impl {
 public:
-    rewrite_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
+    sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
            tasks::task_id id,
            unsigned sequence_number,
            std::string keyspace,
@@ -234,7 +235,7 @@ protected:
    virtual future<> run() override = 0;
 };

-class upgrade_sstables_compaction_task_impl : public rewrite_sstables_compaction_task_impl {
+class upgrade_sstables_compaction_task_impl : public sstables_compaction_task_impl {
 private:
    sharded<replica::database>& _db;
    std::vector<table_id> _table_infos;
@@ -245,7 +246,7 @@ public:
            sharded<replica::database>& db,
            std::vector<table_id> table_infos,
            bool exclude_current_version) noexcept
-        : rewrite_sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _exclude_current_version(exclude_current_version)
@@ -254,7 +255,7 @@ protected:
    virtual future<> run() override;
 };

-class shard_upgrade_sstables_compaction_task_impl : public rewrite_sstables_compaction_task_impl {
+class shard_upgrade_sstables_compaction_task_impl : public sstables_compaction_task_impl {
 private:
    replica::database& _db;
    std::vector<table_id> _table_infos;
@@ -266,7 +267,7 @@ public:
            replica::database& db,
            std::vector<table_id> table_infos,
            bool exclude_current_version) noexcept
-        : rewrite_sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _exclude_current_version(exclude_current_version)
@@ -277,6 +278,79 @@ protected:
    virtual future<> run() override;
 };

+class scrub_sstables_compaction_task_impl : public sstables_compaction_task_impl {
+private:
+    sharded<replica::database>& _db;
+    std::vector<sstring> _column_families;
+    sstables::compaction_type_options::scrub _opts;
+    sstables::compaction_stats& _stats;
+public:
+    scrub_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
+            std::string keyspace,
+            sharded<replica::database>& db,
+            std::vector<sstring> column_families,
+            sstables::compaction_type_options::scrub opts,
+            sstables::compaction_stats& stats) noexcept
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", tasks::task_id::create_null_id())
+        , _db(db)
+        , _column_families(std::move(column_families))
+        , _opts(opts)
+        , _stats(stats)
+    {}
+protected:
+    virtual future<> run() override;
+};
+
+class shard_scrub_sstables_compaction_task_impl : public sstables_compaction_task_impl {
+private:
+    replica::database& _db;
+    std::vector<sstring> _column_families;
+    sstables::compaction_type_options::scrub _opts;
+    sstables::compaction_stats& _stats;
+public:
+    shard_scrub_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
+            std::string keyspace,
+            tasks::task_id parent_id,
+            replica::database& db,
+            std::vector<sstring> column_families,
+            sstables::compaction_type_options::scrub opts,
+            sstables::compaction_stats& stats) noexcept
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), "", "", parent_id)
+        , _db(db)
+        , _column_families(std::move(column_families))
+        , _opts(opts)
+        , _stats(stats)
+    {}
+
+    virtual tasks::is_internal is_internal() const noexcept override;
+protected:
+    virtual future<> run() override;
+};
+
+class table_scrub_sstables_compaction_task_impl : public sstables_compaction_task_impl {
+private:
+    replica::database& _db;
+    sstables::compaction_type_options::scrub _opts;
+    sstables::compaction_stats& _stats;
+public:
+    table_scrub_sstables_compaction_task_impl(tasks::task_manager::module_ptr module,
+            std::string keyspace,
+            std::string table,
+            tasks::task_id parent_id,
+            replica::database& db,
+            sstables::compaction_type_options::scrub opts,
+            sstables::compaction_stats& stats) noexcept
+        : sstables_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), std::move(keyspace), std::move(table), "", parent_id)
+        , _db(db)
+        , _opts(opts)
+        , _stats(stats)
+    {}
+
+    virtual tasks::is_internal is_internal() const noexcept override;
+protected:
+    virtual future<> run() override;
+};
+
 class task_manager_module : public tasks::task_manager::module {
 public:
    task_manager_module(tasks::task_manager& tm) noexcept : tasks::task_manager::module(tm, "compaction") {}
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -284,6 +284,10 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
        return most_interesting;
    }

+    if (!table_s.tombstone_gc_enabled()) {
+        return {};
+    }
+
    // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
    // ratio is greater than threshold.
    auto e = boost::range::remove_if(non_expiring_sstables, [this, compaction_time, &table_s] (const shared_sstable& sst) -> bool {
--- a/compatible_ring_position.hh
+++ b/compatible_ring_position.hh
@@ -31,25 +31,10 @@ public:
    const dht::ring_position_view& position() const {
        return *_rpv;
    }
-    friend std::strong_ordering tri_compare(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return dht::ring_position_tri_compare(*x._schema, x.position(), y.position());
+    std::strong_ordering operator<=>(const compatible_ring_position_or_view& other) const {
+        return dht::ring_position_tri_compare(*_schema, position(), other.position());
    }
-    friend bool operator<(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) < 0;
-    }
-    friend bool operator<=(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) <= 0;
-    }
-    friend bool operator>(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) > 0;
-    }
-    friend bool operator>=(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) >= 0;
-    }
-    friend bool operator==(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) == 0;
-    }
-    friend bool operator!=(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        return tri_compare(x, y) != 0;
+    bool operator==(const compatible_ring_position_or_view& other) const {
+        return *this <=> other == 0;
    }
 };
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -123,10 +123,6 @@ public:
        bool operator==(const iterator& other) const {
            return _offset == other._offset && other._i == _i;
        }
-
-        bool operator!=(const iterator& other) const {
-            return !(*this == other);
-        }
    };

    // A trichotomic comparator defined on @CompoundType representations which
@@ -429,7 +425,6 @@ public:

        const value_type& operator*() const { return _current; }
        const value_type* operator->() const { return &_current; }
-        bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
        bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }

        friend class composite;
@@ -636,7 +631,6 @@ public:
    }

    bool operator==(const composite_view& k) const { return k._bytes == _bytes && k._is_compound == _is_compound; }
-    bool operator!=(const composite_view& k) const { return !(k == *this); }

    friend fmt::formatter<composite_view>;
 };
--- a/compress.cc
+++ b/compress.cc
@@ -175,10 +175,6 @@ bool compression_parameters::operator==(const compression_parameters& other) con
           && _crc_check_chance == other._crc_check_chance;
 }

-bool compression_parameters::operator!=(const compression_parameters& other) const {
-    return !(*this == other);
-}
-
 void compression_parameters::validate_options(const std::map<sstring, sstring>& options) {
    // currently, there are no options specific to a particular compressor
    static std::set<sstring> keywords({
--- a/compress.hh
+++ b/compress.hh
@@ -105,7 +105,6 @@ public:
    void validate();
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
-    bool operator!=(const compression_parameters& other) const;

    static compression_parameters no_compression() {
        return compression_parameters(nullptr);
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -272,6 +272,7 @@ batch_size_fail_threshold_in_kb: 1024
 #     - alternator-streams
 #     - alternator-ttl
 #     - raft
+#     - tablets

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
--- a/configure.py
+++ b/configure.py
@@ -435,6 +435,8 @@ scylla_tests = set([
    'test/boost/mutation_writer_test',
    'test/boost/mvcc_test',
    'test/boost/network_topology_strategy_test',
+    'test/boost/token_metadata_test',
+    'test/boost/tablets_test',
    'test/boost/nonwrapping_range_test',
    'test/boost/observable_test',
    'test/boost/partitioner_test',
@@ -507,6 +509,8 @@ scylla_tests = set([
    'test/boost/exceptions_fallback_test',
    'test/boost/s3_test',
    'test/boost/locator_topology_test',
+    'test/boost/string_format_test',
+    'test/boost/tagged_integer_test',
    'test/manual/ec2_snitch_test',
    'test/manual/enormous_table_scan_test',
    'test/manual/gce_snitch_test',
@@ -561,6 +565,20 @@ raft_tests = set([
    'test/raft/failure_detector_test',
 ])

+wasms = set([
+    'wasm/return_input.wat',
+    'wasm/test_complex_null_values.wat',
+    'wasm/test_fib_called_on_null.wat',
+    'wasm/test_functions_with_frozen_types.wat',
+    'wasm/test_mem_grow.wat',
+    'wasm/test_pow.wat',
+    'wasm/test_short_ints.wat',
+    'wasm/test_types_with_and_without_nulls.wat',
+    'wasm/test_UDA_final.wat',
+    'wasm/test_UDA_scalar.wat',
+    'wasm/test_word_double.wat',
+])
+
 apps = set([
    'scylla',
 ])
@@ -571,7 +589,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | tests | other
+all_artifacts = apps | tests | other | wasms

 arg_parser = argparse.ArgumentParser('Configure scylla')
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -663,6 +681,7 @@ scylla_raft_core = [
 scylla_core = (['message/messaging_service.cc',
                'replica/database.cc',
                'replica/table.cc',
+                'replica/tablets.cc',
                'replica/distributed_loader.cc',
                'replica/memtable.cc',
                'replica/exceptions.cc',
@@ -672,6 +691,7 @@ scylla_core = (['message/messaging_service.cc',
                'mutation/frozen_mutation.cc',
                'mutation/mutation.cc',
                'mutation/mutation_fragment.cc',
+                'mutation/mutation_fragment_stream_validator.cc',
                'mutation/mutation_partition.cc',
                'mutation/mutation_partition_v2.cc',
                'mutation/mutation_partition_view.cc',
@@ -717,6 +737,7 @@ scylla_core = (['message/messaging_service.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
                'sstables/sstable_set.cc',
+                'sstables/storage.cc',
                'sstables/mx/partition_reversing_data_source.cc',
                'sstables/mx/reader.cc',
                'sstables/mx/writer.cc',
@@ -842,6 +863,7 @@ scylla_core = (['message/messaging_service.cc',
                'validation.cc',
                'service/priority_manager.cc',
                'service/migration_manager.cc',
+                'service/tablet_allocator.cc',
                'service/storage_proxy.cc',
                'query_ranges_to_vnodes.cc',
                'service/forward_service.cc',
@@ -931,6 +953,7 @@ scylla_core = (['message/messaging_service.cc',
                'query.cc',
                'query-result-set.cc',
                'locator/abstract_replication_strategy.cc',
+                'locator/tablets.cc',
                'locator/azure_snitch.cc',
                'locator/simple_strategy.cc',
                'locator/local_strategy.cc',
@@ -1169,7 +1192,7 @@ scylla_tests_generic_dependencies = [
    'test/lib/sstable_run_based_compaction_strategy_for_tests.cc',
 ]

-scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependencies + [
+scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_generic_dependencies + [
    'test/lib/cql_assertions.cc',
    'test/lib/result_set_assertions.cc',
    'test/lib/mutation_source_test.cc',
@@ -1187,6 +1210,7 @@ scylla_perfs = ['test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
                'test/perf/perf_sstable.cc',
+                'test/perf/perf_tablets.cc',
                'test/perf/perf.cc',
                'test/lib/alternator_test_env.cc',
                'test/lib/cql_test_env.cc',
@@ -1235,6 +1259,7 @@ pure_boost_tests = set([
    'test/boost/vint_serialization_test',
    'test/boost/bptree_test',
    'test/boost/utf8_test',
+    'test/boost/string_format_test',
    'test/manual/streaming_histogram_test',
 ])

@@ -1272,7 +1297,7 @@ for t in sorted(scylla_tests):
    if t not in tests_not_using_seastar_test_framework:
        deps[t] += scylla_tests_dependencies
    else:
-        deps[t] += scylla_core + idls + scylla_tests_generic_dependencies
+        deps[t] += scylla_core + alternator + idls + scylla_tests_generic_dependencies

 perf_tests_seastar_deps = [
    'seastar/tests/perf/perf_tests.cc'
@@ -1338,15 +1363,27 @@ deps['test/raft/discovery_test'] =  ['test/raft/discovery_test.cc',
                                     'test/lib/log.cc',
                                     'service/raft/discovery.cc'] + scylla_raft_dependencies

+wasm_deps = {}
+
+wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
+wasm_deps['wasm/test_short_ints.wat'] = 'test/resource/wasm/rust/test_short_ints.rs'
+wasm_deps['wasm/test_complex_null_values.wat'] = 'test/resource/wasm/rust/test_complex_null_values.rs'
+wasm_deps['wasm/test_functions_with_frozen_types.wat'] = 'test/resource/wasm/rust/test_functions_with_frozen_types.rs'
+wasm_deps['wasm/test_types_with_and_without_nulls.wat'] = 'test/resource/wasm/rust/test_types_with_and_without_nulls.rs'
+
+wasm_deps['wasm/test_fib_called_on_null.wat'] = 'test/resource/wasm/c/test_fib_called_on_null.c'
+wasm_deps['wasm/test_mem_grow.wat'] = 'test/resource/wasm/c/test_mem_grow.c'
+wasm_deps['wasm/test_pow.wat'] = 'test/resource/wasm/c/test_pow.c'
+wasm_deps['wasm/test_UDA_final.wat'] = 'test/resource/wasm/c/test_UDA_final.c'
+wasm_deps['wasm/test_UDA_scalar.wat'] = 'test/resource/wasm/c/test_UDA_scalar.c'
+wasm_deps['wasm/test_word_double.wat'] = 'test/resource/wasm/c/test_word_double.c'

 warnings = [
    '-Wall',
    '-Werror',
    '-Wno-mismatched-tags',  # clang-only
    '-Wno-tautological-compare',
-    '-Wno-parentheses-equality',
    '-Wno-c++11-narrowing',
-    '-Wno-missing-braces',
    '-Wno-ignored-attributes',
    '-Wno-overloaded-virtual',
    '-Wno-unused-command-line-argument',
@@ -1502,10 +1539,10 @@ default_modes = args.selected_modes or [mode for mode, mode_cfg in modes.items()
 build_modes =  {m: modes[m] for m in selected_modes}

 if args.artifacts:
-    build_artifacts = []
+    build_artifacts = set()
    for artifact in args.artifacts:
        if artifact in all_artifacts:
-            build_artifacts.append(artifact)
+            build_artifacts.add(artifact)
        else:
            print("Ignoring unknown build artifact: {}".format(artifact))
    if not build_artifacts:
@@ -1787,7 +1824,32 @@ with open(buildfile, 'w') as f:
            description = RUST_SOURCE $out
        rule cxxbridge_header
            command = cxxbridge --header > $out
+        rule c2wasm
+            command = clang --target=wasm32 --no-standard-libraries -Wl,--export-all -Wl,--no-entry $in -o $out
+            description = C2WASM $out
+        rule rust2wasm
+            # The default stack size in Rust is 1MB, which causes oversized allocation warnings,
+            # because it's allocated in a single chunk as a part of a Wasm Linear Memory.
+            # We change the stack size to 128KB using the RUSTFLAGS environment variable
+            # in the command below.
+            command = RUSTFLAGS="-C link-args=-zstack-size=131072" cargo build --target=wasm32-wasi --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
+                && wasm-opt -Oz $builddir/wasm/wasm32-wasi/debug/examples/$example.wasm -o $builddir/wasm/$example.wasm $
+                && wasm-strip $builddir/wasm/$example.wasm
+            description = RUST2WASM $out
+        rule wasm2wat
+            command = wasm2wat $in > $out
+            description = WASM2WAT $out
        ''').format(**globals()))
+    for binary in sorted(wasms):
+        src = wasm_deps[binary]
+        wasm = binary[:-4] + '.wasm'
+        if src.endswith('.rs'):
+            f.write(f'build $builddir/{wasm}: rust2wasm {src} | test/resource/wasm/rust/Cargo.lock\n')
+            example_name = binary[binary.rindex('/')+1:-4]
+            f.write(f'   example = {example_name}\n')
+        else:
+            f.write(f'build $builddir/{wasm}: c2wasm {src}\n')
+        f.write(f'build $builddir/{binary}: wasm2wat $builddir/{wasm}\n')
    for mode in build_modes:
        modeval = modes[mode]
        fmt_lib = 'fmt'
@@ -1852,9 +1914,10 @@ with open(buildfile, 'w') as f:
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, test_repeat=test_repeat, test_timeout=test_timeout, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts)])
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
            )
        )
        include_cxx_target = f'{mode}-build' if not args.dist_only else ''
@@ -1871,7 +1934,7 @@ with open(buildfile, 'w') as f:
        seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
        seastar_testing_dep = f'$builddir/{mode}/seastar/libseastar_testing.{seastar_lib_ext}'
        for binary in sorted(build_artifacts):
-            if binary in other:
+            if binary in other or binary in wasms:
                continue
            srcs = deps[binary]
            objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
@@ -1904,7 +1967,7 @@ with open(buildfile, 'w') as f:
                if binary not in tests_not_using_seastar_test_framework:
                    local_libs += ' ' + "$seastar_testing_libs_{}".format(mode)
                else:
-                    local_libs += ' ' + '-lgnutls'
+                    local_libs += ' ' + '-lgnutls' + ' ' + '-lboost_unit_test_framework'
                # Our code's debugging information is huge, and multiplied
                # by many tests yields ridiculous amounts of disk space.
                # So we strip the tests by default; The user can very
@@ -1959,9 +2022,10 @@ with open(buildfile, 'w') as f:
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
+                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
            )
        )
        f.write(
@@ -2025,13 +2089,14 @@ with open(buildfile, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
+                flags = '-Wno-parentheses-equality'
                if cc.endswith('Parser.cpp'):
                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+                    flags += ' -O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''

                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
-                    f.write('  obj_cxxflags = %s\n' % flags)
+                f.write(f'  obj_cxxflags = {flags}\n')
        f.write(f'build $builddir/{mode}/gen/empty.cc: gen\n')
        for hh in headers:
            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | $builddir/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
@@ -2104,6 +2169,9 @@ with open(buildfile, 'w') as f:
    f.write(
            'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in default_modes]))
    )
+    f.write(
+            'build wasm: phony {}\n'.format(' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]))
+    )

    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
--- a/counters.hh
+++ b/counters.hh
@@ -78,9 +78,6 @@ public:
        return id() == other.id() && value() == other.value()
               && logical_clock() == other.logical_clock();
    }
-    bool operator!=(const basic_counter_shard_view& other) const {
-        return !(*this == other);
-    }

    struct less_compare_by_id {
        bool operator()(const basic_counter_shard_view& x, const basic_counter_shard_view& y) const {
--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -7,7 +7,7 @@ generate_cql_grammar(
  SOURCES cql_grammar_srcs)
 set_source_files_properties(${cql_grammar_srcs}
  PROPERTIES
-    COMPILE_FLAGS "-Wno-uninitialized")
+    COMPILE_FLAGS "-Wno-uninitialized -Wno-parentheses-equality")

 add_library(cql3 STATIC)
 target_sources(cql3
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -1773,7 +1773,12 @@ relation returns [expression e]
    : name=cident type=relationType t=term { $e = binary_operator(unresolved_identifier{std::move(name)}, type, std::move(t)); }

    | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
-        { $e = binary_operator(token{std::move(l.elements)}, type, std::move(t)); }
+        {
+          $e = binary_operator(
+            function_call{functions::function_name::native_function("token"), std::move(l.elements)},
+            type,
+            std::move(t));
+        }
    | name=cident K_IS K_NOT K_NULL {
          $e = binary_operator(unresolved_identifier{std::move(name)}, oper_t::IS_NOT, make_untyped_null()); }
    | name=cident K_IN marker1=marker
--- a/cql3/authorized_prepared_statements_cache.hh
+++ b/cql3/authorized_prepared_statements_cache.hh
@@ -57,13 +57,7 @@ public:

    const cache_key_type& key() const { return _key; }

-    bool operator==(const authorized_prepared_statements_cache_key& other) const {
-        return _key == other._key;
-    }
-
-    bool operator!=(const authorized_prepared_statements_cache_key& other) const {
-        return !(*this == other);
-    }
+    bool operator==(const authorized_prepared_statements_cache_key&) const = default;

    static size_t hash(const auth::authenticated_user& user, const cql3::prepared_cache_key_type::cache_key_type& prep_cache_key) {
        return utils::hash_combine(std::hash<auth::authenticated_user>()(user), utils::tuple_hash()(prep_cache_key));
--- a/cql3/column_identifier.cc
+++ b/cql3/column_identifier.cc
@@ -11,8 +11,6 @@
 #include "cql3/util.hh"
 #include "cql3/query_options.hh"

-#include <regex>
-
 namespace cql3 {

 column_identifier::column_identifier(sstring raw_text, bool keep_case) {
@@ -96,10 +94,6 @@ bool column_identifier_raw::operator==(const column_identifier_raw& other) const
    return _text == other._text;
 }

-bool column_identifier_raw::operator!=(const column_identifier_raw& other) const {
-    return !operator==(other);
-}
-
 sstring column_identifier_raw::to_string() const {
    return _text;
 }
--- a/cql3/column_identifier.hh
+++ b/cql3/column_identifier.hh
@@ -88,8 +88,6 @@ public:

    bool operator==(const column_identifier_raw& other) const;

-    bool operator!=(const column_identifier_raw& other) const;
-
    virtual sstring to_string() const;
    sstring to_cql_string() const;

--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -205,10 +205,10 @@ class cql3_type::raw_ut : public raw {

    virtual sstring to_string() const override {
        if (is_frozen()) {
-            return format("frozen<{}>", _name.to_string());
+            return format("frozen<{}>", _name.to_cql_string());
        }

-        return _name.to_string();
+        return _name.to_cql_string();
    }
 public:
    raw_ut(ut_name name)
--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -86,57 +86,43 @@ private:
    {
        using namespace antlr3;
        std::stringstream msg;
+        // Antlr3 has a function ex->displayRecognitionError() which is
+        // supposed to nicely print the recognition exception. Unfortunately
+        // it is buggy - see https://github.com/antlr/antlr3/issues/191 -
+        // and is not being fixed upstream, so we reimplement it here with the fix.
        switch (ex->getType()) {
-        case ExceptionType::UNWANTED_TOKEN_EXCEPTION: {
-            msg << "extraneous input " << get_token_error_display(recognizer, ex->get_token());
-            if (token_names != nullptr) {
-                std::string token_name;
-                if (recognizer.is_eof_token(ex->get_expecting())) {
-                    token_name = "EOF";
-                } else {
-                    token_name = reinterpret_cast<const char*>(token_names[ex->get_expecting()]);
-                }
-                msg << " expecting " << token_name;
-            }
-            break;
-        }
-        case ExceptionType::MISSING_TOKEN_EXCEPTION: {
-            std::string token_name;
-            if (token_names == nullptr) {
-                token_name = "(" + std::to_string(ex->get_expecting()) + ")";
-            } else {
-                if (recognizer.is_eof_token(ex->get_expecting())) {
-                    token_name = "EOF";
-                } else {
-                    token_name = reinterpret_cast<const char*>(token_names[ex->get_expecting()]);
-                }
-            }
-            msg << "missing " << token_name << " at " << get_token_error_display(recognizer, ex->get_token());
-            break;
-        }
-        case ExceptionType::NO_VIABLE_ALT_EXCEPTION: {
-            msg << "no viable alternative at input " << get_token_error_display(recognizer, ex->get_token());
-            break;
-        }
+        case ExceptionType::RECOGNITION_EXCEPTION:
+        case ExceptionType::EARLY_EXIT_EXCEPTION:
        default:
-            // AntLR Exception class has a bug of dereferencing a null
-            // pointer in the displayRecognitionError. The following
-            // if statement makes sure it will not be null before the
-            // call to that function (displayRecognitionError).
-            // bug reference: https://github.com/antlr/antlr3/issues/191
-            if (!ex->get_expectingSet()) {
-                ex->set_expectingSet(&_empty_bit_list);
+            // Unknown syntax error - the parser can't figure out what
+            // specific token is missing or unwanted.
+            msg << ": Syntax error";
+            break;
+        case ExceptionType::MISSING_TOKEN_EXCEPTION:
+            msg << ": Missing ";
+            if (recognizer.is_eof_token(ex->get_expecting())) {
+                msg << "EOF";
+            } else if (token_names) {
+                msg << reinterpret_cast<const char*>(token_names[ex->get_expecting()]);
+            } else {
+                msg << ex->get_expecting();
            }
-            ex->displayRecognitionError(token_names, msg);
+            break;
+        case ExceptionType::UNWANTED_TOKEN_EXCEPTION:
+        case ExceptionType::MISMATCHED_SET_EXCEPTION:
+            msg << ": Unexpected '";
+            msg << recognizer.token_text(ex->get_token());
+            msg << "'";
+            break;
+        case ExceptionType::NO_VIABLE_ALT_EXCEPTION:
+            msg << "no viable alternative at input '";
+            msg << recognizer.token_text(ex->get_token());
+            msg << "'";
+            break;
        }
        return msg.str();
    }

-    std::string get_token_error_display(RecognizerType& recognizer, const TokenType* token)
-    {
-        return "'" + recognizer.token_text(token) + "'";
-    }
-
 #if 0

    /**
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -56,10 +56,6 @@ bool operator==(const expression& e1, const expression& e2) {
    }, e1);
 }

-bool operator!=(const expression& e1, const expression& e2) {
-    return !(e1 == e2);
-}
-
 expression::expression(const expression& o)
        : _v(std::make_unique<impl>(*o._v)) {
 }
@@ -70,24 +66,6 @@ expression::operator=(const expression& o) {
    return *this;
 }

-token::token(std::vector<expression> args_in)
-    : args(std::move(args_in)) {
-}
-
-token::token(const std::vector<const column_definition*>& col_defs) {
-    args.reserve(col_defs.size());
-    for (const column_definition* col_def : col_defs) {
-        args.push_back(column_value(col_def));
-    }
-}
-
-token::token(const std::vector<::shared_ptr<column_identifier_raw>>& cols) {
-    args.reserve(cols.size());
-    for(const ::shared_ptr<column_identifier_raw>& col : cols) {
-        args.push_back(unresolved_identifier{col});
-    }
-}
-
 binary_operator::binary_operator(expression lhs, oper_t op, expression rhs, comparison_order order)
            : lhs(std::move(lhs))
            , op(op)
@@ -564,89 +542,11 @@ value_set intersection(value_set a, value_set b, const abstract_type* type) {
    return std::visit(intersection_visitor{type}, std::move(a), std::move(b));
 }

-bool is_satisfied_by(const binary_operator& opr, const evaluation_inputs& inputs) {
-    if (is<token>(opr.lhs)) {
-        // The RHS value was already used to ensure we fetch only rows in the specified
-        // token range. It is impossible for any fetched row not to match now.
-        // When token restrictions are present we forbid all other restrictions on partition key.
-        // This means that the partition range is defined solely by restrictions on token.
-        // When is_satisifed_by is used by filtering we can be sure that the token restrictions
-        // are fulfilled. In the future it will be possible to evaluate() a token,
-        // and we will be able to get rid of this risky if.
-        return true;
-    }
-
-    raw_value binop_eval_result = evaluate(opr, inputs);
-
-    if (binop_eval_result.is_null()) {
-        return false;
-    }
-    if (binop_eval_result.is_empty_value()) {
-        on_internal_error(expr_logger, format("is_satisfied_by: binary operator evaluated to EMPTY_VALUE: {}", opr));
-    }
-
-    return binop_eval_result.view().deserialize<bool>(*boolean_type);
-}
-
 } // anonymous namespace

 bool is_satisfied_by(const expression& restr, const evaluation_inputs& inputs) {
-    return expr::visit(overloaded_functor{
-            [] (const constant& constant_val) {
-                std::optional<bool> bool_val = get_bool_value(constant_val);
-                if (bool_val.has_value()) {
-                    return *bool_val;
-                }
-
-                on_internal_error(expr_logger,
-                    "is_satisfied_by: a constant that is not a bool value cannot serve as a restriction by itself");
-            },
-            [&] (const conjunction& conj) {
-                return boost::algorithm::all_of(conj.children, [&] (const expression& c) {
-                    return is_satisfied_by(c, inputs);
-                });
-            },
-            [&] (const binary_operator& opr) { return is_satisfied_by(opr, inputs); },
-            [] (const column_value&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a column cannot serve as a restriction by itself");
-            },
-            [] (const subscript&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a subscript cannot serve as a restriction by itself");
-            },
-            [] (const token&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: the token function cannot serve as a restriction by itself");
-            },
-            [] (const unresolved_identifier&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: an unresolved identifier cannot serve as a restriction");
-            },
-            [] (const column_mutation_attribute&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: the writetime/ttl cannot serve as a restriction by itself");
-            },
-            [] (const function_call&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a function call cannot serve as a restriction by itself");
-            },
-            [] (const cast&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a a type cast cannot serve as a restriction by itself");
-            },
-            [] (const field_selection&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a field selection cannot serve as a restriction by itself");
-            },
-            [] (const bind_variable&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a bind variable cannot serve as a restriction by itself");
-            },
-            [] (const untyped_constant&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: an untyped constant cannot serve as a restriction by itself");
-            },
-            [] (const tuple_constructor&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a tuple constructor cannot serve as a restriction by itself");
-            },
-            [] (const collection_constructor&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a collection constructor cannot serve as a restriction by itself");
-            },
-            [] (const usertype_constructor&) -> bool {
-                on_internal_error(expr_logger, "is_satisfied_by: a user type constructor cannot serve as a restriction by itself");
-            },
-        }, restr);
+    static auto true_value = managed_bytes_opt(data_value(true).serialize_nonnull());
+    return evaluate(restr, inputs).to_managed_bytes_opt() == true_value;
 }

 namespace {
@@ -767,7 +667,15 @@ nonwrapping_range<clustering_key_prefix> to_range(oper_t op, const clustering_ke
    return to_range<const clustering_key_prefix&>(op, val);
 }

-value_set possible_lhs_values(const column_definition* cdef, const expression& expr, const query_options& options) {
+// When cdef == nullptr it finds possible token values instead of column values.
+// When finding token values the table_schema_opt argument has to point to a valid schema.
+// When finding column values it may be null, except that a function call on the LHS of a
+// binary operator is still checked against *table_schema_opt before cdef is consulted.
+// The schema is needed to find out whether a call to token() represents the partition token.
+static value_set possible_lhs_values(const column_definition* cdef,
+                                        const expression& expr,
+                                        const query_options& options,
+                                        const schema* table_schema_opt) {
    const auto type = cdef ? &cdef->type->without_reversed() : long_type.get();
    return expr::visit(overloaded_functor{
            [] (const constant& constant_val) {
@@ -783,7 +691,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                return boost::accumulate(conj.children, unbounded_value_set,
                        [&] (const value_set& acc, const expression& child) {
                            return intersection(
-                                    std::move(acc), possible_lhs_values(cdef, child, options), type);
+                                    std::move(acc), possible_lhs_values(cdef, child, options, table_schema_opt), type);
                        });
            },
            [&] (const binary_operator& oper) -> value_set {
@@ -863,7 +771,11 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                            }
                            return unbounded_value_set;
                        },
-                        [&] (token) -> value_set {
+                        [&] (const function_call& token_fun_call) -> value_set {
+                            if (!is_partition_token_for_schema(token_fun_call, *table_schema_opt)) {
+                                on_internal_error(expr_logger, "possible_lhs_values: function calls are not supported as the LHS of a binary expression");
+                            }
+
                            if (cdef) {
                                return unbounded_value_set;
                            }
@@ -905,9 +817,6 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                        [] (const column_mutation_attribute&) -> value_set {
                            on_internal_error(expr_logger, "possible_lhs_values: writetime/ttl are not supported as the LHS of a binary expression");
                        },
-                        [] (const function_call&) -> value_set {
-                            on_internal_error(expr_logger, "possible_lhs_values: function calls are not supported as the LHS of a binary expression");
-                        },
                        [] (const cast&) -> value_set {
                            on_internal_error(expr_logger, "possible_lhs_values: typecasts are not supported as the LHS of a binary expression");
                        },
@@ -934,11 +843,8 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
            [] (const subscript&) -> value_set {
                on_internal_error(expr_logger, "possible_lhs_values: a subscript cannot serve as a restriction by itself");
            },
-            [] (const token&) -> value_set {
-                on_internal_error(expr_logger, "possible_lhs_values: the token function cannot serve as a restriction by itself");
-            },
            [] (const unresolved_identifier&) -> value_set {
-                on_internal_error(expr_logger, "is_satisfied_by: an unresolved identifier cannot serve as a restriction");
+                on_internal_error(expr_logger, "possible_lhs_values: an unresolved identifier cannot serve as a restriction");
            },
            [] (const column_mutation_attribute&) -> value_set {
                on_internal_error(expr_logger, "possible_lhs_values: the writetime/ttl functions cannot serve as a restriction by itself");
@@ -970,6 +876,14 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
        }, expr);
 }

+value_set possible_column_values(const column_definition* col, const expression& e, const query_options& options) {
+    return possible_lhs_values(col, e, options, nullptr);
+}
+
+value_set possible_partition_token_values(const expression& e, const query_options& options, const schema& table_schema) {
+    return possible_lhs_values(nullptr, e, options, &table_schema);
+}
+
 nonwrapping_range<managed_bytes> to_range(const value_set& s) {
    return std::visit(overloaded_functor{
            [] (const nonwrapping_range<managed_bytes>& r) { return r; },
@@ -1017,7 +931,7 @@ secondary_index::index::supports_expression_v is_supported_by_helper(const expre
                            // We don't use index table for multi-column restrictions, as it cannot avoid filtering.
                            return index::supports_expression_v::from_bool(false);
                        },
-                        [&] (const token&) { return index::supports_expression_v::from_bool(false); },
+                        [&] (const function_call&) { return index::supports_expression_v::from_bool(false); },
                        [&] (const subscript& s) -> ret_t {
                            const column_value& col = get_subscripted_column(s);
                            return idx.supports_subscript_expression(*col.col, oper.op);
@@ -1037,9 +951,6 @@ secondary_index::index::supports_expression_v is_supported_by_helper(const expre
                        [&] (const column_mutation_attribute&) -> ret_t {
                            on_internal_error(expr_logger, "is_supported_by: writetime/ttl are not supported as the LHS of a binary expression");
                        },
-                        [&] (const function_call&) -> ret_t {
-                            on_internal_error(expr_logger, "is_supported_by: function calls are not supported as the LHS of a binary expression");
-                        },
                        [&] (const cast&) -> ret_t {
                            on_internal_error(expr_logger, "is_supported_by: typecasts are not supported as the LHS of a binary expression");
                        },
@@ -1106,7 +1017,7 @@ std::ostream& operator<<(std::ostream& os, const column_value& cv) {
 std::ostream& operator<<(std::ostream& os, const expression& expr) {
    expression::printer pr {
        .expr_to_print = expr,
-        .debug_mode = true
+        .debug_mode = false
    };

    return os << pr;
@@ -1163,9 +1074,6 @@ std::ostream& operator<<(std::ostream& os, const expression::printer& pr) {
                    }
                }
            },
-            [&] (const token& t) {
-                fmt::print(os, "token({})", fmt::join(t.args | transformed(to_printer), ", "));
-            },
            [&] (const column_value& col) {
                fmt::print(os, "{}", cql3::util::maybe_quote(col.col->name_as_text()));
            },
@@ -1185,14 +1093,18 @@ std::ostream& operator<<(std::ostream& os, const expression::printer& pr) {
                        to_printer(cma.column));
            },
            [&] (const function_call& fc)  {
-                std::visit(overloaded_functor{
-                    [&] (const functions::function_name& named) {
-                        fmt::print(os, "{}({})", named, fmt::join(fc.args | transformed(to_printer), ", "));
-                    },
-                    [&] (const shared_ptr<functions::function>& anon) {
-                        fmt::print(os, "<anonymous function>({})", fmt::join(fc.args | transformed(to_printer), ", "));
-                    },
-                }, fc.func);
+                if (is_token_function(fc)) {
+                    fmt::print(os, "token({})", fmt::join(fc.args | transformed(to_printer), ", "));
+                } else {
+                    std::visit(overloaded_functor{
+                        [&] (const functions::function_name& named) {
+                            fmt::print(os, "{}({})", named, fmt::join(fc.args | transformed(to_printer), ", "));
+                        },
+                        [&] (const shared_ptr<functions::function>& anon) {
+                            fmt::print(os, "<anonymous function>({})", fmt::join(fc.args | transformed(to_printer), ", "));
+                        },
+                    }, fc.func);
+                }
            },
            [&] (const cast& c)  {
                std::visit(overloaded_functor{
@@ -1367,9 +1279,9 @@ expression replace_column_def(const expression& expr, const column_definition* n
    });
 }

-expression replace_token(const expression& expr, const column_definition* new_cdef) {
+expression replace_partition_token(const expression& expr, const column_definition* new_cdef, const schema& table_schema) {
    return search_and_replace(expr, [&] (const expression& expr) -> std::optional<expression> {
-        if (expr::is<token>(expr)) {
+        if (is_partition_token_for_schema(expr, table_schema)) {
            return column_value{new_cdef};
        } else {
            return std::nullopt;
@@ -1443,14 +1355,6 @@ bool recurse_until(const expression& e, const noncopyable_function<bool (const e
                }
                return false;
            },
-            [&] (const token& tok) {
-                for (auto& a : tok.args) {
-                    if (auto found = recurse_until(a, predicate_fun)) {
-                        return found;
-                    }
-                }
-                return false;
-            },
            [](LeafExpression auto const&) {
                return false;
            }
@@ -1526,13 +1430,6 @@ expression search_and_replace(const expression& e,
                        .type = s.type,
                    };
                },
-                [&](const token& tok) -> expression {
-                    return token {
-                        boost::copy_range<std::vector<expression>>(
-                            tok.args | boost::adaptors::transformed(recurse)
-                        )
-                    };
-                },
                [&] (LeafExpression auto const& e) -> expression {
                    return e;
                },
@@ -1607,7 +1504,6 @@ std::vector<expression> extract_single_column_restrictions_for_column(const expr
            }
        }

-        void operator()(const token&) {}
        void operator()(const unresolved_identifier&) {}
        void operator()(const column_mutation_attribute&) {}
        void operator()(const function_call&) {}
@@ -1771,9 +1667,6 @@ cql3::raw_value evaluate(const expression& e, const evaluation_inputs& inputs) {
        [&](const conjunction& conj) -> cql3::raw_value {
            return evaluate(conj, inputs);
        },
-        [](const token&) -> cql3::raw_value {
-            on_internal_error(expr_logger, "Can't evaluate token");
-        },
        [](const unresolved_identifier&) -> cql3::raw_value {
            on_internal_error(expr_logger, "Can't evaluate unresolved_identifier");
        },
@@ -2313,11 +2206,6 @@ void fill_prepare_context(expression& e, prepare_context& ctx) {
                fill_prepare_context(child, ctx);
            }
        },
-        [&](token& tok) {
-            for (expression& arg : tok.args) {
-                fill_prepare_context(arg, ctx);
-            }
-        },
        [](unresolved_identifier&) {},
        [&](column_mutation_attribute& a) {
            fill_prepare_context(a.column, ctx);
@@ -2367,9 +2255,6 @@ type_of(const expression& e) {
        [] (const column_value& e) {
            return e.col->type;
        },
-        [] (const token& e) {
-            return long_type;
-        },
        [] (const unresolved_identifier& e) -> data_type {
            on_internal_error(expr_logger, "evaluating type of unresolved_identifier");
        },
@@ -2550,7 +2435,7 @@ sstring get_columns_in_commons(const expression& a, const expression& b) {
 }

 bytes_opt value_for(const column_definition& cdef, const expression& e, const query_options& options) {
-    value_set possible_vals = possible_lhs_values(&cdef, e, options);
+    value_set possible_vals = possible_column_values(&cdef, e, options);
    return std::visit(overloaded_functor {
        [&](const value_list& val_list) -> bytes_opt {
            if (val_list.empty()) {
@@ -2694,5 +2579,69 @@ adjust_for_collection_as_maps(const expression& e) {
    });
 }

+bool is_token_function(const function_call& fun_call) {
+    static thread_local const functions::function_name token_function_name =
+        functions::function_name::native_function("token");
+
+    // Check that function name is "token"
+    const functions::function_name& fun_name =
+        std::visit(overloaded_functor{[](const functions::function_name& fname) { return fname; },
+                                      [](const shared_ptr<functions::function>& fun) { return fun->name(); }},
+                   fun_call.func);
+
+    return fun_name.has_keyspace() ? fun_name == token_function_name : fun_name.name == token_function_name.name;
+}
+
+bool is_token_function(const expression& e) {
+    const function_call* fun_call = as_if<function_call>(&e);
+    if (fun_call == nullptr) {
+        return false;
+    }
+
+    return is_token_function(*fun_call);
+}
+
+bool is_partition_token_for_schema(const function_call& fun_call, const schema& table_schema) {
+    if (!is_token_function(fun_call)) {
+        return false;
+    }
+
+    if (fun_call.args.size() != table_schema.partition_key_size()) {
+        return false;
+    }
+
+    auto arguments_iter = fun_call.args.begin();
+    for (const column_definition& partition_key_col : table_schema.partition_key_columns()) {
+        const expression& cur_argument = *arguments_iter;
+
+        const column_value* cur_col = as_if<column_value>(&cur_argument);
+        if (cur_col == nullptr) {
+            // A sanity check that we didn't call the function on an unprepared expression.
+            if (is<unresolved_identifier>(cur_argument)) {
+                on_internal_error(expr_logger,
+                                  format("called is_partition_token with unprepared expression: {}", fun_call));
+            }
+
+            return false;
+        }
+
+        if (cur_col->col != &partition_key_col) {
+            return false;
+        }
+
+        arguments_iter++;
+    }
+
+    return true;
+}
+
+bool is_partition_token_for_schema(const expression& maybe_token, const schema& table_schema) {
+    const function_call* fun_call = as_if<function_call>(&maybe_token);
+    if (fun_call == nullptr) {
+        return false;
+    }
+
+    return is_partition_token_for_schema(*fun_call, table_schema);
+}
 } // namespace expr
 } // namespace cql3
--- a/cql3/expr/expression.hh
+++ b/cql3/expr/expression.hh
@@ -70,7 +70,6 @@ struct binary_operator;
 struct conjunction;
 struct column_value;
 struct subscript;
-struct token;
 struct unresolved_identifier;
 struct column_mutation_attribute;
 struct function_call;
@@ -89,7 +88,6 @@ concept ExpressionElement
        || std::same_as<T, binary_operator>
        || std::same_as<T, column_value>
        || std::same_as<T, subscript>
-        || std::same_as<T, token>
        || std::same_as<T, unresolved_identifier>
        || std::same_as<T, column_mutation_attribute>
        || std::same_as<T, function_call>
@@ -109,7 +107,6 @@ concept invocable_on_expression
        && std::invocable<Func, binary_operator>
        && std::invocable<Func, column_value>
        && std::invocable<Func, subscript>
-        && std::invocable<Func, token>
        && std::invocable<Func, unresolved_identifier>
        && std::invocable<Func, column_mutation_attribute>
        && std::invocable<Func, function_call>
@@ -129,7 +126,6 @@ concept invocable_on_expression_ref
        && std::invocable<Func, binary_operator&>
        && std::invocable<Func, column_value&>
        && std::invocable<Func, subscript&>
-        && std::invocable<Func, token&>
        && std::invocable<Func, unresolved_identifier&>
        && std::invocable<Func, column_mutation_attribute&>
        && std::invocable<Func, function_call&>
@@ -229,18 +225,6 @@ const column_value& get_subscripted_column(const subscript&);
 /// Only columns can be subscripted in CQL, so we can expect that the subscripted expression is a column_value.
 const column_value& get_subscripted_column(const expression&);

-/// Represents token(c1, c2) function on LHS of an operator relation.
-/// args contains arguments to the token function.
-struct token {
-    std::vector<expression> args;
-
-    explicit token(std::vector<expression>);
-    explicit token(const std::vector<const column_definition*>&);
-    explicit token(const std::vector<::shared_ptr<column_identifier_raw>>&);
-
-    friend bool operator==(const token&, const token&) = default;
-};
-
 enum class oper_t { EQ, NEQ, LT, LTE, GTE, GT, IN, CONTAINS, CONTAINS_KEY, IS_NOT, LIKE };

 /// Describes the nature of clustering-key comparisons.  Useful for implementing SCYLLA_CLUSTERING_BOUND.
@@ -429,7 +413,7 @@ struct usertype_constructor {
 // now that all expression types are fully defined, we can define expression::impl
 struct expression::impl final {
    using variant_type = std::variant<
-            conjunction, binary_operator, column_value, token, unresolved_identifier,
+            conjunction, binary_operator, column_value, unresolved_identifier,
            column_mutation_attribute, function_call, cast, field_selection,
            bind_variable, untyped_constant, constant, tuple_constructor, collection_constructor,
            usertype_constructor, subscript>;
@@ -510,8 +494,8 @@ using value_list = std::vector<managed_bytes>; // Sorted and deduped using value
 /// never singular and never has start > end.  Universal set is a nonwrapping_range with both bounds null.
 using value_set = std::variant<value_list, nonwrapping_range<managed_bytes>>;

-/// A set of all column values that would satisfy an expression.  If column is null, a set of all token values
-/// that satisfy.
+/// A set of all column values that would satisfy an expression. The _token_values variant finds
+/// matching values for the partition token function call instead of the column.
 ///
 /// An expression restricts possible values of a column or token:
 /// - `A>5` restricts A from below
@@ -521,7 +505,8 @@ using value_set = std::variant<value_list, nonwrapping_range<managed_bytes>>;
 /// - `A=1 AND A<=0` restricts A to an empty list; no value is able to satisfy the expression
 /// - `A>=NULL` also restricts A to an empty list; all comparisons to NULL are false
 /// - an expression without A "restricts" A to unbounded range
-extern value_set possible_lhs_values(const column_definition*, const expression&, const query_options&);
+extern value_set possible_column_values(const column_definition*, const expression&, const query_options&);
+extern value_set possible_partition_token_values(const expression&, const query_options&, const schema& table_schema);

 /// Turns value_set into a range, unless it's a multi-valued list (in which case this throws).
 extern nonwrapping_range<managed_bytes> to_range(const value_set&);
@@ -642,8 +627,21 @@ inline bool is_multi_column(const binary_operator& op) {
    return expr::is<tuple_constructor>(op.lhs);
 }

-inline bool has_token(const expression& e) {
-    return find_binop(e, [] (const binary_operator& o) { return expr::is<token>(o.lhs); });
+// Check whether the given expression represents
+// a call to the token() function.
+bool is_token_function(const function_call&);
+bool is_token_function(const expression&);
+
+bool is_partition_token_for_schema(const function_call&, const schema&);
+bool is_partition_token_for_schema(const expression&, const schema&);
+
+/// Check whether the expression contains a binary_operator whose LHS is a call to the token
+/// function representing a partition key token.
+/// Examples:
+/// For expression: "token(p1, p2, p3) < 123 AND c = 2" returns true
+/// For expression: "p1 = token(1, 2, 3) AND c = 2" returns false
+inline bool has_partition_token(const expression& e, const schema& table_schema) {
+    return find_binop(e, [&] (const binary_operator& o) { return is_partition_token_for_schema(o.lhs, table_schema); });
 }

 inline bool has_slice_or_needs_filtering(const expression& e) {
@@ -689,7 +687,8 @@ extern expression replace_column_def(const expression&, const column_definition*

 // Replaces all occurences of token(p1, p2) on the left hand side with the given colum.
 // For example this changes token(p1, p2) < token(1, 2) to my_column_name < token(1, 2).
-extern expression replace_token(const expression&, const column_definition*);
+// Schema is needed to find out which calls to token() describe the partition token.
+extern expression replace_partition_token(const expression&, const column_definition*, const schema&);

 // Recursively copies e and returns it. Calls replace_candidate() on all nodes. If it returns nullopt,
 // continue with the copying. If it returns an expression, that expression replaces the current node.
@@ -829,12 +828,12 @@ bool has_only_eq_binops(const expression&);
 } // namespace cql3

 /// Custom formatter for an expression. Use {:user} for user-oriented
-/// output, {:debug} for debug-oriented output. Debug is the default.
+/// output, {:debug} for debug-oriented output. User is the default.
 ///
 /// Required for fmt::join() to work on expression.
 template <>
 class fmt::formatter<cql3::expr::expression> {
-    bool _debug = true;
+    bool _debug = false;
 private:
    constexpr static bool try_match_and_advance(format_parse_context& ctx, std::string_view s) {
        auto [ctx_end, s_end] = std::ranges::mismatch(ctx, s);
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -79,7 +79,7 @@ static
 void
 usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    if (!receiver.type->is_user_type()) {
-        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto ut = static_pointer_cast<const user_type_impl>(receiver.type);
@@ -91,7 +91,7 @@ usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_
        const expression& value = u.elements.at(field);
        auto&& field_spec = usertype_field_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *field_spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
        }
    }
 }
@@ -314,7 +314,7 @@ set_validate_assignable_to(const collection_constructor& c, data_dictionary::dat
            return;
        }

-        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto&& value_spec = set_value_spec_of(receiver);
@@ -502,18 +502,18 @@ void
 tuple_constructor_validate_assignable_to(const tuple_constructor& tc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver.type->underlying_type());
    if (!tt) {
-        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    for (size_t i = 0; i < tc.elements.size(); ++i) {
        if (i >= tt->size()) {
            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: too many elements. Type {} expects {:d} but got {:d}",
-                                                            receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
+                                                            *receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
        }

        auto&& value = tc.elements[i];
        auto&& spec = component_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));
        }
    }
 }
@@ -817,17 +817,38 @@ cast_prepare_expression(const cast& c, data_dictionary::database db, const sstri

 std::optional<expression>
 prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
-    if (!receiver) {
-        // TODO: It is possible to infer the type of a function call if there is only one overload, or if all overloads return the same type
-        return std::nullopt;
+    // Try to extract a column family name from the available information.
+    // Most functions can be prepared without information about the column family, usually just the keyspace is enough.
+    // One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
+    // which can only be known when the column family is known.
+    // In cases when someone calls prepare_function_call on a token() function without a known column_family, an exception is thrown by functions::get.
+    std::optional<std::string_view> cf_name;
+    if (schema_opt != nullptr) {
+        cf_name = std::string_view(schema_opt->cf_name());
+    } else if (receiver.get() != nullptr) {
+        cf_name = receiver->cf_name;
    }
+
+    // Prepare the arguments that can be prepared without a receiver.
+    // Prepared expressions have a known type, which helps with finding the right function.
+    std::vector<expression> partially_prepared_args;
+    for (const expression& argument : fc.args) {
+        std::optional<expression> prepared_arg_opt = try_prepare_expression(argument, db, keyspace, schema_opt, nullptr);
+        if (prepared_arg_opt.has_value()) {
+            partially_prepared_args.emplace_back(*prepared_arg_opt);
+        } else {
+            partially_prepared_args.push_back(argument);
+        }
+    }
+
    auto&& fun = std::visit(overloaded_functor{
        [] (const shared_ptr<functions::function>& func) {
            return func;
        },
        [&] (const functions::function_name& name) {
-            auto args = boost::copy_range<std::vector<::shared_ptr<assignment_testable>>>(fc.args | boost::adaptors::transformed(expr::as_assignment_testable));
-            auto fun = functions::functions::get(db, keyspace, name, args, receiver->ks_name, receiver->cf_name, receiver.get());
+            auto args = boost::copy_range<std::vector<::shared_ptr<assignment_testable>>>(
+                    partially_prepared_args | boost::adaptors::transformed(expr::as_assignment_testable));
+            auto fun = functions::functions::get(db, keyspace, name, args, keyspace, cf_name, receiver.get());
            if (!fun) {
                throw exceptions::invalid_request_exception(format("Unknown function {} called", name));
            }
@@ -843,7 +864,7 @@ prepare_function_call(const expr::function_call& fc, data_dictionary::database d

    // Functions.get() will complain if no function "name" type check with the provided arguments.
    // We still have to validate that the return type matches however
-    if (!receiver->type->is_value_compatible_with(*scalar_fun->return_type())) {
+    if (receiver && !receiver->type->is_value_compatible_with(*scalar_fun->return_type())) {
        throw exceptions::invalid_request_exception(format("Type error: cannot assign result of function {} (type {}) to {} (type {})",
                                                    fun->name(), fun->return_type()->as_cql3_type(),
                                                    receiver->name, receiver->type->as_cql3_type()));
@@ -855,11 +876,11 @@ prepare_function_call(const expr::function_call& fc, data_dictionary::database d
    }

    std::vector<expr::expression> parameters;
-    parameters.reserve(fc.args.size());
+    parameters.reserve(partially_prepared_args.size());
    bool all_terminal = true;
-    for (size_t i = 0; i < fc.args.size(); ++i) {
-        expr::expression e = prepare_expression(fc.args[i], db, keyspace, schema_opt,
-                                                functions::functions::make_arg_spec(receiver->ks_name, receiver->cf_name, *scalar_fun, i));
+    for (size_t i = 0; i < partially_prepared_args.size(); ++i) {
+        expr::expression e = prepare_expression(partially_prepared_args[i], db, keyspace, schema_opt,
+                                                functions::functions::make_arg_spec(keyspace, cf_name, *scalar_fun, i));
        if (!expr::is<expr::constant>(e)) {
            all_terminal = false;
        }
@@ -908,6 +929,17 @@ test_assignment_function_call(const cql3::expr::function_call& fc, data_dictiona
    }
 }

+static assignment_testable::test_result expression_test_assignment(const data_type& expr_type,
+                                                                   const column_specification& receiver) {
+    if (receiver.type->underlying_type() == expr_type->underlying_type()) {
+        return assignment_testable::test_result::EXACT_MATCH;
+    } else if (receiver.type->is_value_compatible_with(*expr_type)) {
+        return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
+    } else {
+        return assignment_testable::test_result::NOT_ASSIGNABLE;
+    }
+}
+
 std::optional<expression> prepare_conjunction(const conjunction& conj,
                                              data_dictionary::database db,
                                              const sstring& keyspace,
@@ -958,8 +990,20 @@ std::optional<expression> prepare_conjunction(const conjunction& conj,
 std::optional<expression>
 try_prepare_expression(const expression& expr, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    return expr::visit(overloaded_functor{
-        [] (const constant&) -> std::optional<expression> {
-            on_internal_error(expr_logger, "Can't prepare constant_value, it should not appear in parser output");
+        [&] (const constant& value) -> std::optional<expression> {
+            if (receiver && !is_assignable(expression_test_assignment(value.type, *receiver))) {
+                throw exceptions::invalid_request_exception(
+                    format("cannot assign a constant {:user} of type {} to receiver {} of type {}", value,
+                           value.type->as_cql3_type(), receiver->name, receiver->type->as_cql3_type()));
+            }
+
+            constant result = value;
+            if (receiver) {
+                // The receiver might have a different type from the constant, but this is allowed if the types are compatible.
+                // In such a case the type is implicitly converted to the receiver type.
+                result.type = receiver->type;
+            }
+            return result;
        },
        [&] (const binary_operator& binop) -> std::optional<expression> {
            if (receiver.get() != nullptr && &receiver->type->without_reversed() != boolean_type.get()) {
@@ -1013,24 +1057,6 @@ try_prepare_expression(const expression& expr, data_dictionary::database db, con
                .type = static_cast<const collection_type_impl&>(sub_col_type).value_comparator(),
            };
        },
-        [&] (const token& tk) -> std::optional<expression> {
-            if (!schema_opt) {
-                throw exceptions::invalid_request_exception("cannot process token() function without schema");
-            }
-
-            std::vector<expression> prepared_token_args;
-            prepared_token_args.reserve(tk.args.size());
-
-            for (const expression& arg : tk.args) {
-                auto prepared_arg_opt = try_prepare_expression(arg, db, keyspace, schema_opt, receiver);
-                if (!prepared_arg_opt) {
-                    return std::nullopt;
-                }
-                prepared_token_args.emplace_back(std::move(*prepared_arg_opt));
-            }
-
-            return token(std::move(prepared_token_args));
-        },
        [&] (const unresolved_identifier& unin) -> std::optional<expression> {
            if (!schema_opt) {
                throw exceptions::invalid_request_exception(fmt::format("Cannot resolve column {} without schema", unin.ident->to_cql_string()));
@@ -1076,9 +1102,8 @@ assignment_testable::test_result
 test_assignment(const expression& expr, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    using test_result = assignment_testable::test_result;
    return expr::visit(overloaded_functor{
-        [&] (const constant&) -> test_result {
-            // constants shouldn't appear in parser output, only untyped_constants
-            on_internal_error(expr_logger, "constants are not yet reachable via test_assignment()");
+        [&] (const constant& value) -> test_result {
+            return expression_test_assignment(value.type, receiver);
        },
        [&] (const binary_operator&) -> test_result {
            on_internal_error(expr_logger, "binary_operators are not yet reachable via test_assignment()");
@@ -1086,15 +1111,12 @@ test_assignment(const expression& expr, data_dictionary::database db, const sstr
        [&] (const conjunction&) -> test_result {
            on_internal_error(expr_logger, "conjunctions are not yet reachable via test_assignment()");
        },
-        [&] (const column_value&) -> test_result {
-            on_internal_error(expr_logger, "column_values are not yet reachable via test_assignment()");
+        [&] (const column_value& col_val) -> test_result {
+            return expression_test_assignment(col_val.col->type, receiver);
        },
        [&] (const subscript&) -> test_result {
            on_internal_error(expr_logger, "subscripts are not yet reachable via test_assignment()");
        },
-        [&] (const token&) -> test_result {
-            on_internal_error(expr_logger, "tokens are not yet reachable via test_assignment()");
-        },
        [&] (const unresolved_identifier&) -> test_result {
            on_internal_error(expr_logger, "unresolved_identifiers are not yet reachable via test_assignment()");
        },
@@ -1221,11 +1243,30 @@ static lw_shared_ptr<column_specification> get_lhs_receiver(const expression& pr
            data_type tuple_type = tuple_type_impl::get_instance(tuple_types);
            return make_lw_shared<column_specification>(schema.ks_name(), schema.cf_name(), std::move(identifier), std::move(tuple_type));
        },
-        [&](const token& col_val) -> lw_shared_ptr<column_specification> {
-            return make_lw_shared<column_specification>(schema.ks_name(),
-                                                        schema.cf_name(),
-                                                        ::make_shared<column_identifier>("partition key token", true),
-                                                        dht::token::get_token_validator());
+        [&](const function_call& fun_call) -> lw_shared_ptr<column_specification> {
+            // In case of an expression like `token(p1, p2, p3) = ?` the receiver name should be "partition key token".
+            // This is required for compatibility with the Java driver, which breaks with a receiver name like "token(p1, p2, p3)".
+            if (is_partition_token_for_schema(fun_call, schema)) {
+                return make_lw_shared<column_specification>(
+                    schema.ks_name(),
+                    schema.cf_name(),
+                    ::make_shared<column_identifier>("partition key token", true),
+                    long_type);
+            }
+
+            data_type return_type = std::visit(
+                    overloaded_functor{
+                        [](const shared_ptr<db::functions::function>& fun) -> data_type { return fun->return_type(); },
+                        [&](const functions::function_name&) -> data_type {
+                            on_internal_error(expr_logger,
+                                              format("get_lhs_receiver: unprepared function call {:debug}", fun_call));
+                        }},
+                    fun_call.func);
+
+            return make_lw_shared<column_specification>(
+                schema.ks_name(), schema.cf_name(),
+                ::make_shared<column_identifier>(format("{:user}", fun_call), true),
+                return_type);
        },
        [](const auto& other) -> lw_shared_ptr<column_specification> {
            on_internal_error(expr_logger, format("get_lhs_receiver: unexpected expression: {}", other));
--- a/cql3/expr/restrictions.cc
+++ b/cql3/expr/restrictions.cc
@@ -152,7 +152,10 @@ void preliminary_binop_vaidation_checks(const binary_operator& binop) {
        }
    }

-    if (is<token>(binop.lhs)) {
+    // Right now a token() on the LHS means that there's a partition token there.
+    // In the future with relaxed grammar this might no longer be true and this check will have to be revisited.
+    // Moving the check after preparation would break tests and Cassandra compatibility.
+    if (is_token_function(binop.lhs)) {
        if (binop.op == oper_t::IN) {
            throw exceptions::invalid_request_exception("IN cannot be used with the token function");
        }
@@ -214,9 +217,9 @@ binary_operator validate_and_prepare_new_restriction(const binary_operator& rest
        }

        validate_multi_column_relation(lhs_cols, prepared_binop.op);
-    } else if (auto lhs_token = as_if<token>(&prepared_binop.lhs)) {
+    } else if (is_token_function(prepared_binop.lhs)) {
        // Token restriction
-        std::vector<const column_definition*> column_defs = to_column_definitions(lhs_token->args);
+        std::vector<const column_definition*> column_defs = to_column_definitions(as<function_call>(prepared_binop.lhs).args);
        validate_token_relation(column_defs, prepared_binop.op, *schema);
    } else {
        // Anything else
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -202,9 +202,10 @@ std::optional<function_name> functions::used_by_user_function(const ut_name& use
 }

 lw_shared_ptr<column_specification>
-functions::make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
+functions::make_arg_spec(const sstring& receiver_ks, std::optional<const std::string_view> receiver_cf_opt,
        const function& fun, size_t i) {
    auto&& name = fmt::to_string(fun.name());
+    const std::string_view receiver_cf = receiver_cf_opt.has_value() ? *receiver_cf_opt : "<unknown_col_family>";
    std::transform(name.begin(), name.end(), name.begin(), ::tolower);
    return make_lw_shared<column_specification>(receiver_ks,
                                   receiver_cf,
@@ -322,7 +323,7 @@ functions::get(data_dictionary::database db,
        const function_name& name,
        const std::vector<shared_ptr<assignment_testable>>& provided_args,
        const sstring& receiver_ks,
-        const sstring& receiver_cf,
+        std::optional<const std::string_view> receiver_cf,
        const column_specification* receiver) {

    static const function_name TOKEN_FUNCTION_NAME = function_name::native_function("token");
@@ -332,7 +333,11 @@ functions::get(data_dictionary::database db,
    if (name.has_keyspace()
                ? name == TOKEN_FUNCTION_NAME
                : name.name == TOKEN_FUNCTION_NAME.name) {
-        auto fun = ::make_shared<token_fct>(db.find_schema(receiver_ks, receiver_cf));
+
+        if (!receiver_cf.has_value()) {
+            throw exceptions::invalid_request_exception("functions::get for token doesn't have a known column family");
+        }
+        auto fun = ::make_shared<token_fct>(db.find_schema(receiver_ks, *receiver_cf));
        validate_types(db, keyspace, fun, provided_args, receiver_ks, receiver_cf);
        return fun;
    }
@@ -504,7 +509,7 @@ functions::validate_types(data_dictionary::database db,
                          shared_ptr<function> fun,
                          const std::vector<shared_ptr<assignment_testable>>& provided_args,
                          const sstring& receiver_ks,
-                          const sstring& receiver_cf) {
+                          std::optional<const std::string_view> receiver_cf) {
    if (provided_args.size() != fun->arg_types().size()) {
        throw exceptions::invalid_request_exception(
                format("Invalid number of arguments in call to function {}: {:d} required but {:d} provided",
@@ -534,7 +539,7 @@ functions::match_arguments(data_dictionary::database db, const sstring& keyspace
        shared_ptr<function> fun,
        const std::vector<shared_ptr<assignment_testable>>& provided_args,
        const sstring& receiver_ks,
-        const sstring& receiver_cf) {
+        std::optional<const std::string_view> receiver_cf) {
    if (provided_args.size() != fun->arg_types().size()) {
        return assignment_testable::test_result::NOT_ASSIGNABLE;
    }
--- a/cql3/functions/functions.hh
+++ b/cql3/functions/functions.hh
@@ -40,7 +40,7 @@ class functions {
 private:
    static std::unordered_multimap<function_name, shared_ptr<function>> init() noexcept;
 public:
-    static lw_shared_ptr<column_specification> make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
+    static lw_shared_ptr<column_specification> make_arg_spec(const sstring& receiver_ks, std::optional<const std::string_view> receiver_cf,
            const function& fun, size_t i);
 public:
    static shared_ptr<function> get(data_dictionary::database db,
@@ -48,7 +48,7 @@ public:
                                    const function_name& name,
                                    const std::vector<shared_ptr<assignment_testable>>& provided_args,
                                    const sstring& receiver_ks,
-                                    const sstring& receiver_cf,
+                                    std::optional<const std::string_view> receiver_cf,
                                    const column_specification* receiver = nullptr);
    template <typename AssignmentTestablePtrRange>
    static shared_ptr<function> get(data_dictionary::database db,
@@ -56,7 +56,7 @@ public:
                                    const function_name& name,
                                    AssignmentTestablePtrRange&& provided_args,
                                    const sstring& receiver_ks,
-                                    const sstring& receiver_cf,
+                                    std::optional<const std::string_view> receiver_cf,
                                    const column_specification* receiver = nullptr) {
        const std::vector<shared_ptr<assignment_testable>> args(std::begin(provided_args), std::end(provided_args));
        return get(db, keyspace, name, args, receiver_ks, receiver_cf, receiver);
@@ -87,12 +87,12 @@ private:
                              shared_ptr<function> fun,
                              const std::vector<shared_ptr<assignment_testable>>& provided_args,
                              const sstring& receiver_ks,
-                              const sstring& receiver_cf);
+                              std::optional<const std::string_view> receiver_cf);
    static assignment_testable::test_result match_arguments(data_dictionary::database db, const sstring& keyspace,
            shared_ptr<function> fun,
            const std::vector<shared_ptr<assignment_testable>>& provided_args,
            const sstring& receiver_ks,
-            const sstring& receiver_cf);
+            std::optional<const std::string_view> receiver_cf);

    static bool type_equals(const std::vector<data_type>& t1, const std::vector<data_type>& t2);

--- a/cql3/operation.cc
+++ b/cql3/operation.cc
@@ -32,9 +32,9 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
    using exceptions::invalid_request_exception;
    auto rtype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!rtype) {
-        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!rtype->is_multi_cell()) {
-        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (rtype->get_kind() == abstract_type::kind::list) {
@@ -47,7 +47,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
            return make_shared<lists::setter_by_index>(receiver, std::move(idx), std::move(lval));
        }
    } else if (rtype->get_kind() == abstract_type::kind::set) {
-        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name_as_text()));
    } else if (rtype->get_kind() == abstract_type::kind::map) {
        auto key = prepare_expression(_selector, db, keyspace, nullptr, maps::key_spec_of(*receiver.column_specification));
        auto mval = prepare_expression(_value, db, keyspace, nullptr, maps::value_spec_of(*receiver.column_specification));
@@ -136,11 +136,11 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        return make_shared<constants::adder>(receiver, std::move(v));
    } else if (!ctype->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -169,14 +169,14 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
        return make_shared<constants::subtracter>(receiver, std::move(v));
    }
    if (!ctype->is_multi_cell()) {
        throw exceptions::invalid_request_exception(
-                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -211,9 +211,9 @@ operation::prepend::prepare(data_dictionary::database db, const sstring& keyspac
    auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);

    if (!dynamic_cast<const list_type_impl*>(receiver.type.get())) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name_as_text()));
    }

    return make_shared<lists::prepender>(receiver, std::move(v));
@@ -296,8 +296,6 @@ operation::set_counter_value_from_tuple_list::prepare(data_dictionary::database
                auto clock = value_cast<int64_t>(tuple[2]);
                auto value = value_cast<int64_t>(tuple[3]);

-                using namespace std::rel_ops;
-
                if (id <= last) {
                    throw marshal_exception(
                                    format("invalid counter id order, {} <= {}",
@@ -343,9 +341,9 @@ operation::element_deletion::affected_column() const {
 shared_ptr<operation>
 operation::element_deletion::prepare(data_dictionary::database db, const sstring& keyspace, const column_definition& receiver) const {
    if (!receiver.type->is_collection()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name_as_text()));
    }
    auto ctype = static_pointer_cast<const collection_type_impl>(receiver.type);
    if (ctype->get_kind() == abstract_type::kind::list) {
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -58,13 +58,7 @@ public:
        return key.key().second;
    }

-    bool operator==(const prepared_cache_key_type& other) const {
-        return _key == other._key;
-    }
-
-    bool operator!=(const prepared_cache_key_type& other) const {
-        return !(*this == other);
-    }
+    bool operator==(const prepared_cache_key_type& other) const = default;
 };

 class prepared_statements_cache {
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -729,65 +729,17 @@ bool query_processor::has_more_results(::shared_ptr<cql3::internal_query_state>
    return false;
 }

-future<> query_processor::for_each_cql_result(
-        ::shared_ptr<cql3::internal_query_state> state,
-        std::function<stop_iteration(const cql3::untyped_result_set::row&)>&& f) {
-    return do_with(seastar::shared_ptr<bool>(), [f, this, state](auto& is_done) mutable {
-        is_done = seastar::make_shared<bool>(false);
-
-        auto stop_when = [is_done]() {
-            return *is_done;
-        };
-        auto do_resuls = [is_done, state, f, this]() mutable {
-            return this->execute_paged_internal(
-                    state).then([is_done, state, f, this](::shared_ptr<cql3::untyped_result_set> msg) mutable {
-                if (msg->empty()) {
-                    *is_done = true;
-                } else {
-                    if (!this->has_more_results(state)) {
-                        *is_done = true;
-                    }
-                    for (auto& row : *msg) {
-                        if (f(row) == stop_iteration::yes) {
-                            *is_done = true;
-                            break;
-                        }
-                    }
-                }
-            });
-        };
-        return do_until(stop_when, do_resuls);
-    });
-}
-
 future<> query_processor::for_each_cql_result(
        ::shared_ptr<cql3::internal_query_state> state,
         noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)>&& f) {
-    // repeat can move the lambda's capture, so we need to hold f and it so the internal loop
-    // will be able to use it.
-    return do_with(noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)>(std::move(f)),
-            untyped_result_set::rows_type::const_iterator(),
-            [state, this](noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)>& f,
-                    untyped_result_set::rows_type::const_iterator& it) mutable {
-        return repeat([state, &f, &it, this]() mutable {
-            return this->execute_paged_internal(state).then([state, &f, &it, this](::shared_ptr<cql3::untyped_result_set> msg) mutable {
-                it = msg->begin();
-                return repeat_until_value([&it, &f, msg, state, this]() mutable {
-                    if (it == msg->end()) {
-                        return make_ready_future<std::optional<stop_iteration>>(std::optional<stop_iteration>(!this->has_more_results(state)));
-                    }
-
-                    return f(*it).then([&it, msg](stop_iteration i) {
-                        if (i == stop_iteration::yes) {
-                            return std::optional<stop_iteration>(i);
-                        }
-                        ++it;
-                        return std::optional<stop_iteration>();
-                    });
-                });
-            });
-        });
-    });
+    do {
+        auto msg = co_await execute_paged_internal(state);
+        for (auto& row : *msg) {
+            if ((co_await f(row)) == stop_iteration::yes) {
+                co_return;
+            }
+        }
+    } while (has_more_results(state));
 }

 future<::shared_ptr<untyped_result_set>>
@@ -948,6 +900,9 @@ void query_processor::migration_subscriber::on_update_view(
        const sstring& view_name, bool columns_changed) {
 }

+void query_processor::migration_subscriber::on_update_tablet_metadata() {
+}
+
 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
    remove_invalid_prepared_statements(ks_name, std::nullopt);
 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -294,6 +294,8 @@ public:
     * page_size - maximum page size
     * f - a function to be run on each row of the query result,
     *     if the function returns stop_iteration::yes the iteration will stop
+     *
+     * \note This function is optimized for convenience, not performance.
     */
    future<> query_internal(
            const sstring& query_string,
@@ -310,6 +312,8 @@ public:
     * query_string - the cql string, can contain placeholders
     * f - a function to be run on each row of the query result,
     *     if the function returns stop_iteration::yes the iteration will stop
+     *
+     * \note This function is optimized for convenience, not performance.
     */
    future<> query_internal(
            const sstring& query_string,
@@ -324,6 +328,8 @@ public:
    // and schema changes will not be announced to other nodes.
    // Because of that, changing global schema state (e.g. modifying non-local tables,
    // creating namespaces, etc) is explicitly forbidden via this interface.
+    //
+    // note: optimized for convenience, not performance.
    future<::shared_ptr<untyped_result_set>> execute_internal(
            const sstring& query_string,
            db::consistency_level,
@@ -429,18 +435,15 @@ private:

    /*!
     * \brief run a query using paging
+     *
+     * \note Optimized for convenience, not performance.
     */
    future<::shared_ptr<untyped_result_set>> execute_paged_internal(::shared_ptr<internal_query_state> state);

-    /*!
-     * \brief iterate over all results using paging
-     */
-    future<> for_each_cql_result(
-            ::shared_ptr<cql3::internal_query_state> state,
-            std::function<stop_iteration(const cql3::untyped_result_set_row&)>&& f);
-
    /*!
     * \brief iterate over all results using paging, accept a function that returns a future
+     *
+     * \note Optimized for convenience, not performance.
     */
    future<> for_each_cql_result(
            ::shared_ptr<cql3::internal_query_state> state,
@@ -522,6 +525,7 @@ public:
    virtual void on_update_function(const sstring& ks_name, const sstring& function_name) override;
    virtual void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override;
    virtual void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override;
+    virtual void on_update_tablet_metadata() override;

    virtual void on_drop_keyspace(const sstring& ks_name) override;
    virtual void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override;
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -88,7 +88,8 @@ void with_current_binary_operator(
 static std::vector<expr::expression> extract_partition_range(
        const expr::expression& where_clause, schema_ptr schema) {
    using namespace expr;
-    struct {
+    struct extract_partition_range_visitor {
+        schema_ptr table_schema;
        std::optional<expression> tokens;
        std::unordered_map<const column_definition*, expression> single_column;
        const binary_operator* current_binary_operator = nullptr;
@@ -106,7 +107,11 @@ static std::vector<expr::expression> extract_partition_range(
            current_binary_operator = nullptr;
        }

-        void operator()(const token&) {
+        void operator()(const function_call& token_fun_call) {
+            if (!is_partition_token_for_schema(token_fun_call, *table_schema)) {
+                on_internal_error(rlogger, "extract_partition_range(function_call)");
+            }
+
            with_current_binary_operator(*this, [&] (const binary_operator& b) {
                if (tokens) {
                    tokens = make_conjunction(std::move(*tokens), b);
@@ -159,10 +164,6 @@ static std::vector<expr::expression> extract_partition_range(
            on_internal_error(rlogger, "extract_partition_range(column_mutation_attribute)");
        }

-        void operator()(const function_call&) {
-            on_internal_error(rlogger, "extract_partition_range(function_call)");
-        }
-
        void operator()(const cast&) {
            on_internal_error(rlogger, "extract_partition_range(cast)");
        }
@@ -186,7 +187,12 @@ static std::vector<expr::expression> extract_partition_range(
        void operator()(const usertype_constructor&) {
            on_internal_error(rlogger, "extract_partition_range(usertype_constructor)");
        }
-    } v;
+    };
+
+    extract_partition_range_visitor v {
+        .table_schema = schema
+    };
+
    expr::visit(v, where_clause);
    if (v.tokens) {
        return {std::move(*v.tokens)};
@@ -207,6 +213,7 @@ static std::vector<expr::expression> extract_clustering_prefix_restrictions(
    /// Collects all clustering-column restrictions from an expression.  Presumes the expression only uses
    /// conjunction to combine subexpressions.
    struct visitor {
+        schema_ptr table_schema;
        std::vector<expression> multi; ///< All multi-column restrictions.
        /// All single-clustering-column restrictions, grouped by column.  Each value is either an atom or a
        /// conjunction of atoms.
@@ -266,8 +273,13 @@ static std::vector<expr::expression> extract_clustering_prefix_restrictions(
            });
        }

-        void operator()(const token&) {
-            // A token cannot be a clustering prefix restriction
+        void operator()(const function_call& fun_call) {
+            if (is_partition_token_for_schema(fun_call, *table_schema)) {
+                // A token cannot be a clustering prefix restriction
+                return;
+            }
+
+            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(function_call)");
        }

        void operator()(const constant&) {}
@@ -280,10 +292,6 @@ static std::vector<expr::expression> extract_clustering_prefix_restrictions(
            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(column_mutation_attribute)");
        }

-        void operator()(const function_call&) {
-            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(function_call)");
-        }
-
        void operator()(const cast&) {
            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(cast)");
        }
@@ -307,7 +315,11 @@ static std::vector<expr::expression> extract_clustering_prefix_restrictions(
        void operator()(const usertype_constructor&) {
            on_internal_error(rlogger, "extract_clustering_prefix_restrictions(usertype_constructor)");
        }
-    } v;
+    };
+    visitor v {
+        .table_schema = schema
+    };
+
    expr::visit(v, where_clause);

    if (!v.multi.empty()) {
@@ -358,7 +370,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
        }
    }
    if (_where.has_value()) {
-        if (!has_token(_partition_key_restrictions)) {
+        if (!has_token_restrictions()) {
            _single_column_partition_key_restrictions = expr::get_single_column_restrictions_map(_partition_key_restrictions);
        }
        if (!expr::contains_multi_column_restriction(_clustering_columns_restrictions)) {
@@ -488,7 +500,7 @@ std::pair<std::optional<secondary_index::index>, expr::expression> statement_res
    for (const auto& index : sim.list_indexes()) {
        auto cdef = _schema->get_column_definition(to_bytes(index.target_column()));
        for (const expr::expression& restriction : index_restrictions()) {
-            if (has_token(restriction) || contains_multi_column_restriction(restriction)) {
+            if (has_partition_token(restriction, *_schema) || contains_multi_column_restriction(restriction)) {
                continue;
            }

@@ -516,7 +528,8 @@ bool statement_restrictions::has_eq_restriction_on_column(const column_definitio
 std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(data_dictionary::database db) const {
    std::vector<const column_definition*> column_defs_for_filtering;
    if (need_filtering()) {
-        auto& sim = db.find_column_family(_schema).get_index_manager();
+        auto cf = db.find_column_family(_schema);
+        auto& sim = cf.get_index_manager();
        auto opt_idx = std::get<0>(find_idx(sim));
        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef, const expr::expression* single_col_restr) {
            return opt_idx && single_col_restr && is_supported_by(*single_col_restr, *opt_idx);
@@ -566,7 +579,7 @@ void statement_restrictions::add_restriction(const expr::binary_operator& restr,
    } else if (expr::is_multi_column(restr)) {
        // Multi column restrictions are only allowed on clustering columns
        add_multi_column_clustering_key_restriction(restr);
-    } else if (has_token(restr)) {
+    } else if (has_partition_token(restr, *_schema)) {
        // Token always restricts the partition key
        add_token_partition_key_restriction(restr);
    } else if (expr::is_single_column_restriction(restr)) {
@@ -610,7 +623,7 @@ void statement_restrictions::add_single_column_parition_key_restriction(const ex
                "Only EQ and IN relation are supported on the partition key "
                "(unless you use the token() function or allow filtering)");
    }
-    if (has_token(_partition_key_restrictions)) {
+    if (has_token_restrictions()) {
        throw exceptions::invalid_request_exception(
                format("Columns \"{}\" cannot be restricted by both a normal relation and a token relation",
                       fmt::join(expr::get_sorted_column_defs(_partition_key_restrictions) |
@@ -625,7 +638,7 @@ void statement_restrictions::add_single_column_parition_key_restriction(const ex
 }

 void statement_restrictions::add_token_partition_key_restriction(const expr::binary_operator& restr) {
-    if (!partition_key_restrictions_is_empty() && !has_token(_partition_key_restrictions)) {
+    if (!partition_key_restrictions_is_empty() && !has_token_restrictions()) {
        throw exceptions::invalid_request_exception(
                format("Columns \"{}\" cannot be restricted by both a normal relation and a token relation",
                        fmt::join(expr::get_sorted_column_defs(_partition_key_restrictions) |
@@ -736,7 +749,7 @@ void statement_restrictions::process_partition_key_restrictions(bool for_view, b
    // - Is it queriable without 2ndary index, which is always more efficient
    // If a component of the partition key is restricted by a relation, all preceding
    // components must have a EQ. Only the last partition key component can be in IN relation.
-    if (has_token(_partition_key_restrictions)) {
+    if (has_token_restrictions()) {
        _is_key_range = true;
    } else if (expr::is_empty_restriction(_partition_key_restrictions)) {
        _is_key_range = true;
@@ -775,7 +788,7 @@ size_t statement_restrictions::partition_key_restrictions_size() const {

 bool statement_restrictions::pk_restrictions_need_filtering() const {
     return !expr::is_empty_restriction(_partition_key_restrictions)
-         && !has_token(_partition_key_restrictions)
+         && !has_token_restrictions()
         && (has_partition_key_unrestricted_components() || expr::has_slice_or_needs_filtering(_partition_key_restrictions));
 }

@@ -886,7 +899,7 @@ bounds_slice statement_restrictions::get_clustering_slice() const {
 bool statement_restrictions::parition_key_restrictions_have_supporting_index(const secondary_index::secondary_index_manager& index_manager,
                                      expr::allow_local_index allow_local) const {
    // Token restrictions can't be supported by an index
-    if (has_token(_partition_key_restrictions)) {
+    if (has_token_restrictions()) {
        return false;
    }

@@ -926,8 +939,10 @@ namespace {
 using namespace expr;

 /// Computes partition-key ranges from token atoms in ex.
-dht::partition_range_vector partition_ranges_from_token(const expr::expression& ex, const query_options& options) {
-    auto values = possible_lhs_values(nullptr, ex, options);
+dht::partition_range_vector partition_ranges_from_token(const expr::expression& ex,
+                                                        const query_options& options,
+                                                        const schema& table_schema) {
+    auto values = possible_partition_token_values(ex, options, table_schema);
    if (values == expr::value_set(expr::value_list{})) {
        return {};
    }
@@ -975,7 +990,7 @@ dht::partition_range_vector partition_ranges_from_singles(
    for (const auto& e : expressions) {
        if (const auto arbitrary_binop = find_binop(e, [] (const binary_operator&) { return true; })) {
            if (auto cv = expr::as_if<expr::column_value>(&arbitrary_binop->lhs)) {
-                const value_set vals = possible_lhs_values(cv->col, e, options);
+                const value_set vals = possible_column_values(cv->col, e, options);
                if (auto lst = std::get_if<value_list>(&vals)) {
                    if (lst->empty()) {
                        return {};
@@ -1004,7 +1019,7 @@ dht::partition_range_vector partition_ranges_from_EQs(
    std::vector<managed_bytes> pk_value(schema.partition_key_size());
    for (const auto& e : eq_expressions) {
        const auto col = expr::get_subscripted_column(find(e, oper_t::EQ)->lhs).col;
-        const auto vals = std::get<value_list>(possible_lhs_values(col, e, options));
+        const auto vals = std::get<value_list>(possible_column_values(col, e, options));
        if (vals.empty()) { // Case of C=1 AND C=2.
            return {};
        }
@@ -1019,13 +1034,13 @@ dht::partition_range_vector statement_restrictions::get_partition_key_ranges(con
    if (_partition_range_restrictions.empty()) {
        return {dht::partition_range::make_open_ended_both_sides()};
    }
-    if (has_token(_partition_range_restrictions[0])) {
+    if (has_partition_token(_partition_range_restrictions[0], *_schema)) {
        if (_partition_range_restrictions.size() != 1) {
            on_internal_error(
                    rlogger,
                    format("Unexpected size of token restrictions: {}", _partition_range_restrictions.size()));
        }
-        return partition_ranges_from_token(_partition_range_restrictions[0], options);
+        return partition_ranges_from_token(_partition_range_restrictions[0], options, *_schema);
    } else if (_partition_range_is_simple) {
        // Special case to avoid extra allocations required for a Cartesian product.
        return partition_ranges_from_EQs(_partition_range_restrictions, options, *_schema);
@@ -1233,10 +1248,6 @@ struct multi_column_range_accumulator {
        on_internal_error(rlogger, "Subscript encountered outside binary operator");
    }

-    void operator()(const token&) {
-        on_internal_error(rlogger, "Token encountered outside binary operator");
-    }
-
    void operator()(const unresolved_identifier&) {
        on_internal_error(rlogger, "Unresolved identifier encountered outside binary operator");
    }
@@ -1340,7 +1351,7 @@ std::vector<query::clustering_range> get_single_column_clustering_bounds(
    size_t product_size = 1;
    std::vector<std::vector<managed_bytes>> prior_column_values; // Equality values of columns seen so far.
    for (size_t i = 0; i < single_column_restrictions.size(); ++i) {
-        auto values = possible_lhs_values(
+        auto values = possible_column_values(
                &schema.clustering_column_at(i), // This should be the LHS of restrictions[i].
                single_column_restrictions[i],
                options);
@@ -1410,10 +1421,10 @@ static std::vector<query::clustering_range> get_index_v1_token_range_clustering_
        const column_definition& token_column,
        const expression& token_restriction) {

-    // A workaround in order to make possible_lhs_values work properly.
-    // possible_lhs_values looks at the column type and uses this type's comparator.
+    // A workaround in order to make possible_column_values work properly.
+    // possible_column_values looks at the column type and uses this type's comparator.
    // This is a problem because when using blob's comparator, -4 is greater than 4.
-    // This makes possible_lhs_values think that an expression like token(p) > -4 and token(p) < 4
+    // This makes possible_column_values think that an expression like token(p) > -4 and token(p) < 4
    // is impossible to fulfill.
    // Create a fake token column with the type set to bigint, translate the restriction to use this column
    // and use this restriction to calculate possible lhs values.
@@ -1422,7 +1433,7 @@ static std::vector<query::clustering_range> get_index_v1_token_range_clustering_
    expression new_token_restrictions = replace_column_def(token_restriction, &token_column_bigint);

    std::variant<value_list, nonwrapping_range<managed_bytes>> values =
-        possible_lhs_values(&token_column_bigint, new_token_restrictions, options);
+        possible_column_values(&token_column_bigint, new_token_restrictions, options);

    return std::visit(overloaded_functor {
        [](const value_list& list) {
@@ -1690,7 +1701,7 @@ bool token_known(const statement_restrictions& r) {
 bool statement_restrictions::need_filtering() const {
    using namespace expr;

-    if (_uses_secondary_indexing && has_token(_partition_key_restrictions)) {
+    if (_uses_secondary_indexing && has_token_restrictions()) {
        // If there is a token(p1, p2) restriction, no p1, p2 restrictions are allowed in the query.
        // All other restrictions must be on clustering or regular columns.
        int64_t non_pk_restrictions_count = clustering_columns_restrictions_size();
@@ -1787,11 +1798,11 @@ void statement_restrictions::prepare_indexed_global(const schema& idx_tbl_schema

    const column_definition* token_column = &idx_tbl_schema.clustering_column_at(0);

-    if (has_token(_partition_key_restrictions)) {
+    if (has_token_restrictions()) {
        // When there is a token(p1, p2) >/</= ? restriction, it is not allowed to have restrictions on p1 or p2.
        // This means that p1 and p2 can have many different values (token is a hash, can have collisions).
        // Clustering prefix ends after token_restriction, all further restrictions have to be filtered.
-        expr::expression token_restriction = replace_token(_partition_key_restrictions, token_column);
+        expr::expression token_restriction = replace_partition_token(_partition_key_restrictions, token_column, *_schema);
        _idx_tbl_ck_prefix = std::vector{std::move(token_restriction)};

        return;
@@ -1899,7 +1910,7 @@ std::vector<query::clustering_range> statement_restrictions::get_global_index_cl
    std::vector<managed_bytes> pk_value(_schema->partition_key_size());
    for (const auto& e : _partition_range_restrictions) {
        const auto col = expr::as<column_value>(find(e, oper_t::EQ)->lhs).col;
-        const auto vals = std::get<value_list>(possible_lhs_values(col, e, options));
+        const auto vals = std::get<value_list>(possible_column_values(col, e, options));
        if (vals.empty()) { // Case of C=1 AND C=2.
            return {};
        }
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -181,7 +181,7 @@ public:
    }

    bool has_token_restrictions() const {
-        return has_token(_partition_key_restrictions);
+        return has_partition_token(_partition_key_restrictions, *_schema);
    }

    // Checks whether the given column has an EQ restriction.
@@ -478,7 +478,7 @@ public:
        // If token restrictions are present in an indexed query, then all other restrictions need to be filtered.
        // A single token restriction can have multiple matching partition key values.
        // Because of this we can't create a clustering prefix with more than token restriction.
-        || (_uses_secondary_indexing && has_token(_partition_key_restrictions));
+        || (_uses_secondary_indexing && has_token_restrictions());
    }

    bool clustering_key_restrictions_need_filtering() const;
--- a/cql3/result_generator.hh
+++ b/cql3/result_generator.hh
@@ -10,6 +10,7 @@

 #include "selection/selection.hh"
 #include "stats.hh"
+#include "utils/buffer_view-to-managed_bytes_view.hh"

 namespace cql3 {
 class untyped_result_set;
@@ -34,10 +35,10 @@ private:
    private:
        void accept_cell_value(const column_definition& def, query::result_row_view::iterator_type& i) {
            if (def.is_multi_cell()) {
-                _visitor.accept_value(i.next_collection_cell());
+                _visitor.accept_value(utils::buffer_view_to_managed_bytes_view(i.next_collection_cell()));
            } else {
                auto cell = i.next_atomic_cell();
-                _visitor.accept_value(cell ? std::optional<query::result_bytes_view>(cell->value()) : std::optional<query::result_bytes_view>());
+                _visitor.accept_value(cell ? utils::buffer_view_to_managed_bytes_view(cell->value()) : managed_bytes_view_opt());
            }
        }
    public:
@@ -65,11 +66,11 @@ private:
            for (auto&& def : _selection.get_columns()) {
                switch (def->kind) {
                case column_kind::partition_key:
-                    _visitor.accept_value(query::result_bytes_view(bytes_view(_partition_key[def->component_index()])));
+                    _visitor.accept_value(bytes_view(_partition_key[def->component_index()]));
                    break;
                case column_kind::clustering_key:
                    if (_clustering_key.size() > def->component_index()) {
-                        _visitor.accept_value(query::result_bytes_view(bytes_view(_clustering_key[def->component_index()])));
+                        _visitor.accept_value(bytes_view(_clustering_key[def->component_index()]));
                    } else {
                        _visitor.accept_value(std::nullopt);
                    }
@@ -92,7 +93,7 @@ private:
                auto static_row_iterator = static_row.iterator();
                for (auto&& def : _selection.get_columns()) {
                    if (def->is_partition_key()) {
-                        _visitor.accept_value(query::result_bytes_view(bytes_view(_partition_key[def->component_index()])));
+                        _visitor.accept_value(bytes_view(_partition_key[def->component_index()]));
                    } else if (def->is_static()) {
                        accept_cell_value(*def, static_row_iterator);
                    } else {
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -113,14 +113,23 @@ bool result_set::empty() const {
    return _rows.empty();
 }

-void result_set::add_row(std::vector<bytes_opt> row) {
+void result_set::add_row(std::vector<managed_bytes_opt> row) {
    assert(row.size() == _metadata->value_count());
    _rows.emplace_back(std::move(row));
 }

-void result_set::add_column_value(bytes_opt value) {
+void result_set::add_row(std::vector<bytes_opt> row) {
+    row_type converted; // the same cells, re-encoded as managed_bytes_opt
+    converted.reserve(row.size());
+    for (const auto& cell : row) {
+        converted.emplace_back(to_managed_bytes_opt(cell));
+    }
+    add_row(std::move(converted)); // delegate to the managed_bytes_opt overload
+}
+
+void result_set::add_column_value(managed_bytes_opt value) {
    if (_rows.empty() || _rows.back().size() == _metadata->value_count()) {
-        std::vector<bytes_opt> row;
+        std::vector<managed_bytes_opt> row;
        row.reserve(_metadata->value_count());
        _rows.emplace_back(std::move(row));
    }
@@ -128,6 +137,10 @@ void result_set::add_column_value(bytes_opt value) {
    _rows.back().emplace_back(std::move(value));
 }

+void result_set::add_column_value(bytes_opt value) {
+    add_column_value(to_managed_bytes_opt(value));
+}
+
 void result_set::reverse() {
    std::reverse(_rows.begin(), _rows.end());
 }
@@ -146,7 +159,7 @@ const metadata& result_set::get_metadata() const {
    return *_metadata;
 }

-const utils::chunked_vector<std::vector<bytes_opt>>& result_set::rows() const {
+const utils::chunked_vector<std::vector<managed_bytes_opt>>& result_set::rows() const {
    return _rows;
 }

--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -129,14 +129,14 @@ public:
 };

 template<typename Visitor>
-concept ResultVisitor = requires(Visitor& visitor) {
+concept ResultVisitor = requires(Visitor& visitor, managed_bytes_view_opt val) {
    visitor.start_row();
-    visitor.accept_value(std::optional<query::result_bytes_view>());
+    visitor.accept_value(std::move(val));
    visitor.end_row();
 };

 class result_set {
-    using col_type = bytes_opt;
+    using col_type = managed_bytes_opt;
    using row_type = std::vector<col_type>;
    using rows_type = utils::chunked_vector<row_type>;

@@ -157,8 +157,10 @@ public:
    bool empty() const;

    void add_row(row_type row);
+    void add_row(std::vector<bytes_opt> row);

    void add_column_value(col_type value);
+    void add_column_value(bytes_opt value);

    void reverse();

@@ -187,7 +189,7 @@ public:
            visitor.start_row();
            for (auto i = 0u; i < column_count; i++) {
                auto& cell = row[i];
-                visitor.accept_value(cell ? std::optional<query::result_bytes_view>(*cell) : std::optional<query::result_bytes_view>());
+                visitor.accept_value(cell ? managed_bytes_view_opt(*cell) : managed_bytes_view_opt());
            }
            visitor.end_row();
        }
@@ -204,12 +206,12 @@ public:
        : _result(std::move(mtd)) { }

    void start_row() { }
-    void accept_value(std::optional<query::result_bytes_view> value) {
+    void accept_value(managed_bytes_view_opt value) {
        if (!value) {
            _current_row.emplace_back();
            return;
        }
-        _current_row.emplace_back(value->linearize());
+        _current_row.emplace_back(value);
    }
    void end_row() {
        _result.add_row(std::exchange(_current_row, { }));
--- a/cql3/selection/aggregate_function_selector.hh
+++ b/cql3/selection/aggregate_function_selector.hh
@@ -31,16 +31,17 @@ public:
        for (size_t i = 0; i < m; ++i) {
            auto&& s = _arg_selectors[i];
            s->add_input(rs);
-            _args[i + 1] = s->get_output();
+            _args[i + 1] = to_bytes_opt(s->get_output());
            s->reset();
        }
        _accumulator = _aggregate.aggregation_function->execute(_args);
    }

-    virtual bytes_opt get_output() override {
-        return _aggregate.state_to_result_function
+    virtual managed_bytes_opt get_output() override {
+        return to_managed_bytes_opt(
+               _aggregate.state_to_result_function
                ? _aggregate.state_to_result_function->execute(std::span(&_accumulator, 1))
-                : std::move(_accumulator);
+                : std::move(_accumulator));
    }

    virtual void reset() override {
--- a/cql3/selection/field_selector.hh
+++ b/cql3/selection/field_selector.hh
@@ -62,12 +62,12 @@ public:
        _selected->add_input(rs);
    }

-    virtual bytes_opt get_output() override {
+    virtual managed_bytes_opt get_output() override {
        auto&& value = _selected->get_output();
        if (!value) {
            return std::nullopt;
        }
-        return get_nth_tuple_element(single_fragmented_view(*value), _field);
+        return get_nth_tuple_element(managed_bytes_view(*value), _field);
    }

    virtual data_type get_type() const override {
--- a/cql3/selection/scalar_function_selector.hh
+++ b/cql3/selection/scalar_function_selector.hh
@@ -38,14 +38,14 @@ public:
    virtual void reset() override {
    }

-    virtual bytes_opt get_output() override {
+    virtual managed_bytes_opt get_output() override {
        size_t m = _arg_selectors.size();
        for (size_t i = 0; i < m; ++i) {
            auto&& s = _arg_selectors[i];
-            _args[i] = s->get_output();
+            _args[i] = to_bytes_opt(s->get_output());
            s->reset();
        }
-        return fun()->execute(_args);
+        return to_managed_bytes_opt(fun()->execute(_args));
    }

    virtual bool requires_thread() const override;
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -173,18 +173,6 @@ prepare_selectable(const schema& s, const expr::expression& raw_selectable) {
        [&] (const expr::subscript& sub) -> shared_ptr<selectable> {
            on_internal_error(slogger, "no way to express 'SELECT a[b]' in the grammar yet");
        },
-        [&] (const expr::token& tok) -> shared_ptr<selectable> {
-            // expr::token implicitly the partition key as arguments, but
-            // the selectable equivalent (with_function) needs explicit arguments,
-            // so construct them here.
-            auto name = functions::function_name("system", "token");
-            auto args = boost::copy_range<std::vector<shared_ptr<selectable>>>(
-                s.partition_key_columns()
-                | boost::adaptors::transformed([&] (const column_definition& cdef) {
-                    return ::make_shared<selectable_column>(column_identifier(cdef.name(), cdef.name_as_text()));
-                }));
-            return ::make_shared<selectable::with_function>(std::move(name), std::move(args));
-        },
        [&] (const expr::unresolved_identifier& ui) -> shared_ptr<selectable> {
            return make_shared<selectable_column>(*ui.ident->prepare(s));
        },
@@ -260,11 +248,6 @@ selectable_processes_selection(const expr::expression& raw_selectable) {
            // so bridge them.
            return false;
        },
-        [&] (const expr::token&) -> bool {
-            // Arguably, should return false, because it only processes the partition key.
-            // But selectable::with_function considers it true now, so return that.
-            return true;
-        },
        [&] (const expr::unresolved_identifier& ui) -> bool {
            return ui.ident->processes_selection();
        },
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -122,7 +122,7 @@ public:
 protected:
    class simple_selectors : public selectors {
    private:
-        std::vector<bytes_opt> _current;
+        std::vector<managed_bytes_opt> _current;
        bool _first = true; ///< Whether the next row we receive is the first in its group.
    public:
        virtual void reset() override {
@@ -132,7 +132,7 @@ protected:

        virtual bool requires_thread() const override { return false; }

-        virtual std::vector<bytes_opt> get_output_row() override {
+        virtual std::vector<managed_bytes_opt> get_output_row() override {
            return std::move(_current);
        }

@@ -234,8 +234,8 @@ protected:
            return _factories->does_aggregation();
        }

-        virtual std::vector<bytes_opt> get_output_row() override {
-            std::vector<bytes_opt> output_row;
+        virtual std::vector<managed_bytes_opt> get_output_row() override {
+            std::vector<managed_bytes_opt> output_row;
            output_row.reserve(_selectors.size());
            for (auto&& s : _selectors) {
                output_row.emplace_back(s->get_output());
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -52,7 +52,7 @@ public:
    */
    virtual void add_input_row(result_set_builder& rs) = 0;

-    virtual std::vector<bytes_opt> get_output_row() = 0;
+    virtual std::vector<managed_bytes_opt> get_output_row() = 0;

    virtual void reset() = 0;
 };
@@ -189,10 +189,10 @@ private:
    std::unique_ptr<result_set> _result_set;
    std::unique_ptr<selectors> _selectors;
    const std::vector<size_t> _group_by_cell_indices; ///< Indices in \c current of cells holding GROUP BY values.
-    std::vector<bytes_opt> _last_group; ///< Previous row's group: all of GROUP BY column values.
+    std::vector<managed_bytes_opt> _last_group; ///< Previous row's group: all of GROUP BY column values.
    bool _group_began; ///< Whether a group began being formed.
 public:
-    std::optional<std::vector<bytes_opt>> current;
+    std::optional<std::vector<managed_bytes_opt>> current;
 private:
    std::vector<api::timestamp_type> _timestamps;
    std::vector<int32_t> _ttls;
--- a/cql3/selection/selector.hh
+++ b/cql3/selection/selector.hh
@@ -49,7 +49,7 @@ public:
     * @return the selector output
     * @throws InvalidRequestException if a problem occurs while computing the output value
     */
-    virtual bytes_opt get_output() = 0;
+    virtual managed_bytes_opt get_output() = 0;

    /**
     * Returns the <code>selector</code> output type.
--- a/cql3/selection/simple_selector.hh
+++ b/cql3/selection/simple_selector.hh
@@ -49,7 +49,7 @@ private:
    const sstring _column_name;
    const uint32_t _idx;
    data_type _type;
-    bytes_opt _current;
+    managed_bytes_opt _current;
    bool _first; ///< Whether the next row we receive is the first in its group.
 public:
    static ::shared_ptr<factory> new_factory(const sstring& column_name, uint32_t idx, data_type type) {
@@ -74,7 +74,7 @@ public:
        }
    }

-    virtual bytes_opt get_output() override {
+    virtual managed_bytes_opt get_output() override { // returns the stored value by moving it out of _current
        return std::move(_current);
    }

--- a/cql3/selection/writetime_or_ttl_selector.hh
+++ b/cql3/selection/writetime_or_ttl_selector.hh
@@ -21,7 +21,7 @@ class writetime_or_ttl_selector : public selector {
    sstring _column_name;
    int _idx;
    bool _is_writetime;
-    bytes_opt _current;
+    managed_bytes_opt _current;
 public:
    static shared_ptr<selector::factory> new_factory(sstring column_name, int idx, bool is_writetime) {
        class wtots_factory : public selector::factory {
@@ -60,25 +60,27 @@ public:
        if (_is_writetime) {
            int64_t ts = rs.timestamp_of(_idx);
            if (ts != api::missing_timestamp) {
-                _current = bytes(bytes::initialized_later(), 8);
-                auto i = _current->begin();
+                auto tmp = bytes(bytes::initialized_later(), 8);
+                auto i = tmp.begin();
                serialize_int64(i, ts);
+                _current = managed_bytes(tmp);
            } else {
                _current = std::nullopt;
            }
        } else {
            int ttl = rs.ttl_of(_idx);
            if (ttl > 0) {
-                _current = bytes(bytes::initialized_later(), 4);
-                auto i = _current->begin();
+                auto tmp = bytes(bytes::initialized_later(), 4);
+                auto i = tmp.begin();
                serialize_int32(i, ttl);
+                _current = managed_bytes(tmp);
            } else {
                _current = std::nullopt;
            }
        }
    }

-    virtual bytes_opt get_output() override {
+    virtual managed_bytes_opt get_output() override { // returns a copy; _current is left untouched
        return _current;
    }

--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -60,14 +60,14 @@ future<std::vector<mutation>> alter_type_statement::prepare_announcement_mutatio
    auto to_update = all_types.find(_name.get_user_type_name());
    // Shouldn't happen, unless we race with a drop
    if (to_update == all_types.end()) {
-        throw exceptions::invalid_request_exception(format("No user type named {} exists.", _name.to_string()));
+        throw exceptions::invalid_request_exception(format("No user type named {} exists.", _name.to_cql_string()));
    }

    for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
        for (auto&& column : schema->partition_key_columns()) {
            if (column.type->references_user_type(_name.get_keyspace(), _name.get_user_type_name())) {
                throw exceptions::invalid_request_exception(format("Cannot add new field to type {} because it is used in the partition key column {} of table {}.{}",
-                    _name.to_string(), column.name_as_text(), schema->ks_name(), schema->cf_name()));
+                    _name.to_cql_string(), column.name_as_text(), schema->ks_name(), schema->cf_name()));
            }
        }
    }
@@ -134,7 +134,7 @@ user_type alter_type_statement::add_or_alter::do_add(data_dictionary::database d
 {
    if (to_update->idx_of_field(_field_name->name())) {
        throw exceptions::invalid_request_exception(format("Cannot add new field {} to type {}: a field of the same name already exists",
-            _field_name->to_string(), _name.to_string()));
+            _field_name->to_string(), _name.to_cql_string()));
    }

    if (to_update->size() == max_udt_fields) {
@@ -147,7 +147,7 @@ user_type alter_type_statement::add_or_alter::do_add(data_dictionary::database d
    auto&& add_type = _field_type->prepare(db, keyspace()).get_type();
    if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
        throw exceptions::invalid_request_exception(format("Cannot add new field {} of type {} to type {} as this would create a circular reference",
-                    *_field_name, *_field_type, _name.to_string()));
+                    *_field_name, *_field_type, _name.to_cql_string()));
    }
    new_types.push_back(std::move(add_type));
    return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types), to_update->is_multi_cell());
@@ -157,7 +157,7 @@ user_type alter_type_statement::add_or_alter::do_alter(data_dictionary::database
 {
    auto idx = to_update->idx_of_field(_field_name->name());
    if (!idx) {
-        throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->to_string(), _name.to_string()));
+        throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->to_string(), _name.to_cql_string()));
    }

    auto previous = to_update->field_types()[*idx];
@@ -194,7 +194,7 @@ user_type alter_type_statement::renames::make_updated_type(data_dictionary::data
        auto&& from = rename.first;
        auto idx = to_update->idx_of_field(from->name());
        if (!idx) {
-            throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", from->to_string(), _name.to_string()));
+            throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", from->to_string(), _name.to_cql_string()));
        }
        new_names[*idx] = rename.second->name();
    }
--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -65,7 +65,8 @@ void cql3::statements::authorization_statement::maybe_correct_resource(auth::res
            // This is an "ALL FUNCTIONS IN KEYSPACE" resource.
            return;
        }
-        const auto& utm = qp.db().find_keyspace(*keyspace).user_types();
+        auto ks = qp.db().find_keyspace(*keyspace);
+        const auto& utm = ks.user_types();
        auto function_name = *functions_view.function_name();
        auto function_args = functions_view.function_args();
        std::vector<data_type> parsed_types;
--- a/cql3/statements/create_aggregate_statement.cc
+++ b/cql3/statements/create_aggregate_statement.cc
@@ -90,6 +90,22 @@ create_aggregate_statement::prepare_schema_mutations(query_processor& qp, api::t
    co_return std::make_pair(std::move(ret), std::move(m));
 }

+seastar::future<> create_aggregate_statement::check_access(query_processor &qp, const service::client_state &state) const { // base-class check, then EXECUTE on each function the aggregate references
+    co_await create_function_statement_base::check_access(qp, state);
+    auto&& ks = _name.has_keyspace() ? _name.keyspace : state.get_keyspace(); // keyspace given in the aggregate name, otherwise the one from the client state
+    create_arg_types(qp); // presumably fills _arg_types, which is copied on the next line -- TODO confirm
+    std::vector<data_type> sfunc_args = _arg_types;
+    data_type stype = prepare_type(qp, *_stype);
+    sfunc_args.insert(sfunc_args.begin(), stype); // _sfunc is looked up with signature (stype, arg_types...)
+    co_await state.has_function_access(qp.db(), ks, auth::encode_signature(_sfunc,sfunc_args), auth::permission::EXECUTE);
+    if (_rfunc) { // optional; looked up with signature (stype, stype)
+        co_await state.has_function_access(qp.db(), ks, auth::encode_signature(*_rfunc,{stype, stype}), auth::permission::EXECUTE);
+    }
+    if (_ffunc) { // optional; looked up with signature (stype)
+        co_await state.has_function_access(qp.db(), ks, auth::encode_signature(*_ffunc,{stype}), auth::permission::EXECUTE);
+    }
+}
+
 create_aggregate_statement::create_aggregate_statement(functions::function_name name, std::vector<shared_ptr<cql3_type::raw>> arg_types,
            sstring sfunc, shared_ptr<cql3_type::raw> stype, std::optional<sstring> rfunc, std::optional<sstring> ffunc, std::optional<expr::expression> ival, bool or_replace, bool if_not_exists)
        : create_function_statement_base(std::move(name), std::move(arg_types), or_replace, if_not_exists)
--- a/cql3/statements/create_aggregate_statement.hh
+++ b/cql3/statements/create_aggregate_statement.hh
@@ -26,6 +26,7 @@ namespace statements {
 class create_aggregate_statement final : public create_function_statement_base {
    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
    future<std::pair<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>>> prepare_schema_mutations(query_processor& qp, api::timestamp_type) const override;
+    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

    virtual seastar::future<shared_ptr<db::functions::function>> create(query_processor& qp, db::functions::function* old) const override;

--- a/cql3/statements/create_type_statement.cc
+++ b/cql3/statements/create_type_statement.cc
@@ -134,7 +134,7 @@ future<std::pair<::shared_ptr<cql_transport::event::schema_change>, std::vector<
                _name.get_string_type_name());
        } else {
            if (!_if_not_exists) {
-                co_await coroutine::return_exception(exceptions::invalid_request_exception(format("A user type of name {} already exists", _name.to_string())));
+                co_await coroutine::return_exception(exceptions::invalid_request_exception(format("A user type of name {} already exists", _name.to_cql_string())));
            }
        }
    } catch (data_dictionary::no_such_keyspace& e) {
--- a/cql3/statements/drop_keyspace_statement.cc
+++ b/cql3/statements/drop_keyspace_statement.cc
@@ -52,7 +52,7 @@ drop_keyspace_statement::prepare_schema_mutations(query_processor& qp, api::time
    ::shared_ptr<cql_transport::event::schema_change> ret;

    try {
-        m = qp.get_migration_manager().prepare_keyspace_drop_announcement(_keyspace, ts);
+        m = co_await qp.get_migration_manager().prepare_keyspace_drop_announcement(_keyspace, ts);

        using namespace cql_transport;
        ret = ::make_shared<event::schema_change>(
--- a/cql3/statements/drop_type_statement.cc
+++ b/cql3/statements/drop_type_statement.cc
@@ -56,7 +56,7 @@ void drop_type_statement::validate_while_executing(query_processor& qp) const {
            if (_if_exists) {
                return;
            } else {
-                throw exceptions::invalid_request_exception(format("No user type named {} exists.", _name.to_string()));
+                throw exceptions::invalid_request_exception(format("No user type named {} exists.", _name.to_cql_string()));
            }
        }

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -393,7 +393,7 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
            _has_regular_column_conditions = true;
        }
    }
-    if (has_token(_restrictions->get_partition_key_restrictions())) {
+    if (_restrictions->has_token_restrictions()) {
        throw exceptions::invalid_request_exception(format("The token function cannot be used in WHERE clauses for UPDATE and DELETE statements: {}",
                to_string(_restrictions->get_partition_key_restrictions())));
    }
--- a/cql3/statements/permission_altering_statement.cc
+++ b/cql3/statements/permission_altering_statement.cc
@@ -11,6 +11,7 @@
 #include <seastar/core/thread.hh>

 #include "auth/service.hh"
+#include "db/system_keyspace.hh"
 #include "permission_altering_statement.hh"
 #include "cql3/functions/functions.hh"
 #include "cql3/functions/user_aggregate.hh"
@@ -49,15 +50,10 @@ future<> cql3::statements::permission_altering_statement::check_access(query_pro

    return state.ensure_exists(_resource).then([this, &state] {
        if (_resource.kind() == auth::resource_kind::functions) {
-            // Even if the function exists, it may be a builtin function, in which case we disallow altering permissions on it.
+            // Even if the resource exists, it may denote a builtin function (or all builtin functions), recognized here by the system keyspace; altering permissions on those is disallowed.
            auth::functions_resource_view v(_resource);
-            if (v.function_signature()) {
-                // If the resource has a signature, it is a specific funciton and not "all functions"
-                auto [name, function_args] = auth::decode_signature(*v.function_signature());
-                auto fun = cql3::functions::functions::find(db::functions::function_name{sstring(*v.keyspace()), name}, function_args);
-                if (fun->is_native()) {
-                    return make_exception_future<>(exceptions::invalid_request_exception("Altering permissions on builtin functions is not supported"));
-                }
+            if (v.keyspace() && *v.keyspace() == db::system_keyspace::NAME) {
+                return make_exception_future<>(exceptions::invalid_request_exception("Altering permissions on builtin functions is not supported"));
            }
        }
        // check that the user has AUTHORIZE permission on the resource or its parents, otherwise reject
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -75,7 +75,7 @@ public:
    template<typename T>
    using compare_fn = std::function<bool(const T&, const T&)>;

-    using result_row_type = std::vector<bytes_opt>;
+    using result_row_type = std::vector<managed_bytes_opt>;
    using ordering_comparator_type = compare_fn<result_row_type>;
 private:
    using prepared_orderings_type = std::vector<std::pair<const column_definition*, ordering>>;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -46,6 +46,7 @@
 #include "utils/result_combinators.hh"
 #include "utils/result_loop.hh"
 #include "service/forward_service.hh"
+#include "replica/database.hh"

 template<typename T = void>
 using coordinator_result = cql3::statements::select_statement::coordinator_result<T>;
@@ -560,7 +561,9 @@ indexed_table_select_statement::do_execute_base_query(
    auto cmd = prepare_command_for_base_query(qp, options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
    uint32_t queried_ranges_count = partition_ranges.size();
-    query_ranges_to_vnodes_generator ranges_to_vnodes(qp.proxy().get_token_metadata_ptr(), _schema, std::move(partition_ranges));
+    auto&& table = qp.proxy().local_db().find_column_family(_schema);
+    auto erm = table.get_effective_replication_map();
+    query_ranges_to_vnodes_generator ranges_to_vnodes(erm->make_splitter(), _schema, std::move(partition_ranges));

    struct base_query_state {
        query::result_merger merger;
@@ -873,7 +876,7 @@ primary_key_select_statement::primary_key_select_statement(schema_ptr schema, ui
    if (_ks_sel == ks_selector::NONSYSTEM) {
        if (_restrictions->need_filtering() ||
                _restrictions->partition_key_restrictions_is_empty() ||
-                (has_token(_restrictions->get_partition_key_restrictions()) &&
+                (_restrictions->has_token_restrictions() &&
                 !find(_restrictions->get_partition_key_restrictions(), expr::oper_t::EQ))) {
            _range_scan = true;
            if (!_parameters->bypass_cache())
@@ -897,7 +900,8 @@ indexed_table_select_statement::prepare(data_dictionary::database db,
                                         cql_stats &stats,
                                         std::unique_ptr<attributes> attrs)
 {
-    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto cf = db.find_column_family(schema);
+    auto& sim = cf.get_index_manager();
    auto [index_opt, used_index_restrictions] = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
@@ -1208,7 +1212,7 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
    partition_slice_builder partition_slice_builder{*_view_schema};

    if (!_restrictions->has_partition_key_unrestricted_components()) {
-        bool pk_restrictions_is_single = !has_token(_restrictions->get_partition_key_restrictions());
+        bool pk_restrictions_is_single = !_restrictions->has_token_restrictions();
        // Only EQ restrictions on base partition key can be used in an index view query
        if (pk_restrictions_is_single && _restrictions->partition_key_restrictions_is_all_eq()) {
            partition_slice_builder.with_ranges(
@@ -2024,7 +2028,7 @@ static bool needs_allow_filtering_anyway(
    const auto& pk_restrictions = restrictions.get_partition_key_restrictions();
    // Even if no filtering happens on the coordinator, we still warn about poor performance when partition
    // slice is defined but in potentially unlimited number of partitions (see #7608).
-    if ((expr::is_empty_restriction(pk_restrictions) || has_token(pk_restrictions)) // Potentially unlimited partitions.
+    if ((expr::is_empty_restriction(pk_restrictions) || restrictions.has_token_restrictions()) // Potentially unlimited partitions.
        && !expr::is_empty_restriction(ck_restrictions) // Slice defined.
        && !restrictions.uses_secondary_indexing()) { // Base-table is used. (Index-table use always limits partitions.)
        if (strict_allow_filtering == flag_t::WARN) {
--- a/cql3/statements/statement_type.hh
+++ b/cql3/statements/statement_type.hh
@@ -56,13 +56,7 @@ public:
        return size_t(_type);
    }

-    bool operator==(const statement_type& other) const {
-        return _type == other._type;
-    }
-
-    bool operator!=(const statement_type& other) const {
-        return !(_type == other._type);
-    }
+    bool operator==(const statement_type&) const = default; // memberwise comparison; in C++20 `a != b` is rewritten to !(a == b), so no explicit operator!= is needed

    friend std::ostream &operator<<(std::ostream &os, const statement_type& t) {
        switch (t._type) {
--- a/cql3/statements/strongly_consistent_select_statement.cc
+++ b/cql3/statements/strongly_consistent_select_statement.cc
@@ -119,7 +119,7 @@ strongly_consistent_select_statement::execute_without_checking_exception_message
    });

    if (query_result->value) {
-        result_set->add_row({ query_result->value });
+        result_set->add_row({ managed_bytes_opt(query_result->value) });
    }

    co_return ::make_shared<cql_transport::messages::result_message::rows>(cql3::result{std::move(result_set)});
--- a/cql3/stats.hh
+++ b/cql3/stats.hh
@@ -85,7 +85,7 @@ private:
    uint64_t _query_cnt[(size_t)source_selector::SIZE]
            [(size_t)ks_selector::SIZE]
            [(size_t)cond_selector::SIZE]
-            [statements::statement_type::MAX_VALUE + 1] = {0ul};
+            [statements::statement_type::MAX_VALUE + 1] = {};
 };

 }
--- a/cql3/untyped_result_set.cc
+++ b/cql3/untyped_result_set.cc
@@ -30,17 +30,17 @@ size_t cql3::untyped_result_set_row::index(const std::string_view& name) const {
 bool cql3::untyped_result_set_row::has(std::string_view name) const {
    auto i = index(name);
    if (i < _data.size()) {
-        return !std::holds_alternative<std::monostate>(_data.at(i));
+        return _data.at(i).has_value(); // the column counts as present only when its optional holds a value
    }
    return false;
 }

 cql3::untyped_result_set_row::view_type cql3::untyped_result_set_row::get_view(std::string_view name) const {
-    return std::visit(make_visitor(
-        [](std::monostate) -> view_type { throw std::bad_variant_access(); },
-        [](const view_type& v) -> view_type { return v; },
-        [](const bytes& b) -> view_type { return view_type(b); }
-    ), _data.at(index(name)));
+    auto& data = _data.at(index(name)); // bounds-checked lookup of the cell for this column name
+    if (!data) {
+        throw std::bad_variant_access(); // same exception type the removed std::visit code threw for an empty cell
+    }
+    return *data; // the stored value is returned as a view_type
 }

 const std::vector<lw_shared_ptr<cql3::column_specification>>& cql3::untyped_result_set_row::get_columns() const {
@@ -74,12 +74,12 @@ struct cql3::untyped_result_set::visitor {
    void start_row() {
        tmp.reserve(index.size());
    }
-    void accept_value(std::optional<query::result_bytes_view>&& v) {
+    void accept_value(managed_bytes_view_opt&& v) { // appends one cell to tmp
        if (v) {
-            tmp.emplace_back(std::move(*v));
+            tmp.emplace_back(*v); // value present: the element is constructed from the view
        } else {
-            tmp.emplace_back(std::monostate{});
-        } 
+            tmp.emplace_back(std::nullopt); // value absent: store an empty optional
+        }
    }
    // somewhat weird dispatch, but when visiting directly via
    // result_generator, pk:s will be temporary - and sent 
--- a/cql3/untyped_result_set.hh
+++ b/cql3/untyped_result_set.hh
@@ -41,13 +41,12 @@ class metadata;

 class untyped_result_set_row {
 public:
-    using view_type = query::result_bytes_view;
+    using view_type = managed_bytes_view;
    using opt_view_type = std::optional<view_type>;
-    using view_holder = std::variant<std::monostate, view_type, bytes>;
 private:
    friend class untyped_result_set;
    using index_map = std::unordered_map<std::string_view, size_t>;
-    using data_views = std::vector<view_holder>;
+    using data_views = std::vector<managed_bytes_opt>;

    const index_map& _name_to_index;
    const cql3::metadata& _metadata;
@@ -62,7 +61,7 @@ public:
    bool has(std::string_view) const;
    view_type get_view(std::string_view name) const;
    bytes get_blob(std::string_view name) const {
-        return get_view(name).linearize();
+        return to_bytes(get_view(name)); // converts the view returned by get_view() into a bytes value
    }
    managed_bytes get_blob_fragmented(std::string_view name) const {
        return managed_bytes(get_view(name));
@@ -150,6 +149,8 @@ public:

 class result_set;

+/// A tabular result. Unlike result_set, untyped_result_set is optimized for safety
+/// and convenience, not performance.
 class untyped_result_set {
 public:
    using row = untyped_result_set_row;
--- a/cql3/ut_name.cc
+++ b/cql3/ut_name.cc
@@ -39,8 +39,8 @@ sstring ut_name::get_string_type_name() const
    return _ut_name->to_string();
 }

-sstring ut_name::to_string() const {
-    return (has_keyspace() ? (_ks_name.value() + ".") : "") + _ut_name->to_string();
+sstring ut_name::to_cql_string() const { // renders "<keyspace>.<type>", or just "<type>" when no keyspace is set
+    return (has_keyspace() ? (_ks_name.value() + ".") : "") + _ut_name->to_cql_string(); // NOTE(review): only the type name goes through to_cql_string(); the keyspace is appended as-is -- confirm that is intended
 }

 }
--- a/Show More
+++ b/Show More