release: prepare for 4.3.4

scylla_raid_setup: use /dev/disk/by-uuid to specify filesystem
Currently, var-lib-scylla.mount may fail because it can start before the MDRAID volume is initialized. We could add "After=dev-disk-by\x2duuid-<uuid>.device" to wait for the device to become available, but the systemd manual says that it automatically configures the dependency for a mount unit when the filesystem is specified by the "absolute path of a device node". So we need to replace What=UUID=<uuid> with What=/dev/disk/by-uuid/<uuid>. Fixes #8279 Closes #8681 (cherry picked from commit 3d307919c3)
2021-05-25 21:10:08 +03:00 · 2021-05-24 17:24:18 +03:00 · 2021-05-20 21:26:42 +03:00 · 2021-05-19 12:23:46 +03:00 · 2021-05-19 00:07:09 +03:00 · 2021-05-19 00:07:06 +03:00
117 changed files with 2594 additions and 633 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.3.4

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -159,23 +159,40 @@ static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error::validation(format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -279,24 +296,38 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -310,7 +341,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -341,56 +373,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error::validation(
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error::validation(
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error::validation(
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -437,19 +484,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -461,7 +508,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -573,7 +621,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -604,13 +653,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -1881,7 +1881,8 @@ static std::string get_item_type_string(const rjson::value& v) {

 // calculate_attrs_to_get() takes either AttributesToGet or
 // ProjectionExpression parameters (having both is *not* allowed),
-// and returns the list of cells we need to read.
+// and returns the list of cells we need to read, or an empty set when
+// *all* attributes are to be returned.
 // In our current implementation, only top-level attributes are stored
 // as cells, and nested documents are stored serialized as JSON.
 // So this function currently returns only the the top-level attributes
@@ -2243,19 +2244,30 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
                    rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item.get());
                    rjson::value result;
-                    std::string v1_type = get_item_type_string(v1);
-                    if (v1_type == "N") {
-                        if (get_item_type_string(v2) != "N") {
-                            throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    // An ADD can be used to create a new attribute (when
+                    // v1.IsNull()) or to add to a pre-existing attribute:
+                    if (v1.IsNull()) {
+                        std::string v2_type = get_item_type_string(v2);
+                        if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
+                            result = v2;
+                        } else {
+                            throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
                        }
-                        result = number_add(v1, v2);
-                    } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
-                        if (get_item_type_string(v2) != v1_type) {
-                            throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                        }
-                        result = set_sum(v1, v2);
                    } else {
-                        throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
+                        std::string v1_type = get_item_type_string(v1);
+                        if (v1_type == "N") {
+                            if (get_item_type_string(v2) != "N") {
+                                throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                            }
+                            result = number_add(v1, v2);
+                        } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
+                            if (get_item_type_string(v2) != v1_type) {
+                                throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                            }
+                            result = set_sum(v1, v2);
+                        } else {
+                            throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
+                        }
                    }
                    do_update(to_bytes(column_name), result);
                },
@@ -2571,6 +2583,10 @@ public:
            std::unordered_set<std::string>& used_attribute_values);
    bool check(const rjson::value& item) const;
    bool filters_on(std::string_view attribute) const;
+    // for_filters_on() runs the given function on the attributes that the
+    // filter works on. It may run for the same attribute more than once if
+    // used more than once in the filter.
+    void for_filters_on(const noncopyable_function<void(std::string_view)>& func) const;
    operator bool() const { return bool(_imp); }
 };

@@ -2651,10 +2667,26 @@ bool filter::filters_on(std::string_view attribute) const {
    }, *_imp);
 }

+void filter::for_filters_on(const noncopyable_function<void(std::string_view)>& func) const {
+    if (_imp) {
+        std::visit(overloaded_functor {
+            [&] (const conditions_filter& f) -> void {
+                for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
+                    func(rjson::to_string_view(it->name));
+                }
+            },
+            [&] (const expression_filter& f) -> void {
+                return for_condition_expression_on(f.expression, func);
+            }
+        }, *_imp);
+    }
+}
+
 class describe_items_visitor {
    typedef std::vector<const column_definition*> columns_t;
    const columns_t& _columns;
    const std::unordered_set<std::string>& _attrs_to_get;
+    std::unordered_set<std::string> _extra_filter_attrs;
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
    rjson::value _item;
@@ -2670,7 +2702,20 @@ public:
            , _item(rjson::empty_object())
            , _items(rjson::empty_array())
            , _scanned_count(0)
-    { }
+    {
+        // _filter.check() may need additional attributes not listed in
+        // _attrs_to_get (i.e., not requested as part of the output).
+        // We list those in _extra_filter_attrs. We will include them in
+        // the JSON but take them out before finally returning the JSON.
+        if (!_attrs_to_get.empty()) {
+            _filter.for_filters_on([&] (std::string_view attr) {
+                std::string a(attr); // no heterogenous maps searches :-(
+                if (!_attrs_to_get.contains(a)) {
+                    _extra_filter_attrs.emplace(std::move(a));
+                }
+            });
+        }
+    }

    void start_row() {
        _column_it = _columns.begin();
@@ -2684,7 +2729,7 @@ public:
        result_bytes_view->with_linearized([this] (bytes_view bv) {
            std::string column_name = (*_column_it)->name_as_text();
            if (column_name != executor::ATTRS_COLUMN_NAME) {
-                if (_attrs_to_get.empty() || _attrs_to_get.contains(column_name)) {
+                if (_attrs_to_get.empty() || _attrs_to_get.contains(column_name) || _extra_filter_attrs.contains(column_name)) {
                    if (!_item.HasMember(column_name.c_str())) {
                        rjson::set_with_string_name(_item, column_name, rjson::empty_object());
                    }
@@ -2696,7 +2741,7 @@ public:
                auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
                for (auto entry : keys_and_values) {
                    std::string attr_name = value_cast<sstring>(entry.first);
-                    if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name)) {
+                    if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
                        bytes value = value_cast<bytes>(entry.second);
                        rjson::set_with_string_name(_item, attr_name, deserialize_item(value));
                    }
@@ -2708,6 +2753,11 @@ public:

    void end_row() {
        if (_filter.check(_item)) {
+            // Remove the extra attributes _extra_filter_attrs which we had
+            // to add just for the filter, and not requested to be returned:
+            for (const auto& attr : _extra_filter_attrs) {
+                rjson::remove_member(_item, attr);
+            }
            rjson::push_back(_items, std::move(_item));
        }
        _item = rjson::empty_object();
@@ -2742,7 +2792,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
        rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_pk_it)));
+        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef));
        ++exploded_pk_it;
    }
    auto ck = paging_state.get_clustering_key();
@@ -2752,7 +2802,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
        for (const column_definition& cdef : schema.clustering_key_columns()) {
            rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_ck_it)));
+            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
            ++exploded_ck_it;
        }
    }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -348,6 +348,39 @@ bool condition_expression_on(const parsed::condition_expression& ce, std::string
    }, ce._expression);
 }

+// for_condition_expression_on() runs a given function over all the attributes
+// mentioned in the expression. If the same attribute is mentioned more than
+// once, the function will be called more than once for the same attribute.
+
+static void for_value_on(const parsed::value& v, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) { },
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::path& p) {
+            func(p.root());
+        }
+    }, v._value);
+}
+
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            for (const parsed::value& value : cond._values) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) {
+                for_condition_expression_on(cond, func);
+            }
+        }
+    }, ce._expression);
+}
+
 // The following calculate_value() functions calculate, or evaluate, a parsed
 // expression. The parsed expression is assumed to have been "resolved", with
 // the matching resolve_* function.
@@ -570,52 +603,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -27,6 +27,8 @@
 #include <unordered_set>
 #include <string_view>

+#include <seastar/util/noncopyable_function.hh>
+
 #include "expressions_types.hh"
 #include "utils/rjson.hh"

@@ -59,6 +61,11 @@ void validate_value(const rjson::value& v, const char* caller);

 bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute);

+// for_condition_expression_on() runs the given function on the attributes
+// that the expression uses. It may run for the same attribute more than once
+// if the same attribute is used more than once in the expression.
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func);
+
 // calculate_value() behaves slightly different (especially, different
 // functions supported) when used in different types of expressions, as
 // enumerated in this enum:
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -849,6 +849,7 @@ future<executor::request_return_type> executor::get_records(client_state& client

    static const bytes timestamp_column_name = cdc::log_meta_column_name_bytes("time");
    static const bytes op_column_name = cdc::log_meta_column_name_bytes("operation");
+    static const bytes eor_column_name = cdc::log_meta_column_name_bytes("end_of_batch");

    auto key_names = boost::copy_range<std::unordered_set<std::string>>(
        boost::range::join(std::move(base->partition_key_columns()), std::move(base->clustering_key_columns()))
@@ -872,7 +873,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
    std::transform(cks.begin(), cks.end(), std::back_inserter(columns), [](auto& c) { return &c; });

    auto regular_columns = boost::copy_range<query::column_id_vector>(schema->regular_columns() 
-        | boost::adaptors::filtered([](const column_definition& cdef) { return cdef.name() == op_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); })
+        | boost::adaptors::filtered([](const column_definition& cdef) { return cdef.name() == op_column_name || cdef.name() == eor_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); })
        | boost::adaptors::transformed([&] (const column_definition& cdef) { columns.emplace_back(&cdef); return cdef.id; })
    );

@@ -905,6 +906,11 @@ future<executor::request_return_type> executor::get_records(client_state& client
                return cdef->name->name() == timestamp_column_name;
            })
        );
+        auto eor_index = std::distance(metadata.get_names().begin(), 
+            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+                return cdef->name->name() == eor_column_name;
+            })
+        );

        std::optional<utils::UUID> timestamp;
        auto dynamodb = rjson::empty_object();
@@ -930,15 +936,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
        for (auto& row : result_set->rows()) {
            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-
-            if (timestamp && timestamp != ts) {
-                maybe_add_record();
-                if (limit == 0) {
-                    break;
-                }
-            }
-
-            timestamp = ts;
+            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

            if (!dynamodb.HasMember("Keys")) {
                auto keys = rjson::empty_object();
@@ -991,9 +989,13 @@ future<executor::request_return_type> executor::get_records(client_state& client
                rjson::set(record, "eventName", "REMOVE");
                break;
            }
-        }
-        if (limit > 0 && timestamp) {
-            maybe_add_record();
+            if (eor) {
+                maybe_add_record();
+                timestamp = ts;
+                if (limit == 0) {
+                    break;
+                }
+            }
        }

        auto ret = rjson::empty_object();
@@ -1047,6 +1049,9 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
        if (!db.features().cluster_supports_cdc()) {
            throw api_error::validation("StreamSpecification: streams (CDC) feature not enabled in cluster.");
        }
+        if (!db.features().cluster_supports_alternator_streams()) {
+            throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
+        }

        cdc::options opts;
        opts.enabled(true);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -656,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -664,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -672,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -680,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -696,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -20,10 +20,16 @@

 #pragma once

+#include <map>
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
 #include "serializer.hh"
 #include "db/extensions.hh"
 #include "cdc/cdc_options.hh"
 #include "schema.hh"
+#include "serializer_impl.hh"

 namespace cdc {

--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -23,6 +23,7 @@
 #include <random>
 #include <unordered_set>
 #include <seastar/core/sleep.hh>
+#include <algorithm>

 #include "keys.hh"
 #include "schema_builder.hh"
@@ -174,10 +175,29 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

+std::vector<token_range_description>&& topology_description::entries() && {
+    return std::move(_entries);
+}
+
+static std::vector<stream_id> create_stream_ids(
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    std::vector<stream_id> result;
+    result.reserve(shard_count);
+    dht::sharder sharder(shard_count, ignore_msb);
+    for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+        auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+        // compose the id from token and the "index" of the range end owning vnode
+        // as defined by token sort order. Basically grouping within this
+        // shard set.
+        result.emplace_back(stream_id(t, index));
+    }
+    return result;
+}
+
 class topology_description_generator final {
    const db::config& _cfg;
    const std::unordered_set<dht::token>& _bootstrap_tokens;
@@ -217,18 +237,9 @@ class topology_description_generator final {
        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            // compose the id from token and the "index" of the range end owning vnode
-            // as defined by token sort order. Basically grouping within this
-            // shard set.
-            desc.streams.emplace_back(stream_id(t, index));
-        }
-
        return desc;
    }
 public:
@@ -294,6 +305,38 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// non-static for testing
+size_t limit_of_streams_in_topology_description() {
+    // Each stream takes 16B and we don't want to exceed 4MB so we can have
+    // at most 262144 streams but not less than 1 per vnode.
+    return 4 * 1024 * 1024 / 16;
+}
+
+// non-static for testing
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
+    size_t streams_count = 0; // total number of streams over all entries
+    for (const auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+
+    const size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
+    if (limit >= streams_count) {
+        return std::move(desc); // within the limit; std::move is intentional, desc is an rvalue-reference parameter
+    }
+    const size_t streams_per_vnode_limit = limit / desc.entries().size(); // >= 1: limit >= entries().size() > 0 here
+    auto entries = std::move(desc).entries();
+    auto start = entries.back().token_range_end; // range of entry 0 starts where the last entry ends
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) { // regenerate only oversized entries
+            entries[idx].streams =
+                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end;
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
@@ -306,6 +349,18 @@ db_clock::time_point make_new_cdc_generation(
    using namespace std::chrono;
    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();

+    // If the cluster is large we may end up with a generation that contains
+    // a large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with a large number of
+    // streams this will lead to a row that can be as big as 32MB. This is much
+    // more than the limit imposed by commitlog_segment_size_in_mb. If the size
+    // of the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid such a problem we make sure that such a row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));
+
    // Begin the race.
    auto ts = db_clock::now() + (
            (for_testing || ring_delay == milliseconds(0)) ? milliseconds(0) : (
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -68,6 +68,7 @@ public:

    stream_id() = default;
    stream_id(bytes);
+    stream_id(dht::token, size_t);

    bool is_set() const;
    bool operator==(const stream_id&) const;
@@ -81,9 +82,6 @@ public:

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
-private:
-    friend class topology_description_generator;
-    stream_id(dht::token, size_t);
 };

 /* Describes a mapping of tokens to CDC streams in a token range.
@@ -116,7 +114,8 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -154,7 +153,7 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
 future<db_clock::time_point> get_local_streams_timestamp();

 /* Generate a new set of CDC streams and insert it into the distributed cdc_generation_descriptions table.
- * Returns the timestamp of this new generation.
+ * Returns the timestamp of this new generation
 *
 * Should be called when starting the node for the first time (i.e., joining the ring).
 *
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -519,6 +519,7 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
    b.with_column(log_meta_column_name_bytes("batch_seq_no"), int32_type, column_kind::clustering_key);
    b.with_column(log_meta_column_name_bytes("operation"), data_type_for<operation_native_type>());
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
+    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
    b.set_caching_options(caching_options::get_disabled_caching_options());
    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
        for (const auto& column : columns) {
@@ -880,14 +881,26 @@ public:
        return _base_schema;
    }

+    clustering_key create_ck(int batch) const {
+        return clustering_key::from_exploded(_log_schema, { _tuuid, int32_type->decompose(batch) });
+    }
+
    // Creates a new clustering row in the mutation, assigning it the next `cdc$batch_seq_no`.
    // The numbering of batch sequence numbers starts from 0.
    clustering_key allocate_new_log_row() {
-        auto log_ck = clustering_key::from_exploded(_log_schema, { _tuuid, int32_type->decompose(_batch_no++) });
+        auto log_ck = create_ck(_batch_no++);
        set_key_columns(log_ck, _base_schema.partition_key_columns(), _base_pk);
        return log_ck;
    }

+    bool has_rows() const {
+        return _batch_no != 0;
+    }
+
+    clustering_key last_row_key() const {
+        return create_ck(_batch_no - 1);
+    }
+
    // A common pattern is to allocate a row and then immediately set its `cdc$operation` column.
    clustering_key allocate_new_log_row(operation op) {
        auto log_ck = allocate_new_log_row();
@@ -944,6 +957,11 @@ public:
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

+    void end_record() {
+        if (has_rows()) {
+            _log_mut.set_cell(last_row_key(), log_meta_column_name_bytes("end_of_batch"), data_value(true), _ts, _ttl);
+        }
+    }
 private:
    void set_key_columns(const clustering_key& log_ck, schema::const_iterator_range_type columns, const std::vector<bytes>& key) {
        size_t pos = 0;
@@ -1272,6 +1290,13 @@ struct process_change_visitor {
                _clustering_row_states, _generate_delta_values);
        visit_row_cells(v);

+        if (_enable_updating_state) {
+            // #7716: if there are no regular columns, our visitor would not have visited any cells,
+            // hence it would not have created a row_state for this row. In effect, postimage wouldn't be produced.
+            // Ensure that the row state exists.
+            _clustering_row_states.try_emplace(ckey);
+        }
+
        _builder.set_operation(log_ck, v._cdc_op);
        _builder.set_ttl(log_ck, v._ttl_column);
    }
@@ -1519,6 +1544,11 @@ public:
        cdc::inspect_mutation(m, v);
    }

+    void end_record() override {
+        assert(_builder);
+        _builder->end_record();
+    }
+
    // Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
    // The `transformer` object on which this method was called on should not be used anymore.
    std::tuple<std::vector<mutation>, stats::part_type_set> finish() && {
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -684,6 +684,8 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
                processor.produce_postimage(&ck);
            }
        }
+
+        processor.end_record();
    }
 }

@@ -731,6 +733,8 @@ void process_changes_without_splitting(const mutation& base_mutation, change_pro
            processor.produce_postimage(&cr.key());
        }
    }
+
+    processor.end_record();
 }

 } // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -77,6 +77,10 @@ public:
    // both columns have different timestamp or TTL set.
    //   m - the small mutation to be converted into CDC log rows.
    virtual void process_change(const mutation& m) = 0;
+
+    // Tells processor we have reached end of record - last part
+    // of a given timestamp batch
+    virtual void end_record() = 0;
 };

 bool should_split(const mutation& base_mutation);
--- a/configure.py
+++ b/configure.py
@@ -275,6 +275,7 @@ modes = {

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
    'test/boost/alternator_base64_test',
@@ -854,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -192,9 +192,12 @@ public:

        virtual ::shared_ptr<terminal> bind(const query_options& options) override {
            auto bytes = bind_and_get(options);
-            if (!bytes) {
+            if (bytes.is_null()) {
                return ::shared_ptr<terminal>{};
            }
+            if (bytes.is_unset_value()) {
+                return UNSET_VALUE;
+            }
            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -27,7 +27,9 @@
 #include <fmt/ostream.h>
 #include <unordered_map>

+#include "cql3/constants.hh"
 #include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
 #include "cql3/tuples.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/list.hh"
@@ -417,6 +419,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
                return equal(b, col, bag);
            });
@@ -568,7 +572,8 @@ const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return

 /// Returns possible values from t, which must be RHS of IN.
 value_list get_IN_values(
-        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator) {
+        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
+        sstring_view column_name) {
    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
        // Case `a IN (1,2,3)`.
@@ -578,8 +583,12 @@ value_list get_IN_values(
        return to_sorted_vector(std::move(result_range), comparator);
    } else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
        // Case `a IN ?`.  Collect all list-element values.
-        const auto val = static_pointer_cast<lists::value>(mkr->bind(options));
-        return to_sorted_vector(val->get_elements() | non_null | deref, comparator);
+        const auto val = mkr->bind(options);
+        if (val == constants::UNSET_VALUE) {
+            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
+        }
+        statements::request_validations::check_not_null(val, "Invalid null value for IN tuple");
+        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
    }
    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
 }
@@ -686,7 +695,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                                return oper.op == oper_t::EQ ? value_set(value_list{*val})
                                        : to_range(oper.op, *val);
                            } else if (oper.op == oper_t::IN) {
-                                return get_IN_values(oper.rhs, options, type->as_less_comparator());
+                                return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
                            }
                            throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
                        },
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -305,6 +305,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = _k->bind_and_get(params._options);
    auto value = _t->bind_and_get(params._options);
+    if (value.is_unset_value()) {
+        return; // unset value: nothing to set for this key
+    }
+    if (key.is_unset_value()) { // value cannot be unset here, that case returned above
+        throw invalid_request_exception("Invalid unset map key");
+    }
    if (!key) {
        throw invalid_request_exception("Invalid null map key");
    }
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -315,7 +315,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
    assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";

    auto&& value = _t->bind(params._options);
-    if (!value) {
+    if (!value || value == constants::UNSET_VALUE) {
        return;
    }

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -59,6 +59,7 @@
 #include "db/timeout_clock.hh"
 #include "db/consistency_level_validations.hh"
 #include "database.hh"
+#include "test/lib/select_statement_utils.hh"
 #include <boost/algorithm/cxx11/any_of.hpp>

 bool is_system_keyspace(const sstring& name);
@@ -67,6 +68,8 @@ namespace cql3 {

 namespace statements {

+static constexpr int DEFAULT_INTERNAL_PAGING_SIZE = select_statement::DEFAULT_COUNT_PAGE_SIZE;
+thread_local int internal_paging_size = DEFAULT_INTERNAL_PAGING_SIZE;
 thread_local const lw_shared_ptr<const select_statement::parameters> select_statement::_default_parameters = make_lw_shared<select_statement::parameters>();

 select_statement::parameters::parameters()
@@ -338,7 +341,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
    if (aggregate || nonpaged_filtering) {
-        page_size = DEFAULT_COUNT_PAGE_SIZE;
+        page_size = internal_paging_size;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);
@@ -453,7 +456,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
        if (!view_col) {
            throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
        }
-        if (base_col.type != view_col->type) {
+        if (base_col.type->without_reversed() != *view_col->type) {
            throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
                    base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
        }
@@ -541,13 +544,29 @@ indexed_table_select_statement::do_execute_base_query(
            if (old_paging_state && concurrency == 1) {
                auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
                        old_paging_state->get_clustering_key(), *_schema, *_view_schema);
+                auto row_ranges = command->slice.default_row_ranges();
                if (old_paging_state->get_clustering_key() && _schema->clustering_key_size() > 0) {
                    auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
                            old_paging_state->get_clustering_key(), *_schema, *_view_schema);
-                    command->slice.set_range(*_schema, base_pk,
-                            std::vector<query::clustering_range>{query::clustering_range::make_starting_with(range_bound<clustering_key>(base_ck, false))});
+
+                    query::trim_clustering_row_ranges_to(*_schema, row_ranges, base_ck, false);
+                    command->slice.set_range(*_schema, base_pk, row_ranges);
                } else {
-                    command->slice.set_range(*_schema, base_pk, std::vector<query::clustering_range>{query::clustering_range::make_open_ended_both_sides()});
+                    // There is no clustering key in old_paging_state and/or no clustering key in 
+                    // _schema, therefore read an entire partition (whole clustering range).
+                    //
+                    // The only exception to applying no restrictions on clustering key
+                    // is a case when we have a secondary index on the first column
+                    // of clustering key. In such a case we should not read the
+                    // entire clustering range - only a range in which first column
+                    // of clustering key has the correct value. 
+                    //
+                    // This means that we should not set an open_ended_both_sides
+                    // clustering range on base_pk, instead intersect it with
+                    // row_ranges (which contains the restrictions necessary for the
+                    // case described above). The result of such intersection is just
+                    // row_ranges, which we explicitly set on base_pk.
+                    command->slice.set_range(*_schema, base_pk, row_ranges);
                }
            }
            concurrency *= 2;
@@ -992,12 +1011,16 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    if (aggregate) {
        const bool restrictions_need_filtering = _restrictions->need_filtering();
-        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
+        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format(), *_group_by_cell_indices), std::make_unique<cql3::query_options>(cql3::query_options(options)),
                [this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
            // page size is set to the internal count page size, regardless of the user-provided value
-            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
+            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), internal_paging_size));
            return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
-                auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                auto consume_results = [this, &builder, &options, &internal_options, &proxy, &state, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd, lw_shared_ptr<const service::pager::paging_state> paging_state) {
+                    if (paging_state) {
+                        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, proxy, state, options);
+                    }
+                    internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
                    if (restrictions_need_filtering) {
                        _stats.filtered_rows_read_total += *results->row_count();
                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
@@ -1005,24 +1028,24 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
                    } else {
                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
                    }
+                    bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                    return stop_iteration(!has_more_pages);
                };

                if (whole_partitions || partition_slices) {
                    return find_index_partition_ranges(proxy, state, *internal_options).then_unpack(
                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, lw_shared_ptr<const service::pager::paging_state> paging_state) {
-                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
-                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
-                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then_unpack(consume_results).then([has_more_pages] {
-                            return stop_iteration(!has_more_pages);
+                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, paging_state)
+                        .then_unpack([paging_state, consume_results = std::move(consume_results)](foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                            return consume_results(std::move(results), std::move(cmd), std::move(paging_state));
                        });
                    });
                } else {
                    return find_index_clustering_rows(proxy, state, *internal_options).then_unpack(
                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, lw_shared_ptr<const service::pager::paging_state> paging_state) {
-                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
-                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
-                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then_unpack(consume_results).then([has_more_pages] {
-                            return stop_iteration(!has_more_pages);
+                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, paging_state)
+                        .then_unpack([paging_state, consume_results = std::move(consume_results)](foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                            return consume_results(std::move(results), std::move(cmd), std::move(paging_state));
                        });
                    });
                }
@@ -1687,6 +1710,16 @@ std::vector<size_t> select_statement::prepare_group_by(const schema& schema, sel

 }

+future<> set_internal_paging_size(int paging_size) {
+    return seastar::smp::invoke_on_all([paging_size] {
+        internal_paging_size = paging_size;
+    });
+}
+
+future<> reset_internal_paging_size() {
+    return set_internal_paging_size(DEFAULT_INTERNAL_PAGING_SIZE);
+}
+
 }

 namespace util {
--- a/database.cc
+++ b/database.cc
@@ -572,9 +572,6 @@ void database::set_format_by_config() {
 }

 database::~database() {
-    _read_concurrency_sem.clear_inactive_reads();
-    _streaming_concurrency_sem.clear_inactive_reads();
-    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -662,11 +659,22 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
            });
    }).then([&proxy, &mm, this] {
        return do_parse_schema_tables(proxy, db::schema_tables::VIEWS, [this, &proxy, &mm] (schema_result_value_type &v) {
-            return create_views_from_schema_partition(proxy, v.second).then([this, &mm] (std::vector<view_ptr> views) {
-                return parallel_for_each(views.begin(), views.end(), [this, &mm] (auto&& v) {
-                    return this->add_column_family_and_make_directory(v).then([this, &mm, v] {
-                        return maybe_update_legacy_secondary_index_mv_schema(mm.local(), *this, v);
-                    });
+            return create_views_from_schema_partition(proxy, v.second).then([this, &mm, &proxy] (std::vector<view_ptr> views) {
+                return parallel_for_each(views.begin(), views.end(), [this, &mm, &proxy] (auto&& v) {
+                    // TODO: Remove once computed columns are guaranteed to be featured in the whole cluster.
+                    // We fix the schema in place here in order to avoid races (write commands coming from other coordinators).
+                    view_ptr fixed_v = maybe_fix_legacy_secondary_index_mv_schema(*this, v, nullptr, preserve_version::yes);
+                    view_ptr v_to_add = fixed_v ? fixed_v : v;
+                    future<> f = this->add_column_family_and_make_directory(v_to_add);
+                    if (bool(fixed_v)) {
+                        v_to_add = fixed_v;
+                        auto&& keyspace = find_keyspace(v->ks_name()).metadata();
+                        auto mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(v), fixed_v, api::new_timestamp(), true);
+                        f = f.then([this, &proxy, mutations = std::move(mutations)] {
+                            return db::schema_tables::merge_schema(proxy, _feat, std::move(mutations));
+                        });
+                    }
+                    return f;
                });
            });
        });
@@ -801,7 +809,7 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
    remove(*cf);
    cf->clear_views();
    auto& ks = find_keyspace(ks_name);
-    return when_all_succeed(cf->await_pending_writes(), cf->await_pending_reads()).then_unpack([this, &ks, cf, tsf = std::move(tsf), snapshot] {
+    return cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
            return cf->stop();
        });
@@ -1808,6 +1816,13 @@ future<>
 database::stop() {
    assert(!_large_data_handler->running());

+    // Inactive reads might hold on to sstables, blocking the
+    // `sstables_manager::close()` calls below. No one will come back for these
+    // reads at this point so clear them before proceeding with the shutdown.
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
+
    // try to ensure that CL has done disk flushing
    future<> maybe_shutdown_commitlog = _commitlog != nullptr ? _commitlog->shutdown() : make_ready_future<>();
    return maybe_shutdown_commitlog.then([this] {
@@ -1859,26 +1874,28 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun

        return cf.run_with_compaction_disabled([this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
            future<> f = make_ready_future<>();
-            if (should_flush) {
+            bool did_flush = false;
+            if (should_flush && cf.can_flush()) {
                // TODO:
                // this is not really a guarantee at all that we've actually
                // gotten all things to disk. Again, need queue-ish or something.
                f = cf.flush();
+                did_flush = true;
            } else {
                f = cf.clear();
            }
-            return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush] {
+            return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
                dblog.debug("Discarding sstable data for truncated CF + indexes");
                // TODO: notify truncation

-                return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush](db_clock::time_point truncated_at) {
+                return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
                    future<> f = make_ready_future<>();
                    if (auto_snapshot) {
                        auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
                        f = cf.snapshot(*this, name);
                    }
-                    return f.then([this, &cf, truncated_at, low_mark, should_flush] {
-                        return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush](db::replay_position rp) {
+                    return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
+                        return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
@@ -1886,7 +1903,7 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
                            // the low_mark assertion does not hold, because we maybe/probably never got around to 
                            // creating the sstables that would create them.
-                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
+                            assert(!did_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
--- a/database.hh
+++ b/database.hh
@@ -224,6 +224,10 @@ public:
        return bool(_seal_immediate_fn);
    }

+    bool can_flush() const {
+        return may_flush() && !empty();
+    }
+
    bool empty() const {
        for (auto& m : _memtables) {
           if (!m->empty()) {
@@ -505,6 +509,8 @@ private:
    utils::phased_barrier _pending_reads_phaser;
    // Corresponding phaser for in-progress streams
    utils::phased_barrier _pending_streams_phaser;
+    // Corresponding phaser for in-progress flushes
+    utils::phased_barrier _pending_flushes_phaser;

    // This field cashes the last truncation time for the table.
    // The master resides in system.truncated table
@@ -780,6 +786,8 @@ public:
    // to them, and then pass that + 1 as "start".
    future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);

+    bool can_flush() const;
+
    // FIXME: this is just an example, should be changed to something more
    // general. compact_all_sstables() starts a compaction of all sstables.
    // It doesn't flush the current memtable first. It's just a ad-hoc method,
@@ -932,6 +940,14 @@ public:
        return _pending_streams_phaser.advance_and_await();
    }

+    future<> await_pending_flushes() {
+        return _pending_flushes_phaser.advance_and_await();
+    }
+
+    future<> await_pending_ops() {
+        return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
+    }
+
    void add_or_update_view(view_ptr v);
    void remove_view(view_ptr v);
    void clear_views();
--- a/db/config.cc
+++ b/db/config.cc
@@ -31,6 +31,7 @@
 #include <seastar/core/print.hh>
 #include <seastar/util/log.hh>

+#include "cdc/cdc_extension.hh"
 #include "config.hh"
 #include "extensions.hh"
 #include "log.hh"
@@ -694,7 +695,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
    , replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
    , override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
-    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
+    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
    , ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
    , shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
    , fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
@@ -792,6 +793,10 @@ db::config::config()
 db::config::~config()
 {}

+void db::config::add_cdc_extension() {
+    _extensions->add_schema_extension<cdc::cdc_extension>(cdc::cdc_extension::NAME);
+}
+
 void db::config::setup_directories() {
    maybe_in_workdir(commitlog_directory, "commitlog");
    maybe_in_workdir(data_file_directories, "data");
@@ -874,7 +879,7 @@ db::fs::path db::config::get_conf_sub(db::fs::path sub) {
 }

 bool db::config::check_experimental(experimental_features_t::feature f) const {
-    if (experimental() && f != experimental_features_t::UNUSED) {
+    if (experimental() && f != experimental_features_t::UNUSED && f != experimental_features_t::UNUSED_CDC) {
        return true;
    }
    const auto& optval = experimental_features();
@@ -928,11 +933,13 @@ std::unordered_map<sstring, db::experimental_features_t::feature> db::experiment
    // https://github.com/scylladb/scylla/pull/5369#discussion_r353614807
    // Lightweight transactions are no longer experimental. Map them
    // to UNUSED switch for a while, then remove altogether.
-    return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", CDC}};
+    // Change Data Capture is no longer experimental. Map it
+    // to UNUSED_CDC switch for a while, then remove altogether.
+    return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", UNUSED_CDC}, {"alternator-streams", ALTERNATOR_STREAMS}};
 }

 std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
-    return {UDF, CDC};
+    return {UDF, ALTERNATOR_STREAMS};
 }

 template struct utils::config_file::named_value<seastar::log_level>;
--- a/db/config.hh
+++ b/db/config.hh
@@ -81,7 +81,7 @@ namespace db {

 /// Enumeration of all valid values for the `experimental` config entry.
 struct experimental_features_t {
-    enum feature { UNUSED, UDF, CDC };
+    enum feature { UNUSED, UDF, UNUSED_CDC, ALTERNATOR_STREAMS };
    static std::unordered_map<sstring, feature> map(); // See enum_option.
    static std::vector<enum_option<experimental_features_t>> all();
 };
@@ -92,6 +92,9 @@ public:
    config(std::shared_ptr<db::extensions>);
    ~config();

+    // For testing only
+    void add_cdc_extension();
+
    /// True iff the feature is enabled.
    bool check_experimental(experimental_features_t::feature f) const;

--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -113,7 +113,7 @@ future<> cql_table_large_data_handler::record_large_cells(const sstables::sstabl
        auto ck_str = key_to_str(*clustering_key, s);
        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("{} {}", ck_str, column_name), extra_fields, ck_str, column_name);
    } else {
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, nullptr, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -125,7 +125,7 @@ future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable
        std::string ck_str = key_to_str(*clustering_key, s);
        return try_record("row", sst, partition_key, int64_t(row_size), "row", ck_str, extra_fields,  ck_str);
    } else {
-        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, nullptr);
+        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
 }

--- a/db/large_data_handler.hh
+++ b/db/large_data_handler.hh
@@ -111,27 +111,12 @@ public:
        return make_ready_future<>();
    }

-    future<> maybe_delete_large_data_entries(const schema& s, sstring filename, uint64_t data_size) {
+    future<> maybe_delete_large_data_entries(const schema& /*s*/, sstring /*filename*/, uint64_t /*data_size*/) {
        assert(running());
-        future<> large_partitions = make_ready_future<>();
-        if (__builtin_expect(data_size > _partition_threshold_bytes, false)) {
-            large_partitions = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_PARTITIONS);
-            });
-        }
-        future<> large_rows = make_ready_future<>();
-        if (__builtin_expect(data_size > _row_threshold_bytes, false)) {
-            large_rows = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_ROWS);
-            });
-        }
-        future<> large_cells = make_ready_future<>();
-        if (__builtin_expect(data_size > _cell_threshold_bytes, false)) {
-            large_cells = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_CELLS);
-            });
-        }
-        return when_all(std::move(large_partitions), std::move(large_rows), std::move(large_cells)).discard_result();
+
+        // Deletion of large data entries is disabled due to #7668
+        // They will eventually expire based on the 30-day TTL.
+        return make_ready_future<>();
    }

    const large_data_handler::stats& stats() const { return _stats; }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -58,6 +58,7 @@
 #include "schema_registry.hh"
 #include "mutation_query.hh"
 #include "system_keyspace.hh"
+#include "system_distributed_keyspace.hh"
 #include "cql3/cql3_type.hh"
 #include "cql3/functions/functions.hh"
 #include "cql3/util.hh"
@@ -104,6 +105,11 @@ using namespace std::chrono_literals;

 static logging::logger diff_logger("schema_diff");

+static bool is_extra_durable(const sstring& ks_name, const sstring& cf_name) {
+    return (is_system_keyspace(ks_name) && db::system_keyspace::is_extra_durable(cf_name))
+        || (ks_name == db::system_distributed_keyspace::NAME && db::system_distributed_keyspace::is_extra_durable(cf_name));
+}
+

 /** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
 namespace db {
@@ -1202,7 +1208,42 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
        return create_table_from_mutations(proxy, std::move(sm));
    });
    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm) {
-        return create_view_from_mutations(proxy, std::move(sm));
+        // The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
+        // If we don't do it we are leaving a window where write commands to this schema are illegal.
+        // There are 3 possibilities:
+        // 1. The table was altered - in this case we want the view to correspond to this new table schema.
+        // 2. The table was just created - the table is guaranteed to be published with the view in that case.
+        // 3. The view itself was altered - in that case we already know the base table so we can take it from
+        //    the database object.
+        view_ptr vp = create_view_from_mutations(proxy, std::move(sm));
+        schema_ptr base_schema;
+        for (auto&& s : tables_diff.altered) {
+            if (s.new_schema.get()->ks_name() == vp->ks_name() && s.new_schema.get()->cf_name() == vp->view_info()->base_name() ) {
+                base_schema = s.new_schema;
+                break;
+            }
+        }
+        if (!base_schema) {
+            for (auto&& s : tables_diff.created) {
+                if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) {
+                    base_schema = s;
+                    break;
+                }
+            }
+        }
+
+        if (!base_schema) {
+            base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name());
+        }
+
+        // Now that we have a referenced base - just in case we are registering an old view (this can happen in a mixed cluster)
+        // let's make it write-enabled by updating its computed columns.
+        view_ptr fixed_vp = maybe_fix_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema, preserve_version::yes);
+        if(fixed_vp) {
+            vp = fixed_vp;
+        }
+        vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema));
+        return vp;
    });

    proxy.local().get_db().invoke_on_all([&] (database& db) {
@@ -2499,7 +2540,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
        builder.with_sharder(smp::count, ctxt.murmur3_partitioner_ignore_msb_bits());
    }

-    if (is_system_keyspace(ks_name) && is_extra_durable(cf_name)) {
+    if (is_extra_durable(ks_name, cf_name)) {
        builder.set_wait_for_sync_to_commitlog(true);
    }

@@ -3027,39 +3068,40 @@ std::vector<sstring> all_table_names(schema_features features) {
           boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
 }

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v) {
-    // TODO(sarna): Remove once computed columns are guaranteed to be featured in the whole cluster.
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version) {
    // Legacy format for a secondary index used a hardcoded "token" column, which ensured a proper
    // order for indexed queries. This "token" column is now implemented as a computed column,
    // but for the sake of compatibility we assume that there might be indexes created in the legacy
    // format, where "token" is not marked as computed. Once we're sure that all indexes have their
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
-    if (!db.features().cluster_supports_computed_columns()) {
-        return make_ready_future<>();
-    }
-
    if (v->clustering_key_size() == 0) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
    }
    const column_definition& first_view_ck = v->clustering_key_columns().front();
    if (first_view_ck.is_computed()) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
+    }
+
+    if (!base_schema) {
+        base_schema = db.find_schema(v->view_info()->base_id());
    }

-    table& base = db.find_column_family(v->view_info()->base_id());
-    schema_ptr base_schema = base.schema();
    // If the first clustering key part of a view is a column with name not found in base schema,
    // it implies it might be backing an index created before computed columns were introduced,
    // and as such it must be recreated properly.
    if (!base_schema->columns_by_name().contains(first_view_ck.name())) {
        schema_builder builder{schema_ptr(v)};
        builder.mark_column_computed(first_view_ck.name(), std::make_unique<token_column_computation>());
-        return mm.announce_view_update(view_ptr(builder.build()), true);
+        if (preserve_version) {
+            builder.with_version(v->version());
+        }
+        return view_ptr(builder.build());
    }
-    return make_ready_future<>();
+    return view_ptr(nullptr);
 }

+
 namespace legacy {

 table_schema_version schema_mutations::digest() const {
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -238,7 +238,9 @@ std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata

 std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v);
+class preserve_version_tag {};
+using preserve_version = bool_class<preserve_version_tag>;
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version);

 sstring serialize_kind(column_kind kind);
 column_kind deserialize_kind(sstring kind);
--- a/db/size_estimates_virtual_reader.cc
+++ b/db/size_estimates_virtual_reader.cc
@@ -201,10 +201,10 @@ static future<std::vector<token_range>> get_local_ranges(database& db) {
        // All queries will be on that table, where all entries are text and there's no notion of
        // token ranges form the CQL point of view.
        auto left_inf = boost::find_if(ranges, [] (auto&& r) {
-            return !r.start() || r.start()->value() == dht::minimum_token();
+            return r.end() && (!r.start() || r.start()->value() == dht::minimum_token());
        });
        auto right_inf = boost::find_if(ranges, [] (auto&& r) {
-            return !r.end() || r.start()->value() == dht::maximum_token();
+            return r.start() && (!r.end() || r.end()->value() == dht::maximum_token());
        });
        if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
            local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -113,6 +113,10 @@ static std::vector<schema_ptr> all_tables() {
    };
 }

+bool system_distributed_keyspace::is_extra_durable(const sstring& cf_name) {
+    return cf_name == CDC_TOPOLOGY_DESCRIPTION;
+}
+
 system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm)
        : _qp(qp)
        , _mm(mm) {
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -64,6 +64,10 @@ private:
    service::migration_manager& _mm;

 public:
+    /* Should writes to the given table always be synchronized by commitlog (flushed to disk)
+     * before being acknowledged? */
+    static bool is_extra_durable(const sstring& cf_name);
+
    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&);

    future<> start();
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -1241,6 +1241,14 @@ future<> mutate_MV(
                }
            }
        }
+        // It's still possible that a target endpoint is duplicated in the remote endpoints list,
+        // so let's get rid of the duplicate if it exists
+        if (target_endpoint) {
+            auto remote_it = std::find(remote_endpoints.begin(), remote_endpoints.end(), *target_endpoint);
+            if (remote_it != remote_endpoints.end()) {
+                remote_endpoints.erase(remote_it);
+            }
+        }

        if (target_endpoint && *target_endpoint == my_address) {
            ++stats.view_updates_pushed_local;
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -24,6 +24,8 @@ import os
 import sys
 import tempfile
 import tarfile
+import shutil
+import glob
 from scylla_util import *
 import argparse

@@ -61,6 +63,9 @@ if __name__ == '__main__':
            f.write(data)
        with tarfile.open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION)) as tf:
            tf.extractall(INSTALL_DIR)
+        shutil.chown(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64', 'root', 'root')
+        for f in glob.glob(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64/*'):
+            shutil.chown(f, 'root', 'root')
        os.remove('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION))
        if node_exporter_p.exists():
            node_exporter_p.unlink()
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -244,12 +244,12 @@ if __name__ == "__main__":
                # and https://cloud.google.com/compute/docs/disks/local-ssd#nvme
                # note that scylla iotune might measure more, this is GCP recommended
                mbs=1024*1024
-                if nr_disks >= 1 & nr_disks < 4:
+                if nr_disks >= 1 and nr_disks < 4:
                    disk_properties["read_iops"] = 170000 * nr_disks
                    disk_properties["read_bandwidth"] = 660 * mbs * nr_disks
                    disk_properties["write_iops"] = 90000 * nr_disks
                    disk_properties["write_bandwidth"] = 350 * mbs * nr_disks
-                elif nr_disks >= 4 & nr_disks <= 8:
+                elif nr_disks >= 4 and nr_disks <= 8:
                    disk_properties["read_iops"] = 680000
                    disk_properties["read_bandwidth"] = 2650 * mbs
                    disk_properties["write_iops"] = 360000
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -27,6 +27,7 @@ import platform
 import distro

 from scylla_util import *
+from multiprocessing import cpu_count

 def get_mode_cpuset(nic, mode):
    mode_cpu_mask = out('/opt/scylladb/scripts/perftune.py --tune net --nic {} --mode {} --get-cpu-mask-quiet'.format(nic, mode))
@@ -97,6 +98,16 @@ def verify_cpu():
                    print('\nIf this is a virtual machine, please update its CPU feature configuration or upgrade to a newer hypervisor.')
                    sys.exit(1)

+def configure_aio_slots():
+    with open('/proc/sys/fs/aio-max-nr') as f:
+        aio_max_nr = int(f.read())
+    # (10000 + 1024 + 2) * ncpus for scylla,
+    # 65536 for other apps
+    required_aio_slots = cpu_count() * 11026 + 65536
+    if aio_max_nr < required_aio_slots:
+        with open('/proc/sys/fs/aio-max-nr', 'w') as f:
+            f.write(str(required_aio_slots))
+
 if __name__ == '__main__':
    verify_cpu()

@@ -114,6 +125,8 @@ if __name__ == '__main__':
        os.remove('/etc/scylla/ami_disabled')
        sys.exit(1)

+    configure_aio_slots()
+
    if mode == 'virtio':
        tap = cfg.get('TAP')
        user = cfg.get('USER')
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -36,7 +36,7 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Configure RAID volume for Scylla.')
    parser.add_argument('--disks', required=True,
                        help='specify disks for RAID')
-    parser.add_argument('--raiddev', default='/dev/md0',
+    parser.add_argument('--raiddev',
                        help='MD device name for RAID')
    parser.add_argument('--enable-on-nextboot', '--update-fstab', action='store_true', default=False,
                        help='mount RAID on next boot')
@@ -73,9 +73,25 @@ if __name__ == '__main__':
            print('{} is busy'.format(disk))
            sys.exit(1)

-    if os.path.exists(args.raiddev):
-        print('{} is already using'.format(args.raiddev))
-        sys.exit(1)
+    if len(disks) == 1 and not args.force_raid:  # single disk and RAID not forced: use the disk itself
+        raid = False
+        fsdev = disks[0]
+    else:
+        raid = True
+        if args.raiddev is None:  # no --raiddev given: probe /dev/md0 .. /dev/md9 in order
+            raiddevs_to_try = [f'/dev/md{i}' for i in range(10)]
+        else:
+            raiddevs_to_try = [args.raiddev, ]
+        for fsdev in raiddevs_to_try:
+            raiddevname = os.path.basename(fsdev)
+            if not os.path.exists(f'/sys/block/{raiddevname}/md/array_state'):  # no md state in sysfs: treat the device as free
+                break  # fsdev keeps the free candidate for the code below
+            print(f'{fsdev} is already using')
+        else:  # for-else: reached only when the loop never hit break, i.e. every candidate is in use
+            if args.raiddev is None:
+                print("Can't find unused /dev/mdX")
+            sys.exit(1)
+        print(f'{fsdev} will be used to setup a RAID')

    if os.path.ismount(mount_at):
        print('{} is already mounted'.format(mount_at))
@@ -94,13 +110,6 @@ if __name__ == '__main__':
    except SystemdException:
        md_service = systemd_unit('mdadm.service')

-    if len(disks) == 1 and not args.force_raid:
-        raid = False
-        fsdev = disks[0]
-    else:
-        raid = True
-        fsdev = args.raiddev
-
    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='RAID0' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    if distro.name() == 'Ubuntu' and distro.version() == '14.04':
        if raid:
@@ -151,7 +160,7 @@ Before=scylla-server.service
 After={after}

 [Mount]
-What=UUID={uuid}
+What=/dev/disk/by-uuid/{uuid}
 Where={mount_at}
 Type=xfs
 Options=noatime
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -92,7 +92,7 @@ def scyllabindir():


 # @param headers dict of k:v
-def curl(url, headers=None, byte=False, timeout=3, max_retries=5):
+def curl(url, headers=None, byte=False, timeout=3, max_retries=5, retry_interval=5):
    retries = 0
    while True:
        try:
@@ -102,9 +102,8 @@ def curl(url, headers=None, byte=False, timeout=3, max_retries=5):
                    return res.read()
                else:
                    return res.read().decode('utf-8')
-        except urllib.error.HTTPError:
-            logging.warning("Failed to grab %s..." % url)
-            time.sleep(5)
+        except urllib.error.URLError:
+            time.sleep(retry_interval)
            retries += 1
            if retries >= max_retries:
                raise
@@ -188,7 +187,7 @@ class gcp_instance:
        """get list of nvme disks from metadata server"""
        import json
        try:
-            disksREST=self.__instance_metadata("disks")
+            disksREST=self.__instance_metadata("disks", True)
            disksobj=json.loads(disksREST)
            nvmedisks=list(filter(self.isNVME, disksobj))
        except Exception as e:
@@ -236,7 +235,8 @@ class gcp_instance:

    def instance_size(self):
        """Returns the size of the instance we are running in. i.e.: 2"""
-        return self.instancetype.split("-")[2]
+        instancetypesplit = self.instancetype.split("-")  # size is read from the third dash-separated field
+        return instancetypesplit[2] if len(instancetypesplit)>2 else 0  # NOTE: fallback is the int 0, while the normal result is a str

    def instance_class(self):
        """Returns the class of the instance we are running in. i.e.: n2"""
@@ -298,22 +298,31 @@ class gcp_instance:
        return self.__firstNvmeSize

    def is_recommended_instance(self):
-        if self.is_recommended_instance_size() and not self.is_unsupported_instance_class() and self.is_supported_instance_class():
+        if not self.is_unsupported_instance_class() and self.is_supported_instance_class() and self.is_recommended_instance_size():
            # at least 1:2GB cpu:ram ratio , GCP is at 1:4, so this should be fine
            if self.cpu/self.memoryGB < 0.5:
-              # 30:1 Disk/RAM ratio must be kept at least(AWS), we relax this a little bit
-              # on GCP we are OK with 50:1 , n1-standard-2 can cope with 1 disk, not more
-              diskCount = self.nvmeDiskCount
-              # to reach max performance for > 16 disks we mandate 32 or more vcpus
-              # https://cloud.google.com/compute/docs/disks/local-ssd#performance
-              if diskCount >= 16 and self.cpu < 32:
-                  return False
-              diskSize= self.firstNvmeSize
-              if diskCount < 1:
-                  return False
-              disktoramratio = (diskCount*diskSize)/self.memoryGB
-              if (disktoramratio <= 50) and (disktoramratio > 0):
-                  return True
+                diskCount = self.nvmeDiskCount
+                # to reach max performance with 16 or more disks we mandate 32 or more vcpus
+                # https://cloud.google.com/compute/docs/disks/local-ssd#performance
+                if diskCount >= 16 and self.cpu < 32:
+                    logging.warning(
+                        "This machine doesn't have enough CPUs for allocated number of NVMEs (at least 32 cpus for >=16 disks). Performance will suffer.")
+                    return False
+                if diskCount < 1:
+                    logging.warning("No ephemeral disks were found.")
+                    return False
+                diskSize = self.firstNvmeSize
+                max_disktoramratio = 105
+                # The 30:1 disk-to-RAM ratio limit used on AWS is relaxed a little here:
+                # on GCP we accept up to max_disktoramratio:1 (n1-standard-2 can cope with 1 disk, not more)
+                disktoramratio = (diskCount * diskSize) / self.memoryGB
+                if (disktoramratio > max_disktoramratio):
+                    logging.warning(
+                        f"Instance disk-to-RAM ratio is {disktoramratio}, which is higher than the recommended ratio {max_disktoramratio}. Performance may suffer.")
+                    return False
+                return True
+            else:
+                logging.warning("At least 2G of RAM per CPU is needed. Performance will suffer.")
        return False

    def private_ipv4(self):
@@ -365,6 +374,8 @@ class aws_instance:
            raise Exception("found more than one disk mounted at root'".format(root_dev_candidates))

        root_dev = root_dev_candidates[0].device
+        if root_dev == '/dev/root':
+            root_dev = run('findmnt -n -o SOURCE /', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        nvmes_present = list(filter(nvme_re.match, os.listdir("/dev")))
        return {"root": [ root_dev ], "ephemeral": [ x for x in nvmes_present if not root_dev.startswith(os.path.join("/dev/", x)) ] }

@@ -398,7 +409,7 @@ class aws_instance:
    def is_aws_instance(cls):
        """Check if it's AWS instance via query to metadata server."""
        try:
-            curl(cls.META_DATA_BASE_URL, max_retries=2)
+            curl(cls.META_DATA_BASE_URL, max_retries=2, retry_interval=1)
            return True
        except (urllib.error.URLError, urllib.error.HTTPError):
            return False
@@ -462,7 +473,7 @@ class aws_instance:

    def ebs_disks(self):
        """Returns all EBS disks"""
-        return set(self._disks["ephemeral"])
+        return set(self._disks["ebs"])

    def public_ipv4(self):
        """Returns the public IPv4 address of this instance"""
@@ -490,9 +501,7 @@ class aws_instance:
        return curl(self.META_DATA_BASE_URL + "user-data")


-# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
 scylla_env = os.environ.copy()
-scylla_env['PATH'] =  '{}:{}'.format(scyllabindir(), scylla_env['PATH'])
 scylla_env['DEBIAN_FRONTEND'] = 'noninteractive'

 def run(cmd, shell=False, silent=False, exception=True):
--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -1,2 +0,0 @@
-# Raise max AIO events
-fs.aio-max-nr = 1048576
--- a/dist/common/sysctl.d/99-scylla-inotify.conf
+++ b/dist/common/sysctl.d/99-scylla-inotify.conf
@@ -0,0 +1,4 @@
+# Allocate enough inotify instances for large machines:
+# each TLS instance needs 1 inotify instance, and there can be
+# multiple TLS instances per shard.
+fs.inotify.max_user_instances = 1200
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,8 +9,8 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
-    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
 fi

 #DEBHELPER#
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/scylla-4.3/latest/scylla.repo
+ARG VERSION=4.3.rc0

 ADD scylla_bashrc /scylla_bashrc

--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -76,13 +76,18 @@ getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sh
 %post server
 /opt/scylladb/scripts/scylla_post_install.sh

-%systemd_post scylla-server.service
+if [ $1 -eq 1 ] ; then
+    /usr/bin/systemctl preset scylla-server.service ||:
+fi

 %preun server
-%systemd_preun scylla-server.service
+if [ $1 -eq 0 ] ; then
+    /usr/bin/systemctl --no-reload disable scylla-server.service ||:
+    /usr/bin/systemctl stop scylla-server.service ||:
+fi

 %postun server
-%systemd_postun scylla-server.service
+/usr/bin/systemctl daemon-reload ||:

 %posttrans server
 if  [ -d /tmp/%{name}-%{version}-%{release} ]; then
@@ -129,10 +134,9 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-helper.slice.d/
 %ghost /etc/systemd/system/scylla-helper.slice.d/memory.conf
-%ghost /etc/systemd/system/scylla-server.service.d/
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
-%ghost /etc/systemd/system/scylla-server.service.d/dependencies.conf
+/etc/systemd/system/scylla-server.service.d/dependencies.conf
 %ghost /etc/systemd/system/var-lib-systemd-coredump.mount
 %ghost /etc/systemd/system/scylla-cpupower.service
 %ghost /etc/systemd/system/var-lib-scylla.mount
@@ -190,6 +194,8 @@ Summary:        Scylla configuration package for the Linux kernel
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Requires:       kmod
+# tuned overwrites our sysctl settings
+Obsoletes:	tuned

 %description kernel-conf
 This package contains Linux kernel configuration changes for the Scylla database.  Install this package
@@ -199,8 +205,8 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -143,6 +143,7 @@ extern const std::string_view LWT;
 extern const std::string_view PER_TABLE_PARTITIONERS;
 extern const std::string_view PER_TABLE_CACHING;
 extern const std::string_view DIGEST_FOR_NULL_VALUES;
+extern const std::string_view ALTERNATOR_STREAMS;

 }

--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -62,6 +62,7 @@ constexpr std::string_view features::LWT = "LWT";
 constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
 constexpr std::string_view features::PER_TABLE_CACHING = "PER_TABLE_CACHING";
 constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";
+constexpr std::string_view features::ALTERNATOR_STREAMS = "ALTERNATOR_STREAMS";

 static logging::logger logger("features");

@@ -86,6 +87,7 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
        , _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
        , _per_table_caching_feature(*this, features::PER_TABLE_CACHING)
        , _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
+        , _alternator_streams_feature(*this, features::ALTERNATOR_STREAMS)
 {}

 feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring> disabled) {
@@ -116,8 +118,8 @@ feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring>
        }
    }

-    if (!cfg.check_experimental(db::experimental_features_t::CDC)) {
-        fcfg._disabled_features.insert(sstring(gms::features::CDC));
+    if (!cfg.check_experimental(db::experimental_features_t::ALTERNATOR_STREAMS)) {
+        fcfg._disabled_features.insert(sstring(gms::features::ALTERNATOR_STREAMS));
    }

    return fcfg;
@@ -187,6 +189,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
        gms::features::UDF,
        gms::features::CDC,
        gms::features::DIGEST_FOR_NULL_VALUES,
+        gms::features::ALTERNATOR_STREAMS,
    };

    for (const sstring& s : _config._disabled_features) {
@@ -266,6 +269,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
        std::ref(_per_table_partitioners_feature),
        std::ref(_per_table_caching_feature),
        std::ref(_digest_for_null_values_feature),
+        std::ref(_alternator_streams_feature),
    })
    {
        if (list.contains(f.name())) {
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -92,6 +92,7 @@ private:
    gms::feature _per_table_partitioners_feature;
    gms::feature _per_table_caching_feature;
    gms::feature _digest_for_null_values_feature;
+    gms::feature _alternator_streams_feature;

 public:
    bool cluster_supports_user_defined_functions() const {
@@ -160,6 +161,10 @@ public:
    bool cluster_supports_lwt() const {
        return bool(_lwt_feature);
    }
+
+    bool cluster_supports_alternator_streams() const {
+        return bool(_alternator_streams_feature); // bool conversion of the feature object registered as features::ALTERNATOR_STREAMS
+    }
 };

 } // namespace gms
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1774,6 +1774,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
                }).handle_exception_type([node, &fall_back_to_syn_msg] (seastar::rpc::unknown_verb_error&) {
                    logger.warn("Node {} does not support get_endpoint_states verb", node);
                    fall_back_to_syn_msg = true;
+                }).handle_exception_type([node, &nodes_down] (seastar::rpc::timeout_error&) {
+                    logger.warn("The get_endpoint_states verb to node {} was timeout", node);
                }).handle_exception_type([node, &nodes_down] (seastar::rpc::closed_error&) {
                    nodes_down++;
                    logger.warn("Node {} is down for get_endpoint_states verb", node);
--- a/install.sh
+++ b/install.sh
@@ -142,11 +142,15 @@ DEBIAN_SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
 if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
  c=\${DEBIAN_SSL_CERT_FILE}
 fi
-PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
+PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
    chmod +x "$install"
 }

+install() { # wrapper: later "install ..." calls in this script resolve to this function
+    command install -Z "$@" # "command" bypasses the function and runs the real install; -Z (GNU coreutils) applies the default SELinux context
+}
+
 installconfig() {
    local perm="$1"
    local src="$2"
@@ -197,13 +201,13 @@ if [ -z "$python3" ]; then
 fi
 rpython3=$(realpath -m "$root/$python3")
 if ! $nonroot; then
-    retc="$root/etc"
-    rsysconfdir="$root/$sysconfdir"
-    rusr="$root/usr"
-    rsystemd="$rusr/lib/systemd/system"
+    retc=$(realpath -m "$root/etc")
+    rsysconfdir=$(realpath -m "$root/$sysconfdir")
+    rusr=$(realpath -m "$root/usr")
+    rsystemd=$(realpath -m "$rusr/lib/systemd/system")
    rdoc="$rprefix/share/doc"
-    rdata="$root/var/lib/scylla"
-    rhkdata="$root/var/lib/scylla-housekeeping"
+    rdata=$(realpath -m "$root/var/lib/scylla")
+    rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
 else
    retc="$rprefix/etc"
    rsysconfdir="$rprefix/$sysconfdir"
@@ -412,6 +416,10 @@ elif ! $packaging; then
    chown -R scylla:scylla $rdata
    chown -R scylla:scylla $rhkdata

+    for file in dist/common/sysctl.d/*.conf; do
+        bn=$(basename "$file")
+        sysctl -p "$rusr"/lib/sysctl.d/"$bn"
+    done
    $rprefix/scripts/scylla_post_install.sh
    echo "Scylla offline install completed."
 fi
--- a/main.cc
+++ b/main.cc
@@ -1023,8 +1023,7 @@ int main(int ac, char** av) {
            proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
                auto& ss = service::get_local_storage_service();
                ss.register_subscriber(&local_proxy);
-                //FIXME: discarded future
-                (void)local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), ss.shared_from_this());
+                return local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), ss.shared_from_this());
            }).get();

            supervisor::notify("starting messaging service");
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -2044,11 +2044,13 @@ public:
        }
    }
    void abort(std::exception_ptr ep) {
-        _end_of_stream = true;
        _ex = std::move(ep);
        if (_full) {
            _full->set_exception(_ex);
            _full.reset();
+        } else if (_not_full) { // no pending _full: fail the pending _not_full instead, if there is one
+            _not_full->set_exception(_ex); // same stored exception as the _full branch
+            _not_full.reset(); // cleared so it is not completed a second time
        }
    }
 };
--- a/mutation_writer/feed_writers.cc
+++ b/mutation_writer/feed_writers.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "feed_writers.hh"
+
+namespace mutation_writer {
+
+bucket_writer::bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
+    : _schema(schema)
+    , _handle(std::move(queue_reader.second)) // handle half of the pair: used by consume()/consume_end_of_stream()/abort()
+    , _consume_fut(consumer(std::move(queue_reader.first))) // consumer is invoked here, during construction, with the reader half
+{ } // the stored _consume_fut is handed out later by close()
+
+bucket_writer::bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+    : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) // delegates to the pair-taking constructor with the result of make_queue_reader()
+{ }
+
+future<> bucket_writer::consume(mutation_fragment mf) {
+    return _handle.push(std::move(mf)); // forwards the fragment to the queue reader handle and returns push()'s future
+}
+
+void bucket_writer::consume_end_of_stream() {
+    _handle.push_end_of_stream(); // only signals end-of-stream; the consumer's future is obtained separately via close()
+}
+
+void bucket_writer::abort(std::exception_ptr ep) noexcept {
+    _handle.abort(std::move(ep)); // passes the exception on to the queue reader handle
+}
+
+future<> bucket_writer::close() noexcept {
+    return std::move(_consume_fut); // moves the stored consumer future out to the caller
+}
+
+} // mutation_writer
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -22,10 +22,31 @@
 #pragma once

 #include "flat_mutation_reader.hh"
+#include "mutation_reader.hh"

 namespace mutation_writer {
 using reader_consumer = noncopyable_function<future<> (flat_mutation_reader)>;

+class bucket_writer { // feeds mutation fragments to a reader_consumer through a queue reader
+    schema_ptr _schema;
+    queue_reader_handle _handle; // handle used to push fragments, end-of-stream or an abort
+    future<> _consume_fut; // future returned by the consumer; handed out by close()
+
+private:
+    bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer);
+
+public:
+    bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer);
+    // Pushes one fragment through the queue reader handle.
+    future<> consume(mutation_fragment mf);
+    // Signals end-of-stream on the handle; does not wait for the consumer.
+    void consume_end_of_stream();
+    // Aborts the queue reader handle with ep.
+    void abort(std::exception_ptr ep) noexcept;
+    // Returns the consumer's future (moved out of this object).
+    future<> close() noexcept;
+};
+
 template <typename Writer>
 requires MutationFragmentConsumer<Writer, future<>>
 future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
@@ -36,8 +57,22 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
                return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
            });
-        }).finally([&wr] {
-            return wr.consume_end_of_stream();
+        }).then_wrapped([&wr] (future<> f) {
+            if (f.failed()) {
+                auto ex = f.get_exception();
+                wr.abort(ex);
+                return wr.close().then_wrapped([ex = std::move(ex)] (future<> f) mutable {
+                    if (f.failed()) {
+                        // The consumer is expected to fail when aborted,
+                        // so just ignore any exception.
+                        (void)f.get_exception();
+                    }
+                    return make_exception_future<>(std::move(ex));
+                });
+            } else {
+                wr.consume_end_of_stream();
+                return wr.close();
+            }
        });
    });
 }
--- a/mutation_writer/shard_based_splitting_writer.cc
+++ b/mutation_writer/shard_based_splitting_writer.cc
@@ -31,33 +31,7 @@
 namespace mutation_writer {

 class shard_based_splitting_mutation_writer {
-    class shard_writer {
-        queue_reader_handle _handle;
-        future<> _consume_fut;
-    private:
-        shard_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
-    public:
-        shard_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : shard_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
-        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            // consume_end_of_stream is always called from a finally block,
-            // and that's because we wait for _consume_fut to return. We
-            // don't want to generate another exception here if the read was
-            // aborted.
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
-    };
+    using shard_writer = bucket_writer;

 private:
    schema_ptr _schema;
@@ -102,12 +76,23 @@ public:
        return write_to_shard(mutation_fragment(*_schema, _permit, std::move(pe)));
    }

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
-            if (!shard) {
-                return make_ready_future<>();
+    void consume_end_of_stream() { // synchronous: only signals end-of-stream, waiting happens in close()
+        for (auto& shard : _shards) {
+            if (shard) { // empty slots in _shards are skipped
+                shard->consume_end_of_stream();
            }
-            return shard->consume_end_of_stream();
+        }
+    }
+    void abort(std::exception_ptr ep) { // propagates ep to every engaged shard writer
+        for (auto&& shard : _shards) {
+            if (shard) { // empty slots in _shards are skipped
+                shard->abort(ep); // ep is copied, so each writer receives the same exception
+            }
+        }
+    }
+    future<> close() noexcept { // closes all shard writers via parallel_for_each
+        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
+            return shard ? shard->close() : make_ready_future<>(); // empty slots contribute an already-ready future
        });
    }
 };
--- a/mutation_writer/timestamp_based_splitting_writer.cc
+++ b/mutation_writer/timestamp_based_splitting_writer.cc
@@ -109,22 +109,12 @@ small_flat_map<Key, Value, Size>::find(const key_type& k) {
 class timestamp_based_splitting_mutation_writer {
    using bucket_id = int64_t;

-    class bucket_writer {
-        schema_ptr _schema;
-        queue_reader_handle _handle;
-        future<> _consume_fut;
+    class timestamp_bucket_writer : public bucket_writer {
        bool _has_current_partition = false;

-    private:
-        bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _schema(std::move(schema))
-            , _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
    public:
-        bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
+        timestamp_bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+            : bucket_writer(schema, std::move(permit), consumer) {
        }
        void set_has_current_partition() {
            _has_current_partition = true;
@@ -135,15 +125,6 @@ class timestamp_based_splitting_mutation_writer {
        bool has_current_partition() const {
            return _has_current_partition;
        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
    };

 private:
@@ -152,7 +133,7 @@ private:
    classify_by_timestamp _classifier;
    reader_consumer _consumer;
    partition_start _current_partition_start;
-    std::unordered_map<bucket_id, bucket_writer> _buckets;
+    std::unordered_map<bucket_id, timestamp_bucket_writer> _buckets;
    std::vector<bucket_id> _buckets_used_for_current_partition;

 private:
@@ -183,9 +164,19 @@ public:
    future<> consume(range_tombstone&& rt);
    future<> consume(partition_end&& pe);

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, bucket_writer>& bucket) {
-            return bucket.second.consume_end_of_stream();
+    void consume_end_of_stream() { // synchronous: only signals end-of-stream, waiting happens in close()
+        for (auto& b : _buckets) {
+            b.second.consume_end_of_stream(); // b.second is the bucket's writer, b.first its bucket_id
+        }
+    }
+    void abort(std::exception_ptr ep) { // propagates ep to every bucket writer
+        for (auto&& b : _buckets) {
+            b.second.abort(ep); // ep is copied, so each writer receives the same exception
+        }
+    }
+    future<> close() noexcept { // closes all bucket writers via parallel_for_each
+        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, timestamp_bucket_writer>& b) {
+            return b.second.close(); // each bucket contributes its writer's close() future
        });
    }
 };
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -542,12 +542,12 @@ partition_snapshot_ptr partition_entry::read(logalloc::region& r,
    return partition_snapshot_ptr(std::move(snp));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones(position_in_partition_view start, position_in_partition_view end)
 {
    partition_version* v = &*version();
    if (!v->next()) {
-        return boost::copy_range<std::vector<range_tombstone>>(
+        return boost::copy_range<range_tombstone_result>(
            v->partition().row_tombstones().slice(*_schema, start, end));
    }
    range_tombstone_list list(*_schema);
@@ -557,10 +557,10 @@ partition_snapshot::range_tombstones(position_in_partition_view start, position_
        }
        v = v->next();
    }
-    return boost::copy_range<std::vector<range_tombstone>>(list.slice(*_schema, start, end));
+    return boost::copy_range<range_tombstone_result>(list.slice(*_schema, start, end));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones()
 {
    return range_tombstones(
--- a/partition_version.hh
+++ b/partition_version.hh
@@ -26,6 +26,7 @@
 #include "utils/anchorless_list.hh"
 #include "utils/logalloc.hh"
 #include "utils/coroutine.hh"
+#include "utils/chunked_vector.hh"

 #include <boost/intrusive/parent_from_member.hpp>
 #include <boost/intrusive/slist.hpp>
@@ -400,10 +401,13 @@ public:
    ::static_row static_row(bool digest_requested) const;
    bool static_row_continuous() const;
    mutation_partition squashed() const;
+
+    using range_tombstone_result = utils::chunked_vector<range_tombstone>;
+
    // Returns range tombstones overlapping with [start, end)
-    std::vector<range_tombstone> range_tombstones(position_in_partition_view start, position_in_partition_view end);
+    range_tombstone_result range_tombstones(position_in_partition_view start, position_in_partition_view end);
    // Returns all range tombstones
-    std::vector<range_tombstone> range_tombstones();
+    range_tombstone_result range_tombstones();
 };

 class partition_snapshot_ptr {
--- a/query-result.hh
+++ b/query-result.hh
@@ -205,6 +205,10 @@ public:
            auto to_block = std::min(_used_memory - _blocked_bytes, n);
            _blocked_bytes += to_block;
            stop = (_limiter->update_and_check(to_block) && _stop_on_global_limit) || stop;
+            if (stop && !_short_read_allowed) {
+                // If we are here we stopped because of the global limit.
+                throw std::runtime_error("Maximum amount of memory for building query results is exhausted, unpaged query cannot be finished");
+            }
        }
        return stop;
    }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -75,7 +75,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _resources;
-    reader_permit::state _state = reader_permit::state::registered;
+    reader_permit::state _state = reader_permit::state::active;

 public:
    struct value_tag {};
@@ -123,22 +123,17 @@ public:
    }

    void on_admission() {
-        _state = reader_permit::state::admitted;
-        _semaphore.consume(_resources);
+        _state = reader_permit::state::active; // only the state changes here; no semaphore call is made on admission
    }

    void consume(reader_resources res) {
        _resources += res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.consume(res);
-        }
+        _semaphore.consume(res); // forwarded to the semaphore unconditionally, whatever _state is
    }

    void signal(reader_resources res) {
        _resources -= res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.signal(res);
-        }
+        _semaphore.signal(res); // forwarded to the semaphore unconditionally, mirroring consume()
    }

    reader_resources resources() const {
@@ -205,14 +200,11 @@ reader_resources reader_permit::consumed_resources() const {

 std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
    switch (s) {
-        case reader_permit::state::registered:
-            os << "registered";
-            break;
        case reader_permit::state::waiting:
            os << "waiting";
            break;
-        case reader_permit::state::admitted:
-            os << "admitted";
+        case reader_permit::state::active:
+            os << "active";
            break;
    }
    return os;
@@ -249,7 +241,7 @@ struct permit_group_key_hash {

 using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;

-static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
+static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
    struct permit_summary {
        const schema* s;
        std::string_view op_name;
@@ -265,25 +257,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
        }
    }

-    std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
-        if (sort_by_memory) {
-            return a.memory < b.memory;
-        } else {
-            return a.count < b.count;
-        }
+    std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
+        return a.memory < b.memory;
    });

    permit_stats total;

-    auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
-        if (sort_by_memory) {
-            fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
-        } else {
-            fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
-        }
+    auto print_line = [&os] (auto col1, auto col2, auto col3) {
+        fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
    };

-    fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
+    fmt::print(os, "Permits with state {}\n", state);
    print_line("count", "memory", "name");
    for (const auto& summary : permit_summaries) {
        total.count += summary.count;
@@ -309,11 +293,9 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_stats total;

    fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
-    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
    fmt::print(os, "\n");
    fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
 }
@@ -374,7 +356,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
 reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
    // Implies _inactive_reads.empty(), we don't queue new readers before
    // evicting all inactive reads.
-    if (_wait_list.empty()) {
+    if (_wait_list.empty() && _resources.memory > 0) {
        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
        (void)_;
        ++_stats.inactive_reads;
@@ -424,13 +406,13 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
 }

 bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
-    return bool(_resources) && _resources >= r;
+    // Special case: when there is no active reader (based on count) admit one
+    // regardless of availability of memory.
+    return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
 }

 bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
-    // Special case: when there is no active reader (based on count) admit one
-    // regardless of availability of memory.
-    return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
+    return _wait_list.empty() && has_available_units(r);
 }

 future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
@@ -480,6 +462,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
    }
 }

+std::string reader_concurrency_semaphore::dump_diagnostics() const {
+    std::ostringstream os;
+    do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
+    return os.str();
+}
+
 // A file that tracks the memory usage of buffers resulting from read
 // operations.
 class tracking_file_impl : public file_impl {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -231,4 +231,6 @@ public:
    }

    void broken(std::exception_ptr ex);
+
+    std::string dump_diagnostics() const;
 };
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -91,9 +91,8 @@ public:
    class resource_units;

    enum class state {
-        registered, // read is registered, but didn't attempt admission yet
        waiting, // waiting for admission
-        admitted,
+        active,
    };

    class impl;
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -509,7 +509,7 @@ public:
    }
 };

-class repair_writer {
+class repair_writer : public enable_lw_shared_from_this<repair_writer> {
    schema_ptr _schema;
    reader_permit _permit;
    uint64_t _estimated_partitions;
@@ -569,6 +569,7 @@ public:
        table& t = db.local().find_column_family(_schema->id());
        auto [queue_reader, queue_handle] = make_queue_reader(_schema, _permit);
        _mq[node_idx] = std::move(queue_handle);
+        auto writer = shared_from_this();
        _writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema, std::move(queue_reader),
                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
            auto& t = db.local().find_column_family(reader.schema());
@@ -598,13 +599,13 @@ public:
                return consumer(std::move(reader));
            });
        },
-        t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
+        t.stream_in_progress()).then([node_idx, writer] (uint64_t partitions) {
            rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
-                _schema->ks_name(), _schema->cf_name(), partitions);
-        }).handle_exception([this, node_idx] (std::exception_ptr ep) {
+                writer->_schema->ks_name(), writer->_schema->cf_name(), partitions);
+        }).handle_exception([node_idx, writer] (std::exception_ptr ep) {
            rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
-                    _schema->ks_name(), _schema->cf_name(), ep);
-            _mq[node_idx]->abort(ep);
+                    writer->_schema->ks_name(), writer->_schema->cf_name(), ep);
+            writer->_mq[node_idx]->abort(ep);
            return make_exception_future<>(std::move(ep));
        });
    }
@@ -718,7 +719,7 @@ private:
    size_t _nr_peer_nodes= 1;
    repair_stats _stats;
    repair_reader _repair_reader;
-    repair_writer _repair_writer;
+    lw_shared_ptr<repair_writer> _repair_writer;
    // Contains rows read from disk
    std::list<repair_row> _row_buf;
    // Contains rows we are working on to sync between peers
@@ -822,7 +823,7 @@ public:
                    _seed,
                    repair_reader::is_local_reader(_repair_master || _same_sharding_config)
              )
-            , _repair_writer(_schema, _permit, _estimated_partitions, _nr_peer_nodes, _reason)
+            , _repair_writer(make_lw_shared<repair_writer>(_schema, _permit, _estimated_partitions, _nr_peer_nodes, _reason))
            , _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
                    [&ms] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
                        return ms.local().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
@@ -855,7 +856,7 @@ public:
        auto f2 = _sink_source_for_get_row_diff.close();
        auto f3 = _sink_source_for_put_row_diff.close();
        return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).discard_result().finally([this] {
-            return _repair_writer.wait_for_writer_done();
+            return _repair_writer->wait_for_writer_done();
        });
    }

@@ -1340,8 +1341,8 @@ private:

    future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
        return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) {
-            return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
-                _repair_writer.create_writer(_db, node_idx);
+            return with_semaphore(_repair_writer->sem(), 1, [this, node_idx, update_buf, &row_diff] {
+                _repair_writer->create_writer(_db, node_idx);
                return repeat([this, node_idx, update_buf, &row_diff] () mutable {
                    if (row_diff.empty()) {
                        return make_ready_future<stop_iteration>(stop_iteration::yes);
@@ -1355,7 +1356,7 @@ private:
                    // to_repair_rows_list above where the repair_row is created.
                    mutation_fragment mf = std::move(r.get_mutation_fragment());
                    auto dk_with_hash = r.get_dk_with_hash();
-                    return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
+                    return _repair_writer->do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
                        row_diff.pop_front();
                        return make_ready_future<stop_iteration>(stop_iteration::no);
                    });
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -1263,7 +1263,9 @@ flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader, row_
 // Assumes reader is in the corresponding partition
 flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
    auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
-    auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
+    auto ckr = with_linearized_managed_bytes([&] {
+        return query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
+    });
    auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
    r.upgrade_schema(rc.schema());
    r.upgrade_schema(reader.schema());
--- a/schema.cc
+++ b/schema.cc
@@ -456,6 +456,9 @@ schema::schema(const schema& o)
    rebuild();
    if (o.is_view()) {
        _view_info = std::make_unique<::view_info>(*this, o.view_info()->raw());
+        if (o.view_info()->base_info()) {
+            _view_info->set_base_info(o.view_info()->base_info());
+        }
    }
 }

@@ -859,7 +862,7 @@ std::ostream& schema::describe(database& db, std::ostream& os) const {
    os << "}";
    os << "\n    AND comment = '" << comment()<< "'";
    os << "\n    AND compaction = {'class': '" <<  sstables::compaction_strategy::name(compaction_strategy()) << "'";
-    map_as_cql_param(os, compaction_strategy_options()) << "}";
+    map_as_cql_param(os, compaction_strategy_options(), false) << "}";
    os << "\n    AND compression = {";
    map_as_cql_param(os,  get_compressor_params().get_options());
    os << "}";
--- a/schema_registry.cc
+++ b/schema_registry.cc
@@ -24,6 +24,7 @@
 #include "schema_registry.hh"
 #include "log.hh"
 #include "db/schema_tables.hh"
+#include "view_info.hh"

 static logging::logger slogger("schema_registry");

@@ -274,22 +275,43 @@ global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
    assert(o._cpu_of_origin == current);
    _ptr = std::move(o._ptr);
    _cpu_of_origin = current;
+    _base_schema = std::move(o._base_schema);
 }

 schema_ptr global_schema_ptr::get() const {
    if (this_shard_id() == _cpu_of_origin) {
        return _ptr;
    } else {
-        // 'e' points to a foreign entry, but we know it won't be evicted
-        // because _ptr is preventing this.
-        const schema_registry_entry& e = *_ptr->registry_entry();
-        schema_ptr s = local_schema_registry().get_or_null(e.version());
-        if (!s) {
-            s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
-                return e.frozen();
-            });
+        auto registered_schema = [](const schema_registry_entry& e) {
+            schema_ptr ret = local_schema_registry().get_or_null(e.version());
+            if (!ret) {
+                ret = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
+                    return e.frozen();
+                });
+            }
+            return ret;
+        };
+
+        schema_ptr registered_bs;
+        // The following code dereferences registry entries that belong to a foreign shard.
+        // However, it is guaranteed to succeed since we made sure in the constructor
+        // that _base_schema and _ptr will have a registry entry on the foreign shard where this
+        // object originated, so as long as this object lives the registry entries live too
+        // and it is safe to reference them on foreign shards.
+        if (_base_schema) {
+            registered_bs = registered_schema(*_base_schema->registry_entry());
+            if (_base_schema->registry_entry()->is_synced()) {
+                registered_bs->registry_entry()->mark_synced();
+            }
        }
-        if (e.is_synced()) {
+        schema_ptr s = registered_schema(*_ptr->registry_entry());
+        if (s->is_view()) {
+            if (!s->view_info()->base_info()) {
+                // we know that registered_bs is valid here because we make sure of it in the constructors.
+                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*registered_bs));
+            }
+        }
+        if (_ptr->registry_entry()->is_synced()) {
            s->registry_entry()->mark_synced();
        }
        return s;
@@ -297,16 +319,33 @@ schema_ptr global_schema_ptr::get() const {
 }

 global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
-    : _ptr([&ptr]() {
-        // _ptr must always have an associated registry entry,
-        // if ptr doesn't, we need to load it into the registry.
-        schema_registry_entry* e = ptr->registry_entry();
+        : _cpu_of_origin(this_shard_id()) {
+    // _ptr must always have an associated registry entry,
+    // if ptr doesn't, we need to load it into the registry.
+    auto ensure_registry_entry = [] (const schema_ptr& s) {
+        schema_registry_entry* e = s->registry_entry();
        if (e) {
-            return ptr;
-        }
-        return local_schema_registry().get_or_load(ptr->version(), [&ptr] (table_schema_version) {
-                return frozen_schema(ptr);
+            return s;
+        } else {
+            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) {
+                return frozen_schema(s);
            });
-        }())
-    , _cpu_of_origin(this_shard_id())
-{ }
+        }
+    };
+
+    schema_ptr s = ensure_registry_entry(ptr);
+    if (s->is_view()) {
+        if (s->view_info()->base_info()) {
+            _base_schema = ensure_registry_entry(s->view_info()->base_info()->base_schema());
+        } else if (ptr->view_info()->base_info()) {
+            _base_schema = ensure_registry_entry(ptr->view_info()->base_info()->base_schema());
+        } else {
+            on_internal_error(slogger, format("Tried to build a global schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
+        }
+
+        if (!s->view_info()->base_info() || !s->view_info()->base_info()->base_schema()->registry_entry()) {
+            s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*_base_schema));
+        }
+    }
+    _ptr = s;
+}
--- a/schema_registry.hh
+++ b/schema_registry.hh
@@ -165,6 +165,7 @@ schema_registry& local_schema_registry();
 // chain will last.
 class global_schema_ptr {
    schema_ptr _ptr;
+    schema_ptr _base_schema;
    unsigned _cpu_of_origin;
 public:
    // Note: the schema_ptr must come from the current shard and can't be nullptr.
--- a/2
+++ b/2
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -53,6 +53,7 @@
 #include "database.hh"
 #include "db/schema_tables.hh"
 #include "types/user.hh"
+#include "db/schema_tables.hh"

 namespace service {

@@ -1096,8 +1097,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
            // referenced by the incoming request.
            // That means the column mapping for the schema should always be inserted
            // with TTL (refresh TTL in case column mapping already existed prior to that).
-            return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
-                return s;
+            auto us = s.unfreeze(db::schema_ctxt(proxy));
+            // if this is a view - we might need to fix its schema before registering it.
+            if (us->is_view()) {
+                auto& db = proxy.local().local_db();
+                schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
+                auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
+                        db::schema_tables::preserve_version::yes);
+                if (fixed_view) {
+                    us = fixed_view;
+                }
+            }
+            return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
+                return frozen_schema{us};
            });
        });
    }).then([] (schema_ptr s) {
@@ -1105,7 +1117,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        // table.
        if (s->is_view()) {
            if (!s->view_info()->base_info()) {
-                auto& db = service::get_local_storage_proxy().get_db().local();
+                auto& db = service::get_local_storage_proxy().local_db();
                // This line might throw a no_such_column_family
                // It should be fine since if we tried to register a view for which
                // we don't know the base table, our registry is broken.
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -3624,6 +3624,11 @@ protected:

 public:
    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
+        if (_targets.empty()) {
+            // We may have no targets to read from if a DC with zero replication is queried with LOCAL_QUORUM.
+            // Return an empty result in this case
+            return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared(query::result())));
+        }
        digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for,
                db::is_datacenter_local(_cl) ? db::count_local_endpoints(_targets): _targets.size(), timeout);
        auto exec = shared_from_this();
@@ -4933,10 +4938,12 @@ void storage_proxy::init_messaging_service() {
            tracing::trace(trace_state_ptr, "read_data: message received from /{}", src_addr.addr);
        }
        auto da = oda.value_or(query::digest_algorithm::MD5);
+        auto sp = get_local_shared_storage_proxy();
        if (!cmd.max_result_size) {
-            cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
+            auto& cfg = sp->_db.local().get_config();
+            cmd.max_result_size.emplace(cfg.max_memory_for_unlimited_query_soft_limit(), cfg.max_memory_for_unlimited_query_hard_limit());
        }
-        return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
+        return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
            p->get_stats().replica_data_reads++;
            auto src_ip = src_addr.addr;
            return get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) {
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -446,6 +446,12 @@ public:
    distributed<database>& get_db() {
        return _db;
    }
+    const database& local_db() const noexcept {
+        return _db.local();
+    }
+    database& local_db() noexcept {
+        return _db.local();
+    }

    void set_cdc_service(cdc::cdc_service* cdc) {
        _cdc = cdc;
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -298,7 +298,7 @@ void storage_service::prepare_to_join(
        _token_metadata.update_normal_tokens(my_tokens, get_broadcast_address());

        _cdc_streams_ts = db::system_keyspace::get_saved_cdc_streams_timestamp().get0();
-        if (!_cdc_streams_ts && db().local().get_config().check_experimental(db::experimental_features_t::CDC)) {
+        if (!_cdc_streams_ts) {
            // We could not have completed joining if we didn't generate and persist a CDC streams timestamp,
            // unless we are restarting after upgrading from non-CDC supported version.
            // In that case we won't begin a CDC generation: it should be done by one of the nodes
@@ -550,7 +550,7 @@ void storage_service::join_token_ring(int delay) {
        assert(should_bootstrap() || db().local().is_replacing() || !_cdc_streams_ts);
    }

-    if (!_cdc_streams_ts && db().local().get_config().check_experimental(db::experimental_features_t::CDC)) {
+    if (!_cdc_streams_ts) {
        // If we didn't choose a CDC streams timestamp at this point, then either
        // 1. we're replacing a node which didn't gossip a CDC streams timestamp for whatever reason,
        // 2. we've already bootstrapped, but are upgrading from a non-CDC version,
@@ -570,10 +570,15 @@ void storage_service::join_token_ring(int delay) {
        if (!db().local().is_replacing()
                && (!db::system_keyspace::bootstrap_complete()
                    || cdc::should_propose_first_generation(get_broadcast_address(), _gossiper))) {
-
-            _cdc_streams_ts = cdc::make_new_cdc_generation(db().local().get_config(),
-                    _bootstrap_tokens, _token_metadata, _gossiper,
-                    _sys_dist_ks.local(), get_ring_delay(), _for_testing);
+            try {
+                _cdc_streams_ts = cdc::make_new_cdc_generation(db().local().get_config(),
+                        _bootstrap_tokens, _token_metadata, _gossiper,
+                        _sys_dist_ks.local(), get_ring_delay(), _for_testing);
+            } catch (...) {
+                cdc_log.warn(
+                    "Could not create a new CDC generation: {}. This may make it impossible to use CDC. Use nodetool checkAndRepairCdcStreams to fix CDC generation",
+                    std::current_exception());
+            }
        }
    }

@@ -893,24 +898,18 @@ void storage_service::bootstrap() {
        // It doesn't hurt: other nodes will (potentially) just do more generation switches.
        // We do this because with this new attempt at bootstrapping we picked a different set of tokens.

-        if (db().local().get_config().check_experimental(db::experimental_features_t::CDC)) {
-            // Update pending ranges now, so we correctly count ourselves as a pending replica
-            // when inserting the new CDC generation.
-            _token_metadata.add_bootstrap_tokens(_bootstrap_tokens, get_broadcast_address());
-            update_pending_ranges().get();
+        // Update pending ranges now, so we correctly count ourselves as a pending replica
+        // when inserting the new CDC generation.
+        _token_metadata.add_bootstrap_tokens(_bootstrap_tokens, get_broadcast_address());
+        update_pending_ranges().get();

-            // After we pick a generation timestamp, we start gossiping it, and we stick with it.
-            // We don't do any other generation switches (unless we crash before complecting bootstrap).
-            assert(!_cdc_streams_ts);
+        // After we pick a generation timestamp, we start gossiping it, and we stick with it.
+        // We don't do any other generation switches (unless we crash before completing bootstrap).
+        assert(!_cdc_streams_ts);

-            _cdc_streams_ts = cdc::make_new_cdc_generation(db().local().get_config(),
-                    _bootstrap_tokens, _token_metadata, _gossiper,
-                    _sys_dist_ks.local(), get_ring_delay(), _for_testing);
-        } else {
-            // We should not be able to join the cluster if other nodes support CDC but we don't.
-            // The check should have been made somewhere in prepare_to_join (`check_knows_remote_features`).
-            assert(!_feature_service.cluster_supports_cdc());
-        }
+        _cdc_streams_ts = cdc::make_new_cdc_generation(db().local().get_config(),
+                _bootstrap_tokens, _token_metadata, _gossiper,
+                _sys_dist_ks.local(), get_ring_delay(), _for_testing);

        _gossiper.add_local_application_state({
            // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
@@ -2036,9 +2035,8 @@ future<> storage_service::start_gossiping(bind_messaging_port do_bind) {
        return seastar::async([&ss, do_bind] {
            if (!ss._initialized) {
                slogger.warn("Starting gossip by operator request");
-                bool cdc_enabled = ss.db().local().get_config().check_experimental(db::experimental_features_t::CDC);
                ss.set_gossip_tokens(db::system_keyspace::get_local_tokens().get0(),
-                        cdc_enabled ? std::make_optional(cdc::get_local_streams_timestamp().get0()) : std::nullopt);
+                        std::make_optional(cdc::get_local_streams_timestamp().get0()));
                ss._gossiper.force_newer_generation();
                ss._gossiper.start_gossiping(utils::get_generation_number(), gms::bind_messaging_port(bool(do_bind))).then([&ss] {
                    ss._initialized = true;
@@ -2338,7 +2336,7 @@ future<> storage_service::rebuild(sstring source_dc) {
                    slogger.info("Streaming for rebuild successful");
                }).handle_exception([] (auto ep) {
                    // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
-                    slogger.warn("Error while rebuilding node: {}", std::current_exception());
+                    slogger.warn("Error while rebuilding node: {}", ep);
                    return make_exception_future<>(std::move(ep));
                });
            });
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -212,16 +212,18 @@ public:
 };

 struct compaction_writer {
+    shared_sstable sst;
    // We use a ptr for pointer stability and so that it can be null
    // when using a noop monitor.
    sstable_writer writer;
    // The order in here is important. A monitor must be destroyed before the writer it is monitoring since it has a
    // periodic timer that checks the writer.
+    // The writer must be destroyed before the shared_sstable since it may depend on the sstable
+    // (as in the mx::writer over compressed_file_data_sink_impl case that depends on sstables::compression).
    std::unique_ptr<compaction_write_monitor> monitor;
-    shared_sstable sst;

    compaction_writer(std::unique_ptr<compaction_write_monitor> monitor, sstable_writer writer, shared_sstable sst)
-        : writer(std::move(writer)), monitor(std::move(monitor)), sst(std::move(sst)) {}
+        : sst(std::move(sst)), writer(std::move(writer)), monitor(std::move(monitor)) {}
    compaction_writer(sstable_writer writer, shared_sstable sst)
        : compaction_writer(nullptr, std::move(writer), std::move(sst)) {}
 };
@@ -609,10 +611,12 @@ private:
                                         std::move(gc_consumer));

            return seastar::async([cfc = std::move(cfc), reader = std::move(reader), this] () mutable {
-                reader.consume_in_thread(std::move(cfc), make_partition_filter(), db::no_timeout);
+                reader.consume_in_thread(std::move(cfc), db::no_timeout);
            });
        });
-        return consumer(make_sstable_reader());
+        // producer will filter out a partition before it reaches the consumer(s)
+        auto producer = make_filtering_reader(make_sstable_reader(), make_partition_filter());
+        return consumer(std::move(producer));
    }

    virtual reader_consumer make_interposer_consumer(reader_consumer end_consumer) {
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -311,6 +311,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
            cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
        } catch (...) {
            cmlog.error("{} failed: {}", name, std::current_exception());
+            throw;
        }
    });
    return task->compaction_done.get_future().then([task] {});
@@ -629,10 +630,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
    _tasks.push_back(task);

    auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
+    auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
    auto sstables_ptr = sstables.get();
    _stats.pending_tasks += sstables->size();

-    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr] () mutable {
+    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr, compacting] () mutable {

        // FIXME: lock cf here
        if (!can_proceed(task)) {
@@ -642,7 +644,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        auto sst = sstables_ptr->back();
        sstables_ptr->pop_back();

-        return repeat([this, task, options, sst = std::move(sst)] () mutable {
+        return repeat([this, task, options, sst = std::move(sst), compacting] () mutable {
            column_family& cf = *task->compacting_cf;
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
@@ -650,21 +652,22 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            auto descriptor = sstables::compaction_descriptor({ sst }, cf.get_sstable_set(), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

-            auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
            // Releases reference to cleaned sstable such that respective used disk space can be freed.
            descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting->release_compacting(exhausted_sstables);
            };

-            _stats.pending_tasks--;
-            _stats.active_tasks++;
-            task->compaction_running = true;
-            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
-                    return cf.run_compaction(std::move(descriptor));
+            return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor)] () mutable {
+                _stats.pending_tasks--;
+                _stats.active_tasks++;
+                task->compaction_running = true;
+                compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
+                return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
+                    return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)]() mutable {
+                        return cf.run_compaction(std::move(descriptor));
+                    });
                });
-            }).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+            }).then_wrapped([this, task, compacting] (future<> f) mutable {
                task->compaction_running = false;
                _stats.active_tasks--;
                if (!can_proceed(task)) {
--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -111,6 +111,7 @@ private:
    std::unordered_map<column_family*, rwlock> _compaction_locks;

    semaphore _custom_job_sem{1};
+    seastar::named_semaphore _rewrite_sstables_sem = {1, named_semaphore_exception_factory{"rewrite sstables"}};

    std::function<void()> compaction_submission_callback();
    // all registered column families are submitted for compaction at a constant interval.
--- a/sstables/kl/writer.cc
+++ b/sstables/kl/writer.cc
@@ -315,8 +315,8 @@ void sstable_writer_k_l::write_collection(file_writer& out, const composite& clu
 void sstable_writer_k_l::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
    auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());

-    maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
    maybe_write_row_tombstone(out, clustering_key, clustered_row);
+    maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);

    _collector.update_min_max_components(clustered_row.key());

--- a/sstables/leveled_compaction_strategy.cc
+++ b/sstables/leveled_compaction_strategy.cc
@@ -178,7 +178,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    unsigned max_filled_level = 0;

-    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
+    size_t offstrategy_threshold = (mode == reshape_mode::strict) ? std::max(schema->min_compaction_threshold(), 4) : std::max(schema->max_compaction_threshold(), 32);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
    auto tolerance = [mode] (unsigned level) -> unsigned {
        if (mode == reshape_mode::strict) {
--- a/sstables/mp_row_consumer.hh
+++ b/sstables/mp_row_consumer.hh
@@ -378,6 +378,7 @@ private:
        _fwd_end = _fwd ? position_in_partition::before_all_clustered_rows() : position_in_partition::after_all_clustered_rows();
        _out_of_range = false;
        _range_tombstones.reset();
+        _ready = {};
        _first_row_encountered = false;
    }
 public:
@@ -1144,7 +1145,11 @@ public:
        setup_for_partition(pk);
        auto dk = dht::decorate_key(*_schema, pk);
        _reader->on_next_partition(std::move(dk), tombstone(deltime));
-        return proceed::yes;
+        // Only the partition start is consumed when processing a large run of partition tombstones,
+        // so stop the consumer if the buffer is full.
+        // Otherwise, partition tombstones would keep accumulating in memory until a fragment of another
+        // type is found that can stop the consumer (there may be none if the sstable is full of tombstones).
+        return proceed(!_reader->is_buffer_full());
    }

    virtual consumer_m::row_processing_result consume_row_start(const std::vector<temporary_buffer<char>>& ecp) override {
--- a/sstables/time_window_compaction_strategy.cc
+++ b/sstables/time_window_compaction_strategy.cc
@@ -162,7 +162,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
    for (auto& pair : all_buckets.first) {
        auto ssts = std::move(pair.second);
        if (ssts.size() > offstrategy_threshold) {
-            ssts.resize(std::min(multi_window.size(), max_sstables));
+            ssts.resize(std::min(ssts.size(), max_sstables));
            compaction_descriptor desc(std::move(ssts), std::optional<sstables::sstable_set>(), iop);
            desc.options = compaction_options::make_reshape();
            return desc;
--- a/sstables/time_window_compaction_strategy.hh
+++ b/sstables/time_window_compaction_strategy.hh
@@ -101,7 +101,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
    time_window_compaction_strategy_options _options;
    int64_t _estimated_remaining_tasks = 0;
    db_clock::time_point _last_expired_check;
-    timestamp_type _highest_window_seen;
+    // As timestamp_type is an int64_t, a primitive type, it must be initialized here.
+    timestamp_type _highest_window_seen = 0;
    // Keep track of all recent active windows that still need to be compacted into a single SSTable
    std::unordered_set<timestamp_type> _recent_active_windows;
    size_tiered_compaction_strategy_options _stcs_options;
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -403,7 +403,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
            try {
                db.find_column_family(ks, cf);
            } catch (no_such_column_family&) {
-                auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", ks, cf);
+                auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", plan_id, ks, cf);
                sslog.warn(err.c_str());
                throw std::runtime_error(err);
            }
--- a/table.cc
+++ b/table.cc
@@ -832,7 +832,7 @@ table::stop() {
        return make_ready_future<>();
    }
    return _async_gate.close().then([this] {
-        return when_all(await_pending_writes(), await_pending_reads(), await_pending_streams()).discard_result().finally([this] {
+        return await_pending_ops().finally([this] {
            return _memtables->request_flush().finally([this] {
                return _compaction_manager.remove(this).then([this] {
                    // Nest, instead of using when_all, so we don't lose any exceptions.
@@ -1532,7 +1532,8 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
 }

 future<> table::flush() {
-    return _memtables->request_flush();
+    auto op = _pending_flushes_phaser.start(); // NOTE(review): presumably registers this flush as a pending operation - confirm against the phaser's definition
+    return _memtables->request_flush().then([op = std::move(op)] {}); // the empty continuation exists only to own op until the flush request's future resolves
 }

 // FIXME: We can do much better than this in terms of cache management. Right
@@ -1550,6 +1551,10 @@ future<> table::flush_streaming_mutations(utils::UUID plan_id, dht::partition_ra
    });
 }

+bool table::can_flush() const {
+    return _memtables->can_flush(); // pure delegation: the answer comes entirely from _memtables
+}
+
 future<> table::clear() {
    if (_commitlog) {
        _commitlog->discard_completed_segments(_schema->id());
--- a/test/alternator/conftest.py
+++ b/test/alternator/conftest.py
@@ -80,7 +80,7 @@ def dynamodb(request):
        verify = not request.config.getoption('https')
        return boto3.resource('dynamodb', endpoint_url=local_url, verify=verify,
            region_name='us-east-1', aws_access_key_id='alternator', aws_secret_access_key='secret_pass',
-            config=botocore.client.Config(retries={"max_attempts": 3}))
+            config=botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300))

@pytest.fixture(scope="session")
 def dynamodbstreams(request):
--- a/test/alternator/run
+++ b/test/alternator/run
@@ -86,7 +86,7 @@ ln -s "$SCYLLA" "$SCYLLA_LINK"
        --alternator-write-isolation=always_use_lwt \
        --alternator-streams-time-window-s=0 \
        --developer-mode=1 \
-        --experimental-features=cdc \
+        --experimental-features=alternator-streams \
        --ring-delay-ms 0 --collectd 0 \
        --smp 2 -m 1G \
        --overprovisioned --unsafe-bypass-fsync 1 \
--- a/test/alternator/test_condition_expression.py
+++ b/test/alternator/test_condition_expression.py
@@ -136,7 +136,7 @@ def test_update_condition_eq_different(test_table_s):
                        ConditionExpression='a = :val2',
                        ExpressionAttributeValues={':val1': val1, ':val2': val2})

-# Also check an actual case of same time, but inequality.
+# Also check an actual case of same type, but inequality.
 def test_update_condition_eq_unequal(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -146,6 +146,13 @@ def test_update_condition_eq_unequal(test_table_s):
            UpdateExpression='SET a = :val1',
            ConditionExpression='a = :oldval',
            ExpressionAttributeValues={':val1': 3, ':oldval': 2})
+    # If the attribute being compared doesn't exist, it's considered a failed
+    # condition, not an error:
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q = :oldval',
+            ExpressionAttributeValues={':val1': 3, ':oldval': 2})

 # Check that set equality is checked correctly. Unlike string equality (for
 # example), it cannot be done with just naive string comparison of the JSON
@@ -269,15 +276,44 @@ def test_update_condition_lt(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a < :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
-    # Trying to compare an unsupported type - e.g., in the following test
-    # a boolean, is unfortunately caught by boto3 and cannot be tested here...
-    #test_table_s.update_item(Key={'p': p},
-    #    AttributeUpdates={'d': {'Value': False, 'Action': 'PUT'}})
-    #with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
-    #    test_table_s.update_item(Key={'p': p},
-    #        UpdateExpression='SET z = :newval',
-    #        ConditionExpression='d < :oldval',
-    #        ExpressionAttributeValues={':newval': 2, ':oldval': True})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator "<="
@@ -341,6 +377,44 @@ def test_update_condition_le(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a <= :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7

 # Test for ConditionExpression with operator ">"
@@ -404,6 +478,44 @@ def test_update_condition_gt(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a > :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator ">="
@@ -467,6 +579,44 @@ def test_update_condition_ge(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a >= :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '0'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7

 # Test for ConditionExpression with ternary operator "BETWEEN" (checking
@@ -548,6 +698,60 @@ def test_update_condition_between(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
            ExpressionAttributeValues={':newval': 2, ':oldval1': '0', ':oldval2': '2'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': b'dog', ':oldval2': b'zebra'})
+    # If an operand comes from the query, and it has a type not supported by
+    # the comparison (e.g., a list), it's not just a failed condition - it is
+    # considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': [1,2], ':oldval2': [2,3]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'},
+                                                             'y': {'Value': [2,3,4], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN x and y',
+            ExpressionAttributeValues={':newval': 2})
+    # If the two operands come from the query (":val" references) then if they
+    # have different types or the wrong order, this is a ValidationException.
+    # But if one or more of the operands come from the item, this only causes
+    # a false condition - not a ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': 2, ':oldval2': 1})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': 2, ':oldval2': 'dog'})
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'two': {'Value': 2, 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN two AND :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval AND two',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 3})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN two AND :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 'dog'})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 9

 # Test for ConditionExpression with multi-operand operator "IN", checking
@@ -605,6 +809,13 @@ def test_update_condition_in(test_table_s):
            UpdateExpression='SET c = :val37',
            ConditionExpression='a IN ()',
            ExpressionAttributeValues=values)
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET c = :val37',
+            ConditionExpression='q IN ({})'.format(','.join(values.keys())),
+            ExpressionAttributeValues=values)

 # Beyond the above operators, there are also test functions supported -
 # attribute_exists, attribute_not_exists, attribute_type, begins_with,
--- a/test/alternator/test_expected.py
+++ b/test/alternator/test_expected.py
@@ -237,6 +237,30 @@ def test_update_expected_1_le(test_table_s):
                            'AttributeValueList': [2, 3]}}
        )

+# Comparison operators like le work only on numbers, strings or bytes.
+# As noted in issue #8043, if any other type is included in *the query*,
+# the result should be a ValidationException, but if the wrong type appears
+# in the item, not the query, the result is a failed condition.
+def test_update_expected_1_le_validation(test_table_s):
+    p = random_string()
+    test_table_s.update_item(Key={'p': p},  # seed the item: 'a' holds a number, 'b' holds a list
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'},
+                          'b': {'Value': [1,2], 'Action': 'PUT'}})
+    # Bad type (a list) in the query. Result is ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'a': {'ComparisonOperator': 'LE',
+                            'AttributeValueList': [[1,2,3]]}}
+        )
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):  # bad type (a list) is in the item ('b'), not the query: the condition just fails
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'b': {'ComparisonOperator': 'LE',
+                            'AttributeValueList': [3]}}
+        )
+    assert not 'z' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']  # neither rejected update may have written 'z'
+
 # Tests for Expected with ComparisonOperator = "LT":
 def test_update_expected_1_lt(test_table_s):
    p = random_string()
@@ -894,6 +918,34 @@ def test_update_expected_1_between(test_table_s):
            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
            Expected={'d': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [set([1]), set([2])]}})

+# BETWEEN works only on numbers, strings or bytes. As noted in issue #8043,
+# if any other type is included in *the query*, the result should be a
+# ValidationException, but if the wrong type appears in the item, not the
+# query, the result is a failed condition.
+# BETWEEN should also generate ValidationException if the two ends of the
+# range are not of the same type or not in the correct order, but this
+# already is tested in the test above (test_update_expected_1_between).
+def test_update_expected_1_between_validation(test_table_s):
+    p = random_string()
+    test_table_s.update_item(Key={'p': p},  # seed the item: 'a' holds a number, 'b' holds a list
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'},
+                          'b': {'Value': [1,2], 'Action': 'PUT'}})
+    # Bad type (a list) in the query. Result is ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'a': {'ComparisonOperator': 'BETWEEN',
+                            'AttributeValueList': [[1,2,3], [2,3,4]]}}
+        )
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):  # valid numeric range in the query, but the item's 'b' is a list: the condition just fails
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'b': {'ComparisonOperator': 'BETWEEN',
+                            'AttributeValueList': [1,2]}}
+        )
+    assert not 'z' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']  # neither rejected update may have written 'z'
+
+
 ##############################################################################
 # Instead of ComparisonOperator and AttributeValueList, one can specify either
 # Value or Exists:
--- a/test/alternator/test_filter_expression.py
+++ b/test/alternator/test_filter_expression.py
@@ -235,6 +235,30 @@ def test_filter_expression_ge(test_table_sn_with_data):
        expected_items = [item for item in items if item[xn] >= xv]
        assert(got_items == expected_items)

+# Comparison operators such as >= or BETWEEN only work on numbers, strings or
+# bytes. When one of an expression's operands comes from the item and has a wrong type
+# (e.g., a list), the result is that the item is skipped - aborting the scan
+# with a ValidationException is a bug (this was issue #8043).
+def test_filter_expression_le_bad_type(test_table_sn_with_data):
+    table, p, _ = test_table_sn_with_data
+    # Put the item attribute 'l' on either side of the comparison.
+    for expr in ['l <= :xv', ':xv <= l']:
+        matched = full_query(table, KeyConditionExpression='p=:p', FilterExpression=expr,
+            ExpressionAttributeValues={':p': p, ':xv': 3})
+        assert matched == []
+
+def test_filter_expression_between_bad_type(test_table_sn_with_data):
+    table, p, _ = test_table_sn_with_data
+    # Each expression puts an item attribute ('l' or 'i') at one end of the
+    # range and the string :xv at the other; none of them may match an item.
+    expressions = ['s between :xv and l',
+                   's between l and :xv',
+                   's between i and :xv']
+    for expr in expressions:
+        matched = full_query(table, KeyConditionExpression='p=:p', FilterExpression=expr,
+            ExpressionAttributeValues={':p': p, ':xv': 'cat'})
+        assert matched == []
+
 # Test the "BETWEEN/AND" ternary operator on a numeric, string and bytes
 # attribute. These keywords are case-insensitive.
 def test_filter_expression_between(test_table_sn_with_data):
@@ -658,7 +682,6 @@ def test_filter_expression_and_sort_key_condition(test_table_sn_with_data):
 # In particular, test that FilterExpression may inspect attributes which will
 # not be returned by the query, because of the ProjectionExpression.
 # This test reproduces issue #6951.
-@pytest.mark.xfail(reason="issue #6951: cannot filter on non-returned attributes")
 def test_filter_expression_and_projection_expression(test_table):
    p = random_string()
    test_table.put_item(Item={'p': p, 'c': 'hi', 'x': 'dog', 'y': 'cat'})
--- a/test/alternator/test_query.py
+++ b/test/alternator/test_query.py
@@ -386,3 +386,38 @@ def test_query_missing_key(test_table):
        full_query(test_table, KeyConditions={})
    with pytest.raises(ClientError, match='ValidationException'):
        full_query(test_table)
+
+# The paging tests above used a numeric sort key. Let's now also test paging
+# with a bytes sort key. We already have above a test that bytes sort keys
+# work and are sorted correctly (test_query_sort_order_bytes), but the
+# following test adds a check that *paging* works correctly for such keys.
+# We used to have a bug in this (issue #7768) - the returned LastEvaluatedKey
+# was incorrectly formatted, breaking the boto3's parsing of the response.
+# Note we only check the case of bytes *sort* keys in this test. For bytes
+# *partition* keys, see test_scan_paging_bytes().
+def test_query_paging_bytes(test_table_sb):
+    p = random_string()
+    items = [{'p': p, 'c': random_bytes()} for _ in range(10)]
+    with test_table_sb.batch_writer() as writer:
+        for entry in items:
+            writer.put_item(entry)
+    key_condition = {'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}
+    # Limit=1 deliberately forces paging even though the partition holds
+    # just 10 items.
+    paged = full_query(test_table_sb, Limit=1, KeyConditions=key_condition)
+    # The returned sort keys must come back in sorted order.
+    expected_sort_keys = sorted(entry['c'] for entry in items)
+    assert [entry['c'] for entry in paged] == expected_sort_keys
+
+# A similar test for string clustering keys
+def test_query_paging_string(test_table_ss):
+    p = random_string()
+    items = [{'p': p, 'c': random_string()} for _ in range(10)]
+    with test_table_ss.batch_writer() as writer:
+        for entry in items:
+            writer.put_item(entry)
+    key_condition = {'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}
+    # Limit=1 forces paging although the partition holds only 10 items.
+    paged = full_query(test_table_ss, Limit=1, KeyConditions=key_condition)
+    expected_sort_keys = sorted(entry['c'] for entry in items)
+    assert [entry['c'] for entry in paged] == expected_sort_keys
--- a/test/alternator/test_query_filter.py
+++ b/test/alternator/test_query_filter.py
@@ -539,7 +539,6 @@ def test_query_filter_paging(test_table_sn_with_data):
 # In particular, test that QueryFilter may inspect attributes which will
 # not be returned by the query, because the AttributesToGet.
 # This test reproduces issue #6951.
-@pytest.mark.xfail(reason="issue #6951: cannot filter on non-returned attributes")
 def test_query_filter_and_attributes_to_get(test_table):
    p = random_string()
    test_table.put_item(Item={'p': p, 'c': 'hi', 'x': 'dog', 'y': 'cat'})
--- a/test/alternator/test_scan.py
+++ b/test/alternator/test_scan.py
@@ -19,7 +19,7 @@

 import pytest
 from botocore.exceptions import ClientError
-from util import random_string, full_scan, full_scan_and_count, multiset
+from util import random_string, random_bytes, full_scan, full_scan_and_count, multiset
 from boto3.dynamodb.conditions import Attr

 # Test that scanning works fine with/without pagination
@@ -264,3 +264,20 @@ def test_scan_parallel_incorrect(filled_test_table):
    for segment in [7, 9]:
        with pytest.raises(ClientError, match='ValidationException.*Segment'):
            full_scan(test_table, TotalSegments=5, Segment=segment)
+
+# We used to have a bug with formatting of LastEvaluatedKey in the response
+# of Query and Scan with bytes keys (issue #7768). In test_query_paging_bytes()
+# (test_query.py) we tested the case of bytes *sort* keys. In the following
+# test we check bytes *partition* keys.
+def test_scan_paging_bytes(test_table_b):
+    # We have no idea what else the table contains, so we do not scan all of
+    # it - and we do not need to. The table only has to hold at least two
+    # items; we then Scan it with Limit=1 and stop after the first page.
+    # Before #7768 was fixed, the test failed when the LastEvaluatedKey in
+    # the response could not be parsed.
+    new_keys = [random_bytes(), random_bytes()]
+    with test_table_b.batch_writer() as writer:
+        for key in new_keys:
+            writer.put_item({'p': key})
+    first_page = test_table_b.scan(ConsistentRead=True, Limit=1)
+    assert 'LastEvaluatedKey' in first_page
--- a/test/alternator/test_system_tables.py
+++ b/test/alternator/test_system_tables.py
@@ -41,8 +41,10 @@ def test_fetch_from_system_tables(scylla_only, dynamodb):

        key_columns = [item['column_name'] for item in col_response['Items'] if item['kind'] == 'clustering' or item['kind'] == 'partition_key']
        qualified_name = "{}{}.{}".format(internal_prefix, ks_name, table_name)
-        response = client.scan(TableName=qualified_name, AttributesToGet=key_columns)
-        print(ks_name, table_name, response)
+        import time
+        start = time.time()
+        response = client.scan(TableName=qualified_name, AttributesToGet=key_columns, Limit=50)
+        print(ks_name, table_name, len(str(response)), time.time()-start)

 def test_block_access_to_non_system_tables_with_virtual_interface(scylla_only, test_table_s, dynamodb):
    client = dynamodb.meta.client
--- a/test/alternator/test_update_expression.py
+++ b/test/alternator/test_update_expression.py
@@ -659,6 +659,24 @@ def test_update_expression_add_numbers(test_table_s):
            UpdateExpression='ADD b :val1',
            ExpressionAttributeValues={':val1': 1})

+# In test_update_expression_add_numbers() above we tested ADDing a number to
+# an existing number. The following test checks that ADD can be used to
+# create a *new* number, as if it was added to zero.
+def test_update_expression_add_numbers_new(test_table_s):
+    # Case 1: the item exists but has no 'b' - ADD creates the attribute.
+    existing = random_string()
+    test_table_s.put_item(Item={'p': existing, 'a': 'hello'})
+    test_table_s.update_item(Key={'p': existing},
+        UpdateExpression='ADD b :val1', ExpressionAttributeValues={':val1': 7})
+    stored = test_table_s.get_item(Key={'p': existing}, ConsistentRead=True)['Item']
+    assert stored['b'] == 7
+    # Case 2: no item exists under the key - ADD creates the whole item.
+    missing = random_string()
+    test_table_s.update_item(Key={'p': missing},
+        UpdateExpression='ADD b :val1', ExpressionAttributeValues={':val1': 8})
+    stored = test_table_s.get_item(Key={'p': missing}, ConsistentRead=True)['Item']
+    assert stored['b'] == 8
+
 # Test "ADD" operation for sets
 def test_update_expression_add_sets(test_table_s):
    p = random_string()
@@ -687,6 +705,24 @@ def test_update_expression_add_sets(test_table_s):
            UpdateExpression='ADD a :val1',
            ExpressionAttributeValues={':val1': 'hello'})

+# In test_update_expression_add_sets() above we tested ADDing elements to an
+# existing set. The following test checks that ADD can be used to create a
+# *new* set, by adding its first item.
+def test_update_expression_add_sets_new(test_table_s):
+    # Case 1: the item exists but has no 'b' - ADD creates the set attribute.
+    existing = random_string()
+    test_table_s.put_item(Item={'p': existing, 'a': 'hello'})
+    test_table_s.update_item(Key={'p': existing},
+        UpdateExpression='ADD b :val1', ExpressionAttributeValues={':val1': set(['dog'])})
+    stored = test_table_s.get_item(Key={'p': existing}, ConsistentRead=True)['Item']
+    assert stored['b'] == set(['dog'])
+    # Case 2: no item exists under the key - ADD creates the whole item.
+    missing = random_string()
+    test_table_s.update_item(Key={'p': missing},
+        UpdateExpression='ADD b :val1', ExpressionAttributeValues={':val1': set(['cat'])})
+    stored = test_table_s.get_item(Key={'p': missing}, ConsistentRead=True)['Item']
+    assert stored['b'] == set(['cat'])
+
 # Test "DELETE" operation for sets
 def test_update_expression_delete_sets(test_table_s):
    p = random_string()
--- a/test/boost/cdc_generation_test.cc
+++ b/test/boost/cdc_generation_test.cc
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE core
+
+#include <boost/test/unit_test.hpp>
+#include <vector>
+
+#include "cdc/generation.hh"
+#include "test/lib/random_utils.hh"
+
+namespace cdc {
+
+size_t limit_of_streams_in_topology_description();
+topology_description limit_number_of_streams_if_needed(topology_description&& desc);
+
+} // namespace cdc
+
+static cdc::topology_description create_description(const std::vector<size_t>& streams_count_per_vnode) {
+    // Builds one token_range_description per vnode. The token counter only
+    // grows, so each entry's stream tokens stay below that entry's end token.
+    std::vector<cdc::token_range_description> entries;
+    entries.reserve(streams_count_per_vnode.size());
+    int64_t next_token = std::numeric_limits<int64_t>::min() + 100;
+    for (size_t vnode = 0; vnode < streams_count_per_vnode.size(); ++vnode) {
+        std::vector<cdc::stream_id> streams(streams_count_per_vnode[vnode]);
+        next_token += 500;
+        for (auto& stream : streams) {
+            stream = cdc::stream_id{dht::token::from_int64(next_token), vnode};
+            next_token += 100;
+        }
+        next_token += 10000;
+        // The sharding_ignore_msb value (12) is arbitrary: it should not
+        // matter for limit_number_of_streams_if_needed.
+        cdc::token_range_description entry{dht::token::from_int64(next_token), std::move(streams), uint8_t{12}};
+        entries.push_back(std::move(entry));
+    }
+    return cdc::topology_description(std::move(entries));
+}
+
+static void assert_streams_count(const cdc::topology_description& desc, const std::vector<size_t>& expected_count) {
+    const auto& entries = desc.entries(); // sizes first, then per-entry stream counts
+    BOOST_REQUIRE_EQUAL(expected_count.size(), entries.size());
+    for (size_t vnode = 0; vnode < expected_count.size(); ++vnode) {
+        BOOST_REQUIRE_EQUAL(expected_count[vnode], entries[vnode].streams.size());
+    }
+}
+
+static void assert_stream_ids_in_right_token_ranges(const cdc::topology_description& desc) {
+    const auto& entries = desc.entries();
+    // The first entry wraps around: its tokens are <= its own end or > the last entry's end.
+    const dht::token last_end = entries.back().token_range_end;
+    const dht::token first_end = entries.front().token_range_end;
+    for (const auto& stream : entries.front().streams) {
+        const dht::token t = stream.token();
+        BOOST_REQUIRE(t <= first_end || last_end < t);
+    }
+    // Every other entry owns (previous range end, own range end].
+    for (size_t idx = 1; idx < entries.size(); ++idx) {
+        const dht::token lower = entries[idx - 1].token_range_end;
+        const dht::token upper = entries[idx].token_range_end;
+        for (const auto& stream : entries[idx].streams) {
+            BOOST_REQUIRE(lower < stream.token());
+            BOOST_REQUIRE(stream.token() <= upper);
+        }
+    }
+}
+
+cdc::stream_id get_stream(const std::vector<cdc::token_range_description>& entries, dht::token tok);
+
+static void assert_random_tokens_mapped_to_streams_with_tokens_in_the_same_token_range(const cdc::topology_description& desc) {
+    using limits = std::numeric_limits<int64_t>;
+    for (size_t attempt = 0; attempt < 100; ++attempt) { // 100 random tokens over the full int64 range
+        const dht::token t = dht::token::from_int64(tests::random::get_int(limits::min(), limits::max()));
+        const auto stream = get_stream(desc.entries(), t);
+        const auto& owner = desc.entries().at(stream.index()); // the stream must be listed by this entry
+        BOOST_REQUIRE(std::find(owner.streams.begin(), owner.streams.end(), stream) != owner.streams.end());
+        if (stream.index() != 0) { // entry 0 is exempt from the bounds check
+            BOOST_REQUIRE(t <= owner.token_range_end);
+            BOOST_REQUIRE(t > desc.entries().at(stream.index() - 1).token_range_end);
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE(test_cdc_generation_limitting_single_vnode_should_not_limit) {
+    // One vnode holding exactly the limit: it must keep all of its streams.
+    const size_t limit = cdc::limit_of_streams_in_topology_description();
+    auto result = cdc::limit_number_of_streams_if_needed(create_description({limit}));
+
+    assert_streams_count(result, {limit});
+    assert_stream_ids_in_right_token_ranges(result);
+    assert_random_tokens_mapped_to_streams_with_tokens_in_the_same_token_range(result);
+}
+
+BOOST_AUTO_TEST_CASE(test_cdc_generation_limitting_single_vnode_should_limit) {
+    // One vnode holding one stream more than the limit: expect exactly the limit back.
+    const size_t limit = cdc::limit_of_streams_in_topology_description();
+    auto result = cdc::limit_number_of_streams_if_needed(create_description({limit + 1}));
+
+    assert_streams_count(result, {limit});
+    assert_stream_ids_in_right_token_ranges(result);
+    assert_random_tokens_mapped_to_streams_with_tokens_in_the_same_token_range(result);
+}
+
+BOOST_AUTO_TEST_CASE(test_cdc_generation_limitting_multiple_vnodes_should_not_limit) {
+    // Vnodes get 1, 2, 3, ... streams while the running total stays within the
+    // limit, so nothing should be trimmed.
+    const size_t limit = cdc::limit_of_streams_in_topology_description();
+    std::vector<size_t> streams_count_per_vnode;
+    size_t total = 0;
+    for (size_t next = 1; total + next <= limit; ++next) {
+        streams_count_per_vnode.push_back(next);
+        total += next;
+    }
+    cdc::topology_description given = create_description(streams_count_per_vnode);
+    cdc::topology_description result = cdc::limit_number_of_streams_if_needed(std::move(given));
+
+    assert_streams_count(result, streams_count_per_vnode);
+    assert_stream_ids_in_right_token_ranges(result);
+    assert_random_tokens_mapped_to_streams_with_tokens_in_the_same_token_range(result);
+}
+
+BOOST_AUTO_TEST_CASE(test_cdc_generation_limitting_multiple_vnodes_should_limit) {
+    // Fill vnodes with 1, 2, 3, ... streams up to the limit, then append one
+    // more vnode that pushes the total exactly one stream over it.
+    const size_t limit = cdc::limit_of_streams_in_topology_description();
+    std::vector<size_t> streams_count_per_vnode;
+    size_t total = 0;
+    for (size_t next = 1; total + next <= limit; ++next) {
+        streams_count_per_vnode.push_back(next);
+        total += next;
+    }
+    streams_count_per_vnode.push_back(limit - total + 1);
+    cdc::topology_description given = create_description(streams_count_per_vnode);
+    cdc::topology_description result = cdc::limit_number_of_streams_if_needed(std::move(given));
+
+    // BOOST_REQUIRE, unlike assert, stays active under NDEBUG and fails only this case.
+    BOOST_REQUIRE(streams_count_per_vnode.size() <= limit);
+    const size_t per_vnode_limit = limit / streams_count_per_vnode.size();
+    for (auto& count : streams_count_per_vnode) {
+        count = std::min(count, per_vnode_limit);
+    }
+    assert_streams_count(result, streams_count_per_vnode);
+    assert_stream_ids_in_right_token_ranges(result);
+    assert_random_tokens_mapped_to_streams_with_tokens_in_the_same_token_range(result);
+}
+
--- a/test/boost/cdc_test.cc
+++ b/test/boost/cdc_test.cc
@@ -42,16 +42,6 @@

 using namespace std::string_literals;

-static cql_test_config mk_cdc_test_config() {
-    auto ext = std::make_shared<db::extensions>();
-    ext->add_schema_extension<cdc::cdc_extension>(cdc::cdc_extension::NAME);
-    auto cfg = ::make_shared<db::config>(std::move(ext));
-    auto features = cfg->experimental_features();
-    features.emplace_back(db::experimental_features_t::CDC);
-    cfg->experimental_features(features);
-    return cql_test_config(std::move(cfg));
-};
-
 namespace cdc {
 api::timestamp_type find_timestamp(const mutation&);
 utils::UUID generate_timeuuid(api::timestamp_type);
@@ -131,7 +121,7 @@ SEASTAR_THREAD_TEST_CASE(test_find_mutation_timestamp) {
        check_stmt("DELETE vut.b FROM t WHERE pk = 0 AND ck = 0");
        check_stmt("DELETE vfut FROM t WHERE pk = 0 AND ck = 0");
        check_stmt("DELETE vstatic FROM t WHERE pk = 0");
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_generate_timeuuid) {
@@ -199,7 +189,7 @@ SEASTAR_THREAD_TEST_CASE(test_with_cdc_parameter) {
        test("WITH cdc = {'enabled':'false'}", "{'enabled':'true'}", "{'enabled':'false'}", {false}, {true}, {false});
        test("", "{'enabled':'true','preimage':'true','postimage':'true','ttl':'1'}", "{'enabled':'false'}", {false}, {true, true, true, 1}, {false});
        test("WITH cdc = {'enabled':'true','preimage':'true','postimage':'true','ttl':'1'}", "{'enabled':'false'}", "{'enabled':'true','preimage':'false','postimage':'true','ttl':'2'}", {true, true, true, 1}, {false}, {true, false, true, 2});
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_detecting_conflict_of_cdc_log_table_with_existing_table) {
@@ -213,7 +203,7 @@ SEASTAR_THREAD_TEST_CASE(test_detecting_conflict_of_cdc_log_table_with_existing_
        e.execute_cql("CREATE TABLE ks.tbl (a int PRIMARY KEY)").get();
        e.require_table_exists("ks", "tbl").get();
        BOOST_REQUIRE_THROW(e.execute_cql("ALTER TABLE ks.tbl WITH cdc = {'enabled': true}").get(), exceptions::invalid_request_exception);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_log_table) {
@@ -247,7 +237,7 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_log_table) {

        // Disallow DROP
        assert_unauthorized("DROP TABLE " + log_table);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_disallow_cdc_on_materialized_view) {
@@ -257,7 +247,7 @@ SEASTAR_THREAD_TEST_CASE(test_disallow_cdc_on_materialized_view) {

        BOOST_REQUIRE_THROW(e.execute_cql("CREATE MATERIALIZED VIEW ks.mv AS SELECT a FROM ks.tbl PRIMARY KEY (a) WITH cdc = {'enabled': true}").get(), exceptions::invalid_request_exception);
        e.require_table_does_not_exist("ks", "mv").get();
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
@@ -285,7 +275,7 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {

        test_table("cdc_streams_descriptions");
        test_table("cdc_generation_descriptions");
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_cdc_log_schema) {
@@ -326,6 +316,7 @@ SEASTAR_THREAD_TEST_CASE(test_cdc_log_schema) {
        // cdc log clustering key
        assert_has_column(cdc::log_meta_column_name("operation"), byte_type);
        assert_has_column(cdc::log_meta_column_name("ttl"), long_type);
+        assert_has_column(cdc::log_meta_column_name("end_of_batch"), boolean_type);

        // pk
        assert_has_column(cdc::log_data_column_name("pk"), int32_type);
@@ -370,7 +361,7 @@ SEASTAR_THREAD_TEST_CASE(test_cdc_log_schema) {

        // Check if we missed something
        BOOST_REQUIRE_EQUAL(required_column_count, log_schema->all_columns_count());
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 static std::vector<std::vector<bytes_opt>> to_bytes(const cql_transport::messages::result_message::rows& rows) {
@@ -512,7 +503,7 @@ SEASTAR_THREAD_TEST_CASE(test_primary_key_logging) {
        // DELETE FROM ks.tbl WHERE pk = 1 AND pk2 = 11
        assert_row(1, 11);
        BOOST_REQUIRE(actual_i == actual_end);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging) {
@@ -534,6 +525,7 @@ SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging) {
            auto val_index = column_index(*rows, cdc::log_data_column_name("val"));
            auto val2_index = column_index(*rows, cdc::log_data_column_name("val2"));
            auto ttl_index = column_index(*rows, cdc::log_meta_column_name("ttl"));
+            auto eor_index = column_index(*rows, cdc::log_meta_column_name("end_of_batch"));

            auto val_type = int32_type;
            auto val = *first[0][val_index];
@@ -567,7 +559,7 @@ SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging) {
                    BOOST_REQUIRE_EQUAL(pre_image.size(), i + 1);

                    val = *pre_image.back()[val_index];
-                    // note: no val2 in pre-image, because we are not modifying it. 
+                    // note: no val2 in pre-image, because we are not modifying it.
                    BOOST_REQUIRE_EQUAL(int32_type->decompose(1111), *pre_image.back()[ck2_index]);
                    BOOST_REQUIRE_EQUAL(data_value(last), val_type->deserialize(bytes_view(val)));
                    BOOST_REQUIRE_EQUAL(bytes_opt(), pre_image.back()[ttl_index]);
@@ -583,10 +575,12 @@ SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging) {
                if (post_enabled) {
                    val = *post_image.back()[val_index];
                    val2 = *post_image.back()[val2_index];
+                    auto eor = *post_image.back()[eor_index];

                    BOOST_REQUIRE_EQUAL(int32_type->decompose(1111), *post_image.back()[ck2_index]);
                    BOOST_REQUIRE_EQUAL(data_value(nv), val_type->deserialize(bytes_view(val)));
                    BOOST_REQUIRE_EQUAL(data_value(22222), val_type->deserialize(bytes_view(val2)));
+                    BOOST_REQUIRE_EQUAL(data_value(true), boolean_type->deserialize(bytes_view(eor)));
                }

                const auto& ttl_cell = second[second.size() - 2][ttl_index];
@@ -608,7 +602,7 @@ SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging) {
                }
            }
        }
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging_static_row) {
@@ -682,7 +676,7 @@ SEASTAR_THREAD_TEST_CASE(test_pre_post_image_logging_static_row) {
        test(true, false);
        test(false, true);
        test(false, false);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_range_deletion) {
@@ -691,7 +685,7 @@ SEASTAR_THREAD_TEST_CASE(test_range_deletion) {
        cquery_nofail(e, "DELETE FROM ks.tbl WHERE pk = 123 AND ck > 1 AND ck < 23");
        cquery_nofail(e, "DELETE FROM ks.tbl WHERE pk = 123 AND ck >= 4 AND ck <= 56");

-        auto msg = e.execute_cql(format("SELECT \"{}\", \"{}\", \"{}\", \"{}\" FROM ks.{}", 
+        auto msg = e.execute_cql(format("SELECT \"{}\", \"{}\", \"{}\", \"{}\" FROM ks.{}",
            cdc::log_meta_column_name("time"),
            cdc::log_data_column_name("pk"),
            cdc::log_data_column_name("ck"),
@@ -726,7 +720,7 @@ SEASTAR_THREAD_TEST_CASE(test_range_deletion) {
        // ck >= 4 AND ck <= 56
        check_row(4, cdc::operation::range_delete_start_inclusive);
        check_row(56, cdc::operation::range_delete_end_inclusive);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_add_columns) {
@@ -750,11 +744,11 @@ SEASTAR_THREAD_TEST_CASE(test_add_columns) {
        auto kokos = *inserts.back()[kokos_index];

        BOOST_REQUIRE_EQUAL(data_value("kaka"), kokos_type->deserialize(bytes_view(kokos)));
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

-// #5582 - just quickly test that we can create the cdc enabled table on a different shard 
-// and still get the logs proper. 
+// #5582 - just quickly test that we can create the cdc enabled table on a different shard
+// and still get the logs proper.
 SEASTAR_THREAD_TEST_CASE(test_cdc_across_shards) {
    do_with_cql_env_thread([](cql_test_env& e) {
        if (smp::count < 2) {
@@ -772,7 +766,7 @@ SEASTAR_THREAD_TEST_CASE(test_cdc_across_shards) {
        auto rows = select_log(e, "tbl");

        BOOST_REQUIRE(!to_bytes_filtered(*rows, cdc::operation::insert).empty());
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_negative_ttl_fail) {
@@ -780,7 +774,7 @@ SEASTAR_THREAD_TEST_CASE(test_negative_ttl_fail) {
        BOOST_REQUIRE_EXCEPTION(e.execute_cql("CREATE TABLE ks.fail (a int PRIMARY KEY, b int) WITH cdc = {'enabled':true,'ttl':'-1'}").get0(),
                exceptions::configuration_exception,
                exception_predicate::message_contains("ttl"));
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_ttls) {
@@ -830,11 +824,11 @@ SEASTAR_THREAD_TEST_CASE(test_ttls) {
                auto cell_ttl_seconds = value_cast<int32_t>(cell_ttl);
                // 30% tolerance in case of slow execution (a little flaky...)
                BOOST_REQUIRE_CLOSE((float)cell_ttl_seconds, (float)ttl_seconds, 30.f);
-            }            
+            }
        };
        test_ttl(0);
        test_ttl(10);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 // helper funcs + structs for collection testing
@@ -851,13 +845,13 @@ struct col_test {
    data_value post = data_value::make_null(int32_type); // whatever
 };

-// iterate a set of updates and verify pre and delta values. 
+// iterate a set of updates and verify pre and delta values.
 static void test_collection(cql_test_env& e, data_type val_type, data_type del_type, std::vector<col_test> tests, translate_func f = [](data_value v) { return v; }) {
    auto col_type = val_type;

    for (auto& t : tests) {
        cquery_nofail(e, t.update);
-        
+
        auto rows = select_log(e, "tbl");
        auto pre_image = to_bytes_filtered(*rows, cdc::operation::pre_image);
        auto updates = to_bytes_filtered(*rows, cdc::operation::update);
@@ -918,7 +912,7 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
        auto map_keys_type = set_type_impl::get_instance(utf8_type, false);

        test_collection(e, map_type, map_keys_type, {
-            { 
+            {
                "UPDATE ks.tbl set val = { 'apa':'ko' } where pk=1 and pk2=11 and ck=111",
                data_value::make_null(map_type), // no previous value
                {
@@ -930,7 +924,7 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
                },
                ::make_map_value(map_type, { { "apa", "ko" } })
            },
-            { 
+            {
                "UPDATE ks.tbl set val = val + { 'ninja':'mission' } where pk=1 and pk2=11 and ck=111",
                ::make_map_value(map_type, { { "apa", "ko" } }),
                {
@@ -941,9 +935,9 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
                },
                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "mission" } })
            },
-            { 
+            {
                "UPDATE ks.tbl set val['ninja'] = 'shuriken' where pk=1 and pk2=11 and ck=111",
-                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "mission" } }), 
+                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "mission" } }),
                {
                    {
                        ::make_map_value(map_type, { { "ninja", "shuriken" } }),
@@ -952,9 +946,9 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
                },
                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "shuriken" } })
            },
-            { 
+            {
                "UPDATE ks.tbl set val['apa'] = null where pk=1 and pk2=11 and ck=111",
-                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "shuriken" } }), 
+                ::make_map_value(map_type, { { "apa", "ko" }, { "ninja", "shuriken" } }),
                {
                    {
                        data_value::make_null(map_type),
@@ -963,9 +957,9 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
                },
                ::make_map_value(map_type, { { "ninja", "shuriken" } })
            },
-            { 
+            {
                "UPDATE ks.tbl set val['ninja'] = null, val['ola'] = 'kokos' where pk=1 and pk2=11 and ck=111",
-                ::make_map_value(map_type, { { "ninja", "shuriken" } }), 
+                ::make_map_value(map_type, { { "ninja", "shuriken" } }),
                {
                    {
                        ::make_map_value(map_type, { { "ola", "kokos" } }),
@@ -974,9 +968,9 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
                },
                ::make_map_value(map_type, { { "ola", "kokos" } })
            },
-            { 
+            {
                "UPDATE ks.tbl set val = { 'bolla':'trolla', 'kork':'skruv' } where pk=1 and pk2=11 and ck=111",
-                ::make_map_value(map_type, { { "ola", "kokos" } }), 
+                ::make_map_value(map_type, { { "ola", "kokos" } }),
                {
                    {
                        ::make_map_value(map_type, { { "bolla", "trolla" }, { "kork", "skruv" } }),
@@ -988,7 +982,7 @@ SEASTAR_THREAD_TEST_CASE(test_map_logging) {
            }

        });
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_set_logging) {
@@ -999,7 +993,7 @@ SEASTAR_THREAD_TEST_CASE(test_set_logging) {
        });

        auto set_type = set_type_impl::get_instance(utf8_type, false);
-        
+
        test_collection(e, set_type, set_type, {
            {
                "UPDATE ks.tbl set val = { 'apa', 'ko' } where pk=1 and pk2=11 and ck=111",
@@ -1026,7 +1020,7 @@ SEASTAR_THREAD_TEST_CASE(test_set_logging) {
            },
            {
                "UPDATE ks.tbl set val = val - { 'apa' } where pk=1 and pk2=11 and ck=111",
-                ::make_set_value(set_type, { "apa", "ko", "mission", "ninja" }), 
+                ::make_set_value(set_type, { "apa", "ko", "mission", "ninja" }),
                {
                    {
                        data_value::make_null(set_type),
@@ -1037,7 +1031,7 @@ SEASTAR_THREAD_TEST_CASE(test_set_logging) {
            },
            {
                "UPDATE ks.tbl set val = val - { 'mission' }, val = val + { 'nils' } where pk=1 and pk2=11 and ck=111",
-                ::make_set_value(set_type, { "ko", "mission", "ninja" }), 
+                ::make_set_value(set_type, { "ko", "mission", "ninja" }),
                {
                    {
                        ::make_set_value(set_type, { "nils" }),
@@ -1059,7 +1053,7 @@ SEASTAR_THREAD_TEST_CASE(test_set_logging) {
                ::make_set_value(set_type, { "bolla", "trolla" })
            }
        });
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_list_logging) {
@@ -1072,11 +1066,11 @@ SEASTAR_THREAD_TEST_CASE(test_list_logging) {
        auto list_type = list_type_impl::get_instance(utf8_type, false);
        auto uuids_type = set_type_impl::get_instance(timeuuid_type, false);
        auto val_type = map_type_impl::get_instance(list_type->name_comparator(), list_type->value_comparator(), false);
-        
+
        test_collection(e, val_type, uuids_type, {
            {
                "UPDATE ks.tbl set val = [ 'apa', 'ko' ] where pk=1 and pk2=11 and ck=111",
-                data_value::make_null(list_type), 
+                data_value::make_null(list_type),
                {
                    {
                        ::make_list_value(list_type, { "apa", "ko" }),
@@ -1121,7 +1115,7 @@ SEASTAR_THREAD_TEST_CASE(test_list_logging) {
            },
            {
                "UPDATE ks.tbl set val[0] = 'babar' where pk=1 and pk2=11 and ck=111",
-                ::make_list_value(list_type, { "apa", "ko", "ninja", "mission" }), 
+                ::make_list_value(list_type, { "apa", "ko", "ninja", "mission" }),
                {
                    {
                        ::make_list_value(list_type, { "babar" }),
@@ -1151,7 +1145,7 @@ SEASTAR_THREAD_TEST_CASE(test_list_logging) {
            }
            return ::make_list_value(list_type, std::move(cpy));
        });
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_udt_logging) {
@@ -1163,7 +1157,7 @@ SEASTAR_THREAD_TEST_CASE(test_udt_logging) {
            e.execute_cql("DROP TYPE ks.mytype").get();
        });

-        auto udt_type = user_type_impl::get_instance("ks", to_bytes("mytype"), 
+        auto udt_type = user_type_impl::get_instance("ks", to_bytes("mytype"),
            { to_bytes("field0"), to_bytes("field1") },
            { int32_type, utf8_type },
            false
@@ -1171,18 +1165,18 @@ SEASTAR_THREAD_TEST_CASE(test_udt_logging) {
        auto index_set_type = set_type_impl::get_instance(short_type, false);
        auto f0_type = int32_type;
        auto f1_type = utf8_type;
-        
+
        auto make_tuple = [&](std::optional<std::optional<int32_t>> i, std::optional<std::optional<sstring>> s) {
            return ::make_user_value(udt_type, {
                i ? ::data_value(*i) : data_value::make_null(f0_type),
                s ? ::data_value(*s) : data_value::make_null(f1_type),
            });
        };
-        
+
        test_collection(e, udt_type, index_set_type, {
            {
                "UPDATE ks.tbl set val = { field0: 12, field1: 'ko' } where pk=1 and pk2=11 and ck=111",
-                data_value::make_null(udt_type), 
+                data_value::make_null(udt_type),
                {
                    {
                        make_tuple(12, "ko"),
@@ -1238,7 +1232,7 @@ SEASTAR_THREAD_TEST_CASE(test_udt_logging) {
                make_tuple(1, "bolla")
            },
        });
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_frozen_logging) {
@@ -1289,7 +1283,7 @@ SEASTAR_THREAD_TEST_CASE(test_frozen_logging) {
        test_frozen("frozen<set<text>>", "{'a', 'bb', 'ccc'}");
        test_frozen("frozen<map<text, text>>", "{'a': 'bb', 'ccc': 'dddd'}");
        test_frozen("frozen<udt>", "{a: 'bb', ccc: 'dddd'}");
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_update_insert_delete_distinction) {
@@ -1321,7 +1315,32 @@ SEASTAR_THREAD_TEST_CASE(test_update_insert_delete_distinction) {

        BOOST_REQUIRE_EQUAL(results[3].size(), 1);
        BOOST_REQUIRE_EQUAL(*results[3].front(), data_value(static_cast<int8_t>(cdc::operation::row_delete)).serialize_nonnull()); // log entry from (3)
-    }, mk_cdc_test_config()).get();
+    }).get();
+}
+
+static std::vector<std::vector<data_value>> get_result(cql_test_env& e,
+        const std::vector<data_type>& col_types, const sstring& query) {
+    auto deser = [] (const data_type& t, const bytes_opt& b) -> data_value {
+        if (!b) {
+            return data_value::make_null(t);
+        }
+        return t->deserialize(*b);
+    };
+
+    auto msg = e.execute_cql(query).get0();
+    auto rows = dynamic_pointer_cast<cql_transport::messages::result_message::rows>(msg);
+    BOOST_REQUIRE(rows);
+
+    std::vector<std::vector<data_value>> res;
+    for (auto&& r: to_bytes(*rows)) {
+        BOOST_REQUIRE_LE(col_types.size(), r.size());
+        std::vector<data_value> res_r;
+        for (size_t i = 0; i < col_types.size(); ++i) {
+            res_r.push_back(deser(col_types[i], r[i]));
+        }
+        res.push_back(std::move(res_r));
+    }
+    return res;
 }

 SEASTAR_THREAD_TEST_CASE(test_change_splitting) {
@@ -1346,28 +1365,8 @@ SEASTAR_THREAD_TEST_CASE(test_change_splitting) {
            return make_set_value(keys_type, std::move(s));
        };

-        auto deser = [] (const data_type& t, const bytes_opt& b) -> data_value {
-            if (!b) {
-                return data_value::make_null(t);
-            }
-            return t->deserialize(*b);
-        };
-
        auto get_result = [&] (const std::vector<data_type>& col_types, const sstring& s) -> std::vector<std::vector<data_value>> {
-            auto msg = e.execute_cql(s).get0();
-            auto rows = dynamic_pointer_cast<cql_transport::messages::result_message::rows>(msg);
-            BOOST_REQUIRE(rows);
-
-            std::vector<std::vector<data_value>> res;
-            for (auto&& r: to_bytes(*rows)) {
-                BOOST_REQUIRE_LE(col_types.size(), r.size());
-                std::vector<data_value> res_r;
-                for (size_t i = 0; i < col_types.size(); ++i) {
-                    res_r.push_back(deser(col_types[i], r[i]));
-                }
-                res.push_back(std::move(res_r));
-            }
-            return res;
+            return ::get_result(e, col_types, s);
        };

        cquery_nofail(e, "create table ks.t (pk int, ck int, s int static, v1 int, v2 int, m map<int, int>, primary key (pk, ck)) with cdc = {'enabled':true}");
@@ -1566,7 +1565,7 @@ SEASTAR_THREAD_TEST_CASE(test_change_splitting) {
            };
            BOOST_REQUIRE_EQUAL(expected, result);
        }
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_batch_with_row_delete) {
@@ -1630,7 +1629,7 @@ SEASTAR_THREAD_TEST_CASE(test_batch_with_row_delete) {
            BOOST_REQUIRE_EQUAL(deser(s_type, r[3]), er[3]);
            BOOST_REQUIRE_EQUAL(deser(oper_type, r[4]), er[4]);
        }
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 struct image_set {
@@ -1939,7 +1938,7 @@ void test_batch_images(bool preimage, bool postimage) {
                }
            }
        }, preimage, postimage);
-    }, mk_cdc_test_config()).get();
+    }).get();
 }

 SEASTAR_THREAD_TEST_CASE(test_batch_pre_image) {
@@ -1953,3 +1952,24 @@ SEASTAR_THREAD_TEST_CASE(test_batch_post_image) {
 SEASTAR_THREAD_TEST_CASE(test_batch_pre_post_image) {
    test_batch_images(true, true);
 }
+
+// Regression test for #7716
+SEASTAR_THREAD_TEST_CASE(test_postimage_with_no_regular_columns) {
+    do_with_cql_env_thread([] (cql_test_env& e) {
+        using oper_ut = std::underlying_type_t<cdc::operation>;
+
+        cquery_nofail(e, "create table ks.t (pk int, ck int, primary key (pk, ck)) with cdc = {'enabled': true, 'postimage': true}");
+        cquery_nofail(e, "insert into ks.t (pk, ck) values (1, 2)");
+
+        auto result = get_result(e,
+            {data_type_for<oper_ut>(), int32_type, int32_type},
+            "select \"cdc$operation\", pk, ck from ks.t_scylla_cdc_log");
+
+        std::vector<std::vector<data_value>> expected = {
+            { oper_ut(cdc::operation::insert), int32_t(1), int32_t(2) },
+            { oper_ut(cdc::operation::post_image), int32_t(1), int32_t(2) },
+        };
+
+        BOOST_REQUIRE_EQUAL(expected, result);
+    }).get();
+}
--- a/test/boost/config_test.cc
+++ b/test/boost/config_test.cc
@@ -931,10 +931,11 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_cdc) {
    auto cfg_ptr = std::make_unique<config>();
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental_features:\n    - cdc\n", throw_on_error);
-    BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::CDC});
-    BOOST_CHECK(cfg.check_experimental(ef::CDC));
+    BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::UNUSED_CDC});
+    BOOST_CHECK(cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }

@@ -943,9 +944,10 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_unused) {
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental_features:\n    - lwt\n", throw_on_error);
    BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::UNUSED});
-    BOOST_CHECK(!cfg.check_experimental(ef::CDC));
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }

@@ -954,9 +956,22 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_udf) {
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental_features:\n    - udf\n", throw_on_error);
    BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::UDF});
-    BOOST_CHECK(!cfg.check_experimental(ef::CDC));
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
+    return make_ready_future();
+}
+
+SEASTAR_TEST_CASE(test_parse_experimental_features_alternator_streams) {
+    auto cfg_ptr = std::make_unique<config>();
+    config& cfg = *cfg_ptr;
+    cfg.read_from_yaml("experimental_features:\n    - alternator-streams\n", throw_on_error);
+    BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::ALTERNATOR_STREAMS});
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
+    BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }

@@ -964,10 +979,11 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_multiple) {
    auto cfg_ptr = std::make_unique<config>();
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental_features:\n    - cdc\n    - lwt\n    - cdc\n", throw_on_error);
-    BOOST_CHECK_EQUAL(cfg.experimental_features(), (features{ef::CDC, ef::UNUSED, ef::CDC}));
-    BOOST_CHECK(cfg.check_experimental(ef::CDC));
+    BOOST_CHECK_EQUAL(cfg.experimental_features(), (features{ef::UNUSED_CDC, ef::UNUSED, ef::UNUSED_CDC}));
+    BOOST_CHECK(cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }

@@ -979,9 +995,10 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_invalid) {
                       [&cfg] (const sstring& opt, const sstring& msg, std::optional<value_status> status) {
                           BOOST_REQUIRE_EQUAL(opt, "experimental_features");
                           BOOST_REQUIRE_NE(msg.find("line 2, column 7"), msg.npos);
-                           BOOST_CHECK(!cfg.check_experimental(ef::CDC));
+                           BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
                           BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
                           BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+                           BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
                       });
    return make_ready_future();
 }
@@ -990,9 +1007,10 @@ SEASTAR_TEST_CASE(test_parse_experimental_true) {
    auto cfg_ptr = std::make_unique<config>();
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental: true", throw_on_error);
-    BOOST_CHECK(cfg.check_experimental(ef::CDC));
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }

@@ -1000,8 +1018,9 @@ SEASTAR_TEST_CASE(test_parse_experimental_false) {
    auto cfg_ptr = std::make_unique<config>();
    config& cfg = *cfg_ptr;
    cfg.read_from_yaml("experimental: false", throw_on_error);
-    BOOST_CHECK(!cfg.check_experimental(ef::CDC));
+    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
    BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
    BOOST_CHECK(!cfg.check_experimental(ef::UDF));
+    BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
    return make_ready_future();
 }
--- a/test/boost/cql_query_large_test.cc
+++ b/test/boost/cql_query_large_test.cc
@@ -128,12 +128,14 @@ SEASTAR_THREAD_TEST_CASE(test_large_data) {
            });
        }).get();

+        // Since deletion of large data entries has been removed,
+        // expect the records to still be present.
        assert_that(e.execute_cql("select partition_key from system.large_rows where table_name = 'tbl' allow filtering;").get0())
            .is_rows()
-            .is_empty();
+            .with_size(1);
        assert_that(e.execute_cql("select partition_key from system.large_cells where table_name = 'tbl' allow filtering;").get0())
            .is_rows()
-            .is_empty();
+            .with_size(1);

        return make_ready_future<>();
    }, cfg).get();
--- a/test/boost/database_test.cc
+++ b/test/boost/database_test.cc
@@ -550,3 +550,71 @@ SEASTAR_THREAD_TEST_CASE(read_max_size) {
        }
    }).get();
 }
+
+// Check that mutation queries, those that are stopped when the memory
+// consumed by their results reaches the local/global limit, are aborted
+// instead of silently terminated when this happens.
+SEASTAR_THREAD_TEST_CASE(unpaged_mutation_read_global_limit) {
+    auto cfg = cql_test_config{};
+    cfg.dbcfg.emplace();
+    // The memory available to the result memory limiter (global limit) is
+    // configured based on the available memory, so give a small amount to
+    // the "node", so we don't have to work with a large amount of data.
+    cfg.dbcfg->available_memory = 2 * 1024 * 1024;
+    do_with_cql_env_thread([] (cql_test_env& e) {
+        e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
+        auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
+
+        auto& db = e.local_db();
+        auto& tab = db.find_column_family("ks", "test");
+        auto s = tab.schema();
+
+        auto pk = make_local_key(s);
+        const auto raw_pk = utf8_type->decompose(data_value(pk));
+        const auto cql3_pk = cql3::raw_value::make_value(raw_pk);
+
+        const auto value = sstring(1024, 'a');
+        const auto raw_value = utf8_type->decompose(data_value(value));
+        const auto cql3_value = cql3::raw_value::make_value(raw_value);
+
+        const int num_rows = 1024;
+        const auto max_size = 1024u * 1024u * 1024u;
+
+        for (int i = 0; i != num_rows; ++i) {
+            const auto cql3_ck = cql3::raw_value::make_value(int32_type->decompose(data_value(i)));
+            e.execute_prepared(id, {cql3_pk, cql3_ck, cql3_value}).get();
+        }
+
+        const auto partition_ranges = std::vector<dht::partition_range>{query::full_partition_range};
+
+        const std::vector<std::pair<sstring, std::function<future<size_t>(schema_ptr, const query::read_command&)>>> query_methods{
+                {"query_mutations()", [&db, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
+                    return db.query_mutations(s, cmd, partition_ranges.front(), {}, db::no_timeout).then(
+                            [] (const std::tuple<reconcilable_result, cache_temperature>& res) {
+                        return std::get<0>(res).memory_usage();
+                    });
+                }},
+                {"query_mutations_on_all_shards()", [&e, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
+                    return query_mutations_on_all_shards(e.db(), s, cmd, partition_ranges, {}, db::no_timeout).then(
+                            [] (const std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>& res) {
+                        return std::get<0>(res)->memory_usage();
+                    });
+                }}
+        };
+
+        for (auto [query_method_name, query_method] : query_methods) {
+            testlog.info("checking: query_method={}", query_method_name);
+            auto slice = s->full_slice();
+            slice.options.remove<query::partition_slice::option::allow_short_read>();
+            query::read_command cmd(s->id(), s->version(), slice, query::max_result_size(max_size));
+            try {
+                auto size = query_method(s, cmd).get0();
+                // Just to ensure we are not interpreting empty results as success.
+                BOOST_REQUIRE(size != 0);
+                BOOST_FAIL("Expected exception, but none was thrown.");
+            } catch (std::runtime_error& e) {
+                testlog.trace("Exception thrown, as expected: {}", e);
+            }
+        }
+    }, std::move(cfg)).get();
+}
--- a/test/boost/extensions_test.cc
+++ b/test/boost/extensions_test.cc
@@ -118,7 +118,6 @@ SEASTAR_TEST_CASE(cdc_schema_extension) {
    // Extensions have to be registered here - config needs to have them before construction of test env.
    ext->add_schema_extension<cdc::cdc_extension>(cdc::cdc_extension::NAME);
    auto cfg = ::make_shared<db::config>(ext);
-    cfg->experimental_features({db::experimental_features_t::feature::CDC});

    return do_with_cql_env([] (cql_test_env& e) {
        auto assert_ext_correctness = [] (cql_test_env& e, cdc::cdc_extension expected_ext) {
--- a/test/boost/multishard_mutation_query_test.cc
+++ b/test/boost/multishard_mutation_query_test.cc
@@ -974,14 +974,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {

        const auto& partitions = pop_desc.partitions;
        smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
-            auto s = gs.get();
-            auto& sem = db->local().get_reader_concurrency_semaphore();
-
-            auto resources = sem.available_resources();
-            resources -= reader_concurrency_semaphore::resources{1, 0};
-            auto permit = sem.make_permit(s.get(), "fuzzy-test");
-
-            return run_fuzzy_test_workload(cfg, *db, std::move(s), partitions).finally([units = permit.consume_resources(resources)] {});
+            return run_fuzzy_test_workload(cfg, *db, gs.get(), partitions);
        }).handle_exception([seed] (std::exception_ptr e) {
            testlog.error("Test workload failed with exception {}."
                    " To repeat this particular run, replace the random seed of the test, with that of this run ({})."
--- a/test/boost/mutation_reader_test.cc
+++ b/test/boost/mutation_reader_test.cc
@@ -894,6 +894,232 @@ sstables::shared_sstable create_sstable(sstables::test_env& env, simple_schema&
        , mutations);
 }

+namespace {
+
+class generic_inactive_read : public reader_concurrency_semaphore::inactive_read {
+    flat_mutation_reader_opt _reader;
+
+private:
+    explicit generic_inactive_read(flat_mutation_reader&& rd) : _reader(std::move(rd)) { }
+
+    virtual void evict() override {
+        _reader = {};
+    }
+
+public:
+    static std::unique_ptr<inactive_read> make(flat_mutation_reader&& rd) {
+        return std::make_unique<generic_inactive_read>(generic_inactive_read(std::move(rd)));
+    }
+
+    static flat_mutation_reader_opt get_reader(std::unique_ptr<inactive_read>&& ir) {
+        if (!ir) {
+            return {};
+        }
+        auto gir = dynamic_cast<generic_inactive_read*>(ir.get());
+        BOOST_REQUIRE(gir);
+        return std::move(gir->_reader);
+    }
+};
+
+} // anonymous namespace
+
+// This unit test passes a read through admission again and again, just
+// as an evictable reader would be during its lifetime. When readmitted,
+// the read sometimes has to wait and sometimes not. This is to check that
+// readmitting a previously admitted reader doesn't leak any units.
+SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
+    simple_schema s;
+    const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
+    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
+
+    auto permit = semaphore.make_permit(s.schema().get(), get_name());
+
+    std::optional<reader_permit::resource_units> residue_units;
+
+    for (int i = 0; i < 10; ++i) {
+        const auto have_residue_units = bool(residue_units);
+
+        auto current_resources = initial_resources;
+        if (have_residue_units) {
+            current_resources -= residue_units->resources();
+        }
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        std::optional<reader_permit::resource_units> admitted_units;
+        if (i % 2) {
+            const auto consumed_resources = semaphore.available_resources();
+            semaphore.consume(consumed_resources);
+
+            auto units_fut = permit.wait_admission(1024, db::no_timeout);
+            BOOST_REQUIRE(!units_fut.available());
+
+            semaphore.signal(consumed_resources);
+            admitted_units = units_fut.get();
+        } else {
+            admitted_units = permit.wait_admission(1024, db::no_timeout).get();
+        }
+
+        current_resources -= admitted_units->resources();
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
+        if (!have_residue_units) {
+            current_resources -= residue_units->resources();
+        }
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        auto handle = semaphore.register_inactive_read(generic_inactive_read::make(make_empty_flat_reader(s.schema(), permit)));
+        (void)handle;
+        BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
+    }
+
+    BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
+
+    residue_units.reset();
+
+    BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
+}
+
+// This unit test checks that the semaphore doesn't get into a deadlock
+// when contended, in the presence of many memory-only reads (that don't
+// wait for admission). This is tested by simulating the 3 kinds of reads we
+// currently have in the system:
+// * memory-only: reads that don't pass admission and only own memory.
+// * admitted: reads that pass admission.
+// * evictable: admitted reads that are furthermore evictable.
+//
+// The test creates and runs a large number of these reads in parallel,
+// read kinds being selected randomly, then creates a watchdog which
+// kills the test if no progress is being made.
+SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
+    class reader {
+        class skeleton_reader : public flat_mutation_reader::impl {
+            reader_permit::resource_units _base_resources;
+            std::optional<reader_permit::resource_units> _resources;
+        public:
+            skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
+                : impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
+            virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
+                _resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
+                return make_ready_future<>();
+            }
+            virtual void next_partition() override { }
+            virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
+            virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
+        };
+        struct reader_visitor {
+            reader& r;
+            future<> operator()(std::monostate& ms) { return r.tick(ms); }
+            future<> operator()(flat_mutation_reader& reader) { return r.tick(reader); }
+            future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
+        };
+
+    private:
+        schema_ptr _schema;
+        reader_permit _permit;
+        bool _memory_only = true;
+        bool _evictable = false;
+        std::optional<reader_permit::resource_units> _units;
+        std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
+
+    private:
+        future<> make_reader() {
+          return async([this] {
+            auto res = _permit.consume_memory();
+            if (!_memory_only) {
+                res = _permit.wait_admission(1024, db::no_timeout).get0();
+            }
+            _reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
+          });
+        }
+        future<> tick(std::monostate&) {
+          return async([this] {
+            make_reader().get();
+            tick(std::get<flat_mutation_reader>(_reader)).get();
+          });
+        }
+        future<> tick(flat_mutation_reader& reader) {
+          return async([this, &reader] {
+            reader.fill_buffer(db::no_timeout).get();
+            if (_evictable) {
+                _reader = _permit.semaphore().register_inactive_read(generic_inactive_read::make(std::move(reader)));
+            }
+          });
+        }
+        future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
+          return async([this, &handle] () mutable {
+            if (auto reader = generic_inactive_read::get_reader(_permit.semaphore().unregister_inactive_read(std::move(handle))); reader) {
+                _reader = std::move(*reader);
+            } else {
+                make_reader().get();
+            }
+            tick(std::get<flat_mutation_reader>(_reader)).get();
+          });
+        }
+
+    public:
+        reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
+            : _schema(std::move(s))
+            , _permit(std::move(permit))
+            , _memory_only(memory_only)
+            , _evictable(evictable)
+            , _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
+        {
+        }
+        future<> tick() {
+            return std::visit(reader_visitor{*this}, _reader);
+        }
+    };
+
+    const auto count = 10;
+    const auto num_readers = 512;
+    const auto ticks = 1000;
+
+    simple_schema s;
+    reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
+
+    std::list<std::optional<reader>> readers;
+    unsigned nr_memory_only = 0;
+    unsigned nr_admitted = 0;
+    unsigned nr_evictable = 0;
+
+    for (auto i = 0; i <  num_readers; ++i) {
+        const auto memory_only = tests::random::get_bool();
+        const auto evictable = !memory_only && tests::random::get_bool();
+        if (memory_only) {
+            ++nr_memory_only;
+        } else if (evictable) {
+            ++nr_evictable;
+        } else {
+            ++nr_admitted;
+        }
+        readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
+    }
+
+    testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
+
+    bool watchdog_touched = false;
+    auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
+        if (!watchdog_touched) {
+            testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
+            semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
+        }
+        watchdog_touched = false;
+    });
+    watchdog.arm_periodic(std::chrono::seconds(30));
+
+    parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
+      return async([this, &watchdog_touched, &r] {
+        for (auto i = 0; i < ticks; ++i) {
+            watchdog_touched = true;
+            r->tick().get();
+        }
+        r.reset();
+        watchdog_touched = true;
+      });
+    }).get();
+}
+
 static
 sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, std::vector<mutation> mutations) {
    static thread_local auto tmp = tmpdir();
@@ -2715,7 +2941,7 @@ SEASTAR_THREAD_TEST_CASE(test_queue_reader) {
        }
    }

-    // abort()
+    // abort() -- check that consumer is aborted
    {
        auto [reader, handle] = make_queue_reader(gen.schema(), tests::make_permit());
        auto fill_buffer_fut = reader.fill_buffer(db::no_timeout);
@@ -2730,6 +2956,28 @@ SEASTAR_THREAD_TEST_CASE(test_queue_reader) {

        BOOST_REQUIRE_THROW(fill_buffer_fut.get(), std::runtime_error);
        BOOST_REQUIRE_THROW(handle.push(mutation_fragment(*gen.schema(), tests::make_permit(), partition_end{})).get(), std::runtime_error);
+        BOOST_REQUIRE(!reader.is_end_of_stream());
+    }
+
+    // abort() -- check that producer is aborted
+    {
+        auto [reader, handle] = make_queue_reader(gen.schema(), tests::make_permit());
+        reader.set_max_buffer_size(1);
+
+        auto expected_reader = flat_mutation_reader_from_mutations(tests::make_permit(), expected_muts);
+
+        auto push_fut = make_ready_future<>();
+        while (push_fut.available()) {
+            push_fut = handle.push(std::move(*expected_reader(db::no_timeout).get0()));
+        }
+
+        BOOST_REQUIRE(!push_fut.available());
+
+        handle.abort(std::make_exception_ptr<std::runtime_error>(std::runtime_error("error")));
+
+        BOOST_REQUIRE_THROW(reader.fill_buffer(db::no_timeout).get(), std::runtime_error);
+        BOOST_REQUIRE_THROW(push_fut.get(), std::runtime_error);
+        BOOST_REQUIRE(!reader.is_end_of_stream());
    }

    // Detached handle
--- a/test/boost/mutation_writer_test.cc
+++ b/test/boost/mutation_writer_test.cc
@@ -166,7 +166,7 @@ SEASTAR_TEST_CASE(test_multishard_writer_producer_aborts) {

 namespace {

-class bucket_writer {
+class test_bucket_writer {
    schema_ptr _schema;
    classify_by_timestamp _classify;
    std::unordered_map<int64_t, std::vector<mutation>>& _buckets;
@@ -175,6 +175,17 @@ class bucket_writer {
    mutation_opt _current_mutation;
    bool _is_first_mutation = true;

+    size_t _throw_after;
+    size_t _mutation_consumed = 0;
+
+public:
+    // Exception type thrown by maybe_throw(); the abort test catches exactly
+    // this type to recognise the failure it injected.
+    class expected_exception : public std::exception {
+    public:
+        const char* what() const noexcept override { return "expected_exception"; }
+    };
+
 private:
    void check_timestamp(api::timestamp_type ts) {
        const auto bucket_id = _classify(ts);
@@ -223,40 +234,53 @@ private:
        check_timestamp(rt.tomb.timestamp);
    }

+    // Lets the first `_throw_after` calls pass, then throws on every later one.
+    void maybe_throw() {
+        const bool within_budget = _mutation_consumed++ < _throw_after;
+        if (!within_budget) { throw expected_exception(); }
+    }
+
 public:
-    bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets)
+    test_bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets, size_t throw_after = std::numeric_limits<size_t>::max())
        : _schema(std::move(schema))
        , _classify(std::move(classify))
-        , _buckets(buckets) {
-    }
+        , _buckets(buckets)
+        , _throw_after(throw_after)
+    { }
    void consume_new_partition(const dht::decorated_key& dk) {
+        maybe_throw();
        BOOST_REQUIRE(!_current_mutation);
        _current_mutation = mutation(_schema, dk);
    }
    void consume(tombstone partition_tombstone) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_partition_tombstone(partition_tombstone);
        _current_mutation->partition().apply(partition_tombstone);
    }
    stop_iteration consume(static_row&& sr) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_static_row(sr);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(sr)));
        return stop_iteration::no;
    }
    stop_iteration consume(clustering_row&& cr) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_clustering_row(cr);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(cr)));
        return stop_iteration::no;
    }
    stop_iteration consume(range_tombstone&& rt) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_range_tombstone(rt);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(rt)));
        return stop_iteration::no;
    }
    stop_iteration consume_end_of_partition() {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        BOOST_REQUIRE(_bucket_id);
        auto& bucket = _buckets[*_bucket_id];
@@ -311,7 +335,7 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {

    auto consumer = [&] (flat_mutation_reader bucket_reader) {
        return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& rd) {
-            return rd.consume(bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
+            return rd.consume(test_bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
        });
    };

@@ -342,3 +366,53 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {
    }

 }
+
+// Runs segregate_by_timestamp() with bucket consumers that start throwing
+// expected_exception after a random number of consume calls.
+SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer_abort) {
+    auto random_spec = tests::make_random_schema_specification(
+            get_name(),
+            std::uniform_int_distribution<size_t>(1, 4),
+            std::uniform_int_distribution<size_t>(2, 4),
+            std::uniform_int_distribution<size_t>(2, 8),
+            std::uniform_int_distribution<size_t>(2, 8));
+    auto random_schema = tests::random_schema{tests::random::get_int<uint32_t>(), *random_spec};
+    testlog.info("Random schema:\n{}", random_schema.cql());
+
+    // Tombstone and row-marker timestamps become api::missing_timestamp unless get_int(0, 10) yields 0; all else defers to `underlying`.
+    auto ts_gen = [&, underlying = tests::default_timestamp_generator()] (std::mt19937& engine,
+            tests::timestamp_destination ts_dest, api::timestamp_type min_timestamp) -> api::timestamp_type {
+        if (ts_dest == tests::timestamp_destination::partition_tombstone ||
+                ts_dest == tests::timestamp_destination::row_marker ||
+                ts_dest == tests::timestamp_destination::row_tombstone ||
+                ts_dest == tests::timestamp_destination::collection_tombstone) {
+            if (tests::random::get_int<int>(0, 10, engine)) {
+                return api::missing_timestamp;
+            }
+        }
+        return underlying(engine, ts_dest, min_timestamp);
+    };
+    auto muts = tests::generate_random_mutations(random_schema, ts_gen).get0();
+
+    auto classify_fn = [] (api::timestamp_type ts) {
+        return int64_t(ts % 2);
+    };
+    std::unordered_map<int64_t, std::vector<mutation>> buckets;
+    // size_t matches the writer's throw_after parameter. NOTE(review): muts.size() - 1 wraps if muts is empty -- confirm it never is.
+    const size_t throw_after = tests::random::get_int<size_t>(muts.size() - 1);
+    testlog.info("Will raise exception after {}/{} mutations", throw_after, muts.size());
+    auto consumer = [&] (flat_mutation_reader bucket_reader) {
+        return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& rd) {
+            return rd.consume(test_bucket_writer(random_schema.schema(), classify_fn, buckets, throw_after), db::no_timeout);
+        });
+    };
+
+    try {
+        segregate_by_timestamp(flat_mutation_reader_from_mutations(tests::make_permit(), muts), classify_fn, std::move(consumer)).get();
+    } catch (const test_bucket_writer::expected_exception&) {
+        BOOST_TEST_PASSPOINT();
+    } catch (const seastar::broken_promise&) {
+        // Tolerated until we properly abort readers
+        BOOST_TEST_PASSPOINT();
+    }
+}
--- a/Show More
+++ b/Show More