mutation_writer: feed_writer(): handle exceptions from consume_end_of_stream()

Currently the exception handling code of feed_writer() assumes consume_end_of_stream() doesn't throw. This is false, and an exception from said method can currently lead to an unclean destruction of the writer and reader. Fix by also handling exceptions from consume_end_of_stream(). Closes #10147 (cherry picked from commit 1963d1cc25)
release: prepare for 4.4.9
2022-03-03 10:45:40 +01:00 · 2022-02-16 14:24:54 +02:00 · 2022-02-03 18:40:12 +02:00 · 2022-01-30 20:08:43 +02:00 · 2022-01-30 11:00:21 +02:00 · 2022-01-27 10:27:45 +02:00
181 changed files with 5518 additions and 1809 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -498,6 +498,7 @@ set(scylla_sources
    mutation_writer/multishard_writer.cc
    mutation_writer/shard_based_splitting_writer.cc
    mutation_writer/timestamp_based_splitting_writer.cc
+    mutation_writer/feed_writers.cc
    partition_slice_builder.cc
    partition_version.cc
    querier.cc
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.4.dev
+VERSION=4.4.9

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -137,45 +137,107 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
    }
    return true;
 }
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
+        return false;
+    }
+    auto it1 = list1.Begin();
+    auto it2 = list2.Begin();
+    while (it1 != list1.End()) {
+        // Note: Alternator limits an item's depth (rjson::parse() limits
+        // it to around 37 levels), so this recursion is safe.
+        if (!check_EQ(&*it1, *it2)) {
+            return false;
+        }
+        ++it1;
+        ++it2;
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
+            return false;
+        }
+    }
+    return true;
+}

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error::validation(format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -279,24 +341,40 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v->IsNull()), this
+// check returns false.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -310,7 +388,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -341,56 +420,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error::validation(
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error::validation(
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error::validation(
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -437,19 +531,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -461,7 +555,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -573,7 +668,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -604,13 +700,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -202,7 +202,7 @@ static schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& r
    if (!schema) {
        // if we get here then the name was missing, since syntax or missing actual CF 
        // checks throw. Slow path, but just call get_table_name to generate exception. 
-        get_table_name(request);        
+        get_table_name(request);
    }
    return schema;
 }
@@ -1882,18 +1882,182 @@ static std::string get_item_type_string(const rjson::value& v) {
    return it->name.GetString();
 }

+// attrs_to_get saves for each top-level attribute an attrs_to_get_node,
+// a hierarchy of subparts that need to be kept. The following function
+// takes a given JSON value and drops its parts which weren't asked to be
+// kept. It modifies the given JSON value, or returns false to signify that
+// the entire object should be dropped.
+// Note that the JSON value is assumed to be encoded using the DynamoDB
+// conventions - i.e., it is really a map whose key has a type string,
+// and the value is the real object.
+template<typename T>
+static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>& h) {
+    if (!val.IsObject() || val.MemberCount() != 1) {
+        // This shouldn't happen. We shouldn't have stored malformed objects.
+        // But today Alternator does not validate the structure of nested
+        // documents before storing them, so this can happen on read.
+        throw api_error::internal(format("Malformed value object read: {}", val));
+    }
+    const char* type = val.MemberBegin()->name.GetString();
+    rjson::value& v = val.MemberBegin()->value;
+    if (h.has_members()) {
+        const auto& members = h.get_members();
+        if (type[0] != 'M' || !v.IsObject()) {
+            // If v is not an object (dictionary, map), none of the members
+            // can match.
+            return false;
+        }
+        rjson::value newv = rjson::empty_object();
+        for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
+            std::string attr = it->name.GetString();
+            auto x = members.find(attr);
+            if (x != members.end()) {
+                if (x->second) {
+                    // Only a part of this attribute is to be filtered, do it.
+                    if (hierarchy_filter(it->value, *x->second)) {
+                        rjson::set_with_string_name(newv, attr, std::move(it->value));
+                    }
+                } else {
+                    // The entire attribute is to be kept
+                    rjson::set_with_string_name(newv, attr, std::move(it->value));
+                }
+            }
+        }
+        if (newv.MemberCount() == 0) {
+            return false;
+        }
+        v = newv;
+    } else if (h.has_indexes()) {
+        const auto& indexes = h.get_indexes();
+        if (type[0] != 'L' || !v.IsArray()) {
+            return false;
+        }
+        rjson::value newv = rjson::empty_array();
+        const auto& a = v.GetArray();
+        for (unsigned i = 0; i < v.Size(); i++) {
+            auto x = indexes.find(i);
+            if (x != indexes.end()) {
+                if (x->second) {
+                    if (hierarchy_filter(a[i], *x->second)) {
+                        rjson::push_back(newv, std::move(a[i]));
+                    }
+                } else {
+                    // The entire attribute is to be kept
+                    rjson::push_back(newv, std::move(a[i]));
+                }
+            }
+        }
+        if (newv.Size() == 0) {
+            return false;
+        }
+        v = newv;
+    }
+    return true;
+}
+
+// Add a path to an attribute_path_map. Throws a validation error if the path
+// "overlaps" with one already in the filter (one is a sub-path of the other)
+// or "conflicts" with it (both a member and an index are requested).
+template<typename T>
+void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const parsed::path& p, T value = {}) {
+   using node = attribute_path_map_node<T>;
+    // The first step is to look for the top-level attribute (p.root()):
+    auto it = map.find(p.root());
+    if (it == map.end()) {
+        if (p.has_operators()) {
+            it = map.emplace(p.root(), node {std::nullopt}).first;
+        } else {
+            (void) map.emplace(p.root(), node {std::move(value)}).first;
+            // Value inserted for top-level node. We're done.
+            return;
+        }
+    } else if(!p.has_operators()) {
+        // If p is top-level and we already have it or a part of it
+        // in map, it's a forbidden overlapping path.
+        throw api_error::validation(format(
+            "Invalid {}: two document paths overlap at {}", source, p.root()));
+    } else if (it->second.has_value()) {
+        // If we're here, it != map.end() && p.has_operators && it->second.has_value().
+        // This means the top-level attribute already has a value, and we're
+        // trying to add a non-top-level value. It's an overlap.
+        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p.root()));
+    }
+    node* h = &it->second;
+    // The second step is to walk h from the top-level node to the inner node
+    // where we're supposed to insert the value:
+    for (const auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (h->is_empty()) {
+                    *h = node {typename node::members_t()};
+                } else if (h->has_indexes()) {
+                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
+                } else if (h->has_value()) {
+                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+                }
+                typename node::members_t& members = h->get_members();
+                auto it = members.find(member);
+                if (it == members.end()) {
+                    it = members.insert({member, make_shared<node>()}).first;
+                }
+                h = it->second.get();
+            },
+            [&] (unsigned index) {
+                if (h->is_empty()) {
+                    *h = node {typename node::indexes_t()};
+                } else if (h->has_members()) {
+                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
+                } else if (h->has_value()) {
+                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+                }
+                typename node::indexes_t& indexes = h->get_indexes();
+                auto it = indexes.find(index);
+                if (it == indexes.end()) {
+                    it = indexes.insert({index, make_shared<node>()}).first;
+                }
+                h = it->second.get();
+            }
+        }, op);
+    }
+    // Finally, insert the value in the node h.
+    if (h->is_empty()) {
+        *h = node {std::move(value)};
+    } else {
+        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+    }
+}
+
+// A very simplified version of the above function for the special case of
+// adding only a top-level attribute. It's not only simpler, we also use a
+// different error message, referring to a "duplicate attribute" instead of
+// "overlapping paths". DynamoDB also has this distinction (errors in
+// AttributesToGet refer to duplicates, not overlaps, but errors in
+// ProjectionExpression refer to overlap - even if it's an exact duplicate).
+template<typename T>
+void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const std::string& attr, T value = {}) {
+   using node = attribute_path_map_node<T>;
+    auto it = map.find(attr);
+    if (it == map.end()) {
+        map.emplace(attr, node {std::move(value)});
+    } else {
+        throw api_error::validation(format(
+            "Invalid {}: Duplicate attribute: {}", source, attr));
+    }
+}
+
 // calculate_attrs_to_get() takes either AttributesToGet or
 // ProjectionExpression parameters (having both is *not* allowed),
 // and returns the list of cells we need to read, or an empty set when
 // *all* attributes are to be returned.
-// In our current implementation, only top-level attributes are stored
-// as cells, and nested documents are stored serialized as JSON.
-// So this function currently returns only the the top-level attributes
-// but we also need to add, after the query, filtering to keep only
-// the parts of the JSON attributes that were chosen in the paths'
-// operators. Because we don't have such filtering yet (FIXME), we fail here
-// if the requested paths are anything but top-level attributes.
-std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req, std::unordered_set<std::string>& used_attribute_names) {
+// However, in our current implementation, only top-level attributes are
+// stored as separate cells - a nested document is stored serialized together
+// (as JSON) in the same cell. So this function returns a map - each key is the
+// top-level attribute we will need to read, and the value for each
+// top-level attribute is the partial hierarchy (struct hierarchy_filter)
+// that we will need to extract from that serialized JSON.
+// For example, if ProjectionExpression lists a.b and a.c[2], we
+// return one top-level attribute name, "a", with the value "{b, c[2]}".
+static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unordered_set<std::string>& used_attribute_names) {
    const bool has_attributes_to_get = req.HasMember("AttributesToGet");
    const bool has_projection_expression = req.HasMember("ProjectionExpression");
    if (has_attributes_to_get && has_projection_expression) {
@@ -1902,9 +2066,9 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
    }
    if (has_attributes_to_get) {
        const rjson::value& attributes_to_get = req["AttributesToGet"];
-        std::unordered_set<std::string> ret;
+        attrs_to_get ret;
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
-            ret.insert(it->GetString());
+            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
        return ret;
    } else if (has_projection_expression) {
@@ -1917,24 +2081,13 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
            throw api_error::validation(e.what());
        }
        resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names);
-        std::unordered_set<std::string> seen_column_names;
-        auto ret = boost::copy_range<std::unordered_set<std::string>>(paths_to_get |
-            boost::adaptors::transformed([&] (const parsed::path& p) {
-                if (p.has_operators()) {
-                    // FIXME: this check will need to change when we support non-toplevel attributes
-                    throw api_error::validation("Non-toplevel attributes in ProjectionExpression not yet implemented");
-                }
-                if (!seen_column_names.insert(p.root()).second) {
-                    // FIXME: this check will need to change when we support non-toplevel attributes
-                    throw api_error::validation(
-                            format("Invalid ProjectionExpression: two document paths overlap with each other: {} and {}.",
-                                    p.root(), p.root()));
-                }
-                return p.root();
-            }));
+        attrs_to_get ret;
+        for (const parsed::path& p : paths_to_get) {
+            attribute_path_map_add("ProjectionExpression", ret, p);
+        }
        return ret;
    }
-    // An empty set asks to read everything
+    // An empty map asks to read everything
    return {};
 }

@@ -1955,7 +2108,7 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
 */ 
 void executor::describe_single_item(const cql3::selection::selection& selection,
    const std::vector<bytes_opt>& result_row,
-    const std::unordered_set<std::string>& attrs_to_get,
+    const attrs_to_get& attrs_to_get,
    rjson::value& item,
    bool include_all_embedded_attributes) 
 {
@@ -1976,7 +2129,16 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
                std::string attr_name = value_cast<sstring>(entry.first);
                if (include_all_embedded_attributes || attrs_to_get.empty() || attrs_to_get.contains(attr_name)) {
                    bytes value = value_cast<bytes>(entry.second);
-                    rjson::set_with_string_name(item, attr_name, deserialize_item(value));
+                    rjson::value v = deserialize_item(value);
+                    auto it = attrs_to_get.find(attr_name);
+                    if (it != attrs_to_get.end()) {
+                        // attrs_to_get may have asked for only part of this attribute:
+                        if (hierarchy_filter(v, it->second)) {
+                            rjson::set_with_string_name(item, attr_name, std::move(v));
+                        }
+                    } else {
+                        rjson::set_with_string_name(item, attr_name, std::move(v));
+                    }
                }
            }
        }
@@ -1988,7 +2150,7 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
-        const std::unordered_set<std::string>& attrs_to_get) {
+        const attrs_to_get& attrs_to_get) {
    rjson::value item = rjson::empty_object();

    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
@@ -2024,8 +2186,16 @@ static bool check_needs_read_before_write(const parsed::value& v) {
    }, v._value);
 }

-static bool check_needs_read_before_write(const parsed::update_expression& update_expression) {
-    return boost::algorithm::any_of(update_expression.actions(), [](const parsed::update_expression::action& action) {
+static bool check_needs_read_before_write(const attribute_path_map<parsed::update_expression::action>& update_expression) {
+    return boost::algorithm::any_of(update_expression, [](const auto& p) {
+        if (!p.second.has_value()) {
+            // If the action is not on the top-level attribute, we need to
+            // read the old item: we change only a part of the top-level
+            // attribute, and write the full top-level attribute back.
+            return true;
+        }
+        // Otherwise, the action p.second.get_value() is just on top-level
+        // attribute. Check if it needs read-before-write:
        return std::visit(overloaded_functor {
            [&] (const parsed::update_expression::action::set& a) -> bool {
                return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2));
@@ -2039,7 +2209,7 @@ static bool check_needs_read_before_write(const parsed::update_expression& updat
            [&] (const parsed::update_expression::action::del& a) -> bool {
                return true;
            }
-        }, action._action);
+        }, p.second.get_value()._action);
    });
 }

@@ -2048,7 +2218,11 @@ public:
    // Some information parsed during the constructor to check for input
    // errors, and cached to be used again during apply().
    rjson::value* _attribute_updates;
-    parsed::update_expression _update_expression;
+    // Instead of keeping a parsed::update_expression with an unsorted
+    // list of actions, we keep them in an attribute_path_map which groups
+    // them by top-level attribute, and detects forbidden overlaps/conflicts.
+    attribute_path_map<parsed::update_expression::action> _update_expression;
+
    parsed::condition_expression _condition_expression;

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
@@ -2079,16 +2253,22 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso
            throw api_error::validation("UpdateExpression must be a string");
        }
        try {
-            _update_expression = parse_update_expression(update_expression->GetString());
-            resolve_update_expression(_update_expression,
+            parsed::update_expression expr = parse_update_expression(update_expression->GetString());
+            resolve_update_expression(expr,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
+            if (expr.empty()) {
+                throw api_error::validation("Empty expression in UpdateExpression is not allowed");
+            }
+            for (auto& action : expr.actions()) {
+                // Unfortunately we need to copy the action's path, because
+                // we std::move the action object.
+                auto p = action._path;
+                attribute_path_map_add("UpdateExpression", _update_expression, p, std::move(action));
+            }
        } catch(expressions_syntax_error& e) {
            throw api_error::validation(e.what());
        }
-        if (_update_expression.empty()) {
-            throw api_error::validation("Empty expression in UpdateExpression is not allowed");
-        }
    }
    _attribute_updates = rjson::find(_request, "AttributeUpdates");
    if (_attribute_updates) {
@@ -2130,6 +2310,187 @@ update_item_operation::needs_read_before_write() const {
           (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
 }

+// action_result() returns the result of applying an UpdateItem action -
+// this result is either a JSON object or an unset optional which indicates
+// the action was a deletion. The callers (hierarchy_actions() and
+// update_item_operation::apply() below) will either write this JSON as
+// the content of a column, or use it as a piece of a top-level attribute.
+static std::optional<rjson::value> action_result(
+        const parsed::update_expression::action& action,
+        const rjson::value* previous_item) {
+    return std::visit(overloaded_functor {
+        [&] (const parsed::update_expression::action::set& a) -> std::optional<rjson::value> {
+            return calculate_value(a._rhs, previous_item);
+        },
+        [&] (const parsed::update_expression::action::remove& a) -> std::optional<rjson::value> {
+            return std::nullopt;
+        },
+        [&] (const parsed::update_expression::action::add& a) -> std::optional<rjson::value> {
+            parsed::value base;
+            parsed::value addition;
+            base.set_path(action._path);
+            addition.set_constant(a._valref);
+            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value result;
+            // An ADD can be used to create a new attribute (when
+            // v1.IsNull()) or to add to a pre-existing attribute:
+            if (v1.IsNull()) {
+                std::string v2_type = get_item_type_string(v2);
+                if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
+                    result = v2;
+                } else {
+                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
+                }
+            } else {
+                std::string v1_type = get_item_type_string(v1);
+                if (v1_type == "N") {
+                    if (get_item_type_string(v2) != "N") {
+                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    }
+                    result = number_add(v1, v2);
+                } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
+                    if (get_item_type_string(v2) != v1_type) {
+                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    }
+                    result = set_sum(v1, v2);
+                } else {
+                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
+                }
+            }
+            return result;
+        },
+        [&] (const parsed::update_expression::action::del& a) -> std::optional<rjson::value> {
+            parsed::value base;
+            parsed::value subset;
+            base.set_path(action._path);
+            subset.set_constant(a._valref);
+            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item);
+            if (!v1.IsNull()) {
+                return set_diff(v1, v2);
+            }
+            // When we return nullopt here, we ask to *delete* this attribute,
+            // which is unnecessary because we know the attribute does not
+            // exist anyway. This is a waste, but a small one. Note that also
+            // for the "remove" action above we don't bother to check if the
+            // previous_item has anything to remove.
+            return std::nullopt;
+        }
+    }, action._action);
+}
+
+// Print the paths of all actions in an attribute_path_map_node<action>, each preceded by a space:
+static std::ostream& operator<<(std::ostream& out, const attribute_path_map_node<parsed::update_expression::action>& h) {
+    if (h.has_value()) {
+        out << " " << h.get_value()._path;
+    } else if (h.has_members()) {
+        for (auto& member : h.get_members()) {
+            out << *member.second;
+        }
+    } else if (h.has_indexes()) {
+        for (auto& index : h.get_indexes()) {
+            out << *index.second;
+        }
+    }
+    return out;
+}
+
+// Apply the hierarchy of actions in an attribute_path_map_node<action> to obj,
+// a single-member JSON object ({type: value}). The unmodified previous_item
+// is passed on to action_result() for evaluating the actions. Modifies obj
+// in place and returns true, or returns false if the caller should remove obj.
+static bool hierarchy_actions(
+        rjson::value& obj,
+        const attribute_path_map_node<parsed::update_expression::action>& h,
+        const rjson::value* previous_item)
+{
+    if (!obj.IsObject() || obj.MemberCount() != 1) {
+        // This shouldn't happen. We shouldn't have stored malformed objects.
+        // But today Alternator does not validate the structure of nested
+        // documents before storing them, so this can happen on read.
+        throw api_error::validation(format("Malformed value object read: {}", obj));
+    }
+    const char* type = obj.MemberBegin()->name.GetString();
+    rjson::value& v = obj.MemberBegin()->value;
+    if (h.has_value()) {
+        // Action replacing everything in this position in the hierarchy
+        std::optional<rjson::value> newv = action_result(h.get_value(), previous_item);
+        if (newv) {
+            obj = std::move(*newv);
+        } else {
+            return false;
+        }
+    } else if (h.has_members()) {
+        if (type[0] != 'M' || !v.IsObject()) {
+            // A .something on a non-map doesn't work.
+            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+        }
+        for (const auto& member : h.get_members()) {
+            std::string attr = member.first;
+            const attribute_path_map_node<parsed::update_expression::action>& subh = *member.second;
+            rjson::value *subobj = rjson::find(v, attr);
+            if (subobj) {
+                if (!hierarchy_actions(*subobj, subh, previous_item)) {
+                    rjson::remove_member(v, attr);
+                }
+            } else {
+                // When a.b does not exist, setting a.b itself (i.e.
+                // subh.has_value()) is fine, but setting a.b.c is not.
+                if (subh.has_value()) {
+                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
+                    if (newv) {
+                        rjson::set_with_string_name(v, attr, std::move(*newv));
+                    } else {
+                        throw api_error::validation(format("Can't remove document path {} - not present in item",
+                            subh.get_value()._path));
+                    }
+                } else {
+                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+                }
+            }
+        }
+    } else if (h.has_indexes()) {
+        if (type[0] != 'L' || !v.IsArray()) {
+            // A [i] on a non-list doesn't work.
+            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+        }
+        unsigned nremoved = 0;
+        for (const auto& index : h.get_indexes()) {
+            unsigned i = index.first - nremoved;
+            const attribute_path_map_node<parsed::update_expression::action>& subh = *index.second;
+            if (i < v.Size()) {
+                if (!hierarchy_actions(v[i], subh, previous_item)) {
+                    v.Erase(v.Begin() + i);
+                    // If we have the actions "REMOVE a[1] SET a[3] = :val",
+                    // the index 3 refers to the original indexes, before any
+                    // items were removed. So we offset the next indexes
+                    // (which are guaranteed to be higher than i - indexes is
+                    // a sorted map) by an increased "nremoved".
+                    nremoved++;
+                }
+            } else {
+                // If a[7] does not exist, setting a[7] itself (i.e.
+                // subh.has_value()) is fine - and appends an item, though
+                // not necessarily with index 7. But setting a[7].b will
+                // not work.
+                if (subh.has_value()) {
+                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
+                    if (newv) {
+                        rjson::push_back(v, std::move(*newv));
+                    } else {
+                        // Removing a[7] when the list has fewer elements is
+                        // silently ignored. It's not considered an error.
+                    }
+                } else {
+                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+                }
+            }
+        }
+    }
+    return true;
+}
+
 std::optional<mutation>
 update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    if (!verify_expected(_request, previous_item.get()) ||
@@ -2144,17 +2505,37 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    auto& row = m.partition().clustered_row(*_schema, _ck);
    attribute_collector attrs_collector;
    bool any_updates = false;
-    auto do_update = [&] (bytes&& column_name, const rjson::value& json_value) {
+    auto do_update = [&] (bytes&& column_name, const rjson::value& json_value,
+                          const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
        any_updates = true;
-        if (_returnvalues == returnvalues::ALL_NEW ||
-            _returnvalues == returnvalues::UPDATED_NEW) {
-            rjson::set_with_string_name(_return_attributes,
-                    to_sstring_view(column_name), rjson::copy(json_value));
+        if (_returnvalues == returnvalues::ALL_NEW) {
+            rjson::replace_with_string_name(_return_attributes,
+                to_sstring_view(column_name), rjson::copy(json_value));
+        } else if (_returnvalues == returnvalues::UPDATED_NEW) {
+            rjson::value&& v = rjson::copy(json_value);
+            if (h) {
+                // If the operation was only on specific attribute paths,
+                // leave only them in _return_attributes.
+                if (hierarchy_filter(v, *h)) {
+                    rjson::set_with_string_name(_return_attributes,
+                        to_sstring_view(column_name), std::move(v));
+                }
+            } else {
+                rjson::set_with_string_name(_return_attributes,
+                    to_sstring_view(column_name), std::move(v));
+            }
        } else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
            std::string_view cn =  to_sstring_view(column_name);
            const rjson::value* col = rjson::find(*previous_item, cn);
            if (col) {
-                rjson::set_with_string_name(_return_attributes, cn, rjson::copy(*col));
+                rjson::value&& v = rjson::copy(*col);
+                if (h) {
+                    if (hierarchy_filter(v, *h)) {
+                        rjson::set_with_string_name(_return_attributes, cn, std::move(v));
+                    }
+                } else {
+                    rjson::set_with_string_name(_return_attributes, cn, std::move(v));
+                }
            }
        }
        const column_definition* cdef = _schema->get_column_definition(column_name);
@@ -2196,7 +2577,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    // can just move previous_item later, when we don't need it any more.
    if (_returnvalues == returnvalues::ALL_NEW) {
        if (previous_item) {
-            _return_attributes = std::move(*previous_item);
+            _return_attributes = rjson::copy(*previous_item);
        } else {
            // If there is no previous item, usually a new item is created
            // and contains the given key. This may be cancelled at the end
@@ -2209,88 +2590,44 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    }

    if (!_update_expression.empty()) {
-        std::unordered_set<std::string> seen_column_names;
-        for (auto& action : _update_expression.actions()) {
-            if (action._path.has_operators()) {
-                // FIXME: implement this case
-                throw api_error::validation("UpdateItem support for nested updates not yet implemented");
-            }
-            std::string column_name = action._path.root();
+        for (auto& actions : _update_expression) {
+            // The actions of _update_expression are grouped by top-level
+            // attributes. Here, all actions in actions.second share the same
+            // top-level attribute actions.first.
+            std::string column_name = actions.first;
            const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
            if (cdef && cdef->is_primary_key()) {
-                throw api_error::validation(
-                        format("UpdateItem cannot update key column {}", column_name));
+                throw api_error::validation(format("UpdateItem cannot update key column {}", column_name));
            }
-            // DynamoDB forbids multiple updates in the same expression to
-            // modify overlapping document paths. Updates of one expression
-            // have the same timestamp, so it's unclear which would "win".
-            // FIXME: currently, without full support for document paths,
-            // we only check if the paths' roots are the same.
-            if (!seen_column_names.insert(column_name).second) {
-                throw api_error::validation(
-                        format("Invalid UpdateExpression: two document paths overlap with each other: {} and {}.",
-                                column_name, column_name));
-            }
-            std::visit(overloaded_functor {
-                [&] (const parsed::update_expression::action::set& a) {
-                    auto value = calculate_value(a._rhs, previous_item.get());
-                    do_update(to_bytes(column_name), value);
-                },
-                [&] (const parsed::update_expression::action::remove& a) {
+            if (actions.second.has_value()) {
+                // An action on a top-level attribute column_name. The single
+                // action is actions.second.get_value(). We can simply invoke
+                // the action and replace the attribute with its result:
+                std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
+                if (result) {
+                    do_update(to_bytes(column_name), *result);
+                } else {
                    do_delete(to_bytes(column_name));
-                },
-                [&] (const parsed::update_expression::action::add& a) {
-                    parsed::value base;
-                    parsed::value addition;
-                    base.set_path(action._path);
-                    addition.set_constant(a._valref);
-                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value result;
-                    // An ADD can be used to create a new attribute (when
-                    // v1.IsNull()) or to add to a pre-existing attribute:
-                    if (v1.IsNull()) {
-                        std::string v2_type = get_item_type_string(v2);
-                        if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
-                            result = v2;
-                        } else {
-                            throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
-                        }
-                    } else {
-                        std::string v1_type = get_item_type_string(v1);
-                        if (v1_type == "N") {
-                            if (get_item_type_string(v2) != "N") {
-                                throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                            }
-                            result = number_add(v1, v2);
-                        } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
-                            if (get_item_type_string(v2) != v1_type) {
-                                throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                            }
-                            result = set_sum(v1, v2);
-                        } else {
-                            throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
-                        }
-                    }
-                    do_update(to_bytes(column_name), result);
-                },
-                [&] (const parsed::update_expression::action::del& a) {
-                    parsed::value base;
-                    parsed::value subset;
-                    base.set_path(action._path);
-                    subset.set_constant(a._valref);
-                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item.get());
-                    if (!v1.IsNull()) {
-                        std::optional<rjson::value> result  = set_diff(v1, v2);
-                        if (result) {
-                            do_update(to_bytes(column_name), *result);
-                        } else {
-                            do_delete(to_bytes(column_name));
-                        }
-                    }
                }
-            }, action._action);
+            } else {
+                // We have actions on a path or more than one path in the same
+                // top-level attribute column_name - but not on the top-level
+                // attribute as a whole. We already read the full top-level
+                // attribute (see check_needs_read_before_write()), and now we
+                // need to modify pieces of it and write back the entire
+                // top-level attribute.
+                if (!previous_item) {
+                    throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item"));
+                }
+                const rjson::value *toplevel = rjson::find(*previous_item, column_name);
+                if (!toplevel) {
+                    throw api_error::validation(format("UpdateItem cannot update document path: missing attribute {}",
+                        column_name));
+                }
+                rjson::value result = rjson::copy(*toplevel);
+                hierarchy_actions(result, actions.second, previous_item.get());
+                do_update(to_bytes(column_name), std::move(result), &actions.second);
+            }
        }
    }
    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
@@ -2408,7 +2745,7 @@ static rjson::value describe_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
-        const std::unordered_set<std::string>& attrs_to_get) {
+        const attrs_to_get& attrs_to_get) {
    std::optional<rjson::value> opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get);
    if (!opt_item) {
        // If there is no matching item, we're supposed to return an empty
@@ -2480,7 +2817,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    struct table_requests {
        schema_ptr schema;
        db::consistency_level cl;
-        std::unordered_set<std::string> attrs_to_get;
+        attrs_to_get attrs_to_get;
        struct single_request {
            partition_key pk;
            clustering_key ck;
@@ -2694,7 +3031,7 @@ void filter::for_filters_on(const noncopyable_function<void(std::string_view)>&
 class describe_items_visitor {
    typedef std::vector<const column_definition*> columns_t;
    const columns_t& _columns;
-    const std::unordered_set<std::string>& _attrs_to_get;
+    const attrs_to_get& _attrs_to_get;
    std::unordered_set<std::string> _extra_filter_attrs;
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
@@ -2703,7 +3040,7 @@ class describe_items_visitor {
    size_t _scanned_count;

 public:
-    describe_items_visitor(const columns_t& columns, const std::unordered_set<std::string>& attrs_to_get, filter& filter)
+    describe_items_visitor(const columns_t& columns, const attrs_to_get& attrs_to_get, filter& filter)
            : _columns(columns)
            , _attrs_to_get(attrs_to_get)
            , _filter(filter)
@@ -2752,6 +3089,12 @@ public:
                    std::string attr_name = value_cast<sstring>(entry.first);
                    if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
                        bytes value = value_cast<bytes>(entry.second);
+                        // Even if _attrs_to_get asked to keep only a part of a
+                        // top-level attribute, we keep the entire attribute
+                        // at this stage, because the item filter might still
+                        // need the other parts (it was easier for us to keep
+                        // extra_filter_attrs at top-level granularity). We'll
+                        // filter the unneeded parts after item filtering.
                        rjson::set_with_string_name(_item, attr_name, deserialize_item(value));
                    }
                }
@@ -2762,11 +3105,24 @@ public:

    void end_row() {
        if (_filter.check(_item)) {
+            // As noted above, we kept entire top-level attributes listed in
+            // _attrs_to_get. We may need to only keep parts of them.
+            for (const auto& attr: _attrs_to_get) {
+                // If !attr.has_value() it means we were asked not to keep
+                // attr entirely, but just parts of it.
+                if (!attr.second.has_value()) {
+                    rjson::value* toplevel= rjson::find(_item, attr.first);
+                    if (toplevel && !hierarchy_filter(*toplevel, attr.second)) {
+                        rjson::remove_member(_item, attr.first);
+                    }
+                }
+            }
            // Remove the extra attributes _extra_filter_attrs which we had
            // to add just for the filter, and not requested to be returned:
            for (const auto& attr : _extra_filter_attrs) {
                rjson::remove_member(_item, attr);
            }
+
            rjson::push_back(_items, std::move(_item));
        }
        _item = rjson::empty_object();
@@ -2782,7 +3138,7 @@ public:
    }
 };

-static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::unordered_set<std::string>&& attrs_to_get, filter&& filter) {
+static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, attrs_to_get&& attrs_to_get, filter&& filter) {
    describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter);
    result_set->visit(visitor);
    auto scanned_count = visitor.get_scanned_count();
@@ -2823,7 +3179,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
        const rjson::value* exclusive_start_key,
        dht::partition_range_vector&& partition_ranges,
        std::vector<query::clustering_range>&& ck_bounds,
-        std::unordered_set<std::string>&& attrs_to_get,
+        attrs_to_get&& attrs_to_get,
        uint32_t limit,
        db::consistency_level cl,
        filter&& filter,
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -70,6 +70,76 @@ public:
    std::string to_json() const override;
 };

+namespace parsed {
+class path;
+};
+
+// An attribute_path_map object is used to hold data for various attributes
+// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path
+// has a root attribute, and then modified by member and index operators -
+// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
+// "[2]" index, and finally ".c" member.
+// Data can be added to an attribute_path_map using the add() function, but
+// requires that attributes with data not be *overlapping* or *conflicting*:
+//
+// 1. Two attribute paths which are identical or an ancestor of one another
+//    are considered *overlapping* and not allowed. If a.b.c has data,
+//    we can't add more data in a.b.c or any of its descendants like a.b.c.d.
+//
+// 2. Two attribute paths which need the same parent to have both a member and
+//    an index are considered *conflicting* and not allowed. E.g., if a.b has
+//    data, you can't add a[1]. The meaning of adding both would be that the
+//    attribute a is both a map and an array, which isn't sensible.
+//
+// These two requirements are common to the two places where Alternator uses
+// this abstraction to describe how a hierarchical item is to be transformed:
+//
+// 1. In ProjectionExpression: for filtering from a full top-level attribute
+//    only the parts which the user asked for in ProjectionExpression.
+//
+// 2. In UpdateExpression: for taking the previous value of a top-level
+//    attribute, and modifying it based on the instructions the user
+//    wrote in UpdateExpression.
+
+template<typename T>
+class attribute_path_map_node {
+public:
+    using data_t = T;
+    // We need the extra shared_ptr<> here because libstdc++ unordered_map
+    // doesn't work with incomplete types :-( We couldn't use lw_shared_ptr<>
+    // because it doesn't work for incomplete types either. We couldn't use
+    // std::unique_ptr<> because it makes the entire object uncopyable. We
+    // don't often need to copy such a map, but we do have some code that
+    // copies an attrs_to_get object, and is hard to find and remove.
+    // The shared_ptr should never be null.
+    using members_t =  std::unordered_map<std::string, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The indexes list is sorted because DynamoDB requires handling writes
+    // beyond the end of a list in index order.
+    using indexes_t = std::map<unsigned, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The prohibition on "overlap" and "conflict" means that a node holds at
+    // most one of data, members or indexes (and none of them while empty).
+    std::optional<std::variant<data_t, members_t, indexes_t>> _content;
+
+    bool is_empty() const { return !_content; } // true when nothing is stored in this node
+    bool has_value() const { return _content && std::holds_alternative<data_t>(*_content); }
+    bool has_members() const { return _content && std::holds_alternative<members_t>(*_content); }
+    bool has_indexes() const { return _content && std::holds_alternative<indexes_t>(*_content); }
+    // Each get_*() assumes the matching has_*() is true; _content is dereferenced unchecked.
+    members_t& get_members() { return std::get<members_t>(*_content); }
+    const members_t& get_members() const { return std::get<members_t>(*_content); }
+    indexes_t& get_indexes() { return std::get<indexes_t>(*_content); }
+    const indexes_t& get_indexes() const { return std::get<indexes_t>(*_content); }
+    T& get_value() { return std::get<T>(*_content); }
+    const T& get_value() const { return std::get<T>(*_content); }
+};
+
+template<typename T>
+using attribute_path_map = std::unordered_map<std::string, attribute_path_map_node<T>>;
+
+using attrs_to_get_node = attribute_path_map_node<std::monostate>;
+using attrs_to_get = attribute_path_map<std::monostate>;
+
+
 class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
@@ -140,16 +210,14 @@ public:
        const query::partition_slice&,
        const cql3::selection::selection&,
        const query::result&,
-        const std::unordered_set<std::string>&);
+        const attrs_to_get&);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<bytes_opt>&,
-        const std::unordered_set<std::string>&,
+        const attrs_to_get&,
        rjson::value&,
        bool = false);

-
-
    void add_stream_options(const rjson::value& stream_spec, schema_builder&) const;
    void supplement_table_info(rjson::value& descr, const schema& schema) const;
    void supplement_table_stream_info(rjson::value& descr, const schema& schema) const;
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -130,6 +130,27 @@ void condition_expression::append(condition_expression&& a, char op) {
    }, _expression);
 }

+void path::check_depth_limit() { // Throws expressions_syntax_error once the path has more than depth_limit components.
+    if (1 + _operators.size() > depth_limit) { // the "1 +" accounts for the root component
+        throw expressions_syntax_error(format("Document path exceeded {} nesting levels", depth_limit));
+    }
+}
+
+std::ostream& operator<<(std::ostream& os, const path& p) { // Writes the path as text: the root followed by each operator in order.
+    os << p.root();
+    for (const auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                os << '.' << member; // a member operator is written as ".name"
+            },
+            [&] (unsigned index) {
+                os << '[' << index << ']'; // an index operator is written as "[N]"
+            }
+        }, op);
+    }
+    return os;
+}
+
 } // namespace parsed

 // The following resolve_*() functions resolve references in parsed
@@ -151,10 +172,9 @@ void condition_expression::append(condition_expression&& a, char op) {
 // we need to resolve the expression just once but then use it many times
 // (once for each item to be filtered).

-static void resolve_path(parsed::path& p,
+static std::optional<std::string> resolve_path_component(const std::string& column_name,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names) {
-    const std::string& column_name = p.root();
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
            throw api_error::validation(
@@ -166,7 +186,30 @@ static void resolve_path(parsed::path& p,
                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
-        p.set_root(std::string(rjson::to_string_view(*value)));
+        return std::string(rjson::to_string_view(*value));
+    }
+    return std::nullopt;
+}
+
+static void resolve_path(parsed::path& p,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) { // Resolves the root and every member operator of p in place.
+    std::optional<std::string> r = resolve_path_component(p.root(), expression_attribute_names, used_attribute_names);
+    if (r) { // engaged only when the root was a reference that got replaced
+        p.set_root(std::move(*r));
+    }
+    for (auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (std::string& s) {
+                r = resolve_path_component(s, expression_attribute_names, used_attribute_names);
+                if (r) {
+                    s = std::move(*r); // overwrite the member name with its resolved value
+                }
+            },
+            [&] (unsigned index) {
+                // An index operator is a number, not a name, so there is nothing to resolve.
+            }
+        }, op);
     }
 }

@@ -603,52 +646,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -667,6 +666,55 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
 };

+// Given a parsed::path and an item read from the table, extract the value
+// of a certain attribute path, such as "a" or "a.b.c[3]". Returns a null
+// value if the item or the requested attribute does not exist, and throws
+// a validation error (naming the caller) if a nested value is malformed.
+// The item is assumed to be encoded in JSON using DynamoDB conventions -
+// each level of a nested document is a map with one key - a type (e.g.,
+// "M" for map) - whose value is the representation of that value.
+static rjson::value extract_path(const rjson::value* item,
+        const parsed::path& p, calculate_value_caller caller) {
+    if (!item) {
+        return rjson::null_value();
+    }
+    const rjson::value* v = rjson::find(*item, p.root());
+    if (!v) {
+        return rjson::null_value();
+    }
+    for (const auto& op : p.operators()) {
+        if (!v->IsObject() || v->MemberCount() != 1) {
+            // This shouldn't happen. We shouldn't have stored malformed
+            // objects. But today Alternator does not validate the structure
+            // of nested documents before storing them, so this can happen on
+            // read.
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+        }
+        const char* type = v->MemberBegin()->name.GetString(); // the single key is the type tag
+        v = &(v->MemberBegin()->value); // descend into the tagged value
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (type[0] == 'M' && v->IsObject()) {
+                    v = rjson::find(*v, member);
+                } else {
+                    v = nullptr; // a member lookup on something that is not a map
+                }
+            },
+            [&] (unsigned index) {
+                if (type[0] == 'L' && v->IsArray() && index < v->Size()) {
+                    v = &(v->GetArray()[index]);
+                } else {
+                    v = nullptr; // not a list, or the index is out of range
+                }
+            }
+        }, op);
+        if (!v) {
+            return rjson::null_value();
+        }
+    }
+    return rjson::copy(*v);
+}
+
 // Given a parsed::value, which can refer either to a constant value from
 // ExpressionAttributeValues, to the value of some attribute, or to a function
 // of other values, this function calculates the resulting value.
@@ -684,21 +732,12 @@ rjson::value calculate_value(const parsed::value& v,
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
                throw api_error::validation(
-                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
            }
            return function_it->second(caller, previous_item, f);
        },
        [&] (const parsed::path& p) -> rjson::value {
-            if (!previous_item) {
-                return rjson::null_value();
-            }
-            std::string update_path = p.root();
-            if (p.has_operators()) {
-                // FIXME: support this
-                throw api_error::validation("Reading attribute paths not yet implemented");
-            }
-            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
-            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
+            return extract_path(previous_item, p, caller);
        }
    }, v._value);
 }
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -49,15 +49,23 @@ class path {
    // dot (e.g., ".xyz").
    std::string _root;
    std::vector<std::variant<std::string, unsigned>> _operators;
+    // It is useful to limit the depth of a user-specified path, because it
+    // allows us to use recursive algorithms without worrying about recursion
+    // depth. DynamoDB officially limits the length of paths to 32 components
+    // (including the root) so let's use the same limit.
+    static constexpr unsigned depth_limit = 32;
+    void check_depth_limit();
 public:
    void set_root(std::string root) {
        _root = std::move(root);
    }
    void add_index(unsigned i) {
        _operators.emplace_back(i);
+        check_depth_limit();
    }
    void add_dot(std::string(name)) {
        _operators.emplace_back(std::move(name));
+        check_depth_limit();
    }
    const std::string& root() const {
        return _root;
@@ -65,6 +73,13 @@ public:
    bool has_operators() const {
        return !_operators.empty();
    }
+    const std::vector<std::variant<std::string, unsigned>>& operators() const {
+        return _operators;
+    }
+    std::vector<std::variant<std::string, unsigned>>& operators() {
+        return _operators;
+    }
+    friend std::ostream& operator<<(std::ostream&, const path&);
 };

 // When an expression is first parsed, all constants are references, like
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -93,6 +93,10 @@ public:
                 [&] (const json::json_return_type& json_return_value) {
                     slogger.trace("api_handler success case");
                     if (json_return_value._body_writer) {
+                         // Unfortunately, write_body() forces us to choose
+                         // from a fixed and irrelevant list of "mime-types"
+                         // at this point. But we'll override it with the
+                         // one (application/x-amz-json-1.0) below.
                         rep->write_body("json", std::move(json_return_value._body_writer));
                     } else {
                         rep->_content += json_return_value._res;
@@ -105,14 +109,15 @@ public:

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
-    }), _type("json") { }
+    }) { }

    api_handler(const api_handler&) = default;
    future<std::unique_ptr<reply>> handle(const sstring& path,
            std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        return _f_handle(std::move(req), std::move(rep)).then(
                [this](std::unique_ptr<reply> rep) {
-                    rep->done(_type);
+                    rep->set_mime_type("application/x-amz-json-1.0");
+                    rep->done();
                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
                });
    }
@@ -126,7 +131,6 @@ protected:
    }

    future_handler_function _f_handle;
-    sstring _type;
 };

 class gated_handler : public handler_base {
@@ -192,24 +196,31 @@ future<> server::verify_signature(const request& req) {
        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
-    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
+    std::string_view authorization_header = authorization_it->second;
+    auto pos = authorization_header.find_first_of(' ');
+    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
+        throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+    }
+    authorization_header.remove_prefix(pos+1);
    std::string credential;
    std::string user_signature;
    std::string signed_headers_str;
    std::vector<std::string_view> signed_headers;
-    for (std::string_view entry : credentials_raw) {
+    do {
+        // Either one of a comma or space can mark the end of an entry
+        pos = authorization_header.find_first_of(" ,");
+        std::string_view entry = authorization_header.substr(0, pos);
+        if (pos != std::string_view::npos) {
+            authorization_header.remove_prefix(pos + 1);
+        }
+        if (entry.empty()) {
+            continue;
+        }
        std::vector<std::string_view> entry_split = split(entry, '=');
        if (entry_split.size() != 2) {
-            if (entry != "AWS4-HMAC-SHA256") {
-                throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
-            }
            continue;
        }
        std::string_view auth_value = entry_split[1];
-        // Commas appear as an additional (quite redundant) delimiter
-        if (auth_value.back() == ',') {
-            auth_value.remove_suffix(1);
-        }
        if (entry_split[0] == "Credential") {
            credential = std::string(auth_value);
        } else if (entry_split[0] == "Signature") {
@@ -219,7 +230,8 @@ future<> server::verify_signature(const request& req) {
            signed_headers = split(auth_value, ';');
            std::sort(signed_headers.begin(), signed_headers.end());
        }
-    }
+    } while (pos != std::string_view::npos);
+
    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
        throw api_error::validation(format("Incorrect credential information format: {}", credential));
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
 #define OPERATION_LATENCY(name, CamelCaseName) \
                seastar::metrics::make_histogram("op_latency", \
                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
+            OPERATION(batch_get_item, "BatchGetItem")
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -499,19 +499,11 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // TODO: creation time

    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-    // cannot really "resume" query, must iterate all data. because we cannot query neither "time" (pk) > something,
-    // or on expired...
-    // TODO: maybe add secondary index to topology table to enable this?
-    return _sdks.cdc_get_versioned_streams({ normal_token_owners }).then([this, &db, schema, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc), ttl](std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

-        // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
-        auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);
+    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
+    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-        auto i = topologies.lower_bound(low_ts);
-        // need first gen _intersecting_ the timestamp.
-        if (i != topologies.begin()) {
-            i = std::prev(i);
-        }
+    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([this, &db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

        auto e = topologies.end();
        auto prev = e;
@@ -519,9 +511,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

        std::optional<shard_id> last;

-        // i is now at the youngest generation we include. make a mark of it.
-        auto first = i;
-
+        auto i = topologies.begin();
        // if we're a paged query, skip to the generation where we left of.
        if (shard_start) {
            i = topologies.find(shard_start->time);
@@ -547,7 +537,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        };

        // need a prev even if we are skipping stuff
-        if (i != first) {
+        if (i != topologies.begin()) {
            prev = std::prev(i);
        }

@@ -855,16 +845,18 @@ future<executor::request_return_type> executor::get_records(client_state& client
    static const bytes op_column_name = cdc::log_meta_column_name_bytes("operation");
    static const bytes eor_column_name = cdc::log_meta_column_name_bytes("end_of_batch");

-    auto key_names = boost::copy_range<std::unordered_set<std::string>>(
+    auto key_names = boost::copy_range<attrs_to_get>(
        boost::range::join(std::move(base->partition_key_columns()), std::move(base->clustering_key_columns()))
-        | boost::adaptors::transformed([&] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([&] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );
    // Include all base table columns as values (in case pre or post is enabled).
    // This will include attributes not stored in the frozen map column
-    auto attr_names = boost::copy_range<std::unordered_set<std::string>>(base->regular_columns()
+    auto attr_names = boost::copy_range<attrs_to_get>(base->regular_columns()
        // this will include the :attrs column, which we will also force evaluating. 
        // But not having this set empty forces out any cdc columns from actual result 
-        | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );

    std::vector<const column_definition*> columns;
@@ -1028,7 +1020,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
        }

        // ugh. figure out if we are and end-of-shard
-        return cdc::get_local_streams_timestamp().then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+        
+        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
            auto& shard = iter.shard;            

            if (shard.time < ts && ts < high_ts) {
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1105,14 +1105,6 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"ignore_nodes",
-                     "description":"List of dead nodes to ingore in removenode operation",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
                  }
               ]
            }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -331,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
@@ -656,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -664,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -672,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -680,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -696,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -27,7 +27,6 @@
 #include <time.h>
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/adaptor/filtered.hpp>
-#include <boost/algorithm/string/trim_all.hpp>
 #include "service/storage_service.hh"
 #include "service/load_meter.hh"
 #include "db/commitlog/commitlog.hh"
@@ -226,7 +225,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
            try {
                res = fut.get0();
            } catch (std::exception& e) {
-                return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
+                return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
            }
            return make_ready_future<json::json_return_type>(json::json_return_type(res));
        });
@@ -497,22 +496,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::remove_node.set(r, [](std::unique_ptr<request> req) {
        auto host_id = req->get_query_param("host_id");
-        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
-        auto ignore_nodes = std::list<gms::inet_address>();
-        for (std::string n : ignore_nodes_strs) {
-            try {
-                std::replace(n.begin(), n.end(), '\"', ' ');
-                std::replace(n.begin(), n.end(), '\'', ' ');
-                boost::trim_all(n);
-                if (!n.empty()) {
-                    auto node = gms::inet_address(n);
-                    ignore_nodes.push_back(node);
-                }
-            } catch (...) {
-                throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}", ignore_nodes_strs, n));
-            }
-        }
-        return service::get_local_storage_service().removenode(host_id, std::move(ignore_nodes)).then([] {
+        return service::get_local_storage_service().removenode(host_id).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -39,7 +39,7 @@ public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
    using fragment_type = bytes_view;
-    static constexpr size_type max_chunk_size() { return 128 * 1024; }
+    static constexpr size_type max_chunk_size() { return max_alloc_size() - sizeof(chunk); }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -59,6 +59,7 @@ private:
        void operator delete(void* ptr) { free(ptr); }
    };
    static constexpr size_type default_chunk_size{512};
+    static constexpr size_type max_alloc_size() { return 128 * 1024; }
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
@@ -132,16 +133,15 @@ private:
        return _current->size - _current->offset;
    }
    // Figure out next chunk size.
-    //   - must be enough for data_size
+    //   - must be enough for data_size + sizeof(chunk)
    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
-    //   - do not exceed max_chunk_size
+    //   - should not exceed max_alloc_size, unless data_size requires so
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
                : _initial_chunk_size;
-        next_size = std::min(next_size, max_chunk_size());
-        // FIXME: check for overflow?
+        next_size = std::min(next_size, max_alloc_size());
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
    }
    // Makes room for a contiguous region of given size.
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -264,6 +264,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
+        if (!_read_context->partition_exists()) {
+            return read_from_underlying(timeout);
+        }
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -22,10 +22,14 @@
 #include <boost/type.hpp>
 #include <random>
 #include <unordered_set>
+#include <algorithm>
 #include <seastar/core/sleep.hh>
+#include <algorithm>
+#include <seastar/core/coroutine.hh>

 #include "keys.hh"
 #include "schema_builder.hh"
+#include "database.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -36,6 +40,7 @@
 #include "gms/gossiper.hh"

 #include "cdc/generation.hh"
+#include "cdc/cdc_options.hh"

 extern logging::logger cdc_log;

@@ -174,10 +179,29 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

+std::vector<token_range_description>&& topology_description::entries() && {
+    return std::move(_entries);
+}
+
+static std::vector<stream_id> create_stream_ids(
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    std::vector<stream_id> result;
+    result.reserve(shard_count);
+    dht::sharder sharder(shard_count, ignore_msb);
+    for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+        auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+        // compose the id from token and the "index" of the range end owning vnode
+        // as defined by token sort order. Basically grouping within this
+        // shard set.
+        result.emplace_back(stream_id(t, index));
+    }
+    return result;
+}
+
 class topology_description_generator final {
    const db::config& _cfg;
    const std::unordered_set<dht::token>& _bootstrap_tokens;
@@ -217,18 +241,9 @@ class topology_description_generator final {
        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            // compose the id from token and the "index" of the range end owning vnode
-            // as defined by token sort order. Basically grouping within this
-            // shard set.
-            desc.streams.emplace_back(stream_id(t, index));
-        }
-
        return desc;
    }
 public:
@@ -294,6 +309,38 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// non-static for testing
+size_t limit_of_streams_in_topology_description() {
+    // Each stream takes 16B and we don't want to exceed 4MB so we can have
+    // at most 262144 streams but not less than 1 per vnode.
+    return 4 * 1024 * 1024 / 16;
+}
+
+// Caps the total number of streams at the limit, keeping >= 1 per vnode. Non-static for testing.
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
+    // Unsigned, like `limit` below, so the comparison does not mix signedness.
+    size_t streams_count = 0;
+    for (const auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+    const size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
+    if (limit >= streams_count) {
+        return std::move(desc);
+    }
+    // Every vnode gets an equal share of the limit; only vnodes above it are regenerated.
+    const size_t streams_per_vnode_limit = limit / desc.entries().size();
+    auto entries = std::move(desc).entries();
+    auto start = entries.back().token_range_end;
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) {
+            entries[idx].streams = create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end;
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
@@ -306,6 +353,18 @@ db_clock::time_point make_new_cdc_generation(
    using namespace std::chrono;
    auto gen = topology_description_generator(cfg, bootstrap_tokens, tmptr, g).generate();

+    // If the cluster is large we may end up with a generation that contains
+    // a large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with a large number of streams
+    // this will lead to a row that can be as big as 32MB. This is much more
+    // than the limit imposed by commitlog_segment_size_in_mb. If the size of
+    // the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid such a problem we make sure that such a row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));
+
    // Begin the race.
    auto ts = db_clock::now() + (
            (!add_delay || ring_delay == milliseconds(0)) ? milliseconds(0) : (
@@ -321,31 +380,23 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
 }

-// Run inside seastar::async context.
-static void do_update_streams_description(
+static future<> do_update_streams_description(
        db_clock::time_point streams_ts,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
+    if (co_await sys_dist_ks.cdc_desc_exists(streams_ts, ctx)) {
+        cdc_log.info("Generation {}: streams description table already updated.", streams_ts);
+        co_return;
    }

    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.

-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    auto topo = co_await sys_dist_ks.read_cdc_topology_description(streams_ts, ctx);
    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+        throw no_generation_data_exception(streams_ts);
    }

-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    co_await sys_dist_ks.create_cdc_desc(streams_ts, *topo, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
 }

@@ -355,7 +406,7 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source& abort_src) {
    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
    } catch(...) {
        cdc_log.warn(
            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
@@ -368,7 +419,7 @@ void update_streams_description(
            while (true) {
                sleep_abortable(std::chrono::seconds(60), abort_src).get();
                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
                    return;
                } catch (...) {
                    cdc_log.warn(
@@ -380,4 +431,176 @@ void update_streams_description(
    }
 }

+static db_clock::time_point as_timepoint(const utils::UUID& uuid) {
+    return db_clock::time_point{std::chrono::milliseconds(utils::UUID_gen::get_adjusted_timestamp(uuid))};
+}
+
+static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
+        db::system_distributed_keyspace& sys_dist_ks,
+        abort_source& abort_src,
+        const noncopyable_function<unsigned()>& get_num_token_owners) {
+    while (true) {
+        try {
+            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
+        } catch (...) {
+            cdc_log.warn(
+                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
+                    std::current_exception());
+        }
+        co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+    }
+}
+
+// Contains a CDC log table's creation time (extracted from its schema's id)
+// and its CDC TTL setting.
+struct time_and_ttl {
+    db_clock::time_point creation_time;
+    int ttl;
+};
+
+/*
+ * See `maybe_rewrite_streams_descriptions`.
+ * This is the long-running-in-the-background part of that function.
+ * It returns the timestamp of the newest generation considered for rewriting (if any).
+ */
+static future<std::optional<db_clock::time_point>> rewrite_streams_descriptions(
+        std::vector<time_and_ttl> times_and_ttls,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    cdc_log.info("Retrieving generation timestamps for rewriting...");
+    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
+    cdc_log.info("Generation timestamps retrieved.");
+
+    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
+    // This predicate is monotonic w.r.t the timestamps.
+    auto now = db_clock::now();
+    std::sort(tss.begin(), tss.end());
+    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
+        // partition_point finds first element that does *not* satisfy the predicate.
+        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
+                [&] (const time_and_ttl& tat) {
+            // In this CDC log table there are no entries older than the table's creation time
+            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
+            // If ttl is set to 0 then entries in this table never expire. In that case we look
+            // only at the table's creation time.
+            auto no_entries_older_than =
+                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
+                    - std::chrono::seconds(10);
+            return no_entries_older_than < ts;
+        });
+    });
+
+    // Find first generation timestamp such that some CDC log table may contain data in this generation.
+    // This and all later generations need to be written to the new streams table.
+    if (first != tss.begin()) {
+        --first;
+    }
+
+    if (first == tss.end()) {
+        cdc_log.info("No generations to rewrite.");
+        co_return std::nullopt;
+    }
+
+    cdc_log.info("First generation to rewrite: {}", *first);
+
+    bool each_success = true;
+    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
+        while (true) {
+            try {
+                co_return co_await do_update_streams_description(ts, *sys_dist_ks, { get_num_token_owners() });
+            } catch (const no_generation_data_exception& e) {
+                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
+                each_success = false;
+                co_return;
+            } catch (...) {
+                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
+            }
+            co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+        }
+    });
+
+    if (each_success) {
+        cdc_log.info("Rewriting stream tables finished successfully.");
+    } else {
+        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
+    }
+
+    // `first == tss.end()` returned early above, so `tss` is non-empty here and a
+    // trailing `co_return std::nullopt` could never be reached. Report the newest
+    // generation timestamp, i.e. the last one handed to the rewrite loop, even if
+    // rewriting that particular generation was given up on.
+    co_return tss.back();
+}
+
+future<> maybe_rewrite_streams_descriptions(
+        const database& db,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    if (!db.has_schema(sys_dist_ks->NAME, sys_dist_ks->CDC_DESC_V1)) {
+        // This cluster never went through a Scylla version which used this table
+        // or the user deleted the table. Nothing to do.
+        co_return;
+    }
+
+    if (co_await db::system_keyspace::cdc_is_rewritten()) {
+        co_return;
+    }
+
+    if (db.get_config().cdc_dont_rewrite_streams()) {
+        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
+        co_return;
+    }
+
+    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
+    std::vector<time_and_ttl> times_and_ttls;
+    for (auto& [_, cf] : db.get_column_families()) {
+        auto& s = *cf->schema();
+        auto base = cdc::get_base_table(db, s.ks_name(), s.cf_name());
+        if (!base) {
+            // Not a CDC log table.
+            continue;
+        }
+        auto& cdc_opts = base->cdc_options();
+        if (!cdc_opts.enabled()) {
+            // This table is named like a CDC log table but it's not one.
+            continue;
+        }
+
+        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id()), cdc_opts.ttl()});
+    }
+
+    if (times_and_ttls.empty()) {
+        // There's no point in rewriting old generations' streams (they don't contain any data).
+        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
+        co_return co_await db::system_keyspace::cdc_set_rewritten(std::nullopt);
+    }
+
+    // It's safe to discard this future: the coroutine keeps system_distributed_keyspace alive
+    // and the abort source's lifetime extends the lifetime of any other service.
+    (void)(([_times_and_ttls = std::move(times_and_ttls), _sys_dist_ks = std::move(sys_dist_ks),
+                _get_num_token_owners = std::move(get_num_token_owners), &_abort_src = abort_src] () mutable -> future<> {
+        auto times_and_ttls = std::move(_times_and_ttls);
+        auto sys_dist_ks = std::move(_sys_dist_ks);
+        auto get_num_token_owners = std::move(_get_num_token_owners);
+        auto& abort_src = _abort_src;
+
+        // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
+        // and some nodes that are UP may still be marked as DOWN by us.
+        // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
+        // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
+        co_await sleep_abortable(std::chrono::seconds(10), abort_src);
+
+        cdc_log.info("Rewriting stream tables in the background...");
+        auto last_rewritten = co_await rewrite_streams_descriptions(
+                std::move(times_and_ttls),
+                std::move(sys_dist_ks),
+                std::move(get_num_token_owners),
+                abort_src);
+
+        co_await db::system_keyspace::cdc_set_rewritten(last_rewritten);
+    })());
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -41,6 +41,7 @@
 #include "db_clock.hh"
 #include "dht/token.hh"
 #include "locator/token_metadata.hh"
+#include "utils/chunked_vector.hh"

 namespace seastar {
    class abort_source;
@@ -65,6 +66,7 @@ public:

    stream_id() = default;
    stream_id(bytes);
+    stream_id(dht::token, size_t);

    bool is_set() const;
    bool operator==(const stream_id&) const;
@@ -78,9 +80,6 @@ public:

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
-private:
-    friend class topology_description_generator;
-    stream_id(dht::token, size_t);
 };

 /* Describes a mapping of tokens to CDC streams in a token range.
@@ -113,7 +112,8 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -122,14 +122,19 @@ public:
 */ 
 class streams_version {
 public:
-    std::vector<stream_id> streams;
+    utils::chunked_vector<stream_id> streams;
    db_clock::time_point timestamp;
-    std::optional<db_clock::time_point> expired;

-    streams_version(std::vector<stream_id> s, db_clock::time_point ts, std::optional<db_clock::time_point> exp)
+    streams_version(utils::chunked_vector<stream_id> s, db_clock::time_point ts)
        : streams(std::move(s))
        , timestamp(ts)
-        , expired(std::move(exp))
+    {}
+};
+
+class no_generation_data_exception : public std::runtime_error {
+public:
+    no_generation_data_exception(db_clock::time_point generation_ts)
+        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
    {}
 };

@@ -194,4 +199,15 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source&);

+/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
+ * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
+ * table contains streams of all generations that were present in the old table and may still contain data
+ * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
+ * these generations). */
+future<> maybe_rewrite_streams_descriptions(
+        const database&,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
 } // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -709,16 +709,16 @@ private:
       }
       return false;
    }
-    bool compare(const T&, const value_type& v);
+    int32_t compare(const T&, const value_type& v);
 };

 template<>
-bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v.first);
 }

 template<>
-bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v);
 }

@@ -981,9 +981,9 @@ static bytes get_bytes(const atomic_cell_view& acv) {
    return acv.value().linearize();
 }

-static bytes_view get_bytes_view(const atomic_cell_view& acv, std::vector<bytes>& buf) {
+static bytes_view get_bytes_view(const atomic_cell_view& acv, std::forward_list<bytes>& buf) {
    return acv.value().is_fragmented()
-        ? bytes_view{buf.emplace_back(acv.value().linearize())}
+        ? bytes_view{buf.emplace_front(acv.value().linearize())}
        : acv.value().first_fragment();
 }

@@ -1138,9 +1138,9 @@ struct process_row_visitor {

                struct udt_visitor : public collection_visitor {
                    std::vector<bytes_opt> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::vector<bytes>& buf)
+                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _added_cells(num_keys), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1149,7 +1149,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                udt_visitor v(_ttl_column, type.size(), buf);

                visit_collection(v);
@@ -1168,9 +1168,9 @@ struct process_row_visitor {

                struct map_or_list_visitor : public collection_visitor {
                    std::vector<std::pair<bytes_view, bytes_view>> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    map_or_list_visitor(ttl_opt& ttl_column, std::vector<bytes>& buf)
+                    map_or_list_visitor(ttl_opt& ttl_column, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1179,7 +1179,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                map_or_list_visitor v(_ttl_column, buf);

                visit_collection(v);
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -99,8 +99,8 @@ listen_address: localhost
 # listen_on_broadcast_address: false

 # port for the CQL native transport to listen for clients on
-# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
-# To disable the CQL native transport, set this option to 0.
+# For security reasons, you should not expose this port to the internet. Firewall it if needed.
+# To disable the CQL native transport, remove this option and configure native_transport_port_ssl.
 native_transport_port: 9042

 # Like native_transport_port, but clients are forwarded to specific shards, based on the
--- a/configure.py
+++ b/configure.py
@@ -278,9 +278,10 @@ modes = {

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/anchorless_list_test',
    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
@@ -854,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
@@ -1031,7 +1033,7 @@ pure_boost_tests = set([
 ])

 tests_not_using_seastar_test_framework = set([
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/small_vector_test',
    'test/manual/gossip',
    'test/manual/message',
@@ -1105,7 +1107,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
 ]

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
-deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
+deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']

 deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
 deps['test/boost/raft_fsm_test'] =  ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
@@ -1967,7 +1969,7 @@ with open(buildfile_tmp, 'w') as f:
            command = ./dist/debian/debian_files_gen.py
        build $builddir/debian/debian: debian_files_gen | always
        rule extract_node_exporter
-            command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
+            command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
        build $builddir/node_exporter: extract_node_exporter | always
        ''').format(**globals()))

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -29,6 +29,7 @@

 #include "cql3/constants.hh"
 #include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
 #include "cql3/tuples.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/list.hh"
@@ -414,6 +415,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
                return equal(b, col, bag);
            });
@@ -580,6 +583,7 @@ value_list get_IN_values(
        if (val == constants::UNSET_VALUE) {
            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
        }
+        statements::request_validations::check_not_null(val, "Invalid null value for column %s", column_name);
        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
    }
    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -181,13 +181,18 @@ inline
 shared_ptr<function>
 make_from_json_function(database& db, const sstring& keyspace, data_type t) {
    return make_native_scalar_function<true>("fromjson", t, {utf8_type},
-            [&db, &keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
-        rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
-        bytes_opt parsed_json_value;
-        if (!json_value.IsNull()) {
-            parsed_json_value.emplace(from_json_object(*t, json_value, sf));
+            [&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+        try {
+            rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
+            bytes_opt parsed_json_value;
+            if (!json_value.IsNull()) {
+                parsed_json_value.emplace(from_json_object(*t, json_value, sf));
+            }
+            return parsed_json_value;
+        } catch(rjson::error& e) {
+            throw exceptions::function_execution_exception("fromJson",
+                format("Failed parsing fromJson parameter: {}", e.what()), keyspace, {t->name()});
        }
-        return parsed_json_value;
    });
 }

--- a/cql3/functions/native_scalar_function.hh
+++ b/cql3/functions/native_scalar_function.hh
@@ -78,7 +78,22 @@ public:
        return Pure;
    }
    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
-        return _func(sf, parameters);
+        try {
+            return _func(sf, parameters);
+        } catch(exceptions::cassandra_exception&) {
+            // If the function's code took the time to produce an official
+            // cassandra_exception, pass it through. Otherwise, below we will
+            // wrap the unknown exception in a function_execution_exception.
+            throw;
+        } catch(...) {
+            std::vector<sstring> args;
+            args.reserve(arg_types().size());
+            for (const data_type& a : arg_types()) {
+                args.push_back(a->name());
+            }
+            throw exceptions::function_execution_exception(name().name,
+                format("Failed execution of function {}: {}", name(), std::current_exception()), name().keyspace, std::move(args));
+        }
    }
 };

--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -668,10 +668,14 @@ struct internal_query_state {
    bool more_results = true;
 };

-::shared_ptr<internal_query_state> query_processor::create_paged_state(const sstring& query_string,
-        const std::initializer_list<data_value>& values, int32_t page_size) {
+::shared_ptr<internal_query_state> query_processor::create_paged_state(
+        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
+        const std::initializer_list<data_value>& values,
+        int32_t page_size) {
    auto p = prepare_internal(query_string);
-    auto opts = make_internal_options(p, values, db::consistency_level::ONE, infinite_timeout_config, page_size);
+    auto opts = make_internal_options(p, values, cl, timeout_config, page_size);
    ::shared_ptr<internal_query_state> res = ::make_shared<internal_query_state>(
            internal_query_state{
                    query_string,
@@ -935,17 +939,20 @@ bool query_processor::migration_subscriber::should_invalidate(
    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
        const std::initializer_list<data_value>& values,
+        int32_t page_size,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, values), std::move(f));
+    return for_each_cql_result(create_paged_state(query_string, cl, timeout_config, values, page_size), std::move(f));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, {}), std::move(f));
+    return query_internal(query_string, db::consistency_level::ONE, infinite_timeout_config, {}, 1000, std::move(f));
 }

 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -224,75 +224,52 @@ public:
    /*!
     * \brief iterate over all cql results using paging
     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
+     * You create a statement with optional parameters and pass
+     * a function that goes over the result rows.
     *
-     * The passed function would be called for all the results, return stop_iteration::yes
-     * to stop during iteration.
+     * The passed function would be called for all rows; return future<stop_iteration::yes>
+     * to stop iteration.
     *
     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
-                ....
-                ....
-                return stop_iteration::no;
-            });
-
-     * You can use place holder in the query, the prepared statement will only be done once.
-     *
-     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return false the iteration would stop
-     * args - arbitrary number of query parameters
-     */
-    template<typename... Args>
-    future<> query(
-            const sstring& query_string,
-            std::function<stop_iteration(const cql3::untyped_result_set_row&)>&& f,
-            Args&&... args) {
-        return for_each_cql_result(
-                create_paged_state(query_string, { data_value(std::forward<Args>(args))... }), std::move(f));
-    }
-
-    /*!
-     * \brief iterate over all cql results using paging
-     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
-     *
-     * The passed function would be called for all the results, return future<stop_iteration::yes>
-     * to stop during iteration.
-     *
-     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
+            return query_internal(
+                    "SELECT * from system.compaction_history",
+                    db::consistency_level::ONE,
+                    infinite_timeout_config,
+                    {},
+                    [&history] (const cql3::untyped_result_set::row& row) mutable {
                ....
                ....
                return make_ready_future<stop_iteration>(stop_iteration::no);
            });

-     * You can use place holder in the query, the prepared statement will only be done once.
+     * You can use placeholders in the query, the statement will only be prepared once.
     *
-     *
-     * query_string - the cql string, can contain place holder
-     * values - query parameters value
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * cl - consistency level of the query
+     * timeout_config - timeout configuration
+     * values - values to be substituted for the placeholders in the query
+     * page_size - maximum page size
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
+            db::consistency_level cl,
+            const timeout_config& timeout_config,
            const std::initializer_list<data_value>& values,
+            int32_t page_size,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

    /*
     * \brief iterate over all cql results using paging
-     * An overload of the query with future function without query parameters.
+     * An overload of query_internal without query parameters
+     * using CL = ONE, no timeout, and page size = 1000.
     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

@@ -354,8 +331,10 @@ private:
     */
    ::shared_ptr<internal_query_state> create_paged_state(
            const sstring& query_string,
-            const std::initializer_list<data_value>& = { },
-            int32_t page_size = 1000);
+            db::consistency_level,
+            const timeout_config&,
+            const std::initializer_list<data_value>&,
+            int32_t page_size);

    /*!
     * \brief run a query using paging
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -551,16 +551,27 @@ bool statement_restrictions::need_filtering() const {
        // clustering restrictions.  Therefore, a continuous clustering range is guaranteed.
        return false;
    }
-    if (!_clustering_columns_restrictions->needs_filtering(*_schema)) { // Guaranteed continuous clustering range.
-        return false;
-    }
-    // Now we know there are some clustering-column restrictions that are out-of-order or not EQ.  A naive base-table
-    // query must be filtered.  What about an index-table query?  That can only avoid filtering if there is exactly one
-    // EQ supported by an index.
-    return !(_clustering_columns_restrictions->size() == 1 && _has_queriable_ck_index);

-    // TODO: it is also possible to avoid filtering here if a non-empty CK prefix is specified and token_known, plus
-    // there's exactly one out-of-order-but-index-supported clustering-column restriction.
+    if (_has_queriable_ck_index && _uses_secondary_indexing) {
+        // In cases where we use an index, clustering column restrictions might cause the need for filtering.
+        // TODO: This is overly conservative, there are some cases when this returns true but filtering
+        // is not needed. Because of that the database will sometimes perform filtering when it's not actually needed.
+        // Query performance shouldn't be affected much, at most we will filter rows that are all correct.
+        // Here are some cases to consider:
+        // On a table with primary key (p, c1, c2, c3) with an index on c3
+        // WHERE c3 = ? - doesn't require filtering
+        // WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
+        // WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
+        // WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
+        return _clustering_columns_restrictions->size() > 1;
+    }
+    // Now we know that the query doesn't use an index.
+
+    // The only thing that can cause filtering now are the clustering columns.
+    return _clustering_columns_restrictions->needs_filtering(*_schema);
 }

 void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -306,6 +306,13 @@ create_index_statement::announce_migration(service::storage_proxy& proxy) const
                    format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
        }
    }
+    auto index_table_name = secondary_index::index_table_name(accepted_name);
+    if (db.has_schema(keyspace(), index_table_name)) {
+        return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
+            exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
+                accepted_name, index_table_name))
+        );
+    }
    ++_cql_stats->secondary_index_creates;
    schema_builder builder{schema};
    builder.with_index(index);
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -460,7 +460,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
        if (!view_col) {
            throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
        }
-        if (base_col.type != view_col->type) {
+        if (base_col.type->without_reversed() != *view_col->type) {
            throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
                    base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
        }
@@ -964,6 +964,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
    }

    auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
    return std::move(paging_state_copy);
@@ -1145,7 +1146,11 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
                if (single_ck_restrictions) {
                    auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
                    auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_clustering_key_restrictions>(_view_schema, *prefix_restrictions);
+                    const auto indexed_column = _view_schema->get_column_definition(to_bytes(_index.target_column()));
                    for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
+                        if (restriction_it.first == indexed_column) {
+                            continue; // In the index table, the indexed column is the partition (not clustering) key.
+                        }
                        clustering_restrictions->merge_with(restriction_it.second);
                    }
                }
--- a/database.cc
+++ b/database.cc
@@ -760,9 +760,6 @@ void database::set_format_by_config() {
 }

 database::~database() {
-    _read_concurrency_sem.clear_inactive_reads();
-    _streaming_concurrency_sem.clear_inactive_reads();
-    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -850,11 +847,22 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
            });
    }).then([&proxy, &mm, this] {
        return do_parse_schema_tables(proxy, db::schema_tables::VIEWS, [this, &proxy, &mm] (schema_result_value_type &v) {
-            return create_views_from_schema_partition(proxy, v.second).then([this, &mm] (std::vector<view_ptr> views) {
-                return parallel_for_each(views.begin(), views.end(), [this, &mm] (auto&& v) {
-                    return this->add_column_family_and_make_directory(v).then([this, &mm, v] {
-                        return maybe_update_legacy_secondary_index_mv_schema(mm.local(), *this, v);
-                    });
+            return create_views_from_schema_partition(proxy, v.second).then([this, &mm, &proxy] (std::vector<view_ptr> views) {
+                return parallel_for_each(views.begin(), views.end(), [this, &mm, &proxy] (auto&& v) {
+                    // TODO: Remove once computed columns are guaranteed to be featured in the whole cluster.
+                    // We fix the schema in place here in order to avoid races (write commands coming from other coordinators).
+                    view_ptr fixed_v = maybe_fix_legacy_secondary_index_mv_schema(*this, v, nullptr, preserve_version::yes);
+                    view_ptr v_to_add = fixed_v ? fixed_v : v;
+                    future<> f = this->add_column_family_and_make_directory(v_to_add);
+                    if (bool(fixed_v)) {
+                        v_to_add = fixed_v;
+                        auto&& keyspace = find_keyspace(v->ks_name()).metadata();
+                        auto mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(v), fixed_v, api::new_timestamp(), true);
+                        f = f.then([this, &proxy, mutations = std::move(mutations)] {
+                            return db::schema_tables::merge_schema(proxy, _feat, std::move(mutations));
+                        });
+                    }
+                    return f;
                });
            });
        });
@@ -1940,7 +1948,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
    auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
    sstring accepted_name = base_name;
    int i = 0;
-    while (existing_names.contains(accepted_name)) {
+    auto name_accepted = [&] {
+        auto index_table_name = secondary_index::index_table_name(accepted_name);
+        return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
+    };
+    while (!name_accepted()) {
        accepted_name = base_name + "_" + std::to_string(++i);
    }
    return accepted_name;
@@ -2005,6 +2017,13 @@ future<>
 database::stop() {
    assert(!_large_data_handler->running());

+    // Inactive reads might hold on to sstables, blocking the
+    // `sstables_manager::close()` calls below. No one will come back for these
+    // reads at this point so clear them before proceeding with the shutdown.
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
+
    // try to ensure that CL has done disk flushing
    future<> maybe_shutdown_commitlog = _commitlog != nullptr ? _commitlog->shutdown() : make_ready_future<>();
    return maybe_shutdown_commitlog.then([this] {
--- a/database.hh
+++ b/database.hh
@@ -240,9 +240,13 @@ public:
        return _memtables.back();
    }

-    // The caller has to make sure the element exist before calling this.
+    // # 8904 - this method is akin to std::set::erase(key_type), not
+    // erase(iterator). It should tolerate an element that does not exist.
    void erase(const shared_memtable& element) {
-        _memtables.erase(boost::range::find(_memtables, element));
+        auto i = boost::range::find(_memtables, element);
+        if (i != _memtables.end()) {
+            _memtables.erase(i);
+        }
    }
    void clear() {
        _memtables.clear();
@@ -893,7 +897,7 @@ public:
        return _pending_writes_phaser.start();
    }

-    future<> await_pending_writes() {
+    future<> await_pending_writes() noexcept {
        return _pending_writes_phaser.advance_and_await();
    }

@@ -905,7 +909,7 @@ public:
        return _pending_reads_phaser.start();
    }

-    future<> await_pending_reads() {
+    future<> await_pending_reads() noexcept {
        return _pending_reads_phaser.advance_and_await();
    }

@@ -917,7 +921,7 @@ public:
        return _pending_streams_phaser.start();
    }

-    future<> await_pending_streams() {
+    future<> await_pending_streams() noexcept {
        return _pending_streams_phaser.advance_and_await();
    }

@@ -925,11 +929,11 @@ public:
        return _pending_streams_phaser.operations_in_progress();
    }

-    future<> await_pending_flushes() {
+    future<> await_pending_flushes() noexcept {
        return _pending_flushes_phaser.advance_and_await();
    }

-    future<> await_pending_ops() {
+    future<> await_pending_ops() noexcept {
        return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
    }

--- a/db/config.cc
+++ b/db/config.cc
@@ -780,6 +780,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Time period in seconds after which unused schema versions will be evicted from the local schema registry cache. Default is 1 second.")
    , max_concurrent_requests_per_shard(this, "max_concurrent_requests_per_shard",liveness::LiveUpdate, value_status::Used, std::numeric_limits<uint32_t>::max(),
        "Maximum number of concurrent requests a single shard can handle before it starts shedding extra load. By default, no requests will be shed.")
+    , cdc_dont_rewrite_streams(this, "cdc_dont_rewrite_streams", value_status::Used, false,
+            "Disable rewriting streams from cdc_streams_descriptions to cdc_streams_descriptions_v2. Should not be necessary, but the procedure is expensive and prone to failures; this config option is left as a backdoor in case some user requires manual intervention.")
    , alternator_port(this, "alternator_port", value_status::Used, 0, "Alternator API port")
    , alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port")
    , alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address")
--- a/db/config.hh
+++ b/db/config.hh
@@ -322,6 +322,7 @@ public:
    named_value<unsigned> user_defined_function_contiguous_allocation_limit_bytes;
    named_value<uint32_t> schema_registry_grace_period;
    named_value<uint32_t> max_concurrent_requests_per_shard;
+    named_value<bool> cdc_dont_rewrite_streams;

    named_value<uint16_t> alternator_port;
    named_value<uint16_t> alternator_https_port;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -120,10 +120,9 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
 future<> manager::stop() {
    manager_logger.info("Asked to stop");

-    if (_strorage_service_anchor) {
-        _strorage_service_anchor->unregister_subscriber(this);
-    }
+  auto f = _strorage_service_anchor ? _strorage_service_anchor->unregister_subscriber(this) : make_ready_future<>();

+  return f.finally([this] {
    set_stopping();

    return _draining_eps_gate.close().finally([this] {
@@ -134,6 +133,7 @@ future<> manager::stop() {
            manager_logger.info("Stopped");
        }).discard_result();
    });
+  });
 }

 future<> manager::compute_hints_dir_device_id() {
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -124,7 +124,7 @@ static future<> try_record(std::string_view large_table, const sstables::sstable
    const auto sstable_name = sst.get_filename();
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes)", desc, ks_name, cf_name, pk_str, extra_path, size);
+    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
    return db::qctx->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -140,9 +140,10 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s
 void cql_table_large_data_handler::log_too_many_rows(const sstables::sstable& sst, const sstables::key& partition_key,
        uint64_t rows_count) const {
    const schema& s = *sst.get_schema();
-    large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows)",
+    const auto sstable_name = sst.get_filename();
+    large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows) to {}",
                           s.ks_name(), s.cf_name(), partition_key.to_partition_key(s).with_schema(s),
-                           rows_count);
+                           rows_count, sstable_name);
 }

 future<> cql_table_large_data_handler::record_large_cells(const sstables::sstable& sst, const sstables::key& partition_key,
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1204,7 +1204,42 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
        return create_table_from_mutations(proxy, std::move(sm));
    });
    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm) {
-        return create_view_from_mutations(proxy, std::move(sm));
+        // The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
+        // If we don't do it we are leaving a window where write commands to this schema are illegal.
+        // There are 3 possibilities:
+        // 1. The table was altered - in this case we want the view to correspond to this new table schema.
+        // 2. The table was just created - the table is guaranteed to be published with the view in that case.
+        // 3. The view itself was altered - in that case we already know the base table so we can take it from
+        //    the database object.
+        view_ptr vp = create_view_from_mutations(proxy, std::move(sm));
+        schema_ptr base_schema;
+        for (auto&& s : tables_diff.altered) {
+            if (s.new_schema.get()->ks_name() == vp->ks_name() && s.new_schema.get()->cf_name() == vp->view_info()->base_name() ) {
+                base_schema = s.new_schema;
+                break;
+            }
+        }
+        if (!base_schema) {
+            for (auto&& s : tables_diff.created) {
+                if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) {
+                    base_schema = s;
+                    break;
+                }
+            }
+        }
+
+        if (!base_schema) {
+            base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name());
+        }
+
+        // Now that we have a referenced base - just in case we are registering an old view (this can happen in a mixed cluster) -
+        // let's make it write-enabled by updating its computed columns.
+        view_ptr fixed_vp = maybe_fix_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema, preserve_version::yes);
+        if(fixed_vp) {
+            vp = fixed_vp;
+        }
+        vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema));
+        return vp;
    });

    proxy.local().get_db().invoke_on_all([&] (database& db) {
@@ -3032,8 +3067,7 @@ std::vector<sstring> all_table_names(schema_features features) {
           boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
 }

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v) {
-    // TODO(sarna): Remove once computed columns are guaranteed to be featured in the whole cluster.
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version) {
    // Legacy format for a secondary index used a hardcoded "token" column, which ensured a proper
    // order for indexed queries. This "token" column is now implemented as a computed column,
    // but for the sake of compatibility we assume that there might be indexes created in the legacy
@@ -3041,26 +3075,32 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
    if (v->clustering_key_size() == 0) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
    }
    const column_definition& first_view_ck = v->clustering_key_columns().front();
    if (first_view_ck.is_computed()) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
+    }
+
+    if (!base_schema) {
+        base_schema = db.find_schema(v->view_info()->base_id());
    }

-    table& base = db.find_column_family(v->view_info()->base_id());
-    schema_ptr base_schema = base.schema();
    // If the first clustering key part of a view is a column with name not found in base schema,
    // it implies it might be backing an index created before computed columns were introduced,
    // and as such it must be recreated properly.
    if (!base_schema->columns_by_name().contains(first_view_ck.name())) {
        schema_builder builder{schema_ptr(v)};
        builder.mark_column_computed(first_view_ck.name(), std::make_unique<legacy_token_column_computation>());
-        return mm.announce_view_update(view_ptr(builder.build()));
+        if (preserve_version) {
+            builder.with_version(v->version());
+        }
+        return view_ptr(builder.build());
    }
-    return make_ready_future<>();
+    return view_ptr(nullptr);
 }

+
 namespace legacy {

 table_schema_version schema_mutations::digest() const {
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -239,7 +239,9 @@ std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata

 std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v);
+class preserve_version_tag {};
+using preserve_version = bool_class<preserve_version_tag>;
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version);

 sstring serialize_kind(column_kind kind);
 column_kind deserialize_kind(sstring kind);
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -43,9 +43,13 @@

 namespace db {

-future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name) {
+future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
    auto& ks = _db.local().find_keyspace(ks_name);
-    return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name)] (auto& pair) {
+    return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
+        auto& cf_name = pair.first;
+        if (filter && std::find(filter->begin(), filter->end(), cf_name) == filter->end()) {
+            return make_ready_future<>();
+        }        
        auto& cf = _db.local().find_column_family(pair.second);
        return cf.snapshot_exists(name).then([ks_name = std::move(ks_name), name] (bool exists) {
            if (exists) {
@@ -111,7 +115,7 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
    }

    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag)] {
-        return check_snapshot_not_exist(ks_name, tag).then([this, ks_name, tables = std::move(tables), tag] {
+        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag] {
            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag](const std::vector<sstring>& tables) {
                return do_for_each(tables, [ks_name, tag, this] (const sstring& table_name) {
                    if (table_name.find(".") != sstring::npos) {
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -40,6 +40,8 @@

 #pragma once

+#include <vector>
+
 #include <seastar/core/sharded.hh>
 #include <seastar/core/future.hh>
 #include "database.hh"
@@ -112,7 +114,7 @@ private:
    seastar::rwlock _lock;
    seastar::gate _ops;

-    future<> check_snapshot_not_exist(sstring ks_name, sstring name);
+    future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});

    template <typename Func>
    std::result_of_t<Func()> run_snapshot_modify_operation(Func&&);
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -35,12 +35,14 @@

 #include <seastar/core/seastar.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/coroutine.hh>
+#include <seastar/core/future-util.hh>

 #include <boost/range/adaptor/transformed.hpp>

 #include <optional>
 #include <vector>
-#include <optional>
+#include <set>

 extern logging::logger cdc_log;

@@ -91,12 +93,31 @@ schema_ptr cdc_generations() {
 /* A user-facing table providing identifiers of the streams used in CDC generations. */
 schema_ptr cdc_desc() {
    thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC);
-        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC, {id})
+        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2);
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2, {id})
                /* The timestamp of this CDC generation. */
                .with_column("time", timestamp_type, column_kind::partition_key)
-                /* The set of stream identifiers used in this CDC generation. */
+                /* For convenience, the list of stream IDs in this generation is split into token ranges
+                 * which the stream IDs were mapped to (by the partitioner) when the generation was created.  */
+                .with_column("range_end", long_type, column_kind::clustering_key)
+                /* The set of stream identifiers used in this CDC generation for the token range
+                 * ending on `range_end`. */
                .with_column("streams", cdc_streams_set_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+    }();
+    return schema;
+}
+
+/* A user-facing table providing CDC generation timestamps. */
+schema_ptr cdc_timestamps() {
+    thread_local auto schema = [] {
+        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS);
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS, {id})
+                /* This is a single-partition table. The partition key is always "timestamps". */
+                .with_column("key", utf8_type, column_kind::partition_key)
+                /* The timestamp of this CDC generation. */
+                .with_column("time", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
                /* Expiration time of this CDC generation (or null if not expired). */
                .with_column("expired", timestamp_type)
                .with_version(system_keyspace::generate_schema_version(id))
@@ -105,11 +126,14 @@ schema_ptr cdc_desc() {
    return schema;
 }

+static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
+
 static std::vector<schema_ptr> all_tables() {
    return {
        view_build_status(),
        cdc_generations(),
        cdc_desc(),
+        cdc_timestamps(),
    };
 }

@@ -117,13 +141,15 @@ bool system_distributed_keyspace::is_extra_durable(const sstring& cf_name) {
    return cf_name == CDC_TOPOLOGY_DESCRIPTION;
 }

-system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm)
+system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
        : _qp(qp)
-        , _mm(mm) {
+        , _mm(mm)
+        , _sp(sp) {
 }

 future<> system_distributed_keyspace::start() {
    if (this_shard_id() != 0) {
+        _started = true;
        return make_ready_future<>();
    }

@@ -148,18 +174,18 @@ future<> system_distributed_keyspace::start() {
                });
            });
        });
-    });
+    }).then([this] { _started = true; });
 }

 future<> system_distributed_keyspace::stop() {
    return make_ready_future<>();
 }

-static const timeout_config internal_distributed_timeout_config = [] {
-    using namespace std::chrono_literals;
-    const auto t = 10s;
+static timeout_config get_timeout_config(db::timeout_clock::duration t) {
    return timeout_config{ t, t, t, t, t, t, t };
-}();
+}
+
+static const timeout_config internal_distributed_timeout_config = get_timeout_config(std::chrono::seconds(10));

 future<std::unordered_map<utils::UUID, sstring>> system_distributed_keyspace::view_status(sstring ks_name, sstring view_name) const {
    return _qp.execute_internal(
@@ -326,24 +352,69 @@ system_distributed_keyspace::expire_cdc_topology_description(
            false).discard_result();
 }

-static set_type_impl::native_type prepare_cdc_streams(const std::vector<cdc::stream_id>& streams) {
-    set_type_impl::native_type ret;
-    for (auto& s: streams) {
-        ret.push_back(data_value(s.to_bytes()));
+static future<std::vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
+        const database& db,
+        db_clock::time_point time,
+        const cdc::topology_description& desc) {
+    auto s = db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2);
+
+    auto ts = api::new_timestamp();
+    std::vector<mutation> res;
+    res.emplace_back(s, partition_key::from_singular(*s, time));
+    size_t size_estimate = 0;
+    for (auto& e : desc.entries()) {
+        // We want to keep each mutation below ~1 MB.
+        if (size_estimate >= 1000 * 1000) {
+            res.emplace_back(s, partition_key::from_singular(*s, time));
+            size_estimate = 0;
+        }
+
+        set_type_impl::native_type streams;
+        streams.reserve(e.streams.size());
+        for (auto& stream : e.streams) {
+            streams.push_back(data_value(stream.to_bytes()));
+        }
+
+        // We estimate 20 bytes per stream ID.
+        // Stream IDs themselves weigh 16 bytes each (2 * sizeof(int64_t))
+        // but there's metadata to be taken into account.
+        // It has been verified experimentally that 20 bytes per stream ID is a good estimate.
+        size_estimate += e.streams.size() * 20;
+        res.back().set_cell(clustering_key::from_singular(*s, dht::token::to_int64(e.token_range_end)),
+                to_bytes("streams"), make_set_value(cdc_streams_set_type, std::move(streams)), ts);
+
+        co_await make_ready_future<>(); // maybe yield
    }
-    return ret;
+
+    co_return res;
 }

 future<>
 system_distributed_keyspace::create_cdc_desc(
        db_clock::time_point time,
-        const std::vector<cdc::stream_id>& streams,
+        const cdc::topology_description& desc,
        context ctx) {
-    return _qp.execute_internal(
-            format("INSERT INTO {}.{} (time, streams) VALUES (?,?)", NAME, CDC_DESC),
+    using namespace std::chrono_literals;
+
+    auto ms = co_await get_cdc_streams_descriptions_v2_mutation(_qp.db(), time, desc);
+    co_await max_concurrent_for_each(ms, 20, [&] (mutation& m) -> future<> {
+        // We use the storage_proxy::mutate API since CQL is not the best for handling large batches.
+        co_await _sp.mutate(
+            { std::move(m) },
+            quorum_if_many(ctx.num_token_owners),
+            db::timeout_clock::now() + 10s,
+            nullptr, // trace_state
+            empty_service_permit(),
+            false // raw_counters
+        );
+    });
+
+    // Commit the description.
+    co_await _qp.execute_internal(
+            format("INSERT INTO {}.{} (key, time) VALUES (?, ?)", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
-            { time, make_set_value(cdc_streams_set_type, prepare_cdc_streams(streams)) },
+            { CDC_TIMESTAMPS_KEY, time },
            false).discard_result();
 }

@@ -353,7 +424,7 @@ system_distributed_keyspace::expire_cdc_desc(
        db_clock::time_point expiration_time,
        context ctx) {
    return _qp.execute_internal(
-            format("UPDATE {}.{} SET expired = ? WHERE time = ?", NAME, CDC_DESC),
+            format("UPDATE {}.{} SET expired = ? WHERE time = ?", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
            { expiration_time, streams_ts },
@@ -364,11 +435,44 @@ future<bool>
 system_distributed_keyspace::cdc_desc_exists(
        db_clock::time_point streams_ts,
        context ctx) {
-    return _qp.execute_internal(
-            format("SELECT time FROM {}.{} WHERE time = ?", NAME, CDC_DESC),
+    // Reading from this table on a freshly upgraded node that is the first to announce the CDC_TIMESTAMPS
+    // schema would most likely result in replicas refusing to return data, telling the node that they can't
+    // find the schema. Indeed, it takes some time for the nodes to synchronize their schema; schema is
+    // only eventually consistent.
+    //
+    // This problem doesn't occur on writes since writes enforce schema pull if the receiving replica
+    // notices that the write comes from an unknown schema, but it does occur on reads.
+    //
+    // Hence we work around it with a hack: we send a mutation with an empty partition to force our replicas
+    // to pull the schema.
+    //
+    // This is not strictly necessary; the code that calls this function does it in a retry loop
+    // so eventually, after the schema gets pulled, the read would succeed.
+    // Still, the errors are also unnecessary and if we can get rid of them - let's do it.
+    //
+    // FIXME: find a more elegant way to deal with this ``problem''.
+    if (!_forced_cdc_timestamps_schema_sync) {
+        using namespace std::chrono_literals;
+        auto s = _qp.db().find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS);
+        mutation m(s, partition_key::from_singular(*s, CDC_TIMESTAMPS_KEY));
+        co_await _sp.mutate(
+            { std::move(m) },
+            quorum_if_many(ctx.num_token_owners),
+            db::timeout_clock::now() + 10s,
+            nullptr, // trace_state
+            empty_service_permit(),
+            false // raw_counters
+        );
+
+        _forced_cdc_timestamps_schema_sync = true;
+    }
+
+    // At this point replicas know the schema, we can perform the actual read...
+    co_return co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ? AND time = ?", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
-            { streams_ts },
+            { CDC_TIMESTAMPS_KEY, streams_ts },
            false
    ).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) -> bool {
        return !cql_result->empty() && cql_result->one().has("time");
@@ -376,27 +480,76 @@ system_distributed_keyspace::cdc_desc_exists(
 }

 future<std::map<db_clock::time_point, cdc::streams_version>> 
-system_distributed_keyspace::cdc_get_versioned_streams(context ctx) {
-    return _qp.execute_internal(
-            format("SELECT * FROM {}.{}", NAME, CDC_DESC),
+system_distributed_keyspace::cdc_get_versioned_streams(db_clock::time_point not_older_than, context ctx) {
+    auto timestamps_cql = co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ?", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
-            {},
-            false
-    ).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
-        std::map<db_clock::time_point, cdc::streams_version> result;
+            { CDC_TIMESTAMPS_KEY },
+            false);

-        for (auto& row : *cql_result) {
-            auto ts = row.get_as<db_clock::time_point>("time");
-            auto exp = row.get_opt<db_clock::time_point>("expired");
-            std::vector<cdc::stream_id> ids;
-            row.get_list_data<bytes>("streams", std::back_inserter(ids)); 
-            result.emplace(ts, cdc::streams_version(std::move(ids), ts, exp));
+    std::vector<db_clock::time_point> timestamps;
+    timestamps.reserve(timestamps_cql->size());
+    for (auto& row : *timestamps_cql) {
+        timestamps.push_back(row.get_as<db_clock::time_point>("time"));
+    }
+
+    // `time` is the table's clustering key, so the results are already sorted
+    auto first = std::lower_bound(timestamps.rbegin(), timestamps.rend(), not_older_than);
+    // need first gen _intersecting_ the timestamp.
+    if (first != timestamps.rbegin()) {
+        --first;
+    }
+
+    std::map<db_clock::time_point, cdc::streams_version> result;
+    co_await max_concurrent_for_each(first, timestamps.rend(), 5, [this, &ctx, &result] (db_clock::time_point ts) -> future<> {
+        auto streams_cql = co_await _qp.execute_internal(
+                format("SELECT streams FROM {}.{} WHERE time = ?", NAME, CDC_DESC_V2),
+                quorum_if_many(ctx.num_token_owners),
+                internal_distributed_timeout_config,
+                { ts },
+                false);
+
+        utils::chunked_vector<cdc::stream_id> ids;
+        for (auto& row : *streams_cql) {
+            row.get_list_data<bytes>("streams", std::back_inserter(ids));
+            co_await make_ready_future<>(); // maybe yield
        }

-        return result;
+        result.emplace(ts, cdc::streams_version{std::move(ids), ts});
    });
+
+    co_return result;
 }

+future<db_clock::time_point> 
+system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
+    auto timestamp_cql = co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ? limit 1", NAME, CDC_TIMESTAMPS),
+            quorum_if_many(ctx.num_token_owners),
+            internal_distributed_timeout_config,
+            { CDC_TIMESTAMPS_KEY },
+            false);
+
+    co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
+}
+
+future<std::vector<db_clock::time_point>>
+system_distributed_keyspace::get_cdc_desc_v1_timestamps(context ctx) {
+    std::vector<db_clock::time_point> res;
+    co_await _qp.query_internal(
+            format("SELECT time FROM {}.{}", NAME, CDC_DESC_V1),
+            quorum_if_many(ctx.num_token_owners),
+            // This is a long and expensive scan (mostly due to #8061).
+            // Give it a bit more time than usual.
+            get_timeout_config(std::chrono::seconds(60)),
+            {},
+            1000,
+            [&] (const cql3::untyped_result_set_row& r) {
+        res.push_back(r.get_as<db_clock::time_point>("time"));
+        return make_ready_future<stop_iteration>(stop_iteration::no);
+    });
+    co_return res;
+}

 }
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -41,6 +41,10 @@ namespace cdc {
    class streams_version;
 } // namespace cdc

+namespace service {
+    class storage_proxy;
+}
+
 namespace db {

 class system_distributed_keyspace {
@@ -51,8 +55,16 @@ public:
    /* Nodes use this table to communicate new CDC stream generations to other nodes. */
    static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";

-    /* This table is used by CDC clients to learn about avaliable CDC streams. */
-    static constexpr auto CDC_DESC = "cdc_streams_descriptions";
+    /* This table is used by CDC clients to learn about available CDC streams. */
+    static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
+
+    /* Used by CDC clients to learn CDC generation timestamps. */
+    static constexpr auto CDC_TIMESTAMPS = "cdc_generation_timestamps";
+
+    /* Previous version of the "cdc_streams_descriptions_v2" table.
+     * We use it in the upgrade procedure to ensure that CDC generations appearing
+     * in the old table also appear in the new table, if necessary. */
+    static constexpr auto CDC_DESC_V1 = "cdc_streams_descriptions";

    /* Information required to modify/query some system_distributed tables, passed from the caller. */
    struct context {
@@ -62,17 +74,23 @@ public:
 private:
    cql3::query_processor& _qp;
    service::migration_manager& _mm;
+    service::storage_proxy& _sp;
+
+    bool _started = false;
+    bool _forced_cdc_timestamps_schema_sync = false;

 public:
    /* Should writes to the given table always be synchronized by commitlog (flushed to disk)
     * before being acknowledged? */
    static bool is_extra_durable(const sstring& cf_name);

-    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&);
+    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);

    future<> start();
    future<> stop();

+    bool started() const { return _started; }
+
    future<std::unordered_map<utils::UUID, sstring>> view_status(sstring ks_name, sstring view_name) const;
    future<> start_view_build(sstring ks_name, sstring view_name) const;
    future<> finish_view_build(sstring ks_name, sstring view_name) const;
@@ -82,11 +100,18 @@ public:
    future<std::optional<cdc::topology_description>> read_cdc_topology_description(db_clock::time_point streams_ts, context);
    future<> expire_cdc_topology_description(db_clock::time_point streams_ts, db_clock::time_point expiration_time, context);

-    future<> create_cdc_desc(db_clock::time_point streams_ts, const std::vector<cdc::stream_id>&, context);
+    future<> create_cdc_desc(db_clock::time_point streams_ts, const cdc::topology_description&, context);
    future<> expire_cdc_desc(db_clock::time_point streams_ts, db_clock::time_point expiration_time, context);
    future<bool> cdc_desc_exists(db_clock::time_point streams_ts, context);

-    future<std::map<db_clock::time_point, cdc::streams_version>> cdc_get_versioned_streams(context);
+    /* Get all generation timestamps appearing in the "cdc_streams_descriptions" table
+     * (the old CDC stream description table). */
+    future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(context);
+
+    future<std::map<db_clock::time_point, cdc::streams_version>> cdc_get_versioned_streams(db_clock::time_point not_older_than, context);
+
+    future<db_clock::time_point> cdc_current_generation_timestamp(context);
+
 };

 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1574,6 +1574,21 @@ future<> update_cdc_streams_timestamp(db_clock::time_point tp) {
            .discard_result().then([] { return force_blocking_flush(v3::CDC_LOCAL); });
 }

+static const sstring CDC_REWRITTEN_KEY = "rewritten";
+
+future<> cdc_set_rewritten(std::optional<db_clock::time_point> tp) {
+    if (tp) {
+        return qctx->execute_cql(
+                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
+                CDC_REWRITTEN_KEY, *tp).discard_result();
+    } else {
+        // Insert just the row marker.
+        return qctx->execute_cql(
+                format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL),
+                CDC_REWRITTEN_KEY).discard_result();
+    }
+}
+
 future<> force_blocking_flush(sstring cfname) {
    assert(qctx);
    return qctx->_qp.invoke_on_all([cfname = std::move(cfname)] (cql3::query_processor& qp) {
@@ -1646,6 +1661,14 @@ future<std::optional<db_clock::time_point>> get_saved_cdc_streams_timestamp() {
    });
 }

+future<bool> cdc_is_rewritten() {
+    // We don't care about the actual timestamp; it's additional information for debugging purposes.
+    return qctx->execute_cql(format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL), CDC_REWRITTEN_KEY)
+            .then([] (::shared_ptr<cql3::untyped_result_set> msg) {
+        return !msg->empty();
+    });
+}
+
 bool bootstrap_complete() {
    return get_bootstrap_state() == bootstrap_state::COMPLETED;
 }
@@ -1864,7 +1887,7 @@ future<> get_compaction_history(compaction_history_consumer&& f) {
    return do_with(compaction_history_consumer(std::move(f)),
            [](compaction_history_consumer& consumer) mutable {
        sstring req = format("SELECT * from system.{}", COMPACTION_HISTORY);
-        return qctx->qp().query(req, [&consumer] (const cql3::untyped_result_set::row& row) mutable {
+        return qctx->qp().query_internal(req, [&consumer] (const cql3::untyped_result_set::row& row) mutable {
            compaction_history_entry entry;
            entry.id = row.get_as<utils::UUID>("id");
            entry.ks = row.get_as<sstring>("keyspace_name");
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -634,5 +634,8 @@ future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& pr
 future<> save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout);
 future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);

+future<bool> cdc_is_rewritten();
+future<> cdc_set_rewritten(std::optional<db_clock::time_point>);
+
 } // namespace system_keyspace
 } // namespace db
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -1170,7 +1170,7 @@ get_view_natural_endpoint(const sstring& keyspace_name,
 }

 static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<gms::inet_address>&& pending_endpoints,
-        frozen_mutation_and_schema& mut, const dht::token& base_token, const dht::token& view_token,
+        frozen_mutation_and_schema&& mut, const dht::token& base_token, const dht::token& view_token,
        service::allow_hints allow_hints, tracing::trace_state_ptr tr_state) {

    tracing::trace(tr_state, "Sending view update for {}.{} to {}, with pending endpoints = {}; base token = {}; view token = {}",
@@ -1189,7 +1189,7 @@ static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<
 // appropriate paired replicas. This is done asynchronously - we do not wait
 // for the writes to complete.
 future<> mutate_MV(
-        const dht::token& base_token,
+        dht::token base_token,
        std::vector<frozen_mutation_and_schema> view_updates,
        db::view::stats& stats,
        cf_stats& cf_stats,
@@ -1205,27 +1205,7 @@ future<> mutate_MV(
        auto& keyspace_name = mut.s->ks_name();
        auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto remote_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
-        auto maybe_account_failure = [tr_state, &stats, &cf_stats, units = pending_view_updates.split(mut.fm.representation().size())] (
-                future<>&& f,
-                gms::inet_address target,
-                bool is_local,
-                size_t remotes) {
-            if (f.failed()) {
-                stats.view_updates_failed_local += is_local;
-                stats.view_updates_failed_remote += remotes;
-                cf_stats.total_view_updates_failed_local += is_local;
-                cf_stats.total_view_updates_failed_remote += remotes;
-                auto ep = f.get_exception();
-                tracing::trace(tr_state, "Failed to apply {}view update for {} and {} remote endpoints",
-                        seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
-                vlogger.error("Error applying view update to {}: {}", target, ep);
-                return make_exception_future<>(std::move(ep));
-            } else {
-                tracing::trace(tr_state, "Successfully applied {}view update for {} and {} remote endpoints",
-                        seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
-                return make_ready_future<>();
-            }
-        };
+        auto sem_units = pending_view_updates.split(mut.fm.representation().size());

        // First, find the local endpoint and ensure that if it exists,
        // it will be the target endpoint. That way, all endpoints in the
@@ -1262,11 +1242,20 @@ future<> mutate_MV(
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
            future<> local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
-                    [&stats,
-                     maybe_account_failure = std::move(maybe_account_failure),
-                     mut_ptr = std::move(mut_ptr)] (future<>&& f) {
+                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
+                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
-                return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
+                if (f.failed()) {
+                    ++stats.view_updates_failed_local;
+                    ++cf_stats.total_view_updates_failed_local;
+                    auto ep = f.get_exception();
+                    tracing::trace(tr_state, "Failed to apply local view update for {}", my_address);
+                    vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
+                            my_address, s->ks_name(), s->cf_name(), base_token, view_token, ep);
+                    return make_exception_future<>(std::move(ep));
+                }
+                tracing::trace(tr_state, "Successfully applied local view update for {}", my_address);
+                return make_ready_future<>();
            });
            fs->push_back(std::move(local_view_update));
            // We just applied a local update to the target endpoint, so it should now be removed
@@ -1288,11 +1277,23 @@ future<> mutate_MV(
            size_t updates_pushed_remote = remote_endpoints.size() + 1;
            stats.view_updates_pushed_remote += updates_pushed_remote;
            cf_stats.total_view_updates_pushed_remote += updates_pushed_remote;
-            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), mut, base_token, view_token, allow_hints, tr_state).then_wrapped(
-                    [target_endpoint,
-                     updates_pushed_remote,
-                     maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
-                return maybe_account_failure(std::move(f), std::move(*target_endpoint), false, updates_pushed_remote);
+            schema_ptr s = mut.s;
+            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
+                    [s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
+                            units = sem_units.split(sem_units.count())] (future<>&& f) mutable {
+                if (f.failed()) {
+                    stats.view_updates_failed_remote += updates_pushed_remote;
+                    cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
+                    auto ep = f.get_exception();
+                    tracing::trace(tr_state, "Failed to apply view update for {} and {} remote endpoints",
+                            *target_endpoint, updates_pushed_remote);
+                    vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
+                            *target_endpoint, s->ks_name(), s->cf_name(), base_token, view_token, ep);
+                    return make_exception_future<>(std::move(ep));
+                }
+                tracing::trace(tr_state, "Successfully applied view update for {} and {} remote endpoints",
+                        *target_endpoint, updates_pushed_remote);
+                return make_ready_future<>();
            });
            if (wait_for_all) {
                fs->push_back(std::move(view_update));
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -153,7 +153,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
 future<> mutate_MV(
-        const dht::token& base_token,
+        dht::token base_token,
        std::vector<frozen_mutation_and_schema> view_updates,
        db::view::stats& stats,
        cf_stats& cf_stats,
--- a/digester.hh
+++ b/digester.hh
@@ -58,7 +58,8 @@ public:

    template<typename T, typename... Args>
    void feed_hash(const T& value, Args&&... args) {
-        std::visit([&] (auto& hasher) noexcept -> void {
+        // FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
+        std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
            ::feed_hash(hasher, value, std::forward<Args>(args)...);
        }, _impl);
    };
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -87,7 +87,8 @@ WantedBy=multi-user.target
            run('sysctl -p /etc/sysctl.d/99-scylla-coredump.conf', shell=True, check=True)

        fp = tempfile.NamedTemporaryFile()
-        fp.write(b'kill -SEGV $$')
+        fp.write(b'ulimit -c unlimited\n')
+        fp.write(b'kill -SEGV $$\n')
        fp.flush()
        p = subprocess.Popen(['/bin/bash', fp.name], stdout=subprocess.PIPE)
        pid = p.pid
--- a/dist/common/scripts/scylla_cpuscaling_setup
+++ b/dist/common/scripts/scylla_cpuscaling_setup
@@ -22,6 +22,7 @@

 import os
 import sys
+import argparse
 import shlex
 import distro
 from scylla_util import *
@@ -46,7 +47,12 @@ if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
-    if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
+    parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
+    parser.add_argument('--force', dest='force', action='store_true',
+                        help='force running setup even CPU scaling unsupported')
+    args = parser.parse_args()
+
+    if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
        print('This computer doesn\'t supported CPU scaling configuration.')
        sys.exit(0)
    if not is_debian_variant():
@@ -56,6 +62,11 @@ if __name__ == '__main__':
        if not shutil.which('cpufreq-set'):
            pkg_install('cpufrequtils')
    if is_debian_variant():
+        try:
+            ondemand = systemd_unit('ondemand')
+            ondemand.disable()
+        except:
+            pass
        cfg = sysconfig_parser('/etc/default/cpufrequtils')
        cfg.set('GOVERNOR', 'performance')
        cfg.commit()
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -244,17 +244,17 @@ if __name__ == "__main__":
                # and https://cloud.google.com/compute/docs/disks/local-ssd#nvme
                # note that scylla iotune might measure more, this is GCP recommended
                mbs=1024*1024
-                if nr_disks >= 1 & nr_disks < 4:
+                if nr_disks >= 1 and nr_disks < 4:
                    disk_properties["read_iops"] = 170000 * nr_disks
                    disk_properties["read_bandwidth"] = 660 * mbs * nr_disks
                    disk_properties["write_iops"] = 90000 * nr_disks
                    disk_properties["write_bandwidth"] = 350 * mbs * nr_disks
-                elif nr_disks >= 4 & nr_disks <= 8:
+                elif nr_disks >= 4 and nr_disks <= 8:
                    disk_properties["read_iops"] = 680000
                    disk_properties["read_bandwidth"] = 2650 * mbs
                    disk_properties["write_iops"] = 360000
                    disk_properties["write_bandwidth"] = 1400 * mbs
-                elif nr_disks == "16":
+                elif nr_disks == 16:
                    disk_properties["read_iops"] = 1600000
                    disk_properties["read_bandwidth"] = 4521251328
                    #below is google, above is our measured
@@ -263,7 +263,7 @@ if __name__ == "__main__":
                    disk_properties["write_bandwidth"] = 2759452672
                    #below is google, above is our measured
                    #disk_properties["write_bandwidth"] = 3120 * mbs
-                elif nr_disks == "24":
+                elif nr_disks == 24:
                    disk_properties["read_iops"] = 2400000
                    disk_properties["read_bandwidth"] = 5921532416
                    #below is google, above is our measured
@@ -281,3 +281,5 @@ if __name__ == "__main__":
                run_iotune()
        else:
            run_iotune()
+        os.chmod(etcdir() + '/scylla.d/io_properties.yaml', 0o644)
+        os.chmod(etcdir() + '/scylla.d/io.conf', 0o644)
--- a/dist/common/scripts/scylla_ntp_setup
+++ b/dist/common/scripts/scylla_ntp_setup
@@ -90,12 +90,12 @@ if __name__ == '__main__':
            with open('/etc/ntp.conf') as f:
                conf = f.read()
            if args.subdomain:
-                conf2 = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', 'server \\1.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
+                conf2 = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', '\\1 \\2.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
                with open('/etc/ntp.conf', 'w') as f:
                    f.write(conf2)
                conf = conf2
-            match = re.search(r'^server\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
-            server = match.group(1)
+            match = re.search(r'^(server|pool)\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
+            server = match.group(2)
            ntpd = systemd_unit('ntpd.service')
            ntpd.stop()
            # ignore error, ntpd may able to adjust clock later
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -82,6 +82,7 @@ def create_perftune_conf(cfg):
        yaml = run('/opt/scylladb/scripts/perftune.py ' + params, shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        with open('/etc/scylla.d/perftune.yaml', 'w') as f:
            f.write(yaml)
+        os.chmod('/etc/scylla.d/perftune.yaml', 0o644)
        return True
    else:
        return False
@@ -141,4 +142,3 @@ if __name__ == '__main__':
            print(f'Exception occurred while creating perftune.yaml: {e}')
            print('To fix the error, please re-run scylla_setup.')
            sys.exit(1)
-
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -27,8 +27,11 @@ import grp
 import sys
 import stat
 import distro
+from pathlib import Path
 from scylla_util import *
 from subprocess import run
+import distro
+from pkg_resources import parse_version

 if __name__ == '__main__':
    if os.getuid() > 0:
@@ -85,8 +88,14 @@ if __name__ == '__main__':
            raiddevs_to_try = [args.raiddev, ]
        for fsdev in raiddevs_to_try:
            raiddevname = os.path.basename(fsdev)
-            if not os.path.exists(f'/sys/block/{raiddevname}/md/array_state'):
+            array_state = Path(f'/sys/block/{raiddevname}/md/array_state')
+            # mdX is not allocated
+            if not array_state.exists():
                break
+            with array_state.open() as f:
+                # allocated, but no devices, not running
+                if f.read().strip() == 'clear':
+                    break
            print(f'{fsdev} is already using')
        else:
            if args.raiddev is None:
@@ -108,6 +117,25 @@ if __name__ == '__main__':
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
+    # XXX: Workaround for mdmonitor.service issue on CentOS8
+    if is_redhat_variant() and distro.version() == '8':
+        mdadm_rpm = run('rpm -q mdadm', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+        match = re.match(r'^mdadm-([0-9]+\.[0-9]+-[a-zA-Z0-9]+)\.', mdadm_rpm)
+        mdadm_version = match.group(1)
+        if parse_version('4.1-14') < parse_version(mdadm_version):
+            repo_data = '''
+[BaseOS_8_3_2011]
+name=CentOS8.3.2011 - Base
+baseurl=http://vault.centos.org/8.3.2011/BaseOS/$basearch/os/
+gpgcheck=1
+enabled=0
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial
+'''[1:-1]
+            with open('/etc/yum.repos.d/CentOS-Vault-8.3.repo', 'w') as f:
+                f.write(repo_data)
+            run('dnf downgrade --enablerepo=BaseOS_8_3_2011 -y mdadm', shell=True, check=True)
+            run('dnf install -y python3-dnf-plugin-versionlock', shell=True, check=True)
+            run('dnf versionlock add mdadm', shell=True, check=True)
    try:
        md_service = systemd_unit('mdmonitor.service')
    except SystemdException:
@@ -156,7 +184,7 @@ Before=scylla-server.service
 After={after}

 [Mount]
-What=UUID={uuid}
+What=/dev/disk/by-uuid/{uuid}
 Where={mount_at}
 Type=xfs
 Options=noatime
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -176,11 +176,6 @@ def warn_offline(setup):
 def warn_offline_missing_pkg(setup, pkg):
    colorprint('{red}{setup} disabled by default, since {pkg} not available.{nocolor}', setup=setup, pkg=pkg)

-def current_umask():
-    current = os.umask(0)
-    os.umask(current)
-    return current
-
 if __name__ == '__main__':
    if not is_nonroot() and os.getuid() > 0:
        print('Requires root permission.')
@@ -331,12 +326,6 @@ if __name__ == '__main__':
    selinux_reboot_required = False
    set_clocksource = False

-    umask = current_umask()
-    # files have to be world-readable
-    if not is_nonroot() and (umask & 0o7) != 0o2:
-        colorprint('{red}Scylla does not work with current umask setting ({umask}),\nplease restore umask to the default value (0022).{nocolor}', umask='{0:o}'.format(umask).zfill(4))
-        sys.exit(1)
-
    if interactive:
        colorprint('{green}Skip any of the following steps by answering \'no\'{nocolor}')

@@ -375,11 +364,13 @@ if __name__ == '__main__':
            if version_check:
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: True\n')
+                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
                systemd_unit('scylla-housekeeping-daily.timer').unmask()
                systemd_unit('scylla-housekeeping-restart.timer').unmask()
            else:
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: False\n')
+                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
                hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
                hk_daily.mask()
                hk_daily.stop()
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -36,6 +36,7 @@ from subprocess import run, DEVNULL
 import distro
 from scylla_sysconfdir import SYSCONFDIR

+from multiprocessing import cpu_count

 def scriptsdir_p():
    p = Path(sys.argv[0]).resolve()
@@ -146,6 +147,11 @@ class gcp_instance:
            if af == socket.AF_INET:
                addr, port = sa
                if addr == "169.254.169.254":
+                    # Make sure it is not on GKE
+                    try:
+                        gcp_instance().__instance_metadata("machine-type")
+                    except urllib.error.HTTPError:
+                        return False
                    return True
        return False

@@ -315,9 +321,10 @@ class gcp_instance:
                    logging.warning(
                        "This machine doesn't have enough CPUs for allocated number of NVMEs (at least 32 cpus for >=16 disks). Performance will suffer.")
                    return False
-                diskSize = self.firstNvmeSize
                if diskCount < 1:
+                    logging.warning("No ephemeral disks were found.")
                    return False
+                diskSize = self.firstNvmeSize
                max_disktoramratio = 105
                # 30:1 Disk/RAM ratio must be kept at least(AWS), we relax this a little bit
                # on GCP we are OK with {max_disktoramratio}:1 , n1-standard-2 can cope with 1 disk, not more
@@ -380,6 +387,8 @@ class aws_instance:
            raise Exception("found more than one disk mounted at root'".format(root_dev_candidates))

        root_dev = root_dev_candidates[0].device
+        if root_dev == '/dev/root':
+            root_dev = run('findmnt -n -o SOURCE /', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        nvmes_present = list(filter(nvme_re.match, os.listdir("/dev")))
        return {"root": [ root_dev ], "ephemeral": [ x for x in nvmes_present if not root_dev.startswith(os.path.join("/dev/", x)) ] }

--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -1,2 +1,2 @@
 # Raise max AIO events
-fs.aio-max-nr = 1048576
+fs.aio-max-nr = 5578536
--- a/dist/common/systemd/scylla-fstrim.timer
+++ b/dist/common/systemd/scylla-fstrim.timer
@@ -1,7 +1,5 @@
 [Unit]
 Description=Run Scylla fstrim daily
-After=scylla-server.service
-BindsTo=scylla-server.service

 [Timer]
 OnCalendar=Sat *-*-* 00:00:00
--- a/dist/debian/debian/rules
+++ b/dist/debian/debian/rules
@@ -29,11 +29,11 @@ ifeq ($(product),scylla)
 	dh_installinit --no-start
 else
 	dh_installinit --no-start --name scylla-server
+	dh_installinit --no-start --name scylla-node-exporter
 endif
 	dh_installinit --no-start --name scylla-housekeeping-daily
 	dh_installinit --no-start --name scylla-housekeeping-restart
 	dh_installinit --no-start --name scylla-fstrim
-	dh_installinit --no-start --name node-exporter

 override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,9 +9,9 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
-    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.postrm
+++ b/dist/debian/debian/scylla-server.postrm
@@ -12,8 +12,6 @@ case "$1" in
        if [ "$1" = "purge" ]; then
            rm -rf /etc/systemd/system/scylla-server.service.d/
        fi
-        rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
-        rm -f /etc/systemd/system/var-lib-scylla.mount
        ;;
 esac

--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.4/latest/scylla.repo
+ARG VERSION=4.4.9

 ADD scylla_bashrc /scylla_bashrc

--- a/dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
@@ -4,3 +4,4 @@ stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
 stderr_logfile=/dev/stderr
 stderr_logfile_maxbytes=0
+stopwaitsecs=900
--- a/dist/docker/redhat/scyllasetup.py
+++ b/dist/docker/redhat/scyllasetup.py
@@ -121,12 +121,13 @@ class ScyllaSetup:
        if self._apiAddress is not None:
            args += ["--api-address %s" % self._apiAddress]

-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
            args += ["--alternator-port %s" % self._alternatorPort]

        if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
            args += ["--alternator-https-port %s" % self._alternatorHttpsPort]

        if self._alternatorWriteIsolation is not None:
--- a/dist/offline_installer/redhat/build_offline_installer.sh
+++ b/dist/offline_installer/redhat/build_offline_installer.sh
@@ -26,26 +26,31 @@ fi
 print_usage() {
    echo "build_offline_installer.sh --repo [URL]"
    echo "  --repo  repository for fetching scylla rpm, specify .repo file URL"
-    echo "  --releasever  use specific minor version of the distribution repo (ex: 7.4)"
+    echo "  --image [IMAGE]  Use the specified docker IMAGE"
+    echo "  --no-docker  Build offline installer without using docker"
    exit 1
 }

-is_rhel7_variant() {
-    [ "$ID" = "rhel" -o "$ID" = "ol" -o "$ID" = "centos" ] && [[ "$VERSION_ID" =~ ^7 ]]
-}
+here="$(realpath $(dirname "$0"))"
+releasever=`rpm -q --provides $(rpm -q --whatprovides "system-release(releasever)") | grep "system-release(releasever)"| uniq |  cut -d ' ' -f 3`

 REPO=
-RELEASEVER=
+IMAGE=docker.io/centos:7
+NO_DOCKER=false
 while [ $# -gt 0 ]; do
    case "$1" in
        "--repo")
            REPO=$2
            shift 2
            ;;
-        "--releasever")
-            RELEASEVER=$2
+        "--image")
+            IMAGE=$2
            shift 2
            ;;
+        "--no-docker")
+            NO_DOCKER=true
+            shift 1
+            ;;
        *)
            print_usage
            ;;
@@ -59,25 +64,17 @@ if [ -z $REPO ]; then
    exit 1
 fi

-if ! is_rhel7_variant; then
-    echo "Unsupported distribution"
-    exit 1
-fi
-
-if [ "$ID" = "centos" ]; then
-    if [ ! -f /etc/yum.repos.d/epel.repo ]; then
-        sudo yum install -y epel-release
+if ! $NO_DOCKER; then
+    if [[ -f ~/.config/scylladb/dbuild ]]; then
+        . ~/.config/scylladb/dbuild
    fi
-    RELEASE=7
-else
-    if [ ! -f /etc/yum.repos.d/epel.repo ]; then
-        sudo rpm -Uvh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+    if which docker >/dev/null 2>&1 ; then
+      tool=${DBUILD_TOOL-docker}
+    elif which podman >/dev/null 2>&1 ; then
+      tool=${DBUILD_TOOL-podman}
+    else
+      echo "Please make sure you install either podman or docker on this machine to run dbuild" && exit 1
    fi
-    RELEASE=7Server
-fi
-
-if [ ! -f /usr/bin/yumdownloader ]; then
-    sudo yum -y install yum-utils
 fi

 if [ ! -f /usr/bin/wget ]; then
@@ -85,29 +82,55 @@ if [ ! -f /usr/bin/wget ]; then
 fi

 if [ ! -f /usr/bin/makeself ]; then
-    sudo yum -y install makeself
+    if $NO_DOCKER; then
+        # makeself on EPEL7 is too old, borrow it from EPEL8
+        # since the package has no dependencies, it just works
+        if [ $release_major = '7' ]; then
+            sudo rpm --import https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8
+            sudo cp "$here"/lib/epel8.repo /etc/yum.repos.d/
+            YUM_OPTS="--enablerepo=epel8"
+        elif [ $release_major = '8' ]; then
+            yum -y install epel-release || yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+        fi
+    fi
+    sudo yum -y install "$YUM_OPTS" makeself
 fi

 if [ ! -f /usr/bin/createrepo ]; then
    sudo yum -y install createrepo
 fi

-sudo yum -y install yum-plugin-downloadonly
+makeself_ver=$(makeself --version|cut -d ' ' -f 3|sed -e 's/\.//g')
+if [ $makeself_ver -lt 240 ]; then
+    echo "$(makeself --version) is too old, please install 2.4.0 or later"
+    exit 1
+fi

-cd /etc/yum.repos.d/
-sudo wget -N $REPO
-cd -
-
-sudo rm -rf build/installroot build/offline_installer build/scylla_offline_installer.sh
+sudo rm -rf build/installroot build/offline_docker build/offline_installer build/scylla_offline_installer.sh
 mkdir -p build/installroot
 mkdir -p build/installroot/etc/yum/vars
-sudo sh -c "echo $RELEASE >> build/installroot/etc/yum/vars/releasever"
+
+mkdir -p build/offline_docker
+wget "$REPO" -O build/offline_docker/scylla.repo
+cp "$here"/lib/install_deps.sh build/offline_docker
+cp "$here"/lib/Dockerfile.in build/offline_docker/Dockerfile
+sed -i -e "s#@@IMAGE@@#$IMAGE#" build/offline_docker/Dockerfile
+
+cd build/offline_docker
+if $NO_DOCKER; then
+    sudo cp scylla.repo /etc/yum.repos.d/scylla.repo
+    sudo ./install_deps.sh
+else
+    image_id=$($tool build -q .)
+fi
+cd -

 mkdir -p build/offline_installer
-cp dist/offline_installer/redhat/header build/offline_installer
-if [ -n "$RELEASEVER" ]; then
-    YUMOPTS="--releasever=$RELEASEVER"
+cp "$here"/lib/header build/offline_installer
+if $NO_DOCKER; then
+    "$here"/lib/construct_offline_repo.sh
+else
+    ./tools/toolchain/dbuild --image "$image_id" -- "$here"/lib/construct_offline_repo.sh
 fi
-sudo yum -y install $YUMOPTS --downloadonly --installroot=`pwd`/build/installroot --downloaddir=build/offline_installer scylla sudo ntp ntpdate net-tools kernel-tools
 (cd build/offline_installer; createrepo -v .)
-(cd build; makeself offline_installer scylla_offline_installer.sh "Scylla offline package" ./header)
+(cd build; makeself --keep-umask offline_installer scylla_offline_installer.sh "Scylla offline package" ./header)
--- a/dist/offline_installer/redhat/lib/Dockerfile.in
+++ b/dist/offline_installer/redhat/lib/Dockerfile.in
@@ -0,0 +1,5 @@
+FROM @@IMAGE@@
+ADD install_deps.sh install_deps.sh
+RUN ./install_deps.sh
+ADD scylla.repo /etc/yum.repos.d/scylla.repo
+CMD /bin/bash
--- a/dist/offline_installer/redhat/lib/construct_offline_repo.sh
+++ b/dist/offline_installer/redhat/lib/construct_offline_repo.sh
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+releasever=`rpm -q --provides $(rpm -q --whatprovides "system-release(releasever)") | grep "system-release(releasever)"| uniq |  cut -d ' ' -f 3`
+
+# Errors can be ignored: the copy is only needed when the files exist
+cp /etc/yum/vars/* build/installroot/etc/yum/vars/ ||:
+
+# run yum in non-root mode using fakeroot
+fakeroot yum -y install --downloadonly --releasever="$releasever" --installroot=`pwd`/build/installroot --downloaddir=build/offline_installer scylla sudo chrony net-tools kernel-tools mdadm xfsprogs
--- a/dist/offline_installer/redhat/lib/epel8.repo
+++ b/dist/offline_installer/redhat/lib/epel8.repo
@@ -0,0 +1,7 @@
+[epel8]
+name=Extra Packages for Enterprise Linux 8 - $basearch
+#baseurl=https://download.fedoraproject.org/pub/epel/8/Everything/$basearch
+metalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir
+enabled=0
+gpgcheck=1
+countme=1
--- a/dist/offline_installer/redhat/lib/header
+++ b/dist/offline_installer/redhat/lib/header
--- a/dist/offline_installer/redhat/lib/install_deps.sh
+++ b/dist/offline_installer/redhat/lib/install_deps.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+. /etc/os-release
+
+release_major=$(echo $VERSION_ID|sed -e 's/^\([0-9]*\)[^0-9]*.*/\1/')
+
+if [ ! -f /etc/yum.repos.d/epel.repo ]; then
+    yum -y install epel-release || yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-"$release_major".noarch.rpm
+fi
+if [ ! -f /usr/bin/fakeroot ]; then
+    yum -y install fakeroot
+fi
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{reloc_pkg}
-Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
+Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
 Obsoletes:	scylla-server < 1.1

 %global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
-Requires:       %{product}-conf %{product}-python3
+Requires:       %{product}-conf  = %{version} %{product}-python3 = %{version}
 Conflicts:      abrt
 AutoReqProv:    no

@@ -78,13 +78,18 @@ getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sh
 %post server
 /opt/scylladb/scripts/scylla_post_install.sh

-%systemd_post scylla-server.service
+if [ $1 -eq 1 ] ; then
+    /usr/bin/systemctl preset scylla-server.service ||:
+fi

 %preun server
-%systemd_preun scylla-server.service
+if [ $1 -eq 0 ] ; then
+    /usr/bin/systemctl --no-reload disable scylla-server.service ||:
+    /usr/bin/systemctl stop scylla-server.service ||:
+fi

 %postun server
-%systemd_postun scylla-server.service
+/usr/bin/systemctl daemon-reload ||:

 %posttrans server
 if  [ -d /tmp/%{name}-%{version}-%{release} ]; then
@@ -137,9 +142,9 @@ rm -rf $RPM_BUILD_ROOT
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
 /etc/systemd/system/scylla-server.service.d/dependencies.conf
-%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
+%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
 %ghost /etc/systemd/system/scylla-cpupower.service
-%ghost /etc/systemd/system/var-lib-scylla.mount
+%ghost %config /etc/systemd/system/var-lib-scylla.mount

 %package conf
 Group:          Applications/Databases
@@ -205,9 +210,9 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
@@ -224,10 +229,18 @@ URL:            https://github.com/prometheus/node_exporter
 Prometheus exporter for machine metrics, written in Go with pluggable metric collectors.

 %post node-exporter
-%systemd_post node-exporter.service
+if [ $1 -eq 1 ] ; then
+    /usr/bin/systemctl preset scylla-node-exporter.service ||:
+fi

 %preun node-exporter
-%systemd_preun node-exporter.service
+if [ $1 -eq 0 ] ; then
+    /usr/bin/systemctl --no-reload disable scylla-node-exporter.service ||:
+    /usr/bin/systemctl stop scylla-node-exporter.service ||:
+fi
+
+%postun node-exporter
+/usr/bin/systemctl daemon-reload ||:

 %files node-exporter
 %defattr(-,root,root)
--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -87,25 +87,13 @@ progresses and compatibility continues to improve.
 * UpdateTable: Not supported.
 * ListTables: Supported.
 ### Item Operations
-* GetItem: Support almost complete except that projection expressions can
-  only ask for top-level attributes.
-* PutItem: Support almost complete except that condition expressions can
-  only refer to to-level attributes.
-* UpdateItem: Nested documents are supported but updates to nested attributes
-  are not (e.g., `SET a.b[3].c=val`), and neither are nested attributes in
-  condition expressions.
-* DeleteItem: Mostly works, but again does not support nested attributes
-  in condition expressions.
+* GetItem, PutItem, UpdateItem, DeleteItem fully supported.
 ### Batch Operations
-* BatchGetItem: Almost complete except that projection expressions can only
-  ask for top-level attributes.
-* BatchWriteItem: Supported. Doesn't limit the number of items (DynamoDB
-  limits to 25) or size of items (400 KB) or total request size (16 MB).
+* BatchGetItem, BatchWriteItem fully supported.
+  Doesn't limit the number of items (DynamoDB limits to 25) or size of items
+  (400 KB) or total request size (16 MB).
 ### Scans
 Scan and Query are mostly supported, with the following limitations:
-* As above, projection expressions only support top-level attributes.
-* The ScanFilter/QueryFilter parameter for filtering results is fully
-  supported, but the newer FilterExpression syntax is not yet supported.
 * The "Select" options which allows to count items instead of returning them
  is not yet supported.
 ### Secondary Indexes
@@ -297,11 +285,10 @@ policies" section.
 DynamoDB allows attributes to be **nested** - a top-level attribute may
 be a list or a map, and each of its elements may further be lists or
 maps, etc. Alternator currently stores the entire content of a top-level
-attribute as one JSON object. This is good enough for most needs, except
-one DynamoDB feature which we cannot support safely: we cannot modify
-a non-top-level attribute (e.g., a.b[3].c) directly without RMW. We plan
-to fix this in a future version by rethinking the data model we use for
-attributes, or rethinking our implementation of RMW (as explained above).
+attribute as one JSON object. This means that UpdateItem requests which
+want to modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
+Alternator implements such requests by reading the entire top-level
+attribute a, modifying only a.b[3].c, and then writing back a.

 ```eval_rst
 .. toctree::
@@ -309,4 +296,4 @@ attributes, or rethinking our implementation of RMW (as explained above).

    getting-started
    compatibility
-```
+```
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -61,12 +61,6 @@ behave the same in Alternator. However, there are a few features which we have
 not implemented yet. Unimplemented features return an error when used, so
 they should be easy to detect. Here is a list of these unimplemented features:

-* Missing support for **atribute paths** like `a.b[3].c`.
-  Nested attributes _are_ supported, but Alternator does not yet allow reading
-  or writing directly a piece of a nested attributes using an attribute path -
-  only top-level attributes can be read or written directly.
-  https://github.com/scylladb/scylla/issues/5024
-
 * Currently in Alternator, a GSI (Global Secondary Index) can only be added
  to a table at table creation time. Unlike DynamoDB which also allows adding
  a GSI (but not an LSI) to an existing table using an UpdateTable operation.
--- a/docs/design-notes/cdc.md
+++ b/docs/design-notes/cdc.md
@@ -146,6 +146,15 @@ Next, the node starts gossiping the timestamp of the new generation together wit
        }).get();
 ```

+The node persists the currently gossiped timestamp in the `system.cdc_local` table in order to recover it on restart. This is the schema:
+```
+CREATE TABLE system.cdc_local (
+    key text PRIMARY KEY,
+    streams_timestamp timestamp
+) ...
+```
+The timestamp is kept under the `"cdc_local"` key in the `streams_timestamp` column.
+
 When other nodes learn about the generation, they'll extract it from the `cdc_generation_descriptions` table and save it using `cdc::metadata::insert(db_clock::time_point, topology_description&&)`.
 Notice that nodes learn about the generation together with the new node's tokens. When they learn about its tokens they'll immediately start sending writes to the new node (in the case of bootstrapping, it will become a pending replica). But the old generation will still be operating for a minute or two. Thus colocation will be lost for a while. This problem will be fixed when the two-phase-commit approach is implemented.

@@ -157,9 +166,54 @@ Due to the need of maintaining colocation we don't allow the client to send writ
 Suppose that a write is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`). Then we only allow the write if its timestamp is in the interval [`T`, `C + generation_leeway`), where `generation_leeway` is a small time-inteval constant (e.g. 5 seconds).
 Reason: we cannot allow writes before `T`, because they belong to the old generation whose token ranges might no longer refine the current vnodes, so the corresponding log write would not necessarily be colocated with the base write. We also cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.

-### Streams description table
+### Streams description tables

-The `cdc_streams_descriptions` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. It's definition is as follows (db/system_distributed_keyspace.cc):
+The `cdc_streams_descriptions_v2` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. Its definition is as follows (db/system_distributed_keyspace.cc):
+```
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2, {id})
+                /* The timestamp of this CDC generation. */
+                .with_column("time", timestamp_type, column_kind::partition_key)
+                /* For convenience, the list of stream IDs in this generation is split into token ranges
+                 * which the stream IDs were mapped to (by the partitioner) when the generation was created.  */
+                .with_column("range_end", long_type, column_kind::clustering_key)
+                /* The set of stream identifiers used in this CDC generation for the token range
+                 * ending on `range_end`. */
+                .with_column("streams", cdc_streams_set_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+```
+where
+```
+thread_local data_type cdc_stream_tuple_type = tuple_type_impl::get_instance({long_type, long_type});
+thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(cdc_stream_tuple_type, false);
+```
+This table contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation grouped by token ranges that the stream IDs are mapped to. It is meant to be user-facing, in contrast to `cdc_generation_descriptions` which is used internally.
+
+There is a second table that contains just the generations' timestamps, `cdc_generation_timestamps`:
+```
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS, {id})
+                /* This is a single-partition table. The partition key is always "timestamps". */
+                .with_column("key", utf8_type, column_kind::partition_key)
+                /* The timestamp of this CDC generation. */
+                .with_column("time", timestamp_type, column_kind::clustering_key)
+                /* Expiration time of this CDC generation (or null if not expired). */
+                .with_column("expired", timestamp_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+```
+It is a single-partition table, containing the timestamps of generations found in `cdc_streams_descriptions_v2` in separate clustered rows. It allows clients to efficiently query if there are any new generations, e.g.:
+```
+SELECT time FROM system_distributed.cdc_generation_timestamps WHERE key = 'timestamps' AND time > X
+```
+where `X` is the last timestamp known by that particular client.
+
+When nodes learn about a CDC generation through gossip, they race to update these description tables by first inserting the set of rows containing this generation's stream IDs into `cdc_streams_descriptions_v2` and then, if the node succeeds, by inserting its timestamp into `cdc_generation_timestamps` (see `cdc::update_streams_description`). This operation is idempotent so it doesn't matter if multiple nodes do it at the same time.
+
+Note that the first phase of inserting stream IDs may fail in the middle; in that case, the partition for that generation may contain partial information. Thus a client can only safely read a partition from `cdc_streams_descriptions_v2` (i.e. without the risk of observing only a part of the stream IDs) if they first observe its timestamp in `cdc_generation_timestamps`.
+
+### Streams description table V1 and rewriting
+
+As the name suggests, `cdc_streams_descriptions_v2` is the second version of the streams description table. The previous schema was:
 ```
        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC, {id})
                /* The timestamp of this CDC generation. */
@@ -171,14 +225,26 @@ The `cdc_streams_descriptions` table in the `system_distributed` keyspace allows
                .with_version(system_keyspace::generate_schema_version(id))
                .build();
 ```
-where
-```
-thread_local data_type cdc_stream_tuple_type = tuple_type_impl::get_instance({long_type, long_type});
-thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(cdc_stream_tuple_type, false);
-```
-This table simply contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation. It is meant to be user-facing, in contrast to `cdc_generation_descriptions` which is used internally.

-When nodes learn about a CDC generation through gossip, they race to update the description table by inserting a proper row (see `cdc::update_streams_description`). This operation is idempotent so it doesn't matter if multiple nodes do it at the same time.
+The entire set of stream IDs (for all token ranges) was stored as a single collection. With large clusters the collection could grow quite big: for example, with 100 nodes, 64 shards each, and 256 vnodes per node, a new generation would contain 1.6M stream IDs, resulting in a ~32MB collection. For reasons described in issue #7993 this would disqualify the previous schema.
+
+However, that was the schema used in the Scylla 4.3 release. For clusters that used CDC with this schema we need to ensure that stream descriptions residing in the old table appear in the new table as well (if necessary, i.e. if these streams may still contain some data).
+
+To do that, we perform a rewrite procedure. Each node does the following on restart:
+1. Check if the `system_distributed.cdc_streams_descriptions` table exists. If it doesn't, there's nothing to rewrite, so stop.
+2. Check if the `system.cdc_local` table contains a row with `key = "rewritten"`. If it does then rewrite was already performed, so stop.
+3. Check if there is a table with CDC enabled. If not, add a row with `key = "rewritten"` to `system.cdc_local` and stop; no rewriting is necessary (and won't be) since old generations - even if they exist - are not needed.
+4. Retrieve all generation timestamps from the old streams description table by performing a full range scan: `select time from system_distributed.cdc_streams_descriptions`. This may be a long/expensive operation, hence it's performed in a background task (the procedure is moved to background in this step).
+5. Filter out timestamps that are "too old". A generation timestamp is "too old" if there is a greater timestamp `T` such that for every table with CDC enabled, `now - ttl > T`, where `now` is the current time and `ttl` is the table's TTL setting. This means that the table cannot contain data that belongs to the "too old" generation. Thus, if each table passes this check for a given generation, that generation doesn't need to be rewritten.
+6. For each timestamp that's left:
+6.1 if it's already present in the new table, skip it (we check this by querying `cdc_generation_timestamps`)
+6.2 fetch the generation (by querying `cdc_generation_descriptions`)
+6.3 insert the generation's streams into the new table
+7. Insert a row with `key = "rewritten"` into `system.cdc_local`.
+
+Note that every node will perform this procedure on upgrade, but there's a high chance that only one of them actually proceeds all the way to step 6.2 if upgrade is performed correctly, i.e. in a rolling fashion (nodes are restarted one-by-one).
+
+In order to prevent new nodes from doing the rewriting (we only want upgrading nodes to do it), we insert the `key = "rewritten"` row on bootstrap as well, before we start this procedure (so the node won't pass the second check).

 #### TODO: expired generations
-The `expired` column in `cdc_streams_descriptions` and `cdc_generation_descriptions` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.
+The `expired` column in `cdc_generation_timestamps` and `cdc_generation_descriptions` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.
--- a/exceptions/exceptions.hh
+++ b/exceptions/exceptions.hh
@@ -340,4 +340,18 @@ public:
    unsupported_operation_exception(const sstring& msg) : std::runtime_error("unsupported operation: " + msg) {}
 };

+class function_execution_exception : public cassandra_exception {
+public:
+    const sstring ks_name;
+    const sstring func_name;
+    const std::vector<sstring> args;
+    function_execution_exception(sstring func_name_, sstring detail, sstring ks_name_, std::vector<sstring> args_) noexcept
+        : cassandra_exception{exception_code::FUNCTION_FAILURE,
+            format("execution of {} failed: {}", func_name_, detail)}
+        , ks_name(std::move(ks_name_))
+        , func_name(std::move(func_name_))
+        , args(std::move(args_))
+    { }
+};
+
 }
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1445,7 +1445,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as alive {}", addr);

    // Do not mark a node with status shutdown as UP.
-    auto status = get_gossip_status(local_state);
+    auto status = sstring(get_gossip_status(local_state));
    if (status == sstring(versioned_value::SHUTDOWN)) {
        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
        return;
@@ -1464,6 +1464,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
        return;
    }

+    // Make a copy for endpoint_state because the code below can yield
+    endpoint_state state = local_state;
    _live_endpoints.push_back(addr);
    if (_endpoints_to_talk_with.empty()) {
        _endpoints_to_talk_with.push_back({addr});
@@ -1475,8 +1477,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

-    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
-        subscriber->on_alive(addr, local_state);
+    _subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
+        subscriber->on_alive(addr, state);
        logger.trace("Notified {}", fmt::ptr(subscriber.get()));
    });
 }
@@ -1485,11 +1487,12 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
 void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as down {}", addr);
    local_state.mark_dead();
+    endpoint_state state = local_state;
    _live_endpoints.resize(std::distance(_live_endpoints.begin(), std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr)));
    _unreachable_endpoints[addr] = now();
-    logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
-    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
-        subscriber->on_dead(addr, local_state);
+    logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(state));
+    _subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
+        subscriber->on_dead(addr, state);
        logger.trace("Notified {}", fmt::ptr(subscriber.get()));
    });
 }
@@ -1792,6 +1795,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
                }).handle_exception_type([node, &fall_back_to_syn_msg] (seastar::rpc::unknown_verb_error&) {
                    logger.warn("Node {} does not support get_endpoint_states verb", node);
                    fall_back_to_syn_msg = true;
+                }).handle_exception_type([node, &nodes_down] (seastar::rpc::timeout_error&) {
+                    logger.warn("The get_endpoint_states verb to node {} was timeout", node);
                }).handle_exception_type([node, &nodes_down] (seastar::rpc::closed_error&) {
                    nodes_down++;
                    logger.warn("Node {} is down for get_endpoint_states verb", node);
--- a/hashing.hh
+++ b/hashing.hh
@@ -62,7 +62,7 @@ struct appending_hash;
 template<typename H, typename T, typename... Args>
 requires Hasher<H>
 inline
-void feed_hash(H& h, const T& value, Args&&... args) noexcept {
+void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, args...))) {
    appending_hash<T>()(h, value, std::forward<Args>(args)...);
 };

--- a/idl/partition_checksum.idl.hh
+++ b/idl/partition_checksum.idl.hh
@@ -103,22 +103,3 @@ enum class repair_row_level_start_status: uint8_t {
 struct repair_row_level_start_response {
    repair_row_level_start_status status;
 };
-
-enum class node_ops_cmd : uint32_t {
-     removenode_prepare,
-     removenode_heartbeat,
-     removenode_sync_data,
-     removenode_abort,
-     removenode_done,
-};
-
-struct node_ops_cmd_request {
-    node_ops_cmd cmd;
-    utils::UUID ops_uuid;
-    std::list<gms::inet_address> ignore_nodes;
-    std::list<gms::inet_address> leaving_nodes;
-};
-
-struct node_ops_cmd_response {
-    bool ok;
-};
--- a/install.sh
+++ b/install.sh
@@ -150,6 +150,10 @@ EOF
    chmod +x "$install"
 }

+install() {
+    command install -Z "$@"
+}
+
 installconfig() {
    local perm="$1"
    local src="$2"
@@ -210,13 +214,13 @@ if [ -z "$python3" ]; then
 fi
 rpython3=$(realpath -m "$root/$python3")
 if ! $nonroot; then
-    retc="$root/etc"
-    rsysconfdir="$root/$sysconfdir"
-    rusr="$root/usr"
-    rsystemd="$rusr/lib/systemd/system"
+    retc=$(realpath -m "$root/etc")
+    rsysconfdir=$(realpath -m "$root/$sysconfdir")
+    rusr=$(realpath -m "$root/usr")
+    rsystemd=$(realpath -m "$rusr/lib/systemd/system")
    rdoc="$rprefix/share/doc"
-    rdata="$root/var/lib/scylla"
-    rhkdata="$root/var/lib/scylla-housekeeping"
+    rdata=$(realpath -m "$root/var/lib/scylla")
+    rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
 else
    retc="$rprefix/etc"
    rsysconfdir="$rprefix/$sysconfdir"
@@ -245,6 +249,7 @@ if ! $nonroot; then
    done
 fi
 # scylla-node-exporter
+install -d -m755 "$rsysconfdir" "$rsystemd"
 install -d -m755 "$rprefix"/node_exporter
 install -d -m755 "$rprefix"/node_exporter/licenses
 install -m755 node_exporter/node_exporter "$rprefix"/node_exporter
@@ -278,7 +283,6 @@ fi

 # scylla-server
 install -m755 -d "$rprefix"
-install -m755 -d "$rsysconfdir"
 install -m755 -d "$retc/scylla.d"
 installconfig 644 dist/common/sysconfig/scylla-housekeeping "$rsysconfdir"
 installconfig 644 dist/common/sysconfig/scylla-server "$rsysconfdir"
@@ -286,7 +290,7 @@ for file in dist/common/scylla.d/*.conf; do
    installconfig 644 "$file" "$retc"/scylla.d
 done

-install -d -m755 "$retc"/scylla "$rsystemd" "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
+install -d -m755 "$retc"/scylla "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
 install -m644 dist/common/systemd/scylla-fstrim.service -Dt "$rsystemd"
 install -m644 dist/common/systemd/scylla-housekeeping-daily.service -Dt "$rsystemd"
 install -m644 dist/common/systemd/scylla-housekeeping-restart.service -Dt "$rsystemd"
--- a/locator/network_topology_strategy.cc
+++ b/locator/network_topology_strategy.cc
@@ -273,8 +273,12 @@ void network_topology_strategy::validate_options() const {
 }

 std::optional<std::set<sstring>> network_topology_strategy::recognized_options() const {
-    // We explicitely allow all options
-    return std::nullopt;
+    std::set<sstring> datacenters;
+    for (const auto& [dc_name, endpoints] : _shared_token_metadata.get()->get_topology().get_datacenter_endpoints()) {
+        datacenters.insert(dc_name);
+    }
+    // We only allow datacenter names as options
+    return datacenters;
 }

 using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, const sstring&, const shared_token_metadata&, snitch_ptr&, const std::map<sstring, sstring>&>;
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -335,7 +335,6 @@ public:
    void remove_bootstrap_tokens(std::unordered_set<token> tokens);

    void add_leaving_endpoint(inet_address endpoint);
-    void del_leaving_endpoint(inet_address endpoint);
 public:
    void remove_endpoint(inet_address endpoint);
 #if 0
@@ -1658,10 +1657,6 @@ void token_metadata_impl::add_leaving_endpoint(inet_address endpoint) {
     _leaving_endpoints.emplace(endpoint);
 }

-void token_metadata_impl::del_leaving_endpoint(inet_address endpoint) {
-     _leaving_endpoints.erase(endpoint);
-}
-
 void token_metadata_impl::add_replacing_endpoint(inet_address existing_node, inet_address replacing_node) {
    tlogger.info("Added node {} as pending replacing endpoint which replaces existing node {}",
            replacing_node, existing_node);
@@ -1932,11 +1927,6 @@ token_metadata::add_leaving_endpoint(inet_address endpoint) {
    _impl->add_leaving_endpoint(endpoint);
 }

-void
-token_metadata::del_leaving_endpoint(inet_address endpoint) {
-    _impl->del_leaving_endpoint(endpoint);
-}
-
 void
 token_metadata::remove_endpoint(inet_address endpoint) {
    _impl->remove_endpoint(endpoint);
--- a/locator/token_metadata.hh
+++ b/locator/token_metadata.hh
@@ -238,7 +238,6 @@ public:
    void remove_bootstrap_tokens(std::unordered_set<token> tokens);

    void add_leaving_endpoint(inet_address endpoint);
-    void del_leaving_endpoint(inet_address endpoint);

    void remove_endpoint(inet_address endpoint);

--- a/main.cc
+++ b/main.cc
@@ -1063,7 +1063,7 @@ int main(int ac, char** av) {
                gms::stop_gossiping().get();
            });

-            sys_dist_ks.start(std::ref(qp), std::ref(mm)).get();
+            sys_dist_ks.start(std::ref(qp), std::ref(mm), std::ref(proxy)).get();

            ss.init_server().get();
            sst_format_selector.sync();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -477,6 +477,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // as well as reduce latency as there are potentially many requests
    // blocked on schema version request.
    case messaging_verb::GOSSIP_DIGEST_SYN:
+    case messaging_verb::GOSSIP_DIGEST_ACK:
    case messaging_verb::GOSSIP_DIGEST_ACK2:
    case messaging_verb::GOSSIP_SHUTDOWN:
    case messaging_verb::GOSSIP_ECHO:
@@ -504,7 +505,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM:
    case messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM:
    case messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM:
-    case messaging_verb::NODE_OPS_CMD:
    case messaging_verb::HINT_MUTATION:
        return 1;
    case messaging_verb::CLIENT_ID:
@@ -512,7 +512,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::READ_DATA:
    case messaging_verb::READ_MUTATION_DATA:
    case messaging_verb::READ_DIGEST:
-    case messaging_verb::GOSSIP_DIGEST_ACK:
    case messaging_verb::DEFINITIONS_UPDATE:
    case messaging_verb::TRUNCATE:
    case messaging_verb::MIGRATION_REQUEST:
@@ -1350,17 +1349,6 @@ future<std::vector<row_level_diff_detect_algorithm>> messaging_service::send_rep
    return send_message<future<std::vector<row_level_diff_detect_algorithm>>>(this, messaging_verb::REPAIR_GET_DIFF_ALGORITHMS, std::move(id));
 }

-// Wrapper for NODE_OPS_CMD
-void messaging_service::register_node_ops_cmd(std::function<future<node_ops_cmd_response> (const rpc::client_info& cinfo, node_ops_cmd_request)>&& func) {
-    register_handler(this, messaging_verb::NODE_OPS_CMD, std::move(func));
-}
-future<> messaging_service::unregister_node_ops_cmd() {
-    return unregister_handler(messaging_verb::NODE_OPS_CMD);
-}
-future<node_ops_cmd_response> messaging_service::send_node_ops_cmd(msg_addr id, node_ops_cmd_request req) {
-    return send_message<future<node_ops_cmd_response>>(this, messaging_verb::NODE_OPS_CMD, std::move(id), std::move(req));
-}
-
 void
 messaging_service::register_paxos_prepare(std::function<future<foreign_ptr<std::unique_ptr<service::paxos::prepare_response>>>(
        const rpc::client_info&, rpc::opt_time_point, query::read_command cmd, partition_key key, utils::UUID ballot,
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -143,8 +143,7 @@ enum class messaging_verb : int32_t {
    HINT_MUTATION = 42,
    PAXOS_PRUNE = 43,
    GOSSIP_GET_ENDPOINT_STATES = 44,
-    NODE_OPS_CMD = 45,
-    LAST = 46,
+    LAST = 45,
 };

 } // namespace netw
@@ -395,11 +394,6 @@ public:
    future<> unregister_repair_get_diff_algorithms();
    future<std::vector<row_level_diff_detect_algorithm>> send_repair_get_diff_algorithms(msg_addr id);

-    // Wrapper for NODE_OPS_CMD
-    void register_node_ops_cmd(std::function<future<node_ops_cmd_response> (const rpc::client_info& cinfo, node_ops_cmd_request)>&& func);
-    future<> unregister_node_ops_cmd();
-    future<node_ops_cmd_response> send_node_ops_cmd(msg_addr id, node_ops_cmd_request);
-
    // Wrapper for GOSSIP_ECHO verb
    void register_gossip_echo(std::function<future<> ()>&& func);
    future<> unregister_gossip_echo();
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -26,6 +26,7 @@

 #include "mutation_reader.hh"
 #include <seastar/core/future-util.hh>
+#include <seastar/core/coroutine.hh>
 #include "flat_mutation_reader.hh"
 #include "schema_registry.hh"
 #include "mutation_compactor.hh"
@@ -1176,6 +1177,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

+    _drop_partition_start = false;
+    _drop_static_row = false;
+
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1261,13 +1265,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // should be the same partition
+        if (_drop_partition_start) { // we expect to continue from the same partition
+            // We cannot assume the partition we stopped the read at is still alive
+            // when we recreate the reader. It might have been compacted away in the
+            // meanwhile, so allow for a larger partition too.
            require(
-                    cmp_res == 0,
-                    "{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    cmp_res <= 0,
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
+            // Reset drop flags and next pos if we are not continuing from the same partition
+            if (cmp_res < 0) {
+                // Close previous partition, we are not going to continue it.
+                push_mutation_fragment(*_schema, _permit, partition_end{});
+                _drop_partition_start = false;
+                _drop_static_row = false;
+                _next_position_in_partition = position_in_partition::for_partition_start();
+                _trim_range_tombstones = false;
+            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
@@ -1318,9 +1334,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
        _drop_partition_start = false;
        return true;
    }
-    if (_drop_static_row && mf.is_static_row()) {
-        _drop_static_row = false;
-        return true;
+    // Unlike partition-start above, a partition is not guaranteed to have a
+    // static row fragment. So reset the flag regardless of whether we could
+    // drop one or not.
+    // We are guaranteed to get here only right after dropping a partition-start,
+    // so if we are not seeing a static row here, the partition doesn't have one.
+    if (_drop_static_row) {
+         _drop_static_row = false;
+        return mf.is_static_row();
    }
    return false;
 }
@@ -1505,18 +1526,18 @@ future<> evictable_reader::fast_forward_to(const dht::partition_range& pr, db::t
    _end_of_stream = false;

    if (_reader) {
-        return _reader->fast_forward_to(pr, timeout);
+        co_await _reader->fast_forward_to(pr, timeout);
+        _range_override.reset();
+        co_return;
    }
    if (!_reader_created || !_irh) {
-        return make_ready_future<>();
+        co_return;
    }
    if (auto reader_opt = try_resume()) {
-        auto f = reader_opt->fast_forward_to(pr, timeout);
-        return f.then([this, reader = std::move(*reader_opt)] () mutable {
-            maybe_pause(std::move(reader));
-        });
+        co_await reader_opt->fast_forward_to(pr, timeout);
+        _range_override.reset();
+        maybe_pause(std::move(*reader_opt));
    }
-    return make_ready_future<>();
 }

 evictable_reader_handle::evictable_reader_handle(evictable_reader& r) : _r(&r)
@@ -1569,8 +1590,8 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
 private:
    shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
    const unsigned _shard;
-    const dht::partition_range* _pr;
-    const query::partition_slice& _ps;
+    dht::partition_range _pr;
+    query::partition_slice _ps;
    const io_priority_class& _pc;
    tracing::global_trace_state_ptr _trace_state;
    const mutation_reader::forwarding _fwd_mr;
@@ -1596,7 +1617,7 @@ public:
        : impl(std::move(schema), std::move(permit))
        , _lifecycle_policy(std::move(lifecycle_policy))
        , _shard(shard)
-        , _pr(&pr)
+        , _pr(pr)
        , _ps(ps)
        , _pc(pc)
        , _trace_state(std::move(trace_state))
@@ -1681,7 +1702,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
            });
            auto s = gs.get();
            auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
-                        s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
+                        s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), _pr, _ps, _pc, _trace_state, _fwd_mr));
            tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
            auto f = rreader->fill_buffer(timeout);
            return f.then([rreader = std::move(rreader)] () mutable {
@@ -1730,7 +1751,7 @@ void shard_reader::next_partition() {
 }

 future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
-    _pr = &pr;
+    _pr = pr;

    if (!_reader && !_read_ahead) {
        // No need to fast-forward uncreated readers, they will be passed the new
@@ -1739,12 +1760,12 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
    }

    auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
-    return f.then([this, &pr, timeout] {
+    return f.then([this, timeout] {
        _end_of_stream = false;
        clear_buffer();

-        return smp::submit_to(_shard, [this, &pr, timeout] {
-            return _reader->fast_forward_to(pr, timeout);
+        return smp::submit_to(_shard, [this, timeout] {
+            return _reader->fast_forward_to(_pr, timeout);
        });
    });
 }
@@ -2308,9 +2329,9 @@ position_reader_queue::~position_reader_queue() {}
 // are not implemented and throw an error; the reader is only used for single partition queries.
 //
 // Assumes that:
-// - the queue contains at least one reader,
 // - there are no static rows,
-// - the returned fragments do not contain partition tombstones.
+// - the returned fragments do not contain partition tombstones,
+// - the merged readers return fragments from the same partition (but some or even all of them may be empty).
 class clustering_order_reader_merger {
    const schema_ptr _schema;
    const reader_permit _permit;
@@ -2422,12 +2443,17 @@ class clustering_order_reader_merger {
            if (!mf) {
                // The reader returned end-of-stream before returning end-of-partition
                // (otherwise we would have removed it in a previous peek). This means that
-                // we are in forwarding mode and the reader won't return any more fragments in the current range.
+                // either the reader was empty from the beginning (not even returning a `partition_start`)
+                // or we are in forwarding mode and the reader won't return any more fragments in the current range.
                // If the reader's upper bound is smaller then the end of the current range then it won't
                // return any more fragments in later ranges as well (subsequent fast-forward-to ranges
                // are non-overlapping and strictly increasing), so we can remove it now.
-                // Otherwise it may start returning fragments later, so we save it for the moment
-                // in _halted_readers and will bring it back when we get fast-forwarded.
+                // Otherwise, if it previously returned a `partition_start`, it may start returning more fragments
+                // later (after we fast-forward) so we save it for the moment in _halted_readers and will bring it
+                // back when we get fast-forwarded.
+                // We also save the reader if it was empty from the beginning (no `partition_start`) since
+                // it makes the code simpler (to check for this here we would need additional state); it is a bit wasteful
+                // but completely empty readers should be rare.
                if (_cmp(it->upper_bound, _pr_end) < 0) {
                    _all_readers.erase(it);
                } else {
@@ -2557,19 +2583,6 @@ public:
                        : position_in_partition_view::after_all_clustered_rows())
        , _should_emit_partition_end(fwd_sm == streamed_mutation::forwarding::no)
    {
-        // The first call to `_reader_queue::pop` uses `after_all_clustered_rows`
-        // so we obtain at least one reader; we will return this reader's `partition_start`
-        // as the first fragment.
-        auto rs = _reader_queue->pop(position_in_partition_view::after_all_clustered_rows());
-        for (auto& r: rs) {
-            _all_readers.push_front(std::move(r));
-            _unpeeked_readers.push_back(_all_readers.begin());
-        }
-
-        if (rs.empty()) {
-            // No readers, no partition.
-            _should_emit_partition_end = false;
-        }
    }

    // We assume that operator() is called sequentially and that the caller doesn't use the batch
@@ -2586,8 +2599,22 @@ public:
            return peek_readers(timeout).then([this, timeout] { return (*this)(timeout); });
        }

-        auto next_peeked_pos = _peeked_readers.empty() ? _pr_end : _peeked_readers.front()->reader.peek_buffer().position();
-        // There might be queued readers containing fragments with positions <= next_peeked_pos:
+        // Before we return a batch of fragments using currently opened readers we must check the queue
+        // for potential new readers that must be opened. There are three cases which determine how ``far''
+        // should we look:
+        // - If there are some peeked readers in the heap, we must check for new readers
+        //   whose `min_position`s are <= the position of the first peeked reader; there is no need
+        //   to check for ``later'' readers (yet).
+        // - Otherwise, if we already fetched a partition start fragment, we need to look no further
+        //   than the end of the current position range (_pr_end).
+        // - Otherwise we need to look for any reader (by calling the queue with `after_all_clustered_rows`),
+        //   even for readers whose `min_position`s may be outside the current position range since they
+        //   may be the only readers which have a `partition_start` fragment which we need to return
+        //   before end-of-stream.
+        auto next_peeked_pos =
+            _peeked_readers.empty()
+                ? (_partition_start_fetched ? _pr_end : position_in_partition_view::after_all_clustered_rows())
+                : _peeked_readers.front()->reader.peek_buffer().position();
        if (!_reader_queue->empty(next_peeked_pos)) {
            auto rs = _reader_queue->pop(next_peeked_pos);
            for (auto& r: rs) {
@@ -2601,8 +2628,11 @@ public:
            // We are either in forwarding mode and waiting for a fast-forward,
            // or we've exhausted all the readers.
            if (_should_emit_partition_end) {
-                // Not forwarding, so all readers must be exhausted. Return the last fragment.
-                _current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
+                // Not forwarding, so all readers must be exhausted.
+                // Return a partition end fragment unless all readers have been empty from the beginning.
+                if (_partition_start_fetched) {
+                    _current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
+                }
                _should_emit_partition_end = false;
            }
            return make_ready_future<mutation_fragment_batch>(_current_batch);
--- a/mutation_writer/feed_writers.cc
+++ b/mutation_writer/feed_writers.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "feed_writers.hh"
+
+namespace mutation_writer {
+
+bucket_writer::bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
+    : _schema(schema)
+    , _handle(std::move(queue_reader.second))
+    , _consume_fut(consumer(std::move(queue_reader.first)))
+{ }
+
+bucket_writer::bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+    : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer)
+{ }
+
+future<> bucket_writer::consume(mutation_fragment mf) {
+    return _handle.push(std::move(mf));
+}
+
+void bucket_writer::consume_end_of_stream() {
+    _handle.push_end_of_stream();
+}
+
+void bucket_writer::abort(std::exception_ptr ep) noexcept {
+    _handle.abort(std::move(ep));
+}
+
+future<> bucket_writer::close() noexcept {
+    return std::move(_consume_fut);
+}
+
+} // mutation_writer
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -22,10 +22,31 @@
 #pragma once

 #include "flat_mutation_reader.hh"
+#include "mutation_reader.hh"

 namespace mutation_writer {
 using reader_consumer = noncopyable_function<future<> (flat_mutation_reader)>;

+class bucket_writer {
+    schema_ptr _schema;
+    queue_reader_handle _handle;
+    future<> _consume_fut;
+
+private:
+    bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer);
+
+public:
+    bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer);
+
+    future<> consume(mutation_fragment mf);
+
+    void consume_end_of_stream();
+
+    void abort(std::exception_ptr ep) noexcept;
+
+    future<> close() noexcept;
+};
+
 template <typename Writer>
 requires MutationFragmentConsumer<Writer, future<>>
 future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
@@ -36,13 +57,22 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
                return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
            });
+        }).then([&wr] {
+            wr.consume_end_of_stream();
        }).then_wrapped([&wr] (future<> f) {
            if (f.failed()) {
                auto ex = f.get_exception();
                wr.abort(ex);
-                return make_exception_future<>(ex);
+                return wr.close().then_wrapped([ex = std::move(ex)] (future<> f) mutable {
+                    if (f.failed()) {
+                        // The consumer is expected to fail when aborted,
+                        // so just ignore any exception.
+                        (void)f.get_exception();
+                    }
+                    return make_exception_future<>(std::move(ex));
+                });
            } else {
-                return wr.consume_end_of_stream();
+                return wr.close();
            }
        });
    });
--- a/mutation_writer/shard_based_splitting_writer.cc
+++ b/mutation_writer/shard_based_splitting_writer.cc
@@ -31,36 +31,7 @@
 namespace mutation_writer {

 class shard_based_splitting_mutation_writer {
-    class shard_writer {
-        queue_reader_handle _handle;
-        future<> _consume_fut;
-    private:
-        shard_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
-    public:
-        shard_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : shard_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
-        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            // consume_end_of_stream is always called from a finally block,
-            // and that's because we wait for _consume_fut to return. We
-            // don't want to generate another exception here if the read was
-            // aborted.
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
-        void abort(std::exception_ptr ep) {
-            _handle.abort(ep);
-        }
-    };
+    using shard_writer = bucket_writer;

 private:
    schema_ptr _schema;
@@ -105,13 +76,12 @@ public:
        return write_to_shard(mutation_fragment(*_schema, _permit, std::move(pe)));
    }

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
-            if (!shard) {
-                return make_ready_future<>();
+    void consume_end_of_stream() {
+        for (auto& shard : _shards) {
+            if (shard) {
+                shard->consume_end_of_stream();
            }
-            return shard->consume_end_of_stream();
-        });
+        }
    }
    void abort(std::exception_ptr ep) {
        for (auto&& shard : _shards) {
@@ -120,6 +90,11 @@ public:
            }
        }
    }
+    future<> close() noexcept {
+        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
+            return shard ? shard->close() : make_ready_future<>();
+        });
+    }
 };

 future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {
--- a/mutation_writer/timestamp_based_splitting_writer.cc
+++ b/mutation_writer/timestamp_based_splitting_writer.cc
@@ -109,22 +109,12 @@ small_flat_map<Key, Value, Size>::find(const key_type& k) {
 class timestamp_based_splitting_mutation_writer {
    using bucket_id = int64_t;

-    class bucket_writer {
-        schema_ptr _schema;
-        queue_reader_handle _handle;
-        future<> _consume_fut;
+    class timestamp_bucket_writer : public bucket_writer {
        bool _has_current_partition = false;

-    private:
-        bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _schema(std::move(schema))
-            , _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
    public:
-        bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
+        timestamp_bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+            : bucket_writer(schema, std::move(permit), consumer) {
        }
        void set_has_current_partition() {
            _has_current_partition = true;
@@ -135,18 +125,6 @@ class timestamp_based_splitting_mutation_writer {
        bool has_current_partition() const {
            return _has_current_partition;
        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
-        void abort(std::exception_ptr ep) {
-            _handle.abort(ep);
-        }
    };

 private:
@@ -155,7 +133,7 @@ private:
    classify_by_timestamp _classifier;
    reader_consumer _consumer;
    partition_start _current_partition_start;
-    std::unordered_map<bucket_id, bucket_writer> _buckets;
+    std::unordered_map<bucket_id, timestamp_bucket_writer> _buckets;
    std::vector<bucket_id> _buckets_used_for_current_partition;

 private:
@@ -186,16 +164,21 @@ public:
    future<> consume(range_tombstone&& rt);
    future<> consume(partition_end&& pe);

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, bucket_writer>& bucket) {
-            return bucket.second.consume_end_of_stream();
-        });
+    void consume_end_of_stream() {
+        for (auto& b : _buckets) {
+            b.second.consume_end_of_stream();
+        }
    }
    void abort(std::exception_ptr ep) {
        for (auto&& b : _buckets) {
            b.second.abort(ep);
        }
    }
+    future<> close() noexcept {
+        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, timestamp_bucket_writer>& b) {
+            return b.second.close();
+        });
+    }
 };

 future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {
--- a/query-result.hh
+++ b/query-result.hh
@@ -205,6 +205,10 @@ public:
            auto to_block = std::min(_used_memory - _blocked_bytes, n);
            _blocked_bytes += to_block;
            stop = (_limiter->update_and_check(to_block) && _stop_on_global_limit) || stop;
+            if (stop && !_short_read_allowed) {
+                // If we are here we stopped because of the global limit.
+                throw std::runtime_error("Maximum amount of memory for building query results is exhausted, unpaged query cannot be finished");
+            }
        }
        return stop;
    }
--- a/range_tombstone.hh
+++ b/range_tombstone.hh
@@ -267,9 +267,14 @@ public:
        return _current_tombstone;
    }

-    const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
+    std::vector<range_tombstone> range_tombstones_for_row(const clustering_key_prefix& ck) {
        drop_unneeded_tombstones(ck);
-        return _range_tombstones;
+        std::vector<range_tombstone> result(_range_tombstones.begin(), _range_tombstones.end());
+        auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
+            return _cmp(rt1.start_bound(), rt2.start_bound());
+        };
+        std::sort(result.begin(), result.end(), cmp);
+        return result;
    }

    std::deque<range_tombstone> range_tombstones() && {
--- a/read_context.hh
+++ b/read_context.hh
@@ -141,6 +141,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
    mutation_source_opt _underlying_snapshot;
    dht::partition_range _sm_range;
    std::optional<dht::decorated_key> _key;
+    bool _partition_exists;
    row_cache::phase_type _phase;
 public:
    read_context(row_cache& cache,
@@ -189,22 +190,34 @@ public:
    autoupdating_underlying_reader& underlying() { return _underlying; }
    row_cache::phase_type phase() const { return _phase; }
    const dht::decorated_key& key() const { return *_key; }
+    bool partition_exists() const { return _partition_exists; }
    void on_underlying_created() { ++_underlying_created; }
    bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
 public:
    future<> ensure_underlying(db::timeout_clock::time_point timeout) {
        if (_underlying_snapshot) {
-            return create_underlying(true, timeout);
+            return create_underlying(timeout).then([this, timeout] {
+                return _underlying.underlying()(timeout).then([this] (mutation_fragment_opt&& mfopt) {
+                    _partition_exists = bool(mfopt);
+                });
+            });
        }
+        // We know that the partition exists because all the callers of
+        // enter_partition(const dht::decorated_key&, row_cache::phase_type)
+        // check that, and there's no other way of setting _underlying_snapshot
+        // to empty, except for calling create_underlying.
+        _partition_exists = true;
        return make_ready_future<>();
    }
 public:
-    future<> create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout);
+    future<> create_underlying(db::timeout_clock::time_point timeout);
    void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
        _phase = phase;
        _underlying_snapshot = snapshot;
        _key = dk;
    }
+    // Precondition: each caller needs to make sure that a partition with key |dk|
+    //               exists in underlying before calling this function.
    void enter_partition(const dht::decorated_key& dk, row_cache::phase_type phase) {
        _phase = phase;
        _underlying_snapshot = {};
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -76,7 +76,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _resources;
-    reader_permit::state _state = reader_permit::state::registered;
+    reader_permit::state _state = reader_permit::state::active;

 public:
    struct value_tag {};
@@ -124,22 +124,17 @@ public:
    }

    void on_admission() {
-        _state = reader_permit::state::admitted;
-        _semaphore.consume(_resources);
+        _state = reader_permit::state::active;
    }

    void consume(reader_resources res) {
        _resources += res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.consume(res);
-        }
+        _semaphore.consume(res);
    }

    void signal(reader_resources res) {
        _resources -= res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.signal(res);
-        }
+        _semaphore.signal(res);
    }

    reader_resources resources() const {
@@ -206,14 +201,11 @@ reader_resources reader_permit::consumed_resources() const {

 std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
    switch (s) {
-        case reader_permit::state::registered:
-            os << "registered";
-            break;
        case reader_permit::state::waiting:
            os << "waiting";
            break;
-        case reader_permit::state::admitted:
-            os << "admitted";
+        case reader_permit::state::active:
+            os << "active";
            break;
    }
    return os;
@@ -250,7 +242,7 @@ struct permit_group_key_hash {

 using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;

-static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
+static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
    struct permit_summary {
        const schema* s;
        std::string_view op_name;
@@ -266,25 +258,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
        }
    }

-    std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
-        if (sort_by_memory) {
-            return a.memory < b.memory;
-        } else {
-            return a.count < b.count;
-        }
+    std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
+        return a.memory < b.memory;
    });

    permit_stats total;

-    auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
-        if (sort_by_memory) {
-            fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
-        } else {
-            fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
-        }
+    auto print_line = [&os] (auto col1, auto col2, auto col3) {
+        fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
    };

-    fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
+    fmt::print(os, "Permits with state {}\n", state);
    print_line("count", "memory", "name");
    for (const auto& summary : permit_summaries) {
        total.count += summary.count;
@@ -310,11 +294,9 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_stats total;

    fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
-    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
    fmt::print(os, "\n");
    fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
 }
@@ -375,7 +357,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
 reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
    // Implies _inactive_reads.empty(), we don't queue new readers before
    // evicting all inactive reads.
-    if (_wait_list.empty()) {
+    if (_wait_list.empty() && _resources.memory > 0) {
        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
        (void)_;
        ++_stats.inactive_reads;
@@ -425,13 +407,13 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
 }

 bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
-    return bool(_resources) && _resources >= r;
+    // Special case: when there is no active reader (based on count) admit one
+    // regardless of availability of memory.
+    return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
 }

 bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
-    // Special case: when there is no active reader (based on count) admit one
-    // regardless of availability of memory.
-    return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
+    return _wait_list.empty() && has_available_units(r);
 }

 future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
@@ -482,6 +464,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
    }
 }

+std::string reader_concurrency_semaphore::dump_diagnostics() const {
+    std::ostringstream os;
+    do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
+    return os.str();
+}
+
 // A file that tracks the memory usage of buffers resulting from read
 // operations.
 class tracking_file_impl : public file_impl {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -237,4 +237,6 @@ public:
    }

    void broken(std::exception_ptr ex);
+
+    std::string dump_diagnostics() const;
 };
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -91,9 +91,8 @@ public:
    class resource_units;

    enum class state {
-        registered, // read is registered, but didn't attempt admission yet
        waiting, // waiting for admission
-        admitted,
+        active,
    };

    class impl;
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -54,14 +54,6 @@ logging::logger rlogger("repair");

 static sharded<netw::messaging_service>* _messaging;

-void node_ops_info::check_abort() {
-    if (abort) {
-        auto msg = format("Node operation with ops_uuid={} is aborted", ops_uuid);
-        rlogger.warn("{}", msg);
-        throw std::runtime_error(msg);
-    }
-}
-
 class node_ops_metrics {
 public:
    node_ops_metrics() {
@@ -319,7 +311,7 @@ float node_ops_metrics::repair_finished_percentage() {
 tracker::tracker(size_t nr_shards, size_t max_repair_memory)
    : _shutdown(false)
    , _repairs(nr_shards) {
-    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
+    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
    rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
        max_repair_memory, max_repair_memory_per_range(), nr);
    _range_parallelism_semaphores.reserve(nr_shards);
@@ -443,16 +435,6 @@ void tracker::abort_all_repairs() {
    rlogger.info0("Aborted {} repair job(s)", count);
 }

-void tracker::abort_repair_node_ops(utils::UUID ops_uuid) {
-    for (auto& x : _repairs[this_shard_id()]) {
-        auto& ri = x.second;
-        if (ri->ops_uuid() && ri->ops_uuid().value() == ops_uuid) {
-            rlogger.info0("Aborted repair jobs for ops_uuid={}", ops_uuid);
-            ri->abort();
-        }
-    }
-}
-
 float tracker::report_progress(streaming::stream_reason reason) {
    uint64_t nr_ranges_finished = 0;
    uint64_t nr_ranges_total = 0;
@@ -811,8 +793,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    repair_uniq_id id_,
    const std::vector<sstring>& data_centers_,
    const std::vector<sstring>& hosts_,
-    streaming::stream_reason reason_,
-    std::optional<utils::UUID> ops_uuid)
+    streaming::stream_reason reason_)
    : db(db_)
    , messaging(ms_)
    , sharder(get_sharder_for_tables(db_, keyspace_, table_ids_))
@@ -826,8 +807,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    , hosts(hosts_)
    , reason(reason_)
    , nr_ranges_total(ranges.size())
-    , _row_level_repair(db.local().features().cluster_supports_row_level_repair())
-    , _ops_uuid(std::move(ops_uuid)) {
+    , _row_level_repair(db.local().features().cluster_supports_row_level_repair()) {
 }

 future<> repair_info::do_streaming() {
@@ -1646,7 +1626,7 @@ static int do_repair_start(seastar::sharded<database>& db, seastar::sharded<netw
                _node_ops_metrics.repair_total_ranges_sum += ranges.size();
                auto ri = make_lw_shared<repair_info>(db, ms,
                        std::move(keyspace), std::move(ranges), std::move(table_ids),
-                        id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair, id.uuid);
+                        id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair);
                return repair_ranges(ri);
            });
            repair_results.push_back(std::move(f));
@@ -1716,15 +1696,14 @@ static future<> sync_data_using_repair(seastar::sharded<database>& db,
        sstring keyspace,
        dht::token_range_vector ranges,
        std::unordered_map<dht::token_range, repair_neighbors> neighbors,
-        streaming::stream_reason reason,
-        std::optional<utils::UUID> ops_uuid) {
+        streaming::stream_reason reason) {
    if (ranges.empty()) {
        return make_ready_future<>();
    }
-    return smp::submit_to(0, [&db, &ms, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
+    return smp::submit_to(0, [&db, &ms, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
        repair_uniq_id id = repair_tracker().next_repair_command();
        rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
-        return repair_tracker().run(id, [id, &db, &ms, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
+        return repair_tracker().run(id, [id, &db, &ms, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
            auto cfs = list_column_families(db.local(), keyspace);
            if (cfs.empty()) {
                rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
@@ -1734,12 +1713,12 @@ static future<> sync_data_using_repair(seastar::sharded<database>& db,
            std::vector<future<>> repair_results;
            repair_results.reserve(smp::count);
            for (auto shard : boost::irange(unsigned(0), smp::count)) {
-                auto f = db.invoke_on(shard, [&db, &ms, keyspace, table_ids, id, ranges, neighbors, reason, ops_uuid] (database& localdb) mutable {
+                auto f = db.invoke_on(shard, [&db, &ms, keyspace, table_ids, id, ranges, neighbors, reason] (database& localdb) mutable {
                    auto data_centers = std::vector<sstring>();
                    auto hosts = std::vector<sstring>();
                    auto ri = make_lw_shared<repair_info>(db, ms,
                            std::move(keyspace), std::move(ranges), std::move(table_ids),
-                            id, std::move(data_centers), std::move(hosts), reason, ops_uuid);
+                            id, std::move(data_centers), std::move(hosts), reason);
                    ri->neighbors = std::move(neighbors);
                    return repair_ranges(ri);
                });
@@ -1804,6 +1783,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, utils::can_yield::yes);
            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
+            bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology;

            //Active ranges
            auto metadata_clone = tmptr->clone_only_token_map().get0();
@@ -1881,7 +1861,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
                        };
                        auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
                        auto rf_in_local_dc = get_rf_in_local_dc();
-                        if (old_endpoints.size() == strat.get_replication_factor()) {
+                        if (everywhere_topology) {
+                            neighbors = old_endpoints_in_local_dc;
+                        } else if (old_endpoints.size() == strat.get_replication_factor()) {
                            // For example, with RF = 3 and 3 nodes n1, n2, n3
                            // in the cluster, n4 is bootstrapped, old_replicas
                            // = {n1, n2, n3}, new_replicas = {n1, n2, n4}, n3
@@ -1933,16 +1915,16 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
                }
            }
            auto nr_ranges = desired_ranges.size();
-            sync_data_using_repair(db, ms, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, {}).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason).get();
            rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
        }
        rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
    });
 }

-static future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops) {
+static future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node) {
    using inet_address = gms::inet_address;
-    return seastar::async([&db, &ms, tmptr = std::move(tmptr), leaving_node = std::move(leaving_node), ops] () mutable {
+    return seastar::async([&db, &ms, tmptr = std::move(tmptr), leaving_node = std::move(leaving_node)] () mutable {
        auto myip = utils::fb_utilities::get_broadcast_address();
        auto keyspaces = db.local().get_non_system_keyspaces();
        bool is_removenode = myip != leaving_node;
@@ -2001,9 +1983,6 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
            auto local_dc = get_local_dc();
            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
            for (auto&r : ranges) {
-                if (ops) {
-                    ops->check_abort();
-                }
                auto end_token = r.end() ? r.end()->value() : dht::maximum_token();
                const std::vector<inet_address> new_eps = ks.get_replication_strategy().calculate_natural_endpoints(end_token, temp, utils::can_yield::yes);
                const std::vector<inet_address>& current_eps = current_replica_endpoints[r];
@@ -2085,12 +2064,6 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                }
                neighbors_set.erase(myip);
                neighbors_set.erase(leaving_node);
-                // Remove nodes in ignore_nodes
-                if (ops) {
-                    for (const auto& node : ops->ignore_nodes) {
-                        neighbors_set.erase(node);
-                    }
-                }
                auto neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors_set |
                    boost::adaptors::filtered([&local_dc, &snitch_ptr] (const gms::inet_address& node) {
                        return snitch_ptr->get_datacenter(node) == local_dc;
@@ -2102,10 +2075,9 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                    rlogger.debug("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}, skipped",
                        op, keyspace_name, r, current_eps, new_eps, neighbors);
                } else {
-                    std::vector<gms::inet_address> mandatory_neighbors = is_removenode ? neighbors : std::vector<gms::inet_address>{};
-                    rlogger.info("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}, mandatory_neighbor={}",
-                            op, keyspace_name, r, current_eps, new_eps, neighbors, mandatory_neighbors);
-                    range_sources[r] = repair_neighbors(std::move(neighbors), std::move(mandatory_neighbors));
+                    rlogger.debug("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}",
+                        op, keyspace_name, r, current_eps, new_eps, neighbors);
+                    range_sources[r] = repair_neighbors(std::move(neighbors));
                    if (is_removenode) {
                        ranges_for_removenode.push_back(r);
                    }
@@ -2125,8 +2097,7 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                ranges.swap(ranges_for_removenode);
            }
            auto nr_ranges_synced = ranges.size();
-            std::optional<utils::UUID> opt_uuid = ops ? std::make_optional<utils::UUID>(ops->ops_uuid) : std::nullopt;
-            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason, opt_uuid).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
                op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
        }
@@ -2135,17 +2106,11 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
 }

 future<> decommission_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr) {
-    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), utils::fb_utilities::get_broadcast_address(), {});
+    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), utils::fb_utilities::get_broadcast_address());
 }

-future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops) {
-    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), std::move(leaving_node), std::move(ops));
-}
-
-future<> abort_repair_node_ops(utils::UUID ops_uuid) {
-    return smp::invoke_on_all([ops_uuid] {
-        return repair_tracker().abort_repair_node_ops(ops_uuid);
-    });
+future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node) {
+    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), std::move(leaving_node));
 }

 static future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, sstring op, sstring source_dc, streaming::stream_reason reason) {
@@ -2220,7 +2185,7 @@ static future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, s
                }).get();
            }
            auto nr_ranges = ranges.size();
-            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason, {}).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
        }
        rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
@@ -2258,19 +2223,12 @@ static future<> init_messaging_service_handler(sharded<database>& db, sharded<ne
                return checksum_range(db, keyspace, cf, range, hv);
            });
        });
-        ms.register_node_ops_cmd([] (const rpc::client_info& cinfo, node_ops_cmd_request req) {
-            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
-            auto coordinator = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
-            return smp::submit_to(src_cpu_id % smp::count, [coordinator, req = std::move(req)] () mutable {
-                return service::get_local_storage_service().node_ops_cmd_handler(coordinator, std::move(req));
-            });
-        });
    });
 }

 static future<> uninit_messaging_service_handler() {
    return _messaging->invoke_on_all([] (auto& ms) {
-        return when_all_succeed(ms.unregister_repair_checksum_range(), ms.unregister_node_ops_cmd()).discard_result();
+        return ms.unregister_repair_checksum_range();
    });
 }

--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -76,22 +76,13 @@ struct repair_uniq_id {
 };
 std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);

-struct node_ops_info {
-    utils::UUID ops_uuid;
-    bool abort = false;
-    std::list<gms::inet_address> ignore_nodes;
-    void check_abort();
-};
-
 // The tokens are the tokens assigned to the bootstrap node.
 future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, std::unordered_set<dht::token> bootstrap_tokens);
 future<> decommission_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr);
-future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops);
+future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node);
 future<> rebuild_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, sstring source_dc);
 future<> replace_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, std::unordered_set<dht::token> replacing_tokens);

-future<> abort_repair_node_ops(utils::UUID ops_uuid);
-
 // NOTE: repair_start() can be run on any node, but starts a node-global
 // operation.
 // repair_start() starts the requested repair on this node. It returns an
@@ -253,7 +244,6 @@ public:
    bool _row_level_repair;
    uint64_t _sub_ranges_nr = 0;
    std::unordered_set<sstring> dropped_tables;
-    std::optional<utils::UUID> _ops_uuid;
 public:
    repair_info(seastar::sharded<database>& db_,
            seastar::sharded<netw::messaging_service>& ms_,
@@ -263,8 +253,7 @@ public:
            repair_uniq_id id_,
            const std::vector<sstring>& data_centers_,
            const std::vector<sstring>& hosts_,
-            streaming::stream_reason reason_,
-            std::optional<utils::UUID> ops_uuid);
+            streaming::stream_reason reason_);
    future<> do_streaming();
    void check_failed_ranges();
    future<> request_transfer_ranges(const sstring& cf,
@@ -283,9 +272,6 @@ public:
    const std::vector<sstring>& table_names() {
        return cfs;
    }
-    const std::optional<utils::UUID>& ops_uuid() const {
-        return _ops_uuid;
-    };
 };

 // The repair_tracker tracks ongoing repair operations and their progress.
@@ -338,7 +324,6 @@ public:
    future<> run(repair_uniq_id id, std::function<void ()> func);
    future<repair_status> repair_await_completion(int id, std::chrono::steady_clock::time_point timeout);
    float report_progress(streaming::stream_reason reason);
-    void abort_repair_node_ops(utils::UUID ops_uuid);
 };

 future<uint64_t> estimate_partitions(seastar::sharded<database>& db, const sstring& keyspace,
@@ -479,27 +464,6 @@ enum class row_level_diff_detect_algorithm : uint8_t {

 std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo);

-enum class node_ops_cmd : uint32_t {
-     removenode_prepare,
-     removenode_heartbeat,
-     removenode_sync_data,
-     removenode_abort,
-     removenode_done,
-};
-
-// The cmd and ops_uuid are mandatory for each request.
-// The ignore_nodes and leaving_node are optional.
-struct node_ops_cmd_request {
-    node_ops_cmd cmd;
-    utils::UUID ops_uuid;
-    std::list<gms::inet_address> ignore_nodes;
-    std::list<gms::inet_address> leaving_nodes;
-};
-
-struct node_ops_cmd_response {
-    bool ok;
-};
-
 namespace std {
 template<>
 struct hash<partition_checksum> {
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -327,7 +327,7 @@ public:
    }
 };

-future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
+future<> read_context::create_underlying(db::timeout_clock::time_point timeout) {
    if (_range_query) {
        // FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
        // here in case the same reader will need to be fast forwarded later.
@@ -335,13 +335,8 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
    } else {
        _sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
    }
-    return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this, skip_first_fragment, timeout] {
+    return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this] {
        _underlying_snapshot = {};
-        if (skip_first_fragment) {
-            return _underlying.underlying()(timeout).then([](auto &&mf) {});
-        } else {
-            return make_ready_future<>();
-        }
    });
 }

@@ -361,7 +356,7 @@ private:
        auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
        auto phase = src_and_phase.phase;
        _read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
-        return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
+        return _read_context->create_underlying(timeout).then([this, phase, timeout] {
          return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
            if (!mfopt) {
                if (phase == _cache.phase_of(_read_context->range().start()->value())) {
@@ -722,7 +717,7 @@ row_cache::make_reader(schema_ptr s,
            auto&& pos = ctx->range().start()->value();
            partitions_type::bound_hint hint;
            auto i = _partitions.lower_bound(pos, cmp, hint);
-            if (i != _partitions.end() && hint.match) {
+            if (hint.match) {
                cache_entry& e = *i;
                upgrade_entry(e);
                on_partition_hit();
--- a/schema.cc
+++ b/schema.cc
@@ -456,6 +456,9 @@ schema::schema(const schema& o)
    rebuild();
    if (o.is_view()) {
        _view_info = std::make_unique<::view_info>(*this, o.view_info()->raw());
+        if (o.view_info()->base_info()) {
+            _view_info->set_base_info(o.view_info()->base_info());
+        }
    }
 }

--- a/Show More
+++ b/Show More