release: prepare for 4.4.rc4

compaction: Prevent cleanup and regular from compacting the same sstable
Due to a regression introduced by 463d0ab, regular compaction can compact, in parallel, an sstable that is already being compacted by cleanup, scrub or upgrade. This redundancy wastes resources and increases write amplification, operation time, etc. It is also a potential source of data resurrection: the no-longer-owned data from an sstable being compacted by both cleanup and regular compaction will still exist on the node afterwards, so resurrection can happen if the node regains ownership. Fixes #8155. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Message-Id: <20210225172641.787022-1-raphaelsc@scylladb.com> (cherry picked from commit 2cf0c4bbf1) Includes fixup patch: compaction_manager: Fix use-after-free in rewrite_sstables() Use-after-free introduced by 2cf0c4bbf1. That's because compacting is moved into the then_wrapped() lambda, so it's potentially freed on the next iteration of repeat(). Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Message-Id: <20210309232940.433490-1-raphaelsc@scylladb.com> (cherry picked from commit f7cc431477)
2021-03-11 23:57:55 +02:00 · 2021-03-11 08:24:01 +02:00 · 2021-03-10 16:27:47 +02:00 · 2021-03-10 16:27:43 +02:00 · 2021-03-09 14:08:44 +02:00 · 2021-03-08 14:28:58 +02:00
63 changed files with 1371 additions and 871 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.4.dev
+VERSION=4.4.rc4

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -159,23 +159,40 @@ static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error::validation(format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -279,24 +296,38 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -310,7 +341,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -341,56 +373,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error::validation(
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error::validation(
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error::validation(
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -437,19 +484,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -461,7 +508,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -573,7 +621,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -604,13 +653,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -603,52 +603,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -499,19 +499,11 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // TODO: creation time

    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-    // cannot really "resume" query, must iterate all data. because we cannot query neither "time" (pk) > something,
-    // or on expired...
-    // TODO: maybe add secondary index to topology table to enable this?
-    return _sdks.cdc_get_versioned_streams({ normal_token_owners }).then([this, &db, schema, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc), ttl](std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

-        // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
-        auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);
+    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
+    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-        auto i = topologies.lower_bound(low_ts);
-        // need first gen _intersecting_ the timestamp.
-        if (i != topologies.begin()) {
-            i = std::prev(i);
-        }
+    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([this, &db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

        auto e = topologies.end();
        auto prev = e;
@@ -519,9 +511,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

        std::optional<shard_id> last;

-        // i is now at the youngest generation we include. make a mark of it.
-        auto first = i;
-
+        auto i = topologies.begin();
        // if we're a paged query, skip to the generation where we left of.
        if (shard_start) {
            i = topologies.find(shard_start->time);
@@ -547,7 +537,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        };

        // need a prev even if we are skipping stuff
-        if (i != first) {
+        if (i != topologies.begin()) {
            prev = std::prev(i);
        }

@@ -1028,7 +1018,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
        }

        // ugh. figure out if we are and end-of-shard
-        return cdc::get_local_streams_timestamp().then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+        
+        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
            auto& shard = iter.shard;            

            if (shard.time < ts && ts < high_ts) {
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1105,14 +1105,6 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
-                  },
-                  {
-                     "name":"ignore_nodes",
-                     "description":"List of dead nodes to ingore in removenode operation",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
                  }
               ]
            }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -656,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -664,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -672,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -680,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -696,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -27,7 +27,6 @@
 #include <time.h>
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/adaptor/filtered.hpp>
-#include <boost/algorithm/string/trim_all.hpp>
 #include "service/storage_service.hh"
 #include "service/load_meter.hh"
 #include "db/commitlog/commitlog.hh"
@@ -497,22 +496,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::remove_node.set(r, [](std::unique_ptr<request> req) {
        auto host_id = req->get_query_param("host_id");
-        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
-        auto ignore_nodes = std::list<gms::inet_address>();
-        for (std::string n : ignore_nodes_strs) {
-            try {
-                std::replace(n.begin(), n.end(), '\"', ' ');
-                std::replace(n.begin(), n.end(), '\'', ' ');
-                boost::trim_all(n);
-                if (!n.empty()) {
-                    auto node = gms::inet_address(n);
-                    ignore_nodes.push_back(node);
-                }
-            } catch (...) {
-                throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}", ignore_nodes_strs, n));
-            }
-        }
-        return service::get_local_storage_service().removenode(host_id, std::move(ignore_nodes)).then([] {
+        return service::get_local_storage_service().removenode(host_id).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -22,10 +22,13 @@
 #include <boost/type.hpp>
 #include <random>
 #include <unordered_set>
+#include <algorithm>
 #include <seastar/core/sleep.hh>
+#include <seastar/core/coroutine.hh>

 #include "keys.hh"
 #include "schema_builder.hh"
+#include "database.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -36,6 +39,7 @@
 #include "gms/gossiper.hh"

 #include "cdc/generation.hh"
+#include "cdc/cdc_options.hh"

 extern logging::logger cdc_log;

@@ -321,31 +325,23 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
 }

-// Run inside seastar::async context.
-static void do_update_streams_description(
+static future<> do_update_streams_description(
        db_clock::time_point streams_ts,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
+    if (co_await sys_dist_ks.cdc_desc_exists(streams_ts, ctx)) {
+        cdc_log.info("Generation {}: streams description table already updated.", streams_ts);
+        co_return;
    }

    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.

-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    auto topo = co_await sys_dist_ks.read_cdc_topology_description(streams_ts, ctx);
    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+        throw no_generation_data_exception(streams_ts);
    }

-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    co_await sys_dist_ks.create_cdc_desc(streams_ts, *topo, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
 }

@@ -355,7 +351,7 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source& abort_src) {
    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
    } catch(...) {
        cdc_log.warn(
            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
@@ -368,7 +364,7 @@ void update_streams_description(
            while (true) {
                sleep_abortable(std::chrono::seconds(60), abort_src).get();
                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
                    return;
                } catch (...) {
                    cdc_log.warn(
@@ -380,4 +376,176 @@ void update_streams_description(
    }
 }

+static db_clock::time_point as_timepoint(const utils::UUID& uuid) {
+    return db_clock::time_point{std::chrono::milliseconds(utils::UUID_gen::get_adjusted_timestamp(uuid))};
+}
+
+static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
+        db::system_distributed_keyspace& sys_dist_ks,
+        abort_source& abort_src,
+        const noncopyable_function<unsigned()>& get_num_token_owners) {
+    while (true) {
+        try {
+            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
+        } catch (...) {
+            cdc_log.warn(
+                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
+                    std::current_exception());
+        }
+        co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+    }
+}
+
+// Contains a CDC log table's creation time (extracted from its schema's id)
+// and its CDC TTL setting.
+struct time_and_ttl {
+    db_clock::time_point creation_time;
+    int ttl;
+};
+
+/*
+ * See `maybe_rewrite_streams_descriptions`.
+ * This is the long-running-in-the-background part of that function.
+ * It returns the timestamp of the last rewritten generation (if any).
+ */
+static future<std::optional<db_clock::time_point>> rewrite_streams_descriptions(
+        std::vector<time_and_ttl> times_and_ttls,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    cdc_log.info("Retrieving generation timestamps for rewriting...");
+    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
+    cdc_log.info("Generation timestamps retrieved.");
+
+    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
+    // This predicate is monotonic w.r.t the timestamps.
+    auto now = db_clock::now();
+    std::sort(tss.begin(), tss.end());
+    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
+        // partition_point finds first element that does *not* satisfy the predicate.
+        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
+                [&] (const time_and_ttl& tat) {
+            // In this CDC log table there are no entries older than the table's creation time
+            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
+            // If ttl is set to 0 then entries in this table never expire. In that case we look
+            // only at the table's creation time.
+            auto no_entries_older_than =
+                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
+                    - std::chrono::seconds(10);
+            return no_entries_older_than < ts;
+        });
+    });
+
+    // Find first generation timestamp such that some CDC log table may contain data in this generation.
+    // This and all later generations need to be written to the new streams table.
+    if (first != tss.begin()) {
+        --first;
+    }
+
+    if (first == tss.end()) {
+        cdc_log.info("No generations to rewrite.");
+        co_return std::nullopt;
+    }
+
+    cdc_log.info("First generation to rewrite: {}", *first);
+
+    bool each_success = true;
+    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
+        while (true) {
+            try {
+                co_return co_await do_update_streams_description(ts, *sys_dist_ks, { get_num_token_owners() });
+            } catch (const no_generation_data_exception& e) {
+                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
+                each_success = false;
+                co_return;
+            } catch (...) {
+                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
+            }
+            co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+        }
+    });
+
+    if (each_success) {
+        cdc_log.info("Rewriting stream tables finished successfully.");
+    } else {
+        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
+    }
+
+    if (first != tss.end()) {
+        co_return *std::prev(tss.end());
+    }
+
+    co_return std::nullopt;
+}
+
+future<> maybe_rewrite_streams_descriptions(
+        const database& db,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    if (!db.has_schema(sys_dist_ks->NAME, sys_dist_ks->CDC_DESC_V1)) {
+        // This cluster never went through a Scylla version which used this table
+        // or the user deleted the table. Nothing to do.
+        co_return;
+    }
+
+    if (co_await db::system_keyspace::cdc_is_rewritten()) {
+        co_return;
+    }
+
+    if (db.get_config().cdc_dont_rewrite_streams()) {
+        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
+        co_return;
+    }
+
+    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
+    std::vector<time_and_ttl> times_and_ttls;
+    for (auto& [_, cf] : db.get_column_families()) {
+        auto& s = *cf->schema();
+        auto base = cdc::get_base_table(db, s.ks_name(), s.cf_name());
+        if (!base) {
+            // Not a CDC log table.
+            continue;
+        }
+        auto& cdc_opts = base->cdc_options();
+        if (!cdc_opts.enabled()) {
+            // This table is named like a CDC log table but it's not one.
+            continue;
+        }
+
+        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id()), cdc_opts.ttl()});
+    }
+
+    if (times_and_ttls.empty()) {
+        // There's no point in rewriting old generations' streams (they don't contain any data).
+        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
+        co_return co_await db::system_keyspace::cdc_set_rewritten(std::nullopt);
+    }
+
+    // It's safe to discard this future: the coroutine keeps system_distributed_keyspace alive
+    // and the abort source's lifetime exceeds the lifetime of any other service.
+    (void)(([_times_and_ttls = std::move(times_and_ttls), _sys_dist_ks = std::move(sys_dist_ks),
+                _get_num_token_owners = std::move(get_num_token_owners), &_abort_src = abort_src] () mutable -> future<> {
+        auto times_and_ttls = std::move(_times_and_ttls);
+        auto sys_dist_ks = std::move(_sys_dist_ks);
+        auto get_num_token_owners = std::move(_get_num_token_owners);
+        auto& abort_src = _abort_src;
+
+        // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
+        // and some nodes that are UP may still be marked as DOWN by us.
+        // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
+        // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
+        co_await sleep_abortable(std::chrono::seconds(10), abort_src);
+
+        cdc_log.info("Rewriting stream tables in the background...");
+        auto last_rewritten = co_await rewrite_streams_descriptions(
+                std::move(times_and_ttls),
+                std::move(sys_dist_ks),
+                std::move(get_num_token_owners),
+                abort_src);
+
+        co_await db::system_keyspace::cdc_set_rewritten(last_rewritten);
+    })());
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -41,6 +41,7 @@
 #include "db_clock.hh"
 #include "dht/token.hh"
 #include "locator/token_metadata.hh"
+#include "utils/chunked_vector.hh"

 namespace seastar {
    class abort_source;
@@ -122,14 +123,19 @@ public:
 */ 
 class streams_version {
 public:
-    std::vector<stream_id> streams;
+    utils::chunked_vector<stream_id> streams;
    db_clock::time_point timestamp;
-    std::optional<db_clock::time_point> expired;

-    streams_version(std::vector<stream_id> s, db_clock::time_point ts, std::optional<db_clock::time_point> exp)
+    streams_version(utils::chunked_vector<stream_id> s, db_clock::time_point ts)
        : streams(std::move(s))
        , timestamp(ts)
-        , expired(std::move(exp))
+    {}
+};
+
+class no_generation_data_exception : public std::runtime_error {
+public:
+    no_generation_data_exception(db_clock::time_point generation_ts)
+        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
    {}
 };

@@ -194,4 +200,15 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source&);

+/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
+ * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
+ * table contains streams of all generations that were present in the old table and may still contain data
+ * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
+ * these generations). */
+future<> maybe_rewrite_streams_descriptions(
+        const database&,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
 } // namespace cdc
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -668,10 +668,14 @@ struct internal_query_state {
    bool more_results = true;
 };

-::shared_ptr<internal_query_state> query_processor::create_paged_state(const sstring& query_string,
-        const std::initializer_list<data_value>& values, int32_t page_size) {
+::shared_ptr<internal_query_state> query_processor::create_paged_state(
+        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
+        const std::initializer_list<data_value>& values,
+        int32_t page_size) {
    auto p = prepare_internal(query_string);
-    auto opts = make_internal_options(p, values, db::consistency_level::ONE, infinite_timeout_config, page_size);
+    auto opts = make_internal_options(p, values, cl, timeout_config, page_size);
    ::shared_ptr<internal_query_state> res = ::make_shared<internal_query_state>(
            internal_query_state{
                    query_string,
@@ -935,17 +939,20 @@ bool query_processor::migration_subscriber::should_invalidate(
    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
        const std::initializer_list<data_value>& values,
+        int32_t page_size,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, values), std::move(f));
+    return for_each_cql_result(create_paged_state(query_string, cl, timeout_config, values, page_size), std::move(f));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, {}), std::move(f));
+    return query_internal(query_string, db::consistency_level::ONE, infinite_timeout_config, {}, 1000, std::move(f));
 }

 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -224,75 +224,52 @@ public:
    /*!
     * \brief iterate over all cql results using paging
     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
+     * You create a statement with optional parameters and pass
+     * a function that goes over the result rows.
     *
-     * The passed function would be called for all the results, return stop_iteration::yes
-     * to stop during iteration.
+     * The passed function is called for each row; return a future resolving to
+     * stop_iteration::yes to stop the iteration early.
     *
     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
-                ....
-                ....
-                return stop_iteration::no;
-            });
-
-     * You can use place holder in the query, the prepared statement will only be done once.
-     *
-     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return false the iteration would stop
-     * args - arbitrary number of query parameters
-     */
-    template<typename... Args>
-    future<> query(
-            const sstring& query_string,
-            std::function<stop_iteration(const cql3::untyped_result_set_row&)>&& f,
-            Args&&... args) {
-        return for_each_cql_result(
-                create_paged_state(query_string, { data_value(std::forward<Args>(args))... }), std::move(f));
-    }
-
-    /*!
-     * \brief iterate over all cql results using paging
-     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
-     *
-     * The passed function would be called for all the results, return future<stop_iteration::yes>
-     * to stop during iteration.
-     *
-     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
+            return query_internal(
+                    "SELECT * from system.compaction_history",
+                    db::consistency_level::ONE,
+                    infinite_timeout_config,
+                    {}, 1000,
+                    [&history] (const cql3::untyped_result_set::row& row) mutable {
                ....
                ....
                return make_ready_future<stop_iteration>(stop_iteration::no);
            });

-     * You can use place holder in the query, the prepared statement will only be done once.
+     * You can use placeholders in the query, the statement will only be prepared once.
     *
-     *
-     * query_string - the cql string, can contain place holder
-     * values - query parameters value
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * cl - consistency level of the query
+     * timeout_config - timeout configuration
+     * values - values to be substituted for the placeholders in the query
+     * page_size - maximum page size
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
+            db::consistency_level cl,
+            const timeout_config& timeout_config,
            const std::initializer_list<data_value>& values,
+            int32_t page_size,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

    /*
     * \brief iterate over all cql results using paging
-     * An overload of the query with future function without query parameters.
+     * An overload of query_internal without query parameters
+     * using CL = ONE, no timeout, and page size = 1000.
     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

@@ -354,8 +331,10 @@ private:
     */
    ::shared_ptr<internal_query_state> create_paged_state(
            const sstring& query_string,
-            const std::initializer_list<data_value>& = { },
-            int32_t page_size = 1000);
+            db::consistency_level,
+            const timeout_config&,
+            const std::initializer_list<data_value>&,
+            int32_t page_size);

    /*!
     * \brief run a query using paging
--- a/db/config.cc
+++ b/db/config.cc
@@ -780,6 +780,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Time period in seconds after which unused schema versions will be evicted from the local schema registry cache. Default is 1 second.")
    , max_concurrent_requests_per_shard(this, "max_concurrent_requests_per_shard",liveness::LiveUpdate, value_status::Used, std::numeric_limits<uint32_t>::max(),
        "Maximum number of concurrent requests a single shard can handle before it starts shedding extra load. By default, no requests will be shed.")
+    , cdc_dont_rewrite_streams(this, "cdc_dont_rewrite_streams", value_status::Used, false,
+            "Disable rewriting streams from cdc_streams_descriptions to cdc_streams_descriptions_v2. Should not be necessary, but the procedure is expensive and prone to failures; this config option is left as a backdoor in case some user requires manual intervention.")
    , alternator_port(this, "alternator_port", value_status::Used, 0, "Alternator API port")
    , alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port")
    , alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address")
--- a/db/config.hh
+++ b/db/config.hh
@@ -322,6 +322,7 @@ public:
    named_value<unsigned> user_defined_function_contiguous_allocation_limit_bytes;
    named_value<uint32_t> schema_registry_grace_period;
    named_value<uint32_t> max_concurrent_requests_per_shard;
+    named_value<bool> cdc_dont_rewrite_streams;

    named_value<uint16_t> alternator_port;
    named_value<uint16_t> alternator_https_port;
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -35,12 +35,14 @@

 #include <seastar/core/seastar.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/coroutine.hh>
+#include <seastar/core/future-util.hh>

 #include <boost/range/adaptor/transformed.hpp>

 #include <optional>
 #include <vector>
-#include <optional>
+#include <set>

 extern logging::logger cdc_log;

@@ -91,12 +93,31 @@ schema_ptr cdc_generations() {
 /* A user-facing table providing identifiers of the streams used in CDC generations. */
 schema_ptr cdc_desc() {
    thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC);
-        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC, {id})
+        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2);
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2, {id})
                /* The timestamp of this CDC generation. */
                .with_column("time", timestamp_type, column_kind::partition_key)
-                /* The set of stream identifiers used in this CDC generation. */
+                /* For convenience, the list of stream IDs in this generation is split into token ranges
+                 * which the stream IDs were mapped to (by the partitioner) when the generation was created.  */
+                .with_column("range_end", long_type, column_kind::clustering_key)
+                /* The set of stream identifiers used in this CDC generation for the token range
+                 * ending on `range_end`. */
                .with_column("streams", cdc_streams_set_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+    }();
+    return schema;
+}
+
+/* A user-facing table providing CDC generation timestamps. */
+schema_ptr cdc_timestamps() {
+    thread_local auto schema = [] {
+        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS);
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS, {id})
+                /* This is a single-partition table. The partition key is always "timestamps". */
+                .with_column("key", utf8_type, column_kind::partition_key)
+                /* The timestamp of this CDC generation. */
+                .with_column("time", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
                /* Expiration time of this CDC generation (or null if not expired). */
                .with_column("expired", timestamp_type)
                .with_version(system_keyspace::generate_schema_version(id))
@@ -105,11 +126,14 @@ schema_ptr cdc_desc() {
    return schema;
 }

+static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
+
 static std::vector<schema_ptr> all_tables() {
    return {
        view_build_status(),
        cdc_generations(),
        cdc_desc(),
+        cdc_timestamps(),
    };
 }

@@ -117,13 +141,15 @@ bool system_distributed_keyspace::is_extra_durable(const sstring& cf_name) {
    return cf_name == CDC_TOPOLOGY_DESCRIPTION;
 }

-system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm)
+system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
        : _qp(qp)
-        , _mm(mm) {
+        , _mm(mm)
+        , _sp(sp) {
 }

 future<> system_distributed_keyspace::start() {
    if (this_shard_id() != 0) {
+        _started = true;
        return make_ready_future<>();
    }

@@ -148,18 +174,18 @@ future<> system_distributed_keyspace::start() {
                });
            });
        });
-    });
+    }).then([this] { _started = true; });
 }

 future<> system_distributed_keyspace::stop() {
    return make_ready_future<>();
 }

-static const timeout_config internal_distributed_timeout_config = [] {
-    using namespace std::chrono_literals;
-    const auto t = 10s;
+static timeout_config get_timeout_config(db::timeout_clock::duration t) {
    return timeout_config{ t, t, t, t, t, t, t };
-}();
+}
+
+static const timeout_config internal_distributed_timeout_config = get_timeout_config(std::chrono::seconds(10));

 future<std::unordered_map<utils::UUID, sstring>> system_distributed_keyspace::view_status(sstring ks_name, sstring view_name) const {
    return _qp.execute_internal(
@@ -326,24 +352,69 @@ system_distributed_keyspace::expire_cdc_topology_description(
            false).discard_result();
 }

-static set_type_impl::native_type prepare_cdc_streams(const std::vector<cdc::stream_id>& streams) {
-    set_type_impl::native_type ret;
-    for (auto& s: streams) {
-        ret.push_back(data_value(s.to_bytes()));
+static future<std::vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
+        const database& db,
+        db_clock::time_point time,
+        const cdc::topology_description& desc) {
+    auto s = db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2);
+
+    auto ts = api::new_timestamp();
+    std::vector<mutation> res;
+    res.emplace_back(s, partition_key::from_singular(*s, time));
+    size_t size_estimate = 0;
+    for (auto& e : desc.entries()) {
+        // We want to keep each mutation below ~1 MB.
+        if (size_estimate >= 1000 * 1000) {
+            res.emplace_back(s, partition_key::from_singular(*s, time));
+            size_estimate = 0;
+        }
+
+        set_type_impl::native_type streams;
+        streams.reserve(e.streams.size());
+        for (auto& stream : e.streams) {
+            streams.push_back(data_value(stream.to_bytes()));
+        }
+
+        // We estimate 20 bytes per stream ID.
+        // Stream IDs themselves weigh 16 bytes each (2 * sizeof(int64_t))
+        // but there's metadata to be taken into account.
+        // It has been verified experimentally that 20 bytes per stream ID is a good estimate.
+        size_estimate += e.streams.size() * 20;
+        res.back().set_cell(clustering_key::from_singular(*s, dht::token::to_int64(e.token_range_end)),
+                to_bytes("streams"), make_set_value(cdc_streams_set_type, std::move(streams)), ts);
+
+        co_await make_ready_future<>(); // maybe yield
    }
-    return ret;
+
+    co_return res;
 }

 future<>
 system_distributed_keyspace::create_cdc_desc(
        db_clock::time_point time,
-        const std::vector<cdc::stream_id>& streams,
+        const cdc::topology_description& desc,
        context ctx) {
-    return _qp.execute_internal(
-            format("INSERT INTO {}.{} (time, streams) VALUES (?,?)", NAME, CDC_DESC),
+    using namespace std::chrono_literals;
+
+    auto ms = co_await get_cdc_streams_descriptions_v2_mutation(_qp.db(), time, desc);
+    co_await max_concurrent_for_each(ms, 20, [&] (mutation& m) -> future<> {
+        // We use the storage_proxy::mutate API since CQL is not the best for handling large batches.
+        co_await _sp.mutate(
+            { std::move(m) },
+            quorum_if_many(ctx.num_token_owners),
+            db::timeout_clock::now() + 10s,
+            nullptr, // trace_state
+            empty_service_permit(),
+            false // raw_counters
+        );
+    });
+
+    // Commit the description.
+    co_await _qp.execute_internal(
+            format("INSERT INTO {}.{} (key, time) VALUES (?, ?)", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
-            { time, make_set_value(cdc_streams_set_type, prepare_cdc_streams(streams)) },
+            { CDC_TIMESTAMPS_KEY, time },
            false).discard_result();
 }

@@ -353,7 +424,7 @@ system_distributed_keyspace::expire_cdc_desc(
        db_clock::time_point expiration_time,
        context ctx) {
    return _qp.execute_internal(
-            format("UPDATE {}.{} SET expired = ? WHERE time = ?", NAME, CDC_DESC),
+            format("UPDATE {}.{} SET expired = ? WHERE time = ?", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
            { expiration_time, streams_ts },
@@ -364,11 +435,44 @@ future<bool>
 system_distributed_keyspace::cdc_desc_exists(
        db_clock::time_point streams_ts,
        context ctx) {
-    return _qp.execute_internal(
-            format("SELECT time FROM {}.{} WHERE time = ?", NAME, CDC_DESC),
+    // Reading from this table on a freshly upgraded node that is the first to announce the CDC_TIMESTAMPS
+    // schema would most likely result in replicas refusing to return data, telling the node that they can't
+    // find the schema. Indeed, it takes some time for the nodes to synchronize their schema; schema is
+    // only eventually consistent.
+    //
+    // This problem doesn't occur on writes since writes enforce schema pull if the receiving replica
+    // notices that the write comes from an unknown schema, but it does occur on reads.
+    //
+    // Hence we work around it with a hack: we send a mutation with an empty partition to force our replicas
+    // to pull the schema.
+    //
+    // This is not strictly necessary; the code that calls this function does it in a retry loop
+    // so eventually, after the schema gets pulled, the read would succeed.
+    // Still, the errors are also unnecessary and if we can get rid of them - let's do it.
+    //
+    // FIXME: find a more elegant way to deal with this ``problem''.
+    if (!_forced_cdc_timestamps_schema_sync) {
+        using namespace std::chrono_literals;
+        auto s = _qp.db().find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS);
+        mutation m(s, partition_key::from_singular(*s, CDC_TIMESTAMPS_KEY));
+        co_await _sp.mutate(
+            { std::move(m) },
+            quorum_if_many(ctx.num_token_owners),
+            db::timeout_clock::now() + 10s,
+            nullptr, // trace_state
+            empty_service_permit(),
+            false // raw_counters
+        );
+
+        _forced_cdc_timestamps_schema_sync = true;
+    }
+
+    // At this point replicas know the schema, we can perform the actual read...
+    co_return co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ? AND time = ?", NAME, CDC_TIMESTAMPS),
            quorum_if_many(ctx.num_token_owners),
            internal_distributed_timeout_config,
-            { streams_ts },
+            { CDC_TIMESTAMPS_KEY, streams_ts },
            false
    ).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) -> bool {
        return !cql_result->empty() && cql_result->one().has("time");
@@ -376,27 +480,76 @@ system_distributed_keyspace::cdc_desc_exists(
 }

 future<std::map<db_clock::time_point, cdc::streams_version>> 
-system_distributed_keyspace::cdc_get_versioned_streams(context ctx) {
-    return _qp.execute_internal(
-            format("SELECT * FROM {}.{}", NAME, CDC_DESC),
+system_distributed_keyspace::cdc_get_versioned_streams(db_clock::time_point not_older_than, context ctx) {
+    auto timestamps_cql = co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ?", NAME, CDC_TIMESTAMPS),
             quorum_if_many(ctx.num_token_owners),
             internal_distributed_timeout_config,
-            {},
-            false
-    ).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
-        std::map<db_clock::time_point, cdc::streams_version> result;
+            { CDC_TIMESTAMPS_KEY },
+            false);

-        for (auto& row : *cql_result) {
-            auto ts = row.get_as<db_clock::time_point>("time");
-            auto exp = row.get_opt<db_clock::time_point>("expired");
-            std::vector<cdc::stream_id> ids;
-            row.get_list_data<bytes>("streams", std::back_inserter(ids)); 
-            result.emplace(ts, cdc::streams_version(std::move(ids), ts, exp));
+    std::vector<db_clock::time_point> timestamps;
+    timestamps.reserve(timestamps_cql->size());
+    for (auto& row : *timestamps_cql) {
+        timestamps.push_back(row.get_as<db_clock::time_point>("time"));
+    }
+    // NOTE(review): the reverse iterators below treat the rows as arriving newest-first (descending `time`) - confirm against the table's clustering order.
+    // `time` is the table's clustering key, so the results are already sorted
+    auto first = std::lower_bound(timestamps.rbegin(), timestamps.rend(), not_older_than);
+    // need first gen _intersecting_ the timestamp.
+    if (first != timestamps.rbegin()) {
+        --first;
+    }
+    // Fetch the stream IDs of every selected generation; the `5` caps how many of these queries run concurrently.
+    std::map<db_clock::time_point, cdc::streams_version> result;
+    co_await max_concurrent_for_each(first, timestamps.rend(), 5, [this, &ctx, &result] (db_clock::time_point ts) -> future<> {
+        auto streams_cql = co_await _qp.execute_internal(
+                format("SELECT streams FROM {}.{} WHERE time = ?", NAME, CDC_DESC_V2),
+                quorum_if_many(ctx.num_token_owners),
+                internal_distributed_timeout_config,
+                { ts },
+                false);
+        // Every returned row contributes its `streams` cell; all of them are appended to one list for this generation.
+        utils::chunked_vector<cdc::stream_id> ids;
+        for (auto& row : *streams_cql) {
+            row.get_list_data<bytes>("streams", std::back_inserter(ids));
+            co_await make_ready_future<>(); // maybe yield
         }

-        return result;
+        result.emplace(ts, cdc::streams_version{std::move(ids), ts});
     });
+
+    co_return result;
 }

+future<db_clock::time_point> // Resolves to the `time` of the first row in the CDC_TIMESTAMPS partition.
+system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
+    auto timestamp_cql = co_await _qp.execute_internal(
+            format("SELECT time FROM {}.{} WHERE key = ? limit 1", NAME, CDC_TIMESTAMPS), // only the first clustered row is fetched
+            quorum_if_many(ctx.num_token_owners),
+            internal_distributed_timeout_config,
+            { CDC_TIMESTAMPS_KEY },
+            false);
+    // NOTE(review): treating the first row as the current generation presumes newest-first clustering order, and one() presumes a non-empty result - confirm both.
+    co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
+}
+
+future<std::vector<db_clock::time_point>>
+system_distributed_keyspace::get_cdc_desc_v1_timestamps(context ctx) { // Collects the `time` column of every row in the CDC_DESC_V1 table.
+    std::vector<db_clock::time_point> res;
+    co_await _qp.query_internal(
+            format("SELECT time FROM {}.{}", NAME, CDC_DESC_V1), // no WHERE clause: scans the whole table
+            quorum_if_many(ctx.num_token_owners),
+            // This is a long and expensive scan (mostly due to #8061).
+            // Give it a bit more time than usual.
+            get_timeout_config(std::chrono::seconds(60)),
+            {},
+            1000, // presumably the page size - confirm against query_internal()'s signature
+            [&] (const cql3::untyped_result_set_row& r) {
+        res.push_back(r.get_as<db_clock::time_point>("time"));
+        return make_ready_future<stop_iteration>(stop_iteration::no); // never stop early: every row is collected
+    });
+    co_return res;
+}

 }
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -41,6 +41,10 @@ namespace cdc {
    class streams_version;
 } // namespace cdc

+namespace service {
+    class storage_proxy;
+}
+
 namespace db {

 class system_distributed_keyspace {
@@ -51,8 +55,16 @@ public:
    /* Nodes use this table to communicate new CDC stream generations to other nodes. */
    static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";

-    /* This table is used by CDC clients to learn about avaliable CDC streams. */
-    static constexpr auto CDC_DESC = "cdc_streams_descriptions";
+    /* This table is used by CDC clients to learn about available CDC streams. */
+    static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
+
+    /* Used by CDC clients to learn CDC generation timestamps. */
+    static constexpr auto CDC_TIMESTAMPS = "cdc_generation_timestamps";
+
+    /* Previous version of the "cdc_streams_descriptions_v2" table.
+     * We use it in the upgrade procedure to ensure that CDC generations appearing
+     * in the old table also appear in the new table, if necessary. */
+    static constexpr auto CDC_DESC_V1 = "cdc_streams_descriptions";

    /* Information required to modify/query some system_distributed tables, passed from the caller. */
    struct context {
@@ -62,17 +74,23 @@ public:
 private:
    cql3::query_processor& _qp;
    service::migration_manager& _mm;
+    service::storage_proxy& _sp;
+
+    bool _started = false;
+    bool _forced_cdc_timestamps_schema_sync = false;

 public:
    /* Should writes to the given table always be synchronized by commitlog (flushed to disk)
     * before being acknowledged? */
    static bool is_extra_durable(const sstring& cf_name);

-    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&);
+    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);

    future<> start();
    future<> stop();

+    bool started() const { return _started; }
+
    future<std::unordered_map<utils::UUID, sstring>> view_status(sstring ks_name, sstring view_name) const;
    future<> start_view_build(sstring ks_name, sstring view_name) const;
    future<> finish_view_build(sstring ks_name, sstring view_name) const;
@@ -82,11 +100,18 @@ public:
    future<std::optional<cdc::topology_description>> read_cdc_topology_description(db_clock::time_point streams_ts, context);
    future<> expire_cdc_topology_description(db_clock::time_point streams_ts, db_clock::time_point expiration_time, context);

-    future<> create_cdc_desc(db_clock::time_point streams_ts, const std::vector<cdc::stream_id>&, context);
+    future<> create_cdc_desc(db_clock::time_point streams_ts, const cdc::topology_description&, context);
    future<> expire_cdc_desc(db_clock::time_point streams_ts, db_clock::time_point expiration_time, context);
    future<bool> cdc_desc_exists(db_clock::time_point streams_ts, context);

-    future<std::map<db_clock::time_point, cdc::streams_version>> cdc_get_versioned_streams(context);
+    /* Get all generation timestamps appearing in the "cdc_streams_descriptions" table
+     * (the old CDC stream description table). */
+    future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(context);
+
+    future<std::map<db_clock::time_point, cdc::streams_version>> cdc_get_versioned_streams(db_clock::time_point not_older_than, context);
+
+    future<db_clock::time_point> cdc_current_generation_timestamp(context);
+
 };

 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1574,6 +1574,21 @@ future<> update_cdc_streams_timestamp(db_clock::time_point tp) {
            .discard_result().then([] { return force_blocking_flush(v3::CDC_LOCAL); });
 }

+static const sstring CDC_REWRITTEN_KEY = "rewritten";
+// Inserts the CDC_REWRITTEN_KEY row into system.<v3::CDC_LOCAL>; `tp`, when set, is stored in its streams_timestamp column.
+future<> cdc_set_rewritten(std::optional<db_clock::time_point> tp) {
+    if (tp) {
+        return qctx->execute_cql(
+                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
+                CDC_REWRITTEN_KEY, *tp).discard_result();
+    } else {
+        // Insert just the row marker.
+        return qctx->execute_cql(
+                format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL),
+                CDC_REWRITTEN_KEY).discard_result();
+    }
+}
+
 future<> force_blocking_flush(sstring cfname) {
    assert(qctx);
    return qctx->_qp.invoke_on_all([cfname = std::move(cfname)] (cql3::query_processor& qp) {
@@ -1646,6 +1661,14 @@ future<std::optional<db_clock::time_point>> get_saved_cdc_streams_timestamp() {
    });
 }

+future<bool> cdc_is_rewritten() { // Resolves to true iff system.<v3::CDC_LOCAL> has a row under CDC_REWRITTEN_KEY.
+    // We don't care about the actual timestamp; it's additional information for debugging purposes.
+    return qctx->execute_cql(format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL), CDC_REWRITTEN_KEY)
+            .then([] (::shared_ptr<cql3::untyped_result_set> msg) {
+        return !msg->empty(); // any returned row means the marker exists
+    });
+}
+
 bool bootstrap_complete() {
    return get_bootstrap_state() == bootstrap_state::COMPLETED;
 }
@@ -1864,7 +1887,7 @@ future<> get_compaction_history(compaction_history_consumer&& f) {
    return do_with(compaction_history_consumer(std::move(f)),
            [](compaction_history_consumer& consumer) mutable {
        sstring req = format("SELECT * from system.{}", COMPACTION_HISTORY);
-        return qctx->qp().query(req, [&consumer] (const cql3::untyped_result_set::row& row) mutable {
+        return qctx->qp().query_internal(req, [&consumer] (const cql3::untyped_result_set::row& row) mutable {
            compaction_history_entry entry;
            entry.id = row.get_as<utils::UUID>("id");
            entry.ks = row.get_as<sstring>("keyspace_name");
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -634,5 +634,8 @@ future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& pr
 future<> save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout);
 future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);

+future<bool> cdc_is_rewritten();
+future<> cdc_set_rewritten(std::optional<db_clock::time_point>);
+
 } // namespace system_keyspace
 } // namespace db
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -244,12 +244,12 @@ if __name__ == "__main__":
                # and https://cloud.google.com/compute/docs/disks/local-ssd#nvme
                # note that scylla iotune might measure more, this is GCP recommended
                mbs=1024*1024
-                if nr_disks >= 1 & nr_disks < 4:
+                if nr_disks >= 1 and nr_disks < 4:
                    disk_properties["read_iops"] = 170000 * nr_disks
                    disk_properties["read_bandwidth"] = 660 * mbs * nr_disks
                    disk_properties["write_iops"] = 90000 * nr_disks
                    disk_properties["write_bandwidth"] = 350 * mbs * nr_disks
-                elif nr_disks >= 4 & nr_disks <= 8:
+                elif nr_disks >= 4 and nr_disks <= 8:
                    disk_properties["read_iops"] = 680000
                    disk_properties["read_bandwidth"] = 2650 * mbs
                    disk_properties["write_iops"] = 360000
@@ -281,3 +281,5 @@ if __name__ == "__main__":
                run_iotune()
        else:
            run_iotune()
+        os.chmod(etcdir() + '/scylla.d/io_properties.yaml', 0o644)
+        os.chmod(etcdir() + '/scylla.d/io.conf', 0o644)
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -82,6 +82,7 @@ def create_perftune_conf(cfg):
        yaml = run('/opt/scylladb/scripts/perftune.py ' + params, shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        with open('/etc/scylla.d/perftune.yaml', 'w') as f:
            f.write(yaml)
+        os.chmod('/etc/scylla.d/perftune.yaml', 0o644)
        return True
    else:
        return False
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -27,6 +27,7 @@ import grp
 import sys
 import stat
 import distro
+from pathlib import Path
 from scylla_util import *
 from subprocess import run

@@ -85,8 +86,14 @@ if __name__ == '__main__':
            raiddevs_to_try = [args.raiddev, ]
        for fsdev in raiddevs_to_try:
            raiddevname = os.path.basename(fsdev)
-            if not os.path.exists(f'/sys/block/{raiddevname}/md/array_state'):
+            array_state = Path(f'/sys/block/{raiddevname}/md/array_state')
+            # mdX is not allocated
+            if not array_state.exists():
                break
+            with array_state.open() as f:
+                # allocated, but no devices, not running
+                if f.read().strip() == 'clear':
+                    break
            print(f'{fsdev} is already using')
        else:
            if args.raiddev is None:
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -176,11 +176,6 @@ def warn_offline(setup):
 def warn_offline_missing_pkg(setup, pkg):
    colorprint('{red}{setup} disabled by default, since {pkg} not available.{nocolor}', setup=setup, pkg=pkg)

-def current_umask():
-    current = os.umask(0)
-    os.umask(current)
-    return current
-
 if __name__ == '__main__':
    if not is_nonroot() and os.getuid() > 0:
        print('Requires root permission.')
@@ -331,12 +326,6 @@ if __name__ == '__main__':
    selinux_reboot_required = False
    set_clocksource = False

-    umask = current_umask()
-    # files have to be world-readable
-    if not is_nonroot() and (umask & 0o7) != 0o2:
-        colorprint('{red}Scylla does not work with current umask setting ({umask}),\nplease restore umask to the default value (0022).{nocolor}', umask='{0:o}'.format(umask).zfill(4))
-        sys.exit(1)
-
    if interactive:
        colorprint('{green}Skip any of the following steps by answering \'no\'{nocolor}')

@@ -375,11 +364,13 @@ if __name__ == '__main__':
            if version_check:
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: True\n')
+                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
                systemd_unit('scylla-housekeeping-daily.timer').unmask()
                systemd_unit('scylla-housekeeping-restart.timer').unmask()
            else:
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: False\n')
+                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
                hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
                hk_daily.mask()
                hk_daily.stop()
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -380,6 +380,8 @@ class aws_instance:
            raise Exception("found more than one disk mounted at root'".format(root_dev_candidates))

        root_dev = root_dev_candidates[0].device
+        if root_dev == '/dev/root':
+            root_dev = run('findmnt -n -o SOURCE /', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        nvmes_present = list(filter(nvme_re.match, os.listdir("/dev")))
        return {"root": [ root_dev ], "ephemeral": [ x for x in nvmes_present if not root_dev.startswith(os.path.join("/dev/", x)) ] }

--- a/dist/debian/debian/rules
+++ b/dist/debian/debian/rules
@@ -29,11 +29,11 @@ ifeq ($(product),scylla)
 	dh_installinit --no-start
 else
 	dh_installinit --no-start --name scylla-server
+	dh_installinit --no-start --name scylla-node-exporter
 endif
 	dh_installinit --no-start --name scylla-housekeeping-daily
 	dh_installinit --no-start --name scylla-housekeeping-restart
 	dh_installinit --no-start --name scylla-fstrim
-	dh_installinit --no-start --name node-exporter

 override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.4/latest/scylla.repo
+ARG VERSION=4.4

 ADD scylla_bashrc /scylla_bashrc

--- a/dist/offline_installer/redhat/build_offline_installer.sh
+++ b/dist/offline_installer/redhat/build_offline_installer.sh
@@ -26,26 +26,31 @@ fi
 print_usage() {
    echo "build_offline_installer.sh --repo [URL]"
    echo "  --repo  repository for fetching scylla rpm, specify .repo file URL"
-    echo "  --releasever  use specific minor version of the distribution repo (ex: 7.4)"
+    echo "  --image [IMAGE]  Use the specified docker IMAGE"
+    echo "  --no-docker  Build offline installer without using docker"
    exit 1
 }

-is_rhel7_variant() {
-    [ "$ID" = "rhel" -o "$ID" = "ol" -o "$ID" = "centos" ] && [[ "$VERSION_ID" =~ ^7 ]]
-}
+here="$(realpath $(dirname "$0"))"
+releasever=`rpm -q --provides $(rpm -q --whatprovides "system-release(releasever)") | grep "system-release(releasever)"| uniq |  cut -d ' ' -f 3`

 REPO=
-RELEASEVER=
+IMAGE=docker.io/centos:7
+NO_DOCKER=false
 while [ $# -gt 0 ]; do
    case "$1" in
        "--repo")
            REPO=$2
            shift 2
            ;;
-        "--releasever")
-            RELEASEVER=$2
+        "--image")
+            IMAGE=$2
            shift 2
            ;;
+        "--no-docker")
+            NO_DOCKER=true
+            shift 1
+            ;;
        *)
            print_usage
            ;;
@@ -59,25 +64,17 @@ if [ -z $REPO ]; then
    exit 1
 fi

-if ! is_rhel7_variant; then
-    echo "Unsupported distribution"
-    exit 1
-fi
-
-if [ "$ID" = "centos" ]; then
-    if [ ! -f /etc/yum.repos.d/epel.repo ]; then
-        sudo yum install -y epel-release
+if ! $NO_DOCKER; then
+    if [[ -f ~/.config/scylladb/dbuild ]]; then
+        . ~/.config/scylladb/dbuild
    fi
-    RELEASE=7
-else
-    if [ ! -f /etc/yum.repos.d/epel.repo ]; then
-        sudo rpm -Uvh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+    if which docker >/dev/null 2>&1 ; then
+      tool=${DBUILD_TOOL-docker}
+    elif which podman >/dev/null 2>&1 ; then
+      tool=${DBUILD_TOOL-podman}
+    else
+      echo "Please make sure you install either podman or docker on this machine to run dbuild" && exit 1
    fi
-    RELEASE=7Server
-fi
-
-if [ ! -f /usr/bin/yumdownloader ]; then
-    sudo yum -y install yum-utils
 fi

 if [ ! -f /usr/bin/wget ]; then
@@ -85,29 +82,55 @@ if [ ! -f /usr/bin/wget ]; then
 fi

 if [ ! -f /usr/bin/makeself ]; then
-    sudo yum -y install makeself
+    if $NO_DOCKER; then
+        # makeself on EPEL7 is too old, so borrow it from EPEL8; this just works because the package has no dependencies.
+        # NOTE(review): release_major is not assigned in the visible part of this script (only lib/install_deps.sh derives it) - confirm.
+        if [ "$release_major" = '7' ]; then
+            sudo rpm --import https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8
+            sudo cp "$here"/lib/epel8.repo /etc/yum.repos.d/
+            YUM_OPTS="--enablerepo=epel8"
+        elif [ "$release_major" = '8' ]; then
+            sudo yum -y install epel-release || sudo yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+        fi
+    fi
+    sudo yum -y install ${YUM_OPTS:+"$YUM_OPTS"} makeself  # expands to nothing (not an empty argument) when YUM_OPTS is unset or empty
 fi

 if [ ! -f /usr/bin/createrepo ]; then
    sudo yum -y install createrepo
 fi

-sudo yum -y install yum-plugin-downloadonly
+makeself_ver=$(makeself --version|cut -d ' ' -f 3|sed -e 's/\.//g')
+if [ $makeself_ver -lt 240 ]; then
+    echo "$(makeself --version) is too old, please install 2.4.0 or later"
+    exit 1
+fi

-cd /etc/yum.repos.d/
-sudo wget -N $REPO
-cd -
-
-sudo rm -rf build/installroot build/offline_installer build/scylla_offline_installer.sh
+sudo rm -rf build/installroot build/offline_docker build/offline_installer build/scylla_offline_installer.sh
 mkdir -p build/installroot
 mkdir -p build/installroot/etc/yum/vars
-sudo sh -c "echo $RELEASE >> build/installroot/etc/yum/vars/releasever"
+
+mkdir -p build/offline_docker
+wget "$REPO" -O build/offline_docker/scylla.repo
+cp "$here"/lib/install_deps.sh build/offline_docker
+cp "$here"/lib/Dockerfile.in build/offline_docker/Dockerfile
+sed -i -e "s#@@IMAGE@@#$IMAGE#" build/offline_docker/Dockerfile
+
+cd build/offline_docker
+if $NO_DOCKER; then
+    sudo cp scylla.repo /etc/yum.repos.d/scylla.repo
+    sudo ./install_deps.sh
+else
+    image_id=$($tool build -q .)
+fi
+cd -

 mkdir -p build/offline_installer
-cp dist/offline_installer/redhat/header build/offline_installer
-if [ -n "$RELEASEVER" ]; then
-    YUMOPTS="--releasever=$RELEASEVER"
+cp "$here"/lib/header build/offline_installer
+if $NO_DOCKER; then
+    "$here"/lib/construct_offline_repo.sh
+else
+    ./tools/toolchain/dbuild --image "$image_id" -- "$here"/lib/construct_offline_repo.sh
 fi
-sudo yum -y install $YUMOPTS --downloadonly --installroot=`pwd`/build/installroot --downloaddir=build/offline_installer scylla sudo ntp ntpdate net-tools kernel-tools
 (cd build/offline_installer; createrepo -v .)
-(cd build; makeself offline_installer scylla_offline_installer.sh "Scylla offline package" ./header)
+(cd build; makeself --keep-umask offline_installer scylla_offline_installer.sh "Scylla offline package" ./header)
--- a/dist/offline_installer/redhat/lib/Dockerfile.in
+++ b/dist/offline_installer/redhat/lib/Dockerfile.in
@@ -0,0 +1,5 @@
+FROM @@IMAGE@@
+ADD install_deps.sh install_deps.sh
+RUN ./install_deps.sh
+ADD scylla.repo /etc/yum.repos.d/scylla.repo
+CMD /bin/bash
--- a/dist/offline_installer/redhat/lib/construct_offline_repo.sh
+++ b/dist/offline_installer/redhat/lib/construct_offline_repo.sh
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+# Downloads (without installing) scylla and the other packages listed below into build/offline_installer.
+releasever=`rpm -q --provides $(rpm -q --whatprovides "system-release(releasever)") | grep "system-release(releasever)"| uniq |  cut -d ' ' -f 3`
+
+# A failure here can be ignored: the copy only matters when such files exist
+cp /etc/yum/vars/* build/installroot/etc/yum/vars/ ||:
+
+# run yum in non-root mode using fakeroot
+fakeroot yum -y install --downloadonly --releasever="$releasever" --installroot=`pwd`/build/installroot --downloaddir=build/offline_installer scylla sudo chrony net-tools kernel-tools mdadm xfsprogs
--- a/dist/offline_installer/redhat/lib/epel8.repo
+++ b/dist/offline_installer/redhat/lib/epel8.repo
@@ -0,0 +1,7 @@
+[epel8]
+name=Extra Packages for Enterprise Linux 8 - $basearch
+#baseurl=https://download.fedoraproject.org/pub/epel/8/Everything/$basearch
+metalink=https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir
+enabled=0
+gpgcheck=1
+countme=1
--- a/dist/offline_installer/redhat/lib/header
+++ b/dist/offline_installer/redhat/lib/header
--- a/dist/offline_installer/redhat/lib/install_deps.sh
+++ b/dist/offline_installer/redhat/lib/install_deps.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Installs the EPEL repository definition and fakeroot, each only when it is not already present.
+. /etc/os-release
+# Keep only the leading digits of VERSION_ID (e.g. "7.9" -> "7"); used to pick the matching EPEL release RPM.
+release_major=$(echo $VERSION_ID|sed -e 's/^\([0-9]*\)[^0-9]*.*/\1/')
+# Try the epel-release package first; if that fails, install the RPM from the fedoraproject.org URL.
+if [ ! -f /etc/yum.repos.d/epel.repo ]; then
+    yum -y install epel-release || yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-"$release_major".noarch.rpm
+fi
+if [ ! -f /usr/bin/fakeroot ]; then
+    yum -y install fakeroot
+fi
--- a/docs/design-notes/cdc.md
+++ b/docs/design-notes/cdc.md
@@ -146,6 +146,15 @@ Next, the node starts gossiping the timestamp of the new generation together wit
        }).get();
 ```

+The node persists the currently gossiped timestamp in the `system.cdc_local` table in order to recover it on restart. This is the schema:
+```
+CREATE TABLE system.cdc_local (
+    key text PRIMARY KEY,
+    streams_timestamp timestamp
+) ...
+```
+The timestamp is kept under the `"cdc_local"` key in the `streams_timestamp` column.
+
 When other nodes learn about the generation, they'll extract it from the `cdc_generation_descriptions` table and save it using `cdc::metadata::insert(db_clock::time_point, topology_description&&)`.
 Notice that nodes learn about the generation together with the new node's tokens. When they learn about its tokens they'll immediately start sending writes to the new node (in the case of bootstrapping, it will become a pending replica). But the old generation will still be operating for a minute or two. Thus colocation will be lost for a while. This problem will be fixed when the two-phase-commit approach is implemented.

@@ -157,9 +166,54 @@ Due to the need of maintaining colocation we don't allow the client to send writ
 Suppose that a write is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`). Then we only allow the write if its timestamp is in the interval [`T`, `C + generation_leeway`), where `generation_leeway` is a small time-inteval constant (e.g. 5 seconds).
 Reason: we cannot allow writes before `T`, because they belong to the old generation whose token ranges might no longer refine the current vnodes, so the corresponding log write would not necessarily be colocated with the base write. We also cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.

-### Streams description table
+### Streams description tables

-The `cdc_streams_descriptions` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. It's definition is as follows (db/system_distributed_keyspace.cc):
+The `cdc_streams_descriptions_v2` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. Its definition is as follows (db/system_distributed_keyspace.cc):
+```
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC_V2, {id})
+                /* The timestamp of this CDC generation. */
+                .with_column("time", timestamp_type, column_kind::partition_key)
+                /* For convenience, the list of stream IDs in this generation is split into token ranges
+                 * which the stream IDs were mapped to (by the partitioner) when the generation was created.  */
+                .with_column("range_end", long_type, column_kind::clustering_key)
+                /* The set of stream identifiers used in this CDC generation for the token range
+                 * ending on `range_end`. */
+                .with_column("streams", cdc_streams_set_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+```
+where
+```
+thread_local data_type cdc_stream_tuple_type = tuple_type_impl::get_instance({long_type, long_type});
+thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(cdc_stream_tuple_type, false);
+```
+This table contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation grouped by token ranges that the stream IDs are mapped to. It is meant to be user-facing, in contrast to `cdc_generation_descriptions` which is used internally.
+
+There is a second table that contains just the generations' timestamps, `cdc_generation_timestamps`:
+```
+        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TIMESTAMPS, {id})
+                /* This is a single-partition table. The partition key is always "timestamps". */
+                .with_column("key", utf8_type, column_kind::partition_key)
+                /* The timestamp of this CDC generation. */
+                .with_column("time", timestamp_type, column_kind::clustering_key)
+                /* Expiration time of this CDC generation (or null if not expired). */
+                .with_column("expired", timestamp_type)
+                .with_version(system_keyspace::generate_schema_version(id))
+                .build();
+```
+It is a single-partition table, containing the timestamps of generations found in `cdc_streams_descriptions_v2` in separate clustered rows. It allows clients to efficiently query if there are any new generations, e.g.:
+```
+SELECT time FROM system_distributed.cdc_generation_timestamps WHERE key = 'timestamps' AND time > X
+```
+where `X` is the last timestamp known by that particular client.
+
+When nodes learn about a CDC generation through gossip, they race to update these description tables by first inserting the set of rows containing this generation's stream IDs into `cdc_streams_descriptions_v2` and then, if the node succeeds, by inserting its timestamp into `cdc_generation_timestamps` (see `cdc::update_streams_description`). This operation is idempotent so it doesn't matter if multiple nodes do it at the same time.
+
+Note that the first phase of inserting stream IDs may fail in the middle; in that case, the partition for that generation may contain partial information. Thus a client can only safely read a partition from `cdc_streams_descriptions_v2` (i.e. without the risk of observing only a part of the stream IDs) if they first observe its timestamp in `cdc_generation_timestamps`.
+
+### Streams description table V1 and rewriting
+
+As the name suggests, `cdc_streams_descriptions_v2` is the second version of the streams description table. The previous schema was:
 ```
        return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC, {id})
                /* The timestamp of this CDC generation. */
@@ -171,14 +225,26 @@ The `cdc_streams_descriptions` table in the `system_distributed` keyspace allows
                .with_version(system_keyspace::generate_schema_version(id))
                .build();
 ```
-where
-```
-thread_local data_type cdc_stream_tuple_type = tuple_type_impl::get_instance({long_type, long_type});
-thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(cdc_stream_tuple_type, false);
-```
-This table simply contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation. It is meant to be user-facing, in contrast to `cdc_generation_descriptions` which is used internally.

-When nodes learn about a CDC generation through gossip, they race to update the description table by inserting a proper row (see `cdc::update_streams_description`). This operation is idempotent so it doesn't matter if multiple nodes do it at the same time.
+The entire set of stream IDs (for all token ranges) was stored as a single collection. With large clusters the collection could grow quite big: for example, with 100 nodes with 64 shards each and 256 vnodes per node, a new generation would contain 1.6M stream IDs, resulting in a ~32MB collection. For reasons described in issue #7993 this would disqualify the previous schema.
+
+However, that was the schema used in the Scylla 4.3 release. For clusters that used CDC with this schema we need to ensure that stream descriptions residing in the old table appear in the new table as well (if necessary, i.e. if these streams may still contain some data).
+
+To do that, we perform a rewrite procedure. Each node does the following on restart:
+1. Check if the `system_distributed.cdc_streams_descriptions` table exists. If it doesn't, there's nothing to rewrite, so stop.
+2. Check if the `system.cdc_local` table contains a row with `key = "rewritten"`. If it does then rewrite was already performed, so stop.
+3. Check if there is a table with CDC enabled. If not, add a row with `key = "rewritten"` to `system.cdc_local` and stop; no rewriting is necessary (and won't be) since old generations - even if they exist - are not needed.
+4. Retrieve all generation timestamps from the old streams description table by performing a full range scan: `select time from system_distributed.cdc_streams_descriptions`. This may be a long/expensive operation, hence it's performed in a background task (the procedure is moved to background in this step).
+5. Filter out timestamps that are "too old". A generation timestamp is "too old" if there is a greater timestamp `T` such that for every table with CDC enabled, `now - ttl > T`, where `now` is the current time and `ttl` is the table's TTL setting. This means that the table cannot contain data that belongs to the "too old" generation. Thus, if each table passes this check for a given generation, that generation doesn't need to be rewritten.
+6. For each timestamp that's left:
+6.1 if it's already present in the new table, skip it (we check this by querying `cdc_generation_timestamps`)
+6.2 fetch the generation (by querying `cdc_generation_descriptions`)
+6.3 insert the generation's streams into the new table
+7. Insert a row with `key = "rewritten"` into `system.cdc_local`.
+
+Note that every node will perform this procedure on upgrade, but there's a high chance that only one of them actually proceeds all the way to step 6.2 if upgrade is performed correctly, i.e. in a rolling fashion (nodes are restarted one-by-one).
+
+In order to prevent new nodes from doing the rewriting (we only want upgrading nodes to do it), we insert the `key = "rewritten"` row on bootstrap as well, before we start this procedure (so the node won't pass the second check).

 #### TODO: expired generations
-The `expired` column in `cdc_streams_descriptions` and `cdc_generation_descriptions` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.
+The `expired` column in `cdc_generation_timestamps` and `cdc_generation_descriptions` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1792,6 +1792,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
                }).handle_exception_type([node, &fall_back_to_syn_msg] (seastar::rpc::unknown_verb_error&) {
                    logger.warn("Node {} does not support get_endpoint_states verb", node);
                    fall_back_to_syn_msg = true;
+                }).handle_exception_type([node, &nodes_down] (seastar::rpc::timeout_error&) {
+                    logger.warn("The get_endpoint_states verb to node {} was timeout", node);
                }).handle_exception_type([node, &nodes_down] (seastar::rpc::closed_error&) {
                    nodes_down++;
                    logger.warn("Node {} is down for get_endpoint_states verb", node);
--- a/idl/partition_checksum.idl.hh
+++ b/idl/partition_checksum.idl.hh
@@ -103,22 +103,3 @@ enum class repair_row_level_start_status: uint8_t {
 struct repair_row_level_start_response {
    repair_row_level_start_status status;
 };
-
-enum class node_ops_cmd : uint32_t {
-     removenode_prepare,
-     removenode_heartbeat,
-     removenode_sync_data,
-     removenode_abort,
-     removenode_done,
-};
-
-struct node_ops_cmd_request {
-    node_ops_cmd cmd;
-    utils::UUID ops_uuid;
-    std::list<gms::inet_address> ignore_nodes;
-    std::list<gms::inet_address> leaving_nodes;
-};
-
-struct node_ops_cmd_response {
-    bool ok;
-};
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -335,7 +335,6 @@ public:
    void remove_bootstrap_tokens(std::unordered_set<token> tokens);

    void add_leaving_endpoint(inet_address endpoint);
-    void del_leaving_endpoint(inet_address endpoint);
 public:
    void remove_endpoint(inet_address endpoint);
 #if 0
@@ -1658,10 +1657,6 @@ void token_metadata_impl::add_leaving_endpoint(inet_address endpoint) {
     _leaving_endpoints.emplace(endpoint);
 }

-void token_metadata_impl::del_leaving_endpoint(inet_address endpoint) {
-     _leaving_endpoints.erase(endpoint);
-}
-
 void token_metadata_impl::add_replacing_endpoint(inet_address existing_node, inet_address replacing_node) {
    tlogger.info("Added node {} as pending replacing endpoint which replaces existing node {}",
            replacing_node, existing_node);
@@ -1932,11 +1927,6 @@ token_metadata::add_leaving_endpoint(inet_address endpoint) {
    _impl->add_leaving_endpoint(endpoint);
 }

-void
-token_metadata::del_leaving_endpoint(inet_address endpoint) {
-    _impl->del_leaving_endpoint(endpoint);
-}
-
 void
 token_metadata::remove_endpoint(inet_address endpoint) {
    _impl->remove_endpoint(endpoint);
--- a/locator/token_metadata.hh
+++ b/locator/token_metadata.hh
@@ -238,7 +238,6 @@ public:
    void remove_bootstrap_tokens(std::unordered_set<token> tokens);

    void add_leaving_endpoint(inet_address endpoint);
-    void del_leaving_endpoint(inet_address endpoint);

    void remove_endpoint(inet_address endpoint);

--- a/main.cc
+++ b/main.cc
@@ -1063,7 +1063,7 @@ int main(int ac, char** av) {
                gms::stop_gossiping().get();
            });

-            sys_dist_ks.start(std::ref(qp), std::ref(mm)).get();
+            sys_dist_ks.start(std::ref(qp), std::ref(mm), std::ref(proxy)).get();

            ss.init_server().get();
            sst_format_selector.sync();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -477,6 +477,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // as well as reduce latency as there are potentially many requests
    // blocked on schema version request.
    case messaging_verb::GOSSIP_DIGEST_SYN:
+    case messaging_verb::GOSSIP_DIGEST_ACK:
    case messaging_verb::GOSSIP_DIGEST_ACK2:
    case messaging_verb::GOSSIP_SHUTDOWN:
    case messaging_verb::GOSSIP_ECHO:
@@ -504,7 +505,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM:
    case messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM:
    case messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM:
-    case messaging_verb::NODE_OPS_CMD:
    case messaging_verb::HINT_MUTATION:
        return 1;
    case messaging_verb::CLIENT_ID:
@@ -512,7 +512,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::READ_DATA:
    case messaging_verb::READ_MUTATION_DATA:
    case messaging_verb::READ_DIGEST:
-    case messaging_verb::GOSSIP_DIGEST_ACK:
    case messaging_verb::DEFINITIONS_UPDATE:
    case messaging_verb::TRUNCATE:
    case messaging_verb::MIGRATION_REQUEST:
@@ -1350,17 +1349,6 @@ future<std::vector<row_level_diff_detect_algorithm>> messaging_service::send_rep
    return send_message<future<std::vector<row_level_diff_detect_algorithm>>>(this, messaging_verb::REPAIR_GET_DIFF_ALGORITHMS, std::move(id));
 }

-// Wrapper for NODE_OPS_CMD
-void messaging_service::register_node_ops_cmd(std::function<future<node_ops_cmd_response> (const rpc::client_info& cinfo, node_ops_cmd_request)>&& func) {
-    register_handler(this, messaging_verb::NODE_OPS_CMD, std::move(func));
-}
-future<> messaging_service::unregister_node_ops_cmd() {
-    return unregister_handler(messaging_verb::NODE_OPS_CMD);
-}
-future<node_ops_cmd_response> messaging_service::send_node_ops_cmd(msg_addr id, node_ops_cmd_request req) {
-    return send_message<future<node_ops_cmd_response>>(this, messaging_verb::NODE_OPS_CMD, std::move(id), std::move(req));
-}
-
 void
 messaging_service::register_paxos_prepare(std::function<future<foreign_ptr<std::unique_ptr<service::paxos::prepare_response>>>(
        const rpc::client_info&, rpc::opt_time_point, query::read_command cmd, partition_key key, utils::UUID ballot,
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -143,8 +143,7 @@ enum class messaging_verb : int32_t {
    HINT_MUTATION = 42,
    PAXOS_PRUNE = 43,
    GOSSIP_GET_ENDPOINT_STATES = 44,
-    NODE_OPS_CMD = 45,
-    LAST = 46,
+    LAST = 45,
 };

 } // namespace netw
@@ -395,11 +394,6 @@ public:
    future<> unregister_repair_get_diff_algorithms();
    future<std::vector<row_level_diff_detect_algorithm>> send_repair_get_diff_algorithms(msg_addr id);

-    // Wrapper for NODE_OPS_CMD
-    void register_node_ops_cmd(std::function<future<node_ops_cmd_response> (const rpc::client_info& cinfo, node_ops_cmd_request)>&& func);
-    future<> unregister_node_ops_cmd();
-    future<node_ops_cmd_response> send_node_ops_cmd(msg_addr id, node_ops_cmd_request);
-
    // Wrapper for GOSSIP_ECHO verb
    void register_gossip_echo(std::function<future<> ()>&& func);
    future<> unregister_gossip_echo();
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -54,14 +54,6 @@ logging::logger rlogger("repair");

 static sharded<netw::messaging_service>* _messaging;

-void node_ops_info::check_abort() {
-    if (abort) {
-        auto msg = format("Node operation with ops_uuid={} is aborted", ops_uuid);
-        rlogger.warn("{}", msg);
-        throw std::runtime_error(msg);
-    }
-}
-
 class node_ops_metrics {
 public:
    node_ops_metrics() {
@@ -443,16 +435,6 @@ void tracker::abort_all_repairs() {
    rlogger.info0("Aborted {} repair job(s)", count);
 }

-void tracker::abort_repair_node_ops(utils::UUID ops_uuid) {
-    for (auto& x : _repairs[this_shard_id()]) {
-        auto& ri = x.second;
-        if (ri->ops_uuid() && ri->ops_uuid().value() == ops_uuid) {
-            rlogger.info0("Aborted repair jobs for ops_uuid={}", ops_uuid);
-            ri->abort();
-        }
-    }
-}
-
 float tracker::report_progress(streaming::stream_reason reason) {
    uint64_t nr_ranges_finished = 0;
    uint64_t nr_ranges_total = 0;
@@ -811,8 +793,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    repair_uniq_id id_,
    const std::vector<sstring>& data_centers_,
    const std::vector<sstring>& hosts_,
-    streaming::stream_reason reason_,
-    std::optional<utils::UUID> ops_uuid)
+    streaming::stream_reason reason_)
    : db(db_)
    , messaging(ms_)
    , sharder(get_sharder_for_tables(db_, keyspace_, table_ids_))
@@ -826,8 +807,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    , hosts(hosts_)
    , reason(reason_)
    , nr_ranges_total(ranges.size())
-    , _row_level_repair(db.local().features().cluster_supports_row_level_repair())
-    , _ops_uuid(std::move(ops_uuid)) {
+    , _row_level_repair(db.local().features().cluster_supports_row_level_repair()) {
 }

 future<> repair_info::do_streaming() {
@@ -1646,7 +1626,7 @@ static int do_repair_start(seastar::sharded<database>& db, seastar::sharded<netw
                _node_ops_metrics.repair_total_ranges_sum += ranges.size();
                auto ri = make_lw_shared<repair_info>(db, ms,
                        std::move(keyspace), std::move(ranges), std::move(table_ids),
-                        id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair, id.uuid);
+                        id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair);
                return repair_ranges(ri);
            });
            repair_results.push_back(std::move(f));
@@ -1716,15 +1696,14 @@ static future<> sync_data_using_repair(seastar::sharded<database>& db,
        sstring keyspace,
        dht::token_range_vector ranges,
        std::unordered_map<dht::token_range, repair_neighbors> neighbors,
-        streaming::stream_reason reason,
-        std::optional<utils::UUID> ops_uuid) {
+        streaming::stream_reason reason) {
    if (ranges.empty()) {
        return make_ready_future<>();
    }
-    return smp::submit_to(0, [&db, &ms, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
+    return smp::submit_to(0, [&db, &ms, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
        repair_uniq_id id = repair_tracker().next_repair_command();
        rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
-        return repair_tracker().run(id, [id, &db, &ms, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
+        return repair_tracker().run(id, [id, &db, &ms, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
            auto cfs = list_column_families(db.local(), keyspace);
            if (cfs.empty()) {
                rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
@@ -1734,12 +1713,12 @@ static future<> sync_data_using_repair(seastar::sharded<database>& db,
            std::vector<future<>> repair_results;
            repair_results.reserve(smp::count);
            for (auto shard : boost::irange(unsigned(0), smp::count)) {
-                auto f = db.invoke_on(shard, [&db, &ms, keyspace, table_ids, id, ranges, neighbors, reason, ops_uuid] (database& localdb) mutable {
+                auto f = db.invoke_on(shard, [&db, &ms, keyspace, table_ids, id, ranges, neighbors, reason] (database& localdb) mutable {
                    auto data_centers = std::vector<sstring>();
                    auto hosts = std::vector<sstring>();
                    auto ri = make_lw_shared<repair_info>(db, ms,
                            std::move(keyspace), std::move(ranges), std::move(table_ids),
-                            id, std::move(data_centers), std::move(hosts), reason, ops_uuid);
+                            id, std::move(data_centers), std::move(hosts), reason);
                    ri->neighbors = std::move(neighbors);
                    return repair_ranges(ri);
                });
@@ -1933,16 +1912,16 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
                }
            }
            auto nr_ranges = desired_ranges.size();
-            sync_data_using_repair(db, ms, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, {}).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason).get();
            rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
        }
        rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
    });
 }

-static future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops) {
+static future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node) {
    using inet_address = gms::inet_address;
-    return seastar::async([&db, &ms, tmptr = std::move(tmptr), leaving_node = std::move(leaving_node), ops] () mutable {
+    return seastar::async([&db, &ms, tmptr = std::move(tmptr), leaving_node = std::move(leaving_node)] () mutable {
        auto myip = utils::fb_utilities::get_broadcast_address();
        auto keyspaces = db.local().get_non_system_keyspaces();
        bool is_removenode = myip != leaving_node;
@@ -2001,9 +1980,6 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
            auto local_dc = get_local_dc();
            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
            for (auto&r : ranges) {
-                if (ops) {
-                    ops->check_abort();
-                }
                auto end_token = r.end() ? r.end()->value() : dht::maximum_token();
                const std::vector<inet_address> new_eps = ks.get_replication_strategy().calculate_natural_endpoints(end_token, temp, utils::can_yield::yes);
                const std::vector<inet_address>& current_eps = current_replica_endpoints[r];
@@ -2085,12 +2061,6 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                }
                neighbors_set.erase(myip);
                neighbors_set.erase(leaving_node);
-                // Remove nodes in ignore_nodes
-                if (ops) {
-                    for (const auto& node : ops->ignore_nodes) {
-                        neighbors_set.erase(node);
-                    }
-                }
                auto neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors_set |
                    boost::adaptors::filtered([&local_dc, &snitch_ptr] (const gms::inet_address& node) {
                        return snitch_ptr->get_datacenter(node) == local_dc;
@@ -2102,10 +2072,9 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                    rlogger.debug("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}, skipped",
                        op, keyspace_name, r, current_eps, new_eps, neighbors);
                } else {
-                    std::vector<gms::inet_address> mandatory_neighbors = is_removenode ? neighbors : std::vector<gms::inet_address>{};
-                    rlogger.info("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}, mandatory_neighbor={}",
-                            op, keyspace_name, r, current_eps, new_eps, neighbors, mandatory_neighbors);
-                    range_sources[r] = repair_neighbors(std::move(neighbors), std::move(mandatory_neighbors));
+                    rlogger.debug("{}: keyspace={}, range={}, current_replica_endpoints={}, new_replica_endpoints={}, neighbors={}",
+                        op, keyspace_name, r, current_eps, new_eps, neighbors);
+                    range_sources[r] = repair_neighbors(std::move(neighbors));
                    if (is_removenode) {
                        ranges_for_removenode.push_back(r);
                    }
@@ -2125,8 +2094,7 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
                ranges.swap(ranges_for_removenode);
            }
            auto nr_ranges_synced = ranges.size();
-            std::optional<utils::UUID> opt_uuid = ops ? std::make_optional<utils::UUID>(ops->ops_uuid) : std::nullopt;
-            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason, opt_uuid).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
                op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
        }
@@ -2135,17 +2103,11 @@ static future<> do_decommission_removenode_with_repair(seastar::sharded<database
 }

 future<> decommission_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr) {
-    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), utils::fb_utilities::get_broadcast_address(), {});
+    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), utils::fb_utilities::get_broadcast_address());
 }

-future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops) {
-    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), std::move(leaving_node), std::move(ops));
-}
-
-future<> abort_repair_node_ops(utils::UUID ops_uuid) {
-    return smp::invoke_on_all([ops_uuid] {
-        return repair_tracker().abort_repair_node_ops(ops_uuid);
-    });
+future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node) {
+    return do_decommission_removenode_with_repair(db, ms, std::move(tmptr), std::move(leaving_node));
 }

 static future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, sstring op, sstring source_dc, streaming::stream_reason reason) {
@@ -2220,7 +2182,7 @@ static future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, s
                }).get();
            }
            auto nr_ranges = ranges.size();
-            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason, {}).get();
+            sync_data_using_repair(db, ms, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
        }
        rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
@@ -2258,19 +2220,12 @@ static future<> init_messaging_service_handler(sharded<database>& db, sharded<ne
                return checksum_range(db, keyspace, cf, range, hv);
            });
        });
-        ms.register_node_ops_cmd([] (const rpc::client_info& cinfo, node_ops_cmd_request req) {
-            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
-            auto coordinator = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
-            return smp::submit_to(src_cpu_id % smp::count, [coordinator, req = std::move(req)] () mutable {
-                return service::get_local_storage_service().node_ops_cmd_handler(coordinator, std::move(req));
-            });
-        });
    });
 }

 static future<> uninit_messaging_service_handler() {
    return _messaging->invoke_on_all([] (auto& ms) {
-        return when_all_succeed(ms.unregister_repair_checksum_range(), ms.unregister_node_ops_cmd()).discard_result();
+        return ms.unregister_repair_checksum_range();
    });
 }

--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -76,22 +76,13 @@ struct repair_uniq_id {
 };
 std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);

-struct node_ops_info {
-    utils::UUID ops_uuid;
-    bool abort = false;
-    std::list<gms::inet_address> ignore_nodes;
-    void check_abort();
-};
-
 // The tokens are the tokens assigned to the bootstrap node.
 future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, std::unordered_set<dht::token> bootstrap_tokens);
 future<> decommission_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr);
-future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node, shared_ptr<node_ops_info> ops);
+future<> removenode_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, gms::inet_address leaving_node);
 future<> rebuild_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, sstring source_dc);
 future<> replace_with_repair(seastar::sharded<database>& db, seastar::sharded<netw::messaging_service>& ms, locator::token_metadata_ptr tmptr, std::unordered_set<dht::token> replacing_tokens);

-future<> abort_repair_node_ops(utils::UUID ops_uuid);
-
 // NOTE: repair_start() can be run on any node, but starts a node-global
 // operation.
 // repair_start() starts the requested repair on this node. It returns an
@@ -253,7 +244,6 @@ public:
    bool _row_level_repair;
    uint64_t _sub_ranges_nr = 0;
    std::unordered_set<sstring> dropped_tables;
-    std::optional<utils::UUID> _ops_uuid;
 public:
    repair_info(seastar::sharded<database>& db_,
            seastar::sharded<netw::messaging_service>& ms_,
@@ -263,8 +253,7 @@ public:
            repair_uniq_id id_,
            const std::vector<sstring>& data_centers_,
            const std::vector<sstring>& hosts_,
-            streaming::stream_reason reason_,
-            std::optional<utils::UUID> ops_uuid);
+            streaming::stream_reason reason_);
    future<> do_streaming();
    void check_failed_ranges();
    future<> request_transfer_ranges(const sstring& cf,
@@ -283,9 +272,6 @@ public:
    const std::vector<sstring>& table_names() {
        return cfs;
    }
-    const std::optional<utils::UUID>& ops_uuid() const {
-        return _ops_uuid;
-    };
 };

 // The repair_tracker tracks ongoing repair operations and their progress.
@@ -338,7 +324,6 @@ public:
    future<> run(repair_uniq_id id, std::function<void ()> func);
    future<repair_status> repair_await_completion(int id, std::chrono::steady_clock::time_point timeout);
    float report_progress(streaming::stream_reason reason);
-    void abort_repair_node_ops(utils::UUID ops_uuid);
 };

 future<uint64_t> estimate_partitions(seastar::sharded<database>& db, const sstring& keyspace,
@@ -479,27 +464,6 @@ enum class row_level_diff_detect_algorithm : uint8_t {

 std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo);

-enum class node_ops_cmd : uint32_t {
-     removenode_prepare,
-     removenode_heartbeat,
-     removenode_sync_data,
-     removenode_abort,
-     removenode_done,
-};
-
-// The cmd and ops_uuid are mandatory for each request.
-// The ignore_nodes and leaving_node are optional.
-struct node_ops_cmd_request {
-    node_ops_cmd cmd;
-    utils::UUID ops_uuid;
-    std::list<gms::inet_address> ignore_nodes;
-    std::list<gms::inet_address> leaving_nodes;
-};
-
-struct node_ops_cmd_response {
-    bool ok;
-};
-
 namespace std {
 template<>
 struct hash<partition_checksum> {
--- a/2
+++ b/2
--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -208,8 +208,9 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc

            if (cdc_topology_description_forbidden_permissions.contains(cmd.permission)) {
                if (ks == db::system_distributed_keyspace::NAME
-                        && (resource_view.table() == db::system_distributed_keyspace::CDC_DESC
-                        || resource_view.table() == db::system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION)) {
+                        && (resource_view.table() == db::system_distributed_keyspace::CDC_DESC_V2
+                        || resource_view.table() == db::system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION
+                        || resource_view.table() == db::system_distributed_keyspace::CDC_TIMESTAMPS)) {
                    throw exceptions::unauthorized_exception(
                            format("Cannot {} {}", auth::permissions::to_string(cmd.permission), cmd.resource));
                }
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -4931,10 +4931,12 @@ void storage_proxy::init_messaging_service() {
            tracing::trace(trace_state_ptr, "read_data: message received from /{}", src_addr.addr);
        }
        auto da = oda.value_or(query::digest_algorithm::MD5);
+        auto sp = get_local_shared_storage_proxy();
        if (!cmd.max_result_size) {
-            cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
+            auto& cfg = sp->local_db().get_config();
+            cmd.max_result_size.emplace(cfg.max_memory_for_unlimited_query_soft_limit(), cfg.max_memory_for_unlimited_query_hard_limit());
        }
-        return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
+        return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
            p->get_stats().replica_data_reads++;
            auto src_ip = src_addr.addr;
            return get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) {
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -107,7 +107,6 @@ storage_service::storage_service(abort_source& abort_source, distributed<databas
        , _service_memory_total(config.available_memory / 10)
        , _service_memory_limiter(_service_memory_total)
        , _for_testing(for_testing)
-        , _node_ops_abort_thread(node_ops_abort_thread())
        , _shared_token_metadata(stm)
        , _sys_dist_ks(sys_dist_ks)
        , _view_update_generator(view_update_generator)
@@ -549,6 +548,10 @@ void storage_service::join_token_ring(int delay) {
    if (!db::system_keyspace::bootstrap_complete()) {
        // If we're not bootstrapping nor replacing, then we shouldn't have chosen a CDC streams timestamp yet.
        assert(should_bootstrap() || db().local().is_replacing() || !_cdc_streams_ts);
+
+        // Don't try rewriting CDC stream description tables.
+        // See cdc.md design notes, `Streams description table V1 and rewriting` section, for explanation.
+        db::system_keyspace::cdc_set_rewritten(std::nullopt).get();
    }

    if (!_cdc_streams_ts) {
@@ -605,6 +608,14 @@ void storage_service::join_token_ring(int delay) {

    // Retrieve the latest CDC generation seen in gossip (if any).
    scan_cdc_generations();
+
+    // Ensure that the new CDC stream description table has all required streams.
+    // See the function's comment for details.
+    cdc::maybe_rewrite_streams_descriptions(
+            _db.local(), _sys_dist_ks.local_shared(),
+            [tm = get_token_metadata_ptr()] { return tm->count_normal_token_owners(); },
+            _abort_source).get();
+
 }

 void storage_service::mark_existing_views_as_built() {
@@ -741,7 +752,8 @@ void storage_service::handle_cdc_generation(std::optional<db_clock::time_point>
        return;
    }

-    if (!db::system_keyspace::bootstrap_complete() || !_sys_dist_ks.local_is_initialized()) {
+    if (!db::system_keyspace::bootstrap_complete() || !_sys_dist_ks.local_is_initialized()
+            || !_sys_dist_ks.local().started()) {
        // We still haven't finished the startup process.
        // We will handle this generation in `scan_cdc_generations` (unless there's a newer one).
        return;
@@ -1750,12 +1762,9 @@ future<> storage_service::gossip_sharder() {

 future<> storage_service::stop() {
    // make sure nobody uses the semaphore
-    node_ops_singal_abort(std::nullopt);
    return _service_memory_limiter.wait(_service_memory_total).finally([this] {
        _listeners.clear();
        return _schema_version_publisher.join();
-    }).finally([this] {
-        return std::move(_node_ops_abort_thread);
    });
 }

@@ -2177,192 +2186,102 @@ future<> storage_service::decommission() {
    });
 }

-future<> storage_service::removenode(sstring host_id_string, std::list<gms::inet_address> ignore_nodes) {
-    return run_with_api_lock(sstring("removenode"), [host_id_string, ignore_nodes = std::move(ignore_nodes)] (storage_service& ss) mutable {
-        return seastar::async([&ss, host_id_string, ignore_nodes = std::move(ignore_nodes)] {
-            auto uuid = utils::make_random_uuid();
-            auto tmptr = ss.get_token_metadata_ptr();
+future<> storage_service::removenode(sstring host_id_string) {
+    return run_with_api_lock(sstring("removenode"), [host_id_string] (storage_service& ss) mutable {
+        return seastar::async([&ss, host_id_string] {
+            slogger.debug("removenode: host_id = {}", host_id_string);
+            auto my_address = ss.get_broadcast_address();
+            auto tmlock = std::make_unique<token_metadata_lock>(ss.get_token_metadata_lock().get0());
+            auto tmptr = ss.get_mutable_token_metadata_ptr().get0();
+            auto local_host_id = tmptr->get_host_id(my_address);
            auto host_id = utils::UUID(host_id_string);
            auto endpoint_opt = tmptr->get_endpoint_for_host_id(host_id);
            if (!endpoint_opt) {
-                throw std::runtime_error(format("removenode[{}]: Host ID not found in the cluster", uuid));
+                throw std::runtime_error("Host ID not found.");
            }
            auto endpoint = *endpoint_opt;
+
            auto tokens = tmptr->get_tokens(endpoint);
-            auto leaving_nodes = std::list<gms::inet_address>{endpoint};

-            future<> heartbeat_updater = make_ready_future<>();
-            auto heartbeat_updater_done = make_lw_shared<bool>(false);
+            slogger.debug("removenode: endpoint = {}", endpoint);

-            // Step 1: Decide who needs to sync data
-            //
-            // By default, we require all nodes in the cluster to participate
-            // the removenode operation and sync data if needed. We fail the
-            // removenode operation if any of them is down or fails.
-            //
-            // If the user want the removenode opeartion to succeed even if some of the nodes
-            // are not available, the user has to explicitly pass a list of
-            // node that can be skipped for the operation.
-            std::vector<gms::inet_address> nodes;
-            for (const auto& x : tmptr->get_endpoint_to_host_id_map_for_reading()) {
-                seastar::thread::maybe_yield();
-                if (x.first != endpoint && std::find(ignore_nodes.begin(), ignore_nodes.end(), x.first) == ignore_nodes.end()) {
-                    nodes.push_back(x.first);
+            if (endpoint == my_address) {
+                throw std::runtime_error("Cannot remove self");
+            }
+
+            if (ss._gossiper.get_live_members().contains(endpoint)) {
+                throw std::runtime_error(format("Node {} is alive and owns this ID. Use decommission command to remove it from the ring", endpoint));
+            }
+
+            // A leaving endpoint that is dead is already being removed.
+            if (tmptr->is_leaving(endpoint)) {
+                slogger.warn("Node {} is already being removed, continuing removal anyway", endpoint);
+            }
+
+            if (!ss._replicating_nodes.empty()) {
+                throw std::runtime_error("This node is already processing a removal. Wait for it to complete, or use 'removenode force' if this has failed.");
+            }
+
+            auto non_system_keyspaces = ss.db().local().get_non_system_keyspaces();
+            // Find the endpoints that are going to become responsible for data
+            for (const auto& keyspace_name : non_system_keyspaces) {
+                auto& ks = ss.db().local().find_keyspace(keyspace_name);
+                // if the replication factor is 1 the data is lost so we shouldn't wait for confirmation
+                if (ks.get_replication_strategy().get_replication_factor() == 1) {
+                    slogger.warn("keyspace={} has replication factor 1, the data is probably lost", keyspace_name);
+                    continue;
+                }
+
+                // get all ranges that change ownership (that is, a node needs
+                // to take responsibility for new range)
+                std::unordered_multimap<dht::token_range, inet_address> changed_ranges =
+                    ss.get_changed_ranges_for_leaving(keyspace_name, endpoint);
+                for (auto& x: changed_ranges) {
+                    auto ep = x.second;
+                    if (ss._gossiper.is_alive(ep)) {
+                        ss._replicating_nodes.emplace(ep);
+                    } else {
+                        slogger.warn("Endpoint {} is down and will not receive data for re-replication of {}", ep, endpoint);
+                    }
                }
            }
-            slogger.info("removenode[{}]: Started removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
+            slogger.info("removenode: endpoint = {}, replicating_nodes = {}", endpoint, ss._replicating_nodes);
+            ss._removing_node = endpoint;
+            tmptr->add_leaving_endpoint(endpoint);
+            ss.update_pending_ranges(tmptr, format("removenode {}", endpoint)).get();
+            ss.replicate_to_all_cores(std::move(tmptr)).get();
+            tmlock.reset();

-            // Step 2: Prepare to sync data
-            std::unordered_set<gms::inet_address> nodes_unknown_verb;
-            std::unordered_set<gms::inet_address> nodes_down;
-            auto req = node_ops_cmd_request{node_ops_cmd::removenode_prepare, uuid, ignore_nodes, leaving_nodes};
-            try {
-                parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {
-                    return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
-                        slogger.debug("removenode[{}]: Got prepare response from node={}", uuid, node);
-                    }).handle_exception_type([&nodes_unknown_verb, node, uuid] (seastar::rpc::unknown_verb_error&) {
-                        slogger.warn("removenode[{}]: Node {} does not support removenode verb", uuid, node);
-                        nodes_unknown_verb.emplace(node);
-                    }).handle_exception_type([&nodes_down, node, uuid] (seastar::rpc::closed_error&) {
-                        slogger.warn("removenode[{}]: Node {} is down for node_ops_cmd verb", uuid, node);
-                        nodes_down.emplace(node);
-                    });
-                }).get();
-                if (!nodes_unknown_verb.empty()) {
-                    auto msg = format("removenode[{}]: Nodes={} do not support removenode verb. Please upgrade your cluster and run removenode again.", uuid, nodes_unknown_verb);
-                    slogger.warn("{}", msg);
-                    throw std::runtime_error(msg);
-                }
-                if (!nodes_down.empty()) {
-                    auto msg = format("removenode[{}]: Nodes={} needed for removenode operation are down. It is highly recommended to fix the down nodes and try again. To proceed with best-effort mode which might cause data inconsistency, run nodetool removenode --ignore-dead-nodes <list_of_dead_nodes> <host_id>. E.g., nodetool removenode --ignore-dead-nodes 127.0.0.1,127.0.0.2 817e9515-316f-4fe3-aaab-b00d6f12dddd", uuid, nodes_down);
-                    slogger.warn("{}", msg);
-                    throw std::runtime_error(msg);
-                }
+            // the gossiper will handle spoofing this node's state to REMOVING_TOKEN for us
+            // we add our own host ID so other nodes can let us know when they're done
+            ss._gossiper.advertise_removing(endpoint, host_id, local_host_id).get();

-                // Step 3: Start heartbeat updater
-                heartbeat_updater = seastar::async([&ss, &nodes, uuid, heartbeat_updater_done] {
-                    slogger.debug("removenode[{}]: Started heartbeat_updater", uuid);
-                    while (!(*heartbeat_updater_done)) {
-                        auto req = node_ops_cmd_request{node_ops_cmd::removenode_heartbeat, uuid, {}, {}};
-                        parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
-                            return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
-                                slogger.debug("removenode[{}]: Got heartbeat response from node={}", uuid, node);
-                                return make_ready_future<>();
-                            });
-                        }).handle_exception([uuid] (std::exception_ptr ep) {
-                            slogger.warn("removenode[{}]: Failed to send heartbeat", uuid);
-                        }).get();
-                        int nr_seconds = 10;
-                        while (!(*heartbeat_updater_done) && nr_seconds--) {
-                            sleep_abortable(std::chrono::seconds(1), ss._abort_source).get();
-                        }
-                    }
-                    slogger.debug("removenode[{}]: Stopped heartbeat_updater", uuid);
-                });
-                auto stop_heartbeat_updater = defer([&] {
-                    *heartbeat_updater_done = true;
-                    heartbeat_updater.get();
-                });
+            // kick off streaming commands
+            // No need to wait for restore_replica_count to complete, since
+            // when it completes, the node will be removed from _replicating_nodes,
+            // and we wait for _replicating_nodes to become empty below
+            //FIXME: discarded future.
+            (void)ss.restore_replica_count(endpoint, my_address).handle_exception([endpoint, my_address] (auto ep) {
+                slogger.info("Failed to restore_replica_count for node {} on node {}", endpoint, my_address);
+            });

-                // Step 4: Start to sync data
-                req.cmd = node_ops_cmd::removenode_sync_data;
-                parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
-                    return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
-                        slogger.debug("removenode[{}]: Got sync_data response from node={}", uuid, node);
-                        return make_ready_future<>();
-                    });
-                }).get();
-
-
-                // Step 5: Announce the node has left
-                std::unordered_set<token> tmp(tokens.begin(), tokens.end());
-                ss.excise(std::move(tmp), endpoint);
-                ss._gossiper.advertise_token_removed(endpoint, host_id).get();
-
-                // Step 6: Finish
-                req.cmd = node_ops_cmd::removenode_done;
-                parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
-                    return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
-                        slogger.debug("removenode[{}]: Got done response from node={}", uuid, node);
-                        return make_ready_future<>();
-                    });
-                }).get();
-                slogger.info("removenode[{}]: Finished removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
-            } catch (...) {
-                // we need to revert the effect of prepare verb the removenode ops is failed
-                req.cmd = node_ops_cmd::removenode_abort;
-                parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {
-                    if (nodes_unknown_verb.contains(node) || nodes_down.contains(node)) {
-                        // No need to revert previous prepare cmd for those who do not apply prepare cmd.
-                        return make_ready_future<>();
-                    }
-                    return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
-                        slogger.debug("removenode[{}]: Got abort response from node={}", uuid, node);
-                    });
-                }).get();
-                slogger.info("removenode[{}]: Aborted removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
-                throw;
+            // wait for ReplicationFinishedVerbHandler to signal we're done
+            while (!(ss._replicating_nodes.empty() || ss._force_remove_completion)) {
+                sleep_abortable(std::chrono::milliseconds(100), ss._abort_source).get();
            }
-        });
-    });
-}

-future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_address coordinator, node_ops_cmd_request req) {
-    return get_storage_service().invoke_on(0, [coordinator, req = std::move(req)] (auto& ss) mutable {
-        return seastar::async([&ss, coordinator, req = std::move(req)] () mutable {
-            auto ops_uuid = req.ops_uuid;
-            slogger.debug("node_ops_cmd_handler cmd={}, ops_uuid={}", uint32_t(req.cmd), ops_uuid);
-            if (req.cmd == node_ops_cmd::removenode_prepare) {
-                if (req.leaving_nodes.size() > 1) {
-                    auto msg = format("removenode[{}]: Could not removenode more than one node at a time: leaving_nodes={}", req.ops_uuid, req.leaving_nodes);
-                    slogger.warn("{}", msg);
-                    throw std::runtime_error(msg);
-                }
-                ss.mutate_token_metadata([coordinator, &req, &ss] (mutable_token_metadata_ptr tmptr) mutable {
-                    for (auto& node : req.leaving_nodes) {
-                        slogger.info("removenode[{}]: Added node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
-                        tmptr->add_leaving_endpoint(node);
-                    }
-                    return ss.update_pending_ranges(tmptr, format("removenode {}", req.leaving_nodes));
-                }).get();
-                auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-                auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [&ss, coordinator, req = std::move(req)] () mutable {
-                    return ss.mutate_token_metadata([&ss, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
-                        for (auto& node : req.leaving_nodes) {
-                            slogger.info("removenode[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
-                            tmptr->del_leaving_endpoint(node);
-                        }
-                        return ss.update_pending_ranges(tmptr, format("removenode {}", req.leaving_nodes));
-                    });
-                },
-                [&ss, ops_uuid] () mutable { ss.node_ops_singal_abort(ops_uuid); });
-                ss._node_ops.emplace(ops_uuid, std::move(meta));
-            } else if (req.cmd == node_ops_cmd::removenode_heartbeat) {
-                slogger.debug("removenode[{}]: Updated heartbeat from coordinator={}", req.ops_uuid,  coordinator);
-                ss.node_ops_update_heartbeat(ops_uuid);
-            } else if (req.cmd == node_ops_cmd::removenode_done) {
-                slogger.info("removenode[{}]: Marked ops done from coordinator={}", req.ops_uuid, coordinator);
-                ss.node_ops_done(ops_uuid);
-            } else if (req.cmd == node_ops_cmd::removenode_sync_data) {
-                auto it = ss._node_ops.find(ops_uuid);
-                if (it == ss._node_ops.end()) {
-                    throw std::runtime_error(format("removenode[{}]: Can not find ops_uuid={}", ops_uuid, ops_uuid));
-                }
-                auto ops = it->second.get_ops_info();
-                for (auto& node : req.leaving_nodes) {
-                    slogger.info("removenode[{}]: Started to sync data for removing node={}, coordinator={}", req.ops_uuid, node, coordinator);
-                    removenode_with_repair(ss._db, ss._messaging, ss.get_token_metadata_ptr(), node, ops).get();
-                }
-            } else if (req.cmd == node_ops_cmd::removenode_abort) {
-                ss.node_ops_abort(ops_uuid);
-            } else {
-                auto msg = format("node_ops_cmd_handler: ops_uuid={}, unknown cmd={}", req.ops_uuid, uint32_t(req.cmd));
-                slogger.warn("{}", msg);
-                throw std::runtime_error(msg);
+            if (ss._force_remove_completion) {
+                throw std::runtime_error("nodetool removenode force is called by user");
            }
-            node_ops_cmd_response resp;
-            resp.ok = true;
-            return resp;
+
+            std::unordered_set<token> tmp(tokens.begin(), tokens.end());
+            ss.excise(std::move(tmp), endpoint);
+
+            // gossiper will indicate the token has left
+            ss._gossiper.advertise_token_removed(endpoint, host_id).get();
+
+            ss._replicating_nodes.clear();
+            ss._removing_node = std::nullopt;
        });
    });
 }
@@ -2606,9 +2525,7 @@ void storage_service::unbootstrap() {

 future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
    if (is_repair_based_node_ops_enabled()) {
-        auto ops_uuid = utils::make_random_uuid();
-        auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::list<gms::inet_address>()});
-        return removenode_with_repair(_db, _messaging, get_token_metadata_ptr(), endpoint, ops).finally([this, notify_endpoint] () {
+        return removenode_with_repair(_db, _messaging, get_token_metadata_ptr(), endpoint).finally([this, notify_endpoint] () {
            return send_replication_notification(notify_endpoint);
        });
    }
@@ -3323,111 +3240,5 @@ bool storage_service::is_repair_based_node_ops_enabled() {
    return _db.local().get_config().enable_repair_based_node_ops();
 }

-node_ops_meta_data::node_ops_meta_data(
-        utils::UUID ops_uuid,
-        gms::inet_address coordinator,
-        shared_ptr<node_ops_info> ops,
-        std::function<future<> ()> abort_func,
-        std::function<void ()> signal_func)
-    : _ops_uuid(std::move(ops_uuid))
-    , _coordinator(std::move(coordinator))
-    , _abort(std::move(abort_func))
-    , _signal(std::move(signal_func))
-    , _ops(std::move(ops))
-    , _watchdog([sig = _signal] { sig(); }) {
-    _watchdog.arm(_watchdog_interval);
-}
-
-future<> node_ops_meta_data::abort() {
-    slogger.debug("node_ops_meta_data: ops_uuid={} abort", _ops_uuid);
-    _aborted = true;
-    if (_ops) {
-        _ops->abort = true;
-    }
-    _watchdog.cancel();
-    return _abort();
-}
-
-void node_ops_meta_data::update_watchdog() {
-    slogger.debug("node_ops_meta_data: ops_uuid={} update_watchdog", _ops_uuid);
-    if (_aborted) {
-        return;
-    }
-    _watchdog.cancel();
-    _watchdog.arm(_watchdog_interval);
-}
-
-void node_ops_meta_data::cancel_watchdog() {
-    slogger.debug("node_ops_meta_data: ops_uuid={} cancel_watchdog", _ops_uuid);
-    _watchdog.cancel();
-}
-
-shared_ptr<node_ops_info> node_ops_meta_data::get_ops_info() {
-    return _ops;
-}
-
-void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
-    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
-    auto it = _node_ops.find(ops_uuid);
-    if (it != _node_ops.end()) {
-        node_ops_meta_data& meta = it->second;
-        meta.update_watchdog();
-    }
-}
-
-void storage_service::node_ops_done(utils::UUID ops_uuid) {
-    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
-    auto it = _node_ops.find(ops_uuid);
-    if (it != _node_ops.end()) {
-        node_ops_meta_data& meta = it->second;
-        meta.cancel_watchdog();
-        _node_ops.erase(it);
-    }
-}
-
-void storage_service::node_ops_abort(utils::UUID ops_uuid) {
-    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
-    auto it = _node_ops.find(ops_uuid);
-    if (it != _node_ops.end()) {
-        node_ops_meta_data& meta = it->second;
-        meta.abort().get();
-        abort_repair_node_ops(ops_uuid).get();
-        _node_ops.erase(it);
-    }
-}
-
-void storage_service::node_ops_singal_abort(std::optional<utils::UUID> ops_uuid) {
-    slogger.debug("node_ops_singal_abort: ops_uuid={}", ops_uuid);
-    _node_ops_abort_queue.push_back(ops_uuid);
-    _node_ops_abort_cond.signal();
-}
-
-future<> storage_service::node_ops_abort_thread() {
-    return seastar::async([this] {
-        slogger.info("Started node_ops_abort_thread");
-        for (;;) {
-            _node_ops_abort_cond.wait([this] { return !_node_ops_abort_queue.empty(); }).get();
-            slogger.debug("Awoke node_ops_abort_thread: node_ops_abort_queue={}", _node_ops_abort_queue);
-            while (!_node_ops_abort_queue.empty()) {
-                auto uuid_opt = _node_ops_abort_queue.front();
-                _node_ops_abort_queue.pop_front();
-                if (!uuid_opt) {
-                    return;
-                }
-                try {
-                    storage_service::node_ops_abort(*uuid_opt);
-                } catch (...) {
-                    slogger.warn("Failed to abort node operation ops_uuid={}: {}", *uuid_opt, std::current_exception());
-                }
-            }
-        }
-        slogger.info("Stopped node_ops_abort_thread");
-    });
-}
-
-
 } // namespace service

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -63,12 +63,6 @@
 #include <seastar/core/rwlock.hh>
 #include "sstables/version.hh"
 #include "cdc/metadata.hh"
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/lowres_clock.hh>
-
-class node_ops_cmd_request;
-class node_ops_cmd_response;
-class node_ops_info;

 namespace cql_transport { class controller; }

@@ -109,28 +103,6 @@ struct storage_service_config {
    size_t available_memory;
 };

-class node_ops_meta_data {
-    utils::UUID _ops_uuid;
-    gms::inet_address _coordinator;
-    std::function<future<> ()> _abort;
-    std::function<void ()> _signal;
-    shared_ptr<node_ops_info> _ops;
-    seastar::timer<lowres_clock> _watchdog;
-    std::chrono::seconds _watchdog_interval{30};
-    bool _aborted = false;
-public:
-    explicit node_ops_meta_data(
-            utils::UUID ops_uuid,
-            gms::inet_address coordinator,
-            shared_ptr<node_ops_info> ops,
-            std::function<future<> ()> abort_func,
-            std::function<void ()> signal_func);
-    shared_ptr<node_ops_info> get_ops_info();
-    future<> abort();
-    void update_watchdog();
-    void cancel_watchdog();
-};
-
 /**
 * This abstraction contains the token/identifier of this node
 * on the identifier space. This token gets gossiped around.
@@ -186,17 +158,6 @@ private:
     * and would only slow down tests (by having them wait).
     */
    bool _for_testing;
-
-    std::unordered_map<utils::UUID, node_ops_meta_data> _node_ops;
-    std::list<std::optional<utils::UUID>> _node_ops_abort_queue;
-    seastar::condition_variable _node_ops_abort_cond;
-    named_semaphore _node_ops_abort_sem{1, named_semaphore_exception_factory{"node_ops_abort_sem"}};
-    future<> _node_ops_abort_thread;
-    void node_ops_update_heartbeat(utils::UUID ops_uuid);
-    void node_ops_done(utils::UUID ops_uuid);
-    void node_ops_abort(utils::UUID ops_uuid);
-    void node_ops_singal_abort(std::optional<utils::UUID> ops_uuid);
-    future<> node_ops_abort_thread();
 public:
    storage_service(abort_source& as, distributed<database>& db, gms::gossiper& gossiper, sharded<db::system_distributed_keyspace>&, sharded<db::view::view_update_generator>&, gms::feature_service& feature_service, storage_service_config config, sharded<service::migration_notifier>& mn, locator::shared_token_metadata& stm, sharded<netw::messaging_service>& ms, /* only for tests */ bool for_testing = false);

@@ -810,8 +771,7 @@ public:
     *
     * @param hostIdString token for the node
     */
-    future<> removenode(sstring host_id_string, std::list<gms::inet_address> ignore_nodes);
-    future<node_ops_cmd_response> node_ops_cmd_handler(gms::inet_address coordinator, node_ops_cmd_request req);
+    future<> removenode(sstring host_id_string);

    future<sstring> get_operation_mode();

--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -310,6 +310,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
        _generated_monitors.emplace_back(std::move(sst), _compaction_manager, _cf);
        return _generated_monitors.back();
    }
+
    compaction_read_monitor_generator(compaction_manager& cm, column_family& cf)
        : _compaction_manager(cm)
        , _cf(cf) {}
@@ -570,6 +571,7 @@ private:
            // Do not actually compact a sstable that is fully expired and can be safely
            // dropped without ressurrecting old data.
            if (tombstone_expiration_enabled() && fully_expired.contains(sst)) {
+                on_skipped_expired_sstable(sst);
                continue;
            }

@@ -676,6 +678,9 @@ private:

    virtual void on_end_of_compaction() {};

+    // Inform about every expired sstable that was skipped during setup phase
+    virtual void on_skipped_expired_sstable(shared_sstable sstable) {}
+
    // create a writer based on decorated key.
    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) = 0;
    // stop current writer
@@ -919,6 +924,12 @@ public:
        }
        replace_remaining_exhausted_sstables();
    }
+
+    virtual void on_skipped_expired_sstable(shared_sstable sstable) override {
+        // Manually register the expired sstable with the monitor, since it is not actually being compacted;
+        // this allows the expired sstable to be removed from the tracker once compaction completes.
+        _monitor_generator(std::move(sstable));
+    }
 private:
    void backlog_tracker_incrementally_adjust_charges(std::vector<shared_sstable> exhausted_sstables) {
        //
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -629,10 +629,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
    _tasks.push_back(task);

    auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
+    auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
    auto sstables_ptr = sstables.get();
    _stats.pending_tasks += sstables->size();

-    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr] () mutable {
+    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr, compacting] () mutable {

        // FIXME: lock cf here
        if (!can_proceed(task)) {
@@ -642,7 +643,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        auto sst = sstables_ptr->back();
        sstables_ptr->pop_back();

-        return repeat([this, task, options, sst = std::move(sst)] () mutable {
+        return repeat([this, task, options, sst = std::move(sst), compacting] () mutable {
            column_family& cf = *task->compacting_cf;
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
@@ -650,7 +651,6 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            auto descriptor = sstables::compaction_descriptor({ sst }, cf.get_sstable_set(), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

-            auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
            // Releases reference to cleaned sstable such that respective used disk space can be freed.
            descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting->release_compacting(exhausted_sstables);
@@ -664,7 +664,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
                return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
                    return cf.run_compaction(std::move(descriptor));
                });
-            }).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+            }).then_wrapped([this, task, compacting] (future<> f) mutable {
                task->compaction_running = false;
                _stats.active_tasks--;
                if (!can_proceed(task)) {
--- a/sstables/time_window_compaction_strategy.cc
+++ b/sstables/time_window_compaction_strategy.cc
@@ -162,7 +162,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
    for (auto& pair : all_buckets.first) {
        auto ssts = std::move(pair.second);
        if (ssts.size() > offstrategy_threshold) {
-            ssts.resize(std::min(multi_window.size(), max_sstables));
+            ssts.resize(std::min(ssts.size(), max_sstables));
            compaction_descriptor desc(std::move(ssts), std::optional<sstables::sstable_set>(), iop);
            desc.options = compaction_options::make_reshape();
            return desc;
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -380,7 +380,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
            try {
                db.find_column_family(ks, cf);
            } catch (no_such_column_family&) {
-                auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", ks, cf);
+                auto err = format("[Stream #{}] prepare requested ks={} cf={} does not exist", plan_id, ks, cf);
                sslog.warn(err.c_str());
                throw std::runtime_error(err);
            }
--- a/table.cc
+++ b/table.cc
@@ -1677,7 +1677,8 @@ write_memtable_to_sstable(flat_mutation_reader reader,
                          const io_priority_class& pc) {
    cfg.replay_position = mt.replay_position();
    cfg.monitor = &monitor;
-    return sst->write_components(std::move(reader), mt.partition_count(), mt.schema(), cfg, mt.get_encoding_stats(), pc);
+    schema_ptr s = reader.schema();
+    return sst->write_components(std::move(reader), mt.partition_count(), s, cfg, mt.get_encoding_stats(), pc);
 }

 future<>
--- a/test/alternator/test_condition_expression.py
+++ b/test/alternator/test_condition_expression.py
@@ -136,7 +136,7 @@ def test_update_condition_eq_different(test_table_s):
                        ConditionExpression='a = :val2',
                        ExpressionAttributeValues={':val1': val1, ':val2': val2})

-# Also check an actual case of same time, but inequality.
+# Also check an actual case of same type, but inequality.
 def test_update_condition_eq_unequal(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -146,6 +146,13 @@ def test_update_condition_eq_unequal(test_table_s):
            UpdateExpression='SET a = :val1',
            ConditionExpression='a = :oldval',
            ExpressionAttributeValues={':val1': 3, ':oldval': 2})
+    # If the attribute being compared doesn't exist, it's considered a failed
+    # condition, not an error:
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q = :oldval',
+            ExpressionAttributeValues={':val1': 3, ':oldval': 2})

 # Check that set equality is checked correctly. Unlike string equality (for
 # example), it cannot be done with just naive string comparison of the JSON
@@ -269,15 +276,44 @@ def test_update_condition_lt(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a < :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
-    # Trying to compare an unsupported type - e.g., in the following test
-    # a boolean, is unfortunately caught by boto3 and cannot be tested here...
-    #test_table_s.update_item(Key={'p': p},
-    #    AttributeUpdates={'d': {'Value': False, 'Action': 'PUT'}})
-    #with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
-    #    test_table_s.update_item(Key={'p': p},
-    #        UpdateExpression='SET z = :newval',
-    #        ConditionExpression='d < :oldval',
-    #        ExpressionAttributeValues={':newval': 2, ':oldval': True})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x < :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval < x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator "<="
@@ -341,6 +377,44 @@ def test_update_condition_le(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a <= :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x <= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval <= x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7

 # Test for ConditionExpression with operator ">"
@@ -404,6 +478,44 @@ def test_update_condition_gt(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a > :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x > :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval > x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator ">="
@@ -467,6 +579,44 @@ def test_update_condition_ge(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a >= :oldval',
            ExpressionAttributeValues={':newval': 2, ':oldval': '0'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= q',
+            ExpressionAttributeValues={':newval': 2, ':oldval': '17'})
+    # If a comparison parameter comes from a constant specified in the query,
+    # and it has a type not supported by the comparison (e.g., a list), it's
+    # not just a failed comparison - it is considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= a',
+            ExpressionAttributeValues={':newval': 2, ':oldval': [1,2]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='x >= :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression=':oldval >= x',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7

 # Test for ConditionExpression with ternary operator "BETWEEN" (checking
@@ -548,6 +698,60 @@ def test_update_condition_between(test_table_s):
            UpdateExpression='SET z = :newval',
            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
            ExpressionAttributeValues={':newval': 2, ':oldval1': '0', ':oldval2': '2'})
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='q BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': b'dog', ':oldval2': b'zebra'})
+    # If an operand comes from the query and has a type not supported by the
+    # comparison (e.g., a list), it's not just a failed condition - it is
+    # considered a ValidationException
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': [1,2], ':oldval2': [2,3]})
+    # However, when the wrong type comes from an item attribute, not the
+    # query, the comparison is simply false - not a ValidationException.
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': [1,2,3], 'Action': 'PUT'},
+                                                             'y': {'Value': [2,3,4], 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN x and y',
+            ExpressionAttributeValues={':newval': 2})
+    # If the two operands come from the query (":val" references) then if they
+    # have different types or the wrong order, this is a ValidationException.
+    # But if one or more of the operands come from the item, this only causes
+    # a false condition - not a ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': 2, ':oldval2': 1})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval1 AND :oldval2',
+            ExpressionAttributeValues={':newval': 2, ':oldval1': 2, ':oldval2': 'dog'})
+    test_table_s.update_item(Key={'p': p}, AttributeUpdates={'two': {'Value': 2, 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN two AND :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN :oldval AND two',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 3})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET z = :newval',
+            ConditionExpression='a BETWEEN two AND :oldval',
+            ExpressionAttributeValues={':newval': 2, ':oldval': 'dog'})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 9

 # Test for ConditionExpression with multi-operand operator "IN", checking
@@ -605,6 +809,13 @@ def test_update_condition_in(test_table_s):
            UpdateExpression='SET c = :val37',
            ConditionExpression='a IN ()',
            ExpressionAttributeValues=values)
+    # If the attribute being compared doesn't even exist, this is also
+    # considered as a false condition - not an error.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET c = :val37',
+            ConditionExpression='q IN ({})'.format(','.join(values.keys())),
+            ExpressionAttributeValues=values)

 # Beyond the above operators, there are also test functions supported -
 # attribute_exists, attribute_not_exists, attribute_type, begins_with,
--- a/test/alternator/test_expected.py
+++ b/test/alternator/test_expected.py
@@ -237,6 +237,30 @@ def test_update_expected_1_le(test_table_s):
                            'AttributeValueList': [2, 3]}}
        )

+# Comparison operators like le work only on numbers, strings or bytes.
+# As noted in issue #8043, if any other type is included in *the query*,
+# the result should be a ValidationException, but if the wrong type appears
+# in the item, not the query, the result is a failed condition.
+def test_update_expected_1_le_validation(test_table_s):
+    p = random_string()
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'},
+                          'b': {'Value': [1,2], 'Action': 'PUT'}})
+    # Bad type (a list) in the query. Result is ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'a': {'ComparisonOperator': 'LE',
+                            'AttributeValueList': [[1,2,3]]}}
+        )
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'b': {'ComparisonOperator': 'LE',
+                            'AttributeValueList': [3]}}
+        )
+    assert not 'z' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+
 # Tests for Expected with ComparisonOperator = "LT":
 def test_update_expected_1_lt(test_table_s):
    p = random_string()
@@ -894,6 +918,34 @@ def test_update_expected_1_between(test_table_s):
            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
            Expected={'d': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [set([1]), set([2])]}})

+# BETWEEN works only on numbers, strings or bytes. As noted in issue #8043,
+# if any other type is included in *the query*, the result should be a
+# ValidationException, but if the wrong type appears in the item, not the
+# query, the result is a failed condition.
+# BETWEEN should also generate ValidationException if the two ends of the
+# range are not of the same type or not in the correct order, but this
+# already is tested in the test above (test_update_expected_1_between).
+def test_update_expected_1_between_validation(test_table_s):
+    p = random_string()
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'},
+                          'b': {'Value': [1,2], 'Action': 'PUT'}})
+    # Bad type (a list) in the query. Result is ValidationException.
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'a': {'ComparisonOperator': 'BETWEEN',
+                            'AttributeValueList': [[1,2,3], [2,3,4]]}}
+        )
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 17, 'Action': 'PUT'}},
+            Expected={'b': {'ComparisonOperator': 'BETWEEN',
+                            'AttributeValueList': [1,2]}}
+        )
+    assert not 'z' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+
+
 ##############################################################################
 # Instead of ComparisonOperator and AttributeValueList, one can specify either
 # Value or Exists:
--- a/test/alternator/test_filter_expression.py
+++ b/test/alternator/test_filter_expression.py
@@ -235,6 +235,30 @@ def test_filter_expression_ge(test_table_sn_with_data):
        expected_items = [item for item in items if item[xn] >= xv]
        assert(got_items == expected_items)

+# Comparison operators such as >= or BETWEEN only work on numbers, strings or
+# bytes. When an expression's operand comes from the item and has a wrong type
+# (e.g., a list), the result is that the item is skipped - aborting the scan
+# with a ValidationException is a bug (this was issue #8043).
+def test_filter_expression_le_bad_type(test_table_sn_with_data):
+    table, p, items = test_table_sn_with_data
+    got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='l <= :xv',
+        ExpressionAttributeValues={':p': p, ':xv': 3})
+    assert got_items == []
+    got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression=':xv <= l',
+        ExpressionAttributeValues={':p': p, ':xv': 3})
+    assert got_items == []
+def test_filter_expression_between_bad_type(test_table_sn_with_data):
+    table, p, items = test_table_sn_with_data
+    got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='s between :xv and l',
+        ExpressionAttributeValues={':p': p, ':xv': 'cat'})
+    assert got_items == []
+    got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='s between l and :xv',
+        ExpressionAttributeValues={':p': p, ':xv': 'cat'})
+    assert got_items == []
+    got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='s between i and :xv',
+        ExpressionAttributeValues={':p': p, ':xv': 'cat'})
+    assert got_items == []
+
 # Test the "BETWEEN/AND" ternary operator on a numeric, string and bytes
 # attribute. These keywords are case-insensitive.
 def test_filter_expression_between(test_table_sn_with_data):
--- a/test/boost/cdc_test.cc
+++ b/test/boost/cdc_test.cc
@@ -252,29 +252,46 @@ SEASTAR_THREAD_TEST_CASE(test_disallow_cdc_on_materialized_view) {

 SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
    do_with_cql_env_thread([] (cql_test_env& e) {
-        auto test_table = [&e] (const sstring& table_name) {
-            auto assert_unauthorized = [&e] (const sstring& stmt) {
-                testlog.info("Must throw unauthorized_exception: {}", stmt);
-                BOOST_REQUIRE_THROW(e.execute_cql(stmt).get(), exceptions::unauthorized_exception);
-            };
-
-            e.require_table_exists("system_distributed", table_name).get();
-
-            const sstring full_name = "system_distributed." + table_name;
-
-            // Allow MODIFY, SELECT
-            e.execute_cql(format("INSERT INTO {} (time) VALUES (toTimeStamp(now()))", full_name)).get();
-            e.execute_cql(format("UPDATE {} SET expired = toTimeStamp(now()) WHERE time = toTimeStamp(now())", full_name)).get();
-            e.execute_cql(format("DELETE FROM {} WHERE time = toTimeStamp(now())", full_name)).get();
-            e.execute_cql(format("SELECT * FROM {}", full_name)).get();
-
-            // Disallow ALTER, DROP
-            assert_unauthorized(format("ALTER TABLE {} ALTER time TYPE blob", full_name));
-            assert_unauthorized(format("DROP TABLE {}", full_name));
+        auto assert_unauthorized = [&e] (const sstring& stmt) {
+            testlog.info("Must throw unauthorized_exception: {}", stmt);
+            BOOST_REQUIRE_THROW(e.execute_cql(stmt).get(), exceptions::unauthorized_exception);
        };

-        test_table("cdc_streams_descriptions");
-        test_table("cdc_generation_descriptions");
+        auto full_name = [] (const sstring& table_name) {
+            return "system_distributed." + table_name;
+        };
+
+        const sstring generations = "cdc_generation_descriptions";
+        const sstring streams = "cdc_streams_descriptions_v2";
+        const sstring timestamps = "cdc_generation_timestamps";
+
+        for (auto& t : {generations, streams, timestamps}) {
+            e.require_table_exists("system_distributed", t).get();
+
+            // Disallow DROP
+            assert_unauthorized(format("DROP TABLE {}", full_name(t)));
+
+            // Allow SELECT
+            e.execute_cql(format("SELECT * FROM {}", full_name(t))).get();
+        }
+
+        // Disallow ALTER
+        for (auto& t : {generations, streams}) {
+            assert_unauthorized(format("ALTER TABLE {} ALTER time TYPE blob", full_name(t)));
+        }
+        assert_unauthorized(format("ALTER TABLE {} ALTER key TYPE blob", full_name(timestamps)));
+
+        // Allow DELETE
+        for (auto& t : {generations, streams}) {
+            e.execute_cql(format("DELETE FROM {} WHERE time = toTimeStamp(now())", full_name(t))).get();
+        }
+        e.execute_cql(format("DELETE FROM {} WHERE key = 'timestamps'", full_name(timestamps))).get();
+
+        // Allow UPDATE, INSERT
+        e.execute_cql(format("UPDATE {} SET expired = toTimeStamp(now()) WHERE time = toTimeStamp(now())", full_name(generations))).get();
+        e.execute_cql(format("INSERT INTO {} (time) VALUES (toTimeStamp(now()))", full_name(generations))).get();
+        e.execute_cql(format("INSERT INTO {} (time, range_end) VALUES (toTimeStamp(now()), 0)", full_name(streams))).get();
+        e.execute_cql(format("UPDATE {} SET expired = toTimeStamp(now()) WHERE key = 'timestamps' AND time = toTimeStamp(now())", full_name(timestamps))).get();
    }).get();
 }

--- a/test/boost/query_processor_test.cc
+++ b/test/boost/query_processor_test.cc
@@ -134,7 +134,7 @@ SEASTAR_TEST_CASE(test_querying_with_consumer) {
        auto& db = e.local_db();
        auto s = db.find_schema("ks", "cf");

-        e.local_qp().query("SELECT * from ks.cf", [&counter] (const cql3::untyped_result_set::row& row) mutable {
+        e.local_qp().query_internal("SELECT * from ks.cf", [&counter] (const cql3::untyped_result_set::row& row) mutable {
            counter++;
            return make_ready_future<stop_iteration>(stop_iteration::no);
        }).get();
@@ -145,7 +145,7 @@ SEASTAR_TEST_CASE(test_querying_with_consumer) {
            total += i;
            e.local_qp().execute_internal("insert into ks.cf (k , v) values (?, ? );", { to_sstring(i), i}).get();
        }
-        e.local_qp().query("SELECT * from ks.cf", [&counter, &sum] (const cql3::untyped_result_set::row& row) mutable {
+        e.local_qp().query_internal("SELECT * from ks.cf", [&counter, &sum] (const cql3::untyped_result_set::row& row) mutable {
            counter++;
            sum += row.get_as<int>("v");
            return make_ready_future<stop_iteration>(stop_iteration::no);
@@ -158,7 +158,7 @@ SEASTAR_TEST_CASE(test_querying_with_consumer) {
            total += i;
            e.local_qp().execute_internal("insert into ks.cf (k , v) values (?, ? );", { to_sstring(i), i}).get();
        }
-        e.local_qp().query("SELECT * from ks.cf", [&counter, &sum] (const cql3::untyped_result_set::row& row) mutable {
+        e.local_qp().query_internal("SELECT * from ks.cf", [&counter, &sum] (const cql3::untyped_result_set::row& row) mutable {
            counter++;
            sum += row.get_as<int>("v");
            return make_ready_future<stop_iteration>(stop_iteration::no);
@@ -166,7 +166,7 @@ SEASTAR_TEST_CASE(test_querying_with_consumer) {
        BOOST_CHECK_EQUAL(counter, 2200);
        BOOST_CHECK_EQUAL(total, sum);
        counter = 1000;
-        e.local_qp().query("SELECT * from ks.cf", [&counter] (const cql3::untyped_result_set::row& row) mutable {
+        e.local_qp().query_internal("SELECT * from ks.cf", [&counter] (const cql3::untyped_result_set::row& row) mutable {
            counter++;
            if (counter == 1010) {
                return make_ready_future<stop_iteration>(stop_iteration::yes);
--- a/test/cql-pytest/cassandra_tests/validation/entities/timeuuid_test.py
+++ b/test/cql-pytest/cassandra_tests/validation/entities/timeuuid_test.py
@@ -49,6 +49,16 @@ def testTimeuuid(cql, test_keyspace):
        for i in range(4):
            uuid = rows[i][1]
            datetime = datetime_from_uuid1(uuid)
+            # Before comparing this datetime to the result of dateOf(), we
+            # must truncate the resolution of datetime to milliseconds.
+            # The problem is that the dateOf(timeuuid) CQL function converts a
+            # timeuuid to CQL's "timestamp" type, which has millisecond
+            # resolution, but datetime *may* have finer resolution. It will
+            # usually be whole milliseconds, because this is what the now()
+            # implementation usually does, but when now() is called more than
+            # once per millisecond, it *may* start incrementing the sub-
+            # millisecond part.
+            datetime = datetime.replace(microsecond=datetime.microsecond//1000*1000)
            timestamp = round(datetime.replace(tzinfo=timezone.utc).timestamp() * 1000)
            assert_rows(execute(cql, table, "SELECT dateOf(t), unixTimestampOf(t) FROM %s WHERE k = 0 AND t = ?", rows[i][1]),
                       [datetime, timestamp])
--- a/test/cql-pytest/test_cdc.py
+++ b/test/cql-pytest/test_cdc.py
@@ -0,0 +1,46 @@
+# Copyright 2021 ScyllaDB
+#
+# This file is part of Scylla.
+#
+# Scylla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Scylla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+
+from cassandra.cluster import ConsistencyLevel
+from cassandra.query import SimpleStatement
+
+from util import new_test_table
+
+def test_cdc_log_entries_use_cdc_streams(cql, test_keyspace):
+    '''Test that the stream IDs chosen for CDC log entries come from the CDC generation
+    whose streams are listed in the streams description table. Since this test is executed
+    on a single-node cluster, there is only one generation.'''
+
+    schema = "pk int primary key"
+    extra = " with cdc = {'enabled': true}"
+    with new_test_table(cql, test_keyspace, schema, extra) as table:
+        stmt = cql.prepare(f"insert into {table} (pk) values (?) using timeout 5m")
+        for i in range(100):
+            cql.execute(stmt, [i])
+
+        log_stream_ids = set(r[0] for r in cql.execute(f'select "cdc$stream_id" from {table}_scylla_cdc_log'))
+
+    # There should be exactly one generation, so we just select the streams
+    streams_desc = cql.execute(SimpleStatement(
+            'select streams from system_distributed.cdc_streams_descriptions_v2',
+            consistency_level=ConsistencyLevel.ONE))
+    stream_ids = set()
+    for entry in streams_desc:
+        stream_ids.update(entry.streams)
+
+    assert(log_stream_ids.issubset(stream_ids))
+
--- a/test/cql-pytest/test_using_timeout.py
+++ b/test/cql-pytest/test_using_timeout.py
@@ -136,7 +136,7 @@ def test_mix_per_query_timeout_with_other_params(scylla_only, cql, table1):
    cql.execute(f"INSERT INTO {table} (p,c,v) VALUES ({key},1,1) USING TIMEOUT 60m AND TTL 1000000 AND TIMESTAMP 321")
    cql.execute(f"INSERT INTO {table} (p,c,v) VALUES ({key},2,1) USING TIMESTAMP 42 AND TIMEOUT 30m")
    res = list(cql.execute(f"SELECT ttl(v), writetime(v) FROM {table} WHERE p = {key} and c = 1"))
-    assert len(res) == 1 and res[0].ttl_v == 1000000 and res[0].writetime_v == 321
+    assert len(res) == 1 and res[0].ttl_v > 0 and res[0].writetime_v == 321
    res = list(cql.execute(f"SELECT ttl(v), writetime(v) FROM {table} WHERE p = {key} and c = 2"))
    assert len(res) == 1 and not res[0].ttl_v and res[0].writetime_v == 42

--- a/test/lib/cql_test_env.cc
+++ b/test/lib/cql_test_env.cc
@@ -568,7 +568,7 @@ public:
            db::system_keyspace::init_local_cache().get();
            auto stop_local_cache = defer([] { db::system_keyspace::deinit_local_cache().get(); });

-            sys_dist_ks.start(std::ref(qp), std::ref(mm)).get();
+            sys_dist_ks.start(std::ref(qp), std::ref(mm), std::ref(proxy)).get();

            service::get_local_storage_service().init_server(service::bind_messaging_port(false)).get();
            service::get_local_storage_service().join_cluster().get();
--- a/tools/jmx
+++ b/tools/jmx
--- a/tools/python3
+++ b/tools/python3