gossip: Add an option to force gossip generation

Consider 3 nodes in the cluster, n1, n2, n3, with gossip generation numbers g1, g2, g3. n1, n2, n3 are running a scylla version with commit 0a52ecb6df (gossip: Fix max generation drift measure). One year later, the user wants to upgrade n1, n2, n3 to a new version. When n3 does a rolling restart with the new version, n3 will use a generation number g3'. Because g3' - g2 > MAX_GENERATION_DIFFERENCE and g3' - g1 > MAX_GENERATION_DIFFERENCE, n1 and n2 will reject n3's gossip update and mark n3 as down. Such unnecessary marking of a node as down can cause availability issues. For example: DC1: n1, n2; DC2: n3, n4. When n3 and n4 restart, n1 and n2 will mark n3 and n4 as down, which causes the whole DC2 to be unavailable. To fix, we can start the node with a gossip generation within MAX_GENERATION_DIFFERENCE difference for the new node. Once all the nodes run the version with commit 0a52ecb6df, the option is no longer needed. Fixes #5164 (cherry picked from commit 743b529c2b) [tgrabiec: resolved major conflicts in config.hh]
gossiper: Always use the new generation number
2020-03-27 13:08:26 +01:00 · 2020-03-27 12:53:26 +01:00 · 2020-03-22 10:08:48 +01:00 · 2020-03-12 12:10:45 +02:00 · 2020-03-12 11:25:50 +02:00 · 2020-03-09 15:22:58 +02:00
44 changed files with 617 additions and 92 deletions
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=3.1.2
+VERSION=3.1.4

 if test -f version
 then
--- a/cql3/functions/time_uuid_fcts.hh
+++ b/cql3/functions/time_uuid_fcts.hh
@@ -61,6 +61,16 @@ make_now_fct() {
    });
 }

+inline int64_t get_valid_timestamp(const data_value& ts_obj) { // Returns the epoch tick count of the db_clock::time_point held by ts_obj; throws exceptions::server_exception when UUID_gen rejects the nanos value derived from it. Declared inline, like the other helpers in this header.
+    auto ts = value_cast<db_clock::time_point>(ts_obj);
+    int64_t ms = ts.time_since_epoch().count(); // NOTE(review): presumably milliseconds, as the error message below states - confirm against db_clock
+    auto nanos_since = utils::UUID_gen::make_nanos_since(ms);
+    if (!utils::UUID_gen::is_valid_nanos_since(nanos_since)) {
+        throw exceptions::server_exception(format("{}: timestamp is out of range. Must be in milliseconds since epoch", ms));
+    }
+    return ms;
+}
+
 inline
 shared_ptr<function>
 make_min_timeuuid_fct() {
@@ -74,8 +84,7 @@ make_min_timeuuid_fct() {
        if (ts_obj.is_null()) {
            return {};
        }
-        auto ts = value_cast<db_clock::time_point>(ts_obj);
-        auto uuid = utils::UUID_gen::min_time_UUID(ts.time_since_epoch().count());
+        auto uuid = utils::UUID_gen::min_time_UUID(get_valid_timestamp(ts_obj));
        return {timeuuid_type->decompose(uuid)};
    });
 }
@@ -85,7 +94,6 @@ shared_ptr<function>
 make_max_timeuuid_fct() {
    return make_native_scalar_function<true>("maxtimeuuid", timeuuid_type, { timestamp_type },
            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
-        // FIXME: should values be a vector<optional<bytes>>?
        auto& bb = values[0];
        if (!bb) {
            return {};
@@ -94,12 +102,22 @@ make_max_timeuuid_fct() {
        if (ts_obj.is_null()) {
            return {};
        }
-        auto ts = value_cast<db_clock::time_point>(ts_obj);
-        auto uuid = utils::UUID_gen::max_time_UUID(ts.time_since_epoch().count());
+        auto uuid = utils::UUID_gen::max_time_UUID(get_valid_timestamp(ts_obj));
        return {timeuuid_type->decompose(uuid)};
    });
 }

+inline utils::UUID get_valid_timeuuid(const bytes& raw) { // Decodes raw into a UUID; throws exceptions::server_exception unless is_valid_UUID accepts raw and the decoded UUID reports is_timestamp(). Takes a const reference so the caller's buffer is not copied just to be validated.
+    if (!utils::UUID_gen::is_valid_UUID(raw)) {
+        throw exceptions::server_exception(format("invalid timeuuid: size={}", raw.size()));
+    }
+    auto uuid = utils::UUID_gen::get_UUID(raw);
+    if (!uuid.is_timestamp()) { // decoded fine, but is_timestamp() is false: report the version found
+        throw exceptions::server_exception(format("{}: Not a timeuuid: version={}", uuid, uuid.version()));
+    }
+    return uuid;
+}
+
 inline
 shared_ptr<function>
 make_date_of_fct() {
@@ -110,7 +128,7 @@ make_date_of_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
        return {timestamp_type->decompose(ts)};
    });
 }
@@ -125,7 +143,7 @@ make_unix_timestamp_of_fct() {
        if (!bb) {
            return {};
        }
-        return {long_type->decompose(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb)))};
+        return {long_type->decompose(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb)))};
    });
 }

@@ -176,7 +194,7 @@ make_timeuuidtodate_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
        auto to_simple_date = get_castas_fctn(simple_date_type, timestamp_type);
        return {simple_date_type->decompose(to_simple_date(ts))};
    });
@@ -211,7 +229,7 @@ make_timeuuidtotimestamp_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
        return {timestamp_type->decompose(ts)};
    });
 }
@@ -245,10 +263,14 @@ make_timeuuidtounixtimestamp_fct() {
        if (!bb) {
            return {};
        }
-        return {long_type->decompose(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb)))};
+        return {long_type->decompose(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb)))};
    });
 }

+inline bytes time_point_to_long(const data_value& v) { // Validates v through get_valid_timestamp (which may throw) and returns the serialized form of the resulting int64_t.
+    return data_value(get_valid_timestamp(v)).serialize(); // the int64_t is wrapped in a data_value so it can be serialized
+}
+
 inline
 shared_ptr<function>
 make_timestamptounixtimestamp_fct() {
@@ -263,7 +285,7 @@ make_timestamptounixtimestamp_fct() {
        if (ts_obj.is_null()) {
            return {};
        }
-        return {long_type->decompose(ts_obj)};
+        return time_point_to_long(ts_obj);
    });
 }

@@ -282,7 +304,7 @@ make_datetounixtimestamp_fct() {
            return {};
        }
        auto from_simple_date = get_castas_fctn(timestamp_type, simple_date_type);
-        return {long_type->decompose(from_simple_date(simple_date_obj))};
+        return time_point_to_long(from_simple_date(simple_date_obj));
    });
 }

--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -380,28 +380,45 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
    if (need_filtering()) {
        auto& sim = db.find_column_family(_schema).get_index_manager();
        auto [opt_idx, _] = find_idx(sim);
-        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
-            return opt_idx && opt_idx->depends_on(*cdef);
+        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef, ::shared_ptr<single_column_restriction> restr) {
+            return opt_idx && restr && restr->is_supported_by(*opt_idx); // a null restr never counts as served by the index
        };
+        auto single_pk_restrs = dynamic_pointer_cast<single_column_partition_key_restrictions>(_partition_key_restrictions); // null when the cast does not apply
        if (_partition_key_restrictions->needs_filtering(*_schema)) {
            for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
-                if (!column_uses_indexing(cdef)) {
+                ::shared_ptr<single_column_restriction> restr;
+                if (single_pk_restrs) {
+                    auto it = single_pk_restrs->restrictions().find(cdef);
+                    if (it != single_pk_restrs->restrictions().end()) {
+                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
+                    }
+                }
+                if (!column_uses_indexing(cdef, restr)) {
                    column_defs_for_filtering.emplace_back(cdef);
                }
            }
        }
+        auto single_ck_restrs = dynamic_pointer_cast<single_column_clustering_key_restrictions>(_clustering_columns_restrictions); // null when the cast does not apply
        const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
        if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
            column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
-                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
+                ::shared_ptr<single_column_restriction> restr;
+                if (single_ck_restrs) { // must test the clustering-key cast: it is the pointer dereferenced below
+                    auto it = single_ck_restrs->restrictions().find(cdef);
+                    if (it != single_ck_restrs->restrictions().end()) {
+                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
+                    }
+                }
+                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef, restr)) {
                    column_defs_for_filtering.emplace_back(cdef);
                }
            }
        }
        for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
-            if (!column_uses_indexing(cdef)) {
+            auto restr = dynamic_pointer_cast<single_column_restriction>(_nonprimary_key_restrictions->get_restriction(*cdef));
+            if (!column_uses_indexing(cdef, restr)) {
                column_defs_for_filtering.emplace_back(cdef);
            }
        }
--- a/cql3/selection/abstract_function_selector.hh
+++ b/cql3/selection/abstract_function_selector.hh
@@ -92,6 +92,14 @@ public:
            : abstract_function_selector(fun, std::move(arg_selectors))
            , _tfun(dynamic_pointer_cast<T>(fun)) {
    }
+
+    const functions::function_name& name() const {
+        return _tfun->name();
+    }
+
+    virtual sstring assignment_testable_source_context() const override {
+        return format("{}", this->name());
+    }
 };

 }
--- a/cql3/selection/aggregate_function_selector.hh
+++ b/cql3/selection/aggregate_function_selector.hh
@@ -79,11 +79,6 @@ public:
                    dynamic_pointer_cast<functions::aggregate_function>(func), std::move(arg_selectors))
            , _aggregate(fun()->new_aggregate()) {
    }
-
-    virtual sstring assignment_testable_source_context() const override {
-        // FIXME:
-        return "FIXME";
-    }
 };

 }
--- a/cql3/selection/scalar_function_selector.hh
+++ b/cql3/selection/scalar_function_selector.hh
@@ -82,12 +82,6 @@ public:
            : abstract_function_selector_for<functions::scalar_function>(
                dynamic_pointer_cast<functions::scalar_function>(std::move(fun)), std::move(arg_selectors)) {
    }
-
-    virtual sstring assignment_testable_source_context() const override {
-        // FIXME:
-        return "FIXME";
-    }
-
 };

 }
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
 selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
    auto&& factory = _selected->new_selector_factory(db, s, defs);
    auto&& type = factory->new_instance()->get_type();
-    auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
+    auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
    if (!ut) {
        throw exceptions::invalid_request_exception(
                format("Invalid field selection: {} of type {} is not a user type",
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -166,7 +166,8 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
 user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
 {
    if (get_idx_of_field(to_update, _field_name)) {
-        throw exceptions::invalid_request_exception(format("Cannot add new field {} to type {}: a field of the same name already exists", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(format("Cannot add new field {} to type {}: a field of the same name already exists",
+            _field_name->to_string(), _name.to_string()));
    }

    std::vector<bytes> new_names(to_update->field_names());
@@ -174,7 +175,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
    std::vector<data_type> new_types(to_update->field_types());
    auto&& add_type = _field_type->prepare(db, keyspace()).get_type();
    if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
-        throw exceptions::invalid_request_exception(format("Cannot add new field {} of type {} to type {} as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
+        throw exceptions::invalid_request_exception(format("Cannot add new field {} of type {} to type {} as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
    }
    new_types.push_back(std::move(add_type));
    return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
@@ -184,13 +185,14 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
 {
    std::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
    if (!idx) {
-        throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->to_string(), _name.to_string()));
    }

    auto previous = to_update->field_types()[*idx];
    auto new_type = _field_type->prepare(db, keyspace()).get_type();
    if (!new_type->is_compatible_with(*previous)) {
-        throw exceptions::invalid_request_exception(format("Type {} in incompatible with previous type {} of field {} in user type {}", _field_type->to_string(), previous->as_cql3_type().to_string(), _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(format("Type {} in incompatible with previous type {} of field {} in user type {}",
+            _field_type->to_string(), previous->as_cql3_type().to_string(), _field_name->to_string(), _name.to_string()));
    }

    std::vector<data_type> new_types(to_update->field_types());
--- a/cql3/tuples.cc
+++ b/cql3/tuples.cc
@@ -32,7 +32,7 @@ tuples::component_spec_of(shared_ptr<column_specification> column, size_t compon
            column->ks_name,
            column->cf_name,
            ::make_shared<column_identifier>(format("{}[{:d}]", column->name, component), true),
-            static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
+            static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
 }

 shared_ptr<term>
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -70,7 +70,7 @@ public:

    private:
        void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
-            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
+            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
            if (!tt) {
                throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver->name, receiver->type->as_cql3_type()));
            }
--- a/database.cc
+++ b/database.cc
@@ -1967,7 +1967,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
        return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), partitioner, std::move(s), pr, ps, pc,
                std::move(trace_state), fwd_mr);
    });
-    return make_flat_multi_range_reader(std::move(schema), std::move(ms), std::move(range_generator), schema->full_slice(),
+    auto&& full_slice = schema->full_slice();
+    return make_flat_multi_range_reader(std::move(schema), std::move(ms), std::move(range_generator), std::move(full_slice),
            service::get_local_streaming_read_priority(), {}, mutation_reader::forwarding::no);
 }

--- a/db/config.hh
+++ b/db/config.hh
@@ -735,6 +735,7 @@ public:
    val(shutdown_announce_in_ms, uint32_t, 2 * 1000, Used, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.") \
    val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
    val(skip_wait_for_gossip_to_settle, int32_t, -1, Used, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.") \
+    val(force_gossip_generation, int32_t, -1, Used, "Force gossip to use the generation number provided by user") \
    val(experimental, bool, false, Used, "Set to true to unlock experimental features.") \
    val(lsa_reclamation_step, size_t, 1, Used, "Minimum number of segments to reclaim in a single step") \
    val(prometheus_port, uint16_t, 9180, Used, "Prometheus port, set to zero to disable") \
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -405,11 +405,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_m
            return _proxy.send_to_endpoint(std::move(m), end_point_key(), { }, write_type::SIMPLE, service::allow_hints::no);
        } else {
            manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
-            // FIXME: using 1h as infinite timeout. If a node is down, we should get an
-            // unavailable exception.
-            auto timeout = db::timeout_clock::now() + 1h;
            //FIXME: Add required frozen_mutation overloads
-            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
+            return _proxy.mutate_hint_from_scratch(std::move(m));
        }
    });
 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1092,10 +1092,31 @@ static std::vector<V> get_list(const query::result_set_row& row, const sstring&
 // Create types for a given keyspace. This takes care of topologically sorting user defined types.
 template <typename T> static std::vector<user_type> create_types(keyspace_metadata& ks, T&& range) {
    cql_type_parser::raw_builder builder(ks);
+    std::unordered_set<bytes> names;
    for (const query::result_set_row& row : range) {
-        builder.add(row.get_nonnull<sstring>("type_name"),
-                        get_list<sstring>(row, "field_names"),
-                        get_list<sstring>(row, "field_types"));
+        auto name = row.get_nonnull<sstring>("type_name");
+        names.insert(to_bytes(name));
+        builder.add(std::move(name), get_list<sstring>(row, "field_names"), get_list<sstring>(row, "field_types"));
+    }
+    // Add user types that use any of the above types. From the
+    // database point of view they haven't changed since the content
+    // of system.types is the same for them. The runtime objects, on
+    // the other hand, now point to out of date types, so we need to
+    // recreate them.
+    for (const auto& p : ks.user_types()->get_all_types()) {
+        const user_type& t = p.second;
+        if (names.count(t->_name) != 0) {
+            continue;
+        }
+        for (const auto& name : names) {
+            if (t->references_user_type(t->_keyspace, name)) {
+                std::vector<sstring> field_types;
+                for (const data_type& f : t->field_types()) {
+                    field_types.push_back(f->as_cql3_type().to_string());
+                }
+                builder.add(t->get_name_as_string(), t->string_field_names(), std::move(field_types));
+            }
+        }
    }
    return builder.build();
 }
--- a/db/view/build_progress_virtual_reader.hh
+++ b/db/view/build_progress_virtual_reader.hh
@@ -44,6 +44,11 @@ namespace db::view {
 // columns. When reading the results from the scylla_views_builds_in_progress
 // table, we adjust the clustering key (we shed the cpu_id column) and map
 // back the regular columns.
+// Since mutation fragment consumers expect clustering_row fragments
+// not to be duplicated for given primary key, previous clustering key
+// is stored between mutation fragments. If the clustering key becomes
+// the same as the previous one (as a result of trimming cpu_id),
+// the duplicated fragment is ignored.
 class build_progress_virtual_reader {
    database& _db;

@@ -55,6 +60,7 @@ class build_progress_virtual_reader {
        const query::partition_slice& _legacy_slice;
        query::partition_slice _slice;
        flat_mutation_reader _underlying;
+        std::optional<clustering_key> _previous_clustering_key;

        build_progress_reader(
                schema_ptr legacy_schema,
@@ -79,7 +85,8 @@ class build_progress_virtual_reader {
                        pc,
                        std::move(trace_state),
                        fwd,
-                        fwd_mr)) {
+                        fwd_mr))
+                , _previous_clustering_key() {
        }

        const schema& underlying_schema() const {
@@ -127,8 +134,13 @@ class build_progress_virtual_reader {
                                legacy_in_progress_row.append_cell(_legacy_generation_number_col, std::move(c));
                            }
                        });
+                        auto ck = adjust_ckey(scylla_in_progress_row.key());
+                        if (_previous_clustering_key && ck.equal(*_schema, *_previous_clustering_key)) {
+                            continue;
+                        }
+                        _previous_clustering_key = ck;
                        mf = clustering_row(
-                                adjust_ckey(scylla_in_progress_row.key()),
+                                std::move(ck),
                                std::move(scylla_in_progress_row.tomb()),
                                std::move(scylla_in_progress_row.marker()),
                                std::move(legacy_in_progress_row));
@@ -140,6 +152,8 @@ class build_progress_virtual_reader {
                                adjust_ckey(scylla_in_progress_rt.end),
                                scylla_in_progress_rt.end_kind,
                                scylla_in_progress_rt.tomb);
+                    } else if (mf.is_end_of_partition()) {
+                        _previous_clustering_key.reset();
                    }
                    push_mutation_fragment(std::move(mf));
                }
@@ -192,4 +206,4 @@ public:
    }
 };

-}
+}
--- a/dist/debian/build_deb.sh
+++ b/dist/debian/build_deb.sh
@@ -125,7 +125,7 @@ if [ -z "$TARGET" ]; then
 fi
 RELOC_PKG_FULLPATH=$(readlink -f $RELOC_PKG)
 RELOC_PKG_BASENAME=$(basename $RELOC_PKG)
-SCYLLA_VERSION=$(cat SCYLLA-VERSION-FILE)
+SCYLLA_VERSION=$(cat SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/')
 SCYLLA_RELEASE=$(cat SCYLLA-RELEASE-FILE)

 ln -fv $RELOC_PKG_FULLPATH ../$PRODUCT-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
--- a/dist/debian/debian/scylla-server.postrm
+++ b/dist/debian/debian/scylla-server.postrm
@@ -4,7 +4,11 @@ set -e

 case "$1" in
    purge|remove)
-        rm -rf /etc/systemd/system/scylla-server.service.d/
+        # We need to keep dependencies.conf and sysconfdir.conf on 'remove',
+        # otherwise it will be missing after rollback.
+        if [ "$1" = "purge" ]; then
+            rm -rf /etc/systemd/system/scylla-server.service.d/
+        fi
        ;;
 esac

--- a/dist/debian/scylla-server.install.mustache
+++ b/dist/debian/scylla-server.install.mustache
@@ -15,3 +15,4 @@ dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system
 dist/common/systemd/scylla-fstrim.timer /lib/systemd/system
 dist/debian/scripts/scylla_save_coredump usr/lib/scylla
 dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla
+tools/scyllatop usr/lib/scylla
--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -15,6 +15,10 @@ Obsoletes:	scylla-server < 1.1
 %global __brp_python_bytecompile %{nil}
 %global __brp_mangle_shebangs %{nil}

+# Prevent find-debuginfo.sh from tampering with scylla's build-id (#5881)
+%undefine _unique_build_ids
+%global _no_recompute_build_ids 1
+
 %description
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
--- a/exceptions/exceptions.hh
+++ b/exceptions/exceptions.hh
@@ -98,6 +98,13 @@ public:
    sstring get_message() const { return what(); }
 };

+class server_exception : public cassandra_exception { // A cassandra_exception that is always constructed with exception_code::SERVER_ERROR.
+public:
+    server_exception(sstring msg) noexcept // msg is the error text; it is moved into the base-class constructor
+        : exceptions::cassandra_exception{exceptions::exception_code::SERVER_ERROR, std::move(msg)}
+    { }
+};
+
 class protocol_exception : public cassandra_exception {
 public:
    protocol_exception(sstring msg) noexcept
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1612,11 +1612,15 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
    // message on all cpus and forard them to cpu0 to process.
    return get_gossiper().invoke_on_all([do_bind] (gossiper& g) {
        g.init_messaging_service_handler(do_bind);
-    }).then([this, generation_nbr, preload_local_states] {
+    }).then([this, generation_nbr, preload_local_states] () mutable {
        build_seeds_list();
-        /* initialize the heartbeat state for this localEndpoint */
-        maybe_initialize_local_state(generation_nbr);
+        if (_cfg.force_gossip_generation() > 0) {
+            generation_nbr = _cfg.force_gossip_generation();
+            logger.warn("Use the generation number provided by user: generation = {}", generation_nbr);
+        }
        endpoint_state& local_state = endpoint_state_map[get_broadcast_address()];
+        local_state.set_heart_beat_state_and_update_timestamp(heart_beat_state(generation_nbr));
+        local_state.mark_alive();
        for (auto& entry : preload_local_states) {
            local_state.add_application_state(entry.first, entry.second);
        }
@@ -1820,7 +1824,8 @@ future<> gossiper::do_stop_gossiping() {
        if (my_ep_state && !is_silent_shutdown_state(*my_ep_state)) {
            logger.info("Announcing shutdown");
            add_local_application_state(application_state::STATUS, _value_factory.shutdown(true)).get();
-            for (inet_address addr : _live_endpoints) {
+            auto live_endpoints = _live_endpoints;
+            for (inet_address addr : live_endpoints) {
                msg_addr id = get_msg_addr(addr);
                logger.trace("Sending a GossipShutdown to {}", id);
                ms().send_gossip_shutdown(id, get_broadcast_address()).then_wrapped([id] (auto&&f) {
--- a/locator/simple_strategy.cc
+++ b/locator/simple_strategy.cc
@@ -53,13 +53,13 @@ std::vector<inet_address> simple_strategy::calculate_natural_endpoints(const tok
    endpoints.reserve(replicas);

    for (auto& token : tm.ring_range(t)) {
+        if (endpoints.size() == replicas) {
+           break;
+        }
        auto ep = tm.get_endpoint(token);
        assert(ep);

        endpoints.push_back(*ep);
-        if (endpoints.size() == replicas) {
-           break;
-        }
    }

    return std::move(endpoints.get_vector());
--- a/main.cc
+++ b/main.cc
@@ -54,6 +54,7 @@
 #include <seastar/core/file.hh>
 #include <sys/time.h>
 #include <sys/resource.h>
+#include <sys/prctl.h>
 #include "disk-error-handler.hh"
 #include "tracing/tracing.hh"
 #include "tracing/tracing_backend_registry.hh"
@@ -323,6 +324,15 @@ static std::optional<std::vector<sstring>> parse_hinted_handoff_enabled(sstring
 }

 int main(int ac, char** av) {
+    // Allow core dumps. They would be disabled by default if
+    // CAP_SYS_NICE was added to the binary, as is suggested by the
+    // epoll backend.
+    int r = prctl(PR_SET_DUMPABLE, 1, 0, 0, 0);
+    if (r) {
+        std::cerr << "Could not make scylla dumpable\n";
+        exit(1);
+    }
+
  int return_value = 0;
  try {
    // early check to avoid triggering
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -39,6 +39,9 @@
 #include "mutation_cleaner.hh"
 #include <seastar/core/execution_stage.hh>
 #include "types/map.hh"
+#include "utils/exceptions.hh"
+
+logging::logger mplog("mutation_partition");

 template<bool reversed>
 struct reversal_traits;
@@ -1227,7 +1230,9 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
 void
 row::append_cell(column_id id, atomic_cell_or_collection value) {
    if (_type == storage_type::vector && id < max_vector_size) {
-        assert(_storage.vector.v.size() <= id);
+        if (_storage.vector.v.size() > id) {
+            on_internal_error(mplog, format("Attempted to append cell#{} to row already having {} cells", id, _storage.vector.v.size()));
+        }
        _storage.vector.v.resize(id);
        _storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
        _storage.vector.present.set(id);
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -371,6 +371,10 @@ class repair_writer {
    std::vector<std::optional<seastar::queue<mutation_fragment_opt>>> _mq;
    // Current partition written to disk
    std::vector<lw_shared_ptr<const decorated_key_with_hash>> _current_dk_written_to_sstable;
+    // Is current partition still open. A partition is opened when a
+    // partition_start is written and is closed when a partition_end is
+    // written.
+    std::vector<bool> _partition_opened;
 public:
    repair_writer(
            schema_ptr schema,
@@ -385,10 +389,13 @@ public:
    future<> write_start_and_mf(lw_shared_ptr<const decorated_key_with_hash> dk, mutation_fragment mf, unsigned node_idx)  {
        _current_dk_written_to_sstable[node_idx] = dk;
        if (mf.is_partition_start()) {
-            return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
+            return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf))).then([this, node_idx] {
+                _partition_opened[node_idx] = true;
+            });
        } else {
            auto start = mutation_fragment(partition_start(dk->dk, tombstone()));
            return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(start))).then([this, node_idx, mf = std::move(mf)] () mutable {
+                _partition_opened[node_idx] = true;
                return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
            });
        }
@@ -398,6 +405,7 @@ public:
        _writer_done.resize(_nr_peer_nodes);
        _mq.resize(_nr_peer_nodes);
        _current_dk_written_to_sstable.resize(_nr_peer_nodes);
+        _partition_opened.resize(_nr_peer_nodes, false);
    }

    void create_writer(unsigned node_idx) {
@@ -434,12 +442,21 @@ public:
        t.stream_in_progress());
    }

+    future<> write_partition_end(unsigned node_idx) { // Pushes a partition_end fragment to node_idx's queue only if _partition_opened[node_idx] is set; otherwise returns a ready future without pushing anything.
+        if (_partition_opened[node_idx]) {
+            return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this, node_idx] {
+                _partition_opened[node_idx] = false; // cleared in the continuation, i.e. only once the push has completed
+            });
+        }
+        return make_ready_future<>(); // no open partition for this node: nothing to close
+    }
+
    future<> do_write(unsigned node_idx, lw_shared_ptr<const decorated_key_with_hash> dk, mutation_fragment mf) {
        if (_current_dk_written_to_sstable[node_idx]) {
            if (_current_dk_written_to_sstable[node_idx]->dk.equal(*_schema, dk->dk)) {
                return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
            } else {
-                return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this,
+                return write_partition_end(node_idx).then([this,
                        node_idx, dk = std::move(dk), mf = std::move(mf)] () mutable {
                    return write_start_and_mf(std::move(dk), std::move(mf), node_idx);
                });
@@ -453,7 +470,7 @@ public:
        return parallel_for_each(boost::irange(unsigned(0), unsigned(_nr_peer_nodes)), [this] (unsigned node_idx) {
            if (_writer_done[node_idx] && _mq[node_idx]) {
                // Partition_end is never sent on wire, so we have to write one ourselves.
-                return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this, node_idx] () mutable {
+                return write_partition_end(node_idx).then([this, node_idx] () mutable {
                    // Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
                    return _mq[node_idx]->push_eventually(mutation_fragment_opt()).then([this, node_idx] () mutable {
                        return (*_writer_done[node_idx]).then([] (uint64_t partitions) {
@@ -1458,7 +1475,7 @@ class row_level_repair {

    // If the total size of the `_row_buf` on either of the nodes is zero,
    // we set this flag, which is an indication that rows are not synced.
-    bool _zero_rows;
+    bool _zero_rows = false;

    // Sum of estimated_partitions on all peers
    uint64_t _estimated_partitions = 0;
--- a/2
+++ b/2
--- a/service/misc_services.cc
+++ b/service/misc_services.cc
@@ -162,13 +162,14 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
        auto& g = gms::get_local_gossiper();
        auto& ss = get_local_storage_service();
        _slen = _gstate.size();
-        g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(_gstate));
-        // if max difference during this round is big schedule next recalculate earlier
-        if (_diff < 0.01) {
-            return std::chrono::milliseconds(2000);
-        } else {
-            return std::chrono::milliseconds(500);
-        }
+        return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(_gstate)).then([this] {
+            // if max difference during this round is big schedule next recalculate earlier
+            if (_diff < 0.01) {
+                return std::chrono::milliseconds(2000);
+            } else {
+                return std::chrono::milliseconds(500);
+            }
+        });
    }).finally([this] {
        _gstate = std::string(); // free memory, do not trust clear() to do that for string
        _rates.clear();
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -1560,6 +1560,14 @@ future<> storage_proxy::send_to_endpoint(
            allow_hints);
 }

+future<> storage_proxy::mutate_hint_from_scratch(frozen_mutation_and_schema fm_a_s) { // unfreezes the hint and submits it through mutate_internal()
+    // FIXME: using 1h as infinite timeout. If a node is down, we should get an
+    // unavailable exception.
+    const auto timeout = db::timeout_clock::now() + 1h;
+    std::array<mutation, 1> ms{fm_a_s.fm.unfreeze(fm_a_s.s)}; // one-element batch holding the mutation unfrozen with its own schema
+    return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, timeout); // written with consistency level ALL
+}
+
 /**
 * Send the mutations to the right targets, write it locally if it corresponds or writes a hint when the node
 * is not available.
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -387,6 +387,8 @@ public:
    */
    future<> mutate_atomically(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state);

+    future<> mutate_hint_from_scratch(frozen_mutation_and_schema fm_a_s);
+
    // Send a mutation to one specific remote target.
    // Inspired by Cassandra's StorageProxy.sendToHintedEndpoints but without
    // hinted handoff support, and just one target. See also
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -1440,7 +1440,8 @@ future<> storage_service::drain_on_shutdown() {
            ss._sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::stop).get();
            slogger.info("Drain on shutdown: system distributed keyspace stopped");

-            get_storage_proxy().invoke_on_all([&ss] (storage_proxy& local_proxy) mutable {
+            get_storage_proxy().invoke_on_all([] (storage_proxy& local_proxy) mutable {
+                auto& ss = service::get_local_storage_service();
                ss.unregister_subscriber(&local_proxy);
                return local_proxy.drain_on_shutdown();
            }).get();
--- a/table.cc
+++ b/table.cc
@@ -2518,7 +2518,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema
        std::move(slice),
        std::move(m),
        [base, views = std::move(views), lock = std::move(lock), this, timeout, source = std::move(source), &io_priority] (auto& pk, auto& slice, auto& m) mutable {
-            auto reader = source.make_reader(base, pk, slice, io_priority);
+            auto reader = source.make_reader(base, pk, slice, io_priority, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
            return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader)).then([lock = std::move(lock)] () mutable {
                // return the local partition/row lock we have taken so it
                // remains locked until the caller is done modifying this
--- a/tests/UUID_test.cc
+++ b/tests/UUID_test.cc
@@ -77,3 +77,45 @@ BOOST_AUTO_TEST_CASE(test_make_random_uuid) {
    std::sort(uuids.begin(), uuids.end());
    BOOST_CHECK(std::unique(uuids.begin(), uuids.end()) == uuids.end());
 }
+
+BOOST_AUTO_TEST_CASE(test_get_time_uuid) { // every get_time_UUID() overload must yield a time-based UUID
+    using namespace std::chrono;
+
+    auto uuid = utils::UUID_gen::get_time_UUID();
+    BOOST_CHECK(uuid.is_timestamp());
+
+    const auto tp = system_clock::now();
+    uuid = utils::UUID_gen::get_time_UUID(tp);
+    BOOST_CHECK(uuid.is_timestamp());
+
+    const auto millis = duration_cast<milliseconds>(tp.time_since_epoch()).count();
+    uuid = utils::UUID_gen::get_time_UUID(millis);
+    BOOST_CHECK(uuid.is_timestamp());
+
+    const auto unix_timestamp = utils::UUID_gen::unix_timestamp(uuid); // round trip back to milliseconds
+    BOOST_CHECK_EQUAL(unix_timestamp, millis); // prints both values if they differ
+}
+
+BOOST_AUTO_TEST_CASE(test_min_time_uuid) { // min_time_UUID() must be time-based and keep the millisecond timestamp
+    using namespace std::chrono;
+
+    const auto tp = system_clock::now();
+    const auto millis = duration_cast<milliseconds>(tp.time_since_epoch()).count();
+    const auto uuid = utils::UUID_gen::min_time_UUID(millis);
+    BOOST_CHECK(uuid.is_timestamp());
+
+    const auto unix_timestamp = utils::UUID_gen::unix_timestamp(uuid); // round trip back to milliseconds
+    BOOST_CHECK_EQUAL(unix_timestamp, millis); // prints both values if they differ
+}
+
+BOOST_AUTO_TEST_CASE(test_max_time_uuid) { // max_time_UUID() must be time-based and keep the millisecond timestamp
+    using namespace std::chrono;
+
+    const auto tp = system_clock::now();
+    const auto millis = duration_cast<milliseconds>(tp.time_since_epoch()).count();
+    const auto uuid = utils::UUID_gen::max_time_UUID(millis);
+    BOOST_CHECK(uuid.is_timestamp());
+
+    const auto unix_timestamp = utils::UUID_gen::unix_timestamp(uuid); // round trip back to milliseconds
+    BOOST_CHECK_EQUAL(unix_timestamp, millis); // prints both values if they differ
+}
--- a/tests/cql_query_test.cc
+++ b/tests/cql_query_test.cc
@@ -1526,6 +1526,18 @@ SEASTAR_TEST_CASE(test_user_type_nested) {
    });
 }

+SEASTAR_TEST_CASE(test_user_type_reversed) { // a frozen user type used as a descending clustering column
+    return do_with_cql_env_thread([](cql_test_env& e) {
+        e.execute_cql("create type my_type (a int);").get();
+        e.execute_cql("create table tbl (a int, b frozen<my_type>, primary key ((a), b)) with clustering order by (b desc);").get();
+        e.execute_cql("insert into tbl (a, b) values (1, (2));").get();
+        assert_that(e.execute_cql("select a,b.a from tbl;").get0()) // field selection on the reversed column
+                .is_rows()
+                .with_size(1)
+                .with_row({int32_type->decompose(1), int32_type->decompose(2)});
+    });
+}
+
 SEASTAR_TEST_CASE(test_user_type) {
    return do_with_cql_env([] (cql_test_env& e) {
        return e.execute_cql("create type ut1 (my_int int, my_bigint bigint, my_text text);").discard_result().then([&e] {
@@ -3577,3 +3589,239 @@ SEASTAR_TEST_CASE(test_describe_varchar) {
                });
   });
 }
+
+SEASTAR_TEST_CASE(test_alter_type_on_compact_storage_with_no_regular_columns_does_not_crash) { // regression scenario named by the test
+    return do_with_cql_env_thread([] (cql_test_env& e) {
+        cquery_nofail(e, "CREATE TYPE my_udf (first text);");
+        cquery_nofail(e, "create table z (pk int, ck frozen<my_udf>, primary key(pk, ck)) with compact storage;"); // only key columns, no regular ones
+        cquery_nofail(e, "alter type my_udf add test_int int;"); // altering the type used by the clustering key must succeed
+    });
+}
+
+shared_ptr<cql_transport::messages::result_message> cql_func_require_nofail( // runs "SELECT fct(inp) FROM t;" and reports a test error if it throws
+        cql_test_env& env,
+        const seastar::sstring& fct, // function name placed before the parentheses
+        const seastar::sstring& inp, // text placed between the parentheses (may be empty)
+        std::unique_ptr<cql3::query_options>&& qo = nullptr, // optional query options; when null the plain execute_cql overload is used
+        const std::experimental::source_location& loc = std::experimental::source_location::current()) { // call site, printed on failure
+    auto res = shared_ptr<cql_transport::messages::result_message>(nullptr); // stays null if the query throws
+    auto query = format("SELECT {}({}) FROM t;", fct, inp);
+    try {
+        if (qo) {
+            res = env.execute_cql(query, std::move(qo)).get0();
+        } else {
+            res = env.execute_cql(query).get0();
+        }
+        BOOST_TEST_MESSAGE(format("Query '{}' succeeded as expected", query));
+    } catch (...) {
+        BOOST_ERROR(format("query '{}' failed unexpectedly with error: {}\n{}:{}: originally from here", // records the failure; the exception is not rethrown
+                query, std::current_exception(),
+                loc.file_name(), loc.line()));
+    }
+    return res;
+}
+
+// FIXME: should be in cql_assertions, but we don't want to call boost from cql_assertions.hh
+template <typename Exception>
+void cql_func_require_throw( // runs "SELECT fct(inp) FROM t;" and reports a test error unless it throws Exception
+        cql_test_env& env,
+        const seastar::sstring& fct, // function name placed before the parentheses
+        const seastar::sstring& inp, // text placed between the parentheses
+        std::unique_ptr<cql3::query_options>&& qo = nullptr, // optional query options; when null the plain execute_cql overload is used
+        const std::experimental::source_location& loc = std::experimental::source_location::current()) { // call site, printed on failure
+    auto query = format("SELECT {}({}) FROM t;", fct, inp);
+    try {
+        if (qo) {
+            env.execute_cql(query, std::move(qo)).get();
+        } else {
+            env.execute_cql(query).get();
+        }
+        BOOST_ERROR(format("query '{}' succeeded unexpectedly\n{}:{}: originally from here", query,
+                loc.file_name(), loc.line()));
+    } catch (const Exception& e) { // the exception is only read, so bind it by reference-to-const
+        BOOST_TEST_MESSAGE(format("Query '{}' failed as expected with error: {}", query, e));
+    } catch (...) { // any other exception type counts as a failure
+        BOOST_ERROR(format("query '{}' failed with unexpected error: {}\n{}:{}: originally from here",
+                query, std::current_exception(),
+                loc.file_name(), loc.line()));
+    }
+}
+
+static void create_time_uuid_fcts_schema(cql_test_env& e) { // creates table t with one row, one column per argument type under test
+    cquery_nofail(e, "CREATE TABLE t (id int primary key, t timestamp, l bigint, f float, u timeuuid, d date)");
+    cquery_nofail(e, "INSERT INTO t (id, t, l, f, u, d) VALUES " // l is the value of t multiplied by 1000
+            "(1, 1579072460606, 1579072460606000, 1579072460606, a66525e0-3766-11ea-8080-808080808080, '2020-01-13')");
+    cquery_nofail(e, "SELECT * FROM t;"); // the inserted row must be readable
+}
+
+SEASTAR_TEST_CASE(test_basic_time_uuid_fcts) { // the zero-argument time functions must all evaluate without error
+    return do_with_cql_env_thread([] (auto& e) {
+        create_time_uuid_fcts_schema(e);
+
+        cql_func_require_nofail(e, "currenttime", ""); // empty input string: each function is called with no arguments
+        cql_func_require_nofail(e, "currentdate", "");
+        cql_func_require_nofail(e, "now", "");
+        cql_func_require_nofail(e, "currenttimeuuid", "");
+        cql_func_require_nofail(e, "currenttimestamp", "");
+    });
+}
+
+SEASTAR_TEST_CASE(test_time_uuid_fcts_input_validation) { // checks which argument types each time/uuid function accepts
+    return do_with_cql_env_thread([] (auto& e) {
+        create_time_uuid_fcts_schema(e); // columns: t timestamp, l bigint, f float, u timeuuid, d date
+
+        // test timestamp arg
+        auto require_timestamp = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "t");
+            cql_func_require_throw<exceptions::server_exception>(e, fct, "l"); // bigint column: expected to fail with server_exception rather than invalid_request_exception
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "f");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "u");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "d");
+
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttime()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currentdate()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "now()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttimeuuid()");
+            cql_func_require_nofail(e, fct, "currenttimestamp()");
+        };
+
+        require_timestamp("mintimeuuid");
+        require_timestamp("maxtimeuuid");
+
+        // test timeuuid arg
+        auto require_timeuuid = [&e] (const sstring& fct) {
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "t");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "l");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "f");
+            cql_func_require_nofail(e, fct, "u");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "d");
+
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttime()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currentdate()");
+            cql_func_require_nofail(e, fct, "now()");
+            cql_func_require_nofail(e, fct, "currenttimeuuid()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttimestamp()");
+        };
+
+        require_timeuuid("dateof");
+        require_timeuuid("unixtimestampof");
+
+        // test timeuuid or date arg
+        auto require_timeuuid_or_date = [&e] (const sstring& fct) {
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "t");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "l");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "f");
+            cql_func_require_nofail(e, fct, "u");
+            cql_func_require_nofail(e, fct, "d");
+
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttime()");
+            cql_func_require_nofail(e, fct, "currentdate()");
+            cql_func_require_nofail(e, fct, "now()");
+            cql_func_require_nofail(e, fct, "currenttimeuuid()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttimestamp()");
+        };
+
+        require_timeuuid_or_date("totimestamp");
+
+        // test timestamp or timeuuid arg
+        auto require_timestamp_or_timeuuid = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "t");
+            cql_func_require_throw<std::exception>(e, fct, "l"); // only this case accepts any std::exception; the other groups name server_exception
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "f");
+            cql_func_require_nofail(e, fct, "u");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "d");
+
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttime()");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currentdate()");
+            cql_func_require_nofail(e, fct, "now()");
+            cql_func_require_nofail(e, fct, "currenttimeuuid()");
+            cql_func_require_nofail(e, fct, "currenttimestamp()");
+        };
+
+        require_timestamp_or_timeuuid("todate");
+
+        // test timestamp, timeuuid, or date arg
+        auto require_timestamp_timeuuid_or_date = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "t");
+            cql_func_require_throw<exceptions::server_exception>(e, fct, "l");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "f");
+            cql_func_require_nofail(e, fct, "u");
+            cql_func_require_nofail(e, fct, "d");
+
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "currenttime()");
+            cql_func_require_nofail(e, fct, "currentdate()");
+            cql_func_require_nofail(e, fct, "now()");
+            cql_func_require_nofail(e, fct, "currenttimeuuid()");
+            cql_func_require_nofail(e, fct, "currenttimestamp()");
+        };
+
+        require_timestamp_timeuuid_or_date("tounixtimestamp");
+    });
+}
+
+SEASTAR_TEST_CASE(test_time_uuid_fcts_result) { // feeds the result of one time/uuid function into another
+    return do_with_cql_env_thread([] (auto& e) {
+        create_time_uuid_fcts_schema(e); // columns used below: t timestamp, u timeuuid
+
+        // test timestamp arg
+        auto require_timestamp = [&e] (const sstring& fct) {
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "mintimeuuid(t)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "maxtimeuuid(t)");
+            cql_func_require_nofail(e, fct, "dateof(u)");
+            cql_func_require_nofail(e, fct, "unixtimestampof(u)");
+            cql_func_require_nofail(e, fct, "totimestamp(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "todate(u)");
+            cql_func_require_nofail(e, fct, "tounixtimestamp(u)");
+        };
+
+        require_timestamp("mintimeuuid");
+        require_timestamp("maxtimeuuid");
+
+        // test timeuuid arg
+        auto require_timeuuid = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "mintimeuuid(t)");
+            cql_func_require_nofail(e, fct, "maxtimeuuid(t)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "dateof(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "unixtimestampof(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "totimestamp(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "todate(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "tounixtimestamp(u)");
+        };
+
+        require_timeuuid("dateof");
+        require_timeuuid("unixtimestampof");
+
+        // test timeuuid or date arg
+        auto require_timeuuid_or_date = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "mintimeuuid(t)");
+            cql_func_require_nofail(e, fct, "maxtimeuuid(t)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "dateof(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "unixtimestampof(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "totimestamp(u)");
+            cql_func_require_nofail(e, fct, "todate(u)");
+            cql_func_require_throw<exceptions::invalid_request_exception>(e, fct, "tounixtimestamp(u)");
+        };
+
+        require_timeuuid_or_date("totimestamp");
+
+        // test timestamp or timeuuid arg
+        auto require_timestamp_or_timeuuid = [&e] (const sstring& fct) { // NOTE(review): the body is empty, so the "todate" call below checks nothing — confirm whether the cases were left out on purpose
+        };
+
+        require_timestamp_or_timeuuid("todate");
+
+        // test timestamp, timeuuid, or date arg
+        auto require_timestamp_timeuuid_or_date = [&e] (const sstring& fct) {
+            cql_func_require_nofail(e, fct, "mintimeuuid(t)");
+            cql_func_require_nofail(e, fct, "maxtimeuuid(t)");
+            cql_func_require_nofail(e, fct, "dateof(u)");
+            cql_func_require_nofail(e, fct, "unixtimestampof(u)");
+            cql_func_require_nofail(e, fct, "totimestamp(u)");
+            cql_func_require_nofail(e, fct, "todate(u)");
+            cql_func_require_nofail(e, fct, "tounixtimestamp(u)");
+        };
+
+        require_timestamp_timeuuid_or_date("tounixtimestamp");
+    });
+}
+
--- a/tests/eventually.hh
+++ b/tests/eventually.hh
@@ -25,7 +25,7 @@
 #include <seastar/util/noncopyable_function.hh>

 inline
-void eventually(noncopyable_function<void ()> f, size_t max_attempts = 12) {
+void eventually(noncopyable_function<void ()> f, size_t max_attempts = 17) {
    size_t attempts = 0;
    while (true) {
        try {
@@ -43,7 +43,7 @@ void eventually(noncopyable_function<void ()> f, size_t max_attempts = 12) {

 inline
 bool eventually_true(noncopyable_function<bool ()> f) {
-    const unsigned max_attempts = 10;
+    const unsigned max_attempts = 15;
    unsigned attempts = 0;
    while (true) {
        if (f()) {
--- a/tests/schema_change_test.cc
+++ b/tests/schema_change_test.cc
@@ -421,6 +421,47 @@ public:
    virtual void on_drop_view(const sstring&, const sstring&) override { ++drop_view_count; }
 };

+SEASTAR_TEST_CASE(test_alter_nested_type) { // altering a type that is nested inside another type
+    return do_with_cql_env_thread([](cql_test_env& e) {
+        e.execute_cql("CREATE TYPE foo (foo_k int);").get();
+        e.execute_cql("CREATE TYPE bar (bar_k frozen<foo>);").get(); // bar embeds foo
+        e.execute_cql("alter type foo add zed_v int;").get(); // the inner type gains a field after bar was created
+        e.execute_cql("CREATE TABLE tbl (key int PRIMARY KEY, val frozen<bar>);").get();
+        e.execute_cql("insert into tbl (key, val) values (1, {bar_k: {foo_k: 2, zed_v: 3} });").get(); // the insert sets the new field through bar
+    });
+}
+
+SEASTAR_TEST_CASE(test_nested_type_mutation_in_update) {
+    // ALTER TYPE always creates a mutation with a single type. This
+    // creates a mutation with 2 types, one nested in the other, to
+    // show that we can handle that.
+    return do_with_cql_env_thread([](cql_test_env& e) {
+        counting_migration_listener listener;
+        service::get_local_migration_manager().register_listener(&listener); // NOTE(review): listener is a local object and no unregistration is visible here — confirm nothing notifies it after this lambda returns
+
+        e.execute_cql("CREATE TYPE foo (foo_k int);").get();
+        e.execute_cql("CREATE TYPE bar (bar_k frozen<foo>);").get();
+
+        BOOST_REQUIRE_EQUAL(listener.create_user_type_count, 2); // one creation per CREATE TYPE above
+
+        service::migration_manager& mm = service::get_local_migration_manager();
+        auto&& keyspace = e.db().local().find_keyspace("ks").metadata();
+
+        auto type1 = user_type_impl::get_instance("ks", to_bytes("foo"), {"foo_k", "extra"}, {int32_type, int32_type}); // foo with an added "extra" field
+        auto muts1 = db::schema_tables::make_create_type_mutations(keyspace, type1, api::new_timestamp());
+
+        auto type2 = user_type_impl::get_instance("ks", to_bytes("bar"), {"bar_k", "extra"}, {type1, int32_type}); // bar built on the new foo, also with "extra"
+        auto muts2 = db::schema_tables::make_create_type_mutations(keyspace, type2, api::new_timestamp());
+
+        auto muts = muts1;
+        muts.insert(muts.end(), muts2.begin(), muts2.end()); // both type mutations go out in one announcement
+        mm.announce(std::move(muts), false).get();
+
+        BOOST_REQUIRE_EQUAL(listener.create_user_type_count, 2); // unchanged: both types already existed
+        BOOST_REQUIRE_EQUAL(listener.update_user_type_count, 2); // each type is reported as an update
+    });
+}
+
 SEASTAR_TEST_CASE(test_notifications) {
    return do_with_cql_env([](cql_test_env& e) {
        return seastar::async([&] {
--- a/tests/types_test.cc
+++ b/tests/types_test.cc
@@ -385,6 +385,8 @@ BOOST_AUTO_TEST_CASE(test_varint) {
    BOOST_CHECK_EQUAL(value_cast<boost::multiprecision::cpp_int>(varint_type->deserialize(from_hex("00deadbeef"))), boost::multiprecision::cpp_int("0xdeadbeef"));
    BOOST_CHECK_EQUAL(value_cast<boost::multiprecision::cpp_int>(varint_type->deserialize(from_hex("00ffffffffffffffffffffffffffffffff"))), boost::multiprecision::cpp_int("340282366920938463463374607431768211455"));

+    BOOST_REQUIRE_EQUAL(from_hex("80000000"), varint_type->decompose(boost::multiprecision::cpp_int(-2147483648)));
+
    test_parsing_fails(varint_type, "1A");
 }

--- a/tools/toolchain/dbuild
+++ b/tools/toolchain/dbuild
@@ -38,6 +38,7 @@ if [[ "$1" = -* ]]; then
 fi

 docker_common_args=(
+       --pids-limit -1 \
       --network host \
       -u "$(id -u):$(id -g)" \
       "${group_args[@]}" \
--- a/tracing/tracing.cc
+++ b/tracing/tracing.cc
@@ -206,8 +206,9 @@ void tracing::set_trace_probability(double p) {
 }

 one_session_records::one_session_records()
-    : backend_state_ptr(tracing::get_local_tracing_instance().allocate_backend_session_state())
-    , budget_ptr(tracing::get_local_tracing_instance().get_cached_records_ptr()) {}
+    : _local_tracing_ptr(tracing::get_local_tracing_instance().shared_from_this())
+    , backend_state_ptr(_local_tracing_ptr->allocate_backend_session_state())
+    , budget_ptr(_local_tracing_ptr->get_cached_records_ptr()) {}

 std::ostream& operator<<(std::ostream& os, const span_id& id) {
    return os << id.get_id();
--- a/tracing/tracing.hh
+++ b/tracing/tracing.hh
@@ -240,6 +240,8 @@ public:
 };

 class one_session_records {
+private:
+    shared_ptr<tracing> _local_tracing_ptr;
 public:
    utils::UUID session_id;
    session_record session_rec;
@@ -665,7 +667,7 @@ private:

 void one_session_records::set_pending_for_write() {
    _is_pending_for_write = true;
-    budget_ptr = tracing::get_local_tracing_instance().get_pending_records_ptr();
+    budget_ptr = _local_tracing_ptr->get_pending_records_ptr();
 }

 void one_session_records::data_consumed() {
@@ -674,7 +676,7 @@ void one_session_records::data_consumed() {
    }

    _is_pending_for_write = false;
-    budget_ptr = tracing::get_local_tracing_instance().get_cached_records_ptr();
+    budget_ptr = _local_tracing_ptr->get_cached_records_ptr();
 }

 inline span_id span_id::make_span_id() {
--- a/types.cc
+++ b/types.cc
@@ -1558,6 +1558,13 @@ public:
        }
        out = std::copy(b.crbegin(), b.crend(), out);
    }
+    static size_t serialized_size_aux(const boost::multiprecision::cpp_int& num) {
+        if (!num) {
+            return 1; // zero is reported as one byte
+        }
+        const auto bits = boost::multiprecision::msb(num) + 2; // index of the highest set bit, plus two
+        return align_up(bits, 8u) / 8; // aligned to a multiple of 8 bits, then expressed in bytes
+    }
    virtual size_t serialized_size(const void* value) const override {
        if (!value) {
            return 0;
@@ -1570,8 +1577,10 @@ public:
        if (!num) {
            return 1;
        }
-        auto pnum = abs(num);
-        return align_up(boost::multiprecision::msb(pnum) + 2, 8u) / 8;
+        if (num < 0) {
+            return serialized_size_aux(-num - 1);
+        }
+        return serialized_size_aux(num);
    }
    virtual int32_t compare(bytes_view v1, bytes_view v2) const override {
        if (v1.empty()) {
@@ -2087,8 +2096,7 @@ struct empty_type_impl : abstract_type {
        return false;
    }
    virtual std::optional<data_type> update_user_type(const shared_ptr<const user_type_impl> updated) const {
-        // Can't happen
-        abort();
+        return std::nullopt;
    }
 };

--- a/types/user.hh
+++ b/types/user.hh
@@ -51,6 +51,7 @@ public:
    bytes_view field_name(size_t i) const { return _field_names[i]; }
    sstring field_name_as_string(size_t i) const { return _string_field_names[i]; }
    const std::vector<bytes>& field_names() const { return _field_names; }
+    const std::vector<sstring>& string_field_names() const { return _string_field_names; }
    sstring get_name_as_string() const;
    virtual sstring cql3_type_name_impl() const override;
    virtual bool is_native() const override { return false; }
--- a/utils/UUID.hh
+++ b/utils/UUID.hh
@@ -59,11 +59,15 @@ public:
        return (most_sig_bits >> 12) & 0xf;
    }

+    bool is_timestamp() const { // true when this is a version 1 UUID
+        return version() == 1; // the same condition timestamp() asserts before decoding
+    }
+
    int64_t timestamp() const {
        //if (version() != 1) {
        //     throw new UnsupportedOperationException("Not a time-based UUID");
        //}
-        assert(version() == 1);
+        assert(is_timestamp());

        return ((most_sig_bits & 0xFFF) << 48) |
               (((most_sig_bits >> 16) & 0xFFFF) << 32) |
--- a/utils/UUID_gen.hh
+++ b/utils/UUID_gen.hh
@@ -75,7 +75,7 @@ private:
    // placement of this singleton is important.  It needs to be instantiated *AFTER* the other statics.
    static thread_local const std::unique_ptr<UUID_gen> instance;

-    int64_t last_nanos = 0;
+    uint64_t last_nanos = 0;

    UUID_gen()
    {
@@ -91,7 +91,9 @@ public:
     */
    static UUID get_time_UUID()
    {
-        return UUID(instance->create_time_safe(), clock_seq_and_node);
+        auto uuid = UUID(instance->create_time_safe(), clock_seq_and_node);
+        assert(uuid.is_timestamp());
+        return uuid;
    }

    /**
@@ -101,7 +103,9 @@ public:
     */
    static UUID get_time_UUID(int64_t when)
    {
-        return UUID(create_time(from_unix_timestamp(when)), clock_seq_and_node);
+        auto uuid = UUID(create_time(from_unix_timestamp(when)), clock_seq_and_node);
+        assert(uuid.is_timestamp());
+        return uuid;
    }

    /**
@@ -115,12 +119,21 @@ public:
        // "nanos" needs to be in 100ns intervals since the adoption of the Gregorian calendar in the West.
        uint64_t nanos = duration_cast<nanoseconds>(tp.time_since_epoch()).count() / 100;
        nanos -= (10000ULL * START_EPOCH);
-        return UUID(create_time(nanos), clock_seq_and_node);
+        auto uuid = UUID(create_time(nanos), clock_seq_and_node);
+        assert(uuid.is_timestamp());
+        return uuid;
    }

    static UUID get_time_UUID(int64_t when, int64_t clock_seq_and_node)
    {
-        return UUID(create_time(from_unix_timestamp(when)), clock_seq_and_node);
+        auto uuid = UUID(create_time(from_unix_timestamp(when)), clock_seq_and_node);
+        assert(uuid.is_timestamp());
+        return uuid;
+    }
+
+    /** validates uuid from raw bytes: the buffer must be exactly 16 bytes long. */
+    static bool is_valid_UUID(const bytes& raw) { // by const reference: only the size is read, so no copy is needed
+        return raw.size() == 16;
    }

    /** creates uuid from raw bytes. */
@@ -176,7 +189,9 @@ public:
     */
    static UUID min_time_UUID(int64_t timestamp)
    {
-        return UUID(create_time(from_unix_timestamp(timestamp)), MIN_CLOCK_SEQ_AND_NODE);
+        auto uuid = UUID(create_time(from_unix_timestamp(timestamp)), MIN_CLOCK_SEQ_AND_NODE);
+        assert(uuid.is_timestamp());
+        return uuid;
    }

    /**
@@ -192,7 +207,9 @@ public:
        // timestamp 1ms, then we should not extend 100's nanoseconds
        // precision by taking 10000, but rather 19999.
        int64_t uuid_tstamp = from_unix_timestamp(timestamp + 1) - 1;
-        return UUID(create_time(uuid_tstamp), MAX_CLOCK_SEQ_AND_NODE);
+        auto uuid = UUID(create_time(uuid_tstamp), MAX_CLOCK_SEQ_AND_NODE);
+        assert(uuid.is_timestamp());
+        return uuid;
    }

    /**
@@ -276,6 +293,15 @@ public:
        return (uuid.timestamp() / 10000) + START_EPOCH;
    }

+    static uint64_t make_nanos_since(int64_t millis) { // offsets a millisecond timestamp by START_EPOCH and scales it by 10000 (ms -> 100ns units)
+        return (static_cast<uint64_t>(millis) - static_cast<uint64_t>(START_EPOCH)) * 10000; // unsigned arithmetic: out-of-range inputs wrap instead of overflowing a signed value
+    }
+
+    // A valid nanos_since uses only its low 60 bits.
+    static bool is_valid_nanos_since(uint64_t nanos_since) {
+        return (nanos_since >> 60) == 0; // the top four bits must all be clear
+    }
+
 private:

    // needs to return two different values for the same when.
@@ -287,7 +313,7 @@ private:
        using namespace std::chrono;
        int64_t millis = duration_cast<milliseconds>(
                system_clock::now().time_since_epoch()).count();
-        int64_t nanos_since = (millis - START_EPOCH) * 10000;
+        uint64_t nanos_since = make_nanos_since(millis);
        if (nanos_since > last_nanos)
            last_nanos = nanos_since;
        else
@@ -298,16 +324,17 @@ private:

    int64_t create_time_unsafe(int64_t when, int nanos)
    {
-        uint64_t nanos_since = ((when - START_EPOCH) * 10000) + nanos;
+        uint64_t nanos_since = make_nanos_since(when) + static_cast<uint64_t>(static_cast<int64_t>(nanos));
        return create_time(nanos_since);
    }

    static int64_t create_time(uint64_t nanos_since)
    {
        uint64_t msb = 0L;
+        assert(is_valid_nanos_since(nanos_since));
        msb |= (0x00000000ffffffffL & nanos_since) << 32;
        msb |= (0x0000ffff00000000UL & nanos_since) >> 16;
-        msb |= (0xffff000000000000UL & nanos_since) >> 48;
+        msb |= (0x0fff000000000000UL & nanos_since) >> 48;
        msb |= 0x0000000000001000L; // sets the version to 1.
        return msb;
    }
--- a/utils/logalloc.cc
+++ b/utils/logalloc.cc
@@ -2065,6 +2065,17 @@ bool segment_pool::migrate_segment(segment* src, segment* dst)
 #endif

 void tracker::impl::register_region(region::impl* r) {
+    // If needed, increase capacity of regions before taking the reclaim lock,
+    // to avoid failing an allocation when push_back() tries to increase
+    // capacity.
+    //
+    // The capacity increase is atomic (wrt _regions) so it cannot be
+    // observed
+    if (_regions.size() == _regions.capacity()) {
+        auto copy = _regions;
+        copy.reserve(copy.capacity() * 2);
+        _regions = std::move(copy);
+    }
    reclaiming_lock _(*this);
    _regions.push_back(r);
    llogger.debug("Registered region @{} with id={}", r, r->id());