doc: remove wrong image upgrade info (5.2-to-2023.1)

This commit removes the description of the recommended way of upgrading ScyllaDB images, which was to update ScyllaDB and OS packages in one step. That upgrade procedure is not supported (it was implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733
Closes scylladb/scylladb#21876
Fixes https://github.com/scylladb/scylla-enterprise/issues/5041
Fixes https://github.com/scylladb/scylladb/issues/21898
(cherry picked from commit 98860905d8)
db/config.cc: increment components_memory_reclaim_threshold config default
2024-12-12 15:28:20 +02:00 · 2024-06-04 07:13:28 +03:00 · 2024-05-30 11:11:39 +03:00 · 2024-05-30 11:10:49 +03:00 · 2024-05-27 08:52:06 +03:00 · 2024-05-26 16:30:06 +03:00
212 changed files with 4124 additions and 1387 deletions
--- a/2
+++ b/2
@@ -72,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.2.11
+VERSION=5.2.19

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -764,7 +764,6 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
    const rjson::value* tags = rjson::find(request, "Tags");
    if (!tags || !tags->IsArray()) {
        co_return api_error::validation("Cannot parse tags");
@@ -772,8 +771,9 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
    if (tags->Size() < 1) {
        co_return api_error::validation("The number of tags must be at least 1") ;
    }
-    update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
+    });
    co_return json_string("");
 }

@@ -791,9 +791,9 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+    });
    co_return json_string("");
 }

--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -94,24 +94,25 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    }
    sstring attribute_name(v->GetString(), v->GetStringLength());

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    if (enabled) {
-        if (tags_map.contains(TTL_TAG_KEY)) {
-            co_return api_error::validation("TTL is already enabled");
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
+        if (enabled) {
+            if (tags_map.contains(TTL_TAG_KEY)) {
+                throw api_error::validation("TTL is already enabled");
+            }
+            tags_map[TTL_TAG_KEY] = attribute_name;
+        } else {
+            auto i = tags_map.find(TTL_TAG_KEY);
+            if (i == tags_map.end()) {
+                throw api_error::validation("TTL is already disabled");
+            } else if (i->second != attribute_name) {
+                throw api_error::validation(format(
+                    "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
+                    attribute_name, i->second));
+            }
+            tags_map.erase(TTL_TAG_KEY);
        }
-        tags_map[TTL_TAG_KEY] = attribute_name;
-    } else {
-        auto i = tags_map.find(TTL_TAG_KEY);
-        if (i == tags_map.end()) {
-            co_return api_error::validation("TTL is already disabled");
-        } else if (i->second != attribute_name) {
-            co_return api_error::validation(format(
-                "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
-                attribute_name, i->second));
-        }
-        tags_map.erase(TTL_TAG_KEY);
-    }
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    });
+
    // Prepare the response, which contains a TimeToLiveSpecification
    // basically identical to the request's
    rjson::value response = rjson::empty_object();
--- a/api/api-doc/raft.json
+++ b/api/api-doc/raft.json
@@ -0,0 +1,43 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/raft",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/raft/trigger_snapshot/{group_id}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Triggers snapshot creation and log truncation for the given Raft group",
+               "type":"string",
+               "nickname":"trigger_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"group_id",
+                     "description":"The ID of the group which should get snapshotted",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api.cc
+++ b/api/api.cc
@@ -31,6 +31,7 @@
 #include "api/config.hh"
 #include "task_manager.hh"
 #include "task_manager_test.hh"
+#include "raft.hh"

 logging::logger apilog("api");

@@ -277,6 +278,18 @@ future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::confi

 #endif

+future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) { // registers "raft" with the API doc builder, then installs the handlers via set_raft
+    auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
+    return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) { // rb is copied into the closure; ctx and raft_gr are captured by reference and must stay alive while routes are set
+        rb->register_function(r, "raft", "The Raft API");
+        set_raft(ctx, r, raft_gr);
+    });
+}
+
+future<> unset_server_raft(http_context& ctx) { // counterpart of set_server_raft: runs unset_raft on the HTTP server's routes
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
+}
+
 void req_params::process(const request& req) {
    // Process mandatory parameters
    for (auto& [name, ent] : params) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -22,6 +22,7 @@ namespace service {
 class load_meter;
 class storage_proxy;
 class storage_service;
+class raft_group_registry;

 } // namespace service

@@ -116,5 +117,7 @@ future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx);
 future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::config> cfg);
+future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
+future<> unset_server_raft(http_context&);

 }
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include <seastar/core/coroutine.hh>
+
+#include "api/api.hh"
+#include "api/api-doc/raft.json.hh"
+
+#include "service/raft/raft_group_registry.hh"
+
+using namespace seastar::httpd;
+
+extern logging::logger apilog;
+
+namespace api {
+
+namespace r = httpd::raft_json;
+using namespace json;
+
+void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) { // installs the trigger_snapshot handler on `r`
+    r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
+        raft::group_id gid{utils::UUID{req->param["group_id"]}}; // "group_id" path parameter, converted to a UUID
+        auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] { // optional "timeout" query parameter, interpreted as seconds
+            if (timeout_str.empty()) {
+                return std::chrono::seconds{60}; // default when no timeout was supplied
+            }
+            auto dur = std::stoll(timeout_str); // NOTE(review): stoll throws on non-numeric input and ignores trailing characters ("10abc" parses as 10)
+            if (dur <= 0) {
+                throw std::runtime_error{"Timeout must be a positive number."};
+            }
+            return std::chrono::seconds{dur};
+        });
+
+        std::atomic<bool> found_srv{false}; // set to true by any registry instance that has a server for gid
+        co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& raft_gr) -> future<> { // the parameter shadows the outer raft_gr
+            auto* srv = raft_gr.find_server(gid);
+            if (!srv) {
+                co_return; // no server for this group on this registry instance
+            }
+
+            found_srv = true;
+            abort_on_expiry aoe(lowres_clock::now() + timeout_dur); // presumably fires its abort source at the deadline - TODO confirm
+            apilog.info("Triggering Raft group {} snapshot", gid);
+            auto result = co_await srv->trigger_snapshot(&aoe.abort_source());
+            if (result) {
+                apilog.info("New snapshot for Raft group {} created", gid);
+            } else {
+                apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
+            }
+        });
+
+        if (!found_srv) { // no registry instance had a server for the requested group
+            throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)};
+        }
+
+        co_return json_void{};
+    });
+}
+
+void unset_raft(http_context&, httpd::routes& r) { // removes the trigger_snapshot handler from `r`
+    r::trigger_snapshot.unset(r);
+}
+
+}
+
--- a/api/raft.hh
+++ b/api/raft.hh
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2023-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "api_init.hh"
+
+namespace api {
+
+void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr);
+void unset_raft(http_context& ctx, httpd::routes& r);
+
+}
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -176,7 +176,9 @@ void set_task_manager(http_context& ctx, routes& r) {
        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
            return task->done().then_wrapped([task] (auto f) {
                task->unregister_task();
-                f.get();
+                // done() is called only because we want the task to be complete before getting its status.
+                // The future should be ignored here as the result does not matter.
+                f.ignore_ready_future();
                return make_foreign(task);
            });
        }));
@@ -204,8 +206,8 @@ void set_task_manager(http_context& ctx, routes& r) {
        while (!q.empty()) {
            auto& current = q.front();
            res.push_back(co_await retrieve_status(current));
-            for (auto i = 0; i < current->get_children().size(); ++i) {
-                q.push(co_await current->get_children()[i].copy());
+            for (auto& child: current->get_children()) {
+                q.push(co_await child.copy());
            }
            q.pop();
        }
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -229,6 +229,8 @@ future<authenticated_user> password_authenticator::authenticate(
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
        } catch (exceptions::authentication_exception& e) {
            std::throw_with_nested(e);
+        } catch (exceptions::unavailable_exception& e) {
+            std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
        } catch (...) {
            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -25,6 +25,7 @@
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
 #include "utils/UUID_gen.hh"
+#include "utils/error_injection.hh"

 #include "cdc/generation.hh"
 #include "cdc/cdc_options.hh"
@@ -44,8 +45,16 @@ static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const

 namespace cdc {

-extern const api::timestamp_clock::duration generation_leeway =
-    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+api::timestamp_clock::duration get_generation_leeway() { // returns the CDC generation leeway: 5 seconds unless the injection below has raised it
+    static thread_local auto generation_leeway = // one value per thread, initialized on first call
+            std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+    utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] { // error-injection hook; once it runs, the 5-minute value persists for later calls on this thread
+        generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5));
+    });
+
+    return generation_leeway;
+}

 static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    i = net::hton(i);
@@ -160,18 +169,18 @@ bool token_range_description::operator==(const token_range_description& o) const
        && sharding_ignore_msb == o.sharding_ignore_msb;
 }

-topology_description::topology_description(std::vector<token_range_description> entries)
+topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
    : _entries(std::move(entries)) {}

 bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const& {
+const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-std::vector<token_range_description>&& topology_description::entries() && {
+utils::chunked_vector<token_range_description>&& topology_description::entries() && {
    return std::move(_entries);
 }

@@ -263,7 +272,7 @@ public:
    topology_description generate() const {
        const auto tokens = get_tokens();

-        std::vector<token_range_description> vnode_descriptions;
+        utils::chunked_vector<token_range_description> vnode_descriptions;
        vnode_descriptions.reserve(tokens.size());

        vnode_descriptions.push_back(
@@ -331,7 +340,7 @@ future<cdc::generation_id> generation_service::make_new_generation(const std::un
    auto new_generation_timestamp = [add_delay, ring_delay = _cfg.ring_delay] {
        auto ts = db_clock::now();
        if (add_delay && ring_delay != 0ms) {
-            ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
+            ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
        }
        return ts;
    };
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -46,6 +46,8 @@ namespace gms {

 namespace cdc {

+api::timestamp_clock::duration get_generation_leeway();
+
 class stream_id final {
    bytes _value;
 public:
@@ -94,13 +96,13 @@ struct token_range_description {
 * in the `_entries` vector. See the comment above `token_range_description` for explanation.
 */
 class topology_description {
-    std::vector<token_range_description> _entries;
+    utils::chunked_vector<token_range_description> _entries;
 public:
-    topology_description(std::vector<token_range_description> entries);
+    topology_description(utils::chunked_vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const&;
-    std::vector<token_range_description>&& entries() &&;
+    const utils::chunked_vector<token_range_description>& entries() const&;
+    utils::chunked_vector<token_range_description>&& entries() &&;
 };

 /**
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -15,10 +15,6 @@

 extern logging::logger cdc_log;

-namespace cdc {
-    extern const api::timestamp_clock::duration generation_leeway;
-} // namespace cdc
-
 static api::timestamp_type to_ts(db_clock::time_point tp) {
    // This assumes that timestamp_clock and db_clock have the same epochs.
    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -40,7 +36,7 @@ static cdc::stream_id get_stream(

 // non-static for testing
 cdc::stream_id get_stream(
-        const std::vector<cdc::token_range_description>& entries,
+        const utils::chunked_vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
        on_internal_error(cdc_log, "get_stream: entries empty");
@@ -73,7 +69,7 @@ bool cdc::metadata::streams_available() const {

 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
-    if (ts > now + generation_leeway.count()) {
+    if (ts > now + get_generation_leeway().count()) {
        throw exceptions::invalid_request_exception(format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -86,27 +82,43 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // Nothing protects us from that until we start using transactions for generation switching.
    }

-    auto it = gen_used_at(now);
-    if (it == _gens.end()) {
+    auto it = gen_used_at(now - get_generation_leeway().count());
+
+    if (it != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        it = _gens.erase(_gens.begin(), it);
+    }
+
+    if (ts <= now - get_generation_leeway().count()) {
+        // We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
+        // happens iff one of the following is true:
+        // - the write is to no generation,
+        // - the write is to a generation older than the generation under `it`,
+        // - the write is to the generation under `it` and that generation is not the current generation.
+        // Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
+        // but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
+        // are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
+        // the generation under `it` because that generation was operating at `now - generation_leeway`.
+        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
+        if (it == _gens.end() || ts < it->first || is_previous_gen) {
+            throw exceptions::invalid_request_exception(format(
+                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
+                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                    " consistency properties.\n"
+                    "We *do* allow sending writes into the near past, but our ability to do that is limited."
+                    " Are you using client-side timestamps? Make sure your clocks are well-synchronized"
+                    " with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        }
+    }
+
+    it = _gens.begin();
+    if (it == _gens.end() || ts < it->first) {
        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
    }

-    // Garbage-collect generations that will no longer be used.
-    it = _gens.erase(_gens.begin(), it);
-
-    if (it->first > ts) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream from an earlier generation than the currently used one."
-                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                " consistency properties (write timestamp: {}, current generation started at: {})",
-                format_timestamp(ts), format_timestamp(it->first)));
-    }
-
-    // With `generation_leeway` we allow sending writes to the near future. It might happen
-    // that `ts` doesn't belong to the current generation ("current" according to our clock),
-    // but to the next generation. Adjust for this case:
+    // Find the generation operating at `ts`.
    {
        auto next_it = std::next(it);
        while (next_it != _gens.end() && next_it->first <= ts) {
@@ -147,8 +159,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
        ++it;
    }

-    // Check if some new generation has already superseded this one.
-    return it != _gens.end() && it->first <= api::new_timestamp();
+    // Check if the generation is obsolete.
+    return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
 }

 bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -157,7 +169,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
    }

    auto now = api::new_timestamp();
-    auto it = gen_used_at(now);
+    auto it = gen_used_at(now - get_generation_leeway().count());

    if (it != _gens.end()) {
        // Garbage-collect generations that will no longer be used.
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -42,7 +42,9 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
 public:
-    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
+     * it is older than the generation operating at `now - get_generation_leeway()`.
+     */
    bool known_or_obsolete(db_clock::time_point) const;

    /* Are there streams available. I.e. valid for time == now. If this is false, any writes to 
@@ -54,8 +56,9 @@ public:
     *
     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
-     * by the `cdc::generation_leeway` constant.
+     * yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
+     * the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
+     * `get_generation_leeway()`.
     */
    stream_id get_stream(api::timestamp_type ts, dht::token tok);

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -464,6 +464,7 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    double _estimated_droppable_tombstone_ratio = 0;
    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
@@ -521,7 +522,7 @@ protected:
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
-                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
+                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
    }

    void setup_new_sstable(shared_sstable& sst) {
@@ -584,6 +585,7 @@ protected:
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
        cfg.run_identifier = gc_run;
        cfg.monitor = monitor.get();
+        uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }
@@ -654,6 +656,7 @@ private:
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

+        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
@@ -688,12 +691,16 @@ private:
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            _rp = std::max(_rp, sst_stats.position);
+            auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state());
+            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
        }
        log_info("{} {}", report_start_desc(), formatted_msg);
        if (ssts->all()->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->all()->size(), _sstables.size());
        }
+        // _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
+        _estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->all()->size());

        _compacting = std::move(ssts);

@@ -1620,7 +1627,7 @@ private:
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
+                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
    }
 public:
    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1068,7 +1068,7 @@ void compaction_manager::submit(compaction::table_state& t) {

    // OK to drop future.
    // waited via task->stop()
-    (void)perform_task(make_shared<regular_compaction_task>(*this, t));
+    (void)perform_task(make_shared<regular_compaction_task>(*this, t)).then_wrapped([] (auto f) { f.ignore_ready_future(); });
 }

 bool compaction_manager::can_perform_regular_compaction(compaction::table_state& t) {
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -65,7 +65,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

-uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
+uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
    return partition_estimate;
 }

@@ -750,8 +750,8 @@ compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema
    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

-uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
+uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
+    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
 }

 reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) {
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -108,7 +108,7 @@ public:

    compaction_backlog_tracker make_backlog_tracker();

-    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr);

    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -70,7 +70,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema);

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -100,16 +100,27 @@ public:
    };
 };

-uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
-        // Not enough information, we assume the worst
-        return partition_estimate / max_data_segregation_window_count;
-    }
-    const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
-    const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
-    const auto window_size = get_window_size(_options);
+uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) { // divides partition_estimate by the estimated number of time windows (at least 1)
+    // If not enough information, we assume the worst
+    auto estimated_window_count = max_data_segregation_window_count;
+    auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live()); // table default TTL converted to microseconds
+    bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
+    auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
+        const auto window_size = get_window_size(_options);
+        return (max_window + (window_size - 1) - min_window) / window_size; // (max_window - min_window) / window_size, rounded up
+    };

-    auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
+    if (!min_and_max_ts_available && default_ttl.count()) { // no timestamp metadata, but a non-zero default TTL: estimate from the span [0, default_ttl]
+        auto min_window = get_window_for(_options, timestamp_type(0));
+        auto max_window = get_window_for(_options, timestamp_type(default_ttl.count()));
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    } else if (min_and_max_ts_available) { // both timestamps known: estimate from the windows they fall into
+        auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
+        auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    }

    return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
 }
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -157,7 +157,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) override;

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) override;

--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -560,7 +560,7 @@ public:
            auto marker = it->second;
            ++it;
            if (it != e && marker != composite::eoc::none) {
-                throw runtime_exception(format("non-zero component divider found ({:d}) mid", format("0x{:02x}", composite::eoc_type(marker) & 0xff)));
+                throw runtime_exception(format("non-zero component divider found ({:#02x}) mid", composite::eoc_type(marker) & 0xff));
            }
        }
        return ret;
--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -117,6 +117,8 @@ struct date_type_impl final : public concrete_type<db_clock::time_point> {

 using timestamp_date_base_class = concrete_type<db_clock::time_point>;

+sstring timestamp_to_json_string(const timestamp_date_base_class& t, const bytes_view& bv);
+
 struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    timeuuid_type_impl();
    static utils::UUID from_sstring(sstring_view s);
--- a/configure.py
+++ b/configure.py
@@ -698,6 +698,7 @@ scylla_core = (['message/messaging_service.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
+                'utils/on_internal_error.cc',
                'converting_mutation_partition_applier.cc',
                'readers/combined.cc',
                'readers/multishard.cc',
@@ -969,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
+                'repair/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1077,6 +1079,8 @@ api = ['api/api.cc',
       Json2Code('api/api-doc/error_injection.json'),
       'api/authorization_cache.cc',
       Json2Code('api/api-doc/authorization_cache.json'),
+       'api/raft.cc',
+       Json2Code('api/api-doc/raft.json'),
       ]

 alternator = [
@@ -1269,7 +1273,7 @@ deps['test/boost/bytes_ostream_test'] = [
    "test/lib/log.cc",
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
-deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
+deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc', 'utils/on_internal_error.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -78,7 +78,7 @@ static
 void
 usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    if (!receiver.type->is_user_type()) {
-        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto ut = static_pointer_cast<const user_type_impl>(receiver.type);
@@ -90,7 +90,7 @@ usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_
        const expression& value = u.elements.at(field);
        auto&& field_spec = usertype_field_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *field_spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
        }
    }
 }
@@ -313,7 +313,7 @@ set_validate_assignable_to(const collection_constructor& c, data_dictionary::dat
            return;
        }

-        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto&& value_spec = set_value_spec_of(receiver);
@@ -501,18 +501,18 @@ void
 tuple_constructor_validate_assignable_to(const tuple_constructor& tc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver.type->underlying_type());
    if (!tt) {
-        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    for (size_t i = 0; i < tc.elements.size(); ++i) {
        if (i >= tt->size()) {
            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: too many elements. Type {} expects {:d} but got {:d}",
-                                                            receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
+                                                            *receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
        }

        auto&& value = tc.elements[i];
        auto&& spec = component_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));
        }
    }
 }
--- a/cql3/operation.cc
+++ b/cql3/operation.cc
@@ -32,9 +32,9 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
    using exceptions::invalid_request_exception;
    auto rtype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!rtype) {
-        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!rtype->is_multi_cell()) {
-        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (rtype->get_kind() == abstract_type::kind::list) {
@@ -47,7 +47,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
            return make_shared<lists::setter_by_index>(receiver, std::move(idx), std::move(lval));
        }
    } else if (rtype->get_kind() == abstract_type::kind::set) {
-        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name_as_text()));
    } else if (rtype->get_kind() == abstract_type::kind::map) {
        auto key = prepare_expression(_selector, db, keyspace, nullptr, maps::key_spec_of(*receiver.column_specification));
        auto mval = prepare_expression(_value, db, keyspace, nullptr, maps::value_spec_of(*receiver.column_specification));
@@ -136,11 +136,11 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        return make_shared<constants::adder>(receiver, std::move(v));
    } else if (!ctype->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -169,14 +169,14 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
        return make_shared<constants::subtracter>(receiver, std::move(v));
    }
    if (!ctype->is_multi_cell()) {
        throw exceptions::invalid_request_exception(
-                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -211,9 +211,9 @@ operation::prepend::prepare(data_dictionary::database db, const sstring& keyspac
    auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);

    if (!dynamic_cast<const list_type_impl*>(receiver.type.get())) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name_as_text()));
    }

    return make_shared<lists::prepender>(receiver, std::move(v));
@@ -340,9 +340,9 @@ operation::element_deletion::affected_column() const {
 shared_ptr<operation>
 operation::element_deletion::prepare(data_dictionary::database db, const sstring& keyspace, const column_definition& receiver) const {
    if (!receiver.type->is_collection()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name_as_text()));
    }
    auto ctype = static_pointer_cast<const collection_type_impl>(receiver.type);
    if (ctype->get_kind() == abstract_type::kind::list) {
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -932,6 +932,9 @@ void query_processor::migration_subscriber::on_update_aggregate(const sstring& k
 void query_processor::migration_subscriber::on_update_view(
        const sstring& ks_name,
        const sstring& view_name, bool columns_changed) {
+    // scylladb/scylladb#16392 - Materialized views are also tables, so we need to at least
+    // handle them as such when changed.
+    on_update_column_family(ks_name, view_name, columns_changed);
 }

 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -404,20 +404,19 @@ alter_table_statement::prepare_schema_mutations(query_processor& qp, api::timest

 std::unique_ptr<cql3::statements::prepared_statement>
 cql3::statements::alter_table_statement::prepare(data_dictionary::database db, cql_stats& stats) {
+    auto t = db.try_find_table(keyspace(), column_family());
+    std::optional<schema_ptr> s = t ? std::make_optional(t->schema()) : std::nullopt;
+    std::optional<sstring> warning = check_restricted_table_properties(db, s, keyspace(), column_family(), *_properties);
+    if (warning) {
+        mylogger.warn("{}", *warning);
+    }
    return std::make_unique<prepared_statement>(make_shared<alter_table_statement>(*this));
 }

 future<::shared_ptr<messages::result_message>>
 alter_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    auto s = validation::validate_column_family(qp.db(), keyspace(), column_family());
-    std::optional<sstring> warning = check_restricted_table_properties(qp, s, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
+    validation::validate_column_family(qp.db(), keyspace(), column_family());
+    return schema_altering_statement::execute(qp, state, options);
 }

 }
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -141,6 +141,18 @@ user_type alter_type_statement::add_or_alter::do_add(data_dictionary::database d
        throw exceptions::invalid_request_exception(format("Cannot add new field to type {}: maximum number of fields reached", _name));
    }

+    if (_field_type->is_duration()) {
+        auto&& ks = db.find_keyspace(keyspace());
+        for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
+            for (auto&& column : schema->clustering_key_columns()) {
+                if (column.type->references_user_type(_name.get_keyspace(), _name.get_user_type_name())) {
+                    throw exceptions::invalid_request_exception(format("Cannot add new field to type {} because it is used in the clustering key column {} of table {}.{} where durations are not allowed",
+                        _name.to_string(), column.name_as_text(), schema->ks_name(), schema->cf_name()));
+                }
+            }
+        }
+    }
+
    std::vector<bytes> new_names(to_update->field_names());
    new_names.push_back(_field_name->name());
    std::vector<data_type> new_types(to_update->field_types());
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -185,6 +185,10 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    if (_properties.properties()->get_synchronous_updates_flag()) {
        throw exceptions::invalid_request_exception(format("The synchronous_updates option is only applicable to materialized views, not to base tables"));
    }
+    std::optional<sstring> warning = check_restricted_table_properties(db, std::nullopt, keyspace(), column_family(), *_properties.properties());
+    if (warning) {
+        mylogger.warn("{}", *warning);
+    }
    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());
@@ -426,7 +430,7 @@ void create_table_statement::raw_statement::add_column_alias(::shared_ptr<column
 // legal but restricted by the configuration. Checks for other of errors
 // in the table's options are done elsewhere.
 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops)
@@ -450,7 +454,7 @@ std::optional<sstring> check_restricted_table_properties(
    auto cs = (strategy) ? strategy : current_strategy;

    if (strategy && *strategy == sstables::compaction_strategy_type::date_tiered) {
-        switch(qp.db().get_config().restrict_dtcs()) {
+        switch(db.get_config().restrict_dtcs()) {
        case db::tri_mode_restriction_t::mode::TRUE:
            throw exceptions::configuration_exception(
                "DateTieredCompactionStrategy is deprecated, and "
@@ -471,7 +475,7 @@ std::optional<sstring> check_restricted_table_properties(
        std::map<sstring, sstring> options = (strategy) ? cfprops.get_compaction_type_options() : (*schema)->compaction_strategy_options();
        sstables::time_window_compaction_strategy_options twcs_options(options);
        long ttl = (cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE)) ? cfprops.get_default_time_to_live() : current_ttl.count();
-        auto max_windows = qp.db().get_config().twcs_max_window_count();
+        auto max_windows = db.get_config().twcs_max_window_count();

        // It may happen that an user tries to update an unrelated table property. Allow the request through.
        if (!cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE) && !strategy) {
@@ -491,7 +495,7 @@ std::optional<sstring> check_restricted_table_properties(
                                                   "highly discouraged.", ttl, twcs_options.get_sstable_window_size().count(), window_count, max_windows));
            }
        } else {
-              switch (qp.db().get_config().restrict_twcs_without_default_ttl()) {
+              switch (db.get_config().restrict_twcs_without_default_ttl()) {
              case db::tri_mode_restriction_t::mode::TRUE:
                  throw exceptions::configuration_exception(
                      "TimeWindowCompactionStrategy tables without a strict default_time_to_live setting "
@@ -510,18 +514,6 @@ std::optional<sstring> check_restricted_table_properties(
    return std::nullopt;
 }

-future<::shared_ptr<messages::result_message>>
-create_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    std::optional<sstring> warning = check_restricted_table_properties(qp, std::nullopt, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
-}
-
 }

 }
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -79,9 +79,6 @@ public:

    virtual future<> grant_permissions_to_creator(const service::client_state&) const override;

-    virtual future<::shared_ptr<messages::result_message>>
-    execute(query_processor& qp, service::query_state& state, const query_options& options) const override;
-
    schema_ptr get_cf_meta_data(const data_dictionary::database) const;

    class raw_statement;
@@ -129,7 +126,7 @@ public:
 };

 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops);
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -60,7 +60,11 @@ void use_statement::validate(query_processor&, const service::client_state& stat

 future<::shared_ptr<cql_transport::messages::result_message>>
 use_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    state.get_client_state().set_keyspace(qp.db().real_database(), _keyspace);
+    try {
+        state.get_client_state().set_keyspace(qp.db().real_database(), _keyspace);
+    } catch(...) {
+        return make_exception_future<::shared_ptr<cql_transport::messages::result_message>>(std::current_exception());
+    }
    auto result =::make_shared<cql_transport::messages::result_message::set_keyspace>(_keyspace);
    return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(result);
 }
--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -151,14 +151,19 @@ static bytes from_json_object_aux(const map_type_impl& t, const rjson::value& va
    std::map<bytes, bytes, serialized_compare> raw_map(t.get_keys_type()->as_less_comparator());
    for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) {
        bytes value = from_json_object(*t.get_values_type(), it->value);
-        if (!t.get_keys_type()->is_compatible_with(*utf8_type)) {
+        if (t.get_keys_type()->underlying_type() == ascii_type ||
+            t.get_keys_type()->underlying_type() == utf8_type) {
+            raw_map.emplace(from_json_object(*t.get_keys_type(), it->name), std::move(value));
+        } else {
            // Keys in maps can only be strings in JSON, but they can also be a string representation
            // of another JSON type, which needs to be reparsed. Example - map<frozen<list<int>>, int>
            // will be represented like this: { "[1, 3, 6]": 3, "[]": 0, "[1, 2]": 2 }
-            rjson::value map_key = rjson::parse(rjson::to_string_view(it->name));
-            raw_map.emplace(from_json_object(*t.get_keys_type(), map_key), std::move(value));
-        } else {
-            raw_map.emplace(from_json_object(*t.get_keys_type(), it->name), std::move(value));
+            try {
+                rjson::value map_key = rjson::parse(rjson::to_string_view(it->name));
+                raw_map.emplace(from_json_object(*t.get_keys_type(), map_key), std::move(value));
+            } catch (rjson::error& e) {
+                throw marshal_exception(format("Failed parsing map_key {}: {}", it->name, e.what()));
+            }
        }
    }
    return map_type_impl::serialize_to_bytes(raw_map);
@@ -480,7 +485,7 @@ struct to_json_string_visitor {
    sstring operator()(const string_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const bytes_type_impl& t) { return quote_json_string("0x" + t.to_string(bv)); }
    sstring operator()(const boolean_type_impl& t) { return t.to_string(bv); }
-    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(t.to_string(bv)); }
+    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(timestamp_to_json_string(t, bv)); }
    sstring operator()(const timeuuid_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const map_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const set_type_impl& t) { return to_json_string_aux(t, bv); }
@@ -488,7 +493,7 @@ struct to_json_string_visitor {
    sstring operator()(const tuple_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const user_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const simple_date_type_impl& t) { return quote_json_string(t.to_string(bv)); }
-    sstring operator()(const time_type_impl& t) { return t.to_string(bv); }
+    sstring operator()(const time_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const empty_type_impl& t) { return "null"; }
    sstring operator()(const duration_type_impl& t) {
        auto v = t.deserialize(bv);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -2591,12 +2591,20 @@ db::commitlog::read_log_file(sstring filename, sstring pfx, seastar::io_priority
            return eof || next == pos;
        }
        future<> skip(size_t bytes) {
-            pos += bytes;
-            if (pos > file_size) {
+            auto n = std::min(file_size - pos, bytes);
+            pos += n;
+            if (pos == file_size) {
                eof = true;
-                pos = file_size;
            }
-            return fin.skip(bytes);
+            if (n < bytes) {
+                // If we are trying to skip past the end, then at least the
+                // bytes skipped, or the source from which we read this, is
+                // corrupt. So add at least four bytes. This is
+                // inexact, but adding the full "bytes" is equally wrong
+                // since it could be completely garbled junk.
+                corrupt_size += std::max(n, sizeof(uint32_t));
+            }
+            return fin.skip(n);
        }
        void stop() {
            eof = true;
--- a/db/config.cc
+++ b/db/config.cc
@@ -406,6 +406,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Adjusts the sensitivity of the failure detector on an exponential scale. Generally this setting never needs adjusting.\n"
        "Related information: Failure detection and recovery")
    , failure_detector_timeout_in_ms(this, "failure_detector_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 20 * 1000, "Maximum time between two successful echo message before gossip mark a node down in milliseconds.\n")
+    , direct_failure_detector_ping_timeout_in_ms(this, "direct_failure_detector_ping_timeout_in_ms", value_status::Used, 600, "Duration after which the direct failure detector aborts a ping message, so the next ping can start.\n"
+        "Note: this failure detector is used by Raft, and is different from gossiper's failure detector (configured by `failure_detector_timeout_in_ms`).\n")
    /* Performance tuning properties */
    /* Tuning performance and system reso   urce utilization, including commit log, compaction, memory, disk I/O, CPU, reads, and writes. */
    /* Commit log settings */
@@ -817,6 +819,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit")
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default) "
        "bytes written to data file. Value must be between 0 and 1.")
+    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting")
--- a/db/config.hh
+++ b/db/config.hh
@@ -173,6 +173,7 @@ public:
    named_value<bool> snapshot_before_compaction;
    named_value<uint32_t> phi_convict_threshold;
    named_value<uint32_t> failure_detector_timeout_in_ms;
+    named_value<uint32_t> direct_failure_detector_ping_timeout_in_ms;
    named_value<sstring> commitlog_sync;
    named_value<uint32_t> commitlog_segment_size_in_mb;
    named_value<uint32_t> schema_commitlog_segment_size_in_mb;
@@ -322,6 +323,7 @@ public:
    named_value<unsigned> murmur3_partitioner_ignore_msb_bits;
    named_value<double> unspooled_dirty_soft_limit;
    named_value<double> sstable_summary_ratio;
+    named_value<double> components_memory_reclaim_threshold;
    named_value<size_t> large_memory_allocation_warning_threshold;
    named_value<bool> enable_deprecated_partitioners;
    named_value<bool> enable_keyspace_column_family_metrics;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -860,7 +860,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
        ctx_ptr->mark_hint_as_in_progress(rp);

        // Future is waited on indirectly in `send_one_file()` (via `ctx_ptr->file_send_gate`).
-        (void)with_gate(ctx_ptr->file_send_gate, [this, secs_since_file_mod, &fname, buf = std::move(buf), rp, ctx_ptr] () mutable {
+        auto h = ctx_ptr->file_send_gate.hold();
+        (void)std::invoke([this, secs_since_file_mod, &fname, buf = std::move(buf), rp, ctx_ptr] () mutable {
            try {
                auto m = this->get_mutation(ctx_ptr, buf);
                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();
@@ -896,7 +897,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                return make_exception_future<>(std::move(eptr));
            }
            return make_ready_future<>();
-        }).then_wrapped([this, units = std::move(units), rp, ctx_ptr] (future<>&& f) {
+        }).then_wrapped([this, units = std::move(units), rp, ctx_ptr, h = std::move(h)] (future<>&& f) {
            // Information about the error was already printed somewhere higher.
            // We just need to account in the ctx that sending of this hint has failed.
            if (!f.failed()) {
--- a/db/hints/sync_point.cc
+++ b/db/hints/sync_point.cc
@@ -17,13 +17,22 @@
 #include "idl/hinted_handoff.dist.hh"
 #include "idl/hinted_handoff.dist.impl.hh"
 #include "utils/base64.hh"
+#include "xx_hasher.hh"

 namespace db {
 namespace hints {
-
+// Sync points can be encoded in two formats: V1 and V2. V2 extends V1 by adding
+// a checksum. Currently, we use the V2 format, but sync points encoded in the V1
+// format can still be safely decoded.
+//
 // Format V1 (encoded in base64):
 //   uint8_t 0x01 - version of format
-//   sync_point_v1 - encoded using IMR
+//   sync_point_v1 - encoded using IDL
+//
+// Format V2 (encoded in base64):
+//   uint8_t 0x02 - version of format
+//   sync_point_v1 - encoded using IDL
+//   uint64_t - checksum computed using the xxHash algorithm
 //
 // sync_point_v1:
 //   UUID host_id - ID of the host which created the sync point
@@ -41,6 +50,9 @@ namespace hints {
 //       Flattened representation was chosen in order to save space on
 //       vector lengths etc.

+static constexpr size_t version_size = sizeof(uint8_t);
+static constexpr size_t checksum_size = sizeof(uint64_t);
+
 static std::vector<sync_point::shard_rps> decode_one_type_v1(uint16_t shard_count, const per_manager_sync_point_v1& v1) {
    std::vector<sync_point::shard_rps> ret;

@@ -67,16 +79,37 @@ static std::vector<sync_point::shard_rps> decode_one_type_v1(uint16_t shard_coun
    return ret;
 }

+static uint64_t calculate_checksum(const sstring_view s) {
+    xx_hasher h;
+    h.update(s.data(), s.size());
+    return h.finalize_uint64();
+}
+
 sync_point sync_point::decode(sstring_view s) {
    bytes raw = base64_decode(s);
    if (raw.empty()) {
        throw std::runtime_error("Could not decode the sync point - not a valid hex string");
    }
-    if (raw[0] != 1) {
-        throw std::runtime_error(format("Unsupported sync point format version: {}", int(raw[0])));
+
+    sstring_view raw_s(reinterpret_cast<const char*>(raw.data()), raw.size());
+    seastar::simple_memory_input_stream in{raw_s.data(), raw_s.size()};
+
+    uint8_t version = ser::serializer<uint8_t>::read(in);
+    if (version == 2) {
+        if (raw_s.size() < version_size + checksum_size) {
+            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - serialized blob is too short");
+        }
+
+        seastar::simple_memory_input_stream in_checksum{raw_s.end() - checksum_size, checksum_size};
+        uint64_t checksum = ser::serializer<uint64_t>::read(in_checksum);
+        if (checksum != calculate_checksum(raw_s.substr(0, raw_s.size() - checksum_size))) {
+            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - wrong checksum");
+        }
+    }
+    else if (version != 1) {
+        throw std::runtime_error(format("Unsupported sync point format version: {}", int(version)));
    }

-    seastar::simple_memory_input_stream in{reinterpret_cast<const char*>(raw.data()) + 1, raw.size() - 1};
    sync_point_v1 v1 = ser::serializer<sync_point_v1>::read(in);

    return sync_point{
@@ -133,11 +166,16 @@ sstring sync_point::encode() const {
    seastar::measuring_output_stream measure;
    ser::serializer<sync_point_v1>::write(measure, v1);

-    // Reserve 1 byte for the version
-    bytes serialized{bytes::initialized_later{}, 1 + measure.size()};
-    serialized[0] = 1;
-    seastar::simple_memory_output_stream out{reinterpret_cast<char*>(serialized.data()), measure.size(), 1};
+    // Reserve version_size bytes for the version and checksum_size bytes for the checksum
+    bytes serialized{bytes::initialized_later{}, version_size + measure.size() + checksum_size};
+
+    // Encode using V2 format
+    seastar::simple_memory_output_stream out{reinterpret_cast<char*>(serialized.data()), serialized.size()};
+    ser::serializer<uint8_t>::write(out, 2);
    ser::serializer<sync_point_v1>::write(out, v1);
+    sstring_view serialized_s(reinterpret_cast<const char*>(serialized.data()), version_size + measure.size());
+    uint64_t checksum = calculate_checksum(serialized_s);
+    ser::serializer<uint64_t>::write(out, checksum);

    return base64_encode(serialized);
 }
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -157,7 +157,7 @@ future<> cql_table_large_data_handler::try_record(std::string_view large_table,
    const auto sstable_name = large_data_handler::sst_filename(sst);
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
+    large_data_logger.warn("Writing large {} {}/{}: {} ({} bytes) to {}", desc, ks_name, cf_name, extra_path, size, sstable_name);
    return _sys_ks->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -184,10 +184,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells(const sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name);
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -199,10 +199,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells_and_collectio
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
    }
 }

@@ -212,7 +212,7 @@ future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        std::string ck_str = key_to_str(*clustering_key, s);
-        return try_record("row", sst, partition_key, int64_t(row_size), "row", format("/{}", ck_str), extra_fields,  ck_str);
+        return try_record("row", sst, partition_key, int64_t(row_size), "row", "", extra_fields, ck_str);
    } else {
        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -3540,7 +3540,8 @@ view_ptr maybe_fix_legacy_secondary_index_mv_schema(replica::database& db, const
    if (v->clustering_key_size() == 0) {
        return view_ptr(nullptr);
    }
-    const column_definition& first_view_ck = v->clustering_key_columns().front();
+    const auto ck_cols = v->clustering_key_columns();
+    const column_definition& first_view_ck = ck_cols.front();
    if (first_view_ck.is_computed()) {
        return view_ptr(nullptr);
    }
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -486,7 +486,7 @@ system_distributed_keyspace::read_cdc_topology_description(
            return {};
        }

-        std::vector<cdc::token_range_description> entries;
+        utils::chunked_vector<cdc::token_range_description> entries;

        auto entries_val = value_cast<list_type_impl::native_type>(
                cdc_generation_description_type->deserialize(cql_result->one().get_view("description")));
@@ -580,7 +580,7 @@ system_distributed_keyspace::insert_cdc_generation(

 future<std::optional<cdc::topology_description>>
 system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
-    std::vector<cdc::token_range_description> entries;
+    utils::chunked_vector<cdc::token_range_description> entries;
    auto num_ranges = 0;
    co_await _qp.query_internal(
            // This should be a local read so 20s should be more than enough
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -2839,8 +2839,7 @@ static void install_virtual_readers(db::system_keyspace& sys_ks, replica::databa

 static bool maybe_write_in_user_memory(schema_ptr s) {
    return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
-            || s == system_keyspace::v3::scylla_views_builds_in_progress()
-            || s == system_keyspace::raft();
+            || s == system_keyspace::v3::scylla_views_builds_in_progress();
 }

 future<> system_keyspace_make(db::system_keyspace& sys_ks, distributed<replica::database>& dist_db, distributed<service::storage_service>& dist_ss, sharded<gms::gossiper>& dist_gossiper, distributed<service::raft_group_registry>& dist_raft_gr, db::config& cfg, table_selector& tables) {
--- a/db/tags/utils.cc
+++ b/db/tags/utils.cc
@@ -11,6 +11,8 @@
 #include "db/tags/extension.hh"
 #include "schema_builder.hh"
 #include "schema_registry.hh"
+#include "service/storage_proxy.hh"
+#include "data_dictionary/data_dictionary.hh"

 namespace db {

@@ -38,14 +40,27 @@ std::optional<std::string> find_tag(const schema& s, const sstring& tag) {
    }
 }

-future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
-    co_await mm.container().invoke_on(0, [s = global_schema_ptr(std::move(schema)), tags_map = std::move(tags_map)] (service::migration_manager& mm) -> future<> {
+future<> modify_tags(service::migration_manager& mm, sstring ks, sstring cf,
+                     std::function<void(std::map<sstring, sstring>&)> modify) {
+    co_await mm.container().invoke_on(0, [ks = std::move(ks), cf = std::move(cf), modify = std::move(modify)] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
        auto group0_guard = co_await mm.start_group0_operation();
-
+        // After getting the schema-modification lock, we need to read the
+        // table's *current* schema - it might have changed before we got
+        // the lock, by some concurrent modification. If the table is gone,
+        // this will throw no_such_column_family.
+        schema_ptr s = mm.get_storage_proxy().data_dictionary().find_schema(ks, cf);
+        const std::map<sstring, sstring>* tags_ptr = get_tags_of_table(s);
+        std::map<sstring, sstring> tags;
+        if (tags_ptr) {
+            // tags_ptr is a constant pointer to schema data. To allow modify()
+            // to change the tags, we must make a copy.
+            tags = *tags_ptr;
+        }
+        modify(tags);
        schema_builder builder(s);
-        builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags_map));
+        builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags));

        auto m = co_await mm.prepare_column_family_update_announcement(builder.build(), false, std::vector<view_ptr>(), group0_guard.write_timestamp());

--- a/db/tags/utils.hh
+++ b/db/tags/utils.hh
@@ -33,9 +33,18 @@ const std::map<sstring, sstring>* get_tags_of_table(schema_ptr schema);
 // tags exist but not this tag.
 std::optional<std::string> find_tag(const schema& s, const sstring& tag);

-// FIXME: Updating tags currently relies on updating schema, which may be subject
-// to races during concurrent updates of the same table. Once Scylla schema updates
-// are fixed, this issue will automatically get fixed as well.
-future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map);
-
+// modify_tags() atomically modifies the tags on a given table: It reads the
+// existing tags, passes them as a map to the given function which can modify
+// the map, and finally writes the modified tags. This read-modify-write
+// operation is atomic - isolated from other concurrent schema operations.
+//
+// The isolation requirement is also why modify_tags() takes the table's name
+// ks,cf and not a schema object - the current schema may not be relevant by
+// the time the tags are modified, due to some other concurrent modification.
+// If a table (ks, cf) doesn't exist, no_such_column_family is thrown.
+//
+// If the table didn't have the tags schema extension, it's fine: The function
+// is passed an empty map, and the tags it adds will be added to the table.
+future<> modify_tags(service::migration_manager& mm, sstring ks, sstring cf,
+                     std::function<void(std::map<sstring, sstring>&)> modify_func);
 }
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -487,37 +487,56 @@ mutation_partition& view_updates::partition_for(partition_key&& key) {
 }

 size_t view_updates::op_count() const {
-    return _op_count++;;
+    return _op_count;
 }

 row_marker view_updates::compute_row_marker(const clustering_or_static_row& base_row) const {
    /*
-     * We need to compute both the timestamp and expiration.
+     * We need to compute both the timestamp and expiration for view rows.
     *
-     * There are 3 cases:
-     *   1) There is a column that is not in the base PK but is in the view PK. In that case, as long as that column
-     *      lives, the view entry does too, but as soon as it expires (or is deleted for that matter) the entry also
-     *      should expire. So the expiration for the view is the one of that column, regardless of any other expiration.
-     *      To take an example of that case, if you have:
-     *        CREATE TABLE t (a int, b int, c int, PRIMARY KEY (a, b))
-     *        CREATE MATERIALIZED VIEW mv AS SELECT * FROM t WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)
-     *        INSERT INTO t(a, b) VALUES (0, 0) USING TTL 3;
-     *        UPDATE t SET c = 0 WHERE a = 0 AND b = 0;
-     *      then even after 3 seconds elapsed, the row will still exist (it just won't have a "row marker" anymore) and so
-     *      the MV should still have a corresponding entry.
-     *      This cell determines the liveness of the view row.
-     *   2) The columns for the base and view PKs are exactly the same, and all base columns are selected by the view.
-     *      In that case, all components (marker, deletion and cells) are the same and trivially mapped.
-     *   3) The columns for the base and view PKs are exactly the same, but some base columns are not selected in the view.
-     *      Use the max timestamp out of the base row marker and all the unselected columns - this ensures we can keep the
-     *      view row alive. Do the same thing for the expiration, if the marker is dead or will expire, and so
-     *      will all unselected columns.
+     * Below there are several distinct cases depending on how many new key
+     * columns the view has - i.e., how many of the view's key columns were
+     * regular columns in the base. base_regular_columns_in_view_pk.size():
+     *
+     * Zero new key columns:
+     *     The view row's key is composed only from base key columns, and those
+     *     cannot be changed in an update, so the view row remains alive as
+     *     long as the base row is alive. We need to return the same row
+     *     marker as the base for the view - to keep an empty view row alive
+     *     for as long as an empty base row exists.
+     *     Note that in this case, if there are *unselected* base columns, we
+     *     may need to keep an empty view row alive even without a row marker
+     *     because the base row (which has additional columns) is still alive.
+     *     For that we have the "virtual columns" feature: In the zero new
+     *     key columns case, we put unselected columns in the view as empty
+     *     columns, to keep the view row alive.
+     *
+     * One new key column:
+     *     In this case, there is a regular base column that is part of the
+     *     view key. This regular column can be added or deleted in an update,
+     *     or its expiration be set, and those can cause the view row -
+     *     including its row marker - to need to appear or disappear as well.
+     *     So the liveness of the cell of this one column determines the liveness
+     *     of the view row and the row marker that we return.
+     *
+     * Two or more new key columns:
+     *     This case is explicitly NOT supported in CQL - one cannot create a
+     *     view with more than one base-regular column in its key. In general
+     *     picking one liveness (timestamp and expiration) is not possible
+     *     if there are multiple regular base columns in the view key, as
+     *     those can have different liveness.
+     *     However, we do allow this case for Alternator - we need to allow
+     *     the case of two (but not more) because the DynamoDB API allows
+     *     creating a GSI whose two key columns (hash and range key) were
+     *     regular columns.
+     *     We can support this case in Alternator because it doesn't use
+     *     expiration (the "TTL" it does support is different), and doesn't
+     *     support user-defined timestamps. But, the two columns can still
+     *     have different timestamps - this happens if an update modifies
+     *     just one of them. In this case the timestamp of the view update
+     *     (and that of the row marker we return) is the later of these two
+     *     updated columns.
     */
-
-    // WARNING: The code assumes that if multiple regular base columns are present in the view key,
-    // they share liveness information. It's true especially in the only case currently allowed by CQL,
-    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
-    // which does not carry TTL information.
    const auto& col_ids = base_row.is_clustering_row()
            ? _base_info->base_regular_columns_in_view_pk()
            : _base_info->base_static_columns_in_view_pk();
@@ -525,7 +544,20 @@ row_marker view_updates::compute_row_marker(const clustering_or_static_row& base
        auto& def = _base->column_at(base_row.column_kind(), col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
        auto cell = base_row.cells().cell_at(col_ids[0]).as_atomic_cell(def);
-        return cell.is_live_and_has_ttl() ? row_marker(cell.timestamp(), cell.ttl(), cell.expiry()) : row_marker(cell.timestamp());
+        auto ts = cell.timestamp();
+        if (col_ids.size() > 1){
+            // As explained above, this case only happens in Alternator,
+            // and we may need to pick a higher ts:
+            auto& second_def = _base->column_at(base_row.column_kind(), col_ids[1]);
+            auto second_cell = base_row.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+            auto second_ts = second_cell.timestamp();
+            ts = std::max(ts, second_ts);
+            // Alternator isn't supposed to have TTL or more than two col_ids!
+            if (col_ids.size() != 2 || cell.is_live_and_has_ttl() || second_cell.is_live_and_has_ttl()) [[unlikely]] {
+                utils::on_internal_error(format("Unexpected col_ids length {} or has TTL", col_ids.size()));
+            }
+        }
+        return cell.is_live_and_has_ttl() ? row_marker(ts, cell.ttl(), cell.expiry()) : row_marker(ts);
    }

    return base_row.marker();
@@ -923,8 +955,22 @@ void view_updates::do_delete_old_entry(const partition_key& base_key, const clus
            // Note: multi-cell columns can't be part of the primary key.
            auto& def = _base->column_at(kind, col_ids[0]);
            auto cell = existing.cells().cell_at(col_ids[0]).as_atomic_cell(def);
+            auto ts = cell.timestamp();
+            if (col_ids.size() > 1) {
+                // This is the Alternator-only support for two regular base
+                // columns that become view key columns. See explanation in
+                // view_updates::compute_row_marker().
+                auto& second_def = _base->column_at(kind, col_ids[1]);
+                auto second_cell = existing.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+                auto second_ts = second_cell.timestamp();
+                ts = std::max(ts, second_ts);
+                // Alternator isn't supposed to have more than two col_ids!
+                if (col_ids.size() != 2) [[unlikely]] {
+                    utils::on_internal_error(format("Unexpected col_ids length {}", col_ids.size()));
+                }
+            }
            if (cell.is_live()) {
-                r->apply(shadowable_tombstone(cell.timestamp(), now));
+                r->apply(shadowable_tombstone(ts, now));
            }
        } else {
            // "update" caused the base row to have been deleted, and !col_id
@@ -1308,11 +1354,12 @@ void view_update_builder::generate_update(static_row&& update, const tombstone&

 future<stop_iteration> view_update_builder::on_results() {
    constexpr size_t max_rows_for_view_updates = 100;
-    size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
-        return acc + vu.op_count();
-    });
-    const bool stop_updates = rows_for_view_updates >= max_rows_for_view_updates;
-
+    auto should_stop_updates = [this] () -> bool {
+        size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
+            return acc + vu.op_count();
+        });
+        return rows_for_view_updates >= max_rows_for_view_updates;
+    };
    if (_update && !_update->is_end_of_partition() && _existing && !_existing->is_end_of_partition()) {
        auto cmp = position_in_partition::tri_compare(*_schema)(_update->position(), _existing->position());
        if (cmp < 0) {
@@ -1335,7 +1382,7 @@ future<stop_iteration> view_update_builder::on_results() {
                              : std::nullopt;
                generate_update(std::move(update), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
            }
-            return stop_updates ? stop() : advance_updates();
+            return should_stop_updates() ? stop() : advance_updates();
        }
        if (cmp > 0) {
            // We have something existing but no update (which will happen either because it's a range tombstone marker in
@@ -1371,7 +1418,7 @@ future<stop_iteration> view_update_builder::on_results() {
                    generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
                }
            }
-            return stop_updates ? stop () : advance_existings();
+            return should_stop_updates() ? stop () : advance_existings();
        }
        // We're updating a row that had pre-existing data
        if (_update->is_range_tombstone_change()) {
@@ -1393,8 +1440,9 @@ future<stop_iteration> view_update_builder::on_results() {
                                                  mutation_fragment_v2::printer(*_schema, *_update), mutation_fragment_v2::printer(*_schema, *_existing)));
            }
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, { std::move(*_existing).as_static_row() }, _existing_partition_tombstone);
+
        }
-        return stop_updates ? stop() : advance_all();
+        return should_stop_updates() ? stop() : advance_all();
    }

    auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
@@ -1409,7 +1457,7 @@ future<stop_iteration> view_update_builder::on_results() {
            auto update = static_row();
            generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_existings();
+        return should_stop_updates() ? stop() : advance_existings();
    }

    // If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
@@ -1430,7 +1478,7 @@ future<stop_iteration> view_update_builder::on_results() {
                          : std::nullopt;
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_updates();
+        return should_stop_updates() ? stop() : advance_updates();
    }

    return stop();
@@ -1609,6 +1657,13 @@ static bool should_update_synchronously(const schema& s) {
    return *tag_opt == "true";
 }

+size_t memory_usage_of(const frozen_mutation_and_schema& mut) {
+    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy, as well as possible background tasks
+    // allocated for a remote view update.
+    constexpr size_t base_overhead_bytes = 2288;
+    return base_overhead_bytes + mut.fm.representation().size();
+}
+
 // Take the view mutations generated by generate_view_updates(), which pertain
 // to a modification of a single base partition, and apply them to the
 // appropriate paired replicas. This is done asynchronously - we do not wait
@@ -1630,7 +1685,7 @@ future<> mutate_MV(
        auto& keyspace_name = mut.s->ks_name();
        auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto remote_endpoints = service::get_local_storage_proxy().get_token_metadata_ptr()->pending_endpoints_for(view_token, keyspace_name);
-        auto sem_units = pending_view_updates.split(mut.fm.representation().size());
+        auto sem_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_updates.split(memory_usage_of(mut)));

        const bool update_synchronously = should_update_synchronously(*mut.s);
        if (update_synchronously) {
@@ -1678,7 +1733,7 @@ future<> mutate_MV(
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
            local_view_update = service::get_local_storage_proxy().mutate_mv_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
-                            units = sem_units.split(sem_units.count())] (future<>&& f) {
+                            sem_units] (future<>&& f) {
                --stats.writes;
                if (f.failed()) {
                    ++stats.view_updates_failed_local;
@@ -1715,7 +1770,7 @@ future<> mutate_MV(
            schema_ptr s = mut.s;
            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
                    [s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
-                            units = sem_units.split(sem_units.count()), apply_update_synchronously] (future<>&& f) mutable {
+                            sem_units, apply_update_synchronously] (future<>&& f) mutable {
                if (f.failed()) {
                    stats.view_updates_failed_remote += updates_pushed_remote;
                    cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
@@ -2230,7 +2285,7 @@ future<> view_builder::do_build_step() {
            }
        }
    }).handle_exception([] (std::exception_ptr ex) {
-        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", std::current_exception());
+        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", ex);
    });
 }

--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -213,7 +213,7 @@ class view_updates final {
    schema_ptr _base;
    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
-    mutable size_t _op_count = 0;
+    size_t _op_count = 0;
 public:
    explicit view_updates(view_and_base vab)
            : _view(std::move(vab.view))
@@ -327,6 +327,8 @@ future<> mutate_MV(
        service::allow_hints allow_hints,
        wait_for_all_updates wait_for_all);

+size_t memory_usage_of(const frozen_mutation_and_schema& mut);
+
 /**
 * create_virtual_column() adds a "virtual column" to a schema builder.
 * The definition of a "virtual column" is based on the given definition
--- a/direct_failure_detector/failure_detector.cc
+++ b/direct_failure_detector/failure_detector.cc
@@ -96,6 +96,7 @@ struct failure_detector::impl {
    clock& _clock;

    clock::interval_t _ping_period;
+    clock::interval_t _ping_timeout;

    // Number of workers on each shard.
    // We use this to decide where to create new workers (we pick a shard with the smallest number of workers).
@@ -138,7 +139,7 @@ struct failure_detector::impl {
    // The unregistering process requires cross-shard operations which we perform on this fiber.
    future<> _destroy_subscriptions = make_ready_future<>();

-    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period);
+    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
    ~impl();

    // Inform update_endpoint_fiber() about an added/removed endpoint.
@@ -174,12 +175,14 @@ struct failure_detector::impl {
    future<> mark(listener* l, pinger::endpoint_id ep, bool alive);
 };

-failure_detector::failure_detector(pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period))
+failure_detector::failure_detector(
+    pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
 {}

-failure_detector::impl::impl(failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period) {
+failure_detector::impl::impl(
+    failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
    if (this_shard_id() != 0) {
        return;
    }
@@ -536,11 +539,9 @@ future<> endpoint_worker::ping_fiber() noexcept {
        auto start = clock.now();
        auto next_ping_start = start + _fd._ping_period;

-        // A ping should take significantly less time than _ping_period, but we give it a multiple of ping_period before it times out
-        // just in case of transient network partitions.
-        // However, if there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
+        auto timeout = start + _fd._ping_timeout;
+        // If there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
        // the listener (mark it as dead).
-        auto timeout = start + 3 * _fd._ping_period;
        for (auto& [threshold, l]: _fd._listeners_liveness) {
            if (l.endpoint_liveness[_id].alive && last_response + threshold < timeout) {
                timeout = last_response + threshold;
--- a/direct_failure_detector/failure_detector.hh
+++ b/direct_failure_detector/failure_detector.hh
@@ -120,14 +120,14 @@ public:

        // Every endpoint in the detected set will be periodically pinged every `ping_period`,
        // assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
-        // before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
-        // will start immediately.
-        //
-        // `ping_period` should be chosen so that during normal operation, a ping takes significantly
-        // less time than `ping_period` (preferably at least an order of magnitude less).
+        // before it's aborted (up to `ping_timeout`), in which case the next ping will start immediately.
        //
        // The passed-in value must be the same on every shard.
-        clock::interval_t ping_period
+        clock::interval_t ping_period,
+
+        // Duration after which a ping is aborted, so that the next ping can be started
+        // (pings are sent sequentially).
+        clock::interval_t ping_timeout
    );

    ~failure_detector();
@@ -147,7 +147,7 @@ public:
    // The listener stops being called when the returned subscription is destroyed.
    // The subscription must be destroyed before service is stopped.
    //
-    // `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
+    // `threshold` should be significantly larger than `ping_timeout`, preferably at least an order of magnitude larger.
    //
    // Different listeners may use different thresholds, depending on the use case:
    // some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -62,8 +62,7 @@ ExternalSizeMax=1024G
 [Unit]
 Description=Save coredump to scylla data directory
 Conflicts=umount.target
-Before=scylla-server.service
-After=local-fs.target
+Before=local-fs.target scylla-server.service
 DefaultDependencies=no

 [Mount]
@@ -73,7 +72,7 @@ Type=none
 Options=bind

 [Install]
-WantedBy=multi-user.target
+WantedBy=local-fs.target
 '''[1:-1]
            with open('/etc/systemd/system/var-lib-systemd-coredump.mount', 'w') as f:
                f.write(dot_mount)
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -15,10 +15,84 @@ import grp
 import sys
 import stat
 import distro
+import logging
+import pyudev
 from pathlib import Path
 from scylla_util import *
 from subprocess import run, SubprocessError

+LOGGER = logging.getLogger(__name__)
+
+class UdevInfo:
+    def __init__(self, device_file):
+        self.context = pyudev.Context()
+        self.device = pyudev.Devices.from_device_file(self.context, device_file)
+
+    def verify(self):  # log an error for each udev filesystem property that is missing or unexpected
+        if not self.id_fs_uuid:
+            LOGGER.error('ID_FS_UUID is not found')
+        if self.id_fs_type != 'xfs':
+            LOGGER.error('ID_FS_TYPE is not "xfs"')
+        if self.id_fs_usage != 'filesystem':
+            LOGGER.error('ID_FS_USAGE is not "filesystem"')
+
+    def dump_variables(self):
+        LOGGER.error(f'    sys_path: {self.device.sys_path}')
+        LOGGER.error(f'    sys_name: {self.device.sys_name}')
+        LOGGER.error(f'    sys_number: {self.device.sys_number}')
+        LOGGER.error(f'    device_path: {self.device.device_path}')
+        LOGGER.error(f'    tags: {list(self.device.tags)}')
+        LOGGER.error(f'    subsystem: {self.device.subsystem}')
+        LOGGER.error(f'    driver: {self.device.driver}')
+        LOGGER.error(f'    device_type: {self.device.device_type}')
+        LOGGER.error(f'    device_node: {self.device.device_node}')
+        LOGGER.error(f'    device_number: {self.device.device_number}')
+        LOGGER.error(f'    device_links: {list(self.device.device_links)}')
+        LOGGER.error(f'    is_initialized: {self.device.is_initialized}')
+        LOGGER.error(f'    time_since_initialized: {self.device.time_since_initialized}')
+        for k, v in self.device.properties.items():
+            LOGGER.error(f'    {k}: {v}')
+
+    @property
+    def id_fs_uuid(self):
+        return self.device.properties.get('ID_FS_UUID')
+
+    @property
+    def id_fs_type(self):
+        return self.device.properties.get('ID_FS_TYPE')
+
+    @property
+    def id_fs_usage(self):
+        return self.device.properties.get('ID_FS_USAGE')
+
+    @property
+    def uuid_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-uuid/'):
+                return l
+
+    @property
+    def label_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-label/'):
+                return l
+
+    @property
+    def partuuid_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-partuuid/'):
+                return l
+
+    @property
+    def path_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-path/'):
+                return l
+
+    @property
+    def id_links(self):
+        return [l for l in self.device.device_links if l.startswith('/dev/disk/by-id')]
+
 if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
@@ -161,35 +235,51 @@ if __name__ == '__main__':

    os.makedirs(mount_at, exist_ok=True)

-    uuid = out(f'blkid -s UUID -o value {fsdev}')
-    if not uuid:
-        raise Exception(f'Failed to get UUID of {fsdev}')
+    udev_info = UdevInfo(fsdev)
+    mount_dev = None
+    if udev_info.uuid_link:
+        mount_dev = udev_info.uuid_link
+    else:
+        if udev_info.label_link:
+            mount_dev = udev_info.label_link
+            dev_type = 'label'
+        elif udev_info.partuuid_link:
+            mount_dev = udev_info.partuuid_link
+            dev_type = 'partuuid'
+        elif udev_info.path_link:
+            mount_dev = udev_info.path_link
+            dev_type = 'path'
+        elif udev_info.id_links:
+            mount_dev = udev_info.id_links[0]
+            dev_type = 'id'
+        else:
+            mount_dev = fsdev
+            dev_type = 'realpath'
+        LOGGER.error(f'Failed to detect uuid, using {dev_type}: {mount_dev}')

-    uuidpath = f'/dev/disk/by-uuid/{uuid}'
-
-    after = 'local-fs.target'
+    after = ''
    wants = ''
    if raid and args.raid_level != '0':
-        after += f' {md_service}'
-        wants = f'\nWants={md_service}'
+        after = wants = md_service  # the variable holding the md unit name, not the literal text 'md_service'
    opt_discard = ''
    if args.online_discard:
        opt_discard = ',discard'
    unit_data = f'''
 [Unit]
 Description=Scylla data directory
-Before=scylla-server.service
-After={after}{wants}
+Before=local-fs.target scylla-server.service
+After={after}
+Wants={wants}
 DefaultDependencies=no

 [Mount]
-What={uuidpath}
+What={mount_dev}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}

 [Install]
-WantedBy=multi-user.target
+WantedBy=local-fs.target
 '''[1:-1]
    with open(f'/etc/systemd/system/{mntunit_bn}', 'w') as f:
        f.write(unit_data)
@@ -209,10 +299,18 @@ WantedBy=multi-user.target
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError as e:
-        if not os.path.exists(uuidpath):
-            print(f'\nERROR: {uuidpath} is not found\n')
-        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
-            print(f'\nERROR: {uuidpath} is not block device\n')
+        if mount_dev != fsdev:
+            if not os.path.islink(mount_dev):
+                LOGGER.error(f'{mount_dev} is not found')
+            elif not os.path.exists(mount_dev):  # link exists but its target does not
+                LOGGER.error(f'{mount_dev} is broken link')
+        if not os.path.exists(fsdev):
+            LOGGER.error(f'{fsdev} is not found')
+        elif not stat.S_ISBLK(os.stat(fsdev).st_mode):  # stat only an existing path, so the original error is not masked
+            LOGGER.error(f'{fsdev} is not block device')
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
        raise e

    if args.enable_on_nextboot:
@@ -228,3 +326,8 @@ WantedBy=multi-user.target

    if is_debian_variant():
        run('update-initramfs -u', shell=True, check=True)
+
+    if not udev_info.uuid_link:
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -63,7 +63,6 @@ bcp "${packages[@]}" packages/

 bcp dist/docker/etc etc/
 bcp dist/docker/scylla-housekeeping-service.sh /scylla-housekeeping-service.sh
-bcp dist/docker/sshd-service.sh /sshd-service.sh

 bcp dist/docker/scyllasetup.py /scyllasetup.py
 bcp dist/docker/commandlineparser.py /commandlineparser.py
@@ -73,10 +72,11 @@ bcp dist/docker/scylla_bashrc /scylla_bashrc

 run apt-get -y clean expire-cache
 run apt-get -y update
+run apt-get -y upgrade
 run apt-get -y install dialog apt-utils
 run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections"
 run bash -ec "rm -rf /etc/rsyslog.conf"
-run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python2 python3 python3-yaml curl rsyslog sudo
+run apt-get -y install hostname supervisor openjdk-11-jre-headless python2 python3 python3-yaml curl rsyslog sudo
 run bash -ec "echo LANG=C.UTF-8 > /etc/default/locale"
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
--- a/dist/docker/etc/supervisord.conf.d/sshd-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/sshd-server.conf
@@ -1,6 +0,0 @@
-[program:sshd]
-command=/sshd-service.sh
-stdout_logfile=/dev/stdout
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
--- a/dist/docker/sshd-service.sh
+++ b/dist/docker/sshd-service.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-if [ ! -f /run/sshd ]; then
-  mkdir -p /run/sshd
-fi
-
-if [ ! -f /etc/ssh/ssh_host_ed25519_key ]; then
-    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N ''
-fi
-if [ ! -f /etc/ssh/ssh_host_rsa_key ]; then
-    ssh-keygen -t rsa -b 4096 -f /etc/ssh/ssh_host_rsa_key -N ''
-fi
-
-/usr/sbin/sshd -D
-
--- a/docs/dev/cdc.md
+++ b/docs/dev/cdc.md
@@ -42,7 +42,7 @@ namespace cdc {
        uint8_t sharding_ignore_msb;
    };
    class topology_description {
-        std::vector<token_range_description> _entries;
+        utils::chunked_vector<token_range_description> _entries;
 public:
        ... methods ...
    };
@@ -158,9 +158,27 @@ We're not able to prevent a node learning about a new generation too late due to
 However, it could happen that a node learns about the generation from gossip in time, but then won't be able to extract it from `cdc_generation_descriptions_v2`. In that case we can still maintain consistency: the node will remember that there is a new generation even though it doesn't yet know what it is (it knows only the ID, in particular it knows the timestamp) using the `cdc::metadata::prepare(db_clock::time_point)` method, and then _reject_ writes for CDC-enabled tables that are supposed to use this new generation. The node will keep trying to read the generation's data in background until it succeeds or sees that it's not necessary anymore (e.g. because the generation was already superseded by a new generation).
 Thus we give up availability for safety. This likely won't happen if the administrator ensures that the cluster is not partitioned before bootstrapping a new node. This problem will also be mitigated with a future patch.

-Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps.
-Suppose that a write is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`). Then we only allow the write if its timestamp is in the interval [`T`, `C + generation_leeway`), where `generation_leeway` is a small time-inteval constant (e.g. 5 seconds).
-Reason: we cannot allow writes before `T`, because they belong to the old generation whose token ranges might no longer refine the current vnodes, so the corresponding log write would not necessarily be colocated with the base write. We also cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+#### Generation switching: accepting writes
+
+Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps. We allow:
+- writes to the current and next generations unless they are too far into the future,
+- writes to the previous generations unless they are too far into the past.
+
+##### Writes to the current and next generations
+
+Suppose that a write with timestamp `W` is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`) such that `T <= W`. Then we only allow the write if `W < C + generation_leeway`, where `generation_leeway` is a small time-interval constant (e.g. 5 seconds).
+
+We cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+
+##### Writes to the previous generations
+
+This time suppose that `T > W`. Then we only allow the write if `W > C - generation_leeway` and there was a generation operating at `W`.
+
+We allow writes to previous generations to improve user experience. If a client generates timestamps by itself and clocks are not perfectly synchronized, there may be short periods of time around the moment of switching generations when client's writes are rejected because they fall into one of the previous generations. Usually, this problem is easy to overcome by the client. It can simply repeat a write a few times, but using a higher timestamp. Unfortunately, if a table additionally uses LWT, the client cannot increase the timestamp because LWT makes timestamps permanent. Once Paxos commits an entry with a given timestamp, Scylla will keep trying to apply that entry until it succeeds, with the same timestamp. Applying the entry involves doing a CDC log table write. If it fails, we are stuck. Allowing writes to the previous generations is also a probabilistic fix for this bug.
+
+Note that writing only to the previous generation might not be enough. With the Raft-based topology and tablets, we can add multiple nodes almost instantly. Then, we can have multiple generations with almost identical timestamps.
+
+We allow writes only to the recent past to reduce the number of generations that must be stored in memory.

 ### Streams description tables

--- a/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
+++ b/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
@@ -7,10 +7,11 @@

 .. Note:: 

-   If ``authenticator`` is set to ``PasswordAuthenticator`` - increase the replication factor of the ``system_auth`` keyspace.
-
-   For example:
-
+   If ``authenticator`` is set to ``PasswordAuthenticator``, increase the replication factor of the ``system_auth`` keyspace.
+   For example: 
+   
   ``ALTER KEYSPACE system_auth WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : <new_replication_factor>};``
+   
+   Ensure you run repair after you alter the keyspace. See :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`.

   It is recommended to set ``system_auth`` replication factor to the number of nodes in each DC.
--- a/docs/troubleshooting/debugging-large-partition.rst
+++ b/docs/troubleshooting/debugging-large-partition.rst
@@ -21,7 +21,7 @@ Any of the following:

  .. code-block:: none

-     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: PK[/CK[/COL]] (SIZE bytes) to SSTABLE_NAME
+     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: [COL] (SIZE bytes) to SSTABLE_NAME

  In this case, refer to :ref:`Troubleshooting Large Partition Tables <large-partition-table-configure>` for more information.

--- a/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
@@ -31,7 +31,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
@@ -58,9 +58,14 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
+
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
 .. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+   sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Stop ScyllaDB
 ---------------
@@ -122,29 +127,33 @@ Rollback Steps
 ==============
 Gracefully shutdown ScyllaDB
 -----------------------------
+
 .. code:: sh

   nodetool drain
- .. include:: /rst_include/scylla-commands-stop-index.rst
+   nodetool snapshot
+   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------
-#. Remove the old repo file.
+#. Restore the |SRC_VERSION| packages backed up during the upgrade.

    .. code:: sh

-       sudo rm -rf /etc/yum.repos.d/scylla.repo
+       sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+       sudo chown root:root /etc/yum.repos.d/scylla.repo
+       sudo chmod 644 /etc/yum.repos.d/scylla.repo

-#. Update the |SCYLLA_REPO|_  to |SRC_VERSION|.
 #. Install:

    .. code:: console

       sudo yum clean all
       sudo rm -rf /var/cache/yum
-       sudo yum remove scylla\\*tools-core
-       sudo yum downgrade scylla\\* -y
-       sudo yum install scylla
+       sudo yum downgrade scylla-\*cqlsh -y
+       sudo yum remove scylla-\*cqlsh -y
+       sudo yum downgrade scylla\* -y
+       sudo yum install scylla -y
     

 Restore the configuration file
@@ -153,18 +162,7 @@ Restore the configuration file
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for details.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ---------------------------------
@@ -182,4 +180,4 @@ Start the node

 Validate
 --------
-Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
+Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
--- a/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
@@ -34,7 +34,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending Scylla Manager (only available Scylla Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `Scylla Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `Scylla Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade steps
 =============
--- a/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
@@ -32,7 +32,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
@@ -60,9 +60,13 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet
 Backup the configuration file
 ------------------------------

+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
 .. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+   sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup

 Gracefully stop the node
 ------------------------
--- a/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
@@ -44,7 +44,6 @@ For each of the nodes you rollback to |SRC_VERSION|, you will:
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -59,17 +58,19 @@ Gracefully shutdown ScyllaDB
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------
-#. Remove the old repo file.
+#. Restore the |SRC_VERSION| packages backed up during the upgrade.

    .. code:: sh

-       sudo rm -rf /etc/apt/sources.list.d/scylla.list
+       sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+       sudo chown root:root /etc/apt/sources.list.d/scylla.list
+       sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-#. Update the |SCYLLA_REPO|_ to |SRC_VERSION|.
 #. Install:

    .. code-block::
@@ -85,18 +86,7 @@ Restore the configuration file
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/ami-upgrade.rst
+++ b/docs/upgrade/ami-upgrade.rst
@@ -2,13 +2,14 @@
 Upgrade ScyllaDB Image: EC2 AMI, GCP, and Azure Images
 ======================================================

-To upgrade ScyllaDB images, you need to update:
+ScyllaDB images are based on **Ubuntu 22.04**.

-#. ScyllaDB packages. Since ScyllaDB Open Source **5.2** and ScyllaDB 
-   Enterprise **2023.1**, the images are based on **Ubuntu 22.04**. 
-   See the :doc:`upgrade guide <./index>` for your ScyllaDB version 
-   for instructions for updating ScyllaDB packages on Ubuntu.
-#. Underlying OS packages. ScyllaDB includes a list of 3rd party and OS packages 
-   tested with the ScyllaDB release. 
+If you’re using the ScyllaDB official image (recommended), follow the upgrade 
+instructions on the **Debian/Ubuntu** tab in the :doc:`upgrade guide </upgrade/index/>`
+for your ScyllaDB version.
+
+If you’re using your own image and have installed ScyllaDB packages for Ubuntu or Debian, 
+follow the extended upgrade procedure on the **EC2/GCP/Azure Ubuntu image** tab 
+in the :doc:`upgrade guide </upgrade/index/>` for your ScyllaDB version.

 To check your Scylla version, run the ``scylla --version`` command.
--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/upgrade-guide-from-5.0-to-5.1-generic.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/upgrade-guide-from-5.0-to-5.1-generic.rst
@@ -90,9 +90,25 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
-.. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
+
+   .. group-tab:: RHEL/CentOS
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Gracefully stop the node
 ------------------------
@@ -190,7 +206,6 @@ For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -205,25 +220,24 @@ Drain and gracefully stop the node
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------

-..
-    TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
-
 .. tabs::

   .. group-tab:: Debian/Ubuntu

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
+               sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+               sudo chown root:root /etc/apt/sources.list.d/scylla.list
+               sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
        #. Install:

            .. code-block::
@@ -236,59 +250,31 @@ Download and install the old release

   .. group-tab:: RHEL/CentOS

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/yum.repos.d/scylla.repo
+               sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+               sudo chown root:root /etc/yum.repos.d/scylla.repo
+               sudo chmod 644 /etc/yum.repos.d/scylla.repo

-        #. Update the |SCYLLA_RPM_SRC_REPO|_  to |SRC_VERSION|.
        #. Install:

            .. code:: console

               sudo yum clean all
               sudo rm -rf /var/cache/yum
-               sudo yum remove scylla\\*tools-core
-               sudo yum downgrade scylla\\* -y
-               sudo yum install scylla
-
-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
-
-        #. Remove the old repo file.
-
-            .. code:: sh
-
-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
-
-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
-        #. Install:
-
-            .. code-block::
-
-               sudo apt-get update
-               sudo apt-get remove scylla\* -y
-               sudo apt-get install scylla
-
-        Answer ‘y’ to the first two questions.
+               sudo yum downgrade scylla-\*cqlsh -y
+               sudo yum remove scylla-\*cqlsh -y
+               sudo yum downgrade scylla\* -y
+               sudo yum install scylla -y

 Restore the configuration file
 ------------------------------
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic.rst
@@ -98,9 +98,25 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
-.. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
+
+   .. group-tab:: RHEL/CentOS
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Gracefully stop the node
 ------------------------
@@ -238,7 +254,6 @@ For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -253,25 +268,24 @@ Drain and gracefully stop the node
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------

-..
-    TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
-
 .. tabs::

   .. group-tab:: Debian/Ubuntu

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
+               sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+               sudo chown root.root /etc/apt/sources.list.d/scylla.list
+               sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
        #. Install:

            .. code-block::
@@ -284,59 +298,32 @@ Download and install the old release

   .. group-tab:: RHEL/CentOS

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/yum.repos.d/scylla.repo
+               sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+               sudo chown root.root /etc/yum.repos.d/scylla.repo
+               sudo chmod 644 /etc/yum.repos.d/scylla.repo

-        #. Update the |SCYLLA_RPM_SRC_REPO|_  to |SRC_VERSION|.
        #. Install:

            .. code:: console

               sudo yum clean all
               sudo rm -rf /var/cache/yum
-               sudo yum remove scylla\\*tools-core
-               sudo yum downgrade scylla\\* -y
-               sudo yum install scylla
+               sudo yum downgrade scylla-\*cqlsh -y
+               sudo yum remove scylla-\*cqlsh -y
+               sudo yum downgrade scylla\* -y
+               sudo yum install scylla -y

-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
-
-        #. Remove the old repo file.
-
-            .. code:: sh
-
-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
-
-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
-        #. Install:
-
-            .. code-block::
-
-               sudo apt-get update
-               sudo apt-get remove scylla\* -y
-               sudo apt-get install scylla
-
-        Answer ‘y’ to the first two questions.

 Restore the configuration file
 ------------------------------
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
+++ b/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
@@ -167,54 +167,27 @@ Download and install the new release

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

-        Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+      Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.

-        There are two alternative upgrade procedures: upgrading ScyllaDB and simultaneously updating 3rd party and OS packages - recommended if you 
-        are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04, and upgrading ScyllaDB without updating 
-        any external packages.
+      If you’re using the ScyllaDB official image (recommended), see
+      the **Debian/Ubuntu** tab for upgrade instructions. If you’re using your
+      own image and have installed ScyllaDB packages for Ubuntu or Debian,
+      you need to apply an extended upgrade procedure:
+      
+      #. Update the ScyllaDB deb repo (see above).
+      #. Configure Java 1.8 (see above).
+      #. Install the new ScyllaDB version with the additional 
+         ``scylla-enterprise-machine-image`` package:

-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-enterprise-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB Enterprise version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB Enterprise packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-enterprise-packages-2022.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-enterprise-packages-<version>-<arch>.txt) -y``
-
-
-
-        To upgrade ScyllaDB without updating any external packages, follow the :ref:`download and installation instructions for Debian/Ubuntu <upgrade-debian-ubuntu-5.2-to-enterprise-2023.1>`.
+          .. code::
+         
+           sudo apt-get clean all
+           sudo apt-get update
+           sudo apt-get dist-upgrade scylla-enterprise
+           sudo apt-get dist-upgrade scylla-enterprise-machine-image

+      #. Run ``scylla_setup`` without running ``io_setup``.
+      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

 Start the node
 --------------
--- a/docs/using-scylla/cdc/cdc-stream-generations.rst
+++ b/docs/using-scylla/cdc/cdc-stream-generations.rst
@@ -124,58 +124,9 @@ Example: The Next Generation

   There are two entries with the same base partition key, but in different streams. One of them corresponds to the write made before the generation change, the other --- to the write made after the change.

-After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs. If you try to perform a write with a timestamp that is smaller than the new generation's timestamp, the write may be rejected, depending on the node you're connected to:
+After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs.

-* if the clock of the node you're connected to reports earlier time than the generation's timestamp, it will allow the write to be performed.
-* Otherwise, the write will be rejected.
-
-Therefore, if you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected while a new node is being bootstrapped.
-
-Example: rejecting writes to an old generation
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This is a continuation of the :ref:`previous example <next-gen>`; a second node was bootstrapped recently, thus a new generation superseded the previous one.
-
-#. Get the timestamp of the latest generation as an integer:
-
-   .. code-block:: cql
-
-    SELECT tounixtimestamp(time) FROM system_distributed.cdc_generation_timestamps WHERE key = 'timestamps';
-
-   result:
-
-   .. code-block:: none
-
-     system.tounixtimestamp(time)
-    ------------------------------
-                    1585152329484
-                    1585140283006
-
-    (2 rows)
-
-   Generation timestamps have millisecond resolution. Here, the latest generation's timestamp is equal to ``1585152329484`` milliseconds.
-
-#. Try to perform a write with a slightly smaller timestamp (remember that the ``USING TIMESTAMP`` clause expects a timestamp in **microseconds**):
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329483000;
-
-   result:
-
-   .. code-block:: none
-
-    InvalidRequest: Error from server: code=2200 [Invalid query] message="cdc: attempted to get a stream from an earlier generation than the currently used one. With CDC you cannot send writes with timestamps too far into the past, because that would break consistency properties (write timestamp: 2020/03/25 16:05:29, current generation started at: 2020/03/25 16:05:29)"
-
-   The write was rejected.
-
-#. Perform a write with a timestamp equal to the generation's timestamp:
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329484000;
-
-   The write succeeds.
+If the clock of the node you're connected to reports time distant from the write's timestamp, it may reject the write. If you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected.

 The first generation's timestamp
 --------------------------------
--- a/docs/using-scylla/drivers/index.rst
+++ b/docs/using-scylla/drivers/index.rst
@@ -14,7 +14,7 @@ Scylla Drivers
 You can use Scylla with:

 * :doc:`Apache Cassandra CQL Compatible Drivers <cql-drivers/index>`
-* :doc:`AWS DynamoDB Compatible API Drivers <dynamo-drivers/index>`
+* :doc:`Amazon DynamoDB Compatible API Drivers <dynamo-drivers/index>`

 Additional drivers coming soon!

--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -702,21 +702,24 @@ future<> gossiper::do_status_check() {

    auto now = this->now();

-    for (auto it = _endpoint_state_map.begin(); it != _endpoint_state_map.end();) {
-        auto endpoint = it->first;
-        auto& ep_state = it->second;
-        it++;
-
-        bool is_alive = ep_state.is_alive();
+    for (const auto& endpoint : get_endpoints()) {
        if (endpoint == get_broadcast_address()) {
            continue;
        }
+        auto eps = get_endpoint_state_for_endpoint_ptr(endpoint);
+        if (!eps) {
+            continue;
+        }
+        auto& ep_state = *eps;
+        bool is_alive = ep_state.is_alive();
+        auto update_timestamp = ep_state.get_update_timestamp();
+        // ep_state cannot be used after yielding

        // check if this is a fat client. fat clients are removed automatically from
        // gossip after FatClientTimeout.  Do not remove dead states here.
        if (is_gossip_only_member(endpoint)
            && !_just_removed_endpoints.contains(endpoint)
-            && ((now - ep_state.get_update_timestamp()) > fat_client_timeout)) {
+            && ((now - update_timestamp) > fat_client_timeout)) {
            logger.info("FatClient {} has been silent for {}ms, removing from gossip", endpoint, fat_client_timeout.count());
            co_await remove_endpoint(endpoint); // will put it in _just_removed_endpoints to respect quarantine delay
            co_await evict_from_membership(endpoint); // can get rid of the state immediately
@@ -1381,6 +1384,10 @@ const std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpo
    return _endpoint_state_map;
 }

+std::vector<inet_address> gossiper::get_endpoints() const {
+    return boost::copy_range<std::vector<inet_address>>(_endpoint_state_map | boost::adaptors::map_keys);
+}
+
 bool gossiper::uses_host_id(inet_address endpoint) const {
    return _messaging.knows_version(endpoint) ||
            get_application_state_ptr(endpoint, application_state::NET_VERSION);
@@ -1510,10 +1517,7 @@ void gossiper::update_timestamp_for_nodes(const std::map<inet_address, endpoint_
 }

 void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
-    // if (MessagingService.instance().getVersion(addr) < MessagingService.VERSION_20) {
-    //     real_mark_alive(addr, local_state);
-    //     return;
-    // }
+    // Enter the _background_msg gate so stop() would wait on it
    auto inserted = _pending_mark_alive_endpoints.insert(addr).second;
    if (inserted) {
        // The node is not in the _pending_mark_alive_endpoints
@@ -1524,11 +1528,17 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
        return;
    }

+    // unmark addr as pending on exception or after background continuation completes
+    auto unmark_pending = deferred_action([this, addr, g = shared_from_this()] () noexcept {
+        _pending_mark_alive_endpoints.erase(addr);
+    });
+
    local_state.mark_dead();
    msg_addr id = get_msg_addr(addr);
    int64_t generation = _endpoint_state_map[get_broadcast_address()].get_heart_beat_state().get_generation();
+    // Enter the _background_msg gate so stop() would wait on it
+    auto gh = _background_msg.hold();
    logger.debug("Sending a EchoMessage to {}, with generation_number={}", id, generation);
-    // Do it in the background.
    (void)_messaging.send_gossip_echo(id, generation, std::chrono::milliseconds(15000)).then([this, addr] {
        logger.trace("Got EchoMessage Reply");
        // After sending echo message, the Node might not be in the
@@ -1543,9 +1553,7 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
            return real_mark_alive(addr, state);
        }
        return make_ready_future();
-    }).finally([this, addr] {
-        _pending_mark_alive_endpoints.erase(addr);
-    }).handle_exception([addr] (auto ep) {
+    }).handle_exception([addr, gh = std::move(gh), unmark_pending = std::move(unmark_pending)] (auto ep) {
        logger.warn("Fail to send EchoMessage to {}: {}", addr, ep);
    });
 }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -399,6 +399,8 @@ public:

    const std::unordered_map<inet_address, endpoint_state>& get_endpoint_states() const noexcept;

+    std::vector<inet_address> get_endpoints() const;
+
    bool uses_host_id(inet_address endpoint) const;

    locator::host_id get_host_id(inet_address endpoint) const;
--- a/gms/version_generator.cc
+++ b/gms/version_generator.cc
@@ -8,6 +8,11 @@
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */

+#include <seastar/core/on_internal_error.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/smp.hh>
+#include "log.hh"
+#include "seastarx.hh"
 #include "version_generator.hh"

 namespace gms {
@@ -16,8 +21,15 @@ namespace version_generator {
 // For us, we run the gossiper on a single CPU, and don't need to use atomics.
 static int version = 0;

+static logging::logger logger("version_generator");
+
 int get_next_version() noexcept
 {
+    if (this_shard_id() != 0) [[unlikely]] {
+        on_fatal_internal_error(logger, format(
+                "{} can only be called on shard 0, but it was called on shard {}",
+                __FUNCTION__, this_shard_id()));
+    }
    return ++version;
 }

--- a/idl/storage_proxy.idl.hh
+++ b/idl/storage_proxy.idl.hh
@@ -22,7 +22,7 @@
 #include "idl/keys.idl.hh"
 #include "idl/uuid.idl.hh"

-verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional<tracing::trace_info> trace_info [[version 1.3.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]);
+verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm [[ref]], inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional<tracing::trace_info> trace_info [[version 1.3.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]);
 verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t response_id, db::view::update_backlog backlog [[version 3.1.0]]);
 verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]);
 verb [[with_client_info, with_timeout]] counter_mutation (std::vector<frozen_mutation> fms, db::consistency_level cl, std::optional<tracing::trace_info> trace_info);
--- a/install-dependencies.sh
+++ b/install-dependencies.sh
@@ -123,6 +123,7 @@ fedora_python3_packages=(
    python3-distro
    python3-click
    python3-six
+    python3-pyudev
 )

 pip_packages=(
@@ -174,11 +175,11 @@ arch_packages=(
    thrift
 )

-NODE_EXPORTER_VERSION=1.4.0
+NODE_EXPORTER_VERSION=1.7.0
 declare -A NODE_EXPORTER_CHECKSUM=(
-    ["x86_64"]=e77ff1b0a824a4e13f82a35d98595fe526849c09e3480d0789a56b72242d2abc
-    ["aarch64"]=0b20aa75385a42857a67ee5f6c7f67b229039a22a49c5c61c33f071356415b59
-    ["s390x"]=a98e2aa5f9e557441190d233ba752c0cae28f3130c6a6742b038f3997d034065
+    ["x86_64"]=a550cd5c05f760b7934a2d0afad66d2e92e681482f5f57a917465b1fba3b02a6
+    ["aarch64"]=e386c7b53bc130eaf5e74da28efc6b444857b77df8070537be52678aefd34d96
+    ["s390x"]=aeda68884918f10b135b76bbcd4977cb7a1bb3c4c98a8551f8d2183bafdd9264
 )
 declare -A NODE_EXPORTER_ARCH=(
    ["x86_64"]=amd64
--- a/main.cc
+++ b/main.cc
@@ -958,7 +958,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            supervisor::notify("starting direct failure detector service");
            fd.start(
                std::ref(fd_pinger), std::ref(fd_clock),
-                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count()).get();

            auto stop_fd = defer_verbose_shutdown("direct_failure_detector", [] {
                fd.stop().get();
@@ -1164,12 +1165,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            }).get();
            cfg->host_id = sys_ks.local().load_local_host_id().get0();

+            std::any stop_raft_api;
            if (raft_gr.local().is_enabled()) {
                auto my_raft_id = raft::server_id{cfg->host_id.uuid()};
                supervisor::notify("starting Raft Group Registry service");
                raft_gr.invoke_on_all([my_raft_id] (service::raft_group_registry& raft_gr) {
                    return raft_gr.start(my_raft_id);
                }).get();
+
+                api::set_server_raft(ctx, raft_gr).get();
+                stop_raft_api = defer_verbose_shutdown("Raft API", [&ctx] {
+                    api::unset_server_raft(ctx).get();
+                });
            } else {
                if (cfg->check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
                    startlog.error("Bad configuration: RAFT feature has to be enabled if BROADCAST_TABLES is enabled");
@@ -1177,7 +1184,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                }
            }

-
            group0_client.init().get();

            db::sstables_format_selector sst_format_selector(gossiper.local(), feature_service, db);
@@ -1710,6 +1716,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                ss.local().drain_on_shutdown().get();
            });

+            // Signal shutdown to the forward service before draining the messaging service.
+            auto shutdown_forward_service = defer_verbose_shutdown("forward service", [&forward_service] {
+                forward_service.invoke_on_all(&service::forward_service::shutdown).get();
+            });
+
            auto drain_view_builder = defer_verbose_shutdown("view builder ops", [cfg] {
                if (cfg->view_building()) {
                    view_builder.invoke_on_all(&db::view::view_builder::drain).get();
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -213,7 +213,7 @@ public:
            tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout)
            : _db(db)
            , _schema(std::move(s))
-            , _permit(_db.local().get_reader_concurrency_semaphore().make_tracking_only_permit(_schema.get(), "multishard-mutation-query", timeout))
+            , _permit(_db.local().get_reader_concurrency_semaphore().make_tracking_only_permit(_schema, "multishard-mutation-query", timeout))
            , _cmd(cmd)
            , _ranges(ranges)
            , _trace_state(std::move(trace_state))
--- a/mutation_fragment_stream_validator.hh
+++ b/mutation_fragment_stream_validator.hh
@@ -169,11 +169,11 @@ class mutation_fragment_stream_validating_filter {
    sstring _name_storage;
    std::string_view _name_view; // always valid
    mutation_fragment_stream_validation_level _validation_level;
+    bool _raise_errors;

 private:
-    sstring full_name() const;
-
-    mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s, mutation_fragment_stream_validation_level level);
+    mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s,
+            mutation_fragment_stream_validation_level level, bool raise_errors);

 public:
    /// Constructor.
@@ -181,12 +181,18 @@ public:
    /// \arg name is used in log messages to identify the validator, the
    ///     schema identity is added automatically
    /// \arg compare_keys enable validating clustering key monotonicity
-    mutation_fragment_stream_validating_filter(sstring name, const schema& s, mutation_fragment_stream_validation_level level);
-    mutation_fragment_stream_validating_filter(const char* name, const schema& s, mutation_fragment_stream_validation_level level);
+    mutation_fragment_stream_validating_filter(sstring name, const schema& s, mutation_fragment_stream_validation_level level, bool raise_errors = true);
+    mutation_fragment_stream_validating_filter(const char* name, const schema& s, mutation_fragment_stream_validation_level level, bool raise_errors = true);

    mutation_fragment_stream_validating_filter(mutation_fragment_stream_validating_filter&&) = delete;
    mutation_fragment_stream_validating_filter(const mutation_fragment_stream_validating_filter&) = delete;

+    sstring full_name() const;
+
+    bool raise_errors() const { return _raise_errors; }
+
+    const mutation_fragment_stream_validator& validator() const { return  _validator; }
+
    bool operator()(const dht::decorated_key& dk);
    bool operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos, std::optional<tombstone> new_current_tombstone);
    bool operator()(mutation_fragment::kind kind, position_in_partition_view pos);
@@ -197,5 +203,5 @@ public:
    void reset(const mutation_fragment_v2& mf);
    /// Equivalent to `operator()(partition_end{})`
    bool on_end_of_partition();
-    void on_end_of_stream();
+    bool on_end_of_stream();
 };
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -2159,7 +2159,7 @@ stop_iteration reconcilable_result_builder::consume(static_row&& sr, tombstone,

 stop_iteration reconcilable_result_builder::consume(clustering_row&& cr, row_tombstone, bool is_alive) {
    if (_rt_assembler.needs_flush()) {
-        if (auto rt_opt = _rt_assembler.flush(_schema, position_in_partition::after_key(_schema, cr.key()))) {
+        if (auto rt_opt = _rt_assembler.flush(*_query_schema, position_in_partition::after_key(_schema, cr.key()))) {
            consume(std::move(*rt_opt));
        }
    }
@@ -2186,7 +2186,7 @@ stop_iteration reconcilable_result_builder::consume(range_tombstone&& rt) {
 }

 stop_iteration reconcilable_result_builder::consume(range_tombstone_change&& rtc) {
-    if (auto rt_opt = _rt_assembler.consume(_schema, std::move(rtc))) {
+    if (auto rt_opt = _rt_assembler.consume(*_query_schema, std::move(rtc))) {
        return consume(std::move(*rt_opt));
    }
    return stop_iteration::no;
--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -126,6 +126,7 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;
    bool _reversed;
+    schema_ptr _query_schema;

    bool _return_static_content_on_partition_with_no_rows{};
    bool _static_row_is_alive{};
@@ -147,6 +148,7 @@ public:
    reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
                                query::result_memory_accounter&& accounter) noexcept
        : _schema(s), _slice(slice), _reversed(_slice.options.contains(query::partition_slice::option::reversed))
+        , _query_schema(_reversed ? _schema.make_reversed() : _schema.shared_from_this())
        , _memory_accounter(std::move(accounter))
    { }

--- a/mutation_writer/multishard_writer.cc
+++ b/mutation_writer/multishard_writer.cc
@@ -113,7 +113,7 @@ future<> multishard_writer::make_shard_writer(unsigned shard) {
            reader = make_foreign(std::make_unique<flat_mutation_reader_v2>(std::move(reader)))] () mutable {
        auto s = gs.get();
        auto semaphore = std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "shard_writer");
-        auto permit = semaphore->make_tracking_only_permit(s.get(), "multishard-writer", db::no_timeout);
+        auto permit = semaphore->make_tracking_only_permit(s, "multishard-writer", db::no_timeout);
        auto this_shard_reader = make_foreign_reader(s, std::move(permit), std::move(reader));
        return make_foreign(std::make_unique<shard_writer>(gs.get(), std::move(semaphore), std::move(this_shard_reader), consumer));
    }).then([this, shard] (foreign_ptr<std::unique_ptr<shard_writer>> writer) {
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -315,14 +315,19 @@ public:
            }
        }

+    // Strong exception guarantees.
    position_in_partition& operator=(position_in_partition_view view) {
-        _type = view._type;
-        _bound_weight = view._bound_weight;
+        // The copy assignment to _ck can throw (because it allocates),
+        // but assignments to _type and _bound_weight can't throw.
+        // Thus, to achieve strong exception guarantees,
+        // we only need to perform the _ck assignment before others.
        if (view._ck) {
            _ck = *view._ck;
        } else {
            _ck.reset();
        }
+        _type = view._type;
+        _bound_weight = view._bound_weight;
        return *this;
    }

--- a/query-result-writer.hh
+++ b/query-result-writer.hh
@@ -136,9 +136,9 @@ public:
            return stop_iteration::no;
        }
        if (!_slice.options.contains<partition_slice::option::allow_short_read>()) {
-            throw std::runtime_error(fmt::format(
-                    "Tombstones processed by unpaged query exceeds limit of {} (configured via query_tombstone_page_limit)",
-                    _tombstone_limit));
+            // The read is unpaged, we cannot interrupt it early without failing it.
+            // Better let it continue.
+            return stop_iteration::no;
        }
        return stop_iteration::yes;
    }
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -19,9 +19,10 @@ leader::~leader() {
 }

 fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
-        index_t commit_idx, failure_detector& failure_detector, fsm_config config) :
+        index_t commit_idx, failure_detector& failure_detector, fsm_config config,
+        seastar::condition_variable& sm_events) :
        _my_id(id), _current_term(current_term), _voted_for(voted_for),
-        _log(std::move(log)), _failure_detector(failure_detector), _config(config) {
+        _log(std::move(log)), _failure_detector(failure_detector), _config(config), _sm_events(sm_events) {
    if (id == raft::server_id{}) {
        throw std::invalid_argument("raft::fsm: raft instance cannot have id zero");
    }
@@ -41,10 +42,6 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
    }
 }

-fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
-        failure_detector& failure_detector, fsm_config config) :
-        fsm(id, current_term, voted_for, std::move(log), index_t{0}, failure_detector, config) {}
-
 future<semaphore_units<>> fsm::wait_for_memory_permit(seastar::abort_source* as, size_t size) {
    check_is_leader();

@@ -296,20 +293,14 @@ void fsm::become_candidate(bool is_prevote, bool is_leadership_transfer) {
    }
 }

-future<fsm_output> fsm::poll_output() {
-    logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
+bool fsm::has_output() const {
+    logger.trace("fsm::has_output() {} stable index: {} last index: {}",
        _my_id, _log.stable_idx(), _log.last_idx());

-    while (true) {
-        auto diff = _log.last_idx() - _log.stable_idx();
+    auto diff = _log.last_idx() - _log.stable_idx();

-        if (diff > 0 || !_messages.empty() || !_observed.is_equal(*this) || _output.max_read_id_with_quorum ||
-                (is_leader() && leader_state().last_read_id_changed) || _output.snp || !_output.snps_to_drop.empty()) {
-            break;
-        }
-        co_await _sm_events.wait();
-    }
-    co_return get_output();
+    return diff > 0 || !_messages.empty() || !_observed.is_equal(*this) || _output.max_read_id_with_quorum
+        || (is_leader() && leader_state().last_read_id_changed) || _output.snp || !_output.snps_to_drop.empty();
 }

 fsm_output fsm::get_output() {
@@ -1019,7 +1010,7 @@ bool fsm::apply_snapshot(snapshot_descriptor snp, size_t max_trailing_entries, s
    // If the snapshot is local, _commit_idx is larger than snp.idx.
    // Otherwise snp.idx becomes the new commit index.
    _commit_idx = std::max(_commit_idx, snp.idx);
-    _output.snp.emplace(fsm_output::applied_snapshot{snp, local});
+    _output.snp.emplace(fsm_output::applied_snapshot{snp, local, max_trailing_entries});
    size_t units = _log.apply_snapshot(std::move(snp), max_trailing_entries, max_trailing_bytes);
    if (is_leader()) {
        logger.trace("apply_snapshot[{}]: signal {} available units", _my_id, units);
@@ -1132,7 +1123,6 @@ void fsm::stop() {
        // (in particular, abort waits on log_limiter_semaphore and prevent new ones).
        become_follower({});
    }
-    _sm_events.broken();
 }

 std::ostream& operator<<(std::ostream& os, const fsm& f) {
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -21,6 +21,11 @@ struct fsm_output {
    struct applied_snapshot {
        snapshot_descriptor snp;
        bool is_local;
+
+        // Always 0 for non-local snapshots.
+        size_t max_trailing_entries;
+
+        // FIXME: include max_trailing_bytes here and in store_snapshot_descriptor
    };
    std::optional<std::pair<term_t, server_id>> term_and_vote;
    std::vector<log_entry_ptr> log_entries;
@@ -36,14 +41,6 @@ struct fsm_output {
    std::optional<read_id> max_read_id_with_quorum;
    // Set to true if a leadership transfer was aborted since the last output
    bool abort_leadership_transfer;
-
-    // True if there is no new output
-    bool empty() const {
-        return !term_and_vote &&
-            log_entries.size() == 0 && messages.size() == 0 &&
-            committed.size() == 0 && !snp && snps_to_drop.empty() &&
-            !configuration;
-    }
 };

 struct fsm_config {
@@ -136,9 +133,13 @@ struct leader {
 // in-memory state machine with a catch-all API step(message)
 // method. The method handles any kind of input and performs the
 // needed state machine state transitions. To get state machine output
-// poll_output() function has to be called. This call produces an output
+// get_output() function has to be called. To check first if
+// any new output is present, call has_output(). To wait for new
+// output events, use the sm_events condition variable passed
+// to fsm constructor; fsm signals it each time new output may appear.
+// The get_output() call produces an output
 // object, which encapsulates a list of actions that must be
-// performed until the next poll_output() call can be made. The time is
+// performed until the next get_output() call can be made. The time is
 // represented with a logical timer. The client is responsible for
 // periodically invoking tick() method, which advances the state
 // machine time and allows it to track such events as election or
@@ -226,7 +227,7 @@ private:
    std::vector<std::pair<server_id, rpc_message>> _messages;

    // Signaled when there is a IO event to process.
-    seastar::condition_variable _sm_events;
+    seastar::condition_variable& _sm_events;

    // Called when one of the replicas advances its match index
    // so it may be the case that some entries are committed now.
@@ -338,10 +339,8 @@ protected: // For testing

 public:
    explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
-            index_t commit_idx, failure_detector& failure_detector, fsm_config conf);
-
-    explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
-            failure_detector& failure_detector, fsm_config conf);
+            index_t commit_idx, failure_detector& failure_detector, fsm_config conf,
+            seastar::condition_variable& sm_events);

    bool is_leader() const {
        return std::holds_alternative<leader>(_state);
@@ -409,12 +408,9 @@ public:
    // committed to the persistent Raft log afterwards.
    template<typename T> const log_entry& add_entry(T command);

-    // Wait until there is, and return state machine output that
-    // needs to be handled.
-    // This includes a list of the entries that need
-    // to be logged. The logged entries are eventually
-    // discarded from the state machine after applying a snapshot.
-    future<fsm_output> poll_output();
+    // Check if there is any state machine output
+    // that `get_output()` will return.
+    bool has_output() const;

    // Get state machine output, if there is any. Doesn't
    // wait. It is public for use in testing.
@@ -427,7 +423,7 @@ public:

    // Feed one Raft RPC message into the state machine.
    // Advances the state machine state and generates output,
-    // accessible via poll_output().
+    // accessible via get_output().
    template <typename Message>
    void step(server_id from, Message&& msg);

--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -755,6 +755,18 @@ public:
    // apply call 'state_machine::load_snapshot(snapshot::id)'
    // Called during Raft server initialization only, should not
    // run in parallel with store.
+    //
+    // If you want to create a Raft cluster with a non-empty state
+    // machine, so that joining servers always receive a snapshot,
+    // you should:
+    // - make sure that members of the initial configuration have
+    //   the same state machine state,
+    // - set the initial snapshot index on members of the initial
+    //   configuration to 1,
+    // - set the initial snapshot index on all subsequently joining
+    //   servers to 0.
+    // This also works if you start with an empty state machine,
+    // so consider it as the go-to default.
    virtual future<snapshot_descriptor> load_snapshot_descriptor() = 0;

    // Persist given log entries.
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -98,6 +98,8 @@ public:
    future<entry_id> add_entry_on_leader(command command, seastar::abort_source* as);
    void register_metrics() override;
 private:
+    seastar::condition_variable _events;
+
    std::unique_ptr<rpc> _rpc;
    std::unique_ptr<state_machine> _state_machine;
    std::unique_ptr<persistence> _persistence;
@@ -112,6 +114,8 @@ private:
    std::optional<awaited_conf_change> _non_joint_conf_commit_promise;
    // Index of the last entry applied to `_state_machine`.
    index_t _applied_idx;
+    // Index of the last persisted snapshot descriptor.
+    index_t _snapshot_desc_idx;
    std::list<active_read> _reads;
    std::multimap<index_t, awaited_index> _awaited_indexes;

@@ -121,13 +125,20 @@ private:
    // Signaled when apply index is changed
    condition_variable _applied_index_changed;

+    // Signaled when _snapshot_desc_idx is changed
+    condition_variable _snapshot_desc_idx_changed;
+
    struct stop_apply_fiber{}; // exception to send when apply fiber is needs to be stopepd

    struct removed_from_config{}; // sent to applier_fiber when we're not a leader and we're outside the current configuration
+
+    struct trigger_snapshot_msg{};
+
    using applier_fiber_message = std::variant<
        std::vector<log_entry_ptr>,
        snapshot_descriptor,
-        removed_from_config>;
+        removed_from_config,
+        trigger_snapshot_msg>;
    queue<applier_fiber_message> _apply_entries = queue<applier_fiber_message>(10);

    struct stats {
@@ -201,6 +212,16 @@ private:
    };
    absl::flat_hash_map<server_id, append_request_queue> _append_request_status;

+    struct server_requests {
+        bool snapshot = false;
+
+        bool empty() const {
+            return !snapshot;
+        }
+    };
+
+    server_requests _new_server_requests;
+
    // Called to commit entries (on a leader or otherwise).
    void notify_waiters(std::map<index_t, op_status>& waiters, const std::vector<log_entry_ptr>& entries);

@@ -212,10 +233,15 @@ private:
    // to be applied.
    void signal_applied();

-    // This fiber processes FSM output by doing the following steps in order:
+    // Processes FSM output by doing the following steps in order:
    //  - persist the current term and vote
    //  - persist unstable log entries on disk.
    //  - send out messages
+    future<> process_fsm_output(index_t& stable_idx, fsm_output&&);
+
+    future<> process_server_requests(server_requests&&);
+
+    // Processes new FSM outputs and server requests as they appear.
    future<> io_fiber(index_t stable_idx);

    // This fiber runs in the background and applies committed entries.
@@ -265,6 +291,8 @@ private:
    // A helper to wait for a leader to get elected
    future<> wait_for_leader(seastar::abort_source* as);

+    virtual future<bool> trigger_snapshot(seastar::abort_source* as) override;
+
    // Get "safe to read" index from a leader
    future<read_barrier_reply> get_read_idx(server_id leader, seastar::abort_source* as);
    // Wait for an entry with a specific term to get committed or
@@ -337,12 +365,14 @@ future<> server_impl::start() {
                                     .append_request_threshold = _config.append_request_threshold,
                                     .max_log_size = _config.max_log_size,
                                     .enable_prevoting = _config.enable_prevoting
-                                 });
+                                 },
+                                 _events);

    _applied_idx = index_t{0};
+    _snapshot_desc_idx = index_t{0};
    if (snapshot.id) {
        co_await _state_machine->load_snapshot(snapshot.id);
-        _applied_idx = snapshot.idx;
+        _snapshot_desc_idx = _applied_idx = snapshot.idx;
    }

    if (!rpc_config.current.empty()) {
@@ -403,6 +433,54 @@ future<> server_impl::wait_for_leader(seastar::abort_source* as) {
    }
 }

+future<bool> server_impl::trigger_snapshot(seastar::abort_source* as) {
+    check_not_aborted();
+
+    if (_applied_idx <= _snapshot_desc_idx) {
+        logger.debug(
+            "[{}] trigger_snapshot: last persisted snapshot descriptor index is up-to-date"
+            ", applied index: {}, persisted snapshot descriptor index: {}, last fsm log index: {}"
+            ", last fsm snapshot index: {}", _id, _applied_idx, _snapshot_desc_idx,
+            _fsm->log_last_idx(), _fsm->log_last_snapshot_idx());
+        co_return false;
+    }
+
+    _new_server_requests.snapshot = true;
+    _events.signal();
+
+    // Wait for persisted snapshot index to catch up to this index.
+    auto awaited_idx = _applied_idx;
+
+    logger.debug("[{}] snapshot request waiting for index {}", _id, awaited_idx);
+
+    try {
+        optimized_optional<abort_source::subscription> sub;
+        if (as) {
+            as->check();
+            sub = as->subscribe([this] () noexcept { _snapshot_desc_idx_changed.broadcast(); });
+            assert(sub); // due to `check()` above
+        }
+        co_await _snapshot_desc_idx_changed.when([this, as, awaited_idx] {
+            return (as && as->abort_requested()) || awaited_idx <= _snapshot_desc_idx;
+        });
+        if (as) {
+            as->check();
+        }
+    } catch (abort_requested_exception&) {
+        throw request_aborted();
+    } catch (seastar::broken_condition_variable&) {
+        throw request_aborted();
+    }
+
+    logger.debug(
+        "[{}] snapshot request satisfied, awaited index {}, persisted snapshot descriptor index: {}"
+        ", current applied index {}, last fsm log index {}, last fsm snapshot index {}",
+        _id, awaited_idx, _snapshot_desc_idx, _applied_idx,
+        _fsm->log_last_idx(), _fsm->log_last_snapshot_idx());
+
+    co_return true;
+}
+
 future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abort_source* as) {
    // The entry may have been already committed and even applied
    // in case it was forwarded to the leader. In this case
@@ -917,141 +995,168 @@ static rpc_config_diff diff_address_sets(const server_address_set& prev, const c
    return result;
 }

+future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batch) {
+    if (batch.term_and_vote) {
+        // Current term and vote are always persisted
+        // together. A vote may change independently of
+        // term, but it's safe to update both in this
+        // case.
+        co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
+        _stats.store_term_and_vote++;
+    }
+
+    if (batch.snp) {
+        auto& [snp, is_local, max_trailing_entries] = *batch.snp;
+        logger.trace("[{}] io_fiber storing snapshot {}", _id, snp.id);
+        // Persist the snapshot
+        co_await _persistence->store_snapshot_descriptor(snp, max_trailing_entries);
+        _snapshot_desc_idx = snp.idx;
+        _snapshot_desc_idx_changed.broadcast();
+        _stats.store_snapshot++;
+        // If this is locally generated snapshot there is no need to
+        // load it.
+        if (!is_local) {
+            co_await _apply_entries.push_eventually(std::move(snp));
+        }
+    }
+
+    for (const auto& snp_id: batch.snps_to_drop) {
+        _state_machine->drop_snapshot(snp_id);
+    }
+
+    if (batch.log_entries.size()) {
+        auto& entries = batch.log_entries;
+
+        if (last_stable >= entries[0]->idx) {
+            co_await _persistence->truncate_log(entries[0]->idx);
+            _stats.truncate_persisted_log++;
+        }
+
+        utils::get_local_injector().inject("store_log_entries/test-failure",
+            [] { throw std::runtime_error("store_log_entries/test-failure"); });
+
+        // Combine saving and truncating into one call?
+        // will require persistence to keep track of last idx
+        co_await _persistence->store_log_entries(entries);
+
+        last_stable = (*entries.crbegin())->idx;
+        _stats.persisted_log_entries += entries.size();
+    }
+
+    // Update RPC server address mappings. Add servers which are joining
+    // the cluster according to the new configuration (obtained from the
+    // last_conf_idx).
+    //
+    // It should be done prior to sending the messages since the RPC
+    // module needs to know who should it send the messages to (actual
+    // network addresses of the joining servers).
+    rpc_config_diff rpc_diff;
+    if (batch.configuration) {
+        rpc_diff = diff_address_sets(get_rpc_config(), *batch.configuration);
+        for (const auto& addr: rpc_diff.joining) {
+            add_to_rpc_config(addr);
+        }
+        _rpc->on_configuration_change(rpc_diff.joining, {});
+    }
+
+     // After entries are persisted we can send messages.
+    for (auto&& m : batch.messages) {
+        try {
+            send_message(m.first, std::move(m.second));
+        } catch(...) {
+            // Not being able to send a message is not a critical error
+            logger.debug("[{}] io_fiber failed to send a message to {}: {}", _id, m.first, std::current_exception());
+        }
+    }
+
+    if (batch.configuration) {
+        for (const auto& addr: rpc_diff.leaving) {
+            abort_snapshot_transfer(addr.id);
+            remove_from_rpc_config(addr);
+        }
+        _rpc->on_configuration_change({}, rpc_diff.leaving);
+    }
+
+    // Process committed entries.
+    if (batch.committed.size()) {
+        if (_non_joint_conf_commit_promise) {
+            for (const auto& e: batch.committed) {
+                const auto* cfg = get_if<raft::configuration>(&e->data);
+                if (cfg != nullptr && !cfg->is_joint()) {
+                    std::exchange(_non_joint_conf_commit_promise, std::nullopt)->promise.set_value();
+                    break;
+                }
+            }
+        }
+        co_await _persistence->store_commit_idx(batch.committed.back()->idx);
+        _stats.queue_entries_for_apply += batch.committed.size();
+        co_await _apply_entries.push_eventually(std::move(batch.committed));
+    }
+
+    if (batch.max_read_id_with_quorum) {
+        while (!_reads.empty() && _reads.front().id <= batch.max_read_id_with_quorum) {
+            _reads.front().promise.set_value(_reads.front().idx);
+            _reads.pop_front();
+        }
+    }
+    if (!_fsm->is_leader()) {
+        if (_stepdown_promise) {
+            std::exchange(_stepdown_promise, std::nullopt)->set_value();
+        }
+        if (!_current_rpc_config.contains(_id)) {
+            // - It's important we push this after we pushed committed entries above. It
+            // will cause `applier_fiber` to drop waiters, which should be done after we
+            // notify all waiters for entries committed in this batch.
+            // - This may happen multiple times if `io_fiber` gets multiple batches when
+            // we're outside the configuration, but it should eventually (and generally
+            // quickly) stop happening (we're outside the config after all).
+            co_await _apply_entries.push_eventually(removed_from_config{});
+        }
+        // request aborts of snapshot transfers
+        abort_snapshot_transfers();
+        // abort all read barriers
+        for (auto& r : _reads) {
+            r.promise.set_value(not_a_leader{_fsm->current_leader()});
+        }
+        _reads.clear();
+    } else if (batch.abort_leadership_transfer) {
+        if (_stepdown_promise) {
+            std::exchange(_stepdown_promise, std::nullopt)->set_exception(timeout_error("Stepdown process timed out"));
+        }
+    }
+    if (_leader_promise && _fsm->current_leader()) {
+        std::exchange(_leader_promise, std::nullopt)->set_value();
+    }
+}
+
+future<> server_impl::process_server_requests(server_requests&& requests) {
+    if (requests.snapshot) {
+        co_await _apply_entries.push_eventually(trigger_snapshot_msg{});
+    }
+}
+
 future<> server_impl::io_fiber(index_t last_stable) {
    logger.trace("[{}] io_fiber start", _id);
    try {
        while (true) {
-            auto batch = co_await _fsm->poll_output();
+            bool has_fsm_output = false;
+            bool has_server_request = false;
+            co_await _events.when([this, &has_fsm_output, &has_server_request] {
+                has_fsm_output = _fsm->has_output();
+                has_server_request = !_new_server_requests.empty();
+                return has_fsm_output || has_server_request;
+            });
+
            _stats.polls++;

-            if (batch.term_and_vote) {
-                // Current term and vote are always persisted
-                // together. A vote may change independently of
-                // term, but it's safe to update both in this
-                // case.
-                co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
-                _stats.store_term_and_vote++;
+            if (has_fsm_output) {
+                auto batch = _fsm->get_output();
+                co_await process_fsm_output(last_stable, std::move(batch));
            }

-            if (batch.snp) {
-                auto& [snp, is_local] = *batch.snp;
-                logger.trace("[{}] io_fiber storing snapshot {}", _id, snp.id);
-                // Persist the snapshot
-                co_await _persistence->store_snapshot_descriptor(snp, is_local ? _config.snapshot_trailing : 0);
-                _stats.store_snapshot++;
-                // If this is locally generated snapshot there is no need to
-                // load it.
-                if (!is_local) {
-                    co_await _apply_entries.push_eventually(std::move(snp));
-                }
-            }
-
-            for (const auto& snp_id: batch.snps_to_drop) {
-                _state_machine->drop_snapshot(snp_id);
-            }
-
-            if (batch.log_entries.size()) {
-                auto& entries = batch.log_entries;
-
-                if (last_stable >= entries[0]->idx) {
-                    co_await _persistence->truncate_log(entries[0]->idx);
-                    _stats.truncate_persisted_log++;
-                }
-
-                utils::get_local_injector().inject("store_log_entries/test-failure",
-                    [] { throw std::runtime_error("store_log_entries/test-failure"); });
-
-                // Combine saving and truncating into one call?
-                // will require persistence to keep track of last idx
-                co_await _persistence->store_log_entries(entries);
-
-                last_stable = (*entries.crbegin())->idx;
-                _stats.persisted_log_entries += entries.size();
-            }
-
-            // Update RPC server address mappings. Add servers which are joining
-            // the cluster according to the new configuration (obtained from the
-            // last_conf_idx).
-            //
-            // It should be done prior to sending the messages since the RPC
-            // module needs to know who should it send the messages to (actual
-            // network addresses of the joining servers).
-            rpc_config_diff rpc_diff;
-            if (batch.configuration) {
-                rpc_diff = diff_address_sets(get_rpc_config(), *batch.configuration);
-                for (const auto& addr: rpc_diff.joining) {
-                    add_to_rpc_config(addr);
-                }
-                _rpc->on_configuration_change(rpc_diff.joining, {});
-            }
-
-             // After entries are persisted we can send messages.
-            for (auto&& m : batch.messages) {
-                try {
-                    send_message(m.first, std::move(m.second));
-                } catch(...) {
-                    // Not being able to send a message is not a critical error
-                    logger.debug("[{}] io_fiber failed to send a message to {}: {}", _id, m.first, std::current_exception());
-                }
-            }
-
-            if (batch.configuration) {
-                for (const auto& addr: rpc_diff.leaving) {
-                    abort_snapshot_transfer(addr.id);
-                    remove_from_rpc_config(addr);
-                }
-                _rpc->on_configuration_change({}, rpc_diff.leaving);
-            }
-
-            // Process committed entries.
-            if (batch.committed.size()) {
-                if (_non_joint_conf_commit_promise) {
-                    for (const auto& e: batch.committed) {
-                        const auto* cfg = get_if<raft::configuration>(&e->data);
-                        if (cfg != nullptr && !cfg->is_joint()) {
-                            std::exchange(_non_joint_conf_commit_promise, std::nullopt)->promise.set_value();
-                            break;
-                        }
-                    }
-                }
-                co_await _persistence->store_commit_idx(batch.committed.back()->idx);
-                _stats.queue_entries_for_apply += batch.committed.size();
-                co_await _apply_entries.push_eventually(std::move(batch.committed));
-            }
-
-            if (batch.max_read_id_with_quorum) {
-                while (!_reads.empty() && _reads.front().id <= batch.max_read_id_with_quorum) {
-                    _reads.front().promise.set_value(_reads.front().idx);
-                    _reads.pop_front();
-                }
-            }
-            if (!_fsm->is_leader()) {
-                if (_stepdown_promise) {
-                    std::exchange(_stepdown_promise, std::nullopt)->set_value();
-                }
-                if (!_current_rpc_config.contains(_id)) {
-                    // - It's important we push this after we pushed committed entries above. It
-                    // will cause `applier_fiber` to drop waiters, which should be done after we
-                    // notify all waiters for entries committed in this batch.
-                    // - This may happen multiple times if `io_fiber` gets multiple batches when
-                    // we're outside the configuration, but it should eventually (and generally
-                    // quickly) stop happening (we're outside the config after all).
-                    co_await _apply_entries.push_eventually(removed_from_config{});
-                }
-                // request aborts of snapshot transfers
-                abort_snapshot_transfers();
-                // abort all read barriers
-                for (auto& r : _reads) {
-                    r.promise.set_value(not_a_leader{_fsm->current_leader()});
-                }
-                _reads.clear();
-            } else if (batch.abort_leadership_transfer) {
-                if (_stepdown_promise) {
-                    std::exchange(_stepdown_promise, std::nullopt)->set_exception(timeout_error("Stepdown process timed out"));
-                }
-            }
-            if (_leader_promise && _fsm->current_leader()) {
-                std::exchange(_leader_promise, std::nullopt)->set_value();
+            if (has_server_request) {
+                auto requests = std::exchange(_new_server_requests, server_requests{});
+                co_await process_server_requests(std::move(requests));
            }
        }
    } catch (seastar::broken_condition_variable&) {
@@ -1064,6 +1169,18 @@ future<> server_impl::io_fiber(index_t last_stable) {
    co_return;
 }

+static bool is_closed_error(std::exception_ptr ep) {
+    try {
+        std::rethrow_exception(ep);
+    } catch (const seastar::rpc::remote_verb_error& e) {
+        return std::string_view{e.what()} == "connection is closed";
+    } catch (const seastar::rpc::closed_error&) {
+        return true;
+    } catch (...) {
+        return false;
+    }
+}
+
 void server_impl::send_snapshot(server_id dst, install_snapshot&& snp) {
    seastar::abort_source as;
    uint64_t id = _next_snapshot_transfer_id++;
@@ -1079,7 +1196,11 @@ void server_impl::send_snapshot(server_id dst, install_snapshot&& snp) {
            _snapshot_transfers.erase(dst);
            auto reply = raft::snapshot_reply{.current_term = _fsm->get_current_term(), .success = false};
            if (f.failed()) {
-                logger.error("[{}] Transferring snapshot to {} failed with: {}", _id, dst, f.get_exception());
+                auto ep = f.get_exception();
+                // Report our or remote's closed_error as WARNs instead of ERRORs.
+                // Workaround for scylladb/scylladb#12972 for ScyllaDB 5.2.
+                auto level = is_closed_error(ep) ? log_level::warn : log_level::error;
+                logger.log(level, "[{}] Transferring snapshot to {} failed with: {}", _id, dst, ep);
            } else {
                logger.trace("[{}] Transferred snapshot to {}", _id, dst);
                reply = f.get();
@@ -1092,14 +1213,19 @@ void server_impl::send_snapshot(server_id dst, install_snapshot&& snp) {
 }

 future<snapshot_reply> server_impl::apply_snapshot(server_id from, install_snapshot snp) {
-    _fsm->step(from, std::move(snp));
-    // Only one snapshot can be received at a time from each node
-    assert(! _snapshot_application_done.contains(from));
    snapshot_reply reply{_fsm->get_current_term(), false};
-    try {
-        reply = co_await _snapshot_application_done[from].get_future();
-    } catch (...) {
-        logger.error("apply_snapshot[{}] failed with {}", _id, std::current_exception());
+    // Previous snapshot processing may still be running if a connection from the leader was broken
+    // after it sent install_snapshot but before it got a reply. It may cause the snapshot to be resent
+    // and it may arrive before the previous one is processed. In this rare case we return an error and the leader
+    // will try again later (or maybe not, if the snapshot that is being applied is recent enough)
+    if (!_snapshot_application_done.contains(from)) {
+        _fsm->step(from, std::move(snp));
+
+        try {
+            reply = co_await _snapshot_application_done[from].get_future();
+        } catch (...) {
+            logger.error("apply_snapshot[{}] failed with {}", _id, std::current_exception());
+        }
    }
    co_return reply;
 }
@@ -1195,6 +1321,23 @@ future<> server_impl::applier_fiber() {
                // it may never know the status of entries it submitted.
                drop_waiters();
                co_return;
+            },
+            [this] (const trigger_snapshot_msg&) -> future<> {
+                auto applied_term = _fsm->log_term_for(_applied_idx);
+                // last truncation index <= snapshot index <= applied index
+                assert(applied_term);
+
+                snapshot_descriptor snp;
+                snp.term = *applied_term;
+                snp.idx = _applied_idx;
+                snp.config = _fsm->log_last_conf_for(_applied_idx);
+                logger.trace("[{}] taking snapshot at term={}, idx={} due to request", _id, snp.term, snp.idx);
+                snp.id = co_await _state_machine->take_snapshot();
+                if (!_fsm->apply_snapshot(snp, 0, 0, true)) {
+                    logger.trace("[{}] while taking snapshot term={} idx={} id={} due to request,"
+                           " fsm received a later snapshot at idx={}", _id, snp.term, snp.idx, snp.id, _fsm->log_last_snapshot_idx());
+                }
+                _stats.snapshots_taken++;
            }
            ), v);

@@ -1343,6 +1486,8 @@ future<> server_impl::abort(sstring reason) {
    _aborted = std::move(reason);
    logger.trace("[{}]: abort() called", _id);
    _fsm->stop();
+    _events.broken();
+    _snapshot_desc_idx_changed.broken();

    // IO and applier fibers may update waiters and start new snapshot
    // transfers, so abort them first
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -224,6 +224,22 @@ public:
    // of two servers iff their IDs are different.
    virtual void register_metrics() = 0;

+    // Manually trigger snapshot creation and log truncation.
+    //
+    // Does nothing if the current apply index is less than or equal to the last persisted snapshot descriptor index
+    // and returns `false`.
+    //
+    // Otherwise returns `true`; when the future resolves, it is guaranteed that the snapshot descriptor
+    // is persisted, but not that the snapshot is loaded to the state machine yet (it will be eventually).
+    //
+    // The request may be resolved by the regular snapshotting mechanisms (e.g. a snapshot
+    // is created because the Raft log grows too large). In this case there is no guarantee
+    // how many trailing entries will be left behind the snapshot. However,
+    // if there are no operations running on the server concurrently with the request and all
+    // committed entries are already applied, the created snapshot is guaranteed to leave
+    // zero trailing entries.
+    virtual future<bool> trigger_snapshot(seastar::abort_source* as) = 0;
+
    // Ad hoc functions for testing
    virtual void wait_until_candidate() = 0;
    virtual future<> wait_election_done() = 0;
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -69,7 +69,8 @@ class reader_permit::impl
        : public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>
        , public enable_shared_from_this<reader_permit::impl> {
    reader_concurrency_semaphore& _semaphore;
-    const schema* _schema;
+    schema_ptr _schema;
+
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _base_resources;
@@ -120,18 +121,18 @@ private:
 public:
    struct value_tag {};

-    impl(reader_concurrency_semaphore& semaphore, const schema* const schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
+    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
        : _semaphore(semaphore)
-        , _schema(schema)
+        , _schema(std::move(schema))
        , _op_name_view(op_name)
        , _base_resources(base_resources)
        , _timeout(timeout)
    {
        _semaphore.on_permit_created(*this);
    }
-    impl(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
+    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
        : _semaphore(semaphore)
-        , _schema(schema)
+        , _schema(std::move(schema))
        , _op_name(std::move(op_name))
        , _op_name_view(_op_name)
        , _base_resources(base_resources)
@@ -181,7 +182,7 @@ public:
        return _semaphore;
    }

-    const ::schema* get_schema() const {
+    const schema_ptr& get_schema() const {
        return _schema;
    }

@@ -356,15 +357,15 @@ reader_permit::reader_permit(shared_ptr<impl> impl) : _impl(std::move(impl))
 {
 }

-reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
+reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, std::string_view op_name,
        reader_resources base_resources, db::timeout_clock::time_point timeout)
-    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, op_name, base_resources, timeout))
+    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, std::move(schema), op_name, base_resources, timeout))
 {
 }

-reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
+reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name,
        reader_resources base_resources, db::timeout_clock::time_point timeout)
-    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, std::move(op_name), base_resources, timeout))
+    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, std::move(schema), std::move(op_name), base_resources, timeout))
 {
 }

@@ -577,7 +578,7 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_groups permits;

    for (const auto& permit : list) {
-        permits[permit_group_key(permit.get_schema(), permit.get_op_name(), permit.get_state())].add(permit);
+        permits[permit_group_key(permit.get_schema().get(), permit.get_op_name(), permit.get_state())].add(permit);
    }

    permit_stats total;
@@ -659,6 +660,8 @@ future<> reader_concurrency_semaphore::execution_loop() noexcept {
                co_await coroutine::maybe_yield();
            }
        }
+
+        maybe_admit_waiters();
    }
 }

@@ -1039,33 +1042,33 @@ void reader_concurrency_semaphore::on_permit_unblocked() noexcept {
    --_stats.blocked_permits;
 }

-future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, const char* const op_name, size_t memory,
+future<reader_permit> reader_concurrency_semaphore::obtain_permit(schema_ptr schema, const char* const op_name, size_t memory,
        db::timeout_clock::time_point timeout) {
-    auto permit = reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
+    auto permit = reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
    return do_wait_admission(permit).then([permit] () mutable {
        return std::move(permit);
    });
 }

-future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, sstring&& op_name, size_t memory,
+future<reader_permit> reader_concurrency_semaphore::obtain_permit(schema_ptr schema, sstring&& op_name, size_t memory,
        db::timeout_clock::time_point timeout) {
-    auto permit = reader_permit(*this, schema, std::move(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
+    auto permit = reader_permit(*this, std::move(schema), std::move(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
    return do_wait_admission(permit).then([permit] () mutable {
        return std::move(permit);
    });
 }

-reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, const char* const op_name, db::timeout_clock::time_point timeout) {
-    return reader_permit(*this, schema, std::string_view(op_name), {}, timeout);
+reader_permit reader_concurrency_semaphore::make_tracking_only_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout) {
+    return reader_permit(*this, std::move(schema), std::string_view(op_name), {}, timeout);
 }

-reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, sstring&& op_name, db::timeout_clock::time_point timeout) {
-    return reader_permit(*this, schema, std::move(op_name), {}, timeout);
+reader_permit reader_concurrency_semaphore::make_tracking_only_permit(schema_ptr schema, sstring&& op_name, db::timeout_clock::time_point timeout) {
+    return reader_permit(*this, std::move(schema), std::move(op_name), {}, timeout);
 }

-future<> reader_concurrency_semaphore::with_permit(const schema* const schema, const char* const op_name, size_t memory,
+future<> reader_concurrency_semaphore::with_permit(schema_ptr schema, const char* const op_name, size_t memory,
        db::timeout_clock::time_point timeout, read_func func) {
-    return do_wait_admission(reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout), std::move(func));
+    return do_wait_admission(reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout), std::move(func));
 }

 future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, read_func func) {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -372,8 +372,8 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    future<reader_permit> obtain_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout);
-    future<reader_permit> obtain_permit(const schema* const schema, sstring&& op_name, size_t memory, db::timeout_clock::time_point timeout);
+    future<reader_permit> obtain_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout);
+    future<reader_permit> obtain_permit(schema_ptr schema, sstring&& op_name, size_t memory, db::timeout_clock::time_point timeout);

    /// Make a tracking only permit
    ///
@@ -388,8 +388,8 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    reader_permit make_tracking_only_permit(const schema* const schema, const char* const op_name, db::timeout_clock::time_point timeout);
-    reader_permit make_tracking_only_permit(const schema* const schema, sstring&& op_name, db::timeout_clock::time_point timeout);
+    reader_permit make_tracking_only_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout);
+    reader_permit make_tracking_only_permit(schema_ptr schema, sstring&& op_name, db::timeout_clock::time_point timeout);

    /// Run the function through the semaphore's execution stage with an admitted permit
    ///
@@ -410,7 +410,7 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    future<> with_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, read_func func);
+    future<> with_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, read_func func);

    /// Run the function through the semaphore's execution stage with a pre-admitted permit
    ///
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -95,9 +95,9 @@ private:
 private:
    reader_permit() = default;
    reader_permit(shared_ptr<impl>);
-    explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
+    explicit reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, std::string_view op_name,
            reader_resources base_resources, db::timeout_clock::time_point timeout);
-    explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
+    explicit reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name,
            reader_resources base_resources, db::timeout_clock::time_point timeout);

    void on_waiting();
--- a/readers/multishard.cc
+++ b/readers/multishard.cc
@@ -231,8 +231,8 @@ private:
    flat_mutation_reader_v2_opt _reader;

 private:
-    void do_pause(flat_mutation_reader_v2 reader);
-    void maybe_pause(flat_mutation_reader_v2 reader);
+    void do_pause(flat_mutation_reader_v2 reader) noexcept;
+    void maybe_pause(flat_mutation_reader_v2 reader) noexcept;
    flat_mutation_reader_v2_opt try_resume();
    void update_next_position();
    void adjust_partition_slice();
@@ -281,12 +281,12 @@ public:
    }
 };

-void evictable_reader_v2::do_pause(flat_mutation_reader_v2 reader) {
+void evictable_reader_v2::do_pause(flat_mutation_reader_v2 reader) noexcept {
    assert(!_irh);
    _irh = _permit.semaphore().register_inactive_read(std::move(reader));
 }

-void evictable_reader_v2::maybe_pause(flat_mutation_reader_v2 reader) {
+void evictable_reader_v2::maybe_pause(flat_mutation_reader_v2 reader) noexcept {
    if (_auto_pause) {
        do_pause(std::move(reader));
    } else {
@@ -649,8 +649,17 @@ future<> evictable_reader_v2::fast_forward_to(const dht::partition_range& pr) {
        co_return;
    }
    if (auto reader_opt = try_resume()) {
-        co_await reader_opt->fast_forward_to(pr);
-        _range_override.reset();
+        std::exception_ptr ex;
+        try {
+            co_await reader_opt->fast_forward_to(pr);
+            _range_override.reset();
+        } catch (...) {
+            ex = std::current_exception();
+        }
+        if (ex) {
+            co_await reader_opt->close();
+            std::rethrow_exception(std::move(ex));
+        }
        maybe_pause(std::move(*reader_opt));
    }
 }
--- a/readers/mutation_reader.cc
+++ b/readers/mutation_reader.cc
@@ -191,7 +191,11 @@ void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {

 namespace {

-[[noreturn]] void on_validation_error(seastar::logger& l, const seastar::sstring& reason) {
+bool on_validation_error(seastar::logger& l, const mutation_fragment_stream_validating_filter& zis, const seastar::sstring& reason) {
+    if (!zis.raise_errors()) {
+        l.error("{}", reason);
+        return false;
+    }
    try {
        on_internal_error(l, reason);
    } catch (std::runtime_error& e) {
@@ -209,13 +213,13 @@ bool mutation_fragment_stream_validating_filter::operator()(const dht::decorated
        if (_validator(dk.token())) {
            return true;
        }
-        on_validation_error(mrlog, format("[validator {} for {}] Unexpected token: previous {}, current {}",
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected token: previous {}, current {}",
                static_cast<void*>(this), full_name(), _validator.previous_token(), dk.token()));
    } else {
        if (_validator(dk)) {
            return true;
        }
-        on_validation_error(mrlog, format("[validator {} for {}] Unexpected partition key: previous {}, current {}",
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected partition key: previous {}, current {}",
                static_cast<void*>(this), full_name(), _validator.previous_partition_key(), dk));
    }
 }
@@ -226,10 +230,11 @@ sstring mutation_fragment_stream_validating_filter::full_name() const {
 }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s,
-        mutation_fragment_stream_validation_level level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
    : _validator(s)
    , _name_storage(std::move(name_value))
    , _validation_level(level)
+    , _raise_errors(raise_errors)
 {
    if (name_literal) {
        _name_view = name_literal;
@@ -260,13 +265,13 @@ mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_
 }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(sstring name, const schema& s,
-        mutation_fragment_stream_validation_level level)
-    : mutation_fragment_stream_validating_filter(nullptr, std::move(name), s, level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
+    : mutation_fragment_stream_validating_filter(nullptr, std::move(name), s, level, raise_errors)
 { }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(const char* name, const schema& s,
-        mutation_fragment_stream_validation_level level)
-    : mutation_fragment_stream_validating_filter(name, {}, s, level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
+    : mutation_fragment_stream_validating_filter(name, {}, s, level, raise_errors)
 { }

 bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos,
@@ -279,7 +284,9 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    mrlog.debug("[validator {}] {}:{} new_current_tombstone: {}", static_cast<void*>(this), kind, pos, new_current_tombstone);

-    if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
+    if (_validation_level == mutation_fragment_stream_validation_level::none) {
+        return true;
+    } else if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
        valid = _validator(kind, pos, new_current_tombstone);
    } else {
        valid = _validator(kind, new_current_tombstone);
@@ -287,18 +294,19 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    if (__builtin_expect(!valid, false)) {
        if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}:{}, current {}:{}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}:{}, current {}:{}",
                    static_cast<void*>(this), full_name(), _validator.previous_partition_key(), _validator.previous_mutation_fragment_kind(), _validator.previous_position(), kind, pos));
        } else if (_validation_level >= mutation_fragment_stream_validation_level::partition_key) {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}, current {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}, current {}",
                    static_cast<void*>(this), full_name(), _validator.previous_partition_key(), _validator.previous_mutation_fragment_kind(), kind));
        } else if (kind == mutation_fragment_v2::kind::partition_end && _validator.current_tombstone()) {
-            on_validation_error(mrlog, format("[validator {} for {}] Partition ended with active tombstone: {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Partition ended with active tombstone: {}",
                    static_cast<void*>(this), full_name(), _validator.current_tombstone()));
        } else {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: previous {}, current {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: previous {}, current {}",
                    static_cast<void*>(this), full_name(), _validator.previous_mutation_fragment_kind(), kind));
        }
+        return false;
    }

    return true;
@@ -340,15 +348,16 @@ bool mutation_fragment_stream_validating_filter::on_end_of_partition() {
    return (*this)(mutation_fragment::kind::partition_end, position_in_partition_view(position_in_partition_view::end_of_partition_tag_t()));
 }

-void mutation_fragment_stream_validating_filter::on_end_of_stream() {
+bool mutation_fragment_stream_validating_filter::on_end_of_stream() {
    if (_validation_level < mutation_fragment_stream_validation_level::partition_region) {
-        return;
+        return true;
    }
    mrlog.debug("[validator {}] EOS", static_cast<const void*>(this));
    if (!_validator.on_end_of_stream()) {
-        on_validation_error(mrlog, format("[validator {} for {}] Stream ended with unclosed partition: {}", static_cast<const void*>(this), full_name(),
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Stream ended with unclosed partition: {}", static_cast<const void*>(this), full_name(),
                _validator.previous_mutation_fragment_kind()));
    }
+    return true;
 }

 static size_t compute_buffer_size(const schema& s, const flat_mutation_reader_v2::tracked_buffer& buffer)
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -17,6 +17,7 @@
 #include "gms/gossiper.hh"
 #include "service/priority_manager.hh"
 #include "message/messaging_service.hh"
+#include "repair/table_check.hh"
 #include "sstables/sstables.hh"
 #include "replica/database.hh"
 #include "db/config.hh"
@@ -578,7 +579,7 @@ shard_repair_task_impl::shard_repair_task_impl(tasks::task_manager::module_ptr m
    , ranges(ranges_)
    , cfs(get_table_names(db.local(), table_ids_))
    , table_ids(std::move(table_ids_))
-    , id(parent_id_)
+    , global_repair_id(parent_id_)
    , data_centers(data_centers_)
    , hosts(hosts_)
    , ignore_nodes(ignore_nodes_)
@@ -588,16 +589,16 @@ shard_repair_task_impl::shard_repair_task_impl(tasks::task_manager::module_ptr m
 { }

 void shard_repair_task_impl::check_failed_ranges() {
-    rlogger.info("repair[{}]: shard {} stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
-        id.uuid(), id.shard(), _reason, _status.keyspace, table_names(), ranges.size(), _stats.get_stats());
+    rlogger.info("repair[{}]: stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
+        global_repair_id.uuid(), _reason, _status.keyspace, table_names(), ranges.size(), _stats.get_stats());
    if (nr_failed_ranges) {
-        rlogger.warn("repair[{}]: shard {} failed - {} out of {} ranges failed", id.uuid(), id.shard(), nr_failed_ranges, ranges_size());
-        throw std::runtime_error(format("repair[{}] on shard {} failed to repair {} out of {} ranges", id.uuid(), id.shard(), nr_failed_ranges, ranges_size()));
+        rlogger.warn("repair[{}]: failed - {} out of {} ranges failed", global_repair_id.uuid(), nr_failed_ranges, ranges_size());
+        throw std::runtime_error(format("repair[{}] failed to repair {} out of {} ranges", global_repair_id.uuid(), nr_failed_ranges, ranges_size()));
    } else {
        if (dropped_tables.size()) {
-            rlogger.warn("repair[{}]: shard {} completed successfully, keyspace={}, ignoring dropped tables={}", id.uuid(), id.shard(), _status.keyspace, dropped_tables);
+            rlogger.warn("repair[{}]: completed successfully, keyspace={}, ignoring dropped tables={}", global_repair_id.uuid(), _status.keyspace, dropped_tables);
        } else {
-            rlogger.info("repair[{}]: shard {} completed successfully, keyspace={}", id.uuid(), id.shard(), _status.keyspace);
+            rlogger.info("repair[{}]: completed successfully, keyspace={}", global_repair_id.uuid(), _status.keyspace);
        }
    }
 }
@@ -631,8 +632,8 @@ future<> shard_repair_task_impl::repair_range(const dht::token_range& range, ::t
        if (it == live_neighbors.end()) {
            nr_failed_ranges++;
            auto status = format("failed: mandatory neighbor={} is not alive", node);
-            rlogger.error("repair[{}]: Repair {} out of {} ranges, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-                    id.uuid(), ranges_index, ranges_size(), id.shard(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
+            rlogger.error("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
+                    global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
            // If the task is aborted, its state will change to failed. One can wait for this with task_manager::task::done().
            (void)abort();
            co_await coroutine::return_exception(std::runtime_error(format("Repair mandatory neighbor={} is not alive, keyspace={}, mandatory_neighbors={}",
@@ -642,8 +643,8 @@ future<> shard_repair_task_impl::repair_range(const dht::token_range& range, ::t
    if (live_neighbors.size() != neighbors.size()) {
        nr_failed_ranges++;
        auto status = live_neighbors.empty() ? "skipped" : "partial";
-        rlogger.warn("repair[{}]: Repair {} out of {} ranges, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-                id.uuid(), ranges_index, ranges_size(), id.shard(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
+        rlogger.warn("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
+                global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
        if (live_neighbors.empty()) {
            co_return;
        }
@@ -651,12 +652,12 @@ future<> shard_repair_task_impl::repair_range(const dht::token_range& range, ::t
    }
    if (neighbors.empty()) {
        auto status = "skipped_no_followers";
-        rlogger.warn("repair[{}]: Repair {} out of {} ranges,  shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-                id.uuid(), ranges_index, ranges_size(), id.shard(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
+        rlogger.warn("repair[{}]: Repair {} out of {} ranges,  keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
+                global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
        co_return;
    }
-    rlogger.debug("repair[{}]: Repair {} out of {} ranges, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}",
-        id.uuid(), ranges_index, ranges_size(), id.shard(), _status.keyspace, table_names(), range, neighbors, live_neighbors);
+    rlogger.debug("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}",
+        global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors);
    co_await mm.sync_schema(db.local(), neighbors);
    sstring cf;
    try {
@@ -669,9 +670,12 @@ future<> shard_repair_task_impl::repair_range(const dht::token_range& range, ::t
        co_return;
    }
    try {
-        co_await repair_cf_range_row_level(*this, cf, table_id, range, neighbors);
-    } catch (replica::no_such_column_family&) {
-        dropped_tables.insert(cf);
+        auto dropped = co_await repair::with_table_drop_silenced(db.local(), mm, table_id, [&] (const ::table_id& uuid) {
+            return repair_cf_range_row_level(*this, cf, table_id, range, neighbors);
+        });
+        if (dropped) {
+            dropped_tables.insert(cf);
+        }
    } catch (...) {
        nr_failed_ranges++;
        throw;
@@ -943,7 +947,7 @@ future<> shard_repair_task_impl::do_repair_ranges() {
        auto table_name = table_names()[idx];
        // repair all the ranges in limited parallelism
        rlogger.info("repair[{}]: Started to repair {} out of {} tables in keyspace={}, table={}, table_id={}, repair_reason={}",
-                id.uuid(), idx + 1, table_ids.size(), _status.keyspace, table_name, table_id, _reason);
+                global_repair_id.uuid(), idx + 1, table_ids.size(), _status.keyspace, table_name, table_id, _reason);
        co_await coroutine::parallel_for_each(ranges, [this, table_id] (auto&& range) -> future<> {
            // It is possible that most of the ranges are skipped. In this case
            // this lambda will just log a message and exit. With a lot of
@@ -968,7 +972,7 @@ future<> shard_repair_task_impl::do_repair_ranges() {
                        nr_ranges_finished++;
                    }
                    rlogger.debug("repair[{}]: node ops progress bootstrap={}, replace={}, rebuild={}, decommission={}, removenode={}, repair={}",
-                        id.uuid(),
+                        global_repair_id.uuid(),
                        rs.get_metrics().bootstrap_finished_percentage(),
                        rs.get_metrics().replace_finished_percentage(),
                        rs.get_metrics().rebuild_finished_percentage(),
@@ -983,7 +987,7 @@ future<> shard_repair_task_impl::do_repair_ranges() {
            try {
                auto& table = db.local().find_column_family(table_id);
                rlogger.debug("repair[{}]: Trigger off-strategy compaction for keyspace={}, table={}",
-                    id.uuid(), table.schema()->ks_name(), table.schema()->cf_name());
+                    global_repair_id.uuid(), table.schema()->ks_name(), table.schema()->cf_name());
                table.trigger_offstrategy_compaction();
            } catch (replica::no_such_column_family&) {
                // Ignore dropped table
@@ -998,10 +1002,10 @@ future<> shard_repair_task_impl::do_repair_ranges() {
 // is assumed to be a indivisible in the sense that all the tokens in has the
 // same nodes as replicas.
 future<> shard_repair_task_impl::run() {
-    rs.get_repair_module().add_shard_task_id(id.id, _status.id);
+    rs.get_repair_module().add_shard_task_id(global_repair_id.id, _status.id);
    return do_repair_ranges().then([this] {
        check_failed_ranges();
-        rs.get_repair_module().remove_shard_task_id(id.id);
+        rs.get_repair_module().remove_shard_task_id(global_repair_id.id);
        return make_ready_future<>();
    }).handle_exception([this] (std::exception_ptr eptr) {
        rs.get_repair_module().remove_shard_task_id(_status.sequence_number);
--- a/repair/repair_task.hh
+++ b/repair/repair_task.hh
@@ -104,7 +104,7 @@ public:
    dht::token_range_vector ranges;
    std::vector<sstring> cfs;
    std::vector<table_id> table_ids;
-    repair_uniq_id id;
+    repair_uniq_id global_repair_id;
    std::vector<sstring> data_centers;
    std::vector<sstring> hosts;
    std::unordered_set<gms::inet_address> ignore_nodes;
--- a/repair/row.hh
+++ b/repair/row.hh
@@ -50,6 +50,9 @@ public:
        }
        return *_mf;
    }
+    void reset_mutation_fragment() {
+        _mf = nullptr;
+    }
    frozen_mutation_fragment& get_frozen_mutation() {
        if (!_fm) {
            throw std::runtime_error("empty frozen_mutation_fragment");
@@ -69,7 +72,14 @@ public:
        if (!_fm) {
            throw std::runtime_error("empty size due to empty frozen_mutation_fragment");
        }
-        return _fm->representation().size();
+        auto size = sizeof(repair_row) + _fm->representation().size();
+        if (_boundary) {
+            size += _boundary->pk.external_memory_usage() + _boundary->position.external_memory_usage();
+        }
+        if (_mf) {
+            size += _mf->memory_usage();
+        }
+        return size;
    }
    const repair_sync_boundary& boundary() const {
        if (!_boundary) {
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -340,7 +340,9 @@ public:
            , _seed(seed)
            , _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
            , _reader(make_reader(db, cf, local_reader))
-    { }
+    {
+        pause();
+    }

    future<mutation_fragment_opt>
    read_mutation_fragment() {
@@ -672,6 +674,7 @@ void flush_rows(schema_ptr s, std::list<repair_row>& rows, lw_shared_ptr<repair_
            last_mf = mf;
            last_dk = r.get_dk_with_hash();
        }
+        r.reset_mutation_fragment();
    }
    if (last_mf && last_dk) {
        writer->do_write(std::move(last_dk), std::move(*last_mf)).get();
@@ -941,8 +944,8 @@ public:
    }

 private:
-    future<uint64_t> do_estimate_partitions_on_all_shards() {
-        return estimate_partitions(_db, _schema->ks_name(), _schema->cf_name(), _range);
+    future<uint64_t> do_estimate_partitions_on_all_shards(const dht::token_range& range) {
+        return estimate_partitions(_db, _schema->ks_name(), _schema->cf_name(), range);
    }

    future<uint64_t> do_estimate_partitions_on_local_shard() {
@@ -964,7 +967,7 @@ private:
                return repeat([this, &sharder, &partitions_sum] () mutable {
                    auto shard_range = sharder.next();
                    if (shard_range) {
-                        return do_estimate_partitions_on_all_shards().then([this, &partitions_sum] (uint64_t partitions) mutable {
+                        return do_estimate_partitions_on_all_shards(*shard_range).then([&partitions_sum] (uint64_t partitions) mutable {
                            partitions_sum += partitions;
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
@@ -1039,10 +1042,11 @@ private:
        auto hash = _repair_hasher.do_hash_for_mf(*_repair_reader.get_current_dk(), mf);
        repair_row r(freeze(*_schema, mf), position_in_partition(mf.position()), _repair_reader.get_current_dk(), hash, is_dirty_on_master::no);
        rlogger.trace("Reading: r.boundary={}, r.hash={}", r.boundary(), r.hash());
+        auto sz = r.size();
        _metrics.row_from_disk_nr++;
-        _metrics.row_from_disk_bytes += r.size();
-        cur_size += r.size();
-        new_rows_size += r.size();
+        _metrics.row_from_disk_bytes += sz;
+        cur_size += sz;
+        new_rows_size += sz;
        cur_rows.push_back(std::move(r));
        return stop_iteration::no;
    }
@@ -1242,6 +1246,7 @@ private:
                    // mutation_fragment attached because we have stored it in
                    // to_repair_rows_list above where the repair_row is created.
                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    r.reset_mutation_fragment();
                    auto dk_with_hash = r.get_dk_with_hash();
                    return _repair_writer->do_write(std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
                        row_diff.pop_front();
@@ -2685,7 +2690,7 @@ private:
        size_t repaired_replicas = _all_live_peer_nodes.size() + 1;
        if (_shard_task.total_rf != repaired_replicas){
            rlogger.debug("repair[{}]: Skipped to update system.repair_history total_rf={}, repaired_replicas={}, local={}, peers={}",
-                    _shard_task.id.uuid(), _shard_task.total_rf, repaired_replicas, utils::fb_utilities::get_broadcast_address(), _all_live_peer_nodes);
+                    _shard_task.global_repair_id.uuid(), _shard_task.total_rf, repaired_replicas, utils::fb_utilities::get_broadcast_address(), _all_live_peer_nodes);
            co_return;
        }
        // Update repair_history table only if both hints and batchlog have been flushed.
@@ -2693,12 +2698,12 @@ private:
            co_return;
        }
        repair_service& rs = _shard_task.rs;
-        std::optional<gc_clock::time_point> repair_time_opt = co_await rs.update_history(_shard_task.id.uuid(), _table_id, _range, _start_time);
+        std::optional<gc_clock::time_point> repair_time_opt = co_await rs.update_history(_shard_task.global_repair_id.uuid(), _table_id, _range, _start_time);
        if (!repair_time_opt) {
            co_return;
        }
        auto repair_time = repair_time_opt.value();
-        repair_update_system_table_request req{_shard_task.id.uuid(), _table_id, _shard_task.get_keyspace(), _cf_name, _range, repair_time};
+        repair_update_system_table_request req{_shard_task.global_repair_id.uuid(), _table_id, _shard_task.get_keyspace(), _cf_name, _range, repair_time};
        auto all_nodes = _all_live_peer_nodes;
        all_nodes.push_back(utils::fb_utilities::get_broadcast_address());
        co_await coroutine::parallel_for_each(all_nodes, [this, req] (gms::inet_address node) -> future<> {
@@ -2706,9 +2711,9 @@ private:
                auto& ms = _shard_task.messaging.local();
                repair_update_system_table_response resp = co_await ser::partition_checksum_rpc_verbs::send_repair_update_system_table(&ms, netw::messaging_service::msg_addr(node), req);
                (void)resp;  // nothing to do with the response yet
-                rlogger.debug("repair[{}]: Finished to update system.repair_history table of node {}", _shard_task.id.uuid(), node);
+                rlogger.debug("repair[{}]: Finished to update system.repair_history table of node {}", _shard_task.global_repair_id.uuid(), node);
            } catch (...) {
-                rlogger.warn("repair[{}]: Failed to update system.repair_history table of node {}: {}", _shard_task.id.uuid(), node, std::current_exception());
+                rlogger.warn("repair[{}]: Failed to update system.repair_history table of node {}: {}", _shard_task.global_repair_id.uuid(), node, std::current_exception());
            }
        });
        co_return;
@@ -2735,10 +2740,10 @@ public:
            auto wanted = (_all_live_peer_nodes.size() + 1) * repair_module::max_repair_memory_per_range;
            wanted = std::min(max, wanted);
            rlogger.trace("repair[{}]: Started to get memory budget, wanted={}, available={}, max_repair_memory={}",
-                    _shard_task.id.uuid(), wanted, mem_sem.current(), max);
+                    _shard_task.global_repair_id.uuid(), wanted, mem_sem.current(), max);
            auto mem_permit = seastar::get_units(mem_sem, wanted).get0();
            rlogger.trace("repair[{}]: Finished to get memory budget, wanted={}, available={}, max_repair_memory={}",
-                    _shard_task.id.uuid(), wanted, mem_sem.current(), max);
+                    _shard_task.global_repair_id.uuid(), wanted, mem_sem.current(), max);

            auto permit = _shard_task.db.local().obtain_reader_permit(_cf, "repair-meta", db::no_timeout).get0();

@@ -2785,6 +2790,26 @@ public:
                    });
                }).get();

+                if (!master.all_nodes().empty()) {
+                    // Use the average number of partitions, instead of the sum
+                    // of the partitions, as the estimated partitions in a
+                    // given range. The bigger the estimated partitions, the
+                    // more memory bloom filter for the sstable would consume.
+                    _estimated_partitions /= master.all_nodes().size();
+
+                    // In addition, assume the difference between nodes is
+                    // less than 10% for regular repair. Underestimation will
+                    // not be a big problem since those sstables produced by
+                    // repair will go through off-strategy later anyway. The
+                    // worst case is that we have a worse false positive ratio
+                    // than expected temporarily when the sstable is still in
+                    // maintenance set.
+                    //
+                    // To save memory and have less different conditions, we
+                    // use the 10% estimation for RBNO repair as well.
+                    _estimated_partitions /= 10;
+                }
+
                parallel_for_each(master.all_nodes(), [&, this] (repair_node_state& ns) {
                    const auto& node = ns.node;
                    rlogger.trace("Get repair_set_estimated_partitions for node={}, estimated_partitions={}", node, _estimated_partitions);
@@ -2810,11 +2835,11 @@ public:
            } catch (replica::no_such_column_family& e) {
                table_dropped = true;
                rlogger.warn("repair[{}]: shard={}, keyspace={}, cf={}, range={}, got error in row level repair: {}",
-                        _shard_task.id.uuid(), this_shard_id(), _shard_task.get_keyspace(), _cf_name, _range, e);
+                        _shard_task.global_repair_id.uuid(), this_shard_id(), _shard_task.get_keyspace(), _cf_name, _range, e);
                _failed = true;
            } catch (std::exception& e) {
                rlogger.warn("repair[{}]: shard={}, keyspace={}, cf={}, range={}, got error in row level repair: {}",
-                        _shard_task.id.uuid(), this_shard_id(), _shard_task.get_keyspace(), _cf_name, _range, e);
+                        _shard_task.global_repair_id.uuid(), this_shard_id(), _shard_task.get_keyspace(), _cf_name, _range, e);
                // In case the repair process fail, we need to call repair_row_level_stop to clean up repair followers
                _failed = true;
            }
--- a/repair/table_check.cc
+++ b/repair/table_check.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include "replica/database.hh"
+#include "repair/table_check.hh"
+#include "service/migration_manager.hh"
+
+namespace repair {
+
+future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid) {
+    co_await mm.container().invoke_on(0, [] (auto& mm) -> future<> {
+        auto& group0_client = mm.get_group0_client();
+        abort_on_expiry aoe(lowres_clock::now() + std::chrono::seconds{10});
+        auto& as = aoe.abort_source();
+        auto sub = mm.get_abort_source().subscribe([&as] () noexcept {
+            if (!as.abort_requested()) {
+                as.request_abort();
+            }
+        });
+        // co_await (not return): aoe and sub must outlive the barrier, which keeps using &as until it resolves.
+        co_await group0_client.perform_read_barrier(&as);
+    });
+    // The schema is synchronized now; report whether the table is absent from the local database.
+    co_return !db.column_family_exists(uuid);
+}
+
+future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
+        std::function<future<>(const table_id&)> f) {
+    std::exception_ptr ex = nullptr;
+    try {
+        co_await f(uuid);
+        co_return table_dropped::no;
+    } catch (replica::no_such_column_family&) {
+        // No need to synchronize: this exception type already tells us the table was dropped.
+    } catch (...) {
+        // This node may still see a table while it is dropped on the remote node
+        // and so the remote node returns an error. In that case we want to skip
+        // that table and continue with the operation.
+        //
+        // But since RPC does not enable returning the exception type, the cause
+        // of the failure cannot be determined. Synchronize schema to see the latest
+        // changes and determine whether the table was dropped.
+        ex = std::current_exception();
+    }
+
+    if (ex) {
+        auto dropped = co_await table_sync_and_check(db, mm, uuid);
+        if (!dropped) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
+        }
+    }
+    co_return table_dropped::yes;
+}
+
+}
--- a/repair/table_check.hh
+++ b/repair/table_check.hh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/util/bool_class.hh>
+
+#include "schema_fwd.hh"
+
+using namespace seastar;
+
+using table_dropped = bool_class<class table_dropped_tag>;
+
+namespace raft {
+class server;
+}
+
+namespace replica {
+class database;
+}
+
+namespace service {
+class migration_manager;
+}
+
+namespace repair {
+
+class database;
+
+future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid);
+
+// Runs function f on given table. If f throws and the table is dropped, the exception is swallowed.
+// Function is aimed to handle no_such_column_family on remote node or different shard, as it synchronizes
+// schema before checking the table. Prefer standard error handling whenever possible.
+future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
+        std::function<future<>(const table_id&)> f);
+
+}
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -661,7 +661,7 @@ database::setup_metrics() {
                       sm::description("Holds the number of currently active read operations from \"system\" keyspace tables. "),
                       {system_label_instance}),

-        sm::make_gauge("reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.consumed_resources().memory; },
+        sm::make_gauge("reads_memory_consumption", [this] { return _system_read_concurrency_sem.consumed_resources().memory; },
                       sm::description("Holds the amount of memory consumed by all read operations from \"system\" keyspace tables. "),
                       {system_label_instance}),

@@ -1574,7 +1574,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
        if (querier_opt) {
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(s.get(), "data-query", cf.estimate_read_memory_cost(), timeout, read_func));
+            f = co_await coroutine::as_future(semaphore.with_permit(s, "data-query", cf.estimate_read_memory_cost(), timeout, read_func));
        }

        if (!f.failed()) {
@@ -1640,7 +1640,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
        if (querier_opt) {
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(s.get(), "mutation-query", cf.estimate_read_memory_cost(), timeout, read_func));
+            f = co_await coroutine::as_future(semaphore.with_permit(s, "mutation-query", cf.estimate_read_memory_cost(), timeout, read_func));
        }

        if (!f.failed()) {
@@ -1690,7 +1690,7 @@ reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() {
 }

 future<reader_permit> database::obtain_reader_permit(table& tbl, const char* const op_name, db::timeout_clock::time_point timeout) {
-    return get_reader_concurrency_semaphore().obtain_permit(tbl.schema().get(), op_name, tbl.estimate_read_memory_cost(), timeout);
+    return get_reader_concurrency_semaphore().obtain_permit(tbl.schema(), op_name, tbl.estimate_read_memory_cost(), timeout);
 }

 future<reader_permit> database::obtain_reader_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout) {
@@ -1760,7 +1760,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
            // counter state for each modified cell...

            tracing::trace(trace_state, "Reading counter values from the CF");
-            auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(m_schema.get(), "counter-read-before-write", timeout);
+            auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(m_schema, "counter-read-before-write", timeout);
            return counter_write_query(m_schema, cf.as_mutation_source(), std::move(permit), m.decorated_key(), slice, trace_state)
                    .then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
                // ...now, that we got existing state of all affected counter
@@ -2846,7 +2846,7 @@ flat_mutation_reader_v2 make_multishard_streaming_reader(distributed<replica::da
        }
        virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) override {
            auto& cf = _db.local().find_column_family(_table_id);
-            return semaphore().obtain_permit(schema.get(), description, cf.estimate_read_memory_cost(), timeout);
+            return semaphore().obtain_permit(schema, description, cf.estimate_read_memory_cost(), timeout);
        }
    };
    auto ms = mutation_source([&db] (schema_ptr s,
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -606,6 +606,7 @@ table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector
        ms_opt = make_combined_mutation_source(std::move(sources));
    }
    auto adder = row_cache::external_updater([this, m, ssts = std::move(ssts), new_ssts_ms = std::move(*ms_opt), &cg] () mutable {
+        // FIXME: the following isn't exception safe.
        for (auto& sst : ssts) {
            add_sstable(cg, sst);
        }
@@ -843,7 +844,7 @@ table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtabl
        auto metadata = mutation_source_metadata{};
        metadata.min_timestamp = old->get_min_timestamp();
        metadata.max_timestamp = old->get_max_timestamp();
-        auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count());
+        auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count(), _schema);

        if (!_async_gate.is_closed()) {
            co_await _compaction_manager.maybe_wait_for_sstable_count_reduction(cg.as_table_state());
@@ -873,7 +874,7 @@ table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtabl

        auto f = consumer(old->make_flush_reader(
            old->schema(),
-            compaction_concurrency_semaphore().make_tracking_only_permit(old->schema().get(), "try_flush_memtable_to_sstable()", db::no_timeout),
+            compaction_concurrency_semaphore().make_tracking_only_permit(old->schema(), "try_flush_memtable_to_sstable()", db::no_timeout),
            service::get_local_memtable_flush_priority()));

        // Switch back to default scheduling group for post-flush actions, to avoid them being staved by the memtable flush
@@ -1069,6 +1070,7 @@ compaction_group::update_sstable_lists_on_off_strategy_completion(sstables::comp
        virtual void execute() override {
            _cg.set_main_sstables(std::move(_new_main_list));
            _cg.set_maintenance_sstables(std::move(_new_maintenance_list));
+            // FIXME: the following is not exception safe
            _t.refresh_compound_sstable_set();
            // Input sstables aren't not removed from backlog tracker because they come from the maintenance set.
            _cg.backlog_tracker_adjust_charges({}, _new_main);
@@ -1155,6 +1157,7 @@ compaction_group::update_main_sstable_list_on_compaction_completion(sstables::co
        }
        virtual void execute() override {
            _cg.set_main_sstables(std::move(_new_sstables));
+            // FIXME: the following is not exception safe
            _t.refresh_compound_sstable_set();
            _cg.backlog_tracker_adjust_charges(_desc.old_sstables, _desc.new_sstables);
        }
@@ -1180,6 +1183,9 @@ compaction_group::update_main_sstable_list_on_compaction_completion(sstables::co
 future<>
 table::compact_all_sstables() {
    co_await flush();
+    // Forces off-strategy before major, so sstables previously sitting on maintenance set will be included
+    // in the compaction's input set, to provide same semantics as before maintenance set came into existence.
+    co_await perform_offstrategy_compaction();
    co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
        return _compaction_manager.perform_major_compaction(cg.as_table_state());
    });
@@ -1797,6 +1803,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
    };
    auto p = make_lw_shared<pruner>(*this);
    co_await _cache.invalidate(row_cache::external_updater([this, p, truncated_at] {
+        // FIXME: the following isn't exception safe.
        for (const compaction_group_ptr& cg : compaction_groups()) {
            p->prune(*cg, truncated_at);
        }
@@ -1890,11 +1897,9 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
 }

 static size_t memory_usage_of(const utils::chunked_vector<frozen_mutation_and_schema>& ms) {
-    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
-    constexpr size_t base_overhead_bytes = 256;
     return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
-        return m.fm.representation().size();
-    }), size_t{base_overhead_bytes * ms.size()});
+        return db::view::memory_usage_of(m);
+    }), size_t{0});
 }

 /**
@@ -2214,7 +2219,7 @@ write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, sstables::
            std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "write_memtable_to_sstable"),
            cfg,
            [&mt, sst] (auto& monitor, auto& semaphore, auto& cfg) {
-        return write_memtable_to_sstable(semaphore->make_tracking_only_permit(mt.schema().get(), "mt_to_sst", db::no_timeout), mt, std::move(sst), monitor, cfg)
+        return write_memtable_to_sstable(semaphore->make_tracking_only_permit(mt.schema(), "mt_to_sst", db::no_timeout), mt, std::move(sst), monitor, cfg)
        .finally([&semaphore] {
                return semaphore->stop();
        });
@@ -2543,7 +2548,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
    const bool need_static = db::view::needs_static_row(m.partition(), views);
    if (!need_regular && !need_static) {
        tracing::trace(tr_state, "View updates do not require read-before-write");
-        co_await generate_and_propagate_view_updates(base, sem.make_tracking_only_permit(s.get(), "push-view-updates-1", timeout), std::move(views), std::move(m), { }, std::move(tr_state), now);
+        co_await generate_and_propagate_view_updates(base, sem.make_tracking_only_permit(s, "push-view-updates-1", timeout), std::move(views), std::move(m), { }, std::move(tr_state), now);
        // In this case we are not doing a read-before-write, just a
        // write, so no lock is needed.
        co_return row_locker::lock_holder();
@@ -2576,7 +2581,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
    co_await utils::get_local_injector().inject("table_push_view_replica_updates_timeout", timeout);
    auto lock = co_await std::move(lockf);
    auto pk = dht::partition_range::make_singular(m.decorated_key());
-    auto permit = sem.make_tracking_only_permit(base.get(), "push-view-updates-2", timeout);
+    auto permit = sem.make_tracking_only_permit(base, "push-view-updates-2", timeout);
    auto reader = source.make_reader_v2(base, permit, pk, slice, io_priority, tr_state, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
    co_await this->generate_and_propagate_view_updates(base, std::move(permit), std::move(views), std::move(m), std::move(reader), tr_state, now);
    tracing::trace(tr_state, "View updates for {}.{} were generated and propagated", base->ks_name(), base->cf_name());
@@ -2661,7 +2666,7 @@ public:
        return _t.get_compaction_strategy();
    }
    reader_permit make_compaction_reader_permit() const override {
-        return _t.compaction_concurrency_semaphore().make_tracking_only_permit(schema().get(), "compaction", db::no_timeout);
+        return _t.compaction_concurrency_semaphore().make_tracking_only_permit(schema(), "compaction", db::no_timeout);
    }
    sstables::sstables_manager& get_sstables_manager() noexcept override {
        return _t.get_sstables_manager();
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -766,7 +766,7 @@ row_cache::make_reader_opt(schema_ptr s,
    }
 }

-row_cache::~row_cache() {
+void row_cache::clear_on_destruction() noexcept {
    with_allocator(_tracker.allocator(), [this] {
        _partitions.clear_and_dispose([this] (cache_entry* p) mutable noexcept {
            if (!p->is_dummy_entry()) {
@@ -777,6 +777,10 @@ row_cache::~row_cache() {
    });
 }

+row_cache::~row_cache() {
+    clear_on_destruction();
+}
+
 void row_cache::clear_now() noexcept {
    with_allocator(_tracker.allocator(), [this] {
        auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this] (cache_entry* p) noexcept {
@@ -973,7 +977,7 @@ future<> row_cache::do_update(external_updater eu, replica::memtable& m, Updater
                            // this layer has a chance to restore invariants before deferring,
                            // in particular set _prev_snapshot_pos to the correct value.
                            if (update.run() == stop_iteration::no) {
-                                return;
+                                break;
                            }
                            update = {};
                            real_dirty_acc.unpin_memory(size_entry);
@@ -1178,12 +1182,20 @@ row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker,
    , _underlying(src())
    , _snapshot_source(std::move(src))
 {
+  try {
    with_allocator(_tracker.allocator(), [this, cont] {
        cache_entry entry(cache_entry::dummy_entry_tag{});
        entry.set_continuous(bool(cont));
        auto raw_token = entry.position().token().raw();
        _partitions.insert(raw_token, std::move(entry), dht::ring_position_comparator{*_schema});
    });
+  } catch (...) {
+    // The code above might have allocated something in _partitions.
+    // The destructor of _partitions will be called with the wrong allocator,
+    // so we have to clear _partitions manually here, before it is destroyed.
+    clear_on_destruction();
+    throw;
+  }
 }

 cache_entry::cache_entry(cache_entry&& o) noexcept
@@ -1329,10 +1341,15 @@ future<> row_cache::do_update(row_cache::external_updater eu, row_cache::interna
        return get_units(_update_sem, 1);
    }).then([this, &eu, &iu] (auto permit) mutable {
      return eu.prepare().then([this, &eu, &iu, permit = std::move(permit)] () mutable {
-        auto pos = dht::ring_position::min();
-        eu.execute();
+        try {
+            eu.execute();
+        } catch (...) {
+            // Any error from execute is considered fatal
+            // to enforce exception safety.
+            on_fatal_internal_error(clogger, fmt::format("Fatal error during cache update: {}", std::current_exception()));
+        }
        [&] () noexcept {
-            _prev_snapshot_pos = std::move(pos);
+            _prev_snapshot_pos = dht::ring_position::min();
            _prev_snapshot = std::exchange(_underlying, _snapshot_source());
            ++_underlying_phase;
        }();
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -179,9 +179,19 @@ public:
    class external_updater_impl {
    public:
        virtual ~external_updater_impl() {}
+        // Prepare may return an exceptional future
+        // and the error is propagated to the row_cache::update caller.
+        // Hence, it must provide strong exception safety guarantees.
+        //
+        // Typically, `prepare` creates only temporary state
+        // to be atomically applied by `execute`, or, alternatively
+        // it must undo any side-effects on failure.
        virtual future<> prepare() { return make_ready_future<>(); }
        // FIXME: make execute() noexcept, that will require every updater to make execution exception safe,
        // also change function signature.
+        // See https://github.com/scylladb/scylladb/issues/15576
+        //
+        // For now, scylla aborts on any exception from `execute`.
        virtual void execute() = 0;
    };

@@ -268,6 +278,7 @@ private:
    void upgrade_entry(cache_entry&);
    void invalidate_locked(const dht::decorated_key&);
    void clear_now() noexcept;
+    void clear_on_destruction() noexcept;

    struct previous_entry_pointer {
        std::optional<dht::decorated_key> _key;
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -4,20 +4,20 @@ version = 3

 [[package]]
 name = "addr2line"
-version = "0.17.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b"
+checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
 dependencies = [
 "gimli",
 ]

 [[package]]
 name = "ahash"
-version = "0.7.6"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
+checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
 dependencies = [
- "getrandom",
+ "cfg-if",
 "once_cell",
 "version_check",
 ]
@@ -34,12 +34,6 @@ version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "216261ddc8289130e551ddcd5ce8a064710c0d064a4d2895c67151c92b5443f6"

-[[package]]
-name = "arrayvec"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
-
 [[package]]
 name = "async-trait"
 version = "0.1.59"
@@ -93,7 +87,7 @@ dependencies = [
 "cap-primitives",
 "cap-std",
 "io-lifetimes",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -109,7 +103,7 @@ dependencies = [
 "ipnet",
 "maybe-owned",
 "rustix",
- "windows-sys",
+ "windows-sys 0.42.0",
 "winx",
 ]

@@ -171,20 +165,19 @@ dependencies = [

 [[package]]
 name = "cranelift-bforest"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cb658ef043a07ea4086c65f2e3d770b5dc60b8787a9ef54cf06d792cf613d82"
+checksum = "1277fbfa94bc82c8ec4af2ded3e639d49ca5f7f3c7eeab2c66accd135ece4e70"
 dependencies = [
 "cranelift-entity",
 ]

 [[package]]
 name = "cranelift-codegen"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b36618d7ab9ad5da72935623292d364b5482ef42141e0145c0090bfc7f6b8dca"
+checksum = "c6e8c31ad3b2270e9aeec38723888fe1b0ace3bea2b06b3f749ccf46661d3220"
 dependencies = [
- "arrayvec",
 "bumpalo",
 "cranelift-bforest",
 "cranelift-codegen-meta",
@@ -192,7 +185,7 @@ dependencies = [
 "cranelift-entity",
 "cranelift-isle",
 "gimli",
- "hashbrown",
+ "hashbrown 0.13.2",
 "log",
 "regalloc2",
 "smallvec",
@@ -201,33 +194,33 @@ dependencies = [

 [[package]]
 name = "cranelift-codegen-meta"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb7cab168dac35a2fc53a3591ee36d145d7fc2ebbdb5c70f1f9e35764157af5a"
+checksum = "c8ac5ac30d62b2d66f12651f6b606dbdfd9c2cfd0908de6b387560a277c5c9da"
 dependencies = [
 "cranelift-codegen-shared",
 ]

 [[package]]
 name = "cranelift-codegen-shared"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7dcbdd64e35dfb910ff709e5b2d5e1348f626837685673726d985a620b9d8de5"
+checksum = "dd82b8b376247834b59ed9bdc0ddeb50f517452827d4a11bccf5937b213748b8"

 [[package]]
 name = "cranelift-entity"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a9e39cfc857e7e539aa623e03bb6bec11f54aef3dfdef41adcfa7b594af3b54"
+checksum = "40099d38061b37e505e63f89bab52199037a72b931ad4868d9089ff7268660b0"
 dependencies = [
 "serde",
 ]

 [[package]]
 name = "cranelift-frontend"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78d28039844e3f7817e5a10cbb3d9adbc7188ee9cc4ba43536f304219fcfc077"
+checksum = "64a25d9d0a0ae3079c463c34115ec59507b4707175454f0eee0891e83e30e82d"
 dependencies = [
 "cranelift-codegen",
 "log",
@@ -237,15 +230,15 @@ dependencies = [

 [[package]]
 name = "cranelift-isle"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4183c68346d657c40ea06273cc0e9c3fe25f4e51e6decf534c079f34041c43c0"
+checksum = "80de6a7d0486e4acbd5f9f87ec49912bf4c8fb6aea00087b989685460d4469ba"

 [[package]]
 name = "cranelift-native"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dbf72319054ff725a26c579b4070187928ca38e55111b964723bdbacbb1993e"
+checksum = "bb6b03e0e03801c4b3fd8ce0758a94750c07a44e7944cc0ffbf0d3f2e7c79b00"
 dependencies = [
 "cranelift-codegen",
 "libc",
@@ -254,9 +247,9 @@ dependencies = [

 [[package]]
 name = "cranelift-wasm"
-version = "0.92.1"
+version = "0.95.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3632b478ca00dfad77dbef3ce284f1199930519ab744827726a8e386a6db3f5"
+checksum = "ff3220489a3d928ad91e59dd7aeaa8b3de18afb554a6211213673a71c90737ac"
 dependencies = [
 "cranelift-codegen",
 "cranelift-entity",
@@ -334,23 +327,12 @@ checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"

 [[package]]
 name = "errno"
-version = "0.2.8"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
 dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
-dependencies = [
- "cc",
 "libc",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -367,7 +349,7 @@ checksum = "28c0190ff0bd3b28bfdd4d0cf9f92faa12880fb0b8ae2054723dd6c76a4efd42"
 dependencies = [
 "cfg-if",
 "rustix",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -387,7 +369,7 @@ checksum = "e25ca26b0001154679ce0901527330e6153b670d17ccd1f86bab4e45dfba1a74"
 dependencies = [
 "io-lifetimes",
 "rustix",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -501,9 +483,9 @@ dependencies = [

 [[package]]
 name = "gimli"
-version = "0.26.2"
+version = "0.27.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d"
+checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
 dependencies = [
 "fallible-iterator",
 "indexmap",
@@ -515,6 +497,12 @@ name = "hashbrown"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
 dependencies = [
 "ahash",
 ]
@@ -564,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399"
 dependencies = [
 "autocfg",
- "hashbrown",
+ "hashbrown 0.12.3",
 "serde",
 ]

@@ -575,7 +563,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b87bc110777311d7832025f38c4ab0f089f764644009edef3b5cbadfedee8c40"
 dependencies = [
 "io-lifetimes",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -585,7 +573,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
 dependencies = [
 "libc",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -603,7 +591,7 @@ dependencies = [
 "hermit-abi",
 "io-lifetimes",
 "rustix",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -689,21 +677,21 @@ dependencies = [

 [[package]]
 name = "memoffset"
-version = "0.6.5"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
+checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
 dependencies = [
 "autocfg",
 ]

 [[package]]
 name = "object"
-version = "0.29.0"
+version = "0.30.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53"
+checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
 dependencies = [
 "crc32fast",
- "hashbrown",
+ "hashbrown 0.13.2",
 "indexmap",
 "memchr",
 ]
@@ -834,9 +822,9 @@ dependencies = [

 [[package]]
 name = "regalloc2"
-version = "0.5.1"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "300d4fbfb40c1c66a78ba3ddd41c1110247cf52f97b87d0f2fc9209bd49b030c"
+checksum = "80535183cae11b149d618fbd3c37e38d7cda589d82d7769e196ca9a9042d7621"
 dependencies = [
 "fxhash",
 "log",
@@ -860,9 +848,9 @@ checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"

 [[package]]
 name = "rustix"
-version = "0.36.7"
+version = "0.36.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
+checksum = "305efbd14fde4139eb501df5f136994bb520b033fa9fbdce287507dc23b8c7ed"
 dependencies = [
 "bitflags",
 "errno",
@@ -871,7 +859,7 @@ dependencies = [
 "libc",
 "linux-raw-sys",
 "once_cell",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -953,7 +941,7 @@ dependencies = [
 "fd-lock",
 "io-lifetimes",
 "rustix",
- "windows-sys",
+ "windows-sys 0.42.0",
 "winx",
 ]

@@ -1098,9 +1086,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

 [[package]]
 name = "wasi-cap-std-sync"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11254257c965082b671fb876e63a69c25af8d68b2b742d785593192b28df87a8"
+checksum = "612510e6c7b6681f7d29ce70ef26e18349c26acd39b7d89f1727d90b7f58b20e"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -1117,42 +1105,43 @@ dependencies = [
 "system-interface",
 "tracing",
 "wasi-common",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasi-common"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54c08c84016536b2407809253aa6c47eacf86d5b5ecd7741b50d23f18b5bb045"
+checksum = "008136464e438c5049a614b6ea1bae9f6c4d354ce9ee2b4d9a1ac6e73f31aafc"
 dependencies = [
 "anyhow",
 "bitflags",
 "cap-rand",
 "cap-std",
 "io-extras",
+ "log",
 "rustix",
 "thiserror",
 "tracing",
 "wasmtime",
 "wiggle",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasm-encoder"
-version = "0.20.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05632e0a66a6ed8cca593c24223aabd6262f256c3693ad9822c315285f010614"
+checksum = "d05d0b6fcd0aeb98adf16e7975331b3c17222aa815148f5b976370ce589d80ef"
 dependencies = [
 "leb128",
 ]

 [[package]]
 name = "wasmparser"
-version = "0.96.0"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adde01ade41ab9a5d10ec8ed0bb954238cf8625b5cd5a13093d6de2ad9c2be1a"
+checksum = "48134de3d7598219ab9eaf6b91b15d8e50d31da76b8519fe4ecfcec2cf35104b"
 dependencies = [
 "indexmap",
 "url",
@@ -1160,9 +1149,9 @@ dependencies = [

 [[package]]
 name = "wasmtime"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49ffcc607adc9da024e87ca814592d4bc67f5c5b58e488f5608d5734a1ebc23e"
+checksum = "f907fdead3153cb9bfb7a93bbd5b62629472dc06dee83605358c64c52ed3dda9"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -1185,24 +1174,25 @@ dependencies = [
 "wasmtime-jit",
 "wasmtime-runtime",
 "wat",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasmtime-asm-macros"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cb5dc4d79cd7b2453c395f64e9013d2ad90bd083be556d5565cb224ebe8d57"
+checksum = "d3b9daa7c14cd4fa3edbf69de994408d5f4b7b0959ac13fa69d465f6597f810d"
 dependencies = [
 "cfg-if",
 ]

 [[package]]
 name = "wasmtime-component-macro"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db5c3d25a7d531582fbaa75ce6a86dddc8211c783cb247af053075a0fcc9d3d7"
+checksum = "267096ed7cc93b4ab15d3daa4f195e04dbb7e71c7e5c6457ae7d52e9dd9c3607"
 dependencies = [
+ "anyhow",
 "proc-macro2",
 "quote",
 "syn",
@@ -1213,15 +1203,15 @@ dependencies = [

 [[package]]
 name = "wasmtime-component-util"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff0e0e3a1310cdde7db4e8634bda696ca4f80c429fbd727fa827be5f9cb35d21"
+checksum = "74e02ca7a4a3c69d72b88f26f0192e333958df6892415ac9ab84dcc42c9000c2"

 [[package]]
 name = "wasmtime-cranelift"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a66a3f2167a7436910c6cbac2408a7b599688d7114cb8821cb10879dae451759"
+checksum = "b1cefde0cce8cb700b1b21b6298a3837dba46521affd7b8c38a9ee2c869eee04"
 dependencies = [
 "anyhow",
 "cranelift-codegen",
@@ -1235,14 +1225,30 @@ dependencies = [
 "target-lexicon",
 "thiserror",
 "wasmparser",
+ "wasmtime-cranelift-shared",
+ "wasmtime-environ",
+]
+
+[[package]]
+name = "wasmtime-cranelift-shared"
+version = "8.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd041e382ef5aea1b9fc78442394f1a4f6d676ce457e7076ca4cb3f397882f8b"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-native",
+ "gimli",
+ "object",
+ "target-lexicon",
 "wasmtime-environ",
 ]

 [[package]]
 name = "wasmtime-environ"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9350c919553cddf14f78f9452119c8004d7ef6bfebb79a41a21819ed0c5604d8"
+checksum = "a990198cee4197423045235bf89d3359e69bd2ea031005f4c2d901125955c949"
 dependencies = [
 "anyhow",
 "cranelift-entity",
@@ -1259,22 +1265,22 @@ dependencies = [

 [[package]]
 name = "wasmtime-fiber"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7459893ae6d67f9b35b04f44df8dfc037ea7f3071d710b9f7866b79cb2c482ae"
+checksum = "7ab182d5ab6273a133ab88db94d8ca86dc3e57e43d70baaa4d98f94ddbd7d10a"
 dependencies = [
 "cc",
 "cfg-if",
 "rustix",
 "wasmtime-asm-macros",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasmtime-jit"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "90ba5779ea786386432b94c9fc9ad5597346c319e8239db0d98d5be5cc109a7e"
+checksum = "0de48df552cfca1c9b750002d3e07b45772dd033b0b206d5c0968496abf31244"
 dependencies = [
 "addr2line",
 "anyhow",
@@ -1290,34 +1296,34 @@ dependencies = [
 "wasmtime-environ",
 "wasmtime-jit-icache-coherence",
 "wasmtime-runtime",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasmtime-jit-debug"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9841a44c82c74101c10ad4f215392761a2523b3c6c838597962bdb6de75fdb3"
+checksum = "6e0554b84c15a27d76281d06838aed94e13a77d7bf604bbbaf548aa20eb93846"
 dependencies = [
 "once_cell",
 ]

 [[package]]
 name = "wasmtime-jit-icache-coherence"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd4356c2493002da3b111d470c2ecea65a3017009afce8adc46eaa5758739891"
+checksum = "aecae978b13f7f67efb23bd827373ace4578f2137ec110bbf6a4a7cde4121bbd"
 dependencies = [
 "cfg-if",
 "libc",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasmtime-runtime"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd26efea7a790fcf430e663ba2519f0ab6eb8980adf8b0c58c62b727da77c2ec"
+checksum = "658cf6f325232b6760e202e5255d823da5e348fdea827eff0a2a22319000b441"
 dependencies = [
 "anyhow",
 "cc",
@@ -1335,14 +1341,14 @@ dependencies = [
 "wasmtime-environ",
 "wasmtime-fiber",
 "wasmtime-jit-debug",
- "windows-sys",
+ "windows-sys 0.45.0",
 ]

 [[package]]
 name = "wasmtime-types"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86e1e4f66a2b9a114f9def450ab9971828c968db6ea6fccd613724b771fa4913"
+checksum = "a4f6fffd2a1011887d57f07654dd112791e872e3ff4a2e626aee8059ee17f06f"
 dependencies = [
 "cranelift-entity",
 "serde",
@@ -1352,11 +1358,12 @@ dependencies = [

 [[package]]
 name = "wasmtime-wasi"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd1271c6ec6585929986d059fc2e2365e7033e32ae3bc761ed4715fd47128308"
+checksum = "4a3b5cb7606625ec229f0e33394a1637b34a58ad438526eba859b5fdb422ac1e"
 dependencies = [
 "anyhow",
+ "libc",
 "wasi-cap-std-sync",
 "wasi-common",
 "wasmtime",
@@ -1365,9 +1372,9 @@ dependencies = [

 [[package]]
 name = "wasmtime-wit-bindgen"
-version = "5.0.1"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d97566c073045a48b745f3559689295140f00fff7f2799efe8c89cc7e70ae007"
+checksum = "983db9cc294d1adaa892a53ff6a0dc6605fc0ab1a4da5d8a2d2d4bde871ff7dd"
 dependencies = [
 "anyhow",
 "heck",
@@ -1396,9 +1403,9 @@ dependencies = [

 [[package]]
 name = "wast"
-version = "50.0.0"
+version = "57.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2cbb59d4ac799842791fe7e806fa5dbbf6b5554d538e51cc8e176db6ff0ae34"
+checksum = "6eb0f5ed17ac4421193c7477da05892c2edafd67f9639e3c11a82086416662dc"
 dependencies = [
 "leb128",
 "memchr",
@@ -1408,18 +1415,18 @@ dependencies = [

 [[package]]
 name = "wat"
-version = "1.0.52"
+version = "1.0.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "584aaf7a1ecf4d383bbe1a25eeab0cbb8ff96acc6796707ff65cde48f4632f15"
+checksum = "ab9ab0d87337c3be2bb6fc5cd331c4ba9fd6bcb4ee85048a0dd59ed9ecf92e53"
 dependencies = [
- "wast 50.0.0",
+ "wast 57.0.0",
 ]

 [[package]]
 name = "wiggle"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63d256f306e99e90343029170d81154319a976292c35eba68b05792532fa365e"
+checksum = "6b16a7462893c46c6d3dd2a1f99925953bdbb921080606e1a4c9344864492fa4"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -1432,9 +1439,9 @@ dependencies = [

 [[package]]
 name = "wiggle-generate"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a0e55a87dcb350634c9f9b3ec08bfc87d7b05a0303a5fe8bb3134452ba3b62f"
+checksum = "489499e186ab24c8ac6d89e9934c54ced6f19bd473730e6a74f533bd67ecd905"
 dependencies = [
 "anyhow",
 "heck",
@@ -1447,9 +1454,9 @@ dependencies = [

 [[package]]
 name = "wiggle-macro"
-version = "5.0.0"
+version = "8.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b70901617926a441dbb03f3d208bd02b3fffbda13cadd9b17e7cf9389d9c067e"
+checksum = "e9142e7fce24a4344c85a43c8b719ef434fc6155223bade553e186cb4183b6cc"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -1485,56 +1492,146 @@ version = "0.42.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.0",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.0",
+ "windows_aarch64_msvc 0.52.0",
+ "windows_i686_gnu 0.52.0",
+ "windows_i686_msvc 0.52.0",
+ "windows_x86_64_gnu 0.52.0",
+ "windows_x86_64_gnullvm 0.52.0",
+ "windows_x86_64_msvc 0.52.0",
 ]

 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e"
+checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"

 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4"
+checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"

 [[package]]
 name = "windows_i686_gnu"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7"
+checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"

 [[package]]
 name = "windows_i686_msvc"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246"
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"

 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed"
+checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"

 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028"
+checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"

 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.42.0"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5"
+checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"

 [[package]]
 name = "winx"
@@ -1544,20 +1641,22 @@ checksum = "9baf690e238840de84bbfad6ad72d6628c41d34c1a5e276dab7fb2c9167ca1ac"
 dependencies = [
 "bitflags",
 "io-lifetimes",
- "windows-sys",
+ "windows-sys 0.42.0",
 ]

 [[package]]
 name = "wit-parser"
-version = "0.3.1"
+version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "703eb1d2f89ff2c52d50f7ff002735e423cea75f0a5dc5c8a4626c4c47cd9ca6"
+checksum = "f887c3da527a51b321076ebe6a7513026a4757b6d4d144259946552d6fc728b3"
 dependencies = [
 "anyhow",
 "id-arena",
 "indexmap",
+ "log",
 "pulldown-cmark",
 "unicode-xid",
+ "url",
 ]

 [[package]]
--- a/Show More
+++ b/Show More