doc: remove wrong image upgrade info (5.2-to-2023.1)

This commit removes the information about the recommended way of upgrading ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade procedure is not supported (it was implemented, but then reverted). Refs https://github.com/scylladb/scylladb/issues/15733 Closes scylladb/scylladb#21876 Fixes https://github.com/scylladb/scylla-enterprise/issues/5041 Fixes https://github.com/scylladb/scylladb/issues/21898 (cherry picked from commit 98860905d8)
db/config.cc: increment components_memory_reclaim_threshold config default
2024-12-12 15:28:20 +02:00 · 2024-06-04 07:13:28 +03:00 · 2024-05-30 11:11:39 +03:00 · 2024-05-30 11:10:49 +03:00 · 2024-05-27 08:52:06 +03:00 · 2024-05-26 16:30:06 +03:00
163 changed files with 3282 additions and 895 deletions
--- a/2
+++ b/2
@@ -72,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.2.13
+VERSION=5.2.19

 if test -f version
 then
--- a/api/api-doc/raft.json
+++ b/api/api-doc/raft.json
@@ -0,0 +1,43 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/raft",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/raft/trigger_snapshot/{group_id}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Triggers snapshot creation and log truncation for the given Raft group",
+               "type":"string",
+               "nickname":"trigger_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"group_id",
+                     "description":"The ID of the group which should get snapshotted",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api.cc
+++ b/api/api.cc
@@ -31,6 +31,7 @@
 #include "api/config.hh"
 #include "task_manager.hh"
 #include "task_manager_test.hh"
+#include "raft.hh"

 logging::logger apilog("api");

@@ -277,6 +278,18 @@ future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::confi

 #endif

+future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) { // Registers the raft API group and installs its routes on the HTTP server.
+    auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
+    return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) { // NOTE(review): ctx and raft_gr are captured by reference - presumably both outlive the routes; confirm with the caller
+        rb->register_function(r, "raft", "The Raft API");
+        set_raft(ctx, r, raft_gr); // installs the handlers declared in raft.hh
+    });
+}
+
+future<> unset_server_raft(http_context& ctx) { // Counterpart of set_server_raft(): removes the raft routes again.
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
+}
+
 void req_params::process(const request& req) {
    // Process mandatory parameters
    for (auto& [name, ent] : params) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -22,6 +22,7 @@ namespace service {
 class load_meter;
 class storage_proxy;
 class storage_service;
+class raft_group_registry;

 } // namespace service

@@ -116,5 +117,7 @@ future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx);
 future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::config> cfg);
+future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
+future<> unset_server_raft(http_context&);

 }
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include <seastar/core/coroutine.hh>
+
+#include "api/api.hh"
+#include "api/api-doc/raft.json.hh"
+
+#include "service/raft/raft_group_registry.hh"
+
+using namespace seastar::httpd;
+
+extern logging::logger apilog;
+
+namespace api {
+
+namespace r = httpd::raft_json;
+using namespace json;
+
+void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) { // Installs the handler for POST /raft/trigger_snapshot/{group_id}.
+    r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
+        raft::group_id gid{utils::UUID{req->param["group_id"]}}; // group_id path parameter, parsed as a UUID
+        auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] { // optional timeout query parameter, in seconds
+            if (timeout_str.empty()) {
+                return std::chrono::seconds{60}; // default when the parameter is not given
+            }
+            auto dur = std::stoll(timeout_str); // std::stoll throws on non-numeric or out-of-range input
+            if (dur <= 0) {
+                throw std::runtime_error{"Timeout must be a positive number."};
+            }
+            return std::chrono::seconds{dur};
+        });
+
+        std::atomic<bool> found_srv{false}; // set by any shard on which the group has a server
+        co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& raft_gr) -> future<> { // look the group up on every shard
+            auto* srv = raft_gr.find_server(gid);
+            if (!srv) {
+                co_return; // nothing to do on this shard
+            }
+
+            found_srv = true;
+            abort_on_expiry aoe(lowres_clock::now() + timeout_dur); // deadline for the snapshot request
+            apilog.info("Triggering Raft group {} snapshot", gid);
+            auto result = co_await srv->trigger_snapshot(&aoe.abort_source());
+            if (result) {
+                apilog.info("New snapshot for Raft group {} created", gid);
+            } else {
+                apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
+            }
+        });
+
+        if (!found_srv) {
+            throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)}; // no shard had a server for gid
+        }
+
+        co_return json_void{};
+    });
+}
+
+void unset_raft(http_context&, httpd::routes& r) { // Removes the trigger_snapshot handler installed by set_raft().
+    r::trigger_snapshot.unset(r);
+}
+
+}
+
--- a/api/raft.hh
+++ b/api/raft.hh
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2023-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "api_init.hh"
+
+namespace api {
+
+void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr);
+void unset_raft(http_context& ctx, httpd::routes& r);
+
+}
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -176,7 +176,9 @@ void set_task_manager(http_context& ctx, routes& r) {
        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
            return task->done().then_wrapped([task] (auto f) {
                task->unregister_task();
-                f.get();
+                // done() is called only because we want the task to be complete before getting its status.
+                // The future should be ignored here as the result does not matter.
+                f.ignore_ready_future();
                return make_foreign(task);
            });
        }));
@@ -204,8 +206,8 @@ void set_task_manager(http_context& ctx, routes& r) {
        while (!q.empty()) {
            auto& current = q.front();
            res.push_back(co_await retrieve_status(current));
-            for (auto i = 0; i < current->get_children().size(); ++i) {
-                q.push(co_await current->get_children()[i].copy());
+            for (auto& child: current->get_children()) {
+                q.push(co_await child.copy());
            }
            q.pop();
        }
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -25,6 +25,7 @@
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
 #include "utils/UUID_gen.hh"
+#include "utils/error_injection.hh"

 #include "cdc/generation.hh"
 #include "cdc/cdc_options.hh"
@@ -44,8 +45,16 @@ static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const

 namespace cdc {

-extern const api::timestamp_clock::duration generation_leeway =
-    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+api::timestamp_clock::duration get_generation_leeway() { // Returns the CDC generation leeway, 5 seconds unless raised by the error injection below.
+    static thread_local auto generation_leeway = // static thread_local: one persistent value per thread
+            std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+    utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] { // NOTE(review): presumably the callback runs only when this injection is enabled - confirm
+        generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5)); // the raised value persists for later calls on this thread
+    });
+
+    return generation_leeway;
+}

 static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    i = net::hton(i);
@@ -331,7 +340,7 @@ future<cdc::generation_id> generation_service::make_new_generation(const std::un
    auto new_generation_timestamp = [add_delay, ring_delay = _cfg.ring_delay] {
        auto ts = db_clock::now();
        if (add_delay && ring_delay != 0ms) {
-            ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
+            ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
        }
        return ts;
    };
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -46,6 +46,8 @@ namespace gms {

 namespace cdc {

+api::timestamp_clock::duration get_generation_leeway();
+
 class stream_id final {
    bytes _value;
 public:
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -15,10 +15,6 @@

 extern logging::logger cdc_log;

-namespace cdc {
-    extern const api::timestamp_clock::duration generation_leeway;
-} // namespace cdc
-
 static api::timestamp_type to_ts(db_clock::time_point tp) {
    // This assumes that timestamp_clock and db_clock have the same epochs.
    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -73,7 +69,7 @@ bool cdc::metadata::streams_available() const {

 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
-    if (ts > now + generation_leeway.count()) {
+    if (ts > now + get_generation_leeway().count()) {
        throw exceptions::invalid_request_exception(format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -86,27 +82,43 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // Nothing protects us from that until we start using transactions for generation switching.
    }

-    auto it = gen_used_at(now);
-    if (it == _gens.end()) {
+    auto it = gen_used_at(now - get_generation_leeway().count());
+
+    if (it != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        it = _gens.erase(_gens.begin(), it);
+    }
+
+    if (ts <= now - get_generation_leeway().count()) {
+        // We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
+        // happens iff one of the following is true:
+        // - the write is to no generation,
+        // - the write is to a generation older than the generation under `it`,
+        // - the write is to the generation under `it` and that generation is not the current generation.
+        // Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
+        // but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
+        // are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
+        // the generation under `it` because that generation was operating at `now - generation_leeway`.
+        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
+        if (it == _gens.end() || ts < it->first || is_previous_gen) {
+            throw exceptions::invalid_request_exception(format(
+                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
+                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                    " consistency properties.\n"
+                    "We *do* allow sending writes into the near past, but our ability to do that is limited."
+                    " Are you using client-side timestamps? Make sure your clocks are well-synchronized"
+                    " with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        }
+    }
+
+    it = _gens.begin();
+    if (it == _gens.end() || ts < it->first) {
        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
    }

-    // Garbage-collect generations that will no longer be used.
-    it = _gens.erase(_gens.begin(), it);
-
-    if (it->first > ts) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream from an earlier generation than the currently used one."
-                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                " consistency properties (write timestamp: {}, current generation started at: {})",
-                format_timestamp(ts), format_timestamp(it->first)));
-    }
-
-    // With `generation_leeway` we allow sending writes to the near future. It might happen
-    // that `ts` doesn't belong to the current generation ("current" according to our clock),
-    // but to the next generation. Adjust for this case:
+    // Find the generation operating at `ts`.
    {
        auto next_it = std::next(it);
        while (next_it != _gens.end() && next_it->first <= ts) {
@@ -147,8 +159,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
        ++it;
    }

-    // Check if some new generation has already superseded this one.
-    return it != _gens.end() && it->first <= api::new_timestamp();
+    // Check if the generation is obsolete.
+    return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
 }

 bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -157,7 +169,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
    }

    auto now = api::new_timestamp();
-    auto it = gen_used_at(now);
+    auto it = gen_used_at(now - get_generation_leeway().count());

    if (it != _gens.end()) {
        // Garbage-collect generations that will no longer be used.
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -42,7 +42,9 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
 public:
-    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
+     * it is older than the generation operating at `now - get_generation_leeway()`.
+     */
    bool known_or_obsolete(db_clock::time_point) const;

    /* Are there streams available. I.e. valid for time == now. If this is false, any writes to 
@@ -54,8 +56,9 @@ public:
     *
     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
-     * by the `cdc::generation_leeway` constant.
+     * yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
+     * the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
+     * `get_generation_leeway()`.
     */
    stream_id get_stream(api::timestamp_type ts, dht::token tok);

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -464,6 +464,7 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    double _estimated_droppable_tombstone_ratio = 0;
    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
@@ -521,7 +522,7 @@ protected:
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
-                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
+                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
    }

    void setup_new_sstable(shared_sstable& sst) {
@@ -584,6 +585,7 @@ protected:
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
        cfg.run_identifier = gc_run;
        cfg.monitor = monitor.get();
+        uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }
@@ -654,6 +656,7 @@ private:
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

+        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
@@ -688,12 +691,16 @@ private:
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            _rp = std::max(_rp, sst_stats.position);
+            auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state());
+            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
        }
        log_info("{} {}", report_start_desc(), formatted_msg);
        if (ssts->all()->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->all()->size(), _sstables.size());
        }
+        // _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
+        _estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->all()->size());

        _compacting = std::move(ssts);

@@ -1620,7 +1627,7 @@ private:
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
+                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
    }
 public:
    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1068,7 +1068,7 @@ void compaction_manager::submit(compaction::table_state& t) {

    // OK to drop future.
    // waited via task->stop()
-    (void)perform_task(make_shared<regular_compaction_task>(*this, t));
+    (void)perform_task(make_shared<regular_compaction_task>(*this, t)).then_wrapped([] (auto f) { f.ignore_ready_future(); });
 }

 bool compaction_manager::can_perform_regular_compaction(compaction::table_state& t) {
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -65,7 +65,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

-uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
+uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
    return partition_estimate;
 }

@@ -750,8 +750,8 @@ compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema
    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

-uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
+uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
+    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
 }

 reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) {
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -108,7 +108,7 @@ public:

    compaction_backlog_tracker make_backlog_tracker();

-    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr);

    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -70,7 +70,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema);

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -100,16 +100,27 @@ public:
    };
 };

-uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
-        // Not enough information, we assume the worst
-        return partition_estimate / max_data_segregation_window_count;
-    }
-    const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
-    const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
-    const auto window_size = get_window_size(_options);
+uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) { // Divides partition_estimate by the number of time windows the data is estimated to span.
+    // If not enough information, we assume the worst
+    auto estimated_window_count = max_data_segregation_window_count;
+    auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live()); // default TTL of the table, converted to microseconds
+    bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
+    auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) { // window count between the two windows, rounded up
+        const auto window_size = get_window_size(_options);
+        return (max_window + (window_size - 1) - min_window) / window_size;
+    };
 
-    auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
+    if (!min_and_max_ts_available && default_ttl.count()) { // no timestamps, but a non-zero default TTL: assume the data spans 0..default_ttl
+        auto min_window = get_window_for(_options, timestamp_type(0));
+        auto max_window = get_window_for(_options, timestamp_type(default_ttl.count()));
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    } else if (min_and_max_ts_available) { // timestamps known: use the windows of the min and max timestamps
+        auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
+        auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    }
 
     return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
 }
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -157,7 +157,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) override;

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) override;

--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -117,6 +117,8 @@ struct date_type_impl final : public concrete_type<db_clock::time_point> {

 using timestamp_date_base_class = concrete_type<db_clock::time_point>;

+sstring timestamp_to_json_string(const timestamp_date_base_class& t, const bytes_view& bv);
+
 struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    timeuuid_type_impl();
    static utils::UUID from_sstring(sstring_view s);
--- a/configure.py
+++ b/configure.py
@@ -698,6 +698,7 @@ scylla_core = (['message/messaging_service.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
+                'utils/on_internal_error.cc',
                'converting_mutation_partition_applier.cc',
                'readers/combined.cc',
                'readers/multishard.cc',
@@ -969,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
+                'repair/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1077,6 +1079,8 @@ api = ['api/api.cc',
       Json2Code('api/api-doc/error_injection.json'),
       'api/authorization_cache.cc',
       Json2Code('api/api-doc/authorization_cache.json'),
+       'api/raft.cc',
+       Json2Code('api/api-doc/raft.json'),
       ]

 alternator = [
@@ -1269,7 +1273,7 @@ deps['test/boost/bytes_ostream_test'] = [
    "test/lib/log.cc",
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
-deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
+deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc', 'utils/on_internal_error.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -932,6 +932,9 @@ void query_processor::migration_subscriber::on_update_aggregate(const sstring& k
 void query_processor::migration_subscriber::on_update_view(
         const sstring& ks_name,
         const sstring& view_name, bool columns_changed) {
+    // scylladb/scylladb#16392 - Materialized views are also tables, so we need to at least
+    // handle them as such when they change: forward to the table-update handler.
+    on_update_column_family(ks_name, view_name, columns_changed);
 }

 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -404,20 +404,19 @@ alter_table_statement::prepare_schema_mutations(query_processor& qp, api::timest

 std::unique_ptr<cql3::statements::prepared_statement>
 cql3::statements::alter_table_statement::prepare(data_dictionary::database db, cql_stats& stats) {
+    auto t = db.try_find_table(keyspace(), column_family());
+    std::optional<schema_ptr> s = t ? std::make_optional(t->schema()) : std::nullopt; // no schema when the table is not found
+    std::optional<sstring> warning = check_restricted_table_properties(db, s, keyspace(), column_family(), *_properties); // restricted-property check runs at prepare time
+    if (warning) {
+        mylogger.warn("{}", *warning); // the warning is only logged here
+    }
     return std::make_unique<prepared_statement>(make_shared<alter_table_statement>(*this));
 }

 future<::shared_ptr<messages::result_message>>
 alter_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    auto s = validation::validate_column_family(qp.db(), keyspace(), column_family());
-    std::optional<sstring> warning = check_restricted_table_properties(qp, s, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
+    validation::validate_column_family(qp.db(), keyspace(), column_family());
+    return schema_altering_statement::execute(qp, state, options);
 }

 }
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -185,6 +185,10 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    if (_properties.properties()->get_synchronous_updates_flag()) {
        throw exceptions::invalid_request_exception(format("The synchronous_updates option is only applicable to materialized views, not to base tables"));
    }
+    std::optional<sstring> warning = check_restricted_table_properties(db, std::nullopt, keyspace(), column_family(), *_properties.properties());
+    if (warning) {
+        mylogger.warn("{}", *warning);
+    }
    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());
@@ -426,7 +430,7 @@ void create_table_statement::raw_statement::add_column_alias(::shared_ptr<column
 // legal but restricted by the configuration. Checks for other of errors
 // in the table's options are done elsewhere.
 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops)
@@ -450,7 +454,7 @@ std::optional<sstring> check_restricted_table_properties(
    auto cs = (strategy) ? strategy : current_strategy;

    if (strategy && *strategy == sstables::compaction_strategy_type::date_tiered) {
-        switch(qp.db().get_config().restrict_dtcs()) {
+        switch(db.get_config().restrict_dtcs()) {
        case db::tri_mode_restriction_t::mode::TRUE:
            throw exceptions::configuration_exception(
                "DateTieredCompactionStrategy is deprecated, and "
@@ -471,7 +475,7 @@ std::optional<sstring> check_restricted_table_properties(
        std::map<sstring, sstring> options = (strategy) ? cfprops.get_compaction_type_options() : (*schema)->compaction_strategy_options();
        sstables::time_window_compaction_strategy_options twcs_options(options);
        long ttl = (cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE)) ? cfprops.get_default_time_to_live() : current_ttl.count();
-        auto max_windows = qp.db().get_config().twcs_max_window_count();
+        auto max_windows = db.get_config().twcs_max_window_count();

        // It may happen that an user tries to update an unrelated table property. Allow the request through.
        if (!cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE) && !strategy) {
@@ -491,7 +495,7 @@ std::optional<sstring> check_restricted_table_properties(
                                                   "highly discouraged.", ttl, twcs_options.get_sstable_window_size().count(), window_count, max_windows));
            }
        } else {
-              switch (qp.db().get_config().restrict_twcs_without_default_ttl()) {
+              switch (db.get_config().restrict_twcs_without_default_ttl()) {
              case db::tri_mode_restriction_t::mode::TRUE:
                  throw exceptions::configuration_exception(
                      "TimeWindowCompactionStrategy tables without a strict default_time_to_live setting "
@@ -510,18 +514,6 @@ std::optional<sstring> check_restricted_table_properties(
    return std::nullopt;
 }

-future<::shared_ptr<messages::result_message>>
-create_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    std::optional<sstring> warning = check_restricted_table_properties(qp, std::nullopt, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
-}
-
 }

 }
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -79,9 +79,6 @@ public:

    virtual future<> grant_permissions_to_creator(const service::client_state&) const override;

-    virtual future<::shared_ptr<messages::result_message>>
-    execute(query_processor& qp, service::query_state& state, const query_options& options) const override;
-
    schema_ptr get_cf_meta_data(const data_dictionary::database) const;

    class raw_statement;
@@ -129,7 +126,7 @@ public:
 };

 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops);
--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -485,7 +485,7 @@ struct to_json_string_visitor {
    sstring operator()(const string_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const bytes_type_impl& t) { return quote_json_string("0x" + t.to_string(bv)); }
    sstring operator()(const boolean_type_impl& t) { return t.to_string(bv); }
-    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(t.to_string(bv)); }
+    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(timestamp_to_json_string(t, bv)); }
    sstring operator()(const timeuuid_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const map_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const set_type_impl& t) { return to_json_string_aux(t, bv); }
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -2591,12 +2591,20 @@ db::commitlog::read_log_file(sstring filename, sstring pfx, seastar::io_priority
            return eof || next == pos;
        }
        future<> skip(size_t bytes) {
-            pos += bytes;
-            if (pos > file_size) {
+            auto n = std::min(file_size - pos, bytes);
+            pos += n;
+            if (pos == file_size) {
                eof = true;
-                pos = file_size;
            }
-            return fin.skip(bytes);
+            if (n < bytes) {
+                // If we are trying to skip past the end, then either
+                // the bytes skipped or the source from which we read
+                // this is corrupt. So add at least four bytes. This is
+                // inexact, but adding the full "bytes" is equally wrong
+                // since it could be completely garbled junk.
+                corrupt_size += std::max(n, sizeof(uint32_t));
+            }
+            return fin.skip(n);
        }
        void stop() {
            eof = true;
--- a/db/config.cc
+++ b/db/config.cc
@@ -406,6 +406,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Adjusts the sensitivity of the failure detector on an exponential scale. Generally this setting never needs adjusting.\n"
        "Related information: Failure detection and recovery")
    , failure_detector_timeout_in_ms(this, "failure_detector_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 20 * 1000, "Maximum time between two successful echo message before gossip mark a node down in milliseconds.\n")
+    , direct_failure_detector_ping_timeout_in_ms(this, "direct_failure_detector_ping_timeout_in_ms", value_status::Used, 600, "Duration after which the direct failure detector aborts a ping message, so the next ping can start.\n"
+        "Note: this failure detector is used by Raft, and is different from gossiper's failure detector (configured by `failure_detector_timeout_in_ms`).\n")
    /* Performance tuning properties */
    /* Tuning performance and system reso   urce utilization, including commit log, compaction, memory, disk I/O, CPU, reads, and writes. */
    /* Commit log settings */
@@ -817,6 +819,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit")
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default) "
        "bytes written to data file. Value must be between 0 and 1.")
+    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting")
--- a/db/config.hh
+++ b/db/config.hh
@@ -173,6 +173,7 @@ public:
    named_value<bool> snapshot_before_compaction;
    named_value<uint32_t> phi_convict_threshold;
    named_value<uint32_t> failure_detector_timeout_in_ms;
+    named_value<uint32_t> direct_failure_detector_ping_timeout_in_ms;
    named_value<sstring> commitlog_sync;
    named_value<uint32_t> commitlog_segment_size_in_mb;
    named_value<uint32_t> schema_commitlog_segment_size_in_mb;
@@ -322,6 +323,7 @@ public:
    named_value<unsigned> murmur3_partitioner_ignore_msb_bits;
    named_value<double> unspooled_dirty_soft_limit;
    named_value<double> sstable_summary_ratio;
+    named_value<double> components_memory_reclaim_threshold;
    named_value<size_t> large_memory_allocation_warning_threshold;
    named_value<bool> enable_deprecated_partitioners;
    named_value<bool> enable_keyspace_column_family_metrics;
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -157,7 +157,7 @@ future<> cql_table_large_data_handler::try_record(std::string_view large_table,
    const auto sstable_name = large_data_handler::sst_filename(sst);
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
+    large_data_logger.warn("Writing large {} {}/{}: {} ({} bytes) to {}", desc, ks_name, cf_name, extra_path, size, sstable_name);
    return _sys_ks->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -184,10 +184,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells(const sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name);
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -199,10 +199,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells_and_collectio
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
    }
 }

@@ -212,7 +212,7 @@ future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        std::string ck_str = key_to_str(*clustering_key, s);
-        return try_record("row", sst, partition_key, int64_t(row_size), "row", format("/{}", ck_str), extra_fields,  ck_str);
+        return try_record("row", sst, partition_key, int64_t(row_size), "row", "", extra_fields, ck_str);
    } else {
        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -2839,8 +2839,7 @@ static void install_virtual_readers(db::system_keyspace& sys_ks, replica::databa

 static bool maybe_write_in_user_memory(schema_ptr s) {
    return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
-            || s == system_keyspace::v3::scylla_views_builds_in_progress()
-            || s == system_keyspace::raft();
+            || s == system_keyspace::v3::scylla_views_builds_in_progress();
 }

 future<> system_keyspace_make(db::system_keyspace& sys_ks, distributed<replica::database>& dist_db, distributed<service::storage_service>& dist_ss, sharded<gms::gossiper>& dist_gossiper, distributed<service::raft_group_registry>& dist_raft_gr, db::config& cfg, table_selector& tables) {
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -487,37 +487,56 @@ mutation_partition& view_updates::partition_for(partition_key&& key) {
 }

 size_t view_updates::op_count() const {
-    return _op_count++;;
+    return _op_count;
 }

 row_marker view_updates::compute_row_marker(const clustering_or_static_row& base_row) const {
    /*
-     * We need to compute both the timestamp and expiration.
+     * We need to compute both the timestamp and expiration for view rows.
     *
-     * There are 3 cases:
-     *   1) There is a column that is not in the base PK but is in the view PK. In that case, as long as that column
-     *      lives, the view entry does too, but as soon as it expires (or is deleted for that matter) the entry also
-     *      should expire. So the expiration for the view is the one of that column, regardless of any other expiration.
-     *      To take an example of that case, if you have:
-     *        CREATE TABLE t (a int, b int, c int, PRIMARY KEY (a, b))
-     *        CREATE MATERIALIZED VIEW mv AS SELECT * FROM t WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)
-     *        INSERT INTO t(a, b) VALUES (0, 0) USING TTL 3;
-     *        UPDATE t SET c = 0 WHERE a = 0 AND b = 0;
-     *      then even after 3 seconds elapsed, the row will still exist (it just won't have a "row marker" anymore) and so
-     *      the MV should still have a corresponding entry.
-     *      This cell determines the liveness of the view row.
-     *   2) The columns for the base and view PKs are exactly the same, and all base columns are selected by the view.
-     *      In that case, all components (marker, deletion and cells) are the same and trivially mapped.
-     *   3) The columns for the base and view PKs are exactly the same, but some base columns are not selected in the view.
-     *      Use the max timestamp out of the base row marker and all the unselected columns - this ensures we can keep the
-     *      view row alive. Do the same thing for the expiration, if the marker is dead or will expire, and so
-     *      will all unselected columns.
+     * Below there are several distinct cases depending on how many new key
+     * columns the view has - i.e., how many of the view's key columns were
+     * regular columns in the base. base_regular_columns_in_view_pk.size():
+     *
+     * Zero new key columns:
+     *     The view row's key is composed only of base key columns, and those
+     *     cannot be changed in an update, so the view row remains alive as
+     *     long as the base row is alive. We need to return the same row
+     *     marker as the base for the view - to keep an empty view row alive
+     *     for as long as an empty base row exists.
+     *     Note that in this case, if there are *unselected* base columns, we
+     *     may need to keep an empty view row alive even without a row marker
+     *     because the base row (which has additional columns) is still alive.
+     *     For that we have the "virtual columns" feature: In the zero new
+     *     key columns case, we put unselected columns in the view as empty
+     *     columns, to keep the view row alive.
+     *
+     * One new key column:
+     *     In this case, there is a regular base column that is part of the
+     *     view key. This regular column can be added or deleted in an update,
+     *     or its expiration be set, and those can cause the view row -
+     *     including its row marker - to need to appear or disappear as well.
+     *     So the liveness of the cell of this one column determines the liveness
+     *     of the view row and the row marker that we return.
+     *
+     * Two or more new key columns:
+     *     This case is explicitly NOT supported in CQL - one cannot create a
+     *     view with more than one base-regular column in its key. In general
+     *     picking one liveness (timestamp and expiration) is not possible
+     *     if there are multiple regular base columns in the view key, as
+     *     those can have different liveness.
+     *     However, we do allow this case for Alternator - we need to allow
+     *     the case of two (but not more) because the DynamoDB API allows
+     *     creating a GSI whose two key columns (hash and range key) were
+     *     regular columns.
+     *     We can support this case in Alternator because it doesn't use
+     *     expiration (the "TTL" it does support is different), and doesn't
+     *     support user-defined timestamps. But, the two columns can still
+     *     have different timestamps - this happens if an update modifies
+     *     just one of them. In this case the timestamp of the view update
+     *     (and that of the row marker we return) is the later of these two
+     *     updated columns.
     */
-
-    // WARNING: The code assumes that if multiple regular base columns are present in the view key,
-    // they share liveness information. It's true especially in the only case currently allowed by CQL,
-    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
-    // which does not carry TTL information.
    const auto& col_ids = base_row.is_clustering_row()
            ? _base_info->base_regular_columns_in_view_pk()
            : _base_info->base_static_columns_in_view_pk();
@@ -525,7 +544,20 @@ row_marker view_updates::compute_row_marker(const clustering_or_static_row& base
        auto& def = _base->column_at(base_row.column_kind(), col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
        auto cell = base_row.cells().cell_at(col_ids[0]).as_atomic_cell(def);
-        return cell.is_live_and_has_ttl() ? row_marker(cell.timestamp(), cell.ttl(), cell.expiry()) : row_marker(cell.timestamp());
+        auto ts = cell.timestamp();
+        if (col_ids.size() > 1){
+            // As explained above, this case only happens in Alternator,
+            // and we may need to pick a higher ts:
+            auto& second_def = _base->column_at(base_row.column_kind(), col_ids[1]);
+            auto second_cell = base_row.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+            auto second_ts = second_cell.timestamp();
+            ts = std::max(ts, second_ts);
+            // Alternator isn't supposed to have TTL or more than two col_ids!
+            if (col_ids.size() != 2 || cell.is_live_and_has_ttl() || second_cell.is_live_and_has_ttl()) [[unlikely]] {
+                utils::on_internal_error(format("Unexpected col_ids length {} or has TTL", col_ids.size()));
+            }
+        }
+        return cell.is_live_and_has_ttl() ? row_marker(ts, cell.ttl(), cell.expiry()) : row_marker(ts);
    }

    return base_row.marker();
@@ -923,8 +955,22 @@ void view_updates::do_delete_old_entry(const partition_key& base_key, const clus
            // Note: multi-cell columns can't be part of the primary key.
            auto& def = _base->column_at(kind, col_ids[0]);
            auto cell = existing.cells().cell_at(col_ids[0]).as_atomic_cell(def);
+            auto ts = cell.timestamp();
+            if (col_ids.size() > 1) {
+                // This is the Alternator-only support for two regular base
+                // columns that become view key columns. See explanation in
+                // view_updates::compute_row_marker().
+                auto& second_def = _base->column_at(kind, col_ids[1]);
+                auto second_cell = existing.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+                auto second_ts = second_cell.timestamp();
+                ts = std::max(ts, second_ts);
+                // Alternator isn't supposed to have more than two col_ids!
+                if (col_ids.size() != 2) [[unlikely]] {
+                    utils::on_internal_error(format("Unexpected col_ids length {}", col_ids.size()));
+                }
+            }
            if (cell.is_live()) {
-                r->apply(shadowable_tombstone(cell.timestamp(), now));
+                r->apply(shadowable_tombstone(ts, now));
            }
        } else {
            // "update" caused the base row to have been deleted, and !col_id
@@ -1308,11 +1354,12 @@ void view_update_builder::generate_update(static_row&& update, const tombstone&

 future<stop_iteration> view_update_builder::on_results() {
    constexpr size_t max_rows_for_view_updates = 100;
-    size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
-        return acc + vu.op_count();
-    });
-    const bool stop_updates = rows_for_view_updates >= max_rows_for_view_updates;
-
+    auto should_stop_updates = [this] () -> bool {
+        size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
+            return acc + vu.op_count();
+        });
+        return rows_for_view_updates >= max_rows_for_view_updates;
+    };
    if (_update && !_update->is_end_of_partition() && _existing && !_existing->is_end_of_partition()) {
        auto cmp = position_in_partition::tri_compare(*_schema)(_update->position(), _existing->position());
        if (cmp < 0) {
@@ -1335,7 +1382,7 @@ future<stop_iteration> view_update_builder::on_results() {
                              : std::nullopt;
                generate_update(std::move(update), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
            }
-            return stop_updates ? stop() : advance_updates();
+            return should_stop_updates() ? stop() : advance_updates();
        }
        if (cmp > 0) {
            // We have something existing but no update (which will happen either because it's a range tombstone marker in
@@ -1371,7 +1418,7 @@ future<stop_iteration> view_update_builder::on_results() {
                    generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
                }
            }
-            return stop_updates ? stop () : advance_existings();
+            return should_stop_updates() ? stop () : advance_existings();
        }
        // We're updating a row that had pre-existing data
        if (_update->is_range_tombstone_change()) {
@@ -1393,8 +1440,9 @@ future<stop_iteration> view_update_builder::on_results() {
                                                  mutation_fragment_v2::printer(*_schema, *_update), mutation_fragment_v2::printer(*_schema, *_existing)));
            }
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, { std::move(*_existing).as_static_row() }, _existing_partition_tombstone);
+
        }
-        return stop_updates ? stop() : advance_all();
+        return should_stop_updates() ? stop() : advance_all();
    }

    auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
@@ -1409,7 +1457,7 @@ future<stop_iteration> view_update_builder::on_results() {
            auto update = static_row();
            generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_existings();
+        return should_stop_updates() ? stop() : advance_existings();
    }

    // If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
@@ -1430,7 +1478,7 @@ future<stop_iteration> view_update_builder::on_results() {
                          : std::nullopt;
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_updates();
+        return should_stop_updates() ? stop() : advance_updates();
    }

    return stop();
@@ -1609,6 +1657,13 @@ static bool should_update_synchronously(const schema& s) {
    return *tag_opt == "true";
 }

+size_t memory_usage_of(const frozen_mutation_and_schema& mut) {
+    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy, as well as possible background tasks
+    // allocated for a remote view update.
+    constexpr size_t base_overhead_bytes = 2288;
+    return base_overhead_bytes + mut.fm.representation().size();
+}
+
 // Take the view mutations generated by generate_view_updates(), which pertain
 // to a modification of a single base partition, and apply them to the
 // appropriate paired replicas. This is done asynchronously - we do not wait
@@ -1630,7 +1685,7 @@ future<> mutate_MV(
        auto& keyspace_name = mut.s->ks_name();
        auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto remote_endpoints = service::get_local_storage_proxy().get_token_metadata_ptr()->pending_endpoints_for(view_token, keyspace_name);
-        auto sem_units = pending_view_updates.split(mut.fm.representation().size());
+        auto sem_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_updates.split(memory_usage_of(mut)));

        const bool update_synchronously = should_update_synchronously(*mut.s);
        if (update_synchronously) {
@@ -1678,7 +1733,7 @@ future<> mutate_MV(
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
            local_view_update = service::get_local_storage_proxy().mutate_mv_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
-                            units = sem_units.split(sem_units.count())] (future<>&& f) {
+                            sem_units] (future<>&& f) {
                --stats.writes;
                if (f.failed()) {
                    ++stats.view_updates_failed_local;
@@ -1715,7 +1770,7 @@ future<> mutate_MV(
            schema_ptr s = mut.s;
            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
                    [s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
-                            units = sem_units.split(sem_units.count()), apply_update_synchronously] (future<>&& f) mutable {
+                            sem_units, apply_update_synchronously] (future<>&& f) mutable {
                if (f.failed()) {
                    stats.view_updates_failed_remote += updates_pushed_remote;
                    cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
@@ -2230,7 +2285,7 @@ future<> view_builder::do_build_step() {
            }
        }
    }).handle_exception([] (std::exception_ptr ex) {
-        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", std::current_exception());
+        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", ex);
    });
 }

--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -213,7 +213,7 @@ class view_updates final {
    schema_ptr _base;
    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
-    mutable size_t _op_count = 0;
+    size_t _op_count = 0;
 public:
    explicit view_updates(view_and_base vab)
            : _view(std::move(vab.view))
@@ -327,6 +327,8 @@ future<> mutate_MV(
        service::allow_hints allow_hints,
        wait_for_all_updates wait_for_all);

+size_t memory_usage_of(const frozen_mutation_and_schema& mut);
+
 /**
 * create_virtual_column() adds a "virtual column" to a schema builder.
 * The definition of a "virtual column" is based on the given definition
--- a/direct_failure_detector/failure_detector.cc
+++ b/direct_failure_detector/failure_detector.cc
@@ -96,6 +96,7 @@ struct failure_detector::impl {
    clock& _clock;

    clock::interval_t _ping_period;
+    clock::interval_t _ping_timeout;

    // Number of workers on each shard.
    // We use this to decide where to create new workers (we pick a shard with the smallest number of workers).
@@ -138,7 +139,7 @@ struct failure_detector::impl {
    // The unregistering process requires cross-shard operations which we perform on this fiber.
    future<> _destroy_subscriptions = make_ready_future<>();

-    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period);
+    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
    ~impl();

    // Inform update_endpoint_fiber() about an added/removed endpoint.
@@ -174,12 +175,14 @@ struct failure_detector::impl {
    future<> mark(listener* l, pinger::endpoint_id ep, bool alive);
 };

-failure_detector::failure_detector(pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period))
+failure_detector::failure_detector(
+    pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
 {}

-failure_detector::impl::impl(failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period) {
+failure_detector::impl::impl(
+    failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
    if (this_shard_id() != 0) {
        return;
    }
@@ -536,11 +539,9 @@ future<> endpoint_worker::ping_fiber() noexcept {
        auto start = clock.now();
        auto next_ping_start = start + _fd._ping_period;

-        // A ping should take significantly less time than _ping_period, but we give it a multiple of ping_period before it times out
-        // just in case of transient network partitions.
-        // However, if there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
+        auto timeout = start + _fd._ping_timeout;
+        // If there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
        // the listener (mark it as dead).
-        auto timeout = start + 3 * _fd._ping_period;
        for (auto& [threshold, l]: _fd._listeners_liveness) {
            if (l.endpoint_liveness[_id].alive && last_response + threshold < timeout) {
                timeout = last_response + threshold;
--- a/direct_failure_detector/failure_detector.hh
+++ b/direct_failure_detector/failure_detector.hh
@@ -120,14 +120,14 @@ public:

        // Every endpoint in the detected set will be periodically pinged every `ping_period`,
        // assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
-        // before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
-        // will start immediately.
-        //
-        // `ping_period` should be chosen so that during normal operation, a ping takes significantly
-        // less time than `ping_period` (preferably at least an order of magnitude less).
+        // before it's aborted (up to `ping_timeout`), in which case the next ping will start immediately.
        //
        // The passed-in value must be the same on every shard.
-        clock::interval_t ping_period
+        clock::interval_t ping_period,
+
+        // Duration after which a ping is aborted, so that the next ping can be started
+        // (pings are sent sequentially).
+        clock::interval_t ping_timeout
    );

    ~failure_detector();
@@ -147,7 +147,7 @@ public:
    // The listener stops being called when the returned subscription is destroyed.
    // The subscription must be destroyed before service is stopped.
    //
-    // `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
+    // `threshold` should be significantly larger than `ping_timeout`, preferably at least an order of magnitude larger.
    //
    // Different listeners may use different thresholds, depending on the use case:
    // some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -15,10 +15,84 @@ import grp
 import sys
 import stat
 import distro
+import logging
+import pyudev
 from pathlib import Path
 from scylla_util import *
 from subprocess import run, SubprocessError

+LOGGER = logging.getLogger(__name__)
+
+class UdevInfo:  # udev database view of one device node, looked up through pyudev
+    def __init__(self, device_file):
+        self.context = pyudev.Context()
+        self.device = pyudev.Devices.from_device_file(self.context, device_file)
+
+    def verify(self):  # logs each ID_FS_* property that is missing or unexpected; does not raise
+        if not self.id_fs_uuid:
+            LOGGER.error('ID_FS_UUID is not found')
+        if self.id_fs_type != 'xfs':
+            LOGGER.error('ID_FS_TYPE is not "xfs"')
+        if self.id_fs_usage != 'filesystem':
+            LOGGER.error('ID_FS_USAGE is not "filesystem"')
+
+    def dump_variables(self):  # logs the pyudev attributes and all udev properties at ERROR level
+        LOGGER.error(f'    sys_path: {self.device.sys_path}')
+        LOGGER.error(f'    sys_name: {self.device.sys_name}')
+        LOGGER.error(f'    sys_number: {self.device.sys_number}')
+        LOGGER.error(f'    device_path: {self.device.device_path}')
+        LOGGER.error(f'    tags: {list(self.device.tags)}')
+        LOGGER.error(f'    subsystem: {self.device.subsystem}')
+        LOGGER.error(f'    driver: {self.device.driver}')
+        LOGGER.error(f'    device_type: {self.device.device_type}')
+        LOGGER.error(f'    device_node: {self.device.device_node}')
+        LOGGER.error(f'    device_number: {self.device.device_number}')
+        LOGGER.error(f'    device_links: {list(self.device.device_links)}')
+        LOGGER.error(f'    is_initialized: {self.device.is_initialized}')
+        LOGGER.error(f'    time_since_initialized: {self.device.time_since_initialized}')
+        for k, v in self.device.properties.items():
+            LOGGER.error(f'    {k}: {v}')
+
+    @property
+    def id_fs_uuid(self):  # udev property ID_FS_UUID, or None when it is not set
+        return self.device.properties.get('ID_FS_UUID')
+
+    @property
+    def id_fs_type(self):  # udev property ID_FS_TYPE, or None when it is not set
+        return self.device.properties.get('ID_FS_TYPE')
+
+    @property
+    def id_fs_usage(self):  # udev property ID_FS_USAGE, or None when it is not set
+        return self.device.properties.get('ID_FS_USAGE')
+
+    @property
+    def uuid_link(self):  # first /dev/disk/by-uuid/ symlink of the device, or None if there is none
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-uuid/'):
+                return l
+
+    @property
+    def label_link(self):  # first /dev/disk/by-label/ symlink of the device, or None if there is none
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-label/'):
+                return l
+
+    @property
+    def partuuid_link(self):  # first /dev/disk/by-partuuid/ symlink of the device, or None if there is none
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-partuuid/'):
+                return l
+
+    @property
+    def path_link(self):  # first /dev/disk/by-path/ symlink of the device, or None if there is none
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-path/'):
+                return l
+
+    @property
+    def id_links(self):  # every /dev/disk/by-id symlink of the device (possibly an empty list)
+        return [l for l in self.device.device_links if l.startswith('/dev/disk/by-id')]
+
 if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
@@ -161,11 +235,27 @@ if __name__ == '__main__':

    os.makedirs(mount_at, exist_ok=True)

-    uuid = out(f'blkid -s UUID -o value {fsdev}')
-    if not uuid:
-        raise Exception(f'Failed to get UUID of {fsdev}')
-
-    uuidpath = f'/dev/disk/by-uuid/{uuid}'
+    udev_info = UdevInfo(fsdev)
+    mount_dev = None
+    if udev_info.uuid_link:
+        mount_dev = udev_info.uuid_link
+    else:
+        if udev_info.label_link:
+            mount_dev = udev_info.label_link
+            dev_type = 'label'
+        elif udev_info.partuuid_link:
+            mount_dev = udev_info.partuuid_link
+            dev_type = 'partuuid'
+        elif udev_info.path_link:
+            mount_dev = udev_info.path_link
+            dev_type = 'path'
+        elif udev_info.id_links:
+            mount_dev = udev_info.id_links[0]
+            dev_type = 'id'
+        else:
+            mount_dev = fsdev
+            dev_type = 'realpath'
+        LOGGER.error(f'Failed to detect uuid, using {dev_type}: {mount_dev}')

    after = ''
    wants = ''
@@ -183,7 +273,7 @@ Wants={wants}
 DefaultDependencies=no

 [Mount]
-What={uuidpath}
+What={mount_dev}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}
@@ -209,10 +299,18 @@ WantedBy=local-fs.target
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError as e:
-        if not os.path.exists(uuidpath):
-            print(f'\nERROR: {uuidpath} is not found\n')
-        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
-            print(f'\nERROR: {uuidpath} is not block device\n')
+        if mount_dev != fsdev:  # a udev symlink was chosen for What=; diagnose the link before the real device
+            if not os.path.islink(mount_dev):
+                LOGGER.error(f'{mount_dev} is not found')
+            elif not os.path.exists(mount_dev):  # it is a symlink, but its target does not exist
+                LOGGER.error(f'{mount_dev} is broken link')
+        if not os.path.exists(fsdev):
+            LOGGER.error(f'{fsdev} is not found')
+        elif not stat.S_ISBLK(os.stat(fsdev).st_mode):  # elif: os.stat() raises on a missing path and would mask `e`
+            LOGGER.error(f'{fsdev} is not block device')
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
        raise e

    if args.enable_on_nextboot:
@@ -228,3 +326,8 @@ WantedBy=local-fs.target

    if is_debian_variant():
        run('update-initramfs -u', shell=True, check=True)
+
+    if not udev_info.uuid_link:
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
--- a/docs/dev/cdc.md
+++ b/docs/dev/cdc.md
@@ -158,9 +158,27 @@ We're not able to prevent a node learning about a new generation too late due to
 However, it could happen that a node learns about the generation from gossip in time, but then won't be able to extract it from `cdc_generation_descriptions_v2`. In that case we can still maintain consistency: the node will remember that there is a new generation even though it doesn't yet know what it is (it knows only the ID, in particular it knows the timestamp) using the `cdc::metadata::prepare(db_clock::time_point)` method, and then _reject_ writes for CDC-enabled tables that are supposed to use this new generation. The node will keep trying to read the generation's data in background until it succeeds or sees that it's not necessary anymore (e.g. because the generation was already superseded by a new generation).
 Thus we give up availability for safety. This likely won't happen if the administrator ensures that the cluster is not partitioned before bootstrapping a new node. This problem will also be mitigated with a future patch.

-Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps.
-Suppose that a write is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`). Then we only allow the write if its timestamp is in the interval [`T`, `C + generation_leeway`), where `generation_leeway` is a small time-inteval constant (e.g. 5 seconds).
-Reason: we cannot allow writes before `T`, because they belong to the old generation whose token ranges might no longer refine the current vnodes, so the corresponding log write would not necessarily be colocated with the base write. We also cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+#### Generation switching: accepting writes
+
+Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps. We allow:
+- writes to the current and next generations unless they are too far into the future,
+- writes to the previous generations unless they are too far into the past.
+
+##### Writes to the current and next generations
+
+Suppose that a write with timestamp `W` is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`) such that `T <= W`. Then we only allow the write if `W < C + generation_leeway`, where `generation_leeway` is a small time-interval constant (e.g. 5 seconds).
+
+We cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+
+##### Writes to the previous generations
+
+This time suppose that `T > W`. Then we only allow the write if `W > C - generation_leeway` and there was a generation operating at `W`.
+
+We allow writes to previous generations to improve user experience. If a client generates timestamps by itself and clocks are not perfectly synchronized, there may be short periods of time around the moment of switching generations when the client's writes are rejected because they fall into one of the previous generations. Usually, the client can easily overcome this problem: it can simply retry the write a few times with a higher timestamp. Unfortunately, if a table additionally uses LWT, the client cannot increase the timestamp because LWT makes timestamps permanent. Once Paxos commits an entry with a given timestamp, Scylla will keep trying to apply that entry, with the same timestamp, until it succeeds. Applying the entry involves doing a CDC log table write. If that write fails, we are stuck. Allowing writes to the previous generations is also a probabilistic fix for this bug.
+
+Note that writing only to the previous generation might not be enough. With the Raft-based topology and tablets, we can add multiple nodes almost instantly. Then, we can have multiple generations with almost identical timestamps.
+
+We allow writes only to the recent past to reduce the number of generations that must be stored in memory.

 ### Streams description tables

--- a/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
+++ b/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
@@ -7,10 +7,11 @@

 .. Note:: 

-   If ``authenticator`` is set to ``PasswordAuthenticator`` - increase the replication factor of the ``system_auth`` keyspace.
-
-   For example:
-
+   If ``authenticator`` is set to ``PasswordAuthenticator``, increase the replication factor of the ``system_auth`` keyspace.
+   For example: 
+   
   ``ALTER KEYSPACE system_auth WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : <new_replication_factor>};``
+   
+   Ensure you run repair after you alter the keyspace. See :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`.

   It is recommended to set ``system_auth`` replication factor to the number of nodes in each DC.
--- a/docs/troubleshooting/debugging-large-partition.rst
+++ b/docs/troubleshooting/debugging-large-partition.rst
@@ -21,7 +21,7 @@ Any of the following:

  .. code-block:: none

-     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: PK[/CK[/COL]] (SIZE bytes) to SSTABLE_NAME
+     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: [COL] (SIZE bytes) to SSTABLE_NAME

  In this case, refer to :ref:`Troubleshooting Large Partition Tables <large-partition-table-configure>` for more information.

--- a/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
@@ -31,7 +31,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
@@ -180,4 +180,4 @@ Start the node

 Validate
 --------
-Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
+Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
--- a/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
@@ -34,7 +34,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending Scylla Manager (only available Scylla Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `Scylla Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `Scylla Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade steps
 =============
--- a/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
@@ -32,7 +32,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
--- a/docs/upgrade/ami-upgrade.rst
+++ b/docs/upgrade/ami-upgrade.rst
@@ -2,13 +2,14 @@
 Upgrade ScyllaDB Image: EC2 AMI, GCP, and Azure Images
 ======================================================

-To upgrade ScyllaDB images, you need to update:
+ScyllaDB images are based on **Ubuntu 22.04**.

-#. ScyllaDB packages. Since ScyllaDB Open Source **5.2** and ScyllaDB 
-   Enterprise **2023.1**, the images are based on **Ubuntu 22.04**. 
-   See the :doc:`upgrade guide <./index>` for your ScyllaDB version 
-   for instructions for updating ScyllaDB packages on Ubuntu.
-#. Underlying OS packages. ScyllaDB includes a list of 3rd party and OS packages 
-   tested with the ScyllaDB release. 
+If you’re using the ScyllaDB official image (recommended), follow the upgrade 
+instructions on the **Debian/Ubuntu** tab in the :doc:`upgrade guide </upgrade/index/>`
+for your ScyllaDB version.
+
+If you’re using your own image and have installed ScyllaDB packages for Ubuntu or Debian, 
+follow the extended upgrade procedure on the **EC2/GCP/Azure Ubuntu image** tab 
+in the :doc:`upgrade guide </upgrade/index/>` for your ScyllaDB version.

 To check your Scylla version, run the ``scylla --version`` command.
--- a/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
+++ b/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
@@ -167,54 +167,27 @@ Download and install the new release

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

-        Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+      Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.

-        There are two alternative upgrade procedures: upgrading ScyllaDB and simultaneously updating 3rd party and OS packages - recommended if you 
-        are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04, and upgrading ScyllaDB without updating 
-        any external packages.
+      If you’re using the ScyllaDB official image (recommended), see
+      the **Debian/Ubuntu** tab for upgrade instructions. If you’re using your
+      own image and have installed ScyllaDB packages for Ubuntu or Debian,
+      you need to apply an extended upgrade procedure:
+      
+      #. Update the ScyllaDB deb repo (see above).
+      #. Configure Java 1.8 (see above).
+      #. Install the new ScyllaDB version with the additional 
+         ``scylla-enterprise-machine-image`` package:

-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-enterprise-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB Enterprise version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB Enterprise packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-enterprise-packages-2022.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-enterprise-packages-<version>-<arch>.txt) -y``
-
-
-
-        To upgrade ScyllaDB without updating any external packages, follow the :ref:`download and installation instructions for Debian/Ubuntu <upgrade-debian-ubuntu-5.2-to-enterprise-2023.1>`.
+          .. code::
+         
+           sudo apt-get clean all
+           sudo apt-get update
+           sudo apt-get dist-upgrade scylla-enterprise
+           sudo apt-get dist-upgrade scylla-enterprise-machine-image

+      #. Run ``scylla_setup`` without running ``io_setup``.
+      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

 Start the node
 --------------
--- a/docs/using-scylla/cdc/cdc-stream-generations.rst
+++ b/docs/using-scylla/cdc/cdc-stream-generations.rst
@@ -124,58 +124,9 @@ Example: The Next Generation

   There are two entries with the same base partition key, but in different streams. One of them corresponds to the write made before the generation change, the other --- to the write made after the change.

-After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs. If you try to perform a write with a timestamp that is smaller than the new generation's timestamp, the write may be rejected, depending on the node you're connected to:
+After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs.

-* if the clock of the node you're connected to reports earlier time than the generation's timestamp, it will allow the write to be performed.
-* Otherwise, the write will be rejected.
-
-Therefore, if you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected while a new node is being bootstrapped.
-
-Example: rejecting writes to an old generation
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This is a continuation of the :ref:`previous example <next-gen>`; a second node was bootstrapped recently, thus a new generation superseded the previous one.
-
-#. Get the timestamp of the latest generation as an integer:
-
-   .. code-block:: cql
-
-    SELECT tounixtimestamp(time) FROM system_distributed.cdc_generation_timestamps WHERE key = 'timestamps';
-
-   result:
-
-   .. code-block:: none
-
-     system.tounixtimestamp(time)
-    ------------------------------
-                    1585152329484
-                    1585140283006
-
-    (2 rows)
-
-   Generation timestamps have millisecond resolution. Here, the latest generation's timestamp is equal to ``1585152329484`` milliseconds.
-
-#. Try to perform a write with a slightly smaller timestamp (remember that the ``USING TIMESTAMP`` clause expects a timestamp in **microseconds**):
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329483000;
-
-   result:
-
-   .. code-block:: none
-
-    InvalidRequest: Error from server: code=2200 [Invalid query] message="cdc: attempted to get a stream from an earlier generation than the currently used one. With CDC you cannot send writes with timestamps too far into the past, because that would break consistency properties (write timestamp: 2020/03/25 16:05:29, current generation started at: 2020/03/25 16:05:29)"
-
-   The write was rejected.
-
-#. Perform a write with a timestamp equal to the generation's timestamp:
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329484000;
-
-   The write succeeds.
+If the clock of the node you're connected to reports a time that is far from the write's timestamp, the node may reject the write. If you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected.

 The first generation's timestamp
 --------------------------------
--- a/docs/using-scylla/drivers/index.rst
+++ b/docs/using-scylla/drivers/index.rst
@@ -14,7 +14,7 @@ Scylla Drivers
 You can use Scylla with:

 * :doc:`Apache Cassandra CQL Compatible Drivers <cql-drivers/index>`
-* :doc:`AWS DynamoDB Compatible API Drivers <dynamo-drivers/index>`
+* :doc:`Amazon DynamoDB Compatible API Drivers <dynamo-drivers/index>`

 Additional drivers coming soon!

--- a/gms/version_generator.cc
+++ b/gms/version_generator.cc
@@ -8,6 +8,11 @@
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */

+#include <seastar/core/on_internal_error.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/smp.hh>
+#include "log.hh"
+#include "seastarx.hh"
 #include "version_generator.hh"

 namespace gms {
@@ -16,8 +21,15 @@ namespace version_generator {
 // For us, we run the gossiper on a single CPU, and don't need to use atomics.
 static int version = 0;

+static logging::logger logger("version_generator");
+
 int get_next_version() noexcept
 {
+    if (this_shard_id() != 0) [[unlikely]] {  // `version` is a plain non-atomic static, so only shard 0 may bump it
+        on_fatal_internal_error(logger, format(
+                "{} can only be called on shard 0, but it was called on shard {}",
+                __FUNCTION__, this_shard_id()));
+    }
     return ++version;
 }

--- a/install-dependencies.sh
+++ b/install-dependencies.sh
@@ -123,6 +123,7 @@ fedora_python3_packages=(
    python3-distro
    python3-click
    python3-six
+    python3-pyudev
 )

 pip_packages=(
--- a/main.cc
+++ b/main.cc
@@ -958,7 +958,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            supervisor::notify("starting direct failure detector service");
            fd.start(
                std::ref(fd_pinger), std::ref(fd_clock),
-                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count()).get();
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{100}}.count(),
+                service::direct_fd_clock::base::duration{std::chrono::milliseconds{cfg->direct_failure_detector_ping_timeout_in_ms()}}.count()).get();

            auto stop_fd = defer_verbose_shutdown("direct_failure_detector", [] {
                fd.stop().get();
@@ -1164,12 +1165,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            }).get();
            cfg->host_id = sys_ks.local().load_local_host_id().get0();

+            std::any stop_raft_api;
            if (raft_gr.local().is_enabled()) {
                auto my_raft_id = raft::server_id{cfg->host_id.uuid()};
                supervisor::notify("starting Raft Group Registry service");
                raft_gr.invoke_on_all([my_raft_id] (service::raft_group_registry& raft_gr) {
                    return raft_gr.start(my_raft_id);
                }).get();
+
+                api::set_server_raft(ctx, raft_gr).get();
+                stop_raft_api = defer_verbose_shutdown("Raft API", [&ctx] {
+                    api::unset_server_raft(ctx).get();
+                });
            } else {
                if (cfg->check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
                    startlog.error("Bad configuration: RAFT feature has to be enabled if BROADCAST_TABLES is enabled");
@@ -1177,7 +1184,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                }
            }

-
            group0_client.init().get();

            db::sstables_format_selector sst_format_selector(gossiper.local(), feature_service, db);
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -213,7 +213,7 @@ public:
            tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout)
            : _db(db)
            , _schema(std::move(s))
-            , _permit(_db.local().get_reader_concurrency_semaphore().make_tracking_only_permit(_schema.get(), "multishard-mutation-query", timeout))
+            , _permit(_db.local().get_reader_concurrency_semaphore().make_tracking_only_permit(_schema, "multishard-mutation-query", timeout))
            , _cmd(cmd)
            , _ranges(ranges)
            , _trace_state(std::move(trace_state))
--- a/mutation_fragment_stream_validator.hh
+++ b/mutation_fragment_stream_validator.hh
@@ -169,11 +169,11 @@ class mutation_fragment_stream_validating_filter {
    sstring _name_storage;
    std::string_view _name_view; // always valid
    mutation_fragment_stream_validation_level _validation_level;
+    bool _raise_errors;

 private:
-    sstring full_name() const;
-
-    mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s, mutation_fragment_stream_validation_level level);
+    mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s,
+            mutation_fragment_stream_validation_level level, bool raise_errors);

 public:
    /// Constructor.
@@ -181,12 +181,18 @@ public:
    /// \arg name is used in log messages to identify the validator, the
    ///     schema identity is added automatically
    /// \arg compare_keys enable validating clustering key monotonicity
-    mutation_fragment_stream_validating_filter(sstring name, const schema& s, mutation_fragment_stream_validation_level level);
-    mutation_fragment_stream_validating_filter(const char* name, const schema& s, mutation_fragment_stream_validation_level level);
+    mutation_fragment_stream_validating_filter(sstring name, const schema& s, mutation_fragment_stream_validation_level level, bool raise_errors = true);
+    mutation_fragment_stream_validating_filter(const char* name, const schema& s, mutation_fragment_stream_validation_level level, bool raise_errors = true);

    mutation_fragment_stream_validating_filter(mutation_fragment_stream_validating_filter&&) = delete;
    mutation_fragment_stream_validating_filter(const mutation_fragment_stream_validating_filter&) = delete;

+    sstring full_name() const;
+
+    bool raise_errors() const { return _raise_errors; }
+
+    const mutation_fragment_stream_validator& validator() const { return  _validator; }
+
    bool operator()(const dht::decorated_key& dk);
    bool operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos, std::optional<tombstone> new_current_tombstone);
    bool operator()(mutation_fragment::kind kind, position_in_partition_view pos);
@@ -197,5 +203,5 @@ public:
    void reset(const mutation_fragment_v2& mf);
    /// Equivalent to `operator()(partition_end{})`
    bool on_end_of_partition();
-    void on_end_of_stream();
+    bool on_end_of_stream();
 };
--- a/mutation_writer/multishard_writer.cc
+++ b/mutation_writer/multishard_writer.cc
@@ -113,7 +113,7 @@ future<> multishard_writer::make_shard_writer(unsigned shard) {
            reader = make_foreign(std::make_unique<flat_mutation_reader_v2>(std::move(reader)))] () mutable {
        auto s = gs.get();
        auto semaphore = std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "shard_writer");
-        auto permit = semaphore->make_tracking_only_permit(s.get(), "multishard-writer", db::no_timeout);
+        auto permit = semaphore->make_tracking_only_permit(s, "multishard-writer", db::no_timeout);
        auto this_shard_reader = make_foreign_reader(s, std::move(permit), std::move(reader));
        return make_foreign(std::make_unique<shard_writer>(gs.get(), std::move(semaphore), std::move(this_shard_reader), consumer));
    }).then([this, shard] (foreign_ptr<std::unique_ptr<shard_writer>> writer) {
--- a/query-result-writer.hh
+++ b/query-result-writer.hh
@@ -136,9 +136,9 @@ public:
            return stop_iteration::no;
        }
        if (!_slice.options.contains<partition_slice::option::allow_short_read>()) {
-            throw std::runtime_error(fmt::format(
-                    "Tombstones processed by unpaged query exceeds limit of {} (configured via query_tombstone_page_limit)",
-                    _tombstone_limit));
+            // The read is unpaged, we cannot interrupt it early without failing it.
+            // Better let it continue.
+            return stop_iteration::no;
        }
        return stop_iteration::yes;
    }
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -19,9 +19,10 @@ leader::~leader() {
 }

 fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
-        index_t commit_idx, failure_detector& failure_detector, fsm_config config) :
+        index_t commit_idx, failure_detector& failure_detector, fsm_config config,
+        seastar::condition_variable& sm_events) :
        _my_id(id), _current_term(current_term), _voted_for(voted_for),
-        _log(std::move(log)), _failure_detector(failure_detector), _config(config) {
+        _log(std::move(log)), _failure_detector(failure_detector), _config(config), _sm_events(sm_events) {
    if (id == raft::server_id{}) {
        throw std::invalid_argument("raft::fsm: raft instance cannot have id zero");
    }
@@ -41,10 +42,6 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
    }
 }

-fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,
-        failure_detector& failure_detector, fsm_config config) :
-        fsm(id, current_term, voted_for, std::move(log), index_t{0}, failure_detector, config) {}
-
 future<semaphore_units<>> fsm::wait_for_memory_permit(seastar::abort_source* as, size_t size) {
    check_is_leader();

@@ -296,20 +293,14 @@ void fsm::become_candidate(bool is_prevote, bool is_leadership_transfer) {
    }
 }

-future<fsm_output> fsm::poll_output() {
-    logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
+bool fsm::has_output() const {
+    logger.trace("fsm::has_output() {} stable index: {} last index: {}",
        _my_id, _log.stable_idx(), _log.last_idx());

-    while (true) {
-        auto diff = _log.last_idx() - _log.stable_idx();
+    auto diff = _log.last_idx() - _log.stable_idx();

-        if (diff > 0 || !_messages.empty() || !_observed.is_equal(*this) || _output.max_read_id_with_quorum ||
-                (is_leader() && leader_state().last_read_id_changed) || _output.snp || !_output.snps_to_drop.empty()) {
-            break;
-        }
-        co_await _sm_events.wait();
-    }
-    co_return get_output();
+    return diff > 0 || !_messages.empty() || !_observed.is_equal(*this) || _output.max_read_id_with_quorum
+        || (is_leader() && leader_state().last_read_id_changed) || _output.snp || !_output.snps_to_drop.empty();
 }

 fsm_output fsm::get_output() {
@@ -1019,7 +1010,7 @@ bool fsm::apply_snapshot(snapshot_descriptor snp, size_t max_trailing_entries, s
    // If the snapshot is local, _commit_idx is larger than snp.idx.
    // Otherwise snp.idx becomes the new commit index.
    _commit_idx = std::max(_commit_idx, snp.idx);
-    _output.snp.emplace(fsm_output::applied_snapshot{snp, local});
+    _output.snp.emplace(fsm_output::applied_snapshot{snp, local, max_trailing_entries});
    size_t units = _log.apply_snapshot(std::move(snp), max_trailing_entries, max_trailing_bytes);
    if (is_leader()) {
        logger.trace("apply_snapshot[{}]: signal {} available units", _my_id, units);
@@ -1132,7 +1123,6 @@ void fsm::stop() {
        // (in particular, abort waits on log_limiter_semaphore and prevent new ones).
        become_follower({});
    }
-    _sm_events.broken();
 }

 std::ostream& operator<<(std::ostream& os, const fsm& f) {
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -21,6 +21,11 @@ struct fsm_output {
    struct applied_snapshot {
        snapshot_descriptor snp;
        bool is_local;
+
+        // Always 0 for non-local snapshots.
+        size_t max_trailing_entries;
+
+        // FIXME: include max_trailing_bytes here and in store_snapshot_descriptor
    };
    std::optional<std::pair<term_t, server_id>> term_and_vote;
    std::vector<log_entry_ptr> log_entries;
@@ -36,14 +41,6 @@ struct fsm_output {
    std::optional<read_id> max_read_id_with_quorum;
    // Set to true if a leadership transfer was aborted since the last output
    bool abort_leadership_transfer;
-
-    // True if there is no new output
-    bool empty() const {
-        return !term_and_vote &&
-            log_entries.size() == 0 && messages.size() == 0 &&
-            committed.size() == 0 && !snp && snps_to_drop.empty() &&
-            !configuration;
-    }
 };

 struct fsm_config {
@@ -136,9 +133,13 @@ struct leader {
 // in-memory state machine with a catch-all API step(message)
 // method. The method handles any kind of input and performs the
 // needed state machine state transitions. To get state machine output
-// poll_output() function has to be called. This call produces an output
+// get_output() function has to be called. To check first if
+// any new output is present, call has_output(). To wait for
+// new output events, use the sm_events condition variable passed
+// to the fsm constructor; fsm signals it each time new output may appear.
+// The get_output() call produces an output
 // object, which encapsulates a list of actions that must be
-// performed until the next poll_output() call can be made. The time is
+// performed until the next get_output() call can be made. The time is
 // represented with a logical timer. The client is responsible for
 // periodically invoking tick() method, which advances the state
 // machine time and allows it to track such events as election or
@@ -226,7 +227,7 @@ private:
    std::vector<std::pair<server_id, rpc_message>> _messages;

    // Signaled when there is a IO event to process.
-    seastar::condition_variable _sm_events;
+    seastar::condition_variable& _sm_events;

    // Called when one of the replicas advances its match index
    // so it may be the case that some entries are committed now.
@@ -338,10 +339,8 @@ protected: // For testing

 public:
    explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
-            index_t commit_idx, failure_detector& failure_detector, fsm_config conf);
-
-    explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
-            failure_detector& failure_detector, fsm_config conf);
+            index_t commit_idx, failure_detector& failure_detector, fsm_config conf,
+            seastar::condition_variable& sm_events);

    bool is_leader() const {
        return std::holds_alternative<leader>(_state);
@@ -409,12 +408,9 @@ public:
    // committed to the persistent Raft log afterwards.
    template<typename T> const log_entry& add_entry(T command);

-    // Wait until there is, and return state machine output that
-    // needs to be handled.
-    // This includes a list of the entries that need
-    // to be logged. The logged entries are eventually
-    // discarded from the state machine after applying a snapshot.
-    future<fsm_output> poll_output();
+    // Check if there is any state machine output
+    // that `get_output()` will return.
+    bool has_output() const;

    // Get state machine output, if there is any. Doesn't
    // wait. It is public for use in testing.
@@ -427,7 +423,7 @@ public:

    // Feed one Raft RPC message into the state machine.
    // Advances the state machine state and generates output,
-    // accessible via poll_output().
+    // accessible via get_output().
    template <typename Message>
    void step(server_id from, Message&& msg);

--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -755,6 +755,18 @@ public:
    // apply call 'state_machine::load_snapshot(snapshot::id)'
    // Called during Raft server initialization only, should not
    // run in parallel with store.
+    //
+    // If you want to create a Raft cluster with a non-empty state
+    // machine, so that joining servers always receive a snapshot,
+    // you should:
+    // - make sure that members of the initial configuration have
+    //   the same state machine state,
+    // - set the initial snapshot index on members of the initial
+    //   configuration to 1,
+    // - set the initial snapshot index on all subsequently joining
+    //   servers to 0.
+    // This also works if you start with an empty state machine,
+    // so consider it as the go-to default.
    virtual future<snapshot_descriptor> load_snapshot_descriptor() = 0;

    // Persist given log entries.
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -98,6 +98,8 @@ public:
    future<entry_id> add_entry_on_leader(command command, seastar::abort_source* as);
    void register_metrics() override;
 private:
+    seastar::condition_variable _events;
+
    std::unique_ptr<rpc> _rpc;
    std::unique_ptr<state_machine> _state_machine;
    std::unique_ptr<persistence> _persistence;
@@ -112,6 +114,8 @@ private:
    std::optional<awaited_conf_change> _non_joint_conf_commit_promise;
    // Index of the last entry applied to `_state_machine`.
    index_t _applied_idx;
+    // Index of the last persisted snapshot descriptor.
+    index_t _snapshot_desc_idx;
    std::list<active_read> _reads;
    std::multimap<index_t, awaited_index> _awaited_indexes;

@@ -121,13 +125,20 @@ private:
    // Signaled when apply index is changed
    condition_variable _applied_index_changed;

+    // Signaled when _snapshot_desc_idx is changed
+    condition_variable _snapshot_desc_idx_changed;
+
    struct stop_apply_fiber{}; // exception to send when apply fiber is needs to be stopepd

    struct removed_from_config{}; // sent to applier_fiber when we're not a leader and we're outside the current configuration
+
+    struct trigger_snapshot_msg{};
+
    using applier_fiber_message = std::variant<
        std::vector<log_entry_ptr>,
        snapshot_descriptor,
-        removed_from_config>;
+        removed_from_config,
+        trigger_snapshot_msg>;
    queue<applier_fiber_message> _apply_entries = queue<applier_fiber_message>(10);

    struct stats {
@@ -201,6 +212,16 @@ private:
    };
    absl::flat_hash_map<server_id, append_request_queue> _append_request_status;

+    struct server_requests {
+        bool snapshot = false;
+
+        bool empty() const {
+            return !snapshot;
+        }
+    };
+
+    server_requests _new_server_requests;
+
    // Called to commit entries (on a leader or otherwise).
    void notify_waiters(std::map<index_t, op_status>& waiters, const std::vector<log_entry_ptr>& entries);

@@ -212,10 +233,15 @@ private:
    // to be applied.
    void signal_applied();

-    // This fiber processes FSM output by doing the following steps in order:
+    // Processes FSM output by doing the following steps in order:
    //  - persist the current term and vote
    //  - persist unstable log entries on disk.
    //  - send out messages
+    future<> process_fsm_output(index_t& stable_idx, fsm_output&&);
+
+    future<> process_server_requests(server_requests&&);
+
+    // Processes new FSM outputs and server requests as they appear.
    future<> io_fiber(index_t stable_idx);

    // This fiber runs in the background and applies committed entries.
@@ -265,6 +291,8 @@ private:
    // A helper to wait for a leader to get elected
    future<> wait_for_leader(seastar::abort_source* as);

+    virtual future<bool> trigger_snapshot(seastar::abort_source* as) override;
+
    // Get "safe to read" index from a leader
    future<read_barrier_reply> get_read_idx(server_id leader, seastar::abort_source* as);
    // Wait for an entry with a specific term to get committed or
@@ -337,12 +365,14 @@ future<> server_impl::start() {
                                     .append_request_threshold = _config.append_request_threshold,
                                     .max_log_size = _config.max_log_size,
                                     .enable_prevoting = _config.enable_prevoting
-                                 });
+                                 },
+                                 _events);

    _applied_idx = index_t{0};
+    _snapshot_desc_idx = index_t{0};
    if (snapshot.id) {
        co_await _state_machine->load_snapshot(snapshot.id);
-        _applied_idx = snapshot.idx;
+        _snapshot_desc_idx = _applied_idx = snapshot.idx;
    }

    if (!rpc_config.current.empty()) {
@@ -403,6 +433,54 @@ future<> server_impl::wait_for_leader(seastar::abort_source* as) {
    }
 }

+future<bool> server_impl::trigger_snapshot(seastar::abort_source* as) {
+    check_not_aborted();
+
+    if (_applied_idx <= _snapshot_desc_idx) {
+        logger.debug(
+            "[{}] trigger_snapshot: last persisted snapshot descriptor index is up-to-date"
+            ", applied index: {}, persisted snapshot descriptor index: {}, last fsm log index: {}"
+            ", last fsm snapshot index: {}", _id, _applied_idx, _snapshot_desc_idx,
+            _fsm->log_last_idx(), _fsm->log_last_snapshot_idx());
+        co_return false;
+    }
+
+    _new_server_requests.snapshot = true;
+    _events.signal();
+
+    // Wait for persisted snapshot index to catch up to this index.
+    auto awaited_idx = _applied_idx;
+
+    logger.debug("[{}] snapshot request waiting for index {}", _id, awaited_idx);
+
+    try {
+        optimized_optional<abort_source::subscription> sub;
+        if (as) {
+            as->check();
+            sub = as->subscribe([this] () noexcept { _snapshot_desc_idx_changed.broadcast(); });
+            assert(sub); // due to `check()` above
+        }
+        co_await _snapshot_desc_idx_changed.when([this, as, awaited_idx] {
+            return (as && as->abort_requested()) || awaited_idx <= _snapshot_desc_idx;
+        });
+        if (as) {
+            as->check();
+        }
+    } catch (abort_requested_exception&) {
+        throw request_aborted();
+    } catch (seastar::broken_condition_variable&) {
+        throw request_aborted();
+    }
+
+    logger.debug(
+        "[{}] snapshot request satisfied, awaited index {}, persisted snapshot descriptor index: {}"
+        ", current applied index {}, last fsm log index {}, last fsm snapshot index {}",
+        _id, awaited_idx, _snapshot_desc_idx, _applied_idx,
+        _fsm->log_last_idx(), _fsm->log_last_snapshot_idx());
+
+    co_return true;
+}
+
 future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abort_source* as) {
    // The entry may have been already committed and even applied
    // in case it was forwarded to the leader. In this case
@@ -917,141 +995,168 @@ static rpc_config_diff diff_address_sets(const server_address_set& prev, const c
    return result;
 }

+future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batch) {
+    if (batch.term_and_vote) {
+        // Current term and vote are always persisted
+        // together. A vote may change independently of
+        // term, but it's safe to update both in this
+        // case.
+        co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
+        _stats.store_term_and_vote++;
+    }
+
+    if (batch.snp) {
+        auto& [snp, is_local, max_trailing_entries] = *batch.snp;
+        logger.trace("[{}] io_fiber storing snapshot {}", _id, snp.id);
+        // Persist the snapshot
+        co_await _persistence->store_snapshot_descriptor(snp, max_trailing_entries);
+        _snapshot_desc_idx = snp.idx;
+        _snapshot_desc_idx_changed.broadcast();
+        _stats.store_snapshot++;
+        // If this is locally generated snapshot there is no need to
+        // load it.
+        if (!is_local) {
+            co_await _apply_entries.push_eventually(std::move(snp));
+        }
+    }
+
+    for (const auto& snp_id: batch.snps_to_drop) {
+        _state_machine->drop_snapshot(snp_id);
+    }
+
+    if (batch.log_entries.size()) {
+        auto& entries = batch.log_entries;
+
+        if (last_stable >= entries[0]->idx) {
+            co_await _persistence->truncate_log(entries[0]->idx);
+            _stats.truncate_persisted_log++;
+        }
+
+        utils::get_local_injector().inject("store_log_entries/test-failure",
+            [] { throw std::runtime_error("store_log_entries/test-failure"); });
+
+        // Combine saving and truncating into one call?
+        // will require persistence to keep track of last idx
+        co_await _persistence->store_log_entries(entries);
+
+        last_stable = (*entries.crbegin())->idx;
+        _stats.persisted_log_entries += entries.size();
+    }
+
+    // Update RPC server address mappings. Add servers which are joining
+    // the cluster according to the new configuration (obtained from the
+    // last_conf_idx).
+    //
+    // It should be done prior to sending the messages since the RPC
+    // module needs to know who should it send the messages to (actual
+    // network addresses of the joining servers).
+    rpc_config_diff rpc_diff;
+    if (batch.configuration) {
+        rpc_diff = diff_address_sets(get_rpc_config(), *batch.configuration);
+        for (const auto& addr: rpc_diff.joining) {
+            add_to_rpc_config(addr);
+        }
+        _rpc->on_configuration_change(rpc_diff.joining, {});
+    }
+
+     // After entries are persisted we can send messages.
+    for (auto&& m : batch.messages) {
+        try {
+            send_message(m.first, std::move(m.second));
+        } catch(...) {
+            // Not being able to send a message is not a critical error
+            logger.debug("[{}] io_fiber failed to send a message to {}: {}", _id, m.first, std::current_exception());
+        }
+    }
+
+    if (batch.configuration) {
+        for (const auto& addr: rpc_diff.leaving) {
+            abort_snapshot_transfer(addr.id);
+            remove_from_rpc_config(addr);
+        }
+        _rpc->on_configuration_change({}, rpc_diff.leaving);
+    }
+
+    // Process committed entries.
+    if (batch.committed.size()) {
+        if (_non_joint_conf_commit_promise) {
+            for (const auto& e: batch.committed) {
+                const auto* cfg = get_if<raft::configuration>(&e->data);
+                if (cfg != nullptr && !cfg->is_joint()) {
+                    std::exchange(_non_joint_conf_commit_promise, std::nullopt)->promise.set_value();
+                    break;
+                }
+            }
+        }
+        co_await _persistence->store_commit_idx(batch.committed.back()->idx);
+        _stats.queue_entries_for_apply += batch.committed.size();
+        co_await _apply_entries.push_eventually(std::move(batch.committed));
+    }
+
+    if (batch.max_read_id_with_quorum) {
+        while (!_reads.empty() && _reads.front().id <= batch.max_read_id_with_quorum) {
+            _reads.front().promise.set_value(_reads.front().idx);
+            _reads.pop_front();
+        }
+    }
+    if (!_fsm->is_leader()) {
+        if (_stepdown_promise) {
+            std::exchange(_stepdown_promise, std::nullopt)->set_value();
+        }
+        if (!_current_rpc_config.contains(_id)) {
+            // - It's important we push this after we pushed committed entries above. It
+            // will cause `applier_fiber` to drop waiters, which should be done after we
+            // notify all waiters for entries committed in this batch.
+            // - This may happen multiple times if `io_fiber` gets multiple batches when
+            // we're outside the configuration, but it should eventually (and generally
+            // quickly) stop happening (we're outside the config after all).
+            co_await _apply_entries.push_eventually(removed_from_config{});
+        }
+        // request aborts of snapshot transfers
+        abort_snapshot_transfers();
+        // abort all read barriers
+        for (auto& r : _reads) {
+            r.promise.set_value(not_a_leader{_fsm->current_leader()});
+        }
+        _reads.clear();
+    } else if (batch.abort_leadership_transfer) {
+        if (_stepdown_promise) {
+            std::exchange(_stepdown_promise, std::nullopt)->set_exception(timeout_error("Stepdown process timed out"));
+        }
+    }
+    if (_leader_promise && _fsm->current_leader()) {
+        std::exchange(_leader_promise, std::nullopt)->set_value();
+    }
+}
+
+future<> server_impl::process_server_requests(server_requests&& requests) {
+    if (requests.snapshot) {
+        co_await _apply_entries.push_eventually(trigger_snapshot_msg{});
+    }
+}
+
 future<> server_impl::io_fiber(index_t last_stable) {
    logger.trace("[{}] io_fiber start", _id);
    try {
        while (true) {
-            auto batch = co_await _fsm->poll_output();
+            bool has_fsm_output = false;
+            bool has_server_request = false;
+            co_await _events.when([this, &has_fsm_output, &has_server_request] {
+                has_fsm_output = _fsm->has_output();
+                has_server_request = !_new_server_requests.empty();
+                return has_fsm_output || has_server_request;
+            });
+
            _stats.polls++;

-            if (batch.term_and_vote) {
-                // Current term and vote are always persisted
-                // together. A vote may change independently of
-                // term, but it's safe to update both in this
-                // case.
-                co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
-                _stats.store_term_and_vote++;
+            if (has_fsm_output) {
+                auto batch = _fsm->get_output();
+                co_await process_fsm_output(last_stable, std::move(batch));
            }

-            if (batch.snp) {
-                auto& [snp, is_local] = *batch.snp;
-                logger.trace("[{}] io_fiber storing snapshot {}", _id, snp.id);
-                // Persist the snapshot
-                co_await _persistence->store_snapshot_descriptor(snp, is_local ? _config.snapshot_trailing : 0);
-                _stats.store_snapshot++;
-                // If this is locally generated snapshot there is no need to
-                // load it.
-                if (!is_local) {
-                    co_await _apply_entries.push_eventually(std::move(snp));
-                }
-            }
-
-            for (const auto& snp_id: batch.snps_to_drop) {
-                _state_machine->drop_snapshot(snp_id);
-            }
-
-            if (batch.log_entries.size()) {
-                auto& entries = batch.log_entries;
-
-                if (last_stable >= entries[0]->idx) {
-                    co_await _persistence->truncate_log(entries[0]->idx);
-                    _stats.truncate_persisted_log++;
-                }
-
-                utils::get_local_injector().inject("store_log_entries/test-failure",
-                    [] { throw std::runtime_error("store_log_entries/test-failure"); });
-
-                // Combine saving and truncating into one call?
-                // will require persistence to keep track of last idx
-                co_await _persistence->store_log_entries(entries);
-
-                last_stable = (*entries.crbegin())->idx;
-                _stats.persisted_log_entries += entries.size();
-            }
-
-            // Update RPC server address mappings. Add servers which are joining
-            // the cluster according to the new configuration (obtained from the
-            // last_conf_idx).
-            //
-            // It should be done prior to sending the messages since the RPC
-            // module needs to know who should it send the messages to (actual
-            // network addresses of the joining servers).
-            rpc_config_diff rpc_diff;
-            if (batch.configuration) {
-                rpc_diff = diff_address_sets(get_rpc_config(), *batch.configuration);
-                for (const auto& addr: rpc_diff.joining) {
-                    add_to_rpc_config(addr);
-                }
-                _rpc->on_configuration_change(rpc_diff.joining, {});
-            }
-
-             // After entries are persisted we can send messages.
-            for (auto&& m : batch.messages) {
-                try {
-                    send_message(m.first, std::move(m.second));
-                } catch(...) {
-                    // Not being able to send a message is not a critical error
-                    logger.debug("[{}] io_fiber failed to send a message to {}: {}", _id, m.first, std::current_exception());
-                }
-            }
-
-            if (batch.configuration) {
-                for (const auto& addr: rpc_diff.leaving) {
-                    abort_snapshot_transfer(addr.id);
-                    remove_from_rpc_config(addr);
-                }
-                _rpc->on_configuration_change({}, rpc_diff.leaving);
-            }
-
-            // Process committed entries.
-            if (batch.committed.size()) {
-                if (_non_joint_conf_commit_promise) {
-                    for (const auto& e: batch.committed) {
-                        const auto* cfg = get_if<raft::configuration>(&e->data);
-                        if (cfg != nullptr && !cfg->is_joint()) {
-                            std::exchange(_non_joint_conf_commit_promise, std::nullopt)->promise.set_value();
-                            break;
-                        }
-                    }
-                }
-                co_await _persistence->store_commit_idx(batch.committed.back()->idx);
-                _stats.queue_entries_for_apply += batch.committed.size();
-                co_await _apply_entries.push_eventually(std::move(batch.committed));
-            }
-
-            if (batch.max_read_id_with_quorum) {
-                while (!_reads.empty() && _reads.front().id <= batch.max_read_id_with_quorum) {
-                    _reads.front().promise.set_value(_reads.front().idx);
-                    _reads.pop_front();
-                }
-            }
-            if (!_fsm->is_leader()) {
-                if (_stepdown_promise) {
-                    std::exchange(_stepdown_promise, std::nullopt)->set_value();
-                }
-                if (!_current_rpc_config.contains(_id)) {
-                    // - It's important we push this after we pushed committed entries above. It
-                    // will cause `applier_fiber` to drop waiters, which should be done after we
-                    // notify all waiters for entries committed in this batch.
-                    // - This may happen multiple times if `io_fiber` gets multiple batches when
-                    // we're outside the configuration, but it should eventually (and generally
-                    // quickly) stop happening (we're outside the config after all).
-                    co_await _apply_entries.push_eventually(removed_from_config{});
-                }
-                // request aborts of snapshot transfers
-                abort_snapshot_transfers();
-                // abort all read barriers
-                for (auto& r : _reads) {
-                    r.promise.set_value(not_a_leader{_fsm->current_leader()});
-                }
-                _reads.clear();
-            } else if (batch.abort_leadership_transfer) {
-                if (_stepdown_promise) {
-                    std::exchange(_stepdown_promise, std::nullopt)->set_exception(timeout_error("Stepdown process timed out"));
-                }
-            }
-            if (_leader_promise && _fsm->current_leader()) {
-                std::exchange(_leader_promise, std::nullopt)->set_value();
+            if (has_server_request) {
+                auto requests = std::exchange(_new_server_requests, server_requests{});
+                co_await process_server_requests(std::move(requests));
            }
        }
    } catch (seastar::broken_condition_variable&) {
@@ -1064,6 +1169,18 @@ future<> server_impl::io_fiber(index_t last_stable) {
    co_return;
 }

+static bool is_closed_error(std::exception_ptr ep) {
+    try {
+        std::rethrow_exception(ep);
+    } catch (const seastar::rpc::remote_verb_error& e) {
+        return std::string_view{e.what()} == "connection is closed";
+    } catch (const seastar::rpc::closed_error&) {
+        return true;
+    } catch (...) {
+        return false;
+    }
+}
+
 void server_impl::send_snapshot(server_id dst, install_snapshot&& snp) {
    seastar::abort_source as;
    uint64_t id = _next_snapshot_transfer_id++;
@@ -1079,7 +1196,11 @@ void server_impl::send_snapshot(server_id dst, install_snapshot&& snp) {
            _snapshot_transfers.erase(dst);
            auto reply = raft::snapshot_reply{.current_term = _fsm->get_current_term(), .success = false};
            if (f.failed()) {
-                logger.error("[{}] Transferring snapshot to {} failed with: {}", _id, dst, f.get_exception());
+                auto ep = f.get_exception();
+                // Report our own or the remote's closed_error as a WARN instead of an ERROR.
+                // Workaround for scylladb/scylladb#12972 for ScyllaDB 5.2.
+                auto level = is_closed_error(ep) ? log_level::warn : log_level::error;
+                logger.log(level, "[{}] Transferring snapshot to {} failed with: {}", _id, dst, ep);
            } else {
                logger.trace("[{}] Transferred snapshot to {}", _id, dst);
                reply = f.get();
@@ -1200,6 +1321,23 @@ future<> server_impl::applier_fiber() {
                // it may never know the status of entries it submitted.
                drop_waiters();
                co_return;
+            },
+            [this] (const trigger_snapshot_msg&) -> future<> {
+                auto applied_term = _fsm->log_term_for(_applied_idx);
+                // last truncation index <= snapshot index <= applied index
+                assert(applied_term);
+
+                snapshot_descriptor snp;
+                snp.term = *applied_term;
+                snp.idx = _applied_idx;
+                snp.config = _fsm->log_last_conf_for(_applied_idx);
+                logger.trace("[{}] taking snapshot at term={}, idx={} due to request", _id, snp.term, snp.idx);
+                snp.id = co_await _state_machine->take_snapshot();
+                if (!_fsm->apply_snapshot(snp, 0, 0, true)) {
+                    logger.trace("[{}] while taking snapshot term={} idx={} id={} due to request,"
+                           " fsm received a later snapshot at idx={}", _id, snp.term, snp.idx, snp.id, _fsm->log_last_snapshot_idx());
+                }
+                _stats.snapshots_taken++;
            }
            ), v);

@@ -1348,6 +1486,8 @@ future<> server_impl::abort(sstring reason) {
    _aborted = std::move(reason);
    logger.trace("[{}]: abort() called", _id);
    _fsm->stop();
+    _events.broken();
+    _snapshot_desc_idx_changed.broken();

    // IO and applier fibers may update waiters and start new snapshot
    // transfers, so abort them first
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -224,6 +224,22 @@ public:
    // of two servers iff their IDs are different.
    virtual void register_metrics() = 0;

+    // Manually trigger snapshot creation and log truncation.
+    //
+    // Does nothing if the current apply index is less than or equal to the last persisted snapshot descriptor index
+    // and returns `false`.
+    //
+    // Otherwise returns `true`; when the future resolves, it is guaranteed that the snapshot descriptor
+    // is persisted, but not that the snapshot is loaded to the state machine yet (it will be eventually).
+    //
+    // The request may be resolved by the regular snapshotting mechanisms (e.g. a snapshot
+    // is created because the Raft log grows too large). In this case there is no guarantee
+    // how many trailing entries will be left behind the snapshot. However,
+    // if there are no operations running on the server concurrently with the request and all
+    // committed entries are already applied, the created snapshot is guaranteed to leave
+    // zero trailing entries.
+    virtual future<bool> trigger_snapshot(seastar::abort_source* as) = 0;
+
    // Ad hoc functions for testing
    virtual void wait_until_candidate() = 0;
    virtual future<> wait_election_done() = 0;
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -69,7 +69,8 @@ class reader_permit::impl
        : public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>
        , public enable_shared_from_this<reader_permit::impl> {
    reader_concurrency_semaphore& _semaphore;
-    const schema* _schema;
+    schema_ptr _schema;
+
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _base_resources;
@@ -120,18 +121,18 @@ private:
 public:
    struct value_tag {};

-    impl(reader_concurrency_semaphore& semaphore, const schema* const schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
+    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
        : _semaphore(semaphore)
-        , _schema(schema)
+        , _schema(std::move(schema))
        , _op_name_view(op_name)
        , _base_resources(base_resources)
        , _timeout(timeout)
    {
        _semaphore.on_permit_created(*this);
    }
-    impl(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
+    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout)
        : _semaphore(semaphore)
-        , _schema(schema)
+        , _schema(std::move(schema))
        , _op_name(std::move(op_name))
        , _op_name_view(_op_name)
        , _base_resources(base_resources)
@@ -181,7 +182,7 @@ public:
        return _semaphore;
    }

-    const ::schema* get_schema() const {
+    const schema_ptr& get_schema() const {
        return _schema;
    }

@@ -356,15 +357,15 @@ reader_permit::reader_permit(shared_ptr<impl> impl) : _impl(std::move(impl))
 {
 }

-reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
+reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, std::string_view op_name,
        reader_resources base_resources, db::timeout_clock::time_point timeout)
-    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, op_name, base_resources, timeout))
+    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, std::move(schema), op_name, base_resources, timeout))
 {
 }

-reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
+reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name,
        reader_resources base_resources, db::timeout_clock::time_point timeout)
-    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, std::move(op_name), base_resources, timeout))
+    : _impl(::seastar::make_shared<reader_permit::impl>(semaphore, std::move(schema), std::move(op_name), base_resources, timeout))
 {
 }

@@ -577,7 +578,7 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_groups permits;

    for (const auto& permit : list) {
-        permits[permit_group_key(permit.get_schema(), permit.get_op_name(), permit.get_state())].add(permit);
+        permits[permit_group_key(permit.get_schema().get(), permit.get_op_name(), permit.get_state())].add(permit);
    }

    permit_stats total;
@@ -1041,33 +1042,33 @@ void reader_concurrency_semaphore::on_permit_unblocked() noexcept {
    --_stats.blocked_permits;
 }

-future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, const char* const op_name, size_t memory,
+future<reader_permit> reader_concurrency_semaphore::obtain_permit(schema_ptr schema, const char* const op_name, size_t memory,
        db::timeout_clock::time_point timeout) {
-    auto permit = reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
+    auto permit = reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
    return do_wait_admission(permit).then([permit] () mutable {
        return std::move(permit);
    });
 }

-future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, sstring&& op_name, size_t memory,
+future<reader_permit> reader_concurrency_semaphore::obtain_permit(schema_ptr schema, sstring&& op_name, size_t memory,
        db::timeout_clock::time_point timeout) {
-    auto permit = reader_permit(*this, schema, std::move(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
+    auto permit = reader_permit(*this, std::move(schema), std::move(op_name), {1, static_cast<ssize_t>(memory)}, timeout);
    return do_wait_admission(permit).then([permit] () mutable {
        return std::move(permit);
    });
 }

-reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, const char* const op_name, db::timeout_clock::time_point timeout) {
-    return reader_permit(*this, schema, std::string_view(op_name), {}, timeout);
+reader_permit reader_concurrency_semaphore::make_tracking_only_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout) {
+    return reader_permit(*this, std::move(schema), std::string_view(op_name), {}, timeout);
 }

-reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, sstring&& op_name, db::timeout_clock::time_point timeout) {
-    return reader_permit(*this, schema, std::move(op_name), {}, timeout);
+reader_permit reader_concurrency_semaphore::make_tracking_only_permit(schema_ptr schema, sstring&& op_name, db::timeout_clock::time_point timeout) {
+    return reader_permit(*this, std::move(schema), std::move(op_name), {}, timeout);
 }

-future<> reader_concurrency_semaphore::with_permit(const schema* const schema, const char* const op_name, size_t memory,
+future<> reader_concurrency_semaphore::with_permit(schema_ptr schema, const char* const op_name, size_t memory,
        db::timeout_clock::time_point timeout, read_func func) {
-    return do_wait_admission(reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout), std::move(func));
+    return do_wait_admission(reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout), std::move(func));
 }

 future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, read_func func) {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -372,8 +372,8 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    future<reader_permit> obtain_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout);
-    future<reader_permit> obtain_permit(const schema* const schema, sstring&& op_name, size_t memory, db::timeout_clock::time_point timeout);
+    future<reader_permit> obtain_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout);
+    future<reader_permit> obtain_permit(schema_ptr schema, sstring&& op_name, size_t memory, db::timeout_clock::time_point timeout);

    /// Make a tracking only permit
    ///
@@ -388,8 +388,8 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    reader_permit make_tracking_only_permit(const schema* const schema, const char* const op_name, db::timeout_clock::time_point timeout);
-    reader_permit make_tracking_only_permit(const schema* const schema, sstring&& op_name, db::timeout_clock::time_point timeout);
+    reader_permit make_tracking_only_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout);
+    reader_permit make_tracking_only_permit(schema_ptr schema, sstring&& op_name, db::timeout_clock::time_point timeout);

    /// Run the function through the semaphore's execution stage with an admitted permit
    ///
@@ -410,7 +410,7 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    future<> with_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, read_func func);
+    future<> with_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, read_func func);

    /// Run the function through the semaphore's execution stage with a pre-admitted permit
    ///
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -95,9 +95,9 @@ private:
 private:
    reader_permit() = default;
    reader_permit(shared_ptr<impl>);
-    explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
+    explicit reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, std::string_view op_name,
            reader_resources base_resources, db::timeout_clock::time_point timeout);
-    explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
+    explicit reader_permit(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name,
            reader_resources base_resources, db::timeout_clock::time_point timeout);

    void on_waiting();
--- a/readers/multishard.cc
+++ b/readers/multishard.cc
@@ -231,8 +231,8 @@ private:
    flat_mutation_reader_v2_opt _reader;

 private:
-    void do_pause(flat_mutation_reader_v2 reader);
-    void maybe_pause(flat_mutation_reader_v2 reader);
+    void do_pause(flat_mutation_reader_v2 reader) noexcept;
+    void maybe_pause(flat_mutation_reader_v2 reader) noexcept;
    flat_mutation_reader_v2_opt try_resume();
    void update_next_position();
    void adjust_partition_slice();
@@ -281,12 +281,12 @@ public:
    }
 };

-void evictable_reader_v2::do_pause(flat_mutation_reader_v2 reader) {
+void evictable_reader_v2::do_pause(flat_mutation_reader_v2 reader) noexcept {
    assert(!_irh);
    _irh = _permit.semaphore().register_inactive_read(std::move(reader));
 }

-void evictable_reader_v2::maybe_pause(flat_mutation_reader_v2 reader) {
+void evictable_reader_v2::maybe_pause(flat_mutation_reader_v2 reader) noexcept {
    if (_auto_pause) {
        do_pause(std::move(reader));
    } else {
@@ -649,8 +649,17 @@ future<> evictable_reader_v2::fast_forward_to(const dht::partition_range& pr) {
        co_return;
    }
    if (auto reader_opt = try_resume()) {
-        co_await reader_opt->fast_forward_to(pr);
-        _range_override.reset();
+        std::exception_ptr ex;
+        try {
+            co_await reader_opt->fast_forward_to(pr);
+            _range_override.reset();
+        } catch (...) {
+            ex = std::current_exception();
+        }
+        if (ex) {
+            co_await reader_opt->close();
+            std::rethrow_exception(std::move(ex));
+        }
        maybe_pause(std::move(*reader_opt));
    }
 }
--- a/readers/mutation_reader.cc
+++ b/readers/mutation_reader.cc
@@ -191,7 +191,11 @@ void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {

 namespace {

-[[noreturn]] void on_validation_error(seastar::logger& l, const seastar::sstring& reason) {
+bool on_validation_error(seastar::logger& l, const mutation_fragment_stream_validating_filter& zis, const seastar::sstring& reason) {
+    if (!zis.raise_errors()) {
+        l.error("{}", reason);
+        return false;
+    }
    try {
        on_internal_error(l, reason);
    } catch (std::runtime_error& e) {
@@ -209,13 +213,13 @@ bool mutation_fragment_stream_validating_filter::operator()(const dht::decorated
        if (_validator(dk.token())) {
            return true;
        }
-        on_validation_error(mrlog, format("[validator {} for {}] Unexpected token: previous {}, current {}",
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected token: previous {}, current {}",
                static_cast<void*>(this), full_name(), _validator.previous_token(), dk.token()));
    } else {
        if (_validator(dk)) {
            return true;
        }
-        on_validation_error(mrlog, format("[validator {} for {}] Unexpected partition key: previous {}, current {}",
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected partition key: previous {}, current {}",
                static_cast<void*>(this), full_name(), _validator.previous_partition_key(), dk));
    }
 }
@@ -226,10 +230,11 @@ sstring mutation_fragment_stream_validating_filter::full_name() const {
 }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(const char* name_literal, sstring name_value, const schema& s,
-        mutation_fragment_stream_validation_level level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
    : _validator(s)
    , _name_storage(std::move(name_value))
    , _validation_level(level)
+    , _raise_errors(raise_errors)
 {
    if (name_literal) {
        _name_view = name_literal;
@@ -260,13 +265,13 @@ mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_
 }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(sstring name, const schema& s,
-        mutation_fragment_stream_validation_level level)
-    : mutation_fragment_stream_validating_filter(nullptr, std::move(name), s, level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
+    : mutation_fragment_stream_validating_filter(nullptr, std::move(name), s, level, raise_errors)
 { }

 mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(const char* name, const schema& s,
-        mutation_fragment_stream_validation_level level)
-    : mutation_fragment_stream_validating_filter(name, {}, s, level)
+        mutation_fragment_stream_validation_level level, bool raise_errors)
+    : mutation_fragment_stream_validating_filter(name, {}, s, level, raise_errors)
 { }

 bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos,
@@ -279,7 +284,9 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    mrlog.debug("[validator {}] {}:{} new_current_tombstone: {}", static_cast<void*>(this), kind, pos, new_current_tombstone);

-    if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
+    if (_validation_level == mutation_fragment_stream_validation_level::none) {
+        return true;
+    } else if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
        valid = _validator(kind, pos, new_current_tombstone);
    } else {
        valid = _validator(kind, new_current_tombstone);
@@ -287,18 +294,19 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    if (__builtin_expect(!valid, false)) {
        if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}:{}, current {}:{}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}:{}, current {}:{}",
                    static_cast<void*>(this), full_name(), _validator.previous_partition_key(), _validator.previous_mutation_fragment_kind(), _validator.previous_position(), kind, pos));
        } else if (_validation_level >= mutation_fragment_stream_validation_level::partition_key) {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}, current {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: partition key {}: previous {}, current {}",
                    static_cast<void*>(this), full_name(), _validator.previous_partition_key(), _validator.previous_mutation_fragment_kind(), kind));
        } else if (kind == mutation_fragment_v2::kind::partition_end && _validator.current_tombstone()) {
-            on_validation_error(mrlog, format("[validator {} for {}] Partition ended with active tombstone: {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Partition ended with active tombstone: {}",
                    static_cast<void*>(this), full_name(), _validator.current_tombstone()));
        } else {
-            on_validation_error(mrlog, format("[validator {} for {}] Unexpected mutation fragment: previous {}, current {}",
+            on_validation_error(mrlog, *this, format("[validator {} for {}] Unexpected mutation fragment: previous {}, current {}",
                    static_cast<void*>(this), full_name(), _validator.previous_mutation_fragment_kind(), kind));
        }
+        return false;
    }

    return true;
@@ -340,15 +348,16 @@ bool mutation_fragment_stream_validating_filter::on_end_of_partition() {
    return (*this)(mutation_fragment::kind::partition_end, position_in_partition_view(position_in_partition_view::end_of_partition_tag_t()));
 }

-void mutation_fragment_stream_validating_filter::on_end_of_stream() {
+bool mutation_fragment_stream_validating_filter::on_end_of_stream() {
    if (_validation_level < mutation_fragment_stream_validation_level::partition_region) {
-        return;
+        return true;
    }
    mrlog.debug("[validator {}] EOS", static_cast<const void*>(this));
    if (!_validator.on_end_of_stream()) {
-        on_validation_error(mrlog, format("[validator {} for {}] Stream ended with unclosed partition: {}", static_cast<const void*>(this), full_name(),
+        return on_validation_error(mrlog, *this, format("[validator {} for {}] Stream ended with unclosed partition: {}", static_cast<const void*>(this), full_name(),
                _validator.previous_mutation_fragment_kind()));
    }
+    return true;
 }

 static size_t compute_buffer_size(const schema& s, const flat_mutation_reader_v2::tracked_buffer& buffer)
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -17,6 +17,7 @@
 #include "gms/gossiper.hh"
 #include "service/priority_manager.hh"
 #include "message/messaging_service.hh"
+#include "repair/table_check.hh"
 #include "sstables/sstables.hh"
 #include "replica/database.hh"
 #include "db/config.hh"
@@ -669,9 +670,12 @@ future<> shard_repair_task_impl::repair_range(const dht::token_range& range, ::t
        co_return;
    }
    try {
-        co_await repair_cf_range_row_level(*this, cf, table_id, range, neighbors);
-    } catch (replica::no_such_column_family&) {
-        dropped_tables.insert(cf);
+        auto dropped = co_await repair::with_table_drop_silenced(db.local(), mm, table_id, [&] (const ::table_id& uuid) {
+            return repair_cf_range_row_level(*this, cf, table_id, range, neighbors);
+        });
+        if (dropped) {
+            dropped_tables.insert(cf);
+        }
    } catch (...) {
        nr_failed_ranges++;
        throw;
--- a/repair/row.hh
+++ b/repair/row.hh
@@ -50,6 +50,9 @@ public:
        }
        return *_mf;
    }
+    void reset_mutation_fragment() {
+        _mf = nullptr;
+    }
    frozen_mutation_fragment& get_frozen_mutation() {
        if (!_fm) {
            throw std::runtime_error("empty frozen_mutation_fragment");
@@ -69,7 +72,14 @@ public:
        if (!_fm) {
            throw std::runtime_error("empty size due to empty frozen_mutation_fragment");
        }
-        return _fm->representation().size();
+        auto size = sizeof(repair_row) + _fm->representation().size();
+        if (_boundary) {
+            size += _boundary->pk.external_memory_usage() + _boundary->position.external_memory_usage();
+        }
+        if (_mf) {
+            size += _mf->memory_usage();
+        }
+        return size;
    }
    const repair_sync_boundary& boundary() const {
        if (!_boundary) {
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -340,7 +340,9 @@ public:
            , _seed(seed)
            , _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
            , _reader(make_reader(db, cf, local_reader))
-    { }
+    {
+        pause();
+    }

    future<mutation_fragment_opt>
    read_mutation_fragment() {
@@ -672,6 +674,7 @@ void flush_rows(schema_ptr s, std::list<repair_row>& rows, lw_shared_ptr<repair_
            last_mf = mf;
            last_dk = r.get_dk_with_hash();
        }
+        r.reset_mutation_fragment();
    }
    if (last_mf && last_dk) {
        writer->do_write(std::move(last_dk), std::move(*last_mf)).get();
@@ -941,8 +944,8 @@ public:
    }

 private:
-    future<uint64_t> do_estimate_partitions_on_all_shards() {
-        return estimate_partitions(_db, _schema->ks_name(), _schema->cf_name(), _range);
+    future<uint64_t> do_estimate_partitions_on_all_shards(const dht::token_range& range) {
+        return estimate_partitions(_db, _schema->ks_name(), _schema->cf_name(), range);
    }

    future<uint64_t> do_estimate_partitions_on_local_shard() {
@@ -964,7 +967,7 @@ private:
                return repeat([this, &sharder, &partitions_sum] () mutable {
                    auto shard_range = sharder.next();
                    if (shard_range) {
-                        return do_estimate_partitions_on_all_shards().then([this, &partitions_sum] (uint64_t partitions) mutable {
+                        return do_estimate_partitions_on_all_shards(*shard_range).then([&partitions_sum] (uint64_t partitions) mutable {
                            partitions_sum += partitions;
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
@@ -1039,10 +1042,11 @@ private:
        auto hash = _repair_hasher.do_hash_for_mf(*_repair_reader.get_current_dk(), mf);
        repair_row r(freeze(*_schema, mf), position_in_partition(mf.position()), _repair_reader.get_current_dk(), hash, is_dirty_on_master::no);
        rlogger.trace("Reading: r.boundary={}, r.hash={}", r.boundary(), r.hash());
+        auto sz = r.size();
        _metrics.row_from_disk_nr++;
-        _metrics.row_from_disk_bytes += r.size();
-        cur_size += r.size();
-        new_rows_size += r.size();
+        _metrics.row_from_disk_bytes += sz;
+        cur_size += sz;
+        new_rows_size += sz;
        cur_rows.push_back(std::move(r));
        return stop_iteration::no;
    }
@@ -1242,6 +1246,7 @@ private:
                    // mutation_fragment attached because we have stored it in
                    // to_repair_rows_list above where the repair_row is created.
                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    r.reset_mutation_fragment();
                    auto dk_with_hash = r.get_dk_with_hash();
                    return _repair_writer->do_write(std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
                        row_diff.pop_front();
@@ -2785,6 +2790,26 @@ public:
                    });
                }).get();

+                if (!master.all_nodes().empty()) {
+                    // Use the average number of partitions, instead of the sum
+                    // of the partitions, as the estimated partitions in a
+                    // given range. The bigger the estimated partitions, the
+                    // more memory bloom filter for the sstable would consume.
+                    _estimated_partitions /= master.all_nodes().size();
+
+                    // In addition, assume that the difference between nodes is
+                    // less than 10% for regular repair. Underestimation will
+                    // not be a big problem since those sstables produced by
+                    // repair will go through off-strategy later anyway. The
+                    // worst case is that we have a worse false positive ratio
+                    // than expected temporarily when the sstable is still in
+                    // maintenance set.
+                    //
+                    // To save memory and have fewer distinct conditions, we
+                    // use the 10% estimation for RBNO repair as well.
+                    _estimated_partitions /= 10;
+                }
+
                parallel_for_each(master.all_nodes(), [&, this] (repair_node_state& ns) {
                    const auto& node = ns.node;
                    rlogger.trace("Get repair_set_estimated_partitions for node={}, estimated_partitions={}", node, _estimated_partitions);
--- a/repair/table_check.cc
+++ b/repair/table_check.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include "replica/database.hh"
+#include "repair/table_check.hh"
+#include "service/migration_manager.hh"
+
+namespace repair {
+
+future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid) {
+    co_await mm.container().invoke_on(0, [] (auto& mm) -> future<> { // read barrier is issued on shard 0
+        auto& group0_client = mm.get_group0_client();
+        abort_on_expiry aoe(lowres_clock::now() + std::chrono::seconds{10});
+        auto& as = aoe.abort_source();
+        auto sub = mm.get_abort_source().subscribe([&as] () noexcept { // forward migration_manager aborts to the barrier
+            if (!as.abort_requested()) {
+                as.request_abort();
+            }
+        });
+        // Await here instead of returning the future: aoe and sub must outlive the barrier, which holds &as.
+        co_await group0_client.perform_read_barrier(&as);
+    });
+    // After the barrier, report the table as dropped iff it no longer exists in the local database.
+    co_return !db.column_family_exists(uuid);
+}
+
+future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
+        std::function<future<>(const table_id&)> f) {
+    std::exception_ptr ex = nullptr; // set only by the catch-all handler below
+    try {
+        co_await f(uuid);
+        co_return table_dropped::no; // f finished without throwing
+    } catch (replica::no_such_column_family&) {
+        // No need to synchronize while we know the table was dropped.
+    } catch (...) {
+        // This node may still see a table while it is dropped on the remote node
+        // and so the remote node returns an error. In that case we want to skip
+        // that table and continue with the operation.
+        //
+        // But since RPC does not enable returning the exception type, the cause
+        // of the failure cannot be determined. Synchronize schema to see the latest
+        // changes and determine whether the table was dropped.
+        ex = std::current_exception();
+    }
+    // The sync runs here, outside the handler, because co_await is not permitted inside a catch block.
+    if (ex) {
+        auto dropped = co_await table_sync_and_check(db, mm, uuid);
+        if (!dropped) {
+            co_await coroutine::return_exception_ptr(std::move(ex)); // table still exists: propagate the original failure
+        }
+    }
+    co_return table_dropped::yes; // f threw no_such_column_family, or the drop was confirmed after the sync
+}
+
+}
--- a/repair/table_check.hh
+++ b/repair/table_check.hh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/util/bool_class.hh>
+
+#include "schema_fwd.hh"
+
+using namespace seastar;
+
+using table_dropped = bool_class<class table_dropped_tag>;
+
+namespace raft {
+class server;
+}
+
+namespace replica {
+class database;
+}
+
+namespace service {
+class migration_manager;
+}
+
+namespace repair {
+
+class database;
+
+future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid);
+
+// Runs function f on given table. If f throws and the table is dropped, the exception is swallowed.
+// This function is meant to handle no_such_column_family thrown on a remote node or a different shard, as it
+// synchronizes schema before checking the table. Prefer standard error handling whenever possible.
+future<table_dropped> with_table_drop_silenced(replica::database& db, service::migration_manager& mm, const table_id& uuid,
+        std::function<future<>(const table_id&)> f);
+
+}
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -1574,7 +1574,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
        if (querier_opt) {
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(s.get(), "data-query", cf.estimate_read_memory_cost(), timeout, read_func));
+            f = co_await coroutine::as_future(semaphore.with_permit(s, "data-query", cf.estimate_read_memory_cost(), timeout, read_func));
        }

        if (!f.failed()) {
@@ -1640,7 +1640,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
        if (querier_opt) {
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(s.get(), "mutation-query", cf.estimate_read_memory_cost(), timeout, read_func));
+            f = co_await coroutine::as_future(semaphore.with_permit(s, "mutation-query", cf.estimate_read_memory_cost(), timeout, read_func));
        }

        if (!f.failed()) {
@@ -1690,7 +1690,7 @@ reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() {
 }

 future<reader_permit> database::obtain_reader_permit(table& tbl, const char* const op_name, db::timeout_clock::time_point timeout) {
-    return get_reader_concurrency_semaphore().obtain_permit(tbl.schema().get(), op_name, tbl.estimate_read_memory_cost(), timeout);
+    return get_reader_concurrency_semaphore().obtain_permit(tbl.schema(), op_name, tbl.estimate_read_memory_cost(), timeout);
 }

 future<reader_permit> database::obtain_reader_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout) {
@@ -1760,7 +1760,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
            // counter state for each modified cell...

            tracing::trace(trace_state, "Reading counter values from the CF");
-            auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(m_schema.get(), "counter-read-before-write", timeout);
+            auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(m_schema, "counter-read-before-write", timeout);
            return counter_write_query(m_schema, cf.as_mutation_source(), std::move(permit), m.decorated_key(), slice, trace_state)
                    .then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
                // ...now, that we got existing state of all affected counter
@@ -2846,7 +2846,7 @@ flat_mutation_reader_v2 make_multishard_streaming_reader(distributed<replica::da
        }
        virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) override {
            auto& cf = _db.local().find_column_family(_table_id);
-            return semaphore().obtain_permit(schema.get(), description, cf.estimate_read_memory_cost(), timeout);
+            return semaphore().obtain_permit(schema, description, cf.estimate_read_memory_cost(), timeout);
        }
    };
    auto ms = mutation_source([&db] (schema_ptr s,
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -844,7 +844,7 @@ table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtabl
        auto metadata = mutation_source_metadata{};
        metadata.min_timestamp = old->get_min_timestamp();
        metadata.max_timestamp = old->get_max_timestamp();
-        auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count());
+        auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count(), _schema);

        if (!_async_gate.is_closed()) {
            co_await _compaction_manager.maybe_wait_for_sstable_count_reduction(cg.as_table_state());
@@ -874,7 +874,7 @@ table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtabl

        auto f = consumer(old->make_flush_reader(
            old->schema(),
-            compaction_concurrency_semaphore().make_tracking_only_permit(old->schema().get(), "try_flush_memtable_to_sstable()", db::no_timeout),
+            compaction_concurrency_semaphore().make_tracking_only_permit(old->schema(), "try_flush_memtable_to_sstable()", db::no_timeout),
            service::get_local_memtable_flush_priority()));

        // Switch back to default scheduling group for post-flush actions, to avoid them being starved by the memtable flush
@@ -1183,6 +1183,9 @@ compaction_group::update_main_sstable_list_on_compaction_completion(sstables::co
 future<>
 table::compact_all_sstables() {
    co_await flush();
+    // Forces off-strategy before major, so sstables previously sitting on maintenance set will be included
+    // in the compaction's input set, to provide same semantics as before maintenance set came into existence.
+    co_await perform_offstrategy_compaction();
    co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
        return _compaction_manager.perform_major_compaction(cg.as_table_state());
    });
@@ -1894,11 +1897,9 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
 }

 static size_t memory_usage_of(const utils::chunked_vector<frozen_mutation_and_schema>& ms) { // sums db::view::memory_usage_of() over every entry of `ms`
-    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
-    constexpr size_t base_overhead_bytes = 256;
    return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
-        return m.fm.representation().size();
-    }), size_t{base_overhead_bytes * ms.size()});
+        return db::view::memory_usage_of(m);
+    }), size_t{0}); // size_t seed: boost::accumulate sums in the seed's type, so a plain `0` would accumulate (and truncate) in int
 }

 /**
@@ -2218,7 +2219,7 @@ write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, sstables::
            std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "write_memtable_to_sstable"),
            cfg,
            [&mt, sst] (auto& monitor, auto& semaphore, auto& cfg) {
-        return write_memtable_to_sstable(semaphore->make_tracking_only_permit(mt.schema().get(), "mt_to_sst", db::no_timeout), mt, std::move(sst), monitor, cfg)
+        return write_memtable_to_sstable(semaphore->make_tracking_only_permit(mt.schema(), "mt_to_sst", db::no_timeout), mt, std::move(sst), monitor, cfg)
        .finally([&semaphore] {
                return semaphore->stop();
        });
@@ -2547,7 +2548,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
    const bool need_static = db::view::needs_static_row(m.partition(), views);
    if (!need_regular && !need_static) {
        tracing::trace(tr_state, "View updates do not require read-before-write");
-        co_await generate_and_propagate_view_updates(base, sem.make_tracking_only_permit(s.get(), "push-view-updates-1", timeout), std::move(views), std::move(m), { }, std::move(tr_state), now);
+        co_await generate_and_propagate_view_updates(base, sem.make_tracking_only_permit(s, "push-view-updates-1", timeout), std::move(views), std::move(m), { }, std::move(tr_state), now);
        // In this case we are not doing a read-before-write, just a
        // write, so no lock is needed.
        co_return row_locker::lock_holder();
@@ -2580,7 +2581,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
    co_await utils::get_local_injector().inject("table_push_view_replica_updates_timeout", timeout);
    auto lock = co_await std::move(lockf);
    auto pk = dht::partition_range::make_singular(m.decorated_key());
-    auto permit = sem.make_tracking_only_permit(base.get(), "push-view-updates-2", timeout);
+    auto permit = sem.make_tracking_only_permit(base, "push-view-updates-2", timeout);
    auto reader = source.make_reader_v2(base, permit, pk, slice, io_priority, tr_state, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
    co_await this->generate_and_propagate_view_updates(base, std::move(permit), std::move(views), std::move(m), std::move(reader), tr_state, now);
    tracing::trace(tr_state, "View updates for {}.{} were generated and propagated", base->ks_name(), base->cf_name());
@@ -2665,7 +2666,7 @@ public:
        return _t.get_compaction_strategy();
    }
    reader_permit make_compaction_reader_permit() const override { // builds the reader permit used for compaction reads of table _t
-        return _t.compaction_concurrency_semaphore().make_tracking_only_permit(schema().get(), "compaction", db::no_timeout);
+        return _t.compaction_concurrency_semaphore().make_tracking_only_permit(schema(), "compaction", db::no_timeout); // tracking-only permit on the table's compaction semaphore, created with db::no_timeout
    }
    sstables::sstables_manager& get_sstables_manager() noexcept override {
        return _t.get_sstables_manager();
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -977,7 +977,7 @@ future<> row_cache::do_update(external_updater eu, replica::memtable& m, Updater
                            // this layer has a chance to restore invariants before deferring,
                            // in particular set _prev_snapshot_pos to the correct value.
                            if (update.run() == stop_iteration::no) {
-                                return;
+                                break;
                            }
                            update = {};
                            real_dirty_acc.unpin_memory(size_entry);
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -327,23 +327,12 @@ checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"

 [[package]]
 name = "errno"
-version = "0.2.8"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
 dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
-dependencies = [
- "cc",
 "libc",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -859,9 +848,9 @@ checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"

 [[package]]
 name = "rustix"
-version = "0.36.7"
+version = "0.36.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
+checksum = "305efbd14fde4139eb501df5f136994bb520b033fa9fbdce287507dc23b8c7ed"
 dependencies = [
 "bitflags",
 "errno",
@@ -870,7 +859,7 @@ dependencies = [
 "libc",
 "linux-raw-sys",
 "once_cell",
- "windows-sys 0.42.0",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -1503,13 +1492,13 @@ version = "0.42.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
 ]

 [[package]]
@@ -1518,7 +1507,16 @@ version = "0.45.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.0",
 ]

 [[package]]
@@ -1527,13 +1525,28 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.0",
+ "windows_aarch64_msvc 0.52.0",
+ "windows_i686_gnu 0.52.0",
+ "windows_i686_msvc 0.52.0",
+ "windows_x86_64_gnu 0.52.0",
+ "windows_x86_64_gnullvm 0.52.0",
+ "windows_x86_64_msvc 0.52.0",
 ]

 [[package]]
@@ -1542,42 +1555,84 @@ version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"

+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"

+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"

+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"

+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"

+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"

+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"

+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
+
 [[package]]
 name = "winx"
 version = "0.34.0"
--- a/schema.cc
+++ b/schema.cc
@@ -35,6 +35,8 @@
 #include "utils/rjson.hh"
 #include "tombstone_gc_options.hh"
 #include "db/per_partition_rate_limit_extension.hh"
+#include "db/tags/utils.hh"
+#include "db/tags/extension.hh"

 constexpr int32_t schema::NAME_LENGTH;

@@ -907,8 +909,23 @@ std::ostream& schema::describe(replica::database& db, std::ostream& os, bool wit
    os << "\n    AND memtable_flush_period_in_ms = " << memtable_flush_period();
    os << "\n    AND min_index_interval = " << min_index_interval();
    os << "\n    AND read_repair_chance = " << read_repair_chance();
-    os << "\n    AND speculative_retry = '" << speculative_retry().to_sstring() << "';";
-    os << "\n";
+    os << "\n    AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
+    os << "\n    AND paxos_grace_seconds = " << paxos_grace_seconds().count();
+
+    auto tombstone_gc_str = tombstone_gc_options().to_sstring();
+    std::replace(tombstone_gc_str.begin(), tombstone_gc_str.end(), '"', '\'');
+    os << "\n    AND tombstone_gc = " << tombstone_gc_str;
+    
+    if (cdc_options().enabled()) {
+        os << "\n    AND cdc = " << cdc_options().to_sstring();
+    }
+    if (is_view() && !is_index(db, view_info()->base_id(), *this)) {
+        auto is_sync_update = db::find_tag(*this, db::SYNCHRONOUS_VIEW_UPDATES_TAG_KEY);
+        if (is_sync_update.has_value()) {
+            os << "\n    AND synchronous_updates = " << *is_sync_update;
+        }
+    }
+    os << ";\n";

    if (with_internals) {
        for (auto& cdef : dropped_columns()) {
--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -3566,7 +3566,10 @@ class scylla_io_queues(gdb.Command):
                    try:
                        gdb.write("\tCapacity tail:       {}\n".format(std_atomic(fg['_token_bucket']['_rovers']['tail']).get()))
                        gdb.write("\tCapacity head:       {}\n".format(std_atomic(fg['_token_bucket']['_rovers']['head']).get()))
-                        gdb.write("\tCapacity ceil:       {}\n".format(std_atomic(fg['_token_bucket']['_rovers']['ceil']).get()))
+                        try:
+                            gdb.write("\tCapacity ceil:       {}\n".format(std_atomic(fg['_token_bucket']['_rovers']['ceil']).get()))
+                        except gdb.error:
+                            pass
                    except gdb.error:
                        gdb.write("\tCapacity tail:       {}\n".format(std_atomic(fg['_capacity_tail']).get()))
                        gdb.write("\tCapacity head:       {}\n".format(std_atomic(fg['_capacity_head']).get()))
@@ -4115,7 +4118,7 @@ def find_sstables():
        system_sstables_manager = std_unique_ptr(db["_system_sstables_manager"]).get()
        for manager in (user_sstables_manager, system_sstables_manager):
            for sst_list_name in ("_active", "_undergoing_close"):
-                for sst in intrusive_list(manager[sst_list_name], link="_manager_link"):
+                for sst in intrusive_list(manager[sst_list_name], link="_manager_list_link"):
                    yield sst.address
    except gdb.error:
        # Scylla Enterprise 2020.1 compatibility
@@ -5128,7 +5131,12 @@ class scylla_read_stats(gdb.Command):
        total = permit_stats()

        for permit in intrusive_list(permit_list):
-            schema = permit['_schema']
+            try:
+                schema = permit['_schema']['_p']
+            except:
+                # schema is already a raw pointer in older versions
+                schema = permit['_schema']
+
            if schema:
                raw_schema = schema.dereference()['_raw']
                schema_name = "{}.{}".format(str(raw_schema['_ks_name']).replace('"', ''), str(raw_schema['_cf_name']).replace('"', ''))
--- a/2
+++ b/2
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -123,31 +123,38 @@ void migration_manager::init_messaging_service()
        });
        return netw::messaging_service::no_wait();
    });
-    _messaging.register_migration_request(std::bind_front(
-            [] (migration_manager& self, const rpc::client_info& cinfo, rpc::optional<netw::schema_pull_options> options)
+    _messaging.register_migration_request([this] (const rpc::client_info& cinfo, rpc::optional<netw::schema_pull_options> options) { // schema pull handler; returns the schema as frozen and/or canonical mutations
+        shard_id shard = (options && options->group0_snapshot_transfer) ? 0 : this_shard_id(); // group 0 snapshot transfers are served on shard 0, everything else on the receiving shard
+        return container().invoke_on(shard, std::bind_front(
+            [] (netw::msg_addr, rpc::optional<netw::schema_pull_options> options, migration_manager& self)
                -> future<rpc::tuple<std::vector<frozen_mutation>, std::vector<canonical_mutation>>> {
-        const auto cm_retval_supported = options && options->remote_supports_canonical_mutation_retval;
+            const auto cm_retval_supported = options && options->remote_supports_canonical_mutation_retval;

-        auto features = self._feat.cluster_schema_features();
-        auto& proxy = self._storage_proxy.container();
-        auto cm = co_await db::schema_tables::convert_schema_to_mutations(proxy, features);
-        if (options->group0_snapshot_transfer) {
-            // if `group0_snapshot_transfer` is `true`, the sender must also understand canonical mutations
-            // (`group0_snapshot_transfer` was added more recently).
-            if (!cm_retval_supported) {
-                on_internal_error(mlogger,
-                    "migration request handler: group0 snapshot transfer requested, but canonical mutations not supported");
+            auto features = self._feat.cluster_schema_features();
+            auto& proxy = self._storage_proxy.container();
+            semaphore_units<> guard; // when acquired below, held until the coroutine returns
+            if (options && options->group0_snapshot_transfer) { // rpc::optional may be disengaged; same check as the shard selection above
+                guard = co_await self._group0_client.hold_read_apply_mutex(self._as);
            }
-            cm.emplace_back(co_await db::system_keyspace::get_group0_history(proxy));
-        }
-        if (cm_retval_supported) {
-            co_return rpc::tuple(std::vector<frozen_mutation>{}, std::move(cm));
-        }
-        auto fm = boost::copy_range<std::vector<frozen_mutation>>(cm | boost::adaptors::transformed([&db = proxy.local().get_db().local()] (const canonical_mutation& cm) {
-            return cm.to_mutation(db.find_column_family(cm.column_family_id()).schema());
-        }));
-        co_return rpc::tuple(std::move(fm), std::move(cm));
-    }, std::ref(*this)));
+            auto cm = co_await db::schema_tables::convert_schema_to_mutations(proxy, features);
+            if (options && options->group0_snapshot_transfer) { // guard the dereference here too: a disengaged `options` means no snapshot transfer was requested
+                // if `group0_snapshot_transfer` is `true`, the sender must also understand canonical mutations
+                // (`group0_snapshot_transfer` was added more recently).
+                if (!cm_retval_supported) {
+                    on_internal_error(mlogger,
+                        "migration request handler: group0 snapshot transfer requested, but canonical mutations not supported");
+                }
+                cm.emplace_back(co_await db::system_keyspace::get_group0_history(proxy));
+            }
+            if (cm_retval_supported) {
+                co_return rpc::tuple(std::vector<frozen_mutation>{}, std::move(cm));
+            }
+            auto fm = boost::copy_range<std::vector<frozen_mutation>>(cm | boost::adaptors::transformed([&db = proxy.local().get_db().local()] (const canonical_mutation& cm) {
+                return cm.to_mutation(db.find_column_family(cm.column_family_id()).schema());
+            }));
+            co_return rpc::tuple(std::move(fm), std::move(cm));
+        }, netw::messaging_service::get_source(cinfo), std::move(options)));
+    });
    _messaging.register_schema_check([this] {
        return make_ready_future<table_schema_version>(_storage_proxy.get_db().local().get_version());
    });
--- a/service/migration_manager.hh
+++ b/service/migration_manager.hh
@@ -82,6 +82,10 @@ public:
    const migration_notifier& get_notifier() const { return _notifier; }
    service::storage_proxy& get_storage_proxy() { return _storage_proxy; }
    const service::storage_proxy& get_storage_proxy() const { return _storage_proxy; }
+    abort_source& get_abort_source() noexcept { return _as; }
+    const abort_source& get_abort_source() const noexcept { return _as; }
+    service::raft_group0_client& get_group0_client() noexcept { return _group0_client; }
+    const service::raft_group0_client& get_group0_client() const noexcept { return _group0_client; }

    future<> submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node = true);

--- a/service/misc_services.cc
+++ b/service/misc_services.cc
@@ -208,8 +208,10 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
            llogger.debug("Send CACHE_HITRATES update max_diff={}, published_nr={}", _diff, _published_nr);
            ++_published_nr;
            _published_time = now;
-            return _gossiper.add_local_application_state(gms::application_state::CACHE_HITRATES,
-                    gms::versioned_value::cache_hitrates(_gstate)).then([this, recalculate_duration] {
+            return container().invoke_on(0, [&gstate = _gstate] (cache_hitrate_calculator& self) {
+                return self._gossiper.add_local_application_state(gms::application_state::CACHE_HITRATES,
+                        gms::versioned_value::cache_hitrates(gstate));
+            }).then([recalculate_duration] {
                return recalculate_duration;
            });
        } else {
--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -381,8 +381,26 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id) {
    // we ensure we haven't missed any IP update in the map.
    load_initial_raft_address_map();
    group0_log.info("Server {} is starting group 0 with id {}", my_id, group0_id);
-    co_await _raft_gr.start_server_for_group(create_server_for_group0(group0_id, my_id));
+    auto srv_for_group0 = create_server_for_group0(group0_id, my_id);
+    auto& persistence = srv_for_group0.persistence;
+    auto& server = *srv_for_group0.server;
+    co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
    _group0.emplace<raft::group_id>(group0_id);
+
+    // Fix for scylladb/scylladb#16683:
+    // If the snapshot index is 0, trigger creation of a new snapshot
+    // so bootstrapping nodes will receive a snapshot transfer.
+    auto snap = co_await persistence.load_snapshot_descriptor();
+    if (snap.idx == raft::index_t{0}) {
+        group0_log.info("Detected snapshot with index=0, id={}, triggering new snapshot", snap.id);
+        bool created = co_await server.trigger_snapshot(&_abort_source);
+        if (created) {
+            snap = co_await persistence.load_snapshot_descriptor();
+            group0_log.info("New snapshot created, index={} id={}", snap.idx, snap.id);
+        } else {
+            group0_log.warn("Could not create new snapshot, there are no entries applied");
+        }
+    }
 }

 future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, bool as_voter) {
@@ -418,14 +436,22 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, bool as_
        if (server == nullptr) {
            // This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
            raft::configuration initial_configuration;
+            bool nontrivial_snapshot = false;
            if (g0_info.id == my_id) {
                // We were chosen as the discovery leader.
                // We should start a new group with this node as voter.
                group0_log.info("Server {} chosen as discovery leader; bootstrapping group 0 from scratch", my_id);
                initial_configuration.current.emplace(my_addr, true);
+                // Force snapshot transfer from us to subsequently joining servers.
+                // This is important for upgrade and recovery, where the group 0 state machine
+                // (schema tables in particular) is nonempty.
+                // In a fresh cluster this will trigger an empty snapshot transfer which is redundant but correct.
+                // See #14066.
+                nontrivial_snapshot = true;
            }
            // Bootstrap the initial configuration
-            co_await raft_sys_table_storage(_qp, group0_id, my_id).bootstrap(std::move(initial_configuration));
+            co_await raft_sys_table_storage(_qp, group0_id, my_id)
+                    .bootstrap(std::move(initial_configuration), nontrivial_snapshot);
            co_await start_server_for_group0(group0_id);
            server = &_raft_gr.group0();
            // FIXME if we crash now or after getting added to the config but before storing group 0 ID,
--- a/service/raft/raft_group0_client.cc
+++ b/service/raft/raft_group0_client.cc
@@ -230,6 +230,17 @@ static utils::UUID generate_group0_state_id(utils::UUID prev_state_id) {
    return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
 }

+future<> raft_group0_client::perform_read_barrier(seastar::abort_source* as) { // runs a read barrier on the group 0 server when Raft is enabled; otherwise resolves immediately
+    if (this_shard_id() != 0) { // shard-0-only API: a call from any other shard is reported as an internal error
+        on_internal_error(logger, "perform_read_barrier: must run on shard 0");
+    }
+
+    if (_raft_gr.is_enabled()) {
+        return _raft_gr.group0().read_barrier(as); // `as` is forwarded to read_barrier() unchanged
+    }
+    return make_ready_future(); // Raft not enabled: no barrier is performed
+}
+
 future<group0_guard> raft_group0_client::start_operation(seastar::abort_source* as) {
    if (this_shard_id() != 0) {
        on_internal_error(logger, "start_group0_operation: must run on shard 0");
@@ -367,6 +378,14 @@ future<> raft_group0_client::wait_until_group0_upgraded(abort_source& as) {
    }
 }

+future<semaphore_units<>> raft_group0_client::hold_read_apply_mutex(abort_source& as) { // takes one unit of _read_apply_mutex and hands the units object to the caller
+    if (this_shard_id() != 0) { // shard-0-only API: a call from any other shard is reported as an internal error
+        on_internal_error(logger, "hold_read_apply_mutex: must run on shard 0");
+    }
+
+    return get_units(_read_apply_mutex, 1, as); // `as` is passed to get_units(); presumably it lets the wait be aborted — confirm against seastar
+}
+
 db::system_keyspace& raft_group0_client::sys_ks() {
    return _sys_ks;
 }
--- a/service/raft/raft_group0_client.hh
+++ b/service/raft/raft_group0_client.hh
@@ -105,6 +105,9 @@ public:

    future<> add_entry_unguarded(group0_command group0_cmd, seastar::abort_source* as = nullptr);

+    // Call only on shard 0.
+    future<> perform_read_barrier(seastar::abort_source* as = nullptr);
+
    // Ensures that all previously finished operations on group 0 are visible on this node;
    // in particular, performs a Raft read barrier on group 0.
    //
@@ -163,6 +166,8 @@ public:
    // Wait until group 0 upgrade enters the `use_post_raft_procedures` state.
    future<> wait_until_group0_upgraded(abort_source&);

+    future<semaphore_units<>> hold_read_apply_mutex(abort_source&);
+
    db::system_keyspace& sys_ks();

    // for test only
--- a/service/raft/raft_group_registry.cc
+++ b/service/raft/raft_group_registry.cc
@@ -326,7 +326,7 @@ seastar::future<> raft_group_registry::start(raft::server_id my_id) {
    init_rpc_verbs();

    _direct_fd_subscription.emplace(co_await _direct_fd.register_listener(*_direct_fd_proxy,
-        direct_fd_clock::base::duration{std::chrono::seconds{1}}.count()));
+        direct_fd_clock::base::duration{std::chrono::seconds{2}}.count()));
 }

 const raft::server_id& raft_group_registry::get_my_raft_id() {
--- a/service/raft/raft_sys_table_storage.cc
+++ b/service/raft/raft_sys_table_storage.cc
@@ -178,19 +178,18 @@ future<> raft_sys_table_storage::store_snapshot_descriptor(const raft::snapshot_
                {_group_id.id, "PREVIOUS", srv.addr.id.id, srv.can_vote},
                    cql3::query_processor::cache_internal::yes);
        }
-        // Also update the latest snapshot id in `system.raft` table
-        static const auto store_latest_id_cql = format("INSERT INTO system.{} (group_id, snapshot_id) VALUES (?, ?)",
-            db::system_keyspace::RAFT);
-        co_await _qp.execute_internal(
-            store_latest_id_cql,
-            {_group_id.id, snap.id.id},
-            cql3::query_processor::cache_internal::yes
-        );
+
        if (preserve_log_entries > snap.idx) {
-            co_return;
+            static const auto store_latest_id_cql = format("INSERT INTO system.{} (group_id, snapshot_id) VALUES (?, ?)",
+                db::system_keyspace::RAFT);
+            co_await _qp.execute_internal(
+                store_latest_id_cql,
+                {_group_id.id, snap.id.id},
+                cql3::query_processor::cache_internal::yes
+            );
+        } else {
+            co_await update_snapshot_and_truncate_log_tail(snap, preserve_log_entries);
        }
-        // TODO: make truncation and snapshot update in `system.raft` atomic
-        co_await truncate_log_tail(raft::index_t(static_cast<uint64_t>(snap.idx) - static_cast<uint64_t>(preserve_log_entries)));
    });
 }

@@ -283,9 +282,20 @@ future<> raft_sys_table_storage::abort() {
    return std::move(_pending_op_fut);
 }

-future<> raft_sys_table_storage::truncate_log_tail(raft::index_t idx) {
-    static const auto truncate_cql = format("DELETE FROM system.{} WHERE group_id = ? AND \"index\" <= ?", db::system_keyspace::RAFT);
-    return _qp.execute_internal(truncate_cql, {_group_id.id, int64_t(idx)}, cql3::query_processor::cache_internal::yes).discard_result();
+future<> raft_sys_table_storage::update_snapshot_and_truncate_log_tail(const raft::snapshot_descriptor &snap, size_t preserve_log_entries) { // stores snap.id as the latest snapshot id and deletes log rows with index <= snap.idx - preserve_log_entries, in one batch
+    // Update snapshot and truncate logs in `system.raft` atomically
+    raft::index_t log_tail_idx = raft::index_t(static_cast<uint64_t>(snap.idx) - static_cast<uint64_t>(preserve_log_entries)); // unsigned subtraction: wraps if preserve_log_entries > snap.idx; store_snapshot_descriptor only calls this when that is not the case
+    static const auto store_latest_id_and_truncate_log_tail_cql = format(
+        "BEGIN UNLOGGED BATCH"
+        "   INSERT INTO system.{} (group_id, snapshot_id) VALUES (?, ?);"   // store latest id
+        "   DELETE FROM system.{} WHERE group_id = ? AND \"index\" <= ?;"   // truncate log tail
+        "APPLY BATCH",
+        db::system_keyspace::RAFT, db::system_keyspace::RAFT);
+    return _qp.execute_internal(
+        store_latest_id_and_truncate_log_tail_cql,
+        {_group_id.id, snap.id.id, _group_id.id, int64_t(log_tail_idx)}, // bind values in the order of the four `?` markers above
+        cql3::query_processor::cache_internal::yes
+    ).discard_result();
 }

 future<> raft_sys_table_storage::execute_with_linearization_point(std::function<future<>()> f) {
@@ -301,8 +311,10 @@ future<> raft_sys_table_storage::execute_with_linearization_point(std::function<
    }
 }

-future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation) {
-    raft::snapshot_descriptor snapshot;
+future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot) { // persists an initial snapshot descriptor that carries `initial_configuation`
+    auto init_index = nontrivial_snapshot ? raft::index_t{1} : raft::index_t{0}; // index 1 when a non-trivial snapshot is requested, 0 otherwise
+    raft::snapshot_descriptor snapshot{.idx{init_index}};
+    snapshot.id = raft::snapshot_id::create_random_id(); // the initial snapshot gets a random id
    snapshot.config = std::move(initial_configuation);
    co_await store_snapshot_descriptor(snapshot, 0); // 0 = preserve_log_entries
 }
--- a/service/raft/raft_sys_table_storage.hh
+++ b/service/raft/raft_sys_table_storage.hh
@@ -71,15 +71,20 @@ public:

    // Persist initial configuration of a new Raft group.
    // To be called before start for the new group.
-    // Uses a special snapshot id (0) to identify the snapshot
-    // descriptor.
-    future<> bootstrap(raft::configuration initial_configuation);
+    //
+    // If `nontrivial_snapshot` is true, the initial snapshot will have index 1 instead of 0,
+    // which will trigger a snapshot transfer to servers which start with snapshot index 0.
+    // This should be set for the first group 0 server during upgrade or recovery, which
+    // will force snapshot transfers for subsequently joining nodes (so we can transfer initial
+    // schema etc.). It's also correct to do it when booting a cluster from
+    // scratch with Raft, although not necessary (it will force an empty snapshot transfer).
+    future<> bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot);
 private:

    future<> do_store_log_entries(const std::vector<raft::log_entry_ptr>& entries);
    // Truncate all entries from the persisted log with indices <= idx
    // Called from the `store_snapshot` function.
-    future<> truncate_log_tail(raft::index_t idx);
+    future<> update_snapshot_and_truncate_log_tail(const raft::snapshot_descriptor &snap, size_t preserve_log_entries);

    future<> execute_with_linearization_point(std::function<future<>()> f);
 };
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -923,6 +923,7 @@ public:
    // If upper_bound is provided, the upper bound within position is looked up
    future<bool> advance_lower_and_check_if_present(
            dht::ring_position_view key, std::optional<position_in_partition_view> pos = {}) {
+        utils::get_local_injector().inject("advance_lower_and_check_if_present", [] { throw std::runtime_error("advance_lower_and_check_if_present"); });
        return advance_to(_lower_bound, key).then([this, key, pos] {
            if (eof()) {
                return make_ready_future<bool>(false);
--- a/sstables/key.hh
+++ b/sstables/key.hh
@@ -29,14 +29,10 @@ public:
        return ::with_linearized(_bytes, func);
    }

-    std::vector<bytes_view> explode(const schema& s) const {
-        return with_linearized([&] (bytes_view v) {
-            return composite_view(v, s.partition_key_size() > 1).explode();
-        });
-    }
-
     partition_key to_partition_key(const schema& s) const {
-        return partition_key::from_exploded_view(explode(s));
+        return with_linearized([&] (bytes_view v) { // build the partition_key inside the callback, presumably so no view of `v` outlives the linearized buffer
+            return partition_key::from_exploded_view(composite_view(v, s.partition_key_size() > 1).explode()); // second argument: whether the partition key has more than one component
+        });
     }

    bool operator==(const key_view& k) const { return k._bytes == _bytes; }
--- a/sstables/kl/reader.cc
+++ b/sstables/kl/reader.cc
@@ -415,7 +415,7 @@ public:
        if (!_is_mutation_end) {
            return proceed::yes;
        }
-        auto pk = partition_key::from_exploded(key.explode(*_schema));
+        auto pk = key.to_partition_key(*_schema);
        setup_for_partition(pk);
        auto dk = dht::decorate_key(*_schema, pk);
        _reader->on_next_partition(std::move(dk), tombstone(deltime));
--- a/sstables/mx/reader.cc
+++ b/sstables/mx/reader.cc
@@ -313,7 +313,7 @@ public:
        if (!_is_mutation_end) {
            return proceed::yes;
        }
-        auto pk = partition_key::from_exploded(key.explode(*_schema));
+        auto pk = key.to_partition_key(*_schema);
        setup_for_partition(pk);
        auto dk = dht::decorate_key(*_schema, pk);
        _reader->on_next_partition(std::move(dk), tombstone(deltime));
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -32,6 +32,7 @@
 #include <seastar/coroutine/parallel_for_each.hh>
 #include <seastar/coroutine/as_future.hh>

+#include "utils/error_injection.hh"
 #include "dht/sharder.hh"
 #include "types.hh"
 #include "writer.hh"
@@ -1294,7 +1295,7 @@ future<> sstable::load_first_and_last_position_in_partition() {
    }

    auto& sem = _manager.sstable_metadata_concurrency_sem();
-    reader_permit permit = co_await sem.obtain_permit(&*_schema, "sstable::load_first_and_last_position_range", sstable_buffer_size, db::no_timeout);
+    reader_permit permit = co_await sem.obtain_permit(_schema, "sstable::load_first_and_last_position_range", sstable_buffer_size, db::no_timeout);
    auto first_pos_opt = co_await find_first_position_in_partition(permit, get_first_decorated_key(), false);
    auto last_pos_opt = co_await find_first_position_in_partition(permit, get_last_decorated_key(), true);

@@ -1395,6 +1396,9 @@ future<> sstable::open_data(sstable_open_config cfg) noexcept {
    }
    _open_mode.emplace(open_flags::ro);
    _stats.on_open_for_reading();
+
+    _total_reclaimable_memory.reset();
+    _manager.increment_total_reclaimable_memory_and_maybe_reclaim(this);
 }

 future<> sstable::update_info_for_opened_data(sstable_open_config cfg) {
@@ -1498,6 +1502,50 @@ void sstable::write_filter(const io_priority_class& pc) {
    write_simple<component_type::Filter>(filter_ref, pc);
 }

+size_t sstable::total_reclaimable_memory_size() const {
+    // The value lives in a mutable optional cache; it is recomputed from the bloom filter after every reset().
+    if (_total_reclaimable_memory.has_value()) {
+        return *_total_reclaimable_memory;
+    }
+    return _total_reclaimable_memory.emplace(_components->filter ? _components->filter->memory_size() : 0);
+}
+
+size_t sstable::reclaim_memory_from_components() {
+    // Frees the in-memory bloom filter, if there is one that occupies memory,
+    // and returns the number of bytes released by this call.
+    const bool has_filter = static_cast<bool>(_components->filter);
+    const auto freed_bytes = has_filter ? _components->filter->memory_size() : 0;
+
+    if (freed_bytes > 0) {
+        // Swap in an always-present filter so the real one is dropped from memory.
+        // _recognized_components is left alone because the filter still exists on disk.
+        _components->filter = std::make_unique<utils::filter::always_present_filter>();
+    }
+
+    // Invalidate the cached reclaimable size and record what this sstable has given up so far.
+    _total_reclaimable_memory.reset();
+    _total_memory_reclaimed += freed_bytes;
+    return freed_bytes;
+}
+
+size_t sstable::total_memory_reclaimed() const {
+    return _total_memory_reclaimed; // running total: increased by reclaim_memory_from_components(), decreased by reload_reclaimed_components()
+}
+
+future<> sstable::reload_reclaimed_components(const io_priority_class& pc) {
+    if (_total_memory_reclaimed == 0) {
+        // nothing was reclaimed from this sstable, so there is nothing to reload
+        co_return;
+    }
+
+    co_await utils::get_local_injector().inject("reload_reclaimed_components/pause", std::chrono::seconds{3}); // error-injection hook; presumably pauses here only when a test enables it
+
+    co_await read_filter(pc); // presumably re-reads the bloom filter into _components->filter - confirm in read_filter()
+    _total_reclaimable_memory.reset(); // drop the cached reclaimable size so it is recomputed on the next query
+    _total_memory_reclaimed -= _components->filter->memory_size(); // NOTE(review): wraps if the reloaded filter is larger than what was reclaimed - confirm sizes always match
+    sstlog.info("Reloaded bloom filter of {}", get_filename());
+}
+
 // This interface is only used during tests, snapshot loading and early initialization.
 // No need to set tunable priorities for it.
 future<> sstable::load(const io_priority_class& pc, sstable_open_config cfg) noexcept {
@@ -1528,7 +1576,10 @@ future<> sstable::load(sstables::foreign_sstable_open_info info) noexcept {
        validate_min_max_metadata();
        validate_max_local_deletion_time();
        validate_partitioner();
-        return update_info_for_opened_data();
+        return update_info_for_opened_data().then([this]() {
+            _total_reclaimable_memory.reset();
+            _manager.increment_total_reclaimable_memory_and_maybe_reclaim(this);
+        });
    });
 }

@@ -1859,7 +1910,7 @@ future<> sstable::generate_summary(const io_priority_class& pc) {

        auto s = summary_generator(_schema->get_partitioner(), _components->summary, _manager.config().sstable_summary_ratio());
            auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(
-                    *this, sem.make_tracking_only_permit(_schema.get(), "generate-summary", db::no_timeout), s, trust_promoted_index::yes,
+                    *this, sem.make_tracking_only_permit(_schema, "generate-summary", db::no_timeout), s, trust_promoted_index::yes,
                    make_file_input_stream(index_file, 0, index_size, std::move(options)), 0, index_size,
                    (_version >= sstable_version_types::mc
                        ? std::make_optional(get_clustering_values_fixed_lengths(get_serialization_header()))
@@ -2990,12 +3041,16 @@ future<bool> sstable::has_partition_key(const utils::hashed_key& hk, const dht::
    bool present;
    std::exception_ptr ex;
    auto sem = reader_concurrency_semaphore(reader_concurrency_semaphore::no_limits{}, "sstables::has_partition_key()");
+    std::unique_ptr<sstables::index_reader> lh_index_ptr = nullptr;
    try {
-        auto lh_index_ptr = std::make_unique<sstables::index_reader>(s, sem.make_tracking_only_permit(_schema.get(), s->get_filename(), db::no_timeout), default_priority_class(), tracing::trace_state_ptr(), use_caching::yes);
+        lh_index_ptr = std::make_unique<sstables::index_reader>(s, sem.make_tracking_only_permit(_schema, s->get_filename(), db::no_timeout), default_priority_class(), tracing::trace_state_ptr(), use_caching::yes);
        present = co_await lh_index_ptr->advance_lower_and_check_if_present(dk);
    } catch (...) {
        ex = std::current_exception();
    }
+    if (auto lhi_ptr = std::move(lh_index_ptr)) {
+        co_await lhi_ptr->close();
+    }
    co_await sem.stop();
    if (ex) {
        co_return coroutine::exception(std::move(ex));
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -146,7 +146,8 @@ class sstable : public enable_lw_shared_from_this<sstable> {
 public:
    using version_types = sstable_version_types;
    using format_types = sstable_format_types;
-    using manager_link_type = bi::list_member_hook<bi::link_mode<bi::auto_unlink>>;
+    using manager_list_link_type = bi::list_member_hook<bi::link_mode<bi::auto_unlink>>;
+    using manager_set_link_type = bi::set_member_hook<bi::link_mode<bi::auto_unlink>>;
 public:
    sstable(schema_ptr schema,
            sstring dir,
@@ -576,7 +577,11 @@ private:
    sstables_manager& _manager;

    sstables_stats _stats;
-    manager_link_type _manager_link;
+    // link used by the _active list of sstables manager
+    manager_list_link_type _manager_list_link;
+    // link used by the _reclaimed set of sstables manager
+    manager_set_link_type _manager_set_link;
+

    // The _large_data_stats map stores e.g. largest partitions, rows, cells sizes,
    // and max number of rows in a partition.
@@ -585,6 +590,13 @@ private:
    // information in their scylla metadata.
    std::optional<scylla_metadata::large_data_stats> _large_data_stats;
    sstring _origin;
+
+    // Total reclaimable memory from all the components of the SSTable.
+    // It is initialized to 0 to prevent the sstables manager from reclaiming memory
+    // from the components before the SSTable has been fully loaded.
+    mutable std::optional<size_t> _total_reclaimable_memory{0};
+    // Total memory reclaimed so far from this sstable
+    size_t _total_memory_reclaimed{0};
 public:
    const bool has_component(component_type f) const;
    sstables_manager& manager() { return _manager; }
@@ -663,6 +675,16 @@ private:

    future<> create_data() noexcept;

+    // Note that only bloom filters are reclaimable by the following methods.
+    // Return the total reclaimable memory in this SSTable
+    size_t total_reclaimable_memory_size() const;
+    // Reclaim memory from the components back to the system.
+    size_t reclaim_memory_from_components();
+    // Return memory reclaimed so far from this sstable
+    size_t total_memory_reclaimed() const;
+    // Reload components from which memory was previously reclaimed
+    future<> reload_reclaimed_components(const io_priority_class& pc);
+
 public:
    // Finds first position_in_partition in a given partition.
    // If reversed is false, then the first position is actually the first row (can be the static one).
@@ -906,6 +928,13 @@ public:
    // Drops all evictable in-memory caches of on-disk content.
    future<> drop_caches();

+    struct lesser_reclaimed_memory {
+        // Comparator for the sstables manager's _reclaimed set: orders sstables by ascending total_memory_reclaimed().
+        bool operator()(const sstable& sst1, const sstable& sst2) const {
+            return sst1.total_memory_reclaimed() < sst2.total_memory_reclaimed(); // NOTE(review): equal amounts compare equivalent - confirm the intrusive set handles such ties as intended
+        }
+    };
+
    // Allow the test cases from sstable_test.cc to test private methods. We use
    // a placeholder to avoid cluttering this class too much. The sstable_test class
    // will then re-export as public every method it needs.
--- a/sstables/sstables_manager.cc
+++ b/sstables/sstables_manager.cc
@@ -20,7 +20,7 @@ logging::logger smlogger("sstables_manager");

 sstables_manager::sstables_manager(
    db::large_data_handler& large_data_handler, const db::config& dbcfg, gms::feature_service& feat, cache_tracker& ct, size_t available_memory, directory_semaphore& dir_sem)
-    : _large_data_handler(large_data_handler), _db_config(dbcfg), _features(feat), _cache_tracker(ct)
+    : _available_memory(available_memory), _large_data_handler(large_data_handler), _db_config(dbcfg), _features(feat), _cache_tracker(ct)
    , _sstable_metadata_concurrency_sem(
        max_count_sstable_metadata_concurrent_reads,
        max_memory_sstable_metadata_concurrent_reads(available_memory),
@@ -28,6 +28,7 @@ sstables_manager::sstables_manager(
        std::numeric_limits<size_t>::max())
    , _dir_semaphore(dir_sem)
 {
+    _components_reloader_status = components_reloader_fiber();
 }

 sstables_manager::~sstables_manager() {
@@ -69,11 +70,87 @@ sstable_writer_config sstables_manager::configure_writer(sstring origin) const {
    return cfg;
 }

+void sstables_manager::increment_total_reclaimable_memory_and_maybe_reclaim(sstable* sst) {
+    // Account for the given sstable, then reclaim from a single sstable if the total is over budget.
+    _total_reclaimable_memory += sst->total_reclaimable_memory_size();
+
+    const size_t budget = _available_memory * _db_config.components_memory_reclaim_threshold();
+    const bool over_budget = _total_reclaimable_memory > budget;
+    if (!over_budget) {
+        return;
+    }
+    // Over the limit: the victim is the active sstable holding the most reclaimable memory.
+    const auto by_reclaimable_size = [] (const sstable& lhs, const sstable& rhs) {
+        return lhs.total_reclaimable_memory_size() < rhs.total_reclaimable_memory_size();
+    };
+    auto victim = std::max_element(_active.begin(), _active.end(), by_reclaimable_size);
+
+    // Move the freed bytes from the "reclaimable" counter to the "reclaimed" counter and track the victim.
+    const auto freed = victim->reclaim_memory_from_components();
+    _total_memory_reclaimed += freed;
+    _total_reclaimable_memory -= freed;
+    _reclaimed.insert(*victim);
+    smlogger.info("Reclaimed {} bytes of memory from SSTable components. Total memory reclaimed so far is {} bytes", freed, _total_memory_reclaimed);
+}
+
+size_t sstables_manager::get_memory_available_for_reclaimable_components() {
+    const size_t threshold = _available_memory * _db_config.components_memory_reclaim_threshold(); // byte budget for reclaimable components
+    return threshold > _total_reclaimable_memory ? threshold - _total_reclaimable_memory : 0; // clamp at 0: a plain size_t subtraction would wrap to a huge "available" value when usage exceeds the budget
+}
+
+future<> sstables_manager::components_reloader_fiber() {
+    // Background fiber: after each sstable-deleted signal, reload reclaimed components while the memory budget allows.
+    sstlog.trace("components_reloader_fiber start");
+    while (true) {
+        co_await _sstable_deleted_event.when();
+        if (_closing) {
+            co_return;
+        }
+
+        // Reload bloom filters from the smallest to largest so as to maximize
+        // the number of bloom filters being reloaded.
+        auto memory_available = get_memory_available_for_reclaimable_components();
+        while (!_reclaimed.empty() && memory_available > 0) {
+            auto sstable_to_reload = _reclaimed.begin();
+            const size_t reclaimed_memory = sstable_to_reload->total_memory_reclaimed();
+            if (reclaimed_memory > memory_available) {
+                // cannot reload any more sstables
+                break;
+            }
+
+            // Pin the sstable with a lw_shared_ptr so it cannot be deleted while its components are
+            // being reloaded. Taken before erase(): erasing invalidates the iterator, not the object.
+            auto sstable_ptr = sstable_to_reload->shared_from_this();
+            // Increment the total memory before reloading to prevent any parallel
+            // fibers from loading new bloom filters into memory.
+            _total_reclaimable_memory += reclaimed_memory;
+            _reclaimed.erase(sstable_to_reload);
+            try {
+                co_await sstable_ptr->reload_reclaimed_components(default_priority_class());
+            } catch (...) {
+                // reload failed due to some reason
+                sstlog.warn("Failed to reload reclaimed SSTable components : {}", std::current_exception());
+                // revert back changes made before the reload
+                _total_reclaimable_memory -= reclaimed_memory;
+                _reclaimed.insert(*sstable_ptr);
+                break;
+            }
+
+            _total_memory_reclaimed -= reclaimed_memory;
+            memory_available = get_memory_available_for_reclaimable_components();
+        }
+    }
+}
+
 void sstables_manager::add(sstable* sst) {
    _active.push_back(*sst);
 }

 void sstables_manager::deactivate(sstable* sst) {
+    // Remove SSTable from the reclaimable memory tracking
+    _total_reclaimable_memory -= sst->total_reclaimable_memory_size();
+    _total_memory_reclaimed -= sst->total_memory_reclaimed();
+    _reclaimed.erase(*sst);
+
    // At this point, sst has a reference count of zero, since we got here from
    // lw_shared_ptr_deleter<sstables::sstable>::dispose().
    _active.erase(_active.iterator_to(*sst));
@@ -89,6 +166,7 @@ void sstables_manager::deactivate(sstable* sst) {
 void sstables_manager::remove(sstable* sst) {
    _undergoing_close.erase(_undergoing_close.iterator_to(*sst));
    delete sst;
+    _sstable_deleted_event.signal(); // wake components_reloader_fiber, which waits on this event before trying to reload reclaimed components
    maybe_done();
 }

@@ -103,6 +181,9 @@ future<> sstables_manager::close() {
    maybe_done();
    co_await _done.get_future();
    co_await _sstable_metadata_concurrency_sem.stop();
+    // stop the components reload fiber
+    _sstable_deleted_event.signal();
+    co_await std::move(_components_reloader_status);
 }

 sstable_directory::components_lister sstables_manager::get_components_lister(std::filesystem::path dir) {
--- a/sstables/sstables_manager.hh
+++ b/sstables/sstables_manager.hh
@@ -45,9 +45,14 @@ static constexpr size_t default_sstable_buffer_size = 128 * 1024;

 class sstables_manager {
    using list_type = boost::intrusive::list<sstable,
-            boost::intrusive::member_hook<sstable, sstable::manager_link_type, &sstable::_manager_link>,
+            boost::intrusive::member_hook<sstable, sstable::manager_list_link_type, &sstable::_manager_list_link>,
            boost::intrusive::constant_time_size<false>>;
+    using set_type = boost::intrusive::set<sstable,
+            boost::intrusive::member_hook<sstable, sstable::manager_set_link_type, &sstable::_manager_set_link>,
+            boost::intrusive::constant_time_size<false>,
+            boost::intrusive::compare<sstable::lesser_reclaimed_memory>>;
 private:
+    size_t _available_memory;
    db::large_data_handler& _large_data_handler;
    const db::config& _db_config;
    gms::feature_service& _features;
@@ -65,6 +70,16 @@ private:
    list_type _active;
    list_type _undergoing_close;

+    // Total reclaimable memory used by components of sstables in _active list
+    size_t _total_reclaimable_memory{0};
+    // Total memory reclaimed so far across all sstables
+    size_t _total_memory_reclaimed{0};
+    // Set of sstables from which memory has been reclaimed
+    set_type _reclaimed;
+    // Condition variable that gets notified when an sstable is deleted
+    seastar::condition_variable _sstable_deleted_event;
+    future<> _components_reloader_status = make_ready_future<>();
+
    bool _closing = false;
    promise<> _done;
    cache_tracker& _cache_tracker;
@@ -120,11 +135,22 @@ private:
    static constexpr size_t max_count_sstable_metadata_concurrent_reads{10};
    // Allow at most 10% of memory to be filled with such reads.
    size_t max_memory_sstable_metadata_concurrent_reads(size_t available_memory) { return available_memory * 0.1; }
+
+    // Increment the _total_reclaimable_memory with the new SSTable's reclaimable
+    // memory and if the total memory usage exceeds the pre-defined threshold,
+    // reclaim it from the SSTable that has the most reclaimable memory.
+    void increment_total_reclaimable_memory_and_maybe_reclaim(sstable* sst);
+    // Fiber to reload reclaimed components back into memory when memory becomes available.
+    future<> components_reloader_fiber();
+    size_t get_memory_available_for_reclaimable_components();
 private:
    db::large_data_handler& get_large_data_handler() const {
        return _large_data_handler;
    }
    friend class sstable;
+
+    // Allow testing private methods/variables via test_env_sstables_manager
+    friend class test_env_sstables_manager;
 };

 }   // namespace sstables
--- a/streaming/consumer.cc
+++ b/streaming/consumer.cc
@@ -33,9 +33,10 @@ std::function<future<> (flat_mutation_reader_v2)> make_streaming_consumer(sstrin
            //FIXME: for better estimations this should be transmitted from remote
            auto metadata = mutation_source_metadata{};
            auto& cs = cf->get_compaction_strategy();
-            const auto adjusted_estimated_partitions = cs.adjust_partition_estimate(metadata, estimated_partitions);
+            // Data segregation is postponed to happen during off-strategy if latter is enabled, which
+            // means partition estimation shouldn't be adjusted.
+            const auto adjusted_estimated_partitions = (offstrategy) ? estimated_partitions : cs.adjust_partition_estimate(metadata, estimated_partitions, cf->schema());
            auto make_interposer_consumer = [&cs, offstrategy] (const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) mutable {
-                // postpone data segregation to off-strategy compaction if enabled
                if (offstrategy) {
                    return end_consumer;
                }
--- a/streaming/stream_manager.hh
+++ b/streaming/stream_manager.hh
@@ -128,6 +128,7 @@ public:

    replica::database& db() noexcept { return _db.local(); }
    netw::messaging_service& ms() noexcept { return _ms.local(); }
+    service::migration_manager& mm() noexcept { return _mm.local(); }

    const std::unordered_map<plan_id, shared_ptr<stream_result_future>>& get_initiated_streams() const {
        return _initiated_streams;
--- a/streaming/stream_result_future.cc
+++ b/streaming/stream_result_future.cc
@@ -77,7 +77,7 @@ template <typename Event>
 void stream_result_future::fire_stream_event(Event event) {
    // delegate to listener
    for (auto listener : _event_listeners) {
-        listener->handle_stream_event(std::move(event));
+        listener->handle_stream_event(event); // not moved: the same `event` is handed to every listener in the loop
    }
 }

--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -36,6 +36,7 @@
 #include "streaming/stream_mutation_fragments_cmd.hh"
 #include "consumer.hh"
 #include "readers/generating_v2.hh"
+#include "utils/error_injection.hh"

 namespace streaming {

@@ -167,6 +168,9 @@ void stream_manager::init_messaging_service_handler() {
            // Make sure the table with cf_id is still present at this point.
            // Close the sink in case the table is dropped.
            auto op = _db.local().find_column_family(cf_id).stream_in_progress();
+            utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [this] () {
+                _db.local().find_column_family(table_id::create_null_id());
+            });
            //FIXME: discarded future.
            (void)mutation_writer::distribute_reader_and_consume_on_shards(s,
                make_generating_reader_v1(s, permit, std::move(get_next_mutation_fragment)),
--- a/streaming/stream_transfer_task.cc
+++ b/streaming/stream_transfer_task.cc
@@ -30,7 +30,9 @@
 #include <boost/icl/interval_set.hpp>
 #include "sstables/sstables.hh"
 #include "replica/database.hh"
+#include "repair/table_check.hh"
 #include "gms/feature_service.hh"
+#include "utils/error_injection.hh"

 namespace streaming {

@@ -189,54 +191,58 @@ future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
 future<> stream_transfer_task::execute() {
    auto plan_id = session->plan_id();
    auto cf_id = this->cf_id;
-    auto dst_cpu_id = session->dst_cpu_id;
    auto id = netw::messaging_service::msg_addr{session->peer, session->dst_cpu_id};
-    sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
-    sort_and_merge_ranges();
-    auto reason = session->get_reason();
    auto& sm = session->manager();
-    return sm.container().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason] (stream_manager& sm) mutable {
-        auto& tbl = sm.db().find_column_family(cf_id);
-      return sm.db().obtain_reader_permit(tbl, "stream-transfer-task", db::no_timeout).then([&sm, &tbl, plan_id, cf_id, id, dst_cpu_id, ranges=std::move(ranges), reason] (reader_permit permit) mutable {
-        auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason, [&sm, plan_id, addr = id.addr] (size_t sz) {
-            sm.update_progress(plan_id, addr, streaming::progress_info::direction::OUT, sz);
-        });
-        return si->has_relevant_range_on_this_shard().then([&sm, si, plan_id, cf_id] (bool has_relevant_range_on_this_shard) {
-            if (!has_relevant_range_on_this_shard) {
-                sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
-                        plan_id, cf_id, this_shard_id());
-                return make_ready_future<>();
-            }
-            return send_mutation_fragments(std::move(si));
-        }).finally([si] {
-            return si->reader.close();
-        });
-      });
-    }).then([this, plan_id, cf_id, id, &sm] {
-        sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
-        return sm.ms().send_stream_mutation_done(id, plan_id, _ranges,
-                cf_id, session->dst_cpu_id).handle_exception([plan_id, id, cf_id] (auto ep) {
-            sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
+    auto table_dropped = co_await repair::with_table_drop_silenced(sm.db(), sm.mm(), cf_id, [this, &sm, cf_id, plan_id, id] (const table_id &) { // NOTE(review): presumably yields true when the table was dropped while the callback ran - confirm in repair/table_check.hh
+        auto dst_cpu_id = session->dst_cpu_id;
+        sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
+        sort_and_merge_ranges();
+        auto reason = session->get_reason();
+        return sm.container().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason] (stream_manager& sm) mutable { // runs the per-shard send on every stream_manager instance
+            auto& tbl = sm.db().find_column_family(cf_id);
+            return sm.db().obtain_reader_permit(tbl, "stream-transfer-task", db::no_timeout).then([&sm, &tbl, plan_id, cf_id, id, dst_cpu_id, ranges=std::move(ranges), reason] (reader_permit permit) mutable {
+                auto si = make_lw_shared<send_info>(sm.ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason, [&sm, plan_id, addr = id.addr] (size_t sz) {
+                    sm.update_progress(plan_id, addr, streaming::progress_info::direction::OUT, sz);
+                });
+                return si->has_relevant_range_on_this_shard().then([&sm, si, plan_id, cf_id] (bool has_relevant_range_on_this_shard) {
+                    if (!has_relevant_range_on_this_shard) {
+                        sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
+                                plan_id, cf_id, this_shard_id());
+                        return make_ready_future<>();
+                    }
+                    return send_mutation_fragments(std::move(si));
+                }).finally([si] {
+                    return si->reader.close(); // the reader is closed whether or not sending succeeded
+                });
+            });
+        }).then([this, plan_id, cf_id, id, &sm] {
+            sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
+            return sm.ms().send_stream_mutation_done(id, plan_id, _ranges,
+                    cf_id, session->dst_cpu_id).handle_exception([plan_id, id, cf_id] (auto ep) {
+                sslog.warn("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
+                std::rethrow_exception(ep);
+            });
+        }).then([this, id, plan_id, cf_id] {
+            _mutation_done_sent = true; // remembered so the dropped-table path below does not send STREAM_MUTATION_DONE twice
+            sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
+        }).handle_exception([plan_id, id, &sm] (std::exception_ptr ep) {
+            sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
+            utils::get_local_injector().inject("stream_mutation_fragments_table_dropped", [&sm] () { // error-injection hook: looks up a null table id, presumably to simulate a dropped table in tests
+                sm.db().find_column_family(table_id::create_null_id());
+            });
             std::rethrow_exception(ep);
         });
-    }).then([this, id, plan_id, cf_id] {
-        _mutation_done_sent = true;
-        sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
-    }).handle_exception([this, plan_id, cf_id, id] (std::exception_ptr ep) {
-        // If the table is dropped during streaming, we can ignore the
-        // errors and make the stream successful. This allows user to
-        // drop tables during node operations like decommission or
-        // bootstrap.
-        if (!session->manager().db().column_family_exists(cf_id)) {
-            sslog.warn("[Stream #{}] Ignore the table with table_id {} which is dropped during streaming: {}", plan_id, cf_id, ep);
-            if (!_mutation_done_sent) {
-                return session->manager().ms().send_stream_mutation_done(id, plan_id, _ranges, cf_id, session->dst_cpu_id);
-            }
-            return make_ready_future<>();
-        }
-        sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
-        std::rethrow_exception(ep);
    });
+    // If the table is dropped during streaming, we can ignore the
+    // errors and make the stream successful. This allows user to
+    // drop tables during node operations like decommission or
+    // bootstrap.
+    if (table_dropped) {
+        sslog.warn("[Stream #{}] Ignore the table with table_id {} which is dropped during streaming", plan_id, cf_id);
+        if (!_mutation_done_sent) {
+            co_await session->manager().ms().send_stream_mutation_done(id, plan_id, _ranges, cf_id, session->dst_cpu_id); // still notify the peer that this transfer is finished
+        }
+    }
 }

 void stream_transfer_task::append_ranges(const dht::token_range_vector& ranges) {
--- a/tasks/task_manager.cc
+++ b/tasks/task_manager.cc
@@ -201,7 +201,7 @@ void task_manager::task::unregister_task() noexcept {
    _impl->_module->unregister_task(id());
 }

-const task_manager::foreign_task_vector& task_manager::task::get_children() const noexcept {
+const task_manager::foreign_task_list& task_manager::task::get_children() const noexcept { // returns the impl's child tasks by const reference; the container is now a std::list
    return _impl->_children;
 }

--- a/tasks/task_manager.hh
+++ b/tasks/task_manager.hh
@@ -40,7 +40,7 @@ public:
    using task_ptr = lw_shared_ptr<task_manager::task>;
    using task_map = std::unordered_map<task_id, task_ptr>;
    using foreign_task_ptr = foreign_ptr<task_ptr>;
-    using foreign_task_vector = std::vector<foreign_task_ptr>;
+    using foreign_task_list = std::list<foreign_task_ptr>;
    using module_ptr = shared_ptr<module>;
    using modules = std::unordered_map<std::string, module_ptr>;
 private:
@@ -95,7 +95,7 @@ public:
            status _status;
            progress _progress;             // Reliable only for tasks with no descendants.
            task_id _parent_id;
-            foreign_task_vector _children;
+            foreign_task_list _children;
            shared_promise<> _done;
            module_ptr _module;
            abort_source _as;
@@ -145,7 +145,7 @@ public:
        future<> done() const noexcept;
        void register_task();
        void unregister_task() noexcept;
-        const foreign_task_vector& get_children() const noexcept;
+        const foreign_task_list& get_children() const noexcept;
        void release_resources() noexcept;

        friend class test_task;
--- a/test/alternator/test_gsi.py
+++ b/test/alternator/test_gsi.py
@@ -1384,3 +1384,78 @@ def test_gsi_query_select_2(dynamodb):
            IndexName='hello',
            Select='COUNT',
            KeyConditions={'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
+
+# Test similar to test_11801 and test_11801_variant2, but this test first
+# updates the range key b to a new value (like variant2) and then sets it
+# back to its original value. It reproduces issue #17119 - the last
+# modification was lost because the wrong timestamp was used.
+# The bug is specific to the case where the GSI's two key columns are both
+# non-key columns of the base table, so we test it on test_table_gsi_3, which is such a table.
+def test_17119(test_table_gsi_3):
+    p = random_string()
+    a = random_string()
+    b = random_string()
+    item = {'p': p, 'a': a, 'b': b, 'd': random_string()}
+    test_table_gsi_3.put_item(Item=item)
+    assert_index_query(test_table_gsi_3, 'hello', [item],
+        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
+                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
+    # Change the GSI range key b to a different value newb.
+    newb = random_string()
+    test_table_gsi_3.update_item(Key={'p':  p}, AttributeUpdates={'b': {'Value': newb, 'Action': 'PUT'}})
+    item['b'] = newb
+    assert item == test_table_gsi_3.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    # The item newb should appear in the GSI, item b should be gone:
+    assert_index_query(test_table_gsi_3, 'hello', [item],
+        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
+                       'b': {'AttributeValueList': [newb], 'ComparisonOperator': 'EQ'}})
+    assert_index_query(test_table_gsi_3, 'hello', [],
+        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
+                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
+    # Change the GSI range key b back to its original value. Item newb
+    # should disappear from the GSI, and item b should reappear:
+    test_table_gsi_3.update_item(Key={'p':  p}, AttributeUpdates={'b': {'Value': b, 'Action': 'PUT'}})
+    item['b'] = b
+    assert item == test_table_gsi_3.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    assert_index_query(test_table_gsi_3, 'hello', [],
+        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
+                       'b': {'AttributeValueList': [newb], 'ComparisonOperator': 'EQ'}})
+    # This assertion failed in issue #17119:
+    assert_index_query(test_table_gsi_3, 'hello', [item],
+        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
+                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
+
+# This test is like test_17119 above, but uses a table whose GSI has only
+# one new key column. The bug of #17119 doesn't reproduce here, showing
+# the problem was specific to the case of two new GSI key columns.
+def test_17119a(test_table_gsi_2):
+    p = random_string()
+    x = random_string()
+    item = {'p': p, 'x': x, 'z': random_string()}
+    test_table_gsi_2.put_item(Item=item)
+    assert_index_query(test_table_gsi_2, 'hello', [item],
+        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
+                       'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
+    # Change the GSI range key x to a different value.
+    newx = random_string()
+    test_table_gsi_2.update_item(Key={'p':  p}, AttributeUpdates={'x': {'Value': newx, 'Action': 'PUT'}})
+    item['x'] = newx
+    assert item == test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    # The item newx should appear in the GSI, item x should be gone:
+    assert_index_query(test_table_gsi_2, 'hello', [item],
+        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
+                       'x': {'AttributeValueList': [newx], 'ComparisonOperator': 'EQ'}})
+    assert_index_query(test_table_gsi_2, 'hello', [],
+        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
+                       'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
+    # Change the GSI range key x back to its original value. Item newx
+    # should disappear from the GSI, and item x should reappear:
+    test_table_gsi_2.update_item(Key={'p':  p}, AttributeUpdates={'x': {'Value': x, 'Action': 'PUT'}})
+    item['x'] = x
+    assert item == test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    assert_index_query(test_table_gsi_2, 'hello', [],
+        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
+                       'x': {'AttributeValueList': [newx], 'ComparisonOperator': 'EQ'}})
+    assert_index_query(test_table_gsi_2, 'hello', [item],
+        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
+                       'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
--- a/test/boost/castas_fcts_test.cc
+++ b/test/boost/castas_fcts_test.cc
@@ -494,29 +494,29 @@ SEASTAR_TEST_CASE(test_time_casts_in_selection_clause) {
        }
        {
            auto msg = e.execute_cql("SELECT CAST(CAST(a AS timestamp) AS text), CAST(CAST(a AS date) AS text), CAST(CAST(b as date) AS text), CAST(CAST(c AS timestamp) AS text) FROM test").get0();
-            assert_that(msg).is_rows().with_size(1).with_row({{utf8_type->from_string("2009-12-17T00:26:29.805000")},
+            assert_that(msg).is_rows().with_size(1).with_row({{utf8_type->from_string("2009-12-17T00:26:29.805Z")},
                                                              {utf8_type->from_string("2009-12-17")},
                                                              {utf8_type->from_string("2015-05-21")},
-                                                              {utf8_type->from_string("2015-05-21T00:00:00")}});
+                                                              {utf8_type->from_string("2015-05-21T00:00:00.000Z")}});
        }
        {
            auto msg = e.execute_cql("SELECT CAST(a AS text), CAST(b as text), CAST(c AS text), CAST(d AS text) FROM test").get0();
            assert_that(msg).is_rows().with_size(1).with_row({{utf8_type->from_string("d2177dd0-eaa2-11de-a572-001b779c76e3")},
-                                                              {utf8_type->from_string("2015-05-21T11:03:02")},
+                                                              {utf8_type->from_string("2015-05-21T11:03:02.000Z")},
                                                              {utf8_type->from_string("2015-05-21")},
                                                              {utf8_type->from_string("11:03:02.000000000")}});
        }
        {
            auto msg = e.execute_cql("SELECT CAST(CAST(a AS timestamp) AS ascii), CAST(CAST(a AS date) AS ascii), CAST(CAST(b as date) AS ascii), CAST(CAST(c AS timestamp) AS ascii) FROM test").get0();
-            assert_that(msg).is_rows().with_size(1).with_row({{ascii_type->from_string("2009-12-17T00:26:29.805000")},
+            assert_that(msg).is_rows().with_size(1).with_row({{ascii_type->from_string("2009-12-17T00:26:29.805Z")},
                                                              {ascii_type->from_string("2009-12-17")},
                                                              {ascii_type->from_string("2015-05-21")},
-                                                              {ascii_type->from_string("2015-05-21T00:00:00")}});
+                                                              {ascii_type->from_string("2015-05-21T00:00:00.000Z")}});
        }
        {
            auto msg = e.execute_cql("SELECT CAST(a AS ascii), CAST(b as ascii), CAST(c AS ascii), CAST(d AS ascii) FROM test").get0();
            assert_that(msg).is_rows().with_size(1).with_row({{ascii_type->from_string("d2177dd0-eaa2-11de-a572-001b779c76e3")},
-                                                              {ascii_type->from_string("2015-05-21T11:03:02")},
+                                                              {ascii_type->from_string("2015-05-21T11:03:02.000Z")},
                                                              {ascii_type->from_string("2015-05-21")},
                                                              {ascii_type->from_string("11:03:02.000000000")}});
        }
--- a/Show More
+++ b/Show More