doc: remove wrong image upgrade info (5.2-to-2023.1)

This commit removes the information about the recommended way of upgrading ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade procedure is not supported (it was implemented, but then reverted). Refs https://github.com/scylladb/scylladb/issues/15733 Closes scylladb/scylladb#21876 Fixes https://github.com/scylladb/scylla-enterprise/issues/5041 Fixes https://github.com/scylladb/scylladb/issues/21898 (cherry picked from commit 98860905d8)
db/config.cc: increment components_memory_reclaim_threshold config default
2024-12-12 15:28:20 +02:00 · 2024-06-04 07:13:28 +03:00 · 2024-05-30 11:11:39 +03:00 · 2024-05-30 11:10:49 +03:00 · 2024-05-27 08:52:06 +03:00 · 2024-05-26 16:30:06 +03:00
281 changed files with 6147 additions and 2563 deletions
--- a/2
+++ b/2
@@ -72,7 +72,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.2.7
+VERSION=5.2.19

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -764,7 +764,6 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
    const rjson::value* tags = rjson::find(request, "Tags");
    if (!tags || !tags->IsArray()) {
        co_return api_error::validation("Cannot parse tags");
@@ -772,8 +771,9 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
    if (tags->Size() < 1) {
        co_return api_error::validation("The number of tags must be at least 1") ;
    }
-    update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
+    });
    co_return json_string("");
 }

@@ -791,9 +791,9 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
+        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+    });
    co_return json_string("");
 }

--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -94,24 +94,25 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    }
    sstring attribute_name(v->GetString(), v->GetStringLength());

-    std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
-    if (enabled) {
-        if (tags_map.contains(TTL_TAG_KEY)) {
-            co_return api_error::validation("TTL is already enabled");
+    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
+        if (enabled) {
+            if (tags_map.contains(TTL_TAG_KEY)) {
+                throw api_error::validation("TTL is already enabled");
+            }
+            tags_map[TTL_TAG_KEY] = attribute_name;
+        } else {
+            auto i = tags_map.find(TTL_TAG_KEY);
+            if (i == tags_map.end()) {
+                throw api_error::validation("TTL is already disabled");
+            } else if (i->second != attribute_name) {
+                throw api_error::validation(format(
+                    "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
+                    attribute_name, i->second));
+            }
+            tags_map.erase(TTL_TAG_KEY);
        }
-        tags_map[TTL_TAG_KEY] = attribute_name;
-    } else {
-        auto i = tags_map.find(TTL_TAG_KEY);
-        if (i == tags_map.end()) {
-            co_return api_error::validation("TTL is already disabled");
-        } else if (i->second != attribute_name) {
-            co_return api_error::validation(format(
-                "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
-                attribute_name, i->second));
-        }
-        tags_map.erase(TTL_TAG_KEY);
-    }
-    co_await db::update_tags(_mm, schema, std::move(tags_map));
+    });
+
    // Prepare the response, which contains a TimeToLiveSpecification
    // basically identical to the request's
    rjson::value response = rjson::empty_object();
--- a/api/api-doc/raft.json
+++ b/api/api-doc/raft.json
@@ -0,0 +1,43 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/raft",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/raft/trigger_snapshot/{group_id}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Triggers snapshot creation and log truncation for the given Raft group",
+               "type":"string",
+               "nickname":"trigger_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"group_id",
+                     "description":"The ID of the group which should get snapshotted",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Timeout in seconds after which the endpoint returns a failure. If not provided, 60s is used.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1946,7 +1946,7 @@
         "operations":[
            {
               "method":"POST",
-               "summary":"Reset local schema",
+               "summary":"Forces this node to recalculate versions of schema objects.",
               "type":"void",
               "nickname":"reset_local_schema",
               "produces":[
--- a/api/api.cc
+++ b/api/api.cc
@@ -31,6 +31,7 @@
 #include "api/config.hh"
 #include "task_manager.hh"
 #include "task_manager_test.hh"
+#include "raft.hh"

 logging::logger apilog("api");

@@ -277,6 +278,18 @@ future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::confi

 #endif

+future<> set_server_raft(http_context& ctx, sharded<service::raft_group_registry>& raft_gr) {
+    auto rb = std::make_shared<api_registry_builder>(ctx.api_doc);
+    return ctx.http_server.set_routes([rb, &ctx, &raft_gr] (routes& r) {
+        rb->register_function(r, "raft", "The Raft API");
+        set_raft(ctx, r, raft_gr);
+    });
+}
+
+future<> unset_server_raft(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_raft(ctx, r); });
+}
+
 void req_params::process(const request& req) {
    // Process mandatory parameters
    for (auto& [name, ent] : params) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -22,6 +22,7 @@ namespace service {
 class load_meter;
 class storage_proxy;
 class storage_service;
+class raft_group_registry;

 } // namespace service

@@ -116,5 +117,7 @@ future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx);
 future<> set_server_task_manager_test(http_context& ctx, lw_shared_ptr<db::config> cfg);
+future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
+future<> unset_server_raft(http_context&);

 }
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -17,36 +17,42 @@ namespace fd = httpd::failure_detector_json;

 void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
-        std::vector<fd::endpoint_state> res;
-        for (auto i : g.get_endpoint_states()) {
-            fd::endpoint_state val;
-            val.addrs = boost::lexical_cast<std::string>(i.first);
-            val.is_alive = i.second.is_alive();
-            val.generation = i.second.get_heart_beat_state().get_generation();
-            val.version = i.second.get_heart_beat_state().get_heart_beat_version();
-            val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
-            for (auto a : i.second.get_application_state_map()) {
-                fd::version_value version_val;
-                // We return the enum index and not it's name to stay compatible to origin
-                // method that the state index are static but the name can be changed.
-                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
-                version_val.value = a.second.value;
-                version_val.version = a.second.version;
-                val.application_state.push(version_val);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::vector<fd::endpoint_state> res;
+            for (auto i : g.get_endpoint_states()) {
+                fd::endpoint_state val;
+                val.addrs = boost::lexical_cast<std::string>(i.first);
+                val.is_alive = i.second.is_alive();
+                val.generation = i.second.get_heart_beat_state().get_generation();
+                val.version = i.second.get_heart_beat_state().get_heart_beat_version();
+                val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
+                for (auto a : i.second.get_application_state_map()) {
+                    fd::version_value version_val;
+                    // We return the enum index and not its name to stay compatible with the origin
+                    // method: the state indexes are static, but the names can be changed.
+                    version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
+                    version_val.value = a.second.value;
+                    version_val.version = a.second.version;
+                    val.application_state.push(version_val);
+                }
+                res.push_back(val);
            }
-            res.push_back(val);
-        }
-        return make_ready_future<json::json_return_type>(res);
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_up_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_up_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_down_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_down_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -54,11 +60,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
-        std::map<sstring, sstring> nodes_status;
-        for (auto& entry : g.get_endpoint_states()) {
-            nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
-        }
-        return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::map<sstring, sstring> nodes_status;
+            for (auto& entry : g.get_endpoint_states()) {
+                nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
+            }
+            return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        });
    });

    fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -67,13 +75,15 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
-        auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
-        if (!state) {
-            return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
-        }
-        std::stringstream ss;
-        g.append_endpoint_state(ss, *state);
-        return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
+            auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
+            if (!state) {
+                return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
+            }
+            std::stringstream ss;
+            g.append_endpoint_state(ss, *state);
+            return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        });
    });

    fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -6,6 +6,8 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include <seastar/core/coroutine.hh>
+
 #include "gossiper.hh"
 #include "api/api-doc/gossiper.json.hh"
 #include "gms/gossiper.hh"
@@ -14,19 +16,23 @@ namespace api {
 using namespace json;

 void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
-    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (const_req req) {
-        auto res = g.get_unreachable_members();
-        return container_to_vec(res);
+    httpd::gossiper_json::get_down_endpoint.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        auto res = co_await g.get_unreachable_members_synchronized();
+        co_return json::json_return_type(container_to_vec(res));
    });

-    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (const_req req) {
-        auto res = g.get_live_members();
-        return container_to_vec(res);
+
+    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) {
+        return g.get_live_members_synchronized().then([] (auto res) {
+            return make_ready_future<json::json_return_type>(container_to_vec(res));
+        });
    });

-    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
-        gms::inet_address ep(req.param["addr"]);
-        return g.get_endpoint_downtime(ep);
+    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
+        gms::inet_address ep(req->param["addr"]);
+        // synchronize unreachable_members on all shards
+        co_await g.get_unreachable_members_synchronized();
+        co_return g.get_endpoint_downtime(ep);
    });

    httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<request> req) {
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#include <seastar/core/coroutine.hh>
+
+#include "api/api.hh"
+#include "api/api-doc/raft.json.hh"
+
+#include "service/raft/raft_group_registry.hh"
+
+using namespace seastar::httpd;
+
+extern logging::logger apilog;
+
+namespace api {
+
+namespace r = httpd::raft_json;
+using namespace json;
+
+void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) {
+    r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
+        raft::group_id gid{utils::UUID{req->param["group_id"]}};
+        auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] {
+            if (timeout_str.empty()) {
+                return std::chrono::seconds{60};
+            }
+            auto dur = std::stoll(timeout_str);
+            if (dur <= 0) {
+                throw std::runtime_error{"Timeout must be a positive number."};
+            }
+            return std::chrono::seconds{dur};
+        });
+
+        std::atomic<bool> found_srv{false};
+        co_await raft_gr.invoke_on_all([gid, timeout_dur, &found_srv] (service::raft_group_registry& raft_gr) -> future<> {
+            auto* srv = raft_gr.find_server(gid);
+            if (!srv) {
+                co_return;
+            }
+
+            found_srv = true;
+            abort_on_expiry aoe(lowres_clock::now() + timeout_dur);
+            apilog.info("Triggering Raft group {} snapshot", gid);
+            auto result = co_await srv->trigger_snapshot(&aoe.abort_source());
+            if (result) {
+                apilog.info("New snapshot for Raft group {} created", gid);
+            } else {
+                apilog.info("Could not create new snapshot for Raft group {}, no new entries applied", gid);
+            }
+        });
+
+        if (!found_srv) {
+            throw std::runtime_error{fmt::format("Server for group ID {} not found", gid)};
+        }
+
+        co_return json_void{};
+    });
+}
+
+void unset_raft(http_context&, httpd::routes& r) {
+    r::trigger_snapshot.unset(r);
+}
+
+}
+
--- a/api/raft.hh
+++ b/api/raft.hh
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2023-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include "api_init.hh"
+
+namespace api {
+
+void set_raft(http_context& ctx, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr);
+void unset_raft(http_context& ctx, httpd::routes& r);
+
+}
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -243,17 +243,21 @@ future<json::json_return_type> set_tables_autocompaction(http_context& ctx, cons
 }

 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
-    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -275,17 +279,21 @@ void unset_transport_controller(http_context& ctx, routes& r) {
 }

 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
-    ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+    ss::stop_rpc_server.set(r, [&ctx, &ctl] (std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+    ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -1041,14 +1049,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::reset_local_schema.set(r, [&sys_ks](std::unique_ptr<request> req) {
+    ss::reset_local_schema.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        // FIXME: We should truncate schema tables if more than one node in the cluster.
-        auto& sp = service::get_storage_proxy();
-        auto& fs = sp.local().features();
        apilog.info("reset_local_schema");
-        return db::schema_tables::recalculate_schema_version(sys_ks, sp, fs).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        co_await ss.local().reload_schema();
+        co_return json_void();
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -176,7 +176,9 @@ void set_task_manager(http_context& ctx, routes& r) {
        auto task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
            return task->done().then_wrapped([task] (auto f) {
                task->unregister_task();
-                f.get();
+                // done() is called only because we want the task to be complete before getting its status.
+                // The future should be ignored here as the result does not matter.
+                f.ignore_ready_future();
                return make_foreign(task);
            });
        }));
@@ -204,8 +206,8 @@ void set_task_manager(http_context& ctx, routes& r) {
        while (!q.empty()) {
            auto& current = q.front();
            res.push_back(co_await retrieve_status(current));
-            for (auto i = 0; i < current->get_children().size(); ++i) {
-                q.push(co_await current->get_children()[i].copy());
+            for (auto& child: current->get_children()) {
+                q.push(co_await child.copy());
            }
            q.pop();
        }
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -229,6 +229,8 @@ future<authenticated_user> password_authenticator::authenticate(
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
        } catch (exceptions::authentication_exception& e) {
            std::throw_with_nested(e);
+        } catch (exceptions::unavailable_exception& e) {
+            std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
        } catch (...) {
            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -25,6 +25,7 @@
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
 #include "utils/UUID_gen.hh"
+#include "utils/error_injection.hh"

 #include "cdc/generation.hh"
 #include "cdc/cdc_options.hh"
@@ -44,8 +45,16 @@ static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const

 namespace cdc {

-extern const api::timestamp_clock::duration generation_leeway =
-    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+api::timestamp_clock::duration get_generation_leeway() {
+    static thread_local auto generation_leeway =
+            std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+    utils::get_local_injector().inject("increase_cdc_generation_leeway", [&] {
+        generation_leeway = std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::minutes(5));
+    });
+
+    return generation_leeway;
+}

 static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    i = net::hton(i);
@@ -160,18 +169,18 @@ bool token_range_description::operator==(const token_range_description& o) const
        && sharding_ignore_msb == o.sharding_ignore_msb;
 }

-topology_description::topology_description(std::vector<token_range_description> entries)
+topology_description::topology_description(utils::chunked_vector<token_range_description> entries)
    : _entries(std::move(entries)) {}

 bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const& {
+const utils::chunked_vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-std::vector<token_range_description>&& topology_description::entries() && {
+utils::chunked_vector<token_range_description>&& topology_description::entries() && {
    return std::move(_entries);
 }

@@ -263,7 +272,7 @@ public:
    topology_description generate() const {
        const auto tokens = get_tokens();

-        std::vector<token_range_description> vnode_descriptions;
+        utils::chunked_vector<token_range_description> vnode_descriptions;
        vnode_descriptions.reserve(tokens.size());

        vnode_descriptions.push_back(
@@ -331,7 +340,7 @@ future<cdc::generation_id> generation_service::make_new_generation(const std::un
    auto new_generation_timestamp = [add_delay, ring_delay = _cfg.ring_delay] {
        auto ts = db_clock::now();
        if (add_delay && ring_delay != 0ms) {
-            ts += 2 * ring_delay + duration_cast<milliseconds>(generation_leeway);
+            ts += 2 * ring_delay + duration_cast<milliseconds>(get_generation_leeway());
        }
        return ts;
    };
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -46,6 +46,8 @@ namespace gms {

 namespace cdc {

+api::timestamp_clock::duration get_generation_leeway();
+
 class stream_id final {
    bytes _value;
 public:
@@ -94,13 +96,13 @@ struct token_range_description {
 * in the `_entries` vector. See the comment above `token_range_description` for explanation.
 */
 class topology_description {
-    std::vector<token_range_description> _entries;
+    utils::chunked_vector<token_range_description> _entries;
 public:
-    topology_description(std::vector<token_range_description> entries);
+    topology_description(utils::chunked_vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const&;
-    std::vector<token_range_description>&& entries() &&;
+    const utils::chunked_vector<token_range_description>& entries() const&;
+    utils::chunked_vector<token_range_description>&& entries() &&;
 };

 /**
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -15,10 +15,6 @@

 extern logging::logger cdc_log;

-namespace cdc {
-    extern const api::timestamp_clock::duration generation_leeway;
-} // namespace cdc
-
 static api::timestamp_type to_ts(db_clock::time_point tp) {
    // This assumes that timestamp_clock and db_clock have the same epochs.
    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
@@ -40,7 +36,7 @@ static cdc::stream_id get_stream(

 // non-static for testing
 cdc::stream_id get_stream(
-        const std::vector<cdc::token_range_description>& entries,
+        const utils::chunked_vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
        on_internal_error(cdc_log, "get_stream: entries empty");
@@ -73,7 +69,7 @@ bool cdc::metadata::streams_available() const {

 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
-    if (ts > now + generation_leeway.count()) {
+    if (ts > now + get_generation_leeway().count()) {
        throw exceptions::invalid_request_exception(format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
@@ -86,27 +82,43 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // Nothing protects us from that until we start using transactions for generation switching.
    }

-    auto it = gen_used_at(now);
-    if (it == _gens.end()) {
+    auto it = gen_used_at(now - get_generation_leeway().count());
+
+    if (it != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        it = _gens.erase(_gens.begin(), it);
+    }
+
+    if (ts <= now - get_generation_leeway().count()) {
+        // We reject the write if `ts <= now - generation_leeway` and the write is not to the current generation, which
+        // happens iff one of the following is true:
+        // - the write is to no generation,
+        // - the write is to a generation older than the generation under `it`,
+        // - the write is to the generation under `it` and that generation is not the current generation.
+        // Note that we cannot distinguish the first and second cases because we garbage-collect obsolete generations,
+        // but we can check if one of them takes place (`it == _gens.end() || ts < it->first`). These three conditions
+        // are sufficient. The write with `ts <= now - generation_leeway` cannot be to one of the generations following
+        // the generation under `it` because that generation was operating at `now - generation_leeway`.
+        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
+        if (it == _gens.end() || ts < it->first || is_previous_gen) {
+            throw exceptions::invalid_request_exception(format(
+                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
+                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                    " consistency properties.\n"
+                    "We *do* allow sending writes into the near past, but our ability to do that is limited."
+                    " Are you using client-side timestamps? Make sure your clocks are well-synchronized"
+                    " with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        }
+    }
+
+    it = _gens.begin();
+    if (it == _gens.end() || ts < it->first) {
        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
    }

-    // Garbage-collect generations that will no longer be used.
-    it = _gens.erase(_gens.begin(), it);
-
-    if (it->first > ts) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream from an earlier generation than the currently used one."
-                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                " consistency properties (write timestamp: {}, current generation started at: {})",
-                format_timestamp(ts), format_timestamp(it->first)));
-    }
-
-    // With `generation_leeway` we allow sending writes to the near future. It might happen
-    // that `ts` doesn't belong to the current generation ("current" according to our clock),
-    // but to the next generation. Adjust for this case:
+    // Find the generation operating at `ts`.
    {
        auto next_it = std::next(it);
        while (next_it != _gens.end() && next_it->first <= ts) {
@@ -147,8 +159,8 @@ bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
        ++it;
    }

-    // Check if some new generation has already superseded this one.
-    return it != _gens.end() && it->first <= api::new_timestamp();
+    // Check if the generation is obsolete.
+    return it != _gens.end() && it->first <= api::new_timestamp() - get_generation_leeway().count();
 }

 bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
@@ -157,7 +169,7 @@ bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen)
    }

    auto now = api::new_timestamp();
-    auto it = gen_used_at(now);
+    auto it = gen_used_at(now - get_generation_leeway().count());

    if (it != _gens.end()) {
        // Garbage-collect generations that will no longer be used.
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -42,7 +42,9 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
 public:
-    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
+     * it is older than the generation operating at `now - get_generation_leeway()`.
+     */
    bool known_or_obsolete(db_clock::time_point) const;

    /* Are there streams available. I.e. valid for time == now. If this is false, any writes to 
@@ -54,8 +56,9 @@ public:
     *
     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
-     * by the `cdc::generation_leeway` constant.
+     * yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
+     * the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
+     * `get_generation_leeway()`.
     */
    stream_id get_stream(api::timestamp_type ts, dht::token tok);

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -168,7 +168,7 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
 }

 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
-        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
+        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
    auto timestamp = table_s.min_memtable_timestamp();
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -179,6 +179,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
            hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
        }
        if (sst->filter_has_key(*hk)) {
+            bloom_filter_checks++;
            timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
        }
    }
@@ -463,6 +464,8 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    double _estimated_droppable_tombstone_ratio = 0;
+    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
    bool _can_split_large_partition = false;
@@ -519,7 +522,7 @@ protected:
        auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
-                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
+                        _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
    }

    void setup_new_sstable(shared_sstable& sst) {
@@ -582,6 +585,7 @@ protected:
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
        cfg.run_identifier = gc_run;
        cfg.monitor = monitor.get();
-        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
+        uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio))); // per-sstable estimate scaled by the droppable tombstone ratio, never below 1
+        auto writer = sst->get_writer(*schema(), estimated_partitions, cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
    }
@@ -625,8 +629,8 @@ protected:
        return _used_garbage_collected_sstables;
    }

-    bool enable_garbage_collected_sstable_writer() const noexcept {
-        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept {
+        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
    }
 public:
    compaction& operator=(const compaction&) = delete;
@@ -652,6 +656,7 @@ private:
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

+        double sum_of_estimated_droppable_tombstone_ratio = 0;
        _input_sstable_generations.reserve(_sstables.size());
        for (auto& sst : _sstables) {
            co_await coroutine::maybe_yield();
@@ -686,12 +691,16 @@ private:
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            _rp = std::max(_rp, sst_stats.position);
+            auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state());
+            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
        }
        log_info("{} {}", report_start_desc(), formatted_msg);
        if (ssts->all()->size() < _sstables.size()) {
            log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
                      _sstables.size() - ssts->all()->size(), _sstables.size());
        }
+        // _estimated_droppable_tombstone_ratio could exceed 1.0 in certain cases, so limit it to 1.0.
+        _estimated_droppable_tombstone_ratio = std::min(1.0, sum_of_estimated_droppable_tombstone_ratio / ssts->all()->size());

        _compacting = std::move(ssts);

@@ -765,6 +774,7 @@ protected:
                .ended_at = ended_at,
                .start_size = _start_size,
                .end_size = _end_size,
+                .bloom_filter_checks = _bloom_filter_checks,
            },
        };

@@ -784,7 +794,7 @@ protected:
        log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(),
                _input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
-                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
+                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_start_size, duration),
                _cdata.total_partitions, _cdata.total_keys_written);

        return ret;
@@ -805,7 +815,7 @@ private:
            };
        }
        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
        };
    }

@@ -1005,51 +1015,6 @@ void compacted_fragments_writer::consume_end_of_stream() {
    }
 }

-class reshape_compaction : public compaction {
-public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
-        : compaction(table_s, std::move(descriptor), cdata) {
-    }
-
-    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
-    }
-
-    flat_mutation_reader_v2 make_sstable_reader() const override {
-        return _compacting->make_local_shard_sstable_reader(_schema,
-                _permit,
-                query::full_partition_range,
-                _schema->full_slice(),
-                _io_priority,
-                tracing::trace_state_ptr(),
-                ::streamed_mutation::forwarding::no,
-                ::mutation_reader::forwarding::no,
-                default_read_monitor_generator());
-    }
-
-    std::string_view report_start_desc() const override {
-        return "Reshaping";
-    }
-
-    std::string_view report_finish_desc() const override {
-        return "Reshaped";
-    }
-
-    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto sst = _sstable_creator(this_shard_id());
-        setup_new_sstable(sst);
-
-        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
-    }
-
-    virtual void stop_sstable_writer(compaction_writer* writer) override {
-        if (writer) {
-            finish_new_sstable(writer);
-        }
-    }
-};
-
 class regular_compaction : public compaction {
    // keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
    mutable compaction_read_monitor_generator _monitor_generator;
@@ -1159,12 +1124,13 @@ private:
    }

    void update_pending_ranges() {
-        if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
+        auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
+        if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
            return;
        }
        // Releases reference to sstables compacted by this compaction or another, both of which belongs
        // to the same column family
-        for (auto& pending_replacement : _cdata.pending_replacements) {
+        for (auto& pending_replacement : pending_replacements) {
            for (auto& sst : pending_replacement.removed) {
                // Set may not contain sstable to be removed because this compaction may have started
                // before the creation of that sstable.
@@ -1178,7 +1144,70 @@ private:
            }
        }
        _selector.emplace(_sstable_set->make_incremental_selector());
-        _cdata.pending_replacements.clear();
+    }
+};
+
+class reshape_compaction : public regular_compaction {
+private:
+    bool has_sstable_replacer() const noexcept {
+        return bool(_replacer);
+    }
+public:
+    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+            : regular_compaction(table_s, std::move(descriptor), cdata) {
+    }
+
+    virtual sstables::sstable_set make_sstable_set_for_input() const override {
+        return sstables::make_partitioned_sstable_set(_schema, false);
+    }
+
+    // Enable incremental compaction whenever the strategy specifies a max output size, e.g. LCS, and a replacer is set; unlike the base-class check, multi-fragment input runs are not required.
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
+        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
+    }
+
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
+                default_read_monitor_generator());
+    }
+
+    std::string_view report_start_desc() const override {
+        return "Reshaping";
+    }
+
+    std::string_view report_finish_desc() const override {
+        return "Reshaped";
+    }
+
+    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
+        auto sst = _sstable_creator(this_shard_id());
+        setup_new_sstable(sst);
+
+        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
+    }
+
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (writer) {
+            if (has_sstable_replacer()) {
+                regular_compaction::stop_sstable_writer(writer);
+            } else {
+                finish_new_sstable(writer);
+            }
+        }
+    }
+
+    virtual void on_end_of_compaction() override {
+        if (has_sstable_replacer()) {
+            regular_compaction::on_end_of_compaction();
+        }
    }
 };

@@ -1598,7 +1627,7 @@ private:
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
+                _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
    }
 public:
    resharding_compaction(table_state& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata)
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -92,12 +92,15 @@ struct compaction_stats {
    uint64_t start_size = 0;
    uint64_t end_size = 0;
    uint64_t validation_errors = 0;
+    // Bloom filter lookups during max purgeable calculation for which filter_has_key() returned true (negative lookups are not counted)
+    uint64_t bloom_filter_checks = 0;

    compaction_stats& operator+=(const compaction_stats& r) {
        ended_at = std::max(ended_at, r.ended_at);
        start_size += r.start_size;
        end_size += r.end_size;
        validation_errors += r.validation_errors;
+        bloom_filter_checks += r.bloom_filter_checks;
        return *this;
    }
    friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -298,7 +298,7 @@ compaction_manager::task::task(compaction_manager& mgr, compaction::table_state*
    , _description(std::move(desc))
 {}

-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task, throw_if_stopping do_throw_if_stopping) {
    _tasks.push_back(task);
    auto unregister_task = defer([this, task] {
        _tasks.remove(task);
@@ -311,6 +311,9 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
        co_return res;
    } catch (sstables::compaction_stopped_exception& e) {
        cmlog.info("{}: stopped, reason: {}", *task, e.what());
+        if (do_throw_if_stopping) {
+            throw;
+        }
    } catch (sstables::compaction_aborted_exception& e) {
        cmlog.error("{}: aborted, reason: {}", *task, e.what());
        _stats.errors++;
@@ -344,8 +347,11 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables_a

    co_return res;
 }
-future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge) {
+
+future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge,
+                                                                               sstables::offstrategy offstrategy) {
    compaction::table_state& t = *_compacting_table;
+
    if (can_purge) {
        descriptor.enable_garbage_collection(t.main_sstable_set());
    }
@@ -353,7 +359,8 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables(s
        auto sst = t.make_sstable();
        return sst;
    };
-    descriptor.replacer = [this, &t, &on_replace] (sstables::compaction_completion_desc desc) {
+
+    descriptor.replacer = [this, &t, &on_replace, offstrategy] (sstables::compaction_completion_desc desc) {
        t.get_compaction_strategy().notify_completion(desc.old_sstables, desc.new_sstables);
        _cm.propagate_replacement(t, desc.old_sstables, desc.new_sstables);
        // on_replace updates the compacting registration with the old and new
@@ -370,7 +377,7 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables(s
        // - are not being compacted.
        on_replace.on_addition(desc.new_sstables);
        auto old_sstables = desc.old_sstables;
-        t.on_compaction_completion(std::move(desc), sstables::offstrategy::no).get();
+        t.on_compaction_completion(std::move(desc), offstrategy).get();
        on_replace.on_removal(old_sstables);
    };

@@ -475,12 +482,12 @@ protected:
    }
 };

-future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job) {
+future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping) {
    if (_state != state::enabled) {
        return make_ready_future<>();
    }

-    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job))).discard_result();
+    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job)), do_throw_if_stopping).discard_result();
 }

 future<> compaction_manager::update_static_shares(float static_shares) {
@@ -1061,7 +1068,7 @@ void compaction_manager::submit(compaction::table_state& t) {

    // OK to drop future.
    // waited via task->stop()
-    (void)perform_task(make_shared<regular_compaction_task>(*this, t));
+    (void)perform_task(make_shared<regular_compaction_task>(*this, t)).then_wrapped([] (auto f) { f.ignore_ready_future(); });
 }

 bool compaction_manager::can_perform_regular_compaction(compaction::table_state& t) {
@@ -1120,54 +1127,40 @@ public:
    }
 private:
    future<> run_offstrategy_compaction(sstables::compaction_data& cdata) {
-        // This procedure will reshape sstables in maintenance set until it's ready for
-        // integration into main set.
-        // It may require N reshape rounds before the set satisfies the strategy invariant.
-        // This procedure also only updates maintenance set at the end, on success.
-        // Otherwise, some overlapping could be introduced in the set after each reshape
-        // round, progressively degrading read amplification until integration happens.
-        // The drawback of this approach is the 2x space requirement as the old sstables
-        // will only be deleted at the end. The impact of this space requirement is reduced
-        // by the fact that off-strategy is serialized across all tables, meaning that the
-        // actual requirement is the size of the largest table's maintenance set.
+        // Incrementally reshape the SSTables in maintenance set. The output of each reshape
+        // round is merged into the main set. The common case is that off-strategy input
+        // is mostly disjoint, e.g. repair-based node ops, then all the input will be
+        // reshaped in a single round. The incremental approach allows us to be space
+        // efficient (avoiding a 100% overhead) as we will incrementally replace input
+        // SSTables from maintenance set by output ones into main set.

        compaction::table_state& t = *_compacting_table;
-        const auto& maintenance_sstables = t.maintenance_sstable_set();

        // Filter out sstables that require view building, to avoid a race between off-strategy
        // and view building. Refs: #11882
-        const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all()
-                | boost::adaptors::filtered([] (const sstables::shared_sstable& sst) {
-            return !sst->requires_view_building();
-        }));
-        std::vector<sstables::shared_sstable> reshape_candidates = old_sstables;
-        std::vector<sstables::shared_sstable> sstables_to_remove;
-        std::unordered_set<sstables::shared_sstable> new_unused_sstables;
-
-        auto cleanup_new_unused_sstables_on_failure = defer([&new_unused_sstables] {
-            for (auto& sst : new_unused_sstables) {
-                sst->mark_for_deletion();
-            }
-        });
+        auto get_reshape_candidates = [&t] () {
+            auto maintenance_ssts = t.maintenance_sstable_set().all();
+            return boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_ssts
+                | boost::adaptors::filtered([](const sstables::shared_sstable& sst) {
+                        return !sst->requires_view_building();
+                }));
+        };

        auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
            auto& iop = service::get_local_streaming_priority(); // run reshape in maintenance mode
-            auto desc = t.get_compaction_strategy().get_reshaping_job(reshape_candidates, t.schema(), iop, sstables::reshape_mode::strict);
+            auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), iop, sstables::reshape_mode::strict);
            return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
        };

        std::exception_ptr err;
        while (auto desc = get_next_job()) {
-            desc->creator = [this, &new_unused_sstables, &t] (shard_id dummy) {
-                auto sst = t.make_sstable();
-                new_unused_sstables.insert(sst);
-                return sst;
-            };
-            auto input = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(desc->sstables);
+            auto compacting = compacting_sstable_registration(_cm, desc->sstables);
+            auto on_replace = compacting.update_on_sstable_replacement();

-            sstables::compaction_result ret;
            try {
-                ret = co_await sstables::compact_sstables(std::move(*desc), cdata, t);
+                sstables::compaction_result _ = co_await compact_sstables(std::move(*desc), _compaction_data, on_replace,
+                                                                          compaction_manager::can_purge_tombstones::no,
+                                                                          sstables::offstrategy::yes);
            } catch (sstables::compaction_stopped_exception&) {
                // If off-strategy compaction stopped on user request, let's not discard the partial work.
                // Therefore, both un-reshaped and reshaped data will be integrated into main set, allowing
@@ -1176,36 +1169,20 @@ private:
                break;
            }
            _performed = true;
-
-            // update list of reshape candidates without input but with output added to it
-            auto it = boost::remove_if(reshape_candidates, [&] (auto& s) { return input.contains(s); });
-            reshape_candidates.erase(it, reshape_candidates.end());
-            std::move(ret.new_sstables.begin(), ret.new_sstables.end(), std::back_inserter(reshape_candidates));
-
-            // If compaction strategy is unable to reshape input data in a single round, it may happen that a SSTable A
-            // created in round 1 will be compacted in a next round producing SSTable B. As SSTable A is no longer needed,
-            // it can be removed immediately. Let's remove all such SSTables immediately to reduce off-strategy space requirement.
-            // Input SSTables from maintenance set can only be removed later, as SSTable sets are only updated on completion.
-            auto can_remove_now = [&] (const sstables::shared_sstable& s) { return new_unused_sstables.contains(s); };
-            for (auto&& sst : input) {
-                if (can_remove_now(sst)) {
-                    co_await sst->unlink();
-                    new_unused_sstables.erase(std::move(sst));
-                } else {
-                    sstables_to_remove.push_back(std::move(sst));
-                }
-            }
        }

-        // at this moment reshape_candidates contains a set of sstables ready for integration into main set
-        auto completion_desc = sstables::compaction_completion_desc{
-            .old_sstables = std::move(old_sstables),
-            .new_sstables = std::move(reshape_candidates)
-        };
-        co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
+        // There might be some remaining sstables in maintenance set that didn't require reshape, or the
+        // user has aborted off-strategy. So we can only integrate them into the main set, such that
+        // they become candidates for regular compaction. We cannot hold them forever in maintenance set,
+        // as that causes read and space amplification issues.
+        if (auto sstables = get_reshape_candidates(); sstables.size()) {
+            auto completion_desc = sstables::compaction_completion_desc{
+                .old_sstables = sstables, // removes from maintenance set.
+                .new_sstables = sstables, // adds into main set.
+            };
+            co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
+        }

-        cleanup_new_unused_sstables_on_failure.cancel();
-        co_await sstables::sstable_directory::delete_atomically(std::move(sstables_to_remove));
        if (err) {
            co_await coroutine::return_exception_ptr(std::move(err));
        }
@@ -1228,9 +1205,11 @@ protected:
            std::exception_ptr ex;
            try {
                compaction::table_state& t = *_compacting_table;
-                auto maintenance_sstables = t.maintenance_sstable_set().all();
-                cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
-                        t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                {
+                    auto maintenance_sstables = t.maintenance_sstable_set().all();
+                    cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
+                               t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                }
                co_await run_offstrategy_compaction(_compaction_data);
                finish_compaction();
                cmlog.info("Done with off-strategy compaction for {}.{}", t.schema()->ks_name(), t.schema()->cf_name());
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -50,6 +50,8 @@ public:
    boost::icl::interval_map<dht::token, gc_clock::time_point, boost::icl::partial_absorber, std::less, boost::icl::inplace_max> map;
 };

+using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
+
 // Compaction manager provides facilities to submit and track compaction jobs on
 // behalf of existing tables.
 class compaction_manager {
@@ -152,8 +154,6 @@ public:
    protected:
        virtual future<compaction_stats_opt> do_run() = 0;

-        using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
-
        state switch_state(state new_state);

        future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -173,7 +173,7 @@ public:
        future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
        future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
-                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
+                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstables::offstrategy offstrategy = sstables::offstrategy::no);
        future<> update_history(compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
        bool should_update_history(sstables::compaction_type ct) {
            return ct == sstables::compaction_type::Compaction;
@@ -325,7 +325,7 @@ private:
    per_table_history_maps _repair_history_maps;
    tombstone_gc_state _tombstone_gc_state;
 private:
-    future<compaction_stats_opt> perform_task(shared_ptr<task>);
+    future<compaction_stats_opt> perform_task(shared_ptr<task>, throw_if_stopping do_throw_if_stopping = throw_if_stopping::no);

    future<> stop_tasks(std::vector<shared_ptr<task>> tasks, sstring reason);
    future<> update_throughput(uint32_t value_mbs);
@@ -470,7 +470,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping);

    class compaction_reenabler {
        compaction_manager& _cm;
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -65,7 +65,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
    return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
 }

-uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
+uint64_t compaction_strategy_impl::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
    return partition_estimate;
 }

@@ -750,8 +750,8 @@ compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema
    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, iop, mode);
 }

-uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate);
+uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) {
+    return _compaction_strategy_impl->adjust_partition_estimate(ms_meta, partition_estimate, std::move(schema));
 }

 reader_consumer_v2 compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) {
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -108,7 +108,7 @@ public:

    compaction_backlog_tracker make_backlog_tracker();

-    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr);

    reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -70,7 +70,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema);

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer);

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -144,6 +144,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;

+    leveled_manifest::logger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
+
    for (auto& sst : input) {
        auto sst_level = sst->get_sstable_level();
        if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -227,6 +229,9 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
 }

 unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
+    if (!max_sstable_size) {
+        return 1;
+    }
    auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
        double inv_log_fanout = 1.0f / std::log(fanout);
        return log(x) * inv_log_fanout;
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -100,16 +100,27 @@ public:
    };
 };

-uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
-        // Not enough information, we assume the worst
-        return partition_estimate / max_data_segregation_window_count;
-    }
-    const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
-    const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
-    const auto window_size = get_window_size(_options);
+uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) {
+    // If not enough information, we assume the worst
+    auto estimated_window_count = max_data_segregation_window_count;
+    auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
+    bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
+    auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
+        const auto window_size = get_window_size(_options);
+        return (max_window + (window_size - 1) - min_window) / window_size;
+    };

-    auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
+    if (!min_and_max_ts_available && default_ttl.count()) {
+        auto min_window = get_window_for(_options, timestamp_type(0));
+        auto max_window = get_window_for(_options, timestamp_type(default_ttl.count()));
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    } else if (min_and_max_ts_available) {
+        auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
+        auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
+
+        estimated_window_count = estimate_window_count(min_window, max_window);
+    }

    return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
 }
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -157,7 +157,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;

-    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;
+    virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr s) override;

    virtual reader_consumer_v2 make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer_v2 end_consumer) override;

--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -560,7 +560,7 @@ public:
            auto marker = it->second;
            ++it;
            if (it != e && marker != composite::eoc::none) {
-                throw runtime_exception(format("non-zero component divider found ({:d}) mid", format("0x{:02x}", composite::eoc_type(marker) & 0xff)));
+                throw runtime_exception(format("non-zero component divider found ({:#02x}) mid", composite::eoc_type(marker) & 0xff));
            }
        }
        return ret;
--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -117,6 +117,8 @@ struct date_type_impl final : public concrete_type<db_clock::time_point> {

 using timestamp_date_base_class = concrete_type<db_clock::time_point>;

+sstring timestamp_to_json_string(const timestamp_date_base_class& t, const bytes_view& bv);
+
 struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    timeuuid_type_impl();
    static utils::UUID from_sstring(sstring_view s);
--- a/configure.py
+++ b/configure.py
@@ -698,6 +698,7 @@ scylla_core = (['message/messaging_service.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
+                'utils/on_internal_error.cc',
                'converting_mutation_partition_applier.cc',
                'readers/combined.cc',
                'readers/multishard.cc',
@@ -969,6 +970,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
+                'repair/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1077,6 +1079,8 @@ api = ['api/api.cc',
       Json2Code('api/api-doc/error_injection.json'),
       'api/authorization_cache.cc',
       Json2Code('api/api-doc/authorization_cache.json'),
+       'api/raft.cc',
+       Json2Code('api/api-doc/raft.json'),
       ]

 alternator = [
@@ -1269,7 +1273,7 @@ deps['test/boost/bytes_ostream_test'] = [
    "test/lib/log.cc",
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
-deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
+deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'hashers.cc', 'utils/on_internal_error.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -463,8 +463,7 @@ orderByClause[raw::select_statement::parameters::orderings_type& orderings]
    ;

 jsonValue returns [expression value]
-    :
-    | s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
+    : s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
    | m=marker         { $value = std::move(m); }
    ;

@@ -1578,8 +1577,7 @@ marker returns [expression value]
    ;

 intValue returns [expression value]
-    :
-    | t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
+    : t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
    | e=marker      { $value = std::move(e); }
    ;

--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -78,7 +78,7 @@ static
 void
 usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    if (!receiver.type->is_user_type()) {
-        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid user type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto ut = static_pointer_cast<const user_type_impl>(receiver.type);
@@ -90,7 +90,7 @@ usertype_constructor_validate_assignable_to(const usertype_constructor& u, data_
        const expression& value = u.elements.at(field);
        auto&& field_spec = usertype_field_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *field_spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", receiver.name, field, field_spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid user type literal for {}: field {} is not of type {}", *receiver.name, field, field_spec->type->as_cql3_type()));
        }
    }
 }
@@ -313,7 +313,7 @@ set_validate_assignable_to(const collection_constructor& c, data_dictionary::dat
            return;
        }

-        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid set literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

    auto&& value_spec = set_value_spec_of(receiver);
@@ -501,18 +501,18 @@ void
 tuple_constructor_validate_assignable_to(const tuple_constructor& tc, data_dictionary::database db, const sstring& keyspace, const column_specification& receiver) {
    auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver.type->underlying_type());
    if (!tt) {
-        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver.name, receiver.type->as_cql3_type()));
+        throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    for (size_t i = 0; i < tc.elements.size(); ++i) {
        if (i >= tt->size()) {
            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: too many elements. Type {} expects {:d} but got {:d}",
-                                                            receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
+                                                            *receiver.name, tt->as_cql3_type(), tt->size(), tc.elements.size()));
        }

        auto&& value = tc.elements[i];
        auto&& spec = component_spec_of(receiver, i);
        if (!assignment_testable::is_assignable(test_assignment(value, db, keyspace, *spec))) {
-            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", receiver.name, i, spec->type->as_cql3_type()));
+            throw exceptions::invalid_request_exception(format("Invalid tuple literal for {}: component {:d} is not of type {}", *receiver.name, i, spec->type->as_cql3_type()));
        }
    }
 }
--- a/cql3/operation.cc
+++ b/cql3/operation.cc
@@ -32,9 +32,9 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
    using exceptions::invalid_request_exception;
    auto rtype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!rtype) {
-        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for non collection column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!rtype->is_multi_cell()) {
-        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (rtype->get_kind() == abstract_type::kind::list) {
@@ -47,7 +47,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
            return make_shared<lists::setter_by_index>(receiver, std::move(idx), std::move(lval));
        }
    } else if (rtype->get_kind() == abstract_type::kind::set) {
-        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name()));
+        throw invalid_request_exception(format("Invalid operation ({}) for set column {}", to_string(receiver), receiver.name_as_text()));
    } else if (rtype->get_kind() == abstract_type::kind::map) {
        auto key = prepare_expression(_selector, db, keyspace, nullptr, maps::key_spec_of(*receiver.column_specification));
        auto mval = prepare_expression(_value, db, keyspace, nullptr, maps::value_spec_of(*receiver.column_specification));
@@ -136,11 +136,11 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        return make_shared<constants::adder>(receiver, std::move(v));
    } else if (!ctype->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -169,14 +169,14 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
    auto ctype = dynamic_pointer_cast<const collection_type_impl>(receiver.type);
    if (!ctype) {
        if (!receiver.is_counter()) {
-            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name()));
+            throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non counter column {}", to_string(receiver), receiver.name_as_text()));
        }
        auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);
        return make_shared<constants::subtracter>(receiver, std::move(v));
    }
    if (!ctype->is_multi_cell()) {
        throw exceptions::invalid_request_exception(
-                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name()));
+                format("Invalid operation ({}) for frozen collection column {}", to_string(receiver), receiver.name_as_text()));
    }

    if (ctype->get_kind() == abstract_type::kind::list) {
@@ -211,9 +211,9 @@ operation::prepend::prepare(data_dictionary::database db, const sstring& keyspac
    auto v = prepare_expression(_value, db, keyspace, nullptr, receiver.column_specification);

    if (!dynamic_cast<const list_type_impl*>(receiver.type.get())) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for non list column {}", to_string(receiver), receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid operation ({}) for frozen list column {}", to_string(receiver), receiver.name_as_text()));
    }

    return make_shared<lists::prepender>(receiver, std::move(v));
@@ -340,9 +340,9 @@ operation::element_deletion::affected_column() const {
 shared_ptr<operation>
 operation::element_deletion::prepare(data_dictionary::database db, const sstring& keyspace, const column_definition& receiver) const {
    if (!receiver.type->is_collection()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for non collection column {}", receiver.name_as_text()));
    } else if (!receiver.type->is_multi_cell()) {
-        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name()));
+        throw exceptions::invalid_request_exception(format("Invalid deletion operation for frozen collection column {}", receiver.name_as_text()));
    }
    auto ctype = static_pointer_cast<const collection_type_impl>(receiver.type);
    if (ctype->get_kind() == abstract_type::kind::list) {
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -932,6 +932,9 @@ void query_processor::migration_subscriber::on_update_aggregate(const sstring& k
 void query_processor::migration_subscriber::on_update_view(
        const sstring& ks_name,
        const sstring& view_name, bool columns_changed) {
+    // scylladb/scylladb#16392 - Materialized views are also tables, so we need to at least
+    // handle them as such when they change.
+    on_update_column_family(ks_name, view_name, columns_changed);
 }

 void query_processor::migration_subscriber::on_drop_keyspace(const sstring& ks_name) {
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -404,20 +404,19 @@ alter_table_statement::prepare_schema_mutations(query_processor& qp, api::timest

 std::unique_ptr<cql3::statements::prepared_statement>
 cql3::statements::alter_table_statement::prepare(data_dictionary::database db, cql_stats& stats) {
+    auto t = db.try_find_table(keyspace(), column_family());
+    std::optional<schema_ptr> s = t ? std::make_optional(t->schema()) : std::nullopt;
+    std::optional<sstring> warning = check_restricted_table_properties(db, s, keyspace(), column_family(), *_properties);
+    if (warning) {
+        mylogger.warn("{}", *warning);
+    }
    return std::make_unique<prepared_statement>(make_shared<alter_table_statement>(*this));
 }

 future<::shared_ptr<messages::result_message>>
 alter_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    auto s = validation::validate_column_family(qp.db(), keyspace(), column_family());
-    std::optional<sstring> warning = check_restricted_table_properties(qp, s, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
+    validation::validate_column_family(qp.db(), keyspace(), column_family());
+    return schema_altering_statement::execute(qp, state, options);
 }

 }
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -141,6 +141,18 @@ user_type alter_type_statement::add_or_alter::do_add(data_dictionary::database d
        throw exceptions::invalid_request_exception(format("Cannot add new field to type {}: maximum number of fields reached", _name));
    }

+    if (_field_type->is_duration()) {
+        auto&& ks = db.find_keyspace(keyspace());
+        for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
+            for (auto&& column : schema->clustering_key_columns()) {
+                if (column.type->references_user_type(_name.get_keyspace(), _name.get_user_type_name())) {
+                    throw exceptions::invalid_request_exception(format("Cannot add new field to type {} because it is used in the clustering key column {} of table {}.{} where durations are not allowed",
+                        _name.to_string(), column.name_as_text(), schema->ks_name(), schema->cf_name()));
+                }
+            }
+        }
+    }
+
    std::vector<bytes> new_names(to_update->field_names());
    new_names.push_back(_field_name->name());
    std::vector<data_type> new_types(to_update->field_types());
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -185,6 +185,10 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    if (_properties.properties()->get_synchronous_updates_flag()) {
        throw exceptions::invalid_request_exception(format("The synchronous_updates option is only applicable to materialized views, not to base tables"));
    }
+    std::optional<sstring> warning = check_restricted_table_properties(db, std::nullopt, keyspace(), column_family(), *_properties.properties());
+    if (warning) {
+        mylogger.warn("{}", *warning);
+    }
    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());
@@ -426,7 +430,7 @@ void create_table_statement::raw_statement::add_column_alias(::shared_ptr<column
 // legal but restricted by the configuration. Checks for other of errors
 // in the table's options are done elsewhere.
 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops)
@@ -450,7 +454,7 @@ std::optional<sstring> check_restricted_table_properties(
    auto cs = (strategy) ? strategy : current_strategy;

    if (strategy && *strategy == sstables::compaction_strategy_type::date_tiered) {
-        switch(qp.db().get_config().restrict_dtcs()) {
+        switch(db.get_config().restrict_dtcs()) {
        case db::tri_mode_restriction_t::mode::TRUE:
            throw exceptions::configuration_exception(
                "DateTieredCompactionStrategy is deprecated, and "
@@ -471,7 +475,7 @@ std::optional<sstring> check_restricted_table_properties(
        std::map<sstring, sstring> options = (strategy) ? cfprops.get_compaction_type_options() : (*schema)->compaction_strategy_options();
        sstables::time_window_compaction_strategy_options twcs_options(options);
        long ttl = (cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE)) ? cfprops.get_default_time_to_live() : current_ttl.count();
-        auto max_windows = qp.db().get_config().twcs_max_window_count();
+        auto max_windows = db.get_config().twcs_max_window_count();

        // It may happen that an user tries to update an unrelated table property. Allow the request through.
        if (!cfprops.has_property(cf_prop_defs::KW_DEFAULT_TIME_TO_LIVE) && !strategy) {
@@ -491,7 +495,7 @@ std::optional<sstring> check_restricted_table_properties(
                                                   "highly discouraged.", ttl, twcs_options.get_sstable_window_size().count(), window_count, max_windows));
            }
        } else {
-              switch (qp.db().get_config().restrict_twcs_without_default_ttl()) {
+              switch (db.get_config().restrict_twcs_without_default_ttl()) {
              case db::tri_mode_restriction_t::mode::TRUE:
                  throw exceptions::configuration_exception(
                      "TimeWindowCompactionStrategy tables without a strict default_time_to_live setting "
@@ -510,18 +514,6 @@ std::optional<sstring> check_restricted_table_properties(
    return std::nullopt;
 }

-future<::shared_ptr<messages::result_message>>
-create_table_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    std::optional<sstring> warning = check_restricted_table_properties(qp, std::nullopt, keyspace(), column_family(), *_properties);
-    return schema_altering_statement::execute(qp, state, options).then([this, warning = std::move(warning)] (::shared_ptr<messages::result_message> msg) {
-        if (warning) {
-            msg->add_warning(*warning);
-            mylogger.warn("{}", *warning);
-        }
-        return msg;
-    });
-}
-
 }

 }
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -79,9 +79,6 @@ public:

    virtual future<> grant_permissions_to_creator(const service::client_state&) const override;

-    virtual future<::shared_ptr<messages::result_message>>
-    execute(query_processor& qp, service::query_state& state, const query_options& options) const override;
-
    schema_ptr get_cf_meta_data(const data_dictionary::database) const;

    class raw_statement;
@@ -129,7 +126,7 @@ public:
 };

 std::optional<sstring> check_restricted_table_properties(
-    query_processor& qp,
+    data_dictionary::database db,
    std::optional<schema_ptr> schema,
    const sstring& keyspace, const sstring& table,
    const cf_prop_defs& cfprops);
--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -178,7 +178,13 @@ future<std::vector<description>> function(replica::database& db, const sstring&

    auto udfs = boost::copy_range<std::vector<shared_ptr<const keyspace_element>>>(fs | boost::adaptors::transformed([] (const auto& f) {
        return dynamic_pointer_cast<const functions::user_function>(f.second);
+    }) | boost::adaptors::filtered([] (const auto& f) {
+        return f != nullptr;
    }));
+    if (udfs.empty()) {
+        throw exceptions::invalid_request_exception(format("Function '{}' not found in keyspace '{}'", name, ks));
+    }
+
    co_return co_await generate_descriptions(db, udfs, true);
 }

@@ -191,13 +197,19 @@ future<std::vector<description>> functions(replica::database& db,const sstring&

 future<std::vector<description>> aggregate(replica::database& db, const sstring& ks, const sstring& name) {
    auto fs = functions::functions::find(functions::function_name(ks, name));
-    if(fs.empty()) {
+    if (fs.empty()) {
        throw exceptions::invalid_request_exception(format("Aggregate '{}' not found in keyspace '{}'", name, ks));
    }

    auto udas = boost::copy_range<std::vector<shared_ptr<const keyspace_element>>>(fs | boost::adaptors::transformed([] (const auto& f) {
        return dynamic_pointer_cast<const functions::user_aggregate>(f.second);
+    }) | boost::adaptors::filtered([] (const auto& f) {
+        return f != nullptr;
    }));
+    if (udas.empty()) {
+        throw exceptions::invalid_request_exception(format("Aggregate '{}' not found in keyspace '{}'", name, ks));
+    }
+
    co_return co_await generate_descriptions(db, udas, true);
 }

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -120,7 +120,10 @@ schema_altering_statement::execute(query_processor& qp, service::query_state& st
    }

    return execute0(qp, state, options).then([this, &state, internal](::shared_ptr<messages::result_message> result) {
-        auto permissions_granted_fut = internal
+        // Do not grant permissions to the supposed creator, even if the statement succeeded, when it is an internal query
+        // or when the query did not actually create the item, i.e. the query was bounced to another shard or it is an
+        // IF NOT EXISTS query where the item already exists.
+        auto permissions_granted_fut = internal || !result->is_schema_change()
                ? make_ready_future<>()
                : grant_permissions_to_creator(state.get_client_state());
        return permissions_granted_fut.then([result = std::move(result)] {
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -60,7 +60,11 @@ void use_statement::validate(query_processor&, const service::client_state& stat

 future<::shared_ptr<cql_transport::messages::result_message>>
 use_statement::execute(query_processor& qp, service::query_state& state, const query_options& options) const {
-    state.get_client_state().set_keyspace(qp.db().real_database(), _keyspace);
+    try {
+        state.get_client_state().set_keyspace(qp.db().real_database(), _keyspace);
+    } catch(...) {
+        return make_exception_future<::shared_ptr<cql_transport::messages::result_message>>(std::current_exception());
+    }
    auto result =::make_shared<cql_transport::messages::result_message::set_keyspace>(_keyspace);
    return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(result);
 }
--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -151,14 +151,19 @@ static bytes from_json_object_aux(const map_type_impl& t, const rjson::value& va
    std::map<bytes, bytes, serialized_compare> raw_map(t.get_keys_type()->as_less_comparator());
    for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) {
        bytes value = from_json_object(*t.get_values_type(), it->value);
-        if (!t.get_keys_type()->is_compatible_with(*utf8_type)) {
+        if (t.get_keys_type()->underlying_type() == ascii_type ||
+            t.get_keys_type()->underlying_type() == utf8_type) {
+            raw_map.emplace(from_json_object(*t.get_keys_type(), it->name), std::move(value));
+        } else {
            // Keys in maps can only be strings in JSON, but they can also be a string representation
            // of another JSON type, which needs to be reparsed. Example - map<frozen<list<int>>, int>
            // will be represented like this: { "[1, 3, 6]": 3, "[]": 0, "[1, 2]": 2 }
-            rjson::value map_key = rjson::parse(rjson::to_string_view(it->name));
-            raw_map.emplace(from_json_object(*t.get_keys_type(), map_key), std::move(value));
-        } else {
-            raw_map.emplace(from_json_object(*t.get_keys_type(), it->name), std::move(value));
+            try {
+                rjson::value map_key = rjson::parse(rjson::to_string_view(it->name));
+                raw_map.emplace(from_json_object(*t.get_keys_type(), map_key), std::move(value));
+            } catch (rjson::error& e) {
+                throw marshal_exception(format("Failed parsing map_key {}: {}", it->name, e.what()));
+            }
        }
    }
    return map_type_impl::serialize_to_bytes(raw_map);
@@ -480,7 +485,7 @@ struct to_json_string_visitor {
    sstring operator()(const string_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const bytes_type_impl& t) { return quote_json_string("0x" + t.to_string(bv)); }
    sstring operator()(const boolean_type_impl& t) { return t.to_string(bv); }
-    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(t.to_string(bv)); }
+    sstring operator()(const timestamp_date_base_class& t) { return quote_json_string(timestamp_to_json_string(t, bv)); }
    sstring operator()(const timeuuid_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const map_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const set_type_impl& t) { return to_json_string_aux(t, bv); }
@@ -488,7 +493,7 @@ struct to_json_string_visitor {
    sstring operator()(const tuple_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const user_type_impl& t) { return to_json_string_aux(t, bv); }
    sstring operator()(const simple_date_type_impl& t) { return quote_json_string(t.to_string(bv)); }
-    sstring operator()(const time_type_impl& t) { return t.to_string(bv); }
+    sstring operator()(const time_type_impl& t) { return quote_json_string(t.to_string(bv)); }
    sstring operator()(const empty_type_impl& t) { return "null"; }
    sstring operator()(const duration_type_impl& t) {
        auto v = t.deserialize(bv);
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -254,6 +254,11 @@ keyspace_metadata::new_keyspace(std::string_view name,
    return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
 }

+lw_shared_ptr<keyspace_metadata>
+keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) { // builds a new keyspace_metadata from ksm's name, strategy, options, durable_writes and storage options
+    return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.durable_writes(), std::vector<schema_ptr>{}, ksm.get_storage_options()); // cf_defs is passed empty: ksm's tables are not copied
+}
+
 void keyspace_metadata::add_user_type(const user_type ut) {
    _user_types.add_type(ut);
 }
--- a/data_dictionary/keyspace_metadata.hh
+++ b/data_dictionary/keyspace_metadata.hh
@@ -55,6 +55,8 @@ public:
                 bool durables_writes,
                 std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
                 storage_options storage_opts = {});
+    static lw_shared_ptr<keyspace_metadata>
+    new_keyspace(const keyspace_metadata& ksm);
    void validate(const locator::topology&) const;
    const sstring& name() const {
        return _name;
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -2591,12 +2591,20 @@ db::commitlog::read_log_file(sstring filename, sstring pfx, seastar::io_priority
            return eof || next == pos;
        }
        future<> skip(size_t bytes) {
-            pos += bytes;
-            if (pos > file_size) {
+            auto n = std::min(file_size - pos, bytes); // clamp the skip to what is left in the file (assumes pos <= file_size on entry - TODO confirm)
+            pos += n;
+            if (pos == file_size) {
                eof = true;
-                pos = file_size;
            }
-            return fin.skip(bytes);
+            if (n < bytes) {
+                // If we are asked to skip past the end of the file, then either
+                // the skipped bytes or the source from which we read this
+                // is corrupt. So account for at least four bytes. This is
+                // inexact, but adding the full "bytes" is equally wrong
+                // since it could be complete garbled junk.
+                corrupt_size += std::max(n, sizeof(uint32_t));
+            }
+            return fin.skip(n);
        }
        void stop() {
            eof = true;
--- a/db/config.cc
+++ b/db/config.cc
@@ -406,6 +406,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Adjusts the sensitivity of the failure detector on an exponential scale. Generally this setting never needs adjusting.\n"
        "Related information: Failure detection and recovery")
    , failure_detector_timeout_in_ms(this, "failure_detector_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 20 * 1000, "Maximum time between two successful echo message before gossip mark a node down in milliseconds.\n")
+    , direct_failure_detector_ping_timeout_in_ms(this, "direct_failure_detector_ping_timeout_in_ms", value_status::Used, 600, "Duration after which the direct failure detector aborts a ping message, so the next ping can start.\n"
+        "Note: this failure detector is used by Raft, and is different from gossiper's failure detector (configured by `failure_detector_timeout_in_ms`).\n")
    /* Performance tuning properties */
    /* Tuning performance and system reso   urce utilization, including commit log, compaction, memory, disk I/O, CPU, reads, and writes. */
    /* Commit log settings */
@@ -817,6 +819,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit")
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default) "
        "bytes written to data file. Value must be between 0 and 1.")
+    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting")
@@ -829,6 +832,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"md", "me"})
+    , table_digest_insensitive_to_expiry(this, "table_digest_insensitive_to_expiry", liveness::MustRestart, value_status::Used, true,
+            "When enabled, per-table schema digest calculation ignores empty partitions.")
    , enable_dangerous_direct_import_of_cassandra_counters(this, "enable_dangerous_direct_import_of_cassandra_counters", value_status::Used, false, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1."
        " It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.")
    , enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance")
--- a/db/config.hh
+++ b/db/config.hh
@@ -173,6 +173,7 @@ public:
    named_value<bool> snapshot_before_compaction;
    named_value<uint32_t> phi_convict_threshold;
    named_value<uint32_t> failure_detector_timeout_in_ms;
+    named_value<uint32_t> direct_failure_detector_ping_timeout_in_ms;
    named_value<sstring> commitlog_sync;
    named_value<uint32_t> commitlog_segment_size_in_mb;
    named_value<uint32_t> schema_commitlog_segment_size_in_mb;
@@ -322,6 +323,7 @@ public:
    named_value<unsigned> murmur3_partitioner_ignore_msb_bits;
    named_value<double> unspooled_dirty_soft_limit;
    named_value<double> sstable_summary_ratio;
+    named_value<double> components_memory_reclaim_threshold;
    named_value<size_t> large_memory_allocation_warning_threshold;
    named_value<bool> enable_deprecated_partitioners;
    named_value<bool> enable_keyspace_column_family_metrics;
@@ -332,6 +334,7 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
+    named_value<bool> table_digest_insensitive_to_expiry;
    named_value<bool> enable_dangerous_direct_import_of_cassandra_counters;
    named_value<bool> enable_shard_aware_drivers;
    named_value<bool> enable_ipv6_dns_lookup;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -860,7 +860,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
        ctx_ptr->mark_hint_as_in_progress(rp);

        // Future is waited on indirectly in `send_one_file()` (via `ctx_ptr->file_send_gate`).
-        (void)with_gate(ctx_ptr->file_send_gate, [this, secs_since_file_mod, &fname, buf = std::move(buf), rp, ctx_ptr] () mutable {
+        auto h = ctx_ptr->file_send_gate.hold();
+        (void)std::invoke([this, secs_since_file_mod, &fname, buf = std::move(buf), rp, ctx_ptr] () mutable {
            try {
                auto m = this->get_mutation(ctx_ptr, buf);
                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();
@@ -896,7 +897,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                return make_exception_future<>(std::move(eptr));
            }
            return make_ready_future<>();
-        }).then_wrapped([this, units = std::move(units), rp, ctx_ptr] (future<>&& f) {
+        }).then_wrapped([this, units = std::move(units), rp, ctx_ptr, h = std::move(h)] (future<>&& f) {
            // Information about the error was already printed somewhere higher.
            // We just need to account in the ctx that sending of this hint has failed.
            if (!f.failed()) {
--- a/db/hints/sync_point.cc
+++ b/db/hints/sync_point.cc
@@ -17,13 +17,22 @@
 #include "idl/hinted_handoff.dist.hh"
 #include "idl/hinted_handoff.dist.impl.hh"
 #include "utils/base64.hh"
+#include "xx_hasher.hh"

 namespace db {
 namespace hints {
-
+// Sync points can be encoded in two formats: V1 and V2. V2 extends V1 by appending
+// a checksum. New sync points are always encoded in the V2 format, but sync points
+// encoded in the V1 format can still be safely decoded.
+//
 // Format V1 (encoded in base64):
 //   uint8_t 0x01 - version of format
-//   sync_point_v1 - encoded using IMR
+//   sync_point_v1 - encoded using IDL
+//
+// Format V2 (encoded in base64):
+//   uint8_t 0x02 - version of format
+//   sync_point_v1 - encoded using IDL
+//   uint64_t - xxHash checksum of all preceding bytes (the version byte and the serialized sync_point_v1)
 //
 // sync_point_v1:
 //   UUID host_id - ID of the host which created the sync point
@@ -41,6 +50,9 @@ namespace hints {
 //       Flattened representation was chosen in order to save space on
 //       vector lengths etc.

+static constexpr size_t version_size = sizeof(uint8_t);
+static constexpr size_t checksum_size = sizeof(uint64_t);
+
 static std::vector<sync_point::shard_rps> decode_one_type_v1(uint16_t shard_count, const per_manager_sync_point_v1& v1) {
    std::vector<sync_point::shard_rps> ret;

@@ -67,16 +79,37 @@ static std::vector<sync_point::shard_rps> decode_one_type_v1(uint16_t shard_coun
    return ret;
 }

+static uint64_t calculate_checksum(const sstring_view s) { // returns the 64-bit xx_hasher digest of the bytes in s
+    xx_hasher h;
+    h.update(s.data(), s.size());
+    return h.finalize_uint64();
+}
+
 sync_point sync_point::decode(sstring_view s) {
    bytes raw = base64_decode(s);
    if (raw.empty()) {
        throw std::runtime_error("Could not decode the sync point - not a valid hex string");
    }
-    if (raw[0] != 1) {
-        throw std::runtime_error(format("Unsupported sync point format version: {}", int(raw[0])));
+
+    sstring_view raw_s(reinterpret_cast<const char*>(raw.data()), raw.size());
+    seastar::simple_memory_input_stream in{raw_s.data(), raw_s.size()};
+
+    uint8_t version = ser::serializer<uint8_t>::read(in);
+    if (version == 2) {
+        if (raw_s.size() < version_size + checksum_size) {
+            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - serialized blob is too short");
+        }
+
+        seastar::simple_memory_input_stream in_checksum{raw_s.end() - checksum_size, checksum_size};
+        uint64_t checksum = ser::serializer<uint64_t>::read(in_checksum);
+        if (checksum != calculate_checksum(raw_s.substr(0, raw_s.size() - checksum_size))) {
+            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - wrong checksum");
+        }
+    }
+    else if (version != 1) {
+        throw std::runtime_error(format("Unsupported sync point format version: {}", int(version)));
    }

-    seastar::simple_memory_input_stream in{reinterpret_cast<const char*>(raw.data()) + 1, raw.size() - 1};
    sync_point_v1 v1 = ser::serializer<sync_point_v1>::read(in);

    return sync_point{
@@ -133,11 +166,16 @@ sstring sync_point::encode() const {
    seastar::measuring_output_stream measure;
    ser::serializer<sync_point_v1>::write(measure, v1);

-    // Reserve 1 byte for the version
-    bytes serialized{bytes::initialized_later{}, 1 + measure.size()};
-    serialized[0] = 1;
-    seastar::simple_memory_output_stream out{reinterpret_cast<char*>(serialized.data()), measure.size(), 1};
+    // Reserve version_size bytes for the version and checksum_size bytes for the checksum
+    bytes serialized{bytes::initialized_later{}, version_size + measure.size() + checksum_size};
+
+    // Encode using V2 format
+    seastar::simple_memory_output_stream out{reinterpret_cast<char*>(serialized.data()), serialized.size()};
+    ser::serializer<uint8_t>::write(out, 2);
    ser::serializer<sync_point_v1>::write(out, v1);
+    sstring_view serialized_s(reinterpret_cast<const char*>(serialized.data()), version_size + measure.size());
+    uint64_t checksum = calculate_checksum(serialized_s);
+    ser::serializer<uint64_t>::write(out, checksum);

    return base64_encode(serialized);
 }
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -157,7 +157,7 @@ future<> cql_table_large_data_handler::try_record(std::string_view large_table,
    const auto sstable_name = large_data_handler::sst_filename(sst);
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
+    large_data_logger.warn("Writing large {} {}/{}: {} ({} bytes) to {}", desc, ks_name, cf_name, extra_path, size, sstable_name);
    return _sys_ks->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -184,10 +184,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells(const sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name);
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -199,10 +199,10 @@ future<> cql_table_large_data_handler::internal_record_large_cells_and_collectio
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("/{}/{}", ck_str, column_name), extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
    } else {
        auto desc = format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, format("//{}", column_name), extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
    }
 }

@@ -212,7 +212,7 @@ future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        std::string ck_str = key_to_str(*clustering_key, s);
-        return try_record("row", sst, partition_key, int64_t(row_size), "row", format("/{}", ck_str), extra_fields,  ck_str);
+        return try_record("row", sst, partition_key, int64_t(row_size), "row", "", extra_fields, ck_str);
    } else {
        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
--- a/db/schema_features.hh
+++ b/db/schema_features.hh
@@ -24,6 +24,10 @@ enum class schema_feature {
    PER_TABLE_PARTITIONERS,
    SCYLLA_KEYSPACES,
    SCYLLA_AGGREGATES,
+
+    // When enabled, schema_mutations::digest() will skip empty mutations (with only tombstones),
+    // so that the digest remains the same after schema tables are compacted.
+    TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
 };

 using schema_features = enum_set<super_enum<schema_feature,
@@ -33,7 +37,8 @@ using schema_features = enum_set<super_enum<schema_feature,
    schema_feature::CDC_OPTIONS,
    schema_feature::PER_TABLE_PARTITIONERS,
    schema_feature::SCYLLA_KEYSPACES,
-    schema_feature::SCYLLA_AGGREGATES
+    schema_feature::SCYLLA_AGGREGATES,
+    schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY
    >>;

 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -93,15 +93,18 @@ static bool is_extra_durable(const sstring& ks_name, const sstring& cf_name) {
 /** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
 namespace db {

-schema_ctxt::schema_ctxt(const db::config& cfg, std::shared_ptr<data_dictionary::user_types_storage> uts)
-    : _extensions(cfg.extensions())
+schema_ctxt::schema_ctxt(const db::config& cfg, std::shared_ptr<data_dictionary::user_types_storage> uts, 
+                         const gms::feature_service& features, replica::database* db)
+    : _db(db) // stored as a raw pointer; callers of get_db() see exactly what was passed here
+    , _features(features) // bound to a reference member - presumably the feature service must outlive this context; verify against callers
+    , _extensions(cfg.extensions())
    , _murmur3_partitioner_ignore_msb_bits(cfg.murmur3_partitioner_ignore_msb_bits())
    , _schema_registry_grace_period(cfg.schema_registry_grace_period())
    , _user_types(std::move(uts))
 {}

-schema_ctxt::schema_ctxt(const replica::database& db)
-    : schema_ctxt(db.get_config(), db.as_user_types_storage())
+schema_ctxt::schema_ctxt(replica::database& db) // non-const: the address of db is handed on as a replica::database*
+    : schema_ctxt(db.get_config(), db.as_user_types_storage(), db.features(), &db)
 {}

 schema_ctxt::schema_ctxt(distributed<replica::database>& db)
@@ -148,7 +151,8 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
    std::map<table_id, schema_mutations>&& tables_before,
    std::map<table_id, schema_mutations>&& tables_after,
    std::map<table_id, schema_mutations>&& views_before,
-    std::map<table_id, schema_mutations>&& views_after);
+    std::map<table_id, schema_mutations>&& views_after,
+    bool reload);

 struct [[nodiscard]] user_types_to_drop final {
    seastar::noncopyable_function<future<> ()> drop;
@@ -161,7 +165,7 @@ static future<user_types_to_drop> merge_types(distributed<service::storage_proxy
 static future<> merge_functions(distributed<service::storage_proxy>& proxy, schema_result before, schema_result after);
 static future<> merge_aggregates(distributed<service::storage_proxy>& proxy, schema_result before, schema_result after, schema_result scylla_before, schema_result scylla_after);

-static future<> do_merge_schema(distributed<service::storage_proxy>&, std::vector<mutation>, bool do_flush);
+static future<> do_merge_schema(distributed<service::storage_proxy>&, std::vector<mutation>, bool do_flush, bool reload);

 using computed_columns_map = std::unordered_map<bytes, column_computation_ptr>;
 static computed_columns_map get_computed_columns(const schema_mutations& sm);
@@ -941,18 +945,18 @@ future<> update_schema_version_and_announce(sharded<db::system_keyspace>& sys_ks
 * @throws ConfigurationException If one of metadata attributes has invalid value
 * @throws IOException If data was corrupted during transportation or failed to apply fs operations
 */
-future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations)
+future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations, bool reload) // reload is only passed through to do_merge_schema()
 {
    if (this_shard_id() != 0) {
        // mutations must be applied on the owning shard (0).
        co_await smp::submit_to(0, [&, fmuts = freeze(mutations)] () mutable -> future<> {
-            return merge_schema(sys_ks, proxy, feat, unfreeze(fmuts));
+            return merge_schema(sys_ks, proxy, feat, unfreeze(fmuts), reload); // re-enter on shard 0 with the same reload flag
        });
        co_return;
    }
    co_await with_merge_lock([&] () mutable -> future<> {
        bool flush_schema = proxy.local().get_db().local().get_config().flush_schema_tables_after_modification();
-        co_await do_merge_schema(proxy, std::move(mutations), flush_schema);
+        co_await do_merge_schema(proxy, std::move(mutations), flush_schema, reload);
        co_await update_schema_version_and_announce(sys_ks, proxy, feat.cluster_schema_features());
    });
 }
@@ -1095,7 +1099,7 @@ future<> store_column_mapping(distributed<service::storage_proxy>& proxy, schema
 // and or filesystem calls, e.g. fsync.
 constexpr size_t max_concurrent = 8;

-static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
+static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush, bool reload)
 {
    slogger.trace("do_merge_schema: {}", mutations);
    schema_ptr s = keyspaces();
@@ -1110,6 +1114,12 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
        delete_schema_version(mutation);
    }

+    if (reload) {
+        for (auto&& ks : proxy.local().get_db().local().get_non_system_keyspaces()) {
+            keyspaces.emplace(ks);
+        }
+    }
+
    // current state of the schema
    auto&& old_keyspaces = co_await read_schema_for_keyspaces(proxy, KEYSPACES, keyspaces);
    auto&& old_column_families = co_await read_tables_for_keyspaces(proxy, keyspaces, tables());
@@ -1145,18 +1155,16 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
    auto types_to_drop = co_await merge_types(proxy, std::move(old_types), std::move(new_types));
    co_await merge_tables_and_views(proxy,
        std::move(old_column_families), std::move(new_column_families),
-        std::move(old_views), std::move(new_views));
+        std::move(old_views), std::move(new_views), reload);
    co_await merge_functions(proxy, std::move(old_functions), std::move(new_functions));
    co_await merge_aggregates(proxy, std::move(old_aggregates), std::move(new_aggregates), std::move(old_scylla_aggregates), std::move(new_scylla_aggregates));
    co_await types_to_drop.drop();

-    co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
-        // it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
-        for (auto keyspace_to_drop : keyspaces_to_drop) {
-            db.drop_keyspace(keyspace_to_drop);
-            co_await db.get_notifier().drop_keyspace(keyspace_to_drop);
-        }
-    });
+    auto& sharded_db = proxy.local().get_db();
+    // it is safe to drop a keyspace only when all nested ColumnFamilies were deleted
+    for (auto keyspace_to_drop : keyspaces_to_drop) {
+        co_await replica::database::drop_keyspace_on_all_shards(sharded_db, keyspace_to_drop);
+    }
 }

 future<lw_shared_ptr<query::result_set>> extract_scylla_specific_keyspace_info(distributed<service::storage_proxy>& proxy, const schema_result_value_type& partition) {
@@ -1205,19 +1213,18 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
        slogger.info("Altering keyspace {}", key);
        altered.emplace_back(key);
    }
-    co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
-        for (auto&& val : created) {
-            auto scylla_specific_rs = co_await extract_scylla_specific_keyspace_info(proxy, val);
-            auto ksm = create_keyspace_from_schema_partition(val, std::move(scylla_specific_rs));
-            co_await db.create_keyspace(ksm, proxy.local().get_erm_factory());
-            co_await db.get_notifier().create_keyspace(ksm);
-        }
-        {
-            for (auto& name : altered) {
-                co_await db.update_keyspace(proxy, name);
-            };
-        }
-    });
+    auto& sharded_db = proxy.local().get_db();
+    for (auto&& val : created) {
+        auto scylla_specific_rs = co_await extract_scylla_specific_keyspace_info(proxy, val);
+        auto ksm = create_keyspace_from_schema_partition(val, std::move(scylla_specific_rs));
+        co_await replica::database::create_keyspace_on_all_shards(sharded_db, proxy, *ksm);
+    }
+    for (auto& name : altered) {
+        auto v = co_await db::schema_tables::read_schema_partition_for_keyspace(proxy, db::schema_tables::KEYSPACES, name);
+        auto scylla_specific_rs = co_await db::schema_tables::extract_scylla_specific_keyspace_info(proxy, v);
+        auto tmp_ksm = db::schema_tables::create_keyspace_from_schema_partition(v, scylla_specific_rs);
+        co_await replica::database::update_keyspace_on_all_shards(sharded_db, proxy, *tmp_ksm);
+    }
    co_return dropped;
 }

@@ -1252,6 +1259,7 @@ enum class schema_diff_side {
 static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
    std::map<table_id, schema_mutations>&& before,
    std::map<table_id, schema_mutations>&& after,
+    bool reload,
    noncopyable_function<schema_ptr (schema_mutations sm, schema_diff_side)> create_schema)
 {
    schema_diff d;
@@ -1272,6 +1280,13 @@ static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy
        slogger.info("Altering {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
        d.altered.emplace_back(schema_diff::altered_schema{s_before, s});
    }
+    if (reload) {
+        for (auto&& key: diff.entries_in_common) {
+            auto s = create_schema(std::move(after.at(key)), schema_diff_side::right);
+            slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
+            d.altered.emplace_back(schema_diff::altered_schema {s, s});
+        }
+    }
    return d;
 }

@@ -1284,12 +1299,13 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
    std::map<table_id, schema_mutations>&& tables_before,
    std::map<table_id, schema_mutations>&& tables_after,
    std::map<table_id, schema_mutations>&& views_before,
-    std::map<table_id, schema_mutations>&& views_after)
+    std::map<table_id, schema_mutations>&& views_after,
+    bool reload)
 {
-    auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (schema_mutations sm, schema_diff_side) {
+    auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), reload, [&] (schema_mutations sm, schema_diff_side) {
        return create_table_from_mutations(proxy, std::move(sm));
    });
-    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm, schema_diff_side side) {
+    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), reload, [&] (schema_mutations sm, schema_diff_side side) {
        // The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
        // If we don't do it we are leaving a window where write commands to this schema are illegal.
        // There are 3 possibilities:
@@ -2964,7 +2980,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
    if (version) {
        builder.with_version(*version);
    } else {
-        builder.with_version(sm.digest());
+        builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
    }

    if (auto partitioner = sm.partitioner()) {
@@ -3195,7 +3211,7 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm
    if (version) {
        builder.with_version(*version);
    } else {
-        builder.with_version(sm.digest());
+        builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
    }

    auto base_id = table_id(row.get_nonnull<utils::UUID>("base_table_id"));
@@ -3524,7 +3540,8 @@ view_ptr maybe_fix_legacy_secondary_index_mv_schema(replica::database& db, const
    if (v->clustering_key_size() == 0) {
        return view_ptr(nullptr);
    }
-    const column_definition& first_view_ck = v->clustering_key_columns().front();
+    const auto ck_cols = v->clustering_key_columns();
+    const column_definition& first_view_ck = ck_cols.front();
    if (first_view_ck.is_computed()) {
        return view_ptr(nullptr);
    }
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -14,6 +14,7 @@
 #include "schema_fwd.hh"
 #include "schema_features.hh"
 #include "hashing.hh"
+#include "gms/feature_service.hh"
 #include "schema_mutations.hh"
 #include "types/map.hh"
 #include "query-result-set.hh"
@@ -66,8 +67,8 @@ class config;

 class schema_ctxt {
 public:
-    schema_ctxt(const config&, std::shared_ptr<data_dictionary::user_types_storage> uts);
-    schema_ctxt(const replica::database&);
+    schema_ctxt(const config&, std::shared_ptr<data_dictionary::user_types_storage> uts, const gms::feature_service&, replica::database* = nullptr);
+    schema_ctxt(replica::database&);
    schema_ctxt(distributed<replica::database>&);
    schema_ctxt(distributed<service::storage_proxy>&);

@@ -87,7 +88,17 @@ public:
        return *_user_types;
    }

+    const gms::feature_service& features() const {
+        return _features;
+    }
+
+    replica::database* get_db() const {
+        return _db;
+    }
+
 private:
+    replica::database* _db;
+    const gms::feature_service& _features;
    const db::extensions& _extensions;
    const unsigned _murmur3_partitioner_ignore_msb_bits;
    const uint32_t _schema_registry_grace_period;
@@ -128,6 +139,7 @@ schema_ptr indexes();
 schema_ptr tables();
 schema_ptr scylla_tables(schema_features features = schema_features::full());
 schema_ptr views();
+schema_ptr types();
 schema_ptr computed_columns();
 // Belongs to the "system" keyspace
 schema_ptr scylla_table_schema_history();
@@ -184,7 +196,7 @@ future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, co
 // Must be called on shard 0.
 future<semaphore_units<>> hold_merge_lock() noexcept;

-future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations);
+future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations, bool reload);

 // Recalculates the local schema version.
 //
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -486,7 +486,7 @@ system_distributed_keyspace::read_cdc_topology_description(
            return {};
        }

-        std::vector<cdc::token_range_description> entries;
+        utils::chunked_vector<cdc::token_range_description> entries;

        auto entries_val = value_cast<list_type_impl::native_type>(
                cdc_generation_description_type->deserialize(cql_result->one().get_view("description")));
@@ -580,7 +580,7 @@ system_distributed_keyspace::insert_cdc_generation(

 future<std::optional<cdc::topology_description>>
 system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
-    std::vector<cdc::token_range_description> entries;
+    utils::chunked_vector<cdc::token_range_description> entries;
    auto num_ranges = 0;
    co_await _qp.query_internal(
            // This should be a local read so 20s should be more than enough
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -2839,8 +2839,7 @@ static void install_virtual_readers(db::system_keyspace& sys_ks, replica::databa

 static bool maybe_write_in_user_memory(schema_ptr s) {
    return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
-            || s == system_keyspace::v3::scylla_views_builds_in_progress()
-            || s == system_keyspace::raft();
+            || s == system_keyspace::v3::scylla_views_builds_in_progress();
 }

 future<> system_keyspace_make(db::system_keyspace& sys_ks, distributed<replica::database>& dist_db, distributed<service::storage_service>& dist_ss, sharded<gms::gossiper>& dist_gossiper, distributed<service::raft_group_registry>& dist_raft_gr, db::config& cfg, table_selector& tables) {
--- a/db/tags/utils.cc
+++ b/db/tags/utils.cc
@@ -11,6 +11,8 @@
 #include "db/tags/extension.hh"
 #include "schema_builder.hh"
 #include "schema_registry.hh"
+#include "service/storage_proxy.hh"
+#include "data_dictionary/data_dictionary.hh"

 namespace db {

@@ -38,14 +40,27 @@ std::optional<std::string> find_tag(const schema& s, const sstring& tag) {
    }
 }

-future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
-    co_await mm.container().invoke_on(0, [s = global_schema_ptr(std::move(schema)), tags_map = std::move(tags_map)] (service::migration_manager& mm) -> future<> {
+future<> modify_tags(service::migration_manager& mm, sstring ks, sstring cf,
+                     std::function<void(std::map<sstring, sstring>&)> modify) {
+    co_await mm.container().invoke_on(0, [ks = std::move(ks), cf = std::move(cf), modify = std::move(modify)] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
        auto group0_guard = co_await mm.start_group0_operation();
-
+        // After getting the schema-modification lock, we need to read the
+        // table's *current* schema - it might have changed before we got
+        // the lock, by some concurrent modification. If the table is gone,
+        // this will throw no_such_column_family.
+        schema_ptr s = mm.get_storage_proxy().data_dictionary().find_schema(ks, cf);
+        const std::map<sstring, sstring>* tags_ptr = get_tags_of_table(s);
+        std::map<sstring, sstring> tags;
+        if (tags_ptr) {
+            // tags_ptr is a constant pointer to schema data. To allow modify()
+            // to change the tags, we must make a copy.
+            tags = *tags_ptr;
+        }
+        modify(tags);
        schema_builder builder(s);
-        builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags_map));
+        builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags));

        auto m = co_await mm.prepare_column_family_update_announcement(builder.build(), false, std::vector<view_ptr>(), group0_guard.write_timestamp());

--- a/db/tags/utils.hh
+++ b/db/tags/utils.hh
@@ -33,9 +33,18 @@ const std::map<sstring, sstring>* get_tags_of_table(schema_ptr schema);
 // tags exist but not this tag.
 std::optional<std::string> find_tag(const schema& s, const sstring& tag);

-// FIXME: Updating tags currently relies on updating schema, which may be subject
-// to races during concurrent updates of the same table. Once Scylla schema updates
-// are fixed, this issue will automatically get fixed as well.
-future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map);
-
+// modify_tags() atomically modifies the tags on a given table: It reads the
+// existing tags, passes them as a map to the given function which can modify
+// the map, and finally writes the modified tags. This read-modify-write
+// operation is atomic - isolated from other concurrent schema operations.
+//
+// The isolation requirement is also why modify_tags() takes the table's name
+// ks,cf and not a schema object - the current schema may not be relevant by
+// the time the tags are modified, due to some other concurrent modification.
+// If a table (ks, cf) doesn't exist, no_such_column_family is thrown.
+//
+// If the table didn't have the tags schema extension, it's fine: The function
+// is passed an empty map, and the tags it adds will be added to the table.
+future<> modify_tags(service::migration_manager& mm, sstring ks, sstring cf,
+                     std::function<void(std::map<sstring, sstring>&)> modify_func);
 }
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -487,37 +487,56 @@ mutation_partition& view_updates::partition_for(partition_key&& key) {
 }

 size_t view_updates::op_count() const {
-    return _op_count++;;
+    return _op_count;
 }

 row_marker view_updates::compute_row_marker(const clustering_or_static_row& base_row) const {
    /*
-     * We need to compute both the timestamp and expiration.
+     * We need to compute both the timestamp and expiration for view rows.
     *
-     * There are 3 cases:
-     *   1) There is a column that is not in the base PK but is in the view PK. In that case, as long as that column
-     *      lives, the view entry does too, but as soon as it expires (or is deleted for that matter) the entry also
-     *      should expire. So the expiration for the view is the one of that column, regardless of any other expiration.
-     *      To take an example of that case, if you have:
-     *        CREATE TABLE t (a int, b int, c int, PRIMARY KEY (a, b))
-     *        CREATE MATERIALIZED VIEW mv AS SELECT * FROM t WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)
-     *        INSERT INTO t(a, b) VALUES (0, 0) USING TTL 3;
-     *        UPDATE t SET c = 0 WHERE a = 0 AND b = 0;
-     *      then even after 3 seconds elapsed, the row will still exist (it just won't have a "row marker" anymore) and so
-     *      the MV should still have a corresponding entry.
-     *      This cell determines the liveness of the view row.
-     *   2) The columns for the base and view PKs are exactly the same, and all base columns are selected by the view.
-     *      In that case, all components (marker, deletion and cells) are the same and trivially mapped.
-     *   3) The columns for the base and view PKs are exactly the same, but some base columns are not selected in the view.
-     *      Use the max timestamp out of the base row marker and all the unselected columns - this ensures we can keep the
-     *      view row alive. Do the same thing for the expiration, if the marker is dead or will expire, and so
-     *      will all unselected columns.
+     * Below there are several distinct cases depending on how many new key
+     * columns the view has - i.e., how many of the view's key columns were
+     * regular columns in the base. base_regular_columns_in_view_pk.size():
+     *
+     * Zero new key columns:
+     *     The view row's key is composed only from base key columns, and those
+     *     cannot be changed in an update, so the view row remains alive as
+     *     long as the base row is alive. We need to return the same row
+     *     marker as the base for the view - to keep an empty view row alive
+     *     for as long as an empty base row exists.
+     *     Note that in this case, if there are *unselected* base columns, we
+     *     may need to keep an empty view row alive even without a row marker
+     *     because the base row (which has additional columns) is still alive.
+     *     For that we have the "virtual columns" feature: In the zero new
+     *     key columns case, we put unselected columns in the view as empty
+     *     columns, to keep the view row alive.
+     *
+     * One new key column:
+     *     In this case, there is a regular base column that is part of the
+     *     view key. This regular column can be added or deleted in an update,
+     *     or its expiration be set, and those can cause the view row -
+     *     including its row marker - to need to appear or disappear as well.
+     *     So the liveness of the cell of this one column determines the liveness
+     *     of the view row and the row marker that we return.
+     *
+     * Two or more new key columns:
+     *     This case is explicitly NOT supported in CQL - one cannot create a
+     *     view with more than one base-regular column in its key. In general
+     *     picking one liveness (timestamp and expiration) is not possible
+     *     if there are multiple regular base columns in the view key, as
+     *     those can have different liveness.
+     *     However, we do allow this case for Alternator - we need to allow
+     *     the case of two (but not more) because the DynamoDB API allows
+     *     creating a GSI whose two key columns (hash and range key) were
+     *     regular columns.
+     *     We can support this case in Alternator because it doesn't use
+     *     expiration (the "TTL" it does support is different), and doesn't
+     *     support user-defined timestamps. But, the two columns can still
+     *     have different timestamps - this happens if an update modifies
+     *     just one of them. In this case the timestamp of the view update
+     *     (and that of the row marker we return) is the later of these two
+     *     updated columns.
     */
-
-    // WARNING: The code assumes that if multiple regular base columns are present in the view key,
-    // they share liveness information. It's true especially in the only case currently allowed by CQL,
-    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
-    // which does not carry TTL information.
    const auto& col_ids = base_row.is_clustering_row()
            ? _base_info->base_regular_columns_in_view_pk()
            : _base_info->base_static_columns_in_view_pk();
@@ -525,7 +544,20 @@ row_marker view_updates::compute_row_marker(const clustering_or_static_row& base
        auto& def = _base->column_at(base_row.column_kind(), col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
        auto cell = base_row.cells().cell_at(col_ids[0]).as_atomic_cell(def);
-        return cell.is_live_and_has_ttl() ? row_marker(cell.timestamp(), cell.ttl(), cell.expiry()) : row_marker(cell.timestamp());
+        auto ts = cell.timestamp();
+        if (col_ids.size() > 1){
+            // As explained above, this case only happens in Alternator,
+            // and we may need to pick a higher ts:
+            auto& second_def = _base->column_at(base_row.column_kind(), col_ids[1]);
+            auto second_cell = base_row.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+            auto second_ts = second_cell.timestamp();
+            ts = std::max(ts, second_ts);
+            // Alternator isn't supposed to have TTL or more than two col_ids!
+            if (col_ids.size() != 2 || cell.is_live_and_has_ttl() || second_cell.is_live_and_has_ttl()) [[unlikely]] {
+                utils::on_internal_error(format("Unexpected col_ids length {} or has TTL", col_ids.size()));
+            }
+        }
+        return cell.is_live_and_has_ttl() ? row_marker(ts, cell.ttl(), cell.expiry()) : row_marker(ts);
    }

    return base_row.marker();
@@ -923,8 +955,22 @@ void view_updates::do_delete_old_entry(const partition_key& base_key, const clus
            // Note: multi-cell columns can't be part of the primary key.
            auto& def = _base->column_at(kind, col_ids[0]);
            auto cell = existing.cells().cell_at(col_ids[0]).as_atomic_cell(def);
+            auto ts = cell.timestamp();
+            if (col_ids.size() > 1) {
+                // This is the Alternator-only support for two regular base
+                // columns that become view key columns. See explanation in
+                // view_updates::compute_row_marker().
+                auto& second_def = _base->column_at(kind, col_ids[1]);
+                auto second_cell = existing.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
+                auto second_ts = second_cell.timestamp();
+                ts = std::max(ts, second_ts);
+                // Alternator isn't supposed to have more than two col_ids!
+                if (col_ids.size() != 2) [[unlikely]] {
+                    utils::on_internal_error(format("Unexpected col_ids length {}", col_ids.size()));
+                }
+            }
            if (cell.is_live()) {
-                r->apply(shadowable_tombstone(cell.timestamp(), now));
+                r->apply(shadowable_tombstone(ts, now));
            }
        } else {
            // "update" caused the base row to have been deleted, and !col_id
@@ -1308,11 +1354,12 @@ void view_update_builder::generate_update(static_row&& update, const tombstone&

 future<stop_iteration> view_update_builder::on_results() {
    constexpr size_t max_rows_for_view_updates = 100;
-    size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
-        return acc + vu.op_count();
-    });
-    const bool stop_updates = rows_for_view_updates >= max_rows_for_view_updates;
-
+    auto should_stop_updates = [this] () -> bool {
+        size_t rows_for_view_updates = std::accumulate(_view_updates.begin(), _view_updates.end(), 0, [] (size_t acc, const view_updates& vu) {
+            return acc + vu.op_count();
+        });
+        return rows_for_view_updates >= max_rows_for_view_updates;
+    };
    if (_update && !_update->is_end_of_partition() && _existing && !_existing->is_end_of_partition()) {
        auto cmp = position_in_partition::tri_compare(*_schema)(_update->position(), _existing->position());
        if (cmp < 0) {
@@ -1335,7 +1382,7 @@ future<stop_iteration> view_update_builder::on_results() {
                              : std::nullopt;
                generate_update(std::move(update), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
            }
-            return stop_updates ? stop() : advance_updates();
+            return should_stop_updates() ? stop() : advance_updates();
        }
        if (cmp > 0) {
            // We have something existing but no update (which will happen either because it's a range tombstone marker in
@@ -1371,7 +1418,7 @@ future<stop_iteration> view_update_builder::on_results() {
                    generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
                }
            }
-            return stop_updates ? stop () : advance_existings();
+            return should_stop_updates() ? stop () : advance_existings();
        }
        // We're updating a row that had pre-existing data
        if (_update->is_range_tombstone_change()) {
@@ -1393,8 +1440,9 @@ future<stop_iteration> view_update_builder::on_results() {
                                                  mutation_fragment_v2::printer(*_schema, *_update), mutation_fragment_v2::printer(*_schema, *_existing)));
            }
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, { std::move(*_existing).as_static_row() }, _existing_partition_tombstone);
+
        }
-        return stop_updates ? stop() : advance_all();
+        return should_stop_updates() ? stop() : advance_all();
    }

    auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
@@ -1409,7 +1457,7 @@ future<stop_iteration> view_update_builder::on_results() {
            auto update = static_row();
            generate_update(std::move(update), _update_partition_tombstone, { std::move(existing) }, _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_existings();
+        return should_stop_updates() ? stop() : advance_existings();
    }

    // If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
@@ -1430,7 +1478,7 @@ future<stop_iteration> view_update_builder::on_results() {
                          : std::nullopt;
            generate_update(std::move(*_update).as_static_row(), _update_partition_tombstone, std::move(existing), _existing_partition_tombstone);
        }
-        return stop_updates ? stop() : advance_updates();
+        return should_stop_updates() ? stop() : advance_updates();
    }

    return stop();
@@ -1609,6 +1657,13 @@ static bool should_update_synchronously(const schema& s) {
    return *tag_opt == "true";
 }

+size_t memory_usage_of(const frozen_mutation_and_schema& mut) {
+    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy, as well as possible background tasks
+    // allocated for a remote view update.
+    constexpr size_t base_overhead_bytes = 2288;
+    return base_overhead_bytes + mut.fm.representation().size();
+}
+
 // Take the view mutations generated by generate_view_updates(), which pertain
 // to a modification of a single base partition, and apply them to the
 // appropriate paired replicas. This is done asynchronously - we do not wait
@@ -1630,7 +1685,7 @@ future<> mutate_MV(
        auto& keyspace_name = mut.s->ks_name();
        auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto remote_endpoints = service::get_local_storage_proxy().get_token_metadata_ptr()->pending_endpoints_for(view_token, keyspace_name);
-        auto sem_units = pending_view_updates.split(mut.fm.representation().size());
+        auto sem_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_updates.split(memory_usage_of(mut)));

        const bool update_synchronously = should_update_synchronously(*mut.s);
        if (update_synchronously) {
@@ -1676,9 +1731,9 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_mv_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
-                            units = sem_units.split(sem_units.count())] (future<>&& f) {
+                            sem_units] (future<>&& f) {
                --stats.writes;
                if (f.failed()) {
                    ++stats.view_updates_failed_local;
@@ -1715,7 +1770,7 @@ future<> mutate_MV(
            schema_ptr s = mut.s;
            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
                    [s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
-                            units = sem_units.split(sem_units.count()), apply_update_synchronously] (future<>&& f) mutable {
+                            sem_units, apply_update_synchronously] (future<>&& f) mutable {
                if (f.failed()) {
                    stats.view_updates_failed_remote += updates_pushed_remote;
                    cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
@@ -2230,7 +2285,7 @@ future<> view_builder::do_build_step() {
            }
        }
    }).handle_exception([] (std::exception_ptr ex) {
-        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", std::current_exception());
+        vlogger.warn("Unexcepted error executing build step: {}. Ignored.", ex);
    });
 }

--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -213,7 +213,7 @@ class view_updates final {
    schema_ptr _base;
    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
-    mutable size_t _op_count = 0;
+    size_t _op_count = 0;
 public:
    explicit view_updates(view_and_base vab)
            : _view(std::move(vab.view))
@@ -327,6 +327,8 @@ future<> mutate_MV(
        service::allow_hints allow_hints,
        wait_for_all_updates wait_for_all);

+size_t memory_usage_of(const frozen_mutation_and_schema& mut);
+
 /**
 * create_virtual_column() adds a "virtual column" to a schema builder.
 * The definition of a "virtual column" is based on the given definition
--- a/direct_failure_detector/failure_detector.cc
+++ b/direct_failure_detector/failure_detector.cc
@@ -96,6 +96,7 @@ struct failure_detector::impl {
    clock& _clock;

    clock::interval_t _ping_period;
+    clock::interval_t _ping_timeout;

    // Number of workers on each shard.
    // We use this to decide where to create new workers (we pick a shard with the smallest number of workers).
@@ -138,7 +139,7 @@ struct failure_detector::impl {
    // The unregistering process requires cross-shard operations which we perform on this fiber.
    future<> _destroy_subscriptions = make_ready_future<>();

-    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period);
+    impl(failure_detector& parent, pinger&, clock&, clock::interval_t ping_period, clock::interval_t ping_timeout);
    ~impl();

    // Inform update_endpoint_fiber() about an added/removed endpoint.
@@ -174,12 +175,14 @@ struct failure_detector::impl {
    future<> mark(listener* l, pinger::endpoint_id ep, bool alive);
 };

-failure_detector::failure_detector(pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period))
+failure_detector::failure_detector(
+    pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _impl(std::make_unique<impl>(*this, pinger, clock, ping_period, ping_timeout))
 {}

-failure_detector::impl::impl(failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period)
-        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period) {
+failure_detector::impl::impl(
+    failure_detector& parent, pinger& pinger, clock& clock, clock::interval_t ping_period, clock::interval_t ping_timeout)
+        : _parent(parent), _pinger(pinger), _clock(clock), _ping_period(ping_period), _ping_timeout(ping_timeout) {
    if (this_shard_id() != 0) {
        return;
    }
@@ -536,11 +539,9 @@ future<> endpoint_worker::ping_fiber() noexcept {
        auto start = clock.now();
        auto next_ping_start = start + _fd._ping_period;

-        // A ping should take significantly less time than _ping_period, but we give it a multiple of ping_period before it times out
-        // just in case of transient network partitions.
-        // However, if there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
+        auto timeout = start + _fd._ping_timeout;
+        // If there's a listener that's going to timeout soon (before the ping returns), we abort the ping in order to handle
        // the listener (mark it as dead).
-        auto timeout = start + 3 * _fd._ping_period;
        for (auto& [threshold, l]: _fd._listeners_liveness) {
            if (l.endpoint_liveness[_id].alive && last_response + threshold < timeout) {
                timeout = last_response + threshold;
--- a/direct_failure_detector/failure_detector.hh
+++ b/direct_failure_detector/failure_detector.hh
@@ -120,14 +120,14 @@ public:

        // Every endpoint in the detected set will be periodically pinged every `ping_period`,
        // assuming that the pings return in a timely manner. A ping may take longer than `ping_period`
-        // before it's aborted (up to a certain multiple of `ping_period`), in which case the next ping
-        // will start immediately.
-        //
-        // `ping_period` should be chosen so that during normal operation, a ping takes significantly
-        // less time than `ping_period` (preferably at least an order of magnitude less).
+        // before it's aborted (up to `ping_timeout`), in which case the next ping will start immediately.
        //
        // The passed-in value must be the same on every shard.
-        clock::interval_t ping_period
+        clock::interval_t ping_period,
+
+        // Duration after which a ping is aborted, so that the next ping can be started
+        // (pings are sent sequentially).
+        clock::interval_t ping_timeout
    );

    ~failure_detector();
@@ -147,7 +147,7 @@ public:
    // The listener stops being called when the returned subscription is destroyed.
    // The subscription must be destroyed before service is stopped.
    //
-    // `threshold` should be significantly larger than `ping_period`, preferably at least an order of magnitude larger.
+    // `threshold` should be significantly larger than `ping_timeout`, preferably at least an order of magnitude larger.
    //
    // Different listeners may use different thresholds, depending on the use case:
    // some listeners may want to mark endpoints as dead more aggressively if fast reaction times are important
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -62,8 +62,7 @@ ExternalSizeMax=1024G
 [Unit]
 Description=Save coredump to scylla data directory
 Conflicts=umount.target
-Before=scylla-server.service
-After=local-fs.target
+Before=local-fs.target scylla-server.service
 DefaultDependencies=no

 [Mount]
@@ -73,7 +72,7 @@ Type=none
 Options=bind

 [Install]
-WantedBy=multi-user.target
+WantedBy=local-fs.target
 '''[1:-1]
            with open('/etc/systemd/system/var-lib-systemd-coredump.mount', 'w') as f:
                f.write(dot_mount)
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -9,15 +9,90 @@

 import os
 import argparse
+import distutils.util
 import pwd
 import grp
 import sys
 import stat
 import distro
+import logging
+import pyudev
 from pathlib import Path
 from scylla_util import *
 from subprocess import run, SubprocessError

+LOGGER = logging.getLogger(__name__)
+
+class UdevInfo:
+    def __init__(self, device_file):
+        self.context = pyudev.Context()
+        self.device = pyudev.Devices.from_device_file(self.context, device_file)
+
+    def verify(self):
+        if not self.id_fs_uuid:
+            LOGGER.error('ID_FS_UUID does not found')
+        if self.id_fs_type != 'xfs':
+            LOGGER.error('ID_FS_TYPE is not "xfs"')
+        if self.id_fs_usage != 'filesystem':
+            LOGGER.error('ID_FS_USAGE is not "filesystem"')
+
+    def dump_variables(self):
+        LOGGER.error(f'    sys_path: {self.device.sys_path}')
+        LOGGER.error(f'    sys_name: {self.device.sys_name}')
+        LOGGER.error(f'    sys_number: {self.device.sys_number}')
+        LOGGER.error(f'    device_path: {self.device.device_path}')
+        LOGGER.error(f'    tags: {list(self.device.tags)}')
+        LOGGER.error(f'    subsystem: {self.device.subsystem}')
+        LOGGER.error(f'    driver: {self.device.driver}')
+        LOGGER.error(f'    device_type: {self.device.device_type}')
+        LOGGER.error(f'    device_node: {self.device.device_node}')
+        LOGGER.error(f'    device_number: {self.device.device_number}')
+        LOGGER.error(f'    device_links: {list(self.device.device_links)}')
+        LOGGER.error(f'    is_initialized: {self.device.is_initialized}')
+        LOGGER.error(f'    time_since_initialized: {self.device.time_since_initialized}')
+        for k, v in self.device.properties.items():
+            LOGGER.error(f'    {k}: {v}')
+
+    @property
+    def id_fs_uuid(self):
+        return self.device.properties.get('ID_FS_UUID')
+
+    @property
+    def id_fs_type(self):
+        return self.device.properties.get('ID_FS_TYPE')
+
+    @property
+    def id_fs_usage(self):
+        return self.device.properties.get('ID_FS_USAGE')
+
+    @property
+    def uuid_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-uuid/'):
+                return l
+
+    @property
+    def label_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-label/'):
+                return l
+
+    @property
+    def partuuid_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-partuuid/'):
+                return l
+
+    @property
+    def path_link(self):
+        for l in self.device.device_links:
+            if l.startswith('/dev/disk/by-path/'):
+                return l
+
+    @property
+    def id_links(self):
+        return [l for l in self.device.device_links if l.startswith('/dev/disk/by-id')]
+
 if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
@@ -37,11 +112,14 @@ if __name__ == '__main__':
                        help='force constructing RAID when only one disk is specified')
    parser.add_argument('--raid-level', default='0',
                        help='specify RAID level')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default="True",
                        help='Enable XFS online discard (trim SSD cells after file deletion)')

    args = parser.parse_args()

+    # Allow args.online_discard to be used as a boolean value
+    args.online_discard = distutils.util.strtobool(args.online_discard)
+
    root = args.root.rstrip('/')
    if args.volume_role == 'all':
        mount_at=root
@@ -157,35 +235,51 @@ if __name__ == '__main__':

    os.makedirs(mount_at, exist_ok=True)

-    uuid = out(f'blkid -s UUID -o value {fsdev}')
-    if not uuid:
-        raise Exception(f'Failed to get UUID of {fsdev}')
+    udev_info = UdevInfo(fsdev)
+    mount_dev = None
+    if udev_info.uuid_link:
+        mount_dev = udev_info.uuid_link
+    else:
+        if udev_info.label_link:
+            mount_dev = udev_info.label_link
+            dev_type = 'label'
+        elif udev_info.partuuid_link:
+            mount_dev = udev_info.partuuid_link
+            dev_type = 'partuuid'
+        elif udev_info.path_link:
+            mount_dev = udev_info.path_link
+            dev_type = 'path'
+        elif udev_info.id_links:
+            mount_dev = udev_info.id_links[0]
+            dev_type = 'id'
+        else:
+            mount_dev = fsdev
+            dev_type = 'realpath'
+        LOGGER.error(f'Failed to detect uuid, using {dev_type}: {mount_dev}')

-    uuidpath = f'/dev/disk/by-uuid/{uuid}'
-
-    after = 'local-fs.target'
+    after = ''
    wants = ''
    if raid and args.raid_level != '0':
-        after += f' {md_service}'
-        wants = f'\nWants={md_service}'
+        after = wants = md_service  # use the unit name held in md_service, not the literal text 'md_service'
    opt_discard = ''
    if args.online_discard:
        opt_discard = ',discard'
    unit_data = f'''
 [Unit]
 Description=Scylla data directory
-Before=scylla-server.service
-After={after}{wants}
+Before=local-fs.target scylla-server.service
+After={after}
+Wants={wants}
 DefaultDependencies=no

 [Mount]
-What={uuidpath}
+What={mount_dev}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}

 [Install]
-WantedBy=multi-user.target
+WantedBy=local-fs.target
 '''[1:-1]
    with open(f'/etc/systemd/system/{mntunit_bn}', 'w') as f:
        f.write(unit_data)
@@ -205,10 +299,18 @@ WantedBy=multi-user.target
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError as e:
-        if not os.path.exists(uuidpath):
-            print(f'\nERROR: {uuidpath} is not found\n')
-        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
-            print(f'\nERROR: {uuidpath} is not block device\n')
+        if mount_dev != fsdev:
+            if not os.path.islink(mount_dev):
+                LOGGER.error(f'{mount_dev} is not found')
+            elif not os.path.exists(mount_dev):  # link is present but its target is missing
+                LOGGER.error(f'{mount_dev} is broken link')
+        if not os.path.exists(fsdev):
+            LOGGER.error(f'{fsdev} is not found')
+        elif not stat.S_ISBLK(os.stat(fsdev).st_mode):  # only stat a path that exists, so the original error is not masked
+            LOGGER.error(f'{fsdev} is not block device')
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
        raise e

    if args.enable_on_nextboot:
@@ -224,3 +326,8 @@ WantedBy=multi-user.target

    if is_debian_variant():
        run('update-initramfs -u', shell=True, check=True)
+
+    if not udev_info.uuid_link:
+        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
+        udev_info.verify()
+        udev_info.dump_variables()
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -63,7 +63,6 @@ bcp "${packages[@]}" packages/

 bcp dist/docker/etc etc/
 bcp dist/docker/scylla-housekeeping-service.sh /scylla-housekeeping-service.sh
-bcp dist/docker/sshd-service.sh /sshd-service.sh

 bcp dist/docker/scyllasetup.py /scyllasetup.py
 bcp dist/docker/commandlineparser.py /commandlineparser.py
@@ -73,10 +72,11 @@ bcp dist/docker/scylla_bashrc /scylla_bashrc

 run apt-get -y clean expire-cache
 run apt-get -y update
+run apt-get -y upgrade
 run apt-get -y install dialog apt-utils
 run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections"
 run bash -ec "rm -rf /etc/rsyslog.conf"
-run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python2 python3 python3-yaml curl rsyslog sudo
+run apt-get -y install hostname supervisor openjdk-11-jre-headless python2 python3 python3-yaml curl rsyslog sudo
 run bash -ec "echo LANG=C.UTF-8 > /etc/default/locale"
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
--- a/dist/docker/etc/supervisord.conf.d/sshd-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/sshd-server.conf
@@ -1,6 +0,0 @@
-[program:sshd]
-command=/sshd-service.sh
-stdout_logfile=/dev/stdout
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
--- a/dist/docker/sshd-service.sh
+++ b/dist/docker/sshd-service.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-if [ ! -f /run/sshd ]; then
-  mkdir -p /run/sshd
-fi
-
-if [ ! -f /etc/ssh/ssh_host_ed25519_key ]; then
-    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N ''
-fi
-if [ ! -f /etc/ssh/ssh_host_rsa_key ]; then
-    ssh-keygen -t rsa -b 4096 -f /etc/ssh/ssh_host_rsa_key -N ''
-fi
-
-/usr/sbin/sshd -D
-
--- a/docs/architecture/ringarchitecture/index.rst
+++ b/docs/architecture/ringarchitecture/index.rst
@@ -70,9 +70,7 @@ You can use the ``nodetool`` command to describe different aspects of your nodes

 ``$ nodetool ring <keyspace>``

-Outputs all tokens of a node, and displays the token ring information_.  It produces output as follows for a single datacenter:
-
-.. _information: /operating-scylla/nodetool-commands/ring/
+Outputs all tokens of a node, and displays the :doc:`token ring information </operating-scylla/nodetool-commands/ring>`.  It produces output as follows for a single datacenter:

 .. code-block:: shell

--- a/docs/dev/cdc.md
+++ b/docs/dev/cdc.md
@@ -42,7 +42,7 @@ namespace cdc {
        uint8_t sharding_ignore_msb;
    };
    class topology_description {
-        std::vector<token_range_description> _entries;
+        utils::chunked_vector<token_range_description> _entries;
 public:
        ... methods ...
    };
@@ -158,9 +158,27 @@ We're not able to prevent a node learning about a new generation too late due to
 However, it could happen that a node learns about the generation from gossip in time, but then won't be able to extract it from `cdc_generation_descriptions_v2`. In that case we can still maintain consistency: the node will remember that there is a new generation even though it doesn't yet know what it is (it knows only the ID, in particular it knows the timestamp) using the `cdc::metadata::prepare(db_clock::time_point)` method, and then _reject_ writes for CDC-enabled tables that are supposed to use this new generation. The node will keep trying to read the generation's data in background until it succeeds or sees that it's not necessary anymore (e.g. because the generation was already superseded by a new generation).
 Thus we give up availability for safety. This likely won't happen if the administrator ensures that the cluster is not partitioned before bootstrapping a new node. This problem will also be mitigated with a future patch.

-Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps.
-Suppose that a write is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`). Then we only allow the write if its timestamp is in the interval [`T`, `C + generation_leeway`), where `generation_leeway` is a small time-inteval constant (e.g. 5 seconds).
-Reason: we cannot allow writes before `T`, because they belong to the old generation whose token ranges might no longer refine the current vnodes, so the corresponding log write would not necessarily be colocated with the base write. We also cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+#### Generation switching: accepting writes
+
+Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps. We allow:
+- writes to the current and next generations unless they are too far into the future,
+- writes to the previous generations unless they are too far into the past.
+
+##### Writes to the current and next generations
+
+Suppose that a write with timestamp `W` is requested and the write coordinator's local clock has time `C` and the generation operating at time `C` has timestamp `T` (`T <= C`) such that `T <= W`. Then we only allow the write if `W < C + generation_leeway`, where `generation_leeway` is a small time-interval constant (e.g. 5 seconds).
+
+We cannot allow writes too far "into the future" because we don't know what generation will be operating at that time (the node which will introduce this generation might not have joined yet). But, as mentioned before, we assume that we'll learn about the next generation in time. Again --- the need for this assumption will be gone in a future patch.
+
+##### Writes to the previous generations
+
+This time suppose that `T > W`. Then we only allow the write if `W > C - generation_leeway` and there was a generation operating at `W`.
+
+We allow writes to previous generations to improve user experience. If a client generates timestamps by itself and clocks are not perfectly synchronized, there may be short periods of time around the moment of switching generations when client's writes are rejected because they fall into one of the previous generations. Usually, this problem is easy to overcome by the client. It can simply repeat a write a few times, but using a higher timestamp. Unfortunately, if a table additionally uses LWT, the client cannot increase the timestamp because LWT makes timestamps permanent. Once Paxos commits an entry with a given timestamp, Scylla will keep trying to apply that entry until it succeeds, with the same timestamp. Applying the entry involves doing a CDC log table write. If it fails, we are stuck. Allowing writes to the previous generations is also a probabilistic fix for this bug.
+
+Note that writing only to the previous generation might not be enough. With the Raft-based topology and tablets, we can add multiple nodes almost instantly. Then, we can have multiple generations with almost identical timestamps.
+
+We allow writes only to the recent past to reduce the number of generations that must be stored in memory.

 ### Streams description tables

--- a/docs/operating-scylla/admin-tools/scylla-sstable.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst
@@ -17,7 +17,7 @@ This tool is similar to SStableDump_, with notable differences:
 * Expanded scope: this tool supports much more than dumping SStable data components (see `Supported Operations`_).
 * More flexible on how schema is obtained and where SStables are located: SStableDump_ only supports dumping SStables located in their native data directory. To dump an SStable, one has to clone the entire ScyllaDB data directory tree, including system table directories and even config files. ``scylla sstable`` can dump sstables from any path with multiple choices on how to obtain the schema, see Schema_.

-Currently, SStableDump_ works better on production systems as it automatically loads the schema from the system tables, unlike ``scylla sstable``, which has to be provided with the schema explicitly. On the other hand ``scylla sstable`` works better for off-line investigations, as it can be used with as little as just a schema definition file and a single sstable. In the future we plan on closing this gap -- adding support for automatic schema-loading for ``scylla sstable`` too -- and completely supplant SStableDump_ with ``scylla sstable``.
+``scylla sstable`` was developed to supplant SStableDump_ as a ScyllaDB-native tool, better tailored for the needs of ScyllaDB.

 .. _SStableDump: /operating-scylla/admin-tools/sstabledump

@@ -35,14 +35,33 @@ You can specify more than one SStable.

 Schema
 ------
+
 All operations need a schema to interpret the SStables with.
-Currently, there are two ways to obtain the schema:
+This tool tries to auto-detect the location of the ScyllaDB data directories and the name of the table the SStable belongs to.
+If the SStable is located in a ScyllaDB data directory, it works out-of-the-box, without any additional input from the user.
+If the SStable is located at an external path, you need to specify the names of the keyspace and table to which the SStable belongs. In addition, some hints as to where the ScyllaDB data directory is located may also be required.

+The schema can be obtained in the following ways:
+
+* Auto-detected - If the SStable is located in the table's directory within the ScyllaDB data directory.
+* ``--keyspace=KEYSPACE --table=TABLE`` - If the SStable is located at an external location, but the ScyllaDB data directory or the config file are located at the standard location. The tool also reads the ``SCYLLA_CONF`` and ``SCYLLA_HOME`` environment variables to try to locate the configuration file.
 * ``--schema-file FILENAME`` - Read the schema definition from a file.
-* ``--system-schema KEYSPACE.TABLE`` - Use the known definition of built-in tables (only works for system tables).
+* ``--system-schema --keyspace=KEYSPACE --table=TABLE`` - Use the known definition of built-in tables (only works for system tables).
+* ``--scylla-data-dir SCYLLA_DATA_DIR_PATH --keyspace=KEYSPACE --table=TABLE`` - Read the schema tables from the data directory at the provided location, needs the keyspace and table name to be provided with ``--keyspace`` and ``--table``.
+* ``--scylla-yaml-file SCYLLA_YAML_FILE_PATH --keyspace=KEYSPACE --table=TABLE`` - Read the schema tables from the data directory path obtained from the configuration, needs the keyspace and table name to be provided with ``--keyspace`` and ``--table``.

-By default, the tool uses the first method: ``--schema-file schema.cql``; i.e. it assumes there is a schema file named ``schema.cql`` in the working directory.
-If this fails, it will exit with an error.
+By default (no schema-related options are provided), the tool will try the following sequence:
+
+* Try to load schema from ``schema.cql``.
+* Try to deduce the ScyllaDB data directory path and table names from the SStable path.
+* Try to load the schema from the ScyllaDB directory located at the standard location (``/var/lib/scylla``). For this to succeed, the table name has to be provided via ``--keyspace`` and ``--table``.
+* Try to load the schema from the ScyllaDB directory path obtained from config at the standard location (``./conf/scylla.yaml``). ``SCYLLA_CONF`` and ``SCYLLA_HOME`` environment variables are also checked. For this to succeed, the table name has to be provided via ``--keyspace`` and ``--table``.
+
+The tool stops after the first successful attempt. If none of the above succeed, an error message will be printed.
+A user-provided schema in ``schema.cql`` (if present) always takes precedence over other methods. This is deliberate, so that you can manually override the schema to be used.
+
+schema.cql
+^^^^^^^^^^

 The schema file should contain all definitions needed to interpret data belonging to the table.

@@ -72,7 +91,7 @@ Note:
 * The schema file doesn't have to be called ``schema.cql``, this is just the default name. Any file name is supported (with any extension).

 Dropped columns
-^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~

 The examined sstable might have columns which were dropped from the schema definition. In this case providing the up-do-date schema will not be enough, the tool will fail when attempting to process a cell for the dropped column.
 Dropped columns can be provided to the tool in the form of insert statements into the ``system_schema.dropped_columns`` system table, in the schema definition file. Example:
@@ -282,34 +301,6 @@ The content is dumped in JSON, using the following schema:
        },
        "pos": Uint64
    }
-    )",
-                dump_index_operation},
-    /* dump-compression-info */
-        {"dump-compression-info",
-                "Dump content of sstable compression info(s)",
-    R"(
-    Dumps the content of the compression-info component. Contains compression
-    parameters and maps positions into the uncompressed data to that into compressed
-    data. Note that compression happens over chunks with configurable size, so to
-    get data at a position in the middle of a compressed chunk, the entire chunk has
-    to be decompressed.
-    For more information about the sstable components and the format itself, visit
-    https://docs.scylladb.com/architecture/sstable/.
-
-    The content is dumped in JSON, using the following schema:
-
-    $ROOT := { "$sstable_path": $SSTABLE, ... }
-
-    $SSTABLE := {
-        "name": String,
-        "options": {
-            "$option_name": String,
-            ...
-        },
-        "chunk_len": Uint,
-        "data_len": Uint64,
-        "offsets": [Uint64, ...]
-    }

 dump-compression-info
 ^^^^^^^^^^^^^^^^^^^^^
--- a/docs/operating-scylla/admin-tools/sstable2json.rst
+++ b/docs/operating-scylla/admin-tools/sstable2json.rst
@@ -4,9 +4,7 @@ SSTable2json

 This tool allows you to converts SSTable into a JSON format file.
 SSTable2json supported when using Scylla 2.x or lower version.
-In newer versions, the tool is named SSTabledump_.
-
-.. _SSTabledump: /operating-scylla/admin-tools/sstabledump
+In newer versions, the tool is named :doc:`SSTabledump </operating-scylla/admin-tools/sstabledump>`.

 .. note:: 

--- a/docs/operating-scylla/admin-tools/sstabledump.rst
+++ b/docs/operating-scylla/admin-tools/sstabledump.rst
@@ -3,11 +3,9 @@ SSTabledump

 This tool allows you to converts SSTable into a JSON format file.
 SSTabledump supported when using Scylla 3.0, Scylla Enterprise 2019.1, and newer versions.
-In older versions, the tool is named SSTable2json_.
-If you need more flexibility or want to dump more than just the data-component, see scylla-sstable_.
+In older versions, the tool is named :doc:`SSTable2json </operating-scylla/admin-tools/sstable2json>`.
+If you need more flexibility or want to dump more than just the data-component, see :doc:`scylla-sstable </operating-scylla/admin-tools/scylla-sstable>`.

-.. _SSTable2json: /operating-scylla/admin-tools/sstable2json
-.. _scylla-sstable: /operating-scylla/admin-tools/scylla-sstable

 Use the full path to the data file when executing the command.

--- a/docs/operating-scylla/admin.rst
+++ b/docs/operating-scylla/admin.rst
@@ -91,12 +91,17 @@ The :code:`scylla-server` file contains configuration related to starting up the

 .. include:: /operating-scylla/scylla-yaml.inc

+.. _admin-compression:
+
 Compression
 -----------

 In Scylla, you can configure compression at rest and compression in transit.
 For compression in transit, you can configure compression between nodes or between the client and the node.

+
+.. _admin-client-node-compression:
+
 Client - Node Compression
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/operating-scylla/nodetool-commands/snapshot.rst
+++ b/docs/operating-scylla/nodetool-commands/snapshot.rst
@@ -108,10 +108,7 @@ Each of the snapshots is a **hardlink** to to the SSTable directory.
 Additional Resources
 ^^^^^^^^^^^^^^^^^^^^

-* `Backup your data`_ 
-* `Scylla Snapshots`_
-
-.. _`Backup your data`: /operating-scylla/procedures/backup-restore/backup
-.. _`Scylla Snapshots`: /kb/snapshots
+* :doc:`Backup your data </operating-scylla/procedures/backup-restore/backup>`
+* :doc:`Scylla Snapshots </kb/snapshots>`

 .. include:: /rst_include/apache-copyrights.rst
--- a/docs/operating-scylla/procedures/backup-restore/backup.rst
+++ b/docs/operating-scylla/procedures/backup-restore/backup.rst
@@ -17,12 +17,8 @@ The backup includes two procedures. These are:
 Full Backup - Snapshots
 =======================

-Snapshots are taken using `nodetool snapshot`_. First, the command flushes the MemTables from memory to SSTables on disk, and afterward, it creates a hard link for each SSTable in each keyspace.
-With time, SSTables are compacted, but the hard link keeps a copy of each file. This takes up an increasing amount of disk space. It is important to clear space by `clean unnecessary snapshots`_.
-
-.. _`nodetool snapshot`: /operating-scylla/nodetool-commands/snapshot
-
-.. _`clean unnecessary snapshots`: /operating-scylla/procedures/backup-restore/delete_snapshot
+Snapshots are taken using :doc:`nodetool snapshot </operating-scylla/nodetool-commands/snapshot>`. First, the command flushes the MemTables from memory to SSTables on disk, and afterward, it creates a hard link for each SSTable in each keyspace.
+With time, SSTables are compacted, but the hard link keeps a copy of each file. This takes up an increasing amount of disk space. It is important to clear space by :doc:`clean unnecessary snapshots </operating-scylla/procedures/backup-restore/delete-snapshot>`.

 **Procedure**

@@ -77,8 +73,6 @@ Incremental Backup
 Additional Resources
 ====================

-* `Scylla Snapshots`_
+* :doc:`Scylla Snapshots </kb/snapshots>`


-.. _`Scylla Snapshots`: /kb/snapshots
-
--- a/docs/operating-scylla/procedures/cassandra-to-scylla-migration-process.rst
+++ b/docs/operating-scylla/procedures/cassandra-to-scylla-migration-process.rst
@@ -133,9 +133,7 @@ Procedure

 See the full code example `here <https://github.com/scylladb/scylla-code-samples/tree/master/dual_writes>`_

-3. On each Apache Cassandra node, take a snapshot for every keyspace using the `nodetool snapshot`_ command. This will flush all SSTables to disk and generate a ``snapshots`` folder with an epoch timestamp for each underlying table in that keyspace. 
-
-.. _`nodetool snapshot`: /operating-scylla/nodetool-commands/snapshot
+3. On each Apache Cassandra node, take a snapshot for every keyspace using the :doc:`nodetool snapshot </operating-scylla/nodetool-commands/snapshot>` command. This will flush all SSTables to disk and generate a ``snapshots`` folder with an epoch timestamp for each underlying table in that keyspace. 

   Folder path post snapshot: ``/var/lib/cassandra/data/keyspace/table-[uuid]/snapshots/[epoch_timestamp]/``

--- a/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
+++ b/docs/operating-scylla/procedures/cluster-management/_common/prereq.rst
@@ -7,10 +7,11 @@

 .. Note:: 

-   If ``authenticator`` is set to ``PasswordAuthenticator`` - increase the replication factor of the ``system_auth`` keyspace.
-
-   For example:
-
+   If ``authenticator`` is set to ``PasswordAuthenticator``, increase the replication factor of the ``system_auth`` keyspace.
+   For example: 
+   
   ``ALTER KEYSPACE system_auth WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : <new_replication_factor>};``
+   
+   Ensure you run repair after you alter the keyspace. See :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`.

   It is recommended to set ``system_auth`` replication factor to the number of nodes in each DC.
--- a/docs/operating-scylla/procedures/tips/best-practices-scylla-on-docker.rst
+++ b/docs/operating-scylla/procedures/tips/best-practices-scylla-on-docker.rst
@@ -306,9 +306,8 @@ First, download the file locally to the node:

  sudo docker exec -it some-scylla.2.0.1 curl -o file.csv https://<url>.com/<path>/<path>/<file>.csv

-Once you have the ``.csv`` downloaded, you can use the CQL ``COPY FROM`` command as explained here_ to load the data into ScyllaDB.
+Once you have the ``.csv`` downloaded, you can use the CQL ``COPY FROM`` command as explained :doc:`here </cql/cqlsh>` to load the data into ScyllaDB.

-.. _here: /getting-started/cqlsh/

 Such a copy command might look like this:

--- a/docs/operating-scylla/security/security-checklist.rst
+++ b/docs/operating-scylla/security/security-checklist.rst
@@ -31,7 +31,11 @@ Encryption on Transit, Client to Node and Node to Node
 Encryption on Transit protects your communication against a 3rd interception on the network connection.
 Configure Scylla to use TLS/SSL for all the connections. Use TLS/SSL to encrypt communication between Scylla nodes and client applications.

-See:
+.. only:: enterprise
+
+    Starting with version 2023.1.1, you can run ScyllaDB Enterprise on FIPS-enabled Ubuntu, 
+    which uses FIPS 140-2 certified libraries (such as OpenSSL, GnuTLS, and more) and Linux 
+    kernel in FIPS mode.

 * :doc:`Encryption Data in Transit Client to Node </operating-scylla/security/client-node-encryption>`

--- a/docs/troubleshooting/debugging-large-partition.rst
+++ b/docs/troubleshooting/debugging-large-partition.rst
@@ -21,7 +21,7 @@ Any of the following:

  .. code-block:: none

-     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: PK[/CK[/COL]] (SIZE bytes) to SSTABLE_NAME
+     WARN  2022-09-22 17:33:11,075 [shard 1]large_data - Writing large partition Some_KS/Some_table: [COL] (SIZE bytes) to SSTABLE_NAME

  In this case, refer to :ref:`Troubleshooting Large Partition Tables <large-partition-table-configure>` for more information.

--- a/docs/troubleshooting/error-messages/create-mv.rst
+++ b/docs/troubleshooting/error-messages/create-mv.rst
@@ -4,9 +4,8 @@ A Removed Node was not Removed Properly from the Seed Node List
 Phenonoma
 ^^^^^^^^^

-Failed to create `materialized view`_ after node was removed from the cluster. 
+Failed to create :doc:`materialized view </cql/mv>` after node was removed from the cluster. 

-.. _`materialized view`: /getting-started/mv/

 Error message:

@@ -27,9 +26,7 @@ How to Verify

 Scylla logs show the error message above.

-To verify that the node wasn't remove properly use the `nodetool gossipinfo`_ command
-
-.. _`nodetool gossipinfo`: /operating-scylla/nodetool-commands/gossipinfo/
+To verify that the node wasn't removed properly, use the :doc:`nodetool gossipinfo </operating-scylla/nodetool-commands/gossipinfo>` command

 For example:

--- a/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-rpm.rst
@@ -31,7 +31,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
@@ -58,9 +58,14 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
+
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
 .. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+   sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Stop ScyllaDB
 ---------------
@@ -122,29 +127,33 @@ Rollback Steps
 ==============
 Gracefully shutdown ScyllaDB
 -----------------------------
+
 .. code:: sh

   nodetool drain
- .. include:: /rst_include/scylla-commands-stop-index.rst
+   nodetool snapshot
+   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------
-#. Remove the old repo file.
+#. Restore the |SRC_VERSION| packages backed up during the upgrade.

    .. code:: sh

-       sudo rm -rf /etc/yum.repos.d/scylla.repo
+       sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+       sudo chown root:root /etc/yum.repos.d/scylla.repo
+       sudo chmod 644 /etc/yum.repos.d/scylla.repo

-#. Update the |SCYLLA_REPO|_  to |SRC_VERSION|.
 #. Install:

    .. code:: console

       sudo yum clean all
       sudo rm -rf /var/cache/yum
-       sudo yum remove scylla\\*tools-core
-       sudo yum downgrade scylla\\* -y
-       sudo yum install scylla
+       sudo yum downgrade scylla-\*cqlsh -y
+       sudo yum remove scylla-\*cqlsh -y
+       sudo yum downgrade scylla\* -y
+       sudo yum install scylla -y
     

 Restore the configuration file
@@ -153,18 +162,7 @@ Restore the configuration file
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for details.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ---------------------------------
@@ -182,4 +180,4 @@ Start the node

 Validate
 --------
-Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
+Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
--- a/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
+++ b/docs/upgrade/_common/upgrade-guide-v4-ubuntu-and-debian.rst
@@ -34,7 +34,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/index.html>`_ for suspending Scylla Manager (only available Scylla Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `Scylla Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `Scylla Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade steps
 =============
--- a/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
@@ -1,68 +0,0 @@
-Upgrade Guide - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| for |OS|
-======================================================================
-
-This document is a step-by-step procedure for upgrading from |SCYLLA_NAME| |FROM| to |SCYLLA_NAME| |TO|, and rollback to 2021.1 if required.
-
-
-Applicable Versions
------------------------
-This guide covers upgrading |SCYLLA_NAME| from version |FROM| to version |TO| on |OS|. See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about supported versions.
-
-
-Upgrade Procedure
----------------------------
-
-.. note::
-   Apply the following procedure **serially** on each node. Do not move to the next node before validating the node is up and running the new version.
-
-A ScyllaDB upgrade is a rolling procedure which does **not** require full cluster shutdown.
-For each of the nodes in the cluster, you will:
-
-* Drain node and backup the data.
-* Check your current release.
-* Backup configuration file.
-* Stop ScyllaDB.
-* Download and install new ScyllaDB packages.
-* Start ScyllaDB.
-* Validate that the upgrade was successful.
-
-
-**During** the rolling upgrade it is highly recommended:
-
-* Not to use new |TO| features.
-* Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes.
-* Not to apply schema changes.
-
-Upgrade steps
-------------------------------
-Drain node and backup the data
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Before any major procedure, like an upgrade, it is recommended to backup all the data to an external device. In ScyllaDB, backup is done using the ``nodetool snapshot`` command. For **each** node in the cluster, run the following command:
-
-.. code:: sh
-
-   nodetool drain
-   nodetool snapshot
-
-Take note of the directory name that nodetool gives you, and copy all the directories having this name under ``/var/lib/scylla`` to a backup device.
-
-When the upgrade is complete (all nodes), the snapshot should be removed by ``nodetool clearsnapshot -t <snapshot>``, or you risk running out of space.
-
-
-Backup configuration file
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-5.x.z
-
-Gracefully stop the node
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   sudo service scylla-server stop
-
-Download and install the new release
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |FROM| version, stop right here! This guide only covers |FROM| to |TO| upgrades.
--- a/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
@@ -1,84 +0,0 @@
-**To upgrade ScyllaDB:**
-
-#. Update the |APT|_ to |NEW_VERSION|.
-#. Install:
-
-    .. code:: sh
-
-       sudo apt-get update
-       sudo apt-get dist-upgrade scylla
-
-    Answer ‘y’ to the first two questions.
-
-Start the node
-^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   sudo service scylla-server start
-
-Validate
-^^^^^^^^^^^^^^^^
-#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in UN status.
-#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version.
-#. Check the scylla-server log (execute ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
-#. Check again after 2 minutes, to validate no new issues are introduced.
-
-Once you are sure the node upgrade is successful, move to the next node in the cluster.
-
-Rollback Procedure
-----------------------
-
-.. include:: /upgrade/_common/warning_rollback.rst
-
-The following procedure describes a rollback from ScyllaDB release |TO| to |FROM|. Apply this procedure if an upgrade from |FROM| to |TO| failed before completing on all nodes. Use this procedure only for nodes you upgraded to |TO|.
-
-ScyllaDB rollback is a rolling procedure which does **not** require full cluster shutdown.
-For each of the nodes rollback to |FROM|, you will:
-
-* Drain the node and stop ScyllaDB.
-* Downgrade to previous release.
-* Restore the configuration file.
-* Restart ScyllaDB.
-* Validate the rollback success.
-
-Apply the following procedure **serially** on each node. Do not move to the next node before validating the node is up and running with the new version.
-
-Rollback steps
------------------------
-Gracefully shutdown ScyllaDB
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   nodetool drain
-   sudo service scylla-server stop
-
-Downgrade to previous release
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Install:
-
-.. code:: sh
-
-   sudo apt-get install scylla=5.x.y\* scylla-server=5.x.y\* scylla-jmx=5.x.y\* scylla-tools=5.x.y\* scylla-tools-core=5.x.y\* scylla-kernel-conf=5.x.y\* scylla-conf=5.x.y\*
-
-Answer ‘y’ to the first two questions.
-
-Restore the configuration file
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-5.x.z /etc/scylla/scylla.yaml
-
-Start the node
-^^^^^^^^^^^^^^^^^^^
-
-.. code:: sh
-
-   sudo service scylla-server start
-
-Validate
-^^^^^^^^^^^^^^^^^^
-Check upgrade instruction above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.
--- a/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian.rst
@@ -1,2 +0,0 @@
-.. include:: /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
-.. include:: /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
--- a/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
@@ -32,7 +32,7 @@ Apply the following procedure **serially** on each node. Do not move to the next
 * Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
 * Not to apply schema changes

-.. note:: Before upgrading, make sure to use the latest `ScyllaDB Montioring <https://monitoring.docs.scylladb.com/>`_ stack.
+.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.

 Upgrade Steps
 =============
@@ -60,9 +60,13 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet
 Backup the configuration file
 ------------------------------

+Back up the ``scylla.yaml`` configuration file and the ScyllaDB package
+repository file in case you need to roll back the upgrade.
+
 .. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+   sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup

 Gracefully stop the node
 ------------------------
--- a/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
+++ b/docs/upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
@@ -44,7 +44,6 @@ For each of the nodes you rollback to |SRC_VERSION|, you will:
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -59,17 +58,19 @@ Gracefully shutdown ScyllaDB
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------
-#. Remove the old repo file.
+#. Restore the |SRC_VERSION| packages backed up during the upgrade.

    .. code:: sh

-       sudo rm -rf /etc/apt/sources.list.d/scylla.list
+       sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+       sudo chown root:root /etc/apt/sources.list.d/scylla.list
+       sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-#. Update the |SCYLLA_REPO|_ to |SRC_VERSION|.
 #. Install:

    .. code-block::
@@ -85,18 +86,7 @@ Restore the configuration file
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/_common/upgrade-image-opensource.rst
+++ b/docs/upgrade/_common/upgrade-image-opensource.rst
@@ -1,49 +0,0 @@
-There are two alternative upgrade procedures:
-
-* :ref:`Upgrading ScyllaDB and simultaneously updating 3rd party and OS packages <upgrade-image-recommended-procedure>`. It is recommended if you are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04.
-
-* :ref:`Upgrading ScyllaDB without updating any external packages <upgrade-image-upgrade-guide-regular-procedure>`.
-
-.. _upgrade-image-recommended-procedure:
-
-**To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-.. versionadded:: 5.0
-
-Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command. 
-
-#. Update the |SCYLLA_REPO|_ to |NEW_VERSION|.
-
-#. Load the new repo:
-
-    .. code:: sh 
-    
-       sudo apt-get update
-
-
-#. Run the following command to update the manifest file:
-    
-    .. code:: sh 
-    
-       cat scylla-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-    
-    Where:
-
-      * ``<version>`` - The ScyllaDB version to which you are upgrading ( |NEW_VERSION| ).
-      * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-    
-    The file is included in the ScyllaDB packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-    Example:
-    
-        .. code:: sh 
-           
-           cat scylla-packages-5.1.2-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-        .. note:: 
-
-           Alternatively, you can update the manifest file with the following command:
-
-           ``sudo apt-get install $(awk '{print $1'} scylla-packages-<version>-<arch>.txt) -y``
-
-.. _upgrade-image-upgrade-guide-regular-procedure:
--- a/docs/upgrade/ami-upgrade.rst
+++ b/docs/upgrade/ami-upgrade.rst
@@ -2,21 +2,14 @@
 Upgrade ScyllaDB Image: EC2 AMI, GCP, and Azure Images
 ======================================================

-Upgrading ScyllaDB images requires updating:
+ScyllaDB images are based on **Ubuntu 22.04**.

-* ScyllaDB packages.
-* Underlying OS packages. Starting with ScyllaDB 4.6, each ScyllaDB version includes a list of 3rd party and 
-  OS packages tested with the ScyllaDB release. The list depends on the base OS:
-  
-  * ScyllaDB Open Source **5.0 and 5.1** and ScyllaDB Enterprise **2021.1, 2022.1, and 2022.2** are based on **Ubuntu 20.04**.
-  * ScyllaDB Open Source **5.2** and ScyllaDB Enterprise **2023.1** are based on **Ubuntu 22.04**.
+If you’re using the ScyllaDB official image (recommended), follow the upgrade 
+instructions on the **Debian/Ubuntu** tab in the :doc:`upgrade guide </upgrade/index/>`
+for your ScyllaDB version.

-If you're running ScyllaDB Open Source 5.0 or later or ScyllaDB Enterprise 2021.1.10 or later, you can 
-automatically update 3rd party and OS packages together with the ScyllaDB packages - by running one command. 
-
-In earlier ScyllaDB versions, you have to first update the ScyllaDB packages and then update the OS packages 
-in the next step.
-
-See the relevant :doc:`upgrade guide <./index>` for detailed instructions for upgrading your ScyllaDB version.
+If you’re using your own image and have installed ScyllaDB packages for Ubuntu or Debian, 
+follow the extended upgrade procedure on the **EC2/GCP/Azure Ubuntu image** tab 
+in the :doc:`upgrade guide </upgrade/index/>` for your ScyllaDB version.

 To check your Scylla version, run the ``scylla --version`` command.
--- a/docs/upgrade/index.rst
+++ b/docs/upgrade/index.rst
@@ -6,9 +6,9 @@ Upgrade ScyllaDB
   :titlesonly:
   :hidden:

-   ScyllaDB Open Source <upgrade-opensource/index>
-   ScyllaDB Open Source to ScyllaDB Enterprise <upgrade-to-enterprise/index>
-   ScyllaDB AMI <ami-upgrade>
+   ScyllaDB Open Source Upgrade <upgrade-opensource/index>
+   ScyllaDB Open Source to ScyllaDB Enterprise Upgrade <upgrade-to-enterprise/index>
+   ScyllaDB Image <ami-upgrade>
   ScyllaDB Enterprise <https://enterprise.docs.scylladb.com/stable/upgrade/upgrade-enterprise/index.html>

 .. raw:: html
@@ -27,7 +27,7 @@ Procedures for upgrading Scylla.

 * :doc:`Upgrade from ScyllaDB Open Source to Scylla Enterprise <upgrade-to-enterprise/index>`

-* :doc:`Upgrade ScyllaDB AMI <ami-upgrade>`
+* :doc:`Upgrade ScyllaDB Image <ami-upgrade>`

 * `Upgrade ScyllaDB Enterprise <https://enterprise.docs.scylladb.com/stable/upgrade/upgrade-enterprise/index.html>`_

--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-4.6-to-5.0/upgrade-guide-from-4.6-to-5.0-image.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-4.6-to-5.0/upgrade-guide-from-4.6-to-5.0-image.rst
@@ -1,14 +1,14 @@
-.. |OS| replace:: Ubuntu 20.04
-.. |ROLLBACK| replace:: rollback
-.. _ROLLBACK: ./#rollback-procedure
-.. |SRC_VERSION| replace:: 4.6
-.. |NEW_VERSION| replace:: 5.0
-.. |SCYLLA_NAME| replace:: Scylla Image (EC2, GCP, Azure)
-.. |PKG_NAME| replace:: scylla
-.. |SCYLLA_REPO| replace:: Scylla deb repo
-.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.0
-.. |SCYLLA_METRICS| replace:: Scylla Metrics Update - Scylla 4.6 to 5.0
-.. _SCYLLA_METRICS: ../metric-update-4.6-to-5.0
-.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
-.. include:: /upgrade/_common/upgrade-image-opensource.rst
-.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst
+======================================================================
+Upgrade Guide - ScyllaDB Image (EC2, GCP, Azure) 4.6 to 5.0
+======================================================================
+
+
+If you are running a ScyllaDB official image (for EC2 AMI, GCP, or Azure), you 
+need to:
+
+* Download and install the new ScyllaDB release for Ubuntu. See 
+  the :doc:`upgrade guide for Ubuntu 20.04 <upgrade-guide-from-4.6-to-5.0-ubuntu-20-04>` 
+  for instructions.
+* Update underlying OS packages.
+
+See :doc:`Upgrade ScyllaDB Image </upgrade/ami-upgrade>` for details.
--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/upgrade-guide-from-5.0-to-5.1-generic.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.0-to-5.1/upgrade-guide-from-5.0-to-5.1-generic.rst
@@ -90,9 +90,25 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
-.. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB package
+repository file in case you need to roll back the upgrade.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
+
+   .. group-tab:: RHEL/CentOS
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Gracefully stop the node
 ------------------------
@@ -139,72 +155,16 @@ Download and install the new release
               sudo yum clean all
               sudo yum update scylla\* -y

-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
+.. note::

-        Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+   If you are running a ScyllaDB official image (for EC2 AMI, GCP, or Azure), 
+   you need to:

-        There are two alternative upgrade procedures:
-
-        * :ref:`Upgrading ScyllaDB and simultaneously updating 3rd party and OS packages <upgrade-image-recommended-procedure>`. It is recommended if you are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04.
-
-        * :ref:`Upgrading ScyllaDB without updating any external packages <upgrade-image-upgrade-guide-regular-procedure>`.
-
-        .. _upgrade-image-recommended-procedure:
-
-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-packages-5.1.2-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-packages-<version>-<arch>.txt) -y``
-
-        .. _upgrade-image-upgrade-guide-regular-procedure:
-
-        **To upgrade ScyllaDB:**
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Install the new ScyllaDB version:
-
-            .. code-block:: console
-
-               sudo apt-get clean all
-               sudo apt-get update
-               sudo apt-get dist-upgrade scylla
-
-
-        Answer ‘y’ to the first two questions.
+    * Download and install the new ScyllaDB release for Ubuntu; see 
+      the Debian/Ubuntu tab above for instructions.
+    * Update underlying OS packages.
+ 
+   See :doc:`Upgrade ScyllaDB Image </upgrade/ami-upgrade>` for details.


 Start the node
@@ -246,7 +206,6 @@ For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -261,25 +220,24 @@ Drain and gracefully stop the node
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------

-..
-    TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
-
 .. tabs::

   .. group-tab:: Debian/Ubuntu

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
+               sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+               sudo chown root:root /etc/apt/sources.list.d/scylla.list
+               sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
        #. Install:

            .. code-block::
@@ -292,59 +250,31 @@ Download and install the old release

   .. group-tab:: RHEL/CentOS

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/yum.repos.d/scylla.repo
+               sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+               sudo chown root:root /etc/yum.repos.d/scylla.repo
+               sudo chmod 644 /etc/yum.repos.d/scylla.repo

-        #. Update the |SCYLLA_RPM_SRC_REPO|_  to |SRC_VERSION|.
        #. Install:

            .. code:: console

               sudo yum clean all
               sudo rm -rf /var/cache/yum
-               sudo yum remove scylla\\*tools-core
-               sudo yum downgrade scylla\\* -y
-               sudo yum install scylla
-
-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
-
-        #. Remove the old repo file.
-
-            .. code:: sh
-
-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
-
-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
-        #. Install:
-
-            .. code-block::
-
-               sudo apt-get update
-               sudo apt-get remove scylla\* -y
-               sudo apt-get install scylla
-
-        Answer ‘y’ to the first two questions.
+               sudo yum downgrade scylla-\*cqlsh -y
+               sudo yum remove scylla-\*cqlsh -y
+               sudo yum downgrade scylla\* -y
+               sudo yum install scylla -y

 Restore the configuration file
 ------------------------------
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic.rst
@@ -98,9 +98,25 @@ When the upgrade is completed on all nodes, remove the snapshot with the ``nodet

 Backup the configuration file
 ------------------------------
-.. code:: sh

-   sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB package
+repository file in case you need to roll back the upgrade.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
+
+   .. group-tab:: RHEL/CentOS
+
+      .. code:: sh
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

 Gracefully stop the node
 ------------------------
@@ -147,72 +163,16 @@ Download and install the new release
               sudo yum clean all
               sudo yum update scylla\* -y

-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
+.. note::

-        Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+   If you are running a ScyllaDB official image (for EC2 AMI, GCP, or Azure), 
+   you need to:
+   
+    * Download and install the new ScyllaDB release for Ubuntu; see 
+      the Debian/Ubuntu tab above for instructions.
+    * Update underlying OS packages.

-        There are two alternative upgrade procedures:
-
-        * :ref:`Upgrading ScyllaDB and simultaneously updating 3rd party and OS packages <upgrade-image-recommended-procedure>`. It is recommended if you are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04.
-
-        * :ref:`Upgrading ScyllaDB without updating any external packages <upgrade-image-upgrade-guide-regular-procedure>`.
-
-        .. _upgrade-image-recommended-procedure:
-
-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-packages-5.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-packages-<version>-<arch>.txt) -y``
-
-        .. _upgrade-image-upgrade-guide-regular-procedure:
-
-        **To upgrade ScyllaDB:**
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Install the new ScyllaDB version:
-
-            .. code-block:: console
-
-               sudo apt-get clean all
-               sudo apt-get update
-               sudo apt-get dist-upgrade scylla
-
-
-        Answer ‘y’ to the first two questions.
+   See :doc:`Upgrade ScyllaDB Image </upgrade/ami-upgrade>` for details.

 (Optional) Enable consistent cluster management in the node's configuration file
 --------------------------------------------------------------------------------
@@ -294,7 +254,6 @@ For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at
 * Drain the node and stop Scylla
 * Retrieve the old ScyllaDB packages
 * Restore the configuration file
-* Restore system tables
 * Reload systemd configuration
 * Restart ScyllaDB
 * Validate the rollback success
@@ -309,25 +268,24 @@ Drain and gracefully stop the node
 .. code:: sh

   nodetool drain
+   nodetool snapshot
   sudo service scylla-server stop

-Download and install the old release
+Restore and install the old release
 ------------------------------------

-..
-    TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
-
 .. tabs::

   .. group-tab:: Debian/Ubuntu

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
+               sudo cp ~/scylla.list-backup /etc/apt/sources.list.d/scylla.list
+               sudo chown root:root /etc/apt/sources.list.d/scylla.list
+               sudo chmod 644 /etc/apt/sources.list.d/scylla.list

-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
        #. Install:

            .. code-block::
@@ -340,59 +298,32 @@ Download and install the old release

   .. group-tab:: RHEL/CentOS

-        #. Remove the old repo file.
+        #. Restore the |SRC_VERSION| packages backed up during the upgrade.

            .. code:: sh

-               sudo rm -rf /etc/yum.repos.d/scylla.repo
+               sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo
+               sudo chown root:root /etc/yum.repos.d/scylla.repo
+               sudo chmod 644 /etc/yum.repos.d/scylla.repo

-        #. Update the |SCYLLA_RPM_SRC_REPO|_  to |SRC_VERSION|.
        #. Install:

            .. code:: console

               sudo yum clean all
               sudo rm -rf /var/cache/yum
-               sudo yum remove scylla\\*tools-core
-               sudo yum downgrade scylla\\* -y
-               sudo yum install scylla
+               sudo yum downgrade scylla-\*cqlsh -y
+               sudo yum remove scylla-\*cqlsh -y
+               sudo yum downgrade scylla\* -y
+               sudo yum install scylla -y

-   .. group-tab:: EC2/GCP/Azure Ubuntu Image
-
-        #. Remove the old repo file.
-
-            .. code:: sh
-
-               sudo rm -rf /etc/apt/sources.list.d/scylla.list
-
-        #. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
-        #. Install:
-
-            .. code-block::
-
-               sudo apt-get update
-               sudo apt-get remove scylla\* -y
-               sudo apt-get install scylla
-
-        Answer ‘y’ to the first two questions.

 Restore the configuration file
 ------------------------------
 .. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
-   sudo cp -a /etc/scylla/scylla.yaml.backup-src | /etc/scylla/scylla.yaml
-
-Restore system tables
---------------------
-
-Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
-
-.. code:: sh
-
-    cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
-    sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
-    sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
+   sudo cp /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

 Reload systemd configuration
 ----------------------------
--- a/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.x.y-to-5.x.z/upgrade-guide-from-5.x.y-to-5.x.z-image.rst
+++ b/docs/upgrade/upgrade-opensource/upgrade-guide-from-5.x.y-to-5.x.z/upgrade-guide-from-5.x.y-to-5.x.z-image.rst
@@ -1,16 +1,14 @@
-.. |OS| replace:: EC2, GCP, and Azure
-.. |ROLLBACK| replace:: rollback
-.. _ROLLBACK: ./#rollback-procedure
-.. |SRC_VERSION| replace:: 5.x.y
-.. |NEW_VERSION| replace:: 5.x.z
-.. |FROM| replace:: 5.x.y
-.. |TO| replace:: 5.x.z
-.. |SCYLLA_NAME| replace:: ScyllaDB Image
-.. |PKG_NAME| replace:: scylla
-.. |SCYLLA_REPO| replace:: ScyllaDB deb repo
-.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.0
-.. |APT| replace:: ScyllaDB deb repo
-.. _APT: http://www.scylladb.com/download/
-.. include:: /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p1.rst
-.. include:: /upgrade/_common/upgrade-image-opensource.rst
-.. include:: /upgrade/_common/upgrade-guide-v5-patch-ubuntu-and-debian-p2.rst
+======================================================================
+Upgrade Guide - ScyllaDB Image (EC2, GCP, Azure) 5.x.y to 5.x.z
+======================================================================
+
+
+If you are running a ScyllaDB official image (for EC2 AMI, GCP, or Azure), you 
+need to:
+
+* Download and install the new ScyllaDB release for Ubuntu. See 
+  the :doc:`upgrade guide for Ubuntu <upgrade-guide-from-5.x.y-to-5.x.z-ubuntu>` 
+  for instructions.
+* Update underlying OS packages.
+
+See :doc:`Upgrade ScyllaDB Image </upgrade/ami-upgrade>` for details.
--- a/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
+++ b/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
@@ -167,54 +167,27 @@ Download and install the new release

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

-        Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+      Before upgrading, check what version you are running now using ``scylla --version``. Make a note of this version, because you will need to reinstall the same version if you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.

-        There are two alternative upgrade procedures: upgrading ScyllaDB and simultaneously updating 3rd party and OS packages - recommended if you 
-        are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04, and upgrading ScyllaDB without updating 
-        any external packages.
+      If you’re using the ScyllaDB official image (recommended), see
+      the **Debian/Ubuntu** tab for upgrade instructions. If you’re using your
+      own image and have installed ScyllaDB packages for Ubuntu or Debian,
+      you need to apply an extended upgrade procedure:
+      
+      #. Update the ScyllaDB deb repo (see above).
+      #. Configure Java 1.8 (see above).
+      #. Install the new ScyllaDB version with the additional 
+         ``scylla-enterprise-machine-image`` package:

-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-enterprise-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB Enterprise version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB Enterprise packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-enterprise-packages-2022.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-enterprise-packages-<version>-<arch>.txt) -y``
-
-
-
-        To upgrade ScyllaDB without updating any external packages, follow the :ref:`download and installation instructions for Debian/Ubuntu <upgrade-debian-ubuntu-5.2-to-enterprise-2023.1>`.
+          .. code::
+         
+           sudo apt-get clean all
+           sudo apt-get update
+           sudo apt-get dist-upgrade scylla-enterprise
+           sudo apt-get dist-upgrade scylla-enterprise-machine-image

+      #. Run ``scylla_setup`` without running ``io_setup``.
+      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

 Start the node
 --------------
--- a/docs/using-scylla/cassandra-compatibility.rst
+++ b/docs/using-scylla/cassandra-compatibility.rst
@@ -101,32 +101,32 @@ Consistency Level (read and write)
 | LOCAL_SERIAL                        | |v|:sup:`*`  |
 +-------------------------------------+--------------+

-:sup:`*` From ScyllaDB 4.0. See `Scylla LWT`_
+:sup:`*` From ScyllaDB 4.0. See :doc:`Scylla LWT </using-scylla/lwt>`.


 Snitches
 ^^^^^^^^
-+-------------------------------------+--------+
-|   Options                           | Support|
-+=====================================+========+
-| SimpleSnitch_                       |   |v|  |
-+-------------------------------------+--------+
-| RackInferringSnitch_                |   |v|  |
-+-------------------------------------+--------+
-| PropertyFileSnitch                  |   |x|  |
-+-------------------------------------+--------+
-| GossipingPropertyFileSnitch_        |   |v|  |
-+-------------------------------------+--------+
-| Dynamic snitching                   |   |x|  |
-+-------------------------------------+--------+
-| EC2Snitch_                          |   |v|  |
-+-------------------------------------+--------+
-| EC2MultiRegionSnitch_               |   |v|  |
-+-------------------------------------+--------+
-| GoogleCloudSnitch_                  |   |v|  |
-+-------------------------------------+--------+
-| CloudstackSnitch                    |   |x|  |
-+-------------------------------------+--------+
+-----------------------------------------------------------------------------+--------+
+|   Options                                                                   | Support|
+=============================================================================+========+
+|:ref:`SimpleSnitch <snitch-simple-snitch>`                                   |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| :ref:`RackInferringSnitch <snitch-rack-inferring-snitch>`                   |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| PropertyFileSnitch                                                          |   |x|  |
+-----------------------------------------------------------------------------+--------+
+| :ref:`GossipingPropertyFileSnitch <snitch-gossiping-property-file-snitch>`  |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| Dynamic snitching                                                           |   |x|  |
+-----------------------------------------------------------------------------+--------+
+| :ref:`EC2Snitch <snitch-ec2-snitch>`                                        |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| :ref:`EC2MultiRegionSnitch <snitch-ec2-multi-region-snitch>`                |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| :ref:`GoogleCloudSnitch <GoogleCloudSnitch>`                                |   |v|  |
+-----------------------------------------------------------------------------+--------+
+| CloudstackSnitch                                                            |   |x|  |
+-----------------------------------------------------------------------------+--------+

 Partitioners
 ^^^^^^^^^^^^
@@ -148,61 +148,61 @@ Partitioners

 Protocol Options
 ^^^^^^^^^^^^^^^^
-+-------------------------------------+--------------+
-|   Options                           | Support      |
-+=====================================+==============+
-| Encryption_                         |   |v|        |
-+-------------------------------------+--------------+
-| Authentication_                     |   |v|        |
-+-------------------------------------+--------------+
-| Compression_  (see below)           |   |v|        |
-+-------------------------------------+--------------+
+--------------------------------------------------------------------------+--------------+
+|   Options                                                                | Support      |
+==========================================================================+==============+
+| :doc:`Encryption </operating-scylla/security/client-node-encryption>`    |   |v|        |
+--------------------------------------------------------------------------+--------------+
+| :doc:`Authentication </operating-scylla/security/authentication>`        |   |v|        |
+--------------------------------------------------------------------------+--------------+
+| :ref:`Compression <admin-compression>`  (see below)                      |   |v|        |
+--------------------------------------------------------------------------+--------------+


 Compression
 ^^^^^^^^^^^
-+-------------------------------------+--------------+
-|   Options                           | Support      |
-+=====================================+==============+
-|CQL Compression                      |   |v|        |
-+-------------------------------------+--------------+
-| LZ4                                 |   |v|        |
-+-------------------------------------+--------------+
-| Snappy                              |   |v|        |
-+-------------------------------------+--------------+
-| `Node to Node Compression`_         |   |v|        |
-+-------------------------------------+--------------+
-| `Client to Node Compression`_       |   |v|        |
-+-------------------------------------+--------------+
+-------------------------------------------------------------------+--------------+
+|   Options                                                         | Support      |
+===================================================================+==============+
+|CQL Compression                                                    |   |v|        |
+-------------------------------------------------------------------+--------------+
+| LZ4                                                               |   |v|        |
+-------------------------------------------------------------------+--------------+
+| Snappy                                                            |   |v|        |
+-------------------------------------------------------------------+--------------+
+| :ref:`Node to Node Compression <internode-compression>`           |   |v|        |
+-------------------------------------------------------------------+--------------+
+| :ref:`Client to Node Compression <admin-client-node-compression>` |   |v|        |
+-------------------------------------------------------------------+--------------+

 Backup and Restore
 ^^^^^^^^^^^^^^^^^^
-+-------------------------------------+--------------+
-|   Options                           | Support      |
-+=====================================+==============+
-| Snapshot_                           |   |v|        |
-+-------------------------------------+--------------+
-| `Incremental backup`_               |   |v|        |
-+-------------------------------------+--------------+
-| Restore_                            |   |v|        |
-+-------------------------------------+--------------+
+-----------------------------------------------------------------------+--------------+
+|   Options                                                             | Support      |
+=======================================================================+==============+
+| :ref:`Snapshot <backup-full-backup-snapshots>`                        |   |v|        |
+-----------------------------------------------------------------------+--------------+
+| :ref:`Incremental backup <backup-incremental-backup>`                 |   |v|        |
+-----------------------------------------------------------------------+--------------+
+| :doc:`Restore </operating-scylla/procedures/backup-restore/restore>`  |   |v|        |
+-----------------------------------------------------------------------+--------------+

 Repair and Consistency
 ^^^^^^^^^^^^^^^^^^^^^^
-+-------------------------------------+--------------+
-|   Options                           | Support      |
-+=====================================+==============+
-| `Nodetool Repair`_                  |   |v|        |
-+-------------------------------------+--------------+
-| Incremental Repair                  | |x|          |
-+-------------------------------------+--------------+
-|`Hinted Handoff`_                    | |v|          |
-+-------------------------------------+--------------+
-|`Lightweight transactions`_          |  |v|:sup:`*` |
-+-------------------------------------+--------------+
+----------------------------------------------------------------------+--------------+
+|   Options                                                            | Support      |
+======================================================================+==============+
+| :doc:`Nodetool Repair </operating-scylla/nodetool-commands/repair>`  |   |v|        |
+----------------------------------------------------------------------+--------------+
+| Incremental Repair                                                   | |x|          |
+----------------------------------------------------------------------+--------------+
+|:doc:`Hinted Handoff </architecture/anti-entropy/hinted-handoff>`     | |v|          |
+----------------------------------------------------------------------+--------------+
+|:doc:`Lightweight Transactions </using-scylla/lwt>`                   |  |v|:sup:`*` |
+----------------------------------------------------------------------+--------------+


-:sup:`*` From ScyllaDB 4.0. See `Scylla LWT`_
+:sup:`*` From ScyllaDB 4.0. See :doc:`Scylla LWT </using-scylla/lwt>`.

 Replica Replacement Strategy
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -230,15 +230,15 @@ Security
 Indexing and Caching
 ^^^^^^^^^^^^^^^^^^^^^

-+-------------------------------------+-----------------------------------------+
-|   Options                           | Support                                 |
-+=====================================+=========================================+
-|row / key cache                      | |x| (More on `Scylla memory and cache`_)|
-+-------------------------------------+-----------------------------------------+
-|`Secondary Index`_                   | |v| :sup:`*`                            |
-+-------------------------------------+-----------------------------------------+
-|`Materialized Views`_                |  |v|:sup:`*`                            |
-+-------------------------------------+-----------------------------------------+
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
+|   Options                                                    | Support                                                                              |
+==============================================================+======================================================================================+
+|row / key cache                                               | |x| (More on `Scylla memory and cache <http://www.scylladb.com/technology/memory/>`_)|
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
+|:doc:`Secondary Index </using-scylla/secondary-indexes>`      | |v| :sup:`*`                                                                         |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
+|:doc:`Materialized Views </using-scylla/materialized-views>`  |  |v|:sup:`*`                                                                         |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+

 :sup:`*` In ScyllaDB Open Source and ScyllaDB Enterprise from 2019.1

@@ -269,32 +269,6 @@ Additional Features

 :sup:`*`  Experimental 

-.. _`Secondary Index`: /using-scylla/secondary-indexes/
-.. _`Lightweight Transactions`: /using-scylla/lwt/
-.. _`Materialized Views`: /using-scylla/materialized-views/
-.. _`Node to Node Compression`: /operating-scylla/admin/#internode-compression
-.. _`Client to Node Compression`: /operating-scylla/admin/#client-node-compression
-.. _`Compression`: /operating-scylla/admin/#compression
-.. _`Scylla LWT`: /using-scylla/lwt/
-.. _401: https://github.com/scylladb/scylla/issues/401
-.. _1141: https://github.com/scylladb/scylla/issues/1141
-.. _1619: https://github.com/scylladb/scylla/issues/1619
-.. _577: https://github.com/scylladb/scylla/issues/577
-.. _`Scylla memory and cache`: http://www.scylladb.com/technology/memory/
-.. _Encryption: /operating-scylla/security/client_node_encryption/
-.. _Authentication: /operating-scylla/security/authentication/
-.. _Authorization: /operating-scylla/security/authorization/
-.. _`Nodetool Repair`: /operating-scylla/nodetool-commands/repair/
-.. _Snapshot: /operating-scylla/procedures/backup-restore/backup/#full-backup-snapshots
-.. _`Incremental backup`: /operating-scylla/procedures/backup-restore/backup/#incremental-backup
-.. _Restore: /operating-scylla/procedures/backup-restore/restore/
-.. _SimpleSnitch: /operating-scylla/system-configuration/snitch/#simplesnitch
-.. _RackInferringSnitch: /operating-scylla/system-configuration/snitch/#rackinferringsnitch
-.. _GossipingPropertyFileSnitch: /operating-scylla/system-configuration/snitch/#gossipingpropertyfilesnitch/
-.. _EC2Snitch: /operating-scylla/system-configuration/snitch/#ec2snitch/
-.. _EC2MultiRegionSnitch: /operating-scylla/system-configuration/snitch/#ec2multiregionsnitch
-.. _GoogleCloudSnitch: /operating-scylla/system-configuration/snitch/#googlecloudsnitch
-.. _`Hinted Handoff`: /architecture/anti-entropy/hinted-handoff/

 CQL Command Compatibility
 -------------------------
@@ -384,17 +358,17 @@ Create Table Att
 Create Table Compaction
 .......................

-+----------------------------------------+-------------------------------------+
-| Feature                                | Support                             |
-+========================================+=====================================+
-| SizeTieredCompactionStrategy_ (STCS)   | |v|                                 |
-+----------------------------------------+-------------------------------------+
-|LeveledCompactionStrategy_ (LCS)        | |v|                                 |
-+----------------------------------------+-------------------------------------+
-|DateTieredCompactionStrategy (DTCS)     | |v|  :sup:`*`                       |
-+----------------------------------------+-------------------------------------+
-|TimeWindowCompactionStrategy_ (TWCS)    | |v|                                 |
-+----------------------------------------+-------------------------------------+
+----------------------------------------------------+-------------------------------------+
+| Feature                                            | Support                             |
+====================================================+=====================================+
+| :ref:`SizeTieredCompactionStrategy <STCS>` (STCS)  | |v|                                 |
+----------------------------------------------------+-------------------------------------+
+|:ref:`LeveledCompactionStrategy <LCS>` (LCS)        | |v|                                 |
+----------------------------------------------------+-------------------------------------+
+|DateTieredCompactionStrategy (DTCS)                 | |v|  :sup:`*`                       |
+----------------------------------------------------+-------------------------------------+
+|:ref:`TimeWindowCompactionStrategy <TWCS>` (TWCS)   | |v|                                 |
+----------------------------------------------------+-------------------------------------+

 :sup:`*`  Deprecated in ScyllaDB 4.0, use TWCS instead

@@ -534,15 +508,6 @@ Index commands
 |DROP INDEX                              | |v|                                 |
 +----------------------------------------+-------------------------------------+

-
-.. _SizeTieredCompactionStrategy: /getting-started/compaction/#size-tiered-compaction-strategy
-
-.. _LeveledCompactionStrategy: /getting-started/compaction/#leveled-compaction-strategy
-
-.. _TimeWindowCompactionStrategy: /getting-started/compaction/#time-window-compactionstrategy
-
-.. _1432: https://github.com/scylladb/scylla/issues/1432
-
 .. include:: /rst_include/apache-copyrights-index.rst

 .. include:: /rst_include/apache-copyrights-index-all-attributes.rst
--- a/docs/using-scylla/cdc/cdc-stream-generations.rst
+++ b/docs/using-scylla/cdc/cdc-stream-generations.rst
@@ -124,58 +124,9 @@ Example: The Next Generation

   There are two entries with the same base partition key, but in different streams. One of them corresponds to the write made before the generation change, the other --- to the write made after the change.

-After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs. If you try to perform a write with a timestamp that is smaller than the new generation's timestamp, the write may be rejected, depending on the node you're connected to:
+After the operating CDC generation changes, all writes with timestamps greater than or equal to the new generation's timestamp will use the new stream IDs.

-* if the clock of the node you're connected to reports earlier time than the generation's timestamp, it will allow the write to be performed.
-* Otherwise, the write will be rejected.
-
-Therefore, if you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected while a new node is being bootstrapped.
-
-Example: rejecting writes to an old generation
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This is a continuation of the :ref:`previous example <next-gen>`; a second node was bootstrapped recently, thus a new generation superseded the previous one.
-
-#. Get the timestamp of the latest generation as an integer:
-
-   .. code-block:: cql
-
-    SELECT tounixtimestamp(time) FROM system_distributed.cdc_generation_timestamps WHERE key = 'timestamps';
-
-   result:
-
-   .. code-block:: none
-
-     system.tounixtimestamp(time)
-    ------------------------------
-                    1585152329484
-                    1585140283006
-
-    (2 rows)
-
-   Generation timestamps have millisecond resolution. Here, the latest generation's timestamp is equal to ``1585152329484`` milliseconds.
-
-#. Try to perform a write with a slightly smaller timestamp (remember that the ``USING TIMESTAMP`` clause expects a timestamp in **microseconds**):
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329483000;
-
-   result:
-
-   .. code-block:: none
-
-    InvalidRequest: Error from server: code=2200 [Invalid query] message="cdc: attempted to get a stream from an earlier generation than the currently used one. With CDC you cannot send writes with timestamps too far into the past, because that would break consistency properties (write timestamp: 2020/03/25 16:05:29, current generation started at: 2020/03/25 16:05:29)"
-
-   The write was rejected.
-
-#. Perform a write with a timestamp equal to the generation's timestamp:
-
-   .. code-block:: cql
-
-    INSERT INTO ks.t (pk, ck, v) VALUES (0, 0, 0) USING TIMESTAMP 1585152329484000;
-
-   The write succeeds.
+If the clock of the node you're connected to reports a time that is distant from the write's timestamp, the node may reject the write. If you've configured the driver to generate timestamps for you, make sure that the clock of the machine your driver is running on is not too desynchronized with the clock of the node you're connecting to. That way you can minimize the chance of writes being rejected.

 The first generation's timestamp
 --------------------------------
--- a/docs/using-scylla/drivers/index.rst
+++ b/docs/using-scylla/drivers/index.rst
@@ -14,7 +14,7 @@ Scylla Drivers
 You can use Scylla with:

 * :doc:`Apache Cassandra CQL Compatible Drivers <cql-drivers/index>`
-* :doc:`AWS DynamoDB Compatible API Drivers <dynamo-drivers/index>`
+* :doc:`Amazon DynamoDB Compatible API Drivers <dynamo-drivers/index>`

 Additional drivers coming soon!

--- a/generic_server.cc
+++ b/generic_server.cc
@@ -55,8 +55,12 @@ static bool is_broken_pipe_or_connection_reset(std::exception_ptr ep) {
    try {
        std::rethrow_exception(ep);
    } catch (const std::system_error& e) {
-        return e.code().category() == std::system_category()
-            && (e.code().value() == EPIPE || e.code().value() == ECONNRESET);
+        return (e.code().category() == std::system_category()
+            && (e.code().value() == EPIPE || e.code().value() == ECONNRESET))
+            // tls version:
+            || (e.code().category() == tls::error_category()
+            && (e.code().value() == tls::ERROR_PREMATURE_TERMINATION))
+            ;
    } catch (...) {}
    return false;
 }
--- a/Show More
+++ b/Show More