doc: fix rollback in the 5.0-to-5.1 upgrade guide

This commit fixes the rollback procedure in the 5.0-to-5.1 upgrade guide: - The "Restore system tables" step is removed. - The "Restore the configuration file" command is fixed. - The "Gracefully shutdown ScyllaDB" command is fixed. In addition, there are the following updates to be in sync with the tests: - The "Backup the configuration file" step is extended to include a command to back up the packages. - The Rollback procedure is extended to restore the backup packages. - The Reinstallation section is fixed for RHEL. Also, I've removed the rollback section for images, as it's not correct or relevant. Refs https://github.com/scylladb/scylladb/issues/11907 This commit must be backported to branch-5.4, branch-5.2, and branch-5.1 Closes scylladb/scylladb#16154 (cherry picked from commit 7ad0b92559)
Update seastar submodule
2023-12-05 15:08:58 +02:00 · 2023-12-05 10:42:50 +03:00 · 2023-11-30 15:01:40 +02:00 · 2023-11-30 14:14:56 +02:00 · 2023-11-27 18:16:47 +00:00 · 2023-11-26 17:09:36 +02:00
314 changed files with 14307 additions and 2191 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.1.0-dev
+VERSION=5.1.19

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -34,6 +34,7 @@
 #include "expressions.hh"
 #include "conditions.hh"
 #include "cql3/constants.hh"
+#include "cql3/util.hh"
 #include <optional>
 #include "utils/overloaded_functor.hh"
 #include <seastar/json/json_elements.hh>
@@ -87,17 +88,20 @@ json::json_return_type make_streamed(rjson::value&& value) {
        // move objects to coroutine frame.
        auto los = std::move(os);
        auto lrs = std::move(rs);
+        std::exception_ptr ex;
        try {
            co_await rjson::print(*lrs, los);
-            co_await los.flush();
-            co_await los.close();
        } catch (...) {
            // at this point, we cannot really do anything. HTTP headers and return code are
            // already written, and quite potentially a portion of the content data.
            // just log + rethrow. It is probably better the HTTP server closes connection
            // abruptly or something...
-            elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
-            throw;
+            ex = std::current_exception();
+            elogger.error("Exception during streaming HTTP response: {}", ex);
+        }
+        co_await los.close();
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
        }
        co_return;
    };
@@ -438,6 +442,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -460,6 +469,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
            rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
            // Add indexes's KeySchema and collect types for AttributeDefinitions:
            describe_key_schema(view_entry, *vptr, key_attribute_types);
+            // Add projection type
+            rjson::value projection = rjson::empty_object();
+            rjson::add(projection, "ProjectionType", "ALL");
+            // FIXME: we have to get ProjectionType from the schema when it is added
+            rjson::add(view_entry, "Projection", std::move(projection));
            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
            rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
            rjson::push_back(index_array, std::move(view_entry));
@@ -917,9 +931,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            if  (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
            }
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -974,9 +989,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -2329,21 +2345,22 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
    return item;
 }

-std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get) {
-    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
-    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
+future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get) {
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
+    query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
    auto result_set = builder.build();
    std::vector<rjson::value> ret;
    for (auto& result_row : result_set->rows()) {
        rjson::value item = rjson::empty_object();
-        describe_single_item(selection, result_row, attrs_to_get, item);
+        describe_single_item(*selection, result_row, *attrs_to_get, item);
        ret.push_back(std::move(item));
+        co_await coroutine::maybe_yield();
    }
-    return ret;
+    co_return ret;
 }

 static bool check_needs_read_before_write(const parsed::value& v) {
@@ -3223,8 +3240,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
-                std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
-                return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
+                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get));
            });
            response_futures.push_back(std::move(f));
        }
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -222,11 +222,11 @@ public:
        const query::result&,
        const std::optional<attrs_to_get>&);

-    static std::vector<rjson::value> describe_multi_item(schema_ptr schema,
-        const query::partition_slice& slice,
-        const cql3::selection::selection& selection,
-        const query::result& query_result,
-        const std::optional<attrs_to_get>& attrs_to_get);
+    static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr<cql3::selection::selection> selection,
+        foreign_ptr<lw_shared_ptr<query::result>> query_result,
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<bytes_opt>&,
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -142,19 +142,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
    auto table = find_table(_proxy, request);
    auto db = _proxy.data_dictionary();
    auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();

    if (limit < 1) {
        throw api_error::validation("Limit must be 1 or more");
    }

-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they 
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that 
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id() < t2.schema()->id();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();
+
    if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
            return t.schema()->id() == streams_start 
                && cdc::get_base_table(db.real_database(), *t.schema())
                && is_alternator_keyspace(t.schema()->ks_name())
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1946,7 +1946,7 @@
         "operations":[
            {
               "method":"POST",
-               "summary":"Reset local schema",
+               "summary":"Forces this node to recalculate versions of schema objects.",
               "type":"void",
               "nickname":"reset_local_schema",
               "produces":[
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -17,36 +17,42 @@ namespace fd = httpd::failure_detector_json;

 void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    fd::get_all_endpoint_states.set(r, [&g](std::unique_ptr<request> req) {
-        std::vector<fd::endpoint_state> res;
-        for (auto i : g.get_endpoint_states()) {
-            fd::endpoint_state val;
-            val.addrs = boost::lexical_cast<std::string>(i.first);
-            val.is_alive = i.second.is_alive();
-            val.generation = i.second.get_heart_beat_state().get_generation();
-            val.version = i.second.get_heart_beat_state().get_heart_beat_version();
-            val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
-            for (auto a : i.second.get_application_state_map()) {
-                fd::version_value version_val;
-                // We return the enum index and not it's name to stay compatible to origin
-                // method that the state index are static but the name can be changed.
-                version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
-                version_val.value = a.second.value;
-                version_val.version = a.second.version;
-                val.application_state.push(version_val);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::vector<fd::endpoint_state> res;
+            for (auto i : g.get_endpoint_states()) {
+                fd::endpoint_state val;
+                val.addrs = boost::lexical_cast<std::string>(i.first);
+                val.is_alive = i.second.is_alive();
+                val.generation = i.second.get_heart_beat_state().get_generation();
+                val.version = i.second.get_heart_beat_state().get_heart_beat_version();
+                val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
+                for (auto a : i.second.get_application_state_map()) {
+                    fd::version_value version_val;
+                    // We return the enum index and not it's name to stay compatible to origin
+                    // method that the state index are static but the name can be changed.
+                    version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
+                    version_val.value = a.second.value;
+                    version_val.version = a.second.version;
+                    val.application_state.push(version_val);
+                }
+                res.push_back(val);
            }
-            res.push_back(val);
-        }
-        return make_ready_future<json::json_return_type>(res);
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_up_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_up_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_up_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_down_endpoint_count.set(r, [&g](std::unique_ptr<request> req) {
-        int res = g.get_down_endpoint_count();
-        return make_ready_future<json::json_return_type>(res);
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            int res = g.get_down_endpoint_count();
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    fd::get_phi_convict_threshold.set(r, [] (std::unique_ptr<request> req) {
@@ -54,11 +60,13 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_simple_states.set(r, [&g] (std::unique_ptr<request> req) {
-        std::map<sstring, sstring> nodes_status;
-        for (auto& entry : g.get_endpoint_states()) {
-            nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
-        }
-        return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        return g.container().invoke_on(0, [] (gms::gossiper& g) {
+            std::map<sstring, sstring> nodes_status;
+            for (auto& entry : g.get_endpoint_states()) {
+                nodes_status.emplace(entry.first.to_sstring(), entry.second.is_alive() ? "UP" : "DOWN");
+            }
+            return make_ready_future<json::json_return_type>(map_to_key_value<fd::mapper>(nodes_status));
+        });
    });

    fd::set_phi_convict_threshold.set(r, [](std::unique_ptr<request> req) {
@@ -67,13 +75,15 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
    });

    fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
-        auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
-        if (!state) {
-            return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
-        }
-        std::stringstream ss;
-        g.append_endpoint_state(ss, *state);
-        return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
+            auto* state = g.get_endpoint_state_for_endpoint_ptr(gms::inet_address(req->param["addr"]));
+            if (!state) {
+                return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
+            }
+            std::stringstream ss;
+            g.append_endpoint_state(ss, *state);
+            return make_ready_future<json::json_return_type>(sstring(ss.str()));
+        });
    });

    fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -19,9 +19,11 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
        return container_to_vec(res);
    });

-    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (const_req req) {
-        auto res = g.get_live_members();
-        return container_to_vec(res);
+
+    httpd::gossiper_json::get_live_endpoint.set(r, [&g] (std::unique_ptr<request> req) {
+        return g.get_live_members_synchronized().then([] (auto res) {
+            return make_ready_future<json::json_return_type>(container_to_vec(res));
+        });
    });

    httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (const_req req) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -184,17 +184,21 @@ future<json::json_return_type> set_tables_autocompaction(http_context& ctx, cons
 }

 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
-    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::start_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+    ss::stop_native_transport.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -216,17 +220,21 @@ void unset_transport_controller(http_context& ctx, routes& r) {
 }

 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
-    ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.request_stop_server();
+    ss::stop_rpc_server.set(r, [&ctx, &ctl] (std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.request_stop_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
-        return smp::submit_to(0, [&] {
-            return ctl.start_server();
+    ss::start_rpc_server.set(r, [&ctx, &ctl](std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx, &ctl] {
+            return with_scheduling_group(ctx.db.local().get_statement_scheduling_group(), [&ctl] () mutable {
+                return ctl.start_server();
+            });
        }).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -610,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
            auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
                return db.find_uuid(keyspace, cf_name);
@@ -634,6 +643,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
        return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
                column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
            if (!is_cleanup_allowed) {
@@ -653,7 +663,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
                for (auto& id : table_ids) {
                    replica::table& t = db.find_column_family(id);
-                    co_await cm.perform_cleanup(owned_ranges_ptr, t.as_table_state());
+                    co_await t.perform_cleanup_compaction(owned_ranges_ptr);
                }
                co_return;
            }).then([]{
@@ -663,6 +673,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
+        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
        co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
            bool needed = false;
            for (const auto& table : tables) {
@@ -676,6 +687,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

+        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
        return ctx.db.invoke_on_all([=] (replica::database& db) {
            auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
            return do_for_each(column_families, [=, &db](sstring cfname) {
@@ -691,6 +703,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
        auto &db = ctx.db.local();
        if (column_families.empty()) {
            co_await db.flush_on_all(keyspace);
@@ -702,6 +715,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_


    ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("decommission");
        return ss.local().decommission().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -717,6 +731,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
        auto host_id = req->get_query_param("host_id");
        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
+        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
        auto ignore_nodes = std::list<gms::inet_address>();
        for (std::string n : ignore_nodes_strs) {
            try {
@@ -789,6 +804,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("drain");
        return ss.local().drain().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -822,12 +838,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("stop_gossiping");
        return ss.local().stop_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

    ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("start_gossiping");
        return ss.local().start_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -930,6 +948,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
        auto source_dc = req->get_query_param("source_dc");
+        apilog.info("rebuild: source_dc={}", source_dc);
        return ss.local().rebuild(std::move(source_dc)).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -962,17 +981,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::reset_local_schema.set(r, [&sys_ks](std::unique_ptr<request> req) {
+    ss::reset_local_schema.set(r, [&ss](std::unique_ptr<request> req) -> future<json::json_return_type> {
        // FIXME: We should truncate schema tables if more than one node in the cluster.
-        auto& sp = service::get_storage_proxy();
-        auto& fs = sp.local().features();
-        return db::schema_tables::recalculate_schema_version(sys_ks, sp, fs).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        apilog.info("reset_local_schema");
+        co_await ss.local().reload_schema();
+        co_return json_void();
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
+        apilog.info("set_trace_probability: probability={}", probability);
        return futurize_invoke([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -1010,6 +1028,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto ttl = req->get_query_param("ttl");
        auto threshold = req->get_query_param("threshold");
        auto fast = req->get_query_param("fast");
+        apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
        try {
            return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
                if (threshold != "") {
@@ -1036,6 +1055,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, keyspace, tables, true);
    });

@@ -1043,6 +1063,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, keyspace, tables, false);
    });

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -66,36 +66,48 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
    set_view(_data);
 }

-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+// Based on Cassandra's resolveRegular function:
+//  - https://github.com/apache/cassandra/blob/e4f31b73c21b04966269c5ac2d3bd2562e5f6c63/src/java/org/apache/cassandra/db/rows/Cells.java#L79-L119
+//
+// Note: the ordering algorithm for cell is the same as for rows,
+// except that the cell value is used to break a tie in case all other attributes are equal.
+// See compare_row_marker_for_merge.
 std::strong_ordering
 compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    // Largest write timestamp wins.
    if (left.timestamp() != right.timestamp()) {
        return left.timestamp() <=> right.timestamp();
    }
+    // Tombstones always win reconciliation with live cells of the same timestamp
    if (left.is_live() != right.is_live()) {
        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
    }
    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
+        // Prefer expiring cells (which will become tombstones at some future date) over live cells.
+        // See https://issues.apache.org/jira/browse/CASSANDRA-14592
        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
        }
+        // If both are expiring, choose the cell with the latest expiry or derived write time.
        if (left.is_live_and_has_ttl()) {
+            // Prefer cell with latest expiry
            if (left.expiry() != right.expiry()) {
                return left.expiry() <=> right.expiry();
-            } else {
-                // prefer the cell that was written later,
-                // so it survives longer after it expires, until purged.
+            } else if (right.ttl() != left.ttl()) {
+                // The cell write time is derived by (expiry - ttl).
+                // Prefer the cell that was written later,
+                // so it survives longer after it expires, until purged,
+                // as it becomes purgeable gc_grace_seconds after it was written.
+                //
+                // Note that this is an extension to Cassandra's algorithm,
+                // which stops at the expiration time and, if equal,
+                // moves on to compare the cell values.
                return right.ttl() <=> left.ttl();
            }
        }
+        // The cell with the largest value wins, if all other attributes of the cells are identical.
+        // This is quite arbitrary, but still required to break the tie in a deterministic way.
+        return compare_unsigned(left.value(), right.value());
    } else {
        // Both are deleted

--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -55,6 +55,7 @@ future<bool> default_role_row_satisfies(
        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
+                internal_distributed_query_state(),
                {meta::DEFAULT_SUPERUSER_NAME},
                cql3::query_processor::cache_internal::yes).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -41,6 +41,7 @@
 #include "mutation_compactor.hh"
 #include "leveled_manifest.hh"
 #include "dht/token.hh"
+#include "dht/partition_filter.hh"
 #include "mutation_writer/shard_based_splitting_writer.hh"
 #include "mutation_writer/partition_based_splitting_writer.hh"
 #include "mutation_source_metadata.hh"
@@ -165,7 +166,7 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
 }

 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
-        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
+        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
    auto timestamp = table_s.min_memtable_timestamp();
    std::optional<utils::hashed_key> hk;
    for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -176,6 +177,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
            hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
        }
        if (sst->filter_has_key(*hk)) {
+            bloom_filter_checks++;
            timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
        }
    }
@@ -398,9 +400,12 @@ private:

 class formatted_sstables_list {
    bool _include_origin = true;
-    std::vector<sstring> _ssts;
+    std::vector<std::string> _ssts;
 public:
    formatted_sstables_list() = default;
+    void reserve(size_t n) {
+        _ssts.reserve(n);
+    }
    explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
        _ssts.reserve(ssts.size());
        for (const auto& sst : ssts) {
@@ -419,9 +424,7 @@ public:
 };

 std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
-    os << "[";
-    os << boost::algorithm::join(lst._ssts, ",");
-    os << "]";
+    fmt::print(os, "[{}]", fmt::join(lst._ssts, ","));
    return os;
 }

@@ -446,6 +449,7 @@ protected:
    uint64_t _start_size = 0;
    uint64_t _end_size = 0;
    uint64_t _estimated_partitions = 0;
+    uint64_t _bloom_filter_checks = 0;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
    bool _contains_multi_fragment_runs = false;
@@ -555,13 +559,13 @@ protected:
        return bool(_sstable_set);
    }

-    compaction_writer create_gc_compaction_writer() const {
+    compaction_writer create_gc_compaction_writer(utils::UUID gc_run) const {
        auto sst = _sstable_creator(this_shard_id());

        auto&& priority = _io_priority;
        auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
        sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
-        cfg.run_identifier = _run_identifier;
+        cfg.run_identifier = gc_run;
        cfg.monitor = monitor.get();
        auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
        return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
@@ -582,8 +586,14 @@ protected:
    // When compaction finishes, all the temporary sstables generated here will be deleted and removed
    // from table's sstable set.
    compacted_fragments_writer get_gc_compacted_fragments_writer() {
+        // because the temporary sstable run can overlap with the non-gc sstables run created by
+        // get_compacted_fragments_writer(), we have to use a different run_id. the gc_run_id is
+        // created here as:
+        // 1. it can be shared across all sstables created by this writer
+        // 2. it is optional, as gc writer is not always used
+        auto gc_run = utils::make_random_uuid();
        return compacted_fragments_writer(*this,
-             [this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
+             [this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
             [this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
             _stop_request_observable);
    }
@@ -600,8 +610,8 @@ protected:
        return _used_garbage_collected_sstables;
    }

-    bool enable_garbage_collected_sstable_writer() const noexcept {
-        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept {
+        return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
    }
 public:
    compaction& operator=(const compaction&) = delete;
@@ -623,6 +633,7 @@ private:
    future<> setup() {
        auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
        formatted_sstables_list formatted_msg;
+        formatted_msg.reserve(_sstables.size());
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
        min_max_tracker<api::timestamp_type> timestamp_tracker;

@@ -736,6 +747,7 @@ protected:
                .ended_at = ended_at,
                .start_size = _start_size,
                .end_size = _end_size,
+                .bloom_filter_checks = _bloom_filter_checks,
            },
        };

@@ -755,7 +767,7 @@ protected:
        log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
                report_finish_desc(),
                _input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
-                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
+                std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_start_size, duration),
                _cdata.total_partitions, _cdata.total_keys_written);

        return ret;
@@ -776,7 +788,7 @@ private:
            };
        }
        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
        };
    }

@@ -894,51 +906,6 @@ void compacted_fragments_writer::consume_end_of_stream() {
    }
 }

-class reshape_compaction : public compaction {
-public:
-    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
-        : compaction(table_s, std::move(descriptor), cdata) {
-    }
-
-    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
-    }
-
-    flat_mutation_reader_v2 make_sstable_reader() const override {
-        return _compacting->make_local_shard_sstable_reader(_schema,
-                _permit,
-                query::full_partition_range,
-                _schema->full_slice(),
-                _io_priority,
-                tracing::trace_state_ptr(),
-                ::streamed_mutation::forwarding::no,
-                ::mutation_reader::forwarding::no,
-                default_read_monitor_generator());
-    }
-
-    std::string_view report_start_desc() const override {
-        return "Reshaping";
-    }
-
-    std::string_view report_finish_desc() const override {
-        return "Reshaped";
-    }
-
-    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
-        auto sst = _sstable_creator(this_shard_id());
-        setup_new_sstable(sst);
-
-        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
-    }
-
-    virtual void stop_sstable_writer(compaction_writer* writer) override {
-        if (writer) {
-            finish_new_sstable(writer);
-        }
-    }
-};
-
 class regular_compaction : public compaction {
    // keeps track of monitors for input sstable, which are responsible for adjusting backlog as compaction progresses.
    mutable compaction_read_monitor_generator _monitor_generator;
@@ -1048,12 +1015,13 @@ private:
    }

    void update_pending_ranges() {
-        if (!_sstable_set || _sstable_set->all()->empty() || _cdata.pending_replacements.empty()) { // set can be empty for testing scenario.
+        auto pending_replacements = std::exchange(_cdata.pending_replacements, {});
+        if (!_sstable_set || _sstable_set->all()->empty() || pending_replacements.empty()) { // set can be empty for testing scenario.
            return;
        }
        // Releases reference to sstables compacted by this compaction or another, both of which belongs
        // to the same column family
-        for (auto& pending_replacement : _cdata.pending_replacements) {
+        for (auto& pending_replacement : pending_replacements) {
            for (auto& sst : pending_replacement.removed) {
                // Set may not contain sstable to be removed because this compaction may have started
                // before the creation of that sstable.
@@ -1067,35 +1035,76 @@ private:
            }
        }
        _selector.emplace(_sstable_set->make_incremental_selector());
-        _cdata.pending_replacements.clear();
+    }
+};
+
+class reshape_compaction : public regular_compaction {
+private:
+    bool has_sstable_replacer() const noexcept {
+        return bool(_replacer);
+    }
+public:
+    reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
+            : regular_compaction(table_s, std::move(descriptor), cdata) {
+    }
+
+    virtual sstables::sstable_set make_sstable_set_for_input() const override {
+        return sstables::make_partitioned_sstable_set(_schema, false);
+    }
+
+    // Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
+    virtual bool enable_garbage_collected_sstable_writer() const noexcept override {
+        return _max_sstable_size != std::numeric_limits<uint64_t>::max() && bool(_replacer);
+    }
+
+    flat_mutation_reader_v2 make_sstable_reader() const override {
+        return _compacting->make_local_shard_sstable_reader(_schema,
+                _permit,
+                query::full_partition_range,
+                _schema->full_slice(),
+                _io_priority,
+                tracing::trace_state_ptr(),
+                ::streamed_mutation::forwarding::no,
+                ::mutation_reader::forwarding::no,
+                default_read_monitor_generator());
+    }
+
+    std::string_view report_start_desc() const override {
+        return "Reshaping";
+    }
+
+    std::string_view report_finish_desc() const override {
+        return "Reshaped";
+    }
+
+    virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
+        auto sst = _sstable_creator(this_shard_id());
+        setup_new_sstable(sst);
+
+        sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
+        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
+    }
+
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (writer) {
+            if (has_sstable_replacer()) {
+                regular_compaction::stop_sstable_writer(writer);
+            } else {
+                finish_new_sstable(writer);
+            }
+        }
+    }
+
+    virtual void on_end_of_compaction() override {
+        if (has_sstable_replacer()) {
+            regular_compaction::on_end_of_compaction();
+        }
    }
 };

 class cleanup_compaction final : public regular_compaction {
-    class incremental_owned_ranges_checker {
-        const dht::token_range_vector& _sorted_owned_ranges;
-        mutable dht::token_range_vector::const_iterator _it;
-    public:
-        incremental_owned_ranges_checker(const dht::token_range_vector& sorted_owned_ranges)
-                : _sorted_owned_ranges(sorted_owned_ranges)
-                , _it(_sorted_owned_ranges.begin()) {
-        }
-
-        // Must be called with increasing token values.
-        bool belongs_to_current_node(const dht::token& t) const {
-            // While token T is after a range Rn, advance the iterator.
-            // iterator will be stopped at a range which either overlaps with T (if T belongs to node),
-            // or at a range which is after T (if T doesn't belong to this node).
-            while (_it != _sorted_owned_ranges.end() && _it->after(t, dht::token_comparator())) {
-                _it++;
-            }
-
-            return _it != _sorted_owned_ranges.end() && _it->contains(t, dht::token_comparator());
-        }
-    };
-
    owned_ranges_ptr _owned_ranges;
-    incremental_owned_ranges_checker _owned_ranges_checker;
+    mutable dht::incremental_owned_ranges_checker _owned_ranges_checker;
 private:
    // Called in a seastar thread
    dht::partition_range_vector
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -80,8 +80,10 @@ struct compaction_data {
    }

    void stop(sstring reason) {
-        stop_requested = std::move(reason);
-        abort.request_abort();
+        if (!abort.abort_requested()) {
+            stop_requested = std::move(reason);
+            abort.request_abort();
+        }
    }
 };

@@ -90,12 +92,15 @@ struct compaction_stats {
    uint64_t start_size = 0;
    uint64_t end_size = 0;
    uint64_t validation_errors = 0;
+    // Bloom filter checks during max purgeable calculation
+    uint64_t bloom_filter_checks = 0;

    compaction_stats& operator+=(const compaction_stats& r) {
        ended_at = std::max(ended_at, r.ended_at);
        start_size += r.start_size;
        end_size += r.end_size;
        validation_errors += r.validation_errors;
+        bloom_filter_checks += r.bloom_filter_checks;
        return *this;
    }
    friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -7,14 +7,17 @@
 */

 #include "compaction_manager.hh"
+#include "compaction_descriptor.hh"
 #include "compaction_strategy.hh"
 #include "compaction_backlog_manager.hh"
 #include "sstables/sstables.hh"
 #include "sstables/sstables_manager.hh"
+#include <memory>
 #include <seastar/core/metrics.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/switch_to.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include "sstables/exceptions.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "utils/fb_utilities.hh"
@@ -75,6 +78,23 @@ public:
            _compacting.erase(sst);
        }
    }
+
+    class update_me : public compaction_manager::task::on_replacement {
+        compacting_sstable_registration& _registration;
+        public:
+            update_me(compacting_sstable_registration& registration)
+                : _registration{registration} {}
+            void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
+                _registration.release_compacting(sstables);
+            }
+            void on_addition(const std::vector<sstables::shared_sstable>& sstables) override {
+                _registration.register_compacting(sstables);
+            }
+    };
+
+    auto update_on_sstable_replacement() {
+        return update_me(*this);
+    }
 };

 sstables::compaction_data compaction_manager::create_compaction_data() {
@@ -276,7 +296,7 @@ compaction_manager::task::task(compaction_manager& mgr, compaction::table_state*
    , _description(std::move(desc))
 {}

-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_manager::task> task, throw_if_stopping do_throw_if_stopping) {
    _tasks.push_back(task);
    auto unregister_task = defer([this, task] {
        _tasks.remove(task);
@@ -289,6 +309,9 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
        co_return res;
    } catch (sstables::compaction_stopped_exception& e) {
        cmlog.info("{}: stopped, reason: {}", *task, e.what());
+        if (do_throw_if_stopping) {
+            throw;
+        }
    } catch (sstables::compaction_aborted_exception& e) {
        cmlog.error("{}: aborted, reason: {}", *task, e.what());
        _stats.errors++;
@@ -307,14 +330,14 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
    co_return std::nullopt;
 }

-future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
+future<sstables::compaction_result> compaction_manager::task::compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge) {
    if (!descriptor.sstables.size()) {
        // if there is nothing to compact, just return.
        co_return sstables::compaction_result{};
    }

    bool should_update_history = this->should_update_history(descriptor.options.type());
-    sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, std::move(release_exhausted), std::move(can_purge));
+    sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), cdata, on_replace, std::move(can_purge));

    if (should_update_history) {
        co_await update_history(*_compacting_table, res, cdata);
@@ -322,8 +345,11 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables_a

    co_return res;
 }
-future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted, can_purge_tombstones can_purge) {
+
+future<sstables::compaction_result> compaction_manager::task::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, can_purge_tombstones can_purge,
+                                                                               sstables::offstrategy offstrategy) {
    compaction::table_state& t = *_compacting_table;
+
    if (can_purge) {
        descriptor.enable_garbage_collection(t.main_sstable_set());
    }
@@ -331,15 +357,26 @@ future<sstables::compaction_result> compaction_manager::task::compact_sstables(s
        auto sst = t.make_sstable();
        return sst;
    };
-    descriptor.replacer = [this, &t, release_exhausted] (sstables::compaction_completion_desc desc) {
+
+    descriptor.replacer = [this, &t, &on_replace, offstrategy] (sstables::compaction_completion_desc desc) {
        t.get_compaction_strategy().notify_completion(desc.old_sstables, desc.new_sstables);
        _cm.propagate_replacement(t, desc.old_sstables, desc.new_sstables);
+        // on_replace updates the compacting registration with the old and new
+        // sstables. while on_compaction_completion() removes the old sstables
+        // from the table's sstable set, and adds the new ones to the sstable
+        // set.
+        // since the regular compactions exclude the sstables in the sstable
+        // set which are currently being compacted, if we want to ensure the
+        // exclusive access of compactions to an sstable we should guard it
+        // with the registration when adding/removing it to/from the sstable
+        // set. otherwise, the regular compaction would pick it up in the time
+        // window, where the sstables:
+        // - are still in the main set
+        // - are not being compacted.
+        on_replace.on_addition(desc.new_sstables);
        auto old_sstables = desc.old_sstables;
-        t.on_compaction_completion(std::move(desc), sstables::offstrategy::no).get();
-        // Calls compaction manager's task for this compaction to release reference to exhausted SSTables.
-        if (release_exhausted) {
-            release_exhausted(old_sstables);
-        }
+        t.on_compaction_completion(std::move(desc), offstrategy).get();
+        on_replace.on_removal(old_sstables);
    };

    co_return co_await sstables::compact_sstables(std::move(descriptor), cdata, t);
@@ -377,9 +414,7 @@ protected:
        sstables::compaction_strategy cs = t->get_compaction_strategy();
        sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, _cm.get_candidates(*t));
        auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
-        auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-            compacting.release_compacting(exhausted_sstables);
-        };
+        auto on_replace = compacting.update_on_sstable_replacement();
        setup_new_compaction(descriptor.run_identifier);

        cmlog.info0("User initiated compaction started on behalf of {}.{}", t->schema()->ks_name(), t->schema()->cf_name());
@@ -391,7 +426,7 @@ protected:
        // the exclusive lock can be freed to let regular compaction run in parallel to major
        lock_holder.return_all();

-        co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted));
+        co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace);

        finish_compaction();

@@ -438,12 +473,12 @@ protected:
    }
 };

-future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job) {
+future<> compaction_manager::run_custom_job(compaction::table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping) {
    if (_state != state::enabled) {
        return make_ready_future<>();
    }

-    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job))).discard_result();
+    return perform_task(make_shared<custom_compaction_task>(*this, &t, type, desc, std::move(job)), do_throw_if_stopping).discard_result();
 }

 future<> compaction_manager::update_static_shares(float static_shares) {
@@ -637,6 +672,7 @@ sstables::compaction_stopped_exception compaction_manager::task::make_compaction

 compaction_manager::compaction_manager(config cfg, abort_source& as)
    : _cfg(std::move(cfg))
+    , _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), [this] () -> float {
        _last_backlog = backlog();
        auto b = _last_backlog / available_memory();
@@ -670,6 +706,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as)

 compaction_manager::compaction_manager()
    : _cfg(config{ .available_memory = 1 })
+    , _compaction_submission_timer(compaction_sg().cpu, compaction_submission_callback())
    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, [] () -> float { return 1.0; }))
    , _backlog_manager(_compaction_controller)
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
@@ -726,38 +763,46 @@ void compaction_manager::register_metrics() {
 void compaction_manager::enable() {
    assert(_state == state::none || _state == state::disabled);
    _state = state::enabled;
-    _compaction_submission_timer.arm(periodic_compaction_submission_interval());
-    postponed_compactions_reevaluation();
+    _compaction_submission_timer.arm_periodic(periodic_compaction_submission_interval());
+    _waiting_reevalution = postponed_compactions_reevaluation();
 }

 std::function<void()> compaction_manager::compaction_submission_callback() {
    return [this] () mutable {
        for (auto& e: _compaction_state) {
-            submit(*e.first);
+            postpone_compaction_for_table(e.first);
        }
+        reevaluate_postponed_compactions();
    };
 }

-void compaction_manager::postponed_compactions_reevaluation() {
-    _waiting_reevalution = repeat([this] {
-        return _postponed_reevaluation.wait().then([this] {
-            if (_state != state::enabled) {
-                _postponed.clear();
-                return stop_iteration::yes;
-            }
-            auto postponed = std::move(_postponed);
-            try {
-                for (auto& t : postponed) {
-                    auto s = t->schema();
-                    cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
-                    submit(*t);
+future<> compaction_manager::postponed_compactions_reevaluation() {
+     while (true) {
+        co_await _postponed_reevaluation.when();
+        if (_state != state::enabled) {
+            _postponed.clear();
+            co_return;
+        }
+        // A table_state being reevaluated can re-insert itself into the postponed list, which is
+        // the reason for moving the list to be processed into a local.
+        auto postponed = std::exchange(_postponed, {});
+        try {
+            for (auto it = postponed.begin(); it != postponed.end();) {
+                compaction::table_state* t = *it;
+                it = postponed.erase(it);
+                // skip reevaluation of a table_state that became invalid post its removal
+                if (!_compaction_state.contains(t)) {
+                    continue;
                }
-            } catch (...) {
-                _postponed = std::move(postponed);
+                auto s = t->schema();
+                cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
+                submit(*t);
+                co_await coroutine::maybe_yield();
            }
-            return stop_iteration::no;
-        });
-    });
+        } catch (...) {
+            _postponed.insert(postponed.begin(), postponed.end());
+        }
+    }
 }

 void compaction_manager::reevaluate_postponed_compactions() noexcept {
@@ -842,6 +887,20 @@ future<> compaction_manager::really_do_stop() {
    cmlog.info("Stopped");
 }

+template <typename Ex>
+requires std::is_base_of_v<std::exception, Ex> &&
+requires (const Ex& ex) {
+    { ex.code() } noexcept -> std::same_as<const std::error_code&>;
+}
+auto swallow_enospc(const Ex& ex) noexcept {
+    if (ex.code().value() != ENOSPC) {
+        return make_exception_future<>(std::make_exception_ptr(ex));
+    }
+
+    cmlog.warn("Got ENOSPC on stop, ignoring...");
+    return make_ready_future<>();
+}
+
 void compaction_manager::do_stop() noexcept {
    if (_state == state::none || _state == state::stopped) {
        return;
@@ -849,7 +908,10 @@ void compaction_manager::do_stop() noexcept {

    try {
        _state = state::stopped;
-        _stop_future = really_do_stop();
+        _stop_future = really_do_stop()
+            .handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
+            .handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
+        ;
    } catch (...) {
        cmlog.error("Failed to stop the manager: {}", std::current_exception());
    }
@@ -941,9 +1003,7 @@ protected:
            }
            auto compacting = compacting_sstable_registration(_cm, descriptor.sstables);
            auto weight_r = compaction_weight_registration(&_cm, weight);
-            auto release_exhausted = [&compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-                compacting.release_compacting(exhausted_sstables);
-            };
+            auto on_replace = compacting.update_on_sstable_replacement();
            cmlog.debug("Accepted compaction job: task={} ({} sstable(s)) of weight {} for {}.{}",
                fmt::ptr(this), descriptor.sstables.size(), weight, t.schema()->ks_name(), t.schema()->cf_name());

@@ -952,7 +1012,7 @@ protected:

            try {
                bool should_update_history = this->should_update_history(descriptor.options.type());
-                sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, std::move(release_exhausted));
+                sstables::compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
                finish_compaction();
                if (should_update_history) {
                    // update_history can take a long time compared to
@@ -1050,83 +1110,66 @@ public:
    bool performed() const noexcept {
        return _performed;
    }
-
+private:
    future<> run_offstrategy_compaction(sstables::compaction_data& cdata) {
-        // This procedure will reshape sstables in maintenance set until it's ready for
-        // integration into main set.
-        // It may require N reshape rounds before the set satisfies the strategy invariant.
-        // This procedure also only updates maintenance set at the end, on success.
-        // Otherwise, some overlapping could be introduced in the set after each reshape
-        // round, progressively degrading read amplification until integration happens.
-        // The drawback of this approach is the 2x space requirement as the old sstables
-        // will only be deleted at the end. The impact of this space requirement is reduced
-        // by the fact that off-strategy is serialized across all tables, meaning that the
-        // actual requirement is the size of the largest table's maintenance set.
+        // Incrementally reshape the SSTables in maintenance set. The output of each reshape
+        // round is merged into the main set. The common case is that off-strategy input
+        // is mostly disjoint, e.g. repair-based node ops, in which case all the input will be
+        // reshaped in a single round. The incremental approach allows us to be space
+        // efficient (avoiding a 100% overhead) as we will incrementally replace input
+        // SSTables from maintenance set by output ones into main set.

        compaction::table_state& t = *_compacting_table;
-        const auto& maintenance_sstables = t.maintenance_sstable_set();

-        const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all());
-        std::vector<sstables::shared_sstable> reshape_candidates = old_sstables;
-        std::vector<sstables::shared_sstable> sstables_to_remove;
-        std::unordered_set<sstables::shared_sstable> new_unused_sstables;
-
-        auto cleanup_new_unused_sstables_on_failure = defer([&new_unused_sstables] {
-            for (auto& sst : new_unused_sstables) {
-                sst->mark_for_deletion();
-            }
-        });
+        // Filter out sstables that require view building, to avoid a race between off-strategy
+        // and view building. Refs: #11882
+        auto get_reshape_candidates = [&t] () {
+            auto maintenance_ssts = t.maintenance_sstable_set().all();
+            return boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_ssts
+                | boost::adaptors::filtered([](const sstables::shared_sstable& sst) {
+                        return !sst->requires_view_building();
+                }));
+        };

        auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
            auto& iop = service::get_local_streaming_priority(); // run reshape in maintenance mode
-            auto desc = t.get_compaction_strategy().get_reshaping_job(reshape_candidates, t.schema(), iop, sstables::reshape_mode::strict);
+            auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), iop, sstables::reshape_mode::strict);
            return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
        };

+        std::exception_ptr err;
        while (auto desc = get_next_job()) {
-            desc->creator = [this, &new_unused_sstables, &t] (shard_id dummy) {
-                auto sst = t.make_sstable();
-                new_unused_sstables.insert(sst);
-                return sst;
-            };
-            auto input = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(desc->sstables);
+            auto compacting = compacting_sstable_registration(_cm, desc->sstables);
+            auto on_replace = compacting.update_on_sstable_replacement();

-            auto ret = co_await sstables::compact_sstables(std::move(*desc), cdata, t);
-            _performed = true;
-
-            // update list of reshape candidates without input but with output added to it
-            auto it = boost::remove_if(reshape_candidates, [&] (auto& s) { return input.contains(s); });
-            reshape_candidates.erase(it, reshape_candidates.end());
-            std::move(ret.new_sstables.begin(), ret.new_sstables.end(), std::back_inserter(reshape_candidates));
-
-            // If compaction strategy is unable to reshape input data in a single round, it may happen that a SSTable A
-            // created in round 1 will be compacted in a next round producing SSTable B. As SSTable A is no longer needed,
-            // it can be removed immediately. Let's remove all such SSTables immediately to reduce off-strategy space requirement.
-            // Input SSTables from maintenance set can only be removed later, as SSTable sets are only updated on completion.
-            auto can_remove_now = [&] (const sstables::shared_sstable& s) { return new_unused_sstables.contains(s); };
-            for (auto&& sst : input) {
-                if (can_remove_now(sst)) {
-                    co_await sst->unlink();
-                    new_unused_sstables.erase(std::move(sst));
-                } else {
-                    sstables_to_remove.push_back(std::move(sst));
-                }
+            try {
+                sstables::compaction_result _ = co_await compact_sstables(std::move(*desc), _compaction_data, on_replace,
+                                                                          compaction_manager::can_purge_tombstones::no,
+                                                                          sstables::offstrategy::yes);
+            } catch (sstables::compaction_stopped_exception&) {
+                // If off-strategy compaction stopped on user request, let's not discard the partial work.
+                // Therefore, both un-reshaped and reshaped data will be integrated into main set, allowing
+                // regular compaction to continue from where off-strategy left off.
+                err = std::current_exception();
+                break;
            }
+            _performed = true;
        }

-        // at this moment reshape_candidates contains a set of sstables ready for integration into main set
-        auto completion_desc = sstables::compaction_completion_desc{
-            .old_sstables = std::move(old_sstables),
-            .new_sstables = std::move(reshape_candidates)
-        };
-        co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
+        // There might be some remaining sstables in maintenance set that didn't require reshape, or the
+        // user has aborted off-strategy. All we can do is integrate them into the main set, such that
+        // they become candidates for regular compaction. We cannot hold them forever in maintenance set,
+        // as that causes read and space amplification issues.
+        if (auto sstables = get_reshape_candidates(); sstables.size()) {
+            auto completion_desc = sstables::compaction_completion_desc{
+                .old_sstables = sstables, // removes from maintenance set.
+                .new_sstables = sstables, // adds into main set.
+            };
+            co_await t.on_compaction_completion(std::move(completion_desc), sstables::offstrategy::yes);
+        }

-        cleanup_new_unused_sstables_on_failure.cancel();
-        // By marking input sstables for deletion instead, the ones which require view building will stay in the staging
-        // directory until they're moved to the main dir when the time comes. Also, that allows view building to resume
-        // on restart if there's a crash midway.
-        for (auto& sst : sstables_to_remove) {
-            sst->mark_for_deletion();
+        if (err) {
+            co_await coroutine::return_exception_ptr(std::move(err));
        }
    }
 protected:
@@ -1147,9 +1190,11 @@ protected:
            std::exception_ptr ex;
            try {
                compaction::table_state& t = *_compacting_table;
-                auto maintenance_sstables = t.maintenance_sstable_set().all();
-                cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
-                        t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                {
+                    auto maintenance_sstables = t.maintenance_sstable_set().all();
+                    cmlog.info("Starting off-strategy compaction for {}.{}, {} candidates were found",
+                               t.schema()->ks_name(), t.schema()->cf_name(), maintenance_sstables->size());
+                }
                co_await run_offstrategy_compaction(_compaction_data);
                finish_compaction();
                cmlog.info("Done with off-strategy compaction for {}.{}", t.schema()->ks_name(), t.schema()->cf_name());
@@ -1222,9 +1267,7 @@ private:
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, _options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
-            auto release_exhausted = [this] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
-                _compacting.release_compacting(exhausted_sstables);
-            };
+            auto on_replace = _compacting.update_on_sstable_replacement();

            setup_new_compaction(descriptor.run_identifier);

@@ -1233,7 +1276,7 @@ private:

            std::exception_ptr ex;
            try {
-                sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, std::move(release_exhausted), _can_purge);
+                sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace, _can_purge);
                finish_compaction();
                _cm.reevaluate_postponed_compactions();
                co_return res;  // done with current sstable
@@ -1390,14 +1433,26 @@ protected:
        co_return std::nullopt;
    }
 private:
-    // Releases reference to cleaned files such that respective used disk space can be freed.
-    void release_exhausted(std::vector<sstables::shared_sstable> exhausted_sstables) {
-        _compacting.release_compacting(exhausted_sstables);
-    }
-
    future<> run_cleanup_job(sstables::compaction_descriptor descriptor) {
        co_await coroutine::switch_to(_cm.compaction_sg().cpu);

+        // Releases reference to cleaned files such that respective used disk space can be freed.
+        using update_registration = compacting_sstable_registration::update_me;
+        class release_exhausted : public update_registration {
+            sstables::compaction_descriptor& _desc;
+        public:
+            release_exhausted(compacting_sstable_registration& registration, sstables::compaction_descriptor& desc)
+                : update_registration{registration}
+                , _desc{desc} {}
+            void on_removal(const std::vector<sstables::shared_sstable>& sstables) override {
+                auto exhausted = boost::copy_range<std::unordered_set<sstables::shared_sstable>>(sstables);
+                std::erase_if(_desc.sstables, [&] (const sstables::shared_sstable& sst) {
+                    return exhausted.contains(sst);
+                });
+                update_registration::on_removal(sstables);
+            }
+        };
+        release_exhausted on_replace{_compacting, descriptor};
        for (;;) {
            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
            _cm.register_backlog_tracker(user_initiated);
@@ -1405,8 +1460,7 @@ private:
            std::exception_ptr ex;
            try {
                setup_new_compaction(descriptor.run_identifier);
-                co_await compact_sstables_and_update_history(descriptor, _compaction_data,
-                                          std::bind(&cleanup_sstables_compaction_task::release_exhausted, this, std::placeholders::_1));
+                co_await compact_sstables_and_update_history(descriptor, _compaction_data, on_replace);
                finish_compaction();
                _cm.reevaluate_postponed_compactions();
                co_return;  // done with current job
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -29,6 +29,7 @@
 #include "compaction.hh"
 #include "compaction_weight_registration.hh"
 #include "compaction_backlog_manager.hh"
+#include "compaction/compaction_descriptor.hh"
 #include "strategy_control.hh"
 #include "backlog_controller.hh"
 #include "seastarx.hh"
@@ -36,6 +37,8 @@

 class compacting_sstable_registration;

+using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
+
 // Compaction manager provides facilities to submit and track compaction jobs on
 // behalf of existing tables.
 class compaction_manager {
@@ -122,11 +125,20 @@ public:

        virtual ~task();

+        // called when a compaction replaces the exhausted sstables with the new set
+        struct on_replacement {
+            virtual ~on_replacement() {}
+            // called after the replacement completes
+            // @param sstables the old sstables which are replaced in this replacement
+            virtual void on_removal(const std::vector<sstables::shared_sstable>& sstables) = 0;
+            // called before the replacement happens
+            // @param sstables the new sstables to be added to the table's sstable set
+            virtual void on_addition(const std::vector<sstables::shared_sstable>& sstables) = 0;
+        };
+
    protected:
        virtual future<compaction_stats_opt> do_run() = 0;

-        using throw_if_stopping = bool_class<struct throw_if_stopping_tag>;
-
        state switch_state(state new_state);

        future<semaphore_units<named_semaphore_exception_factory>> acquire_semaphore(named_semaphore& sem, size_t units = 1);
@@ -143,12 +155,10 @@ public:
        // otherwise, returns stop_iteration::no after sleep for exponential retry.
        future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);

-        // Compacts set of SSTables according to the descriptor.
-        using release_exhausted_func_t = std::function<void(const std::vector<sstables::shared_sstable>& exhausted_sstables)>;
-        future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
-                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
-        future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, release_exhausted_func_t release_exhausted,
+        future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
                                  can_purge_tombstones can_purge = can_purge_tombstones::yes);
+        future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
+                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstables::offstrategy offstrategy = sstables::offstrategy::no);
        future<> update_history(compaction::table_state& t, const sstables::compaction_result& res, const sstables::compaction_data& cdata);
        bool should_update_history(sstables::compaction_type ct) {
            return ct == sstables::compaction_type::Compaction;
@@ -279,10 +289,10 @@ private:
    std::function<void()> compaction_submission_callback();
    // all registered tables are reevaluated at a constant interval.
    // Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
-    timer<lowres_clock> _compaction_submission_timer = timer<lowres_clock>(compaction_submission_callback());
    static constexpr std::chrono::seconds periodic_compaction_submission_interval() { return std::chrono::seconds(3600); }

    config _cfg;
+    timer<lowres_clock> _compaction_submission_timer;
    compaction_controller _compaction_controller;
    compaction_backlog_manager _backlog_manager;
    optimized_optional<abort_source::subscription> _early_abort_subscription;
@@ -295,7 +305,7 @@ private:
    class strategy_control;
    std::unique_ptr<strategy_control> _strategy_control;
 private:
-    future<compaction_stats_opt> perform_task(shared_ptr<task>);
+    future<compaction_stats_opt> perform_task(shared_ptr<task>, throw_if_stopping do_throw_if_stopping = throw_if_stopping::no);

    future<> stop_tasks(std::vector<shared_ptr<task>> tasks, sstring reason);
    future<> update_throughput(uint32_t value_mbs);
@@ -330,7 +340,7 @@ private:
    // table still exists and compaction is not disabled for the table.
    inline bool can_proceed(compaction::table_state* t) const;

-    void postponed_compactions_reevaluation();
+    future<> postponed_compactions_reevaluation();
    void reevaluate_postponed_compactions() noexcept;
    // Postpone compaction for a table that couldn't be executed due to ongoing
    // similar-sized compaction.
@@ -440,7 +450,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&)> job, throw_if_stopping do_throw_if_stopping);

    class compaction_reenabler {
        compaction_manager& _cm;
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -409,7 +409,9 @@ public:
                l0_old_ssts.push_back(std::move(sst));
            }
        }
-        _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
+        if (l0_old_ssts.size() || l0_new_ssts.size()) {
+            _l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
+        }
    }
 };

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -144,6 +144,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    auto max_sstable_size_in_bytes = _max_sstable_size_in_mb * 1024 * 1024;

+    leveled_manifest::logger.debug("get_reshaping_job: mode={} input.size={} max_sstable_size_in_bytes={}", mode == reshape_mode::relaxed ? "relaxed" : "strict", input.size(), max_sstable_size_in_bytes);
+
    for (auto& sst : input) {
        auto sst_level = sst->get_sstable_level();
        if (sst_level > leveled_manifest::MAX_LEVELS - 1) {
@@ -200,10 +202,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

        auto [disjoint, overlapping_sstables] = is_disjoint(level_info[level], tolerance(level));
        if (!disjoint) {
-            auto ideal_level = ideal_level_for_input(input, max_sstable_size_in_bytes);
-            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so compacting everything on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
-            // Unfortunately no good limit to limit input size to max_sstables for LCS major
-            compaction_descriptor desc(std::move(input), iop, ideal_level, max_sstable_size_in_bytes);
+            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, found {} overlapping SSTables, so the level will be entirely compacted on behalf of {}.{}", level, overlapping_sstables, schema->ks_name(), schema->cf_name());
+            compaction_descriptor desc(std::move(level_info[level]), iop, level, max_sstable_size_in_bytes);
            desc.options = compaction_type_options::make_reshape();
            return desc;
        }
@@ -229,6 +229,9 @@ leveled_compaction_strategy::get_cleanup_compaction_jobs(table_state& table_s, s
 }

 unsigned leveled_compaction_strategy::ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size) {
+    if (!max_sstable_size) {
+        return 1;
+    }
    auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
        double inv_log_fanout = 1.0f / std::log(fanout);
        return log(x) * inv_log_fanout;
--- a/compatible_ring_position.hh
+++ b/compatible_ring_position.hh
@@ -10,88 +10,6 @@
 #pragma once

 #include "dht/i_partitioner.hh"
-#include <optional>
-#include <variant>
-
-// Wraps ring_position_view so it is compatible with old-style C++: default
-// constructor, stateless comparators, yada yada.
-class compatible_ring_position_view {
-    const ::schema* _schema = nullptr;
-    // Optional to supply a default constructor, no more.
-    std::optional<dht::ring_position_view> _rpv;
-public:
-    constexpr compatible_ring_position_view() = default;
-    compatible_ring_position_view(const schema& s, dht::ring_position_view rpv)
-        : _schema(&s), _rpv(rpv) {
-    }
-    const dht::ring_position_view& position() const {
-        return *_rpv;
-    }
-    const ::schema& schema() const {
-        return *_schema;
-    }
-    friend std::strong_ordering tri_compare(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return dht::ring_position_tri_compare(*x._schema, *x._rpv, *y._rpv);
-    }
-    friend bool operator<(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) < 0;
-    }
-    friend bool operator<=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) <= 0;
-    }
-    friend bool operator>(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) > 0;
-    }
-    friend bool operator>=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) >= 0;
-    }
-    friend bool operator==(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) == 0;
-    }
-    friend bool operator!=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
-        return tri_compare(x, y) != 0;
-    }
-};
-
-// Wraps ring_position so it is compatible with old-style C++: default
-// constructor, stateless comparators, yada yada.
-class compatible_ring_position {
-    schema_ptr _schema;
-    // Optional to supply a default constructor, no more.
-    std::optional<dht::ring_position> _rp;
-public:
-    constexpr compatible_ring_position() = default;
-    compatible_ring_position(schema_ptr s, dht::ring_position rp)
-        : _schema(std::move(s)), _rp(std::move(rp)) {
-    }
-    dht::ring_position_view position() const {
-        return *_rp;
-    }
-    const ::schema& schema() const {
-        return *_schema;
-    }
-    friend std::strong_ordering tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return dht::ring_position_tri_compare(*x._schema, *x._rp, *y._rp);
-    }
-    friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) < 0;
-    }
-    friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) <= 0;
-    }
-    friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) > 0;
-    }
-    friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) >= 0;
-    }
-    friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) == 0;
-    }
-    friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) != 0;
-    }
-};

 // Wraps ring_position or ring_position_view so either is compatible with old-style C++: default
 // constructor, stateless comparators, yada yada.
@@ -99,37 +17,22 @@ public:
 // on callers to keep ring position alive, allow lookup on containers that don't support different
 // key types, and also avoiding unnecessary copies.
 class compatible_ring_position_or_view {
-    // Optional to supply a default constructor, no more.
-    std::optional<std::variant<compatible_ring_position, compatible_ring_position_view>> _crp_or_view;
+    schema_ptr _schema;
+    lw_shared_ptr<dht::ring_position> _rp;
+    dht::ring_position_view_opt _rpv; // optional only for default ctor, nothing more
 public:
-    constexpr compatible_ring_position_or_view() = default;
+    compatible_ring_position_or_view() = default;
    explicit compatible_ring_position_or_view(schema_ptr s, dht::ring_position rp)
-        : _crp_or_view(compatible_ring_position(std::move(s), std::move(rp))) {
+        : _schema(std::move(s)), _rp(make_lw_shared<dht::ring_position>(std::move(rp))), _rpv(dht::ring_position_view(*_rp)) {
    }
    explicit compatible_ring_position_or_view(const schema& s, dht::ring_position_view rpv)
-        : _crp_or_view(compatible_ring_position_view(s, rpv)) {
+        : _schema(s.shared_from_this()), _rpv(rpv) {
    }
-    dht::ring_position_view position() const {
-        struct rpv_accessor {
-            dht::ring_position_view operator()(const compatible_ring_position& crp) {
-                return crp.position();
-            }
-            dht::ring_position_view operator()(const compatible_ring_position_view& crpv) {
-                return crpv.position();
-            }
-        };
-        return std::visit(rpv_accessor{}, *_crp_or_view);
+    const dht::ring_position_view& position() const {
+        return *_rpv;
    }
    friend std::strong_ordering tri_compare(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
-        struct schema_accessor {
-            const ::schema& operator()(const compatible_ring_position& crp) {
-                return crp.schema();
-            }
-            const ::schema& operator()(const compatible_ring_position_view& crpv) {
-                return crpv.schema();
-            }
-        };
-        return dht::ring_position_tri_compare(std::visit(schema_accessor{}, *x._crp_or_view), x.position(), y.position());
+        return dht::ring_position_tri_compare(*x._schema, x.position(), y.position());
    }
    friend bool operator<(const compatible_ring_position_or_view& x, const compatible_ring_position_or_view& y) {
        return tri_compare(x, y) < 0;
--- a/configure.py
+++ b/configure.py
@@ -630,6 +630,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
                        help='Link libyaml-cpp statically')
 arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
                        help='Enable(1)/disable(0)compiler debug information generation for tests')
+arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
+                        help='Enable(1)/disable(0)compiler debug information generation for perf tests')
 arg_parser.add_argument('--python', action='store', dest='python', default='python3',
                        help='Python3 path')
 arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1423,6 +1425,7 @@ linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
+perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'

 # Strip if debuginfo is disabled, otherwise we end up with partial
 # debug info from the libraries we static link with
@@ -1954,7 +1957,8 @@ with open(buildfile, 'w') as f:
                    # So we strip the tests by default; The user can very
                    # quickly re-link the test unstripped by adding a "_g"
                    # to the test name, e.g., "ninja build/release/testname_g"
-                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+                    link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
+                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
                    f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
@@ -2070,7 +2074,8 @@ with open(buildfile, 'w') as f:
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
                if cc.endswith('Parser.cpp'):
                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1'
+                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -460,8 +460,7 @@ orderByClause[raw::select_statement::parameters::orderings_type& orderings]
    ;

 jsonValue returns [expression value]
-    :
-    | s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
+    : s=STRING_LITERAL { $value = untyped_constant{untyped_constant::string, $s.text}; }
    | m=marker         { $value = std::move(m); }
    ;

@@ -1396,7 +1395,7 @@ serviceLevelOrRoleName returns [sstring name]
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
 | t=STRING_LITERAL     { $name = sstring($t.text); }
 | t=QUOTED_NAME        { $name = sstring($t.text); }
-| k=unreserved_keyword { $name = sstring($t.text); 
+| k=unreserved_keyword { $name = k;
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
 | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
 ;
@@ -1500,8 +1499,7 @@ marker returns [expression value]
    ;

 intValue returns [expression value]
-    :
-    | t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
+    : t=INTEGER     { $value = untyped_constant{untyped_constant::integer, $t.text}; }
    | e=marker      { $value = std::move(e); }
    ;

--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -10,6 +10,7 @@

 #include "cql3/attributes.hh"
 #include "cql3/column_identifier.hh"
+#include <optional>

 namespace cql3 {

@@ -56,16 +57,16 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    }
 }

-int32_t attributes::get_time_to_live(const query_options& options) {
+std::optional<int32_t> attributes::get_time_to_live(const query_options& options) {
    if (!_time_to_live.has_value())
-        return 0;
+        return std::nullopt;

    cql3::raw_value tval = expr::evaluate(*_time_to_live, options);
    if (tval.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value of TTL");
    }
    if (tval.is_unset_value()) {
-        return 0;
+        return std::nullopt;
    }

    int32_t ttl;
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -42,7 +42,7 @@ public:

    int64_t get_timestamp(int64_t now, const query_options& options);

-    int32_t get_time_to_live(const query_options& options);
+    std::optional<int32_t> get_time_to_live(const query_options& options);

    db::timeout_clock::duration get_timeout(const query_options& options) const;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -1457,7 +1457,7 @@ expression search_and_replace(const expression& e,
                    };
                },
                [&] (const binary_operator& oper) -> expression {
-                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
                },
                [&] (const column_mutation_attribute& cma) -> expression {
                    return column_mutation_attribute{cma.kind, recurse(cma.column)};
--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -165,8 +165,6 @@ static data_value castas_fctn_from_dv_to_string(data_value from) {
    return from.type()->to_string_impl(from);
 }

-// FIXME: Add conversions for counters, after they are fully implemented...
-
 static constexpr unsigned next_power_of_2(unsigned val) {
    unsigned ret = 1;
    while (ret <= val) {
@@ -370,6 +368,26 @@ castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
        return castas_fctn_from_dv_to_string;
    case cast_switch_case_val(kind::utf8, kind::ascii):
        return castas_fctn_simple<sstring, sstring>;
+
+    case cast_switch_case_val(kind::byte, kind::counter):
+        return castas_fctn_simple<int8_t, int64_t>;
+    case cast_switch_case_val(kind::short_kind, kind::counter):
+        return castas_fctn_simple<int16_t, int64_t>;
+    case cast_switch_case_val(kind::int32, kind::counter):
+        return castas_fctn_simple<int32_t, int64_t>;
+    case cast_switch_case_val(kind::long_kind, kind::counter):
+        return castas_fctn_simple<int64_t, int64_t>;
+    case cast_switch_case_val(kind::float_kind, kind::counter):
+        return castas_fctn_simple<float, int64_t>;
+    case cast_switch_case_val(kind::double_kind, kind::counter):
+        return castas_fctn_simple<double, int64_t>;
+    case cast_switch_case_val(kind::varint, kind::counter):
+        return castas_fctn_simple<utils::multiprecision_int, int64_t>;
+    case cast_switch_case_val(kind::decimal, kind::counter):
+        return castas_fctn_from_integer_to_decimal<int64_t>;
+    case cast_switch_case_val(kind::ascii, kind::counter):
+    case cast_switch_case_val(kind::utf8, kind::counter):
+        return castas_fctn_to_string<int64_t>;
    }
    throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
 }
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -94,11 +94,6 @@ functions::init() noexcept {
        if (type == cql3_type::blob) {
            continue;
        }
-        // counters are not supported yet
-        if (type.is_counter()) {
-            warn(unimplemented::cause::COUNTERS);
-            continue;
-        }

        declare(make_to_blob_function(type.get_type()));
        declare(make_from_blob_function(type.get_type()));
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -22,6 +22,7 @@
 #include "db/config.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "hashers.hh"
+#include "utils/error_injection.hh"

 namespace cql3 {

@@ -599,6 +600,14 @@ query_processor::get_statement(const sstring_view& query, const service::client_
 std::unique_ptr<raw::parsed_statement>
 query_processor::parse_statement(const sstring_view& query) {
    try {
+        {
+            const char* error_injection_key = "query_processor-parse_statement-test_failure";
+            utils::get_local_injector().inject(error_injection_key, [&]() {
+                if (query.find(error_injection_key) != sstring_view::npos) {
+                    throw std::runtime_error(error_injection_key);
+                }
+            });
+        }
        auto statement = util::do_with_parser(query,  std::mem_fn(&cql3_parser::CqlParser::query));
        if (!statement) {
            throw exceptions::syntax_exception("Parsing failed");
--- a/cql3/selection/field_selector.hh
+++ b/cql3/selection/field_selector.hh
@@ -81,7 +81,7 @@ public:

    virtual sstring assignment_testable_source_context() const override {
        auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
        return format("{}.{}", _selected, sname);
    }

--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -435,7 +435,7 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
        clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
        // FIXME: push to upper layer so it happens once per row
        auto static_and_regular_columns = expr::get_non_pk_values(selection, static_row, row);
-        return expr::is_satisfied_by(
+        bool multi_col_clustering_satisfied = expr::is_satisfied_by(
                clustering_columns_restrictions,
                expr::evaluation_inputs{
                    .partition_key = &partition_key,
@@ -444,6 +444,9 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
                    .selection = &selection,
                    .options = &_options,
                });
+        if (!multi_col_clustering_satisfied) {
+            return false;
+        }
    }

    auto static_row_iterator = static_row.iterator();
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -261,6 +261,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    if (options.getSerialConsistency() == null)
        throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+    for (size_t i = 0; i < _statements.size(); ++i) {
+        _statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
+    }
+
    if (_has_conditions) {
        ++_stats.cas_batches;
        _stats.statements_in_cas_batches += _statements.size();
--- a/cql3/statements/cas_request.cc
+++ b/cql3/statements/cas_request.cc
@@ -119,6 +119,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu

 const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
    static const clustering_key empty_ckey = clustering_key::make_empty();
+    if (_key.empty()) {
+        throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
+    }
    const partition_key& pkey = _key.front().start()->value().key().value();
    // We must ignore statement clustering column restriction when
    // choosing a row to check the conditions. If there is no
@@ -130,6 +133,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
    //   CREATE TABLE t(p int, c int, s int static, v int, PRIMARY KEY(p, c));
    //   INSERT INTO t(p, s) VALUES(1, 1);
    //   UPDATE t SET v=1 WHERE p=1 AND c=1 IF s=1;
+    if (op.ranges.empty()) {
+        throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
+    }
    const clustering_key& ckey = op.ranges.front().start() ?  op.ranges.front().start()->value() : empty_ckey;
    auto row = _rows.find_row(pkey, ckey);
    if (row == nullptr && !ckey.is_empty() &&
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -20,6 +20,7 @@
 #include "tombstone_gc.hh"
 #include "db/per_partition_rate_limit_extension.hh"
 #include "db/per_partition_rate_limit_options.hh"
+#include "utils/bloom_calculations.hh"

 #include <boost/algorithm/string/predicate.hpp>

@@ -152,6 +153,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
        throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
    }

+    if (get_simple(KW_BF_FP_CHANCE)) {
+        double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
+        double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
+        if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
+            throw exceptions::configuration_exception(format(
+                "{} must be larger than {} and less than or equal to 1.0 (got {})",
+                KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
+        }
+    }
+
    speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
 }

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -15,6 +15,7 @@
 #include "cql3/util.hh"
 #include "validation.hh"
 #include "db/consistency_level_validations.hh"
+#include <optional>
 #include <seastar/core/shared_ptr.hh>
 #include <boost/range/adaptor/transformed.hpp>
 #include <boost/range/adaptor/map.hpp>
@@ -92,8 +93,9 @@ bool modification_statement::is_timestamp_set() const {
    return attrs->is_timestamp_set();
 }

-gc_clock::duration modification_statement::get_time_to_live(const query_options& options) const {
-    return gc_clock::duration(attrs->get_time_to_live(options));
+std::optional<gc_clock::duration> modification_statement::get_time_to_live(const query_options& options) const {
+    std::optional<int32_t> ttl = attrs->get_time_to_live(options);
+    return ttl ? std::make_optional<gc_clock::duration>(*ttl) : std::nullopt;
 }

 future<> modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
@@ -109,9 +111,6 @@ future<> modification_statement::check_access(query_processor& qp, const service

 future<std::vector<mutation>>
 modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
-    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
-        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
-    }
    auto cl = options.get_consistency();
    auto json_cache = maybe_prepare_json_cache(options);
    auto keys = build_partition_keys(options, json_cache);
@@ -250,6 +249,12 @@ modification_statement::execute_without_checking_exception_message(query_process
    return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
 }

+void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
+    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
+        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
+    }
+}
+
 future<::shared_ptr<cql_transport::messages::result_message>>
 modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
    if (has_conditions() && options.get_protocol_version() == 1) {
@@ -260,6 +265,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs

    inc_cql_stats(qs.get_client_state().is_internal());

+    validate_primary_key_restrictions(options);
+
    if (has_conditions()) {
        return execute_with_condition(qp, qs, options);
    }
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -128,7 +128,7 @@ public:

    bool is_timestamp_set() const;

-    gc_clock::duration get_time_to_live(const query_options& options) const;
+    std::optional<gc_clock::duration> get_time_to_live(const query_options& options) const;

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

@@ -229,6 +229,8 @@ public:
    // True if this statement needs to read only static column values to check if it can be applied.
    bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }

+    void validate_primary_key_restrictions(const query_options& options) const;
+
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -120,7 +120,10 @@ schema_altering_statement::execute(query_processor& qp, service::query_state& st
    }

    return execute0(qp, state, options).then([this, &state, internal](::shared_ptr<messages::result_message> result) {
-        auto permissions_granted_fut = internal
+        // We don't want to grant the permissions to the supposed creator even if the statement succeeded if it's an internal query
+        // or if the query did not actually create the item, i.e. the query is bounced to another shard or it's an IF NOT EXISTS
+        // query where the item already exists.
+        auto permissions_granted_fut = internal || !result->is_schema_change()
                ? make_ready_future<>()
                : grant_permissions_to_creator(state.get_client_state());
        return permissions_granted_fut.then([result = std::move(result)] {
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -1499,7 +1499,7 @@ parallelized_select_statement::do_execute(

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
    auto timeout_duration = get_timeout(state.get_client_state(), options);
-    auto timeout = db::timeout_clock::now() + timeout_duration;
+    auto timeout = lowres_system_clock::now() + timeout_duration;
    auto reductions = _selection->get_reductions();

    query::forward_request req = {
@@ -1571,11 +1571,16 @@ void select_statement::maybe_jsonize_select_clause(data_dictionary::database db,
        std::vector<data_type> selector_types;
        std::vector<const column_definition*> defs;
        selector_names.reserve(_select_clause.size());
+        selector_types.reserve(_select_clause.size());
        auto selectables = selection::raw_selector::to_selectables(_select_clause, *schema);
        selection::selector_factories factories(selection::raw_selector::to_selectables(_select_clause, *schema), db, schema, defs);
        auto selectors = factories.new_instances();
        for (size_t i = 0; i < selectors.size(); ++i) {
-            selector_names.push_back(selectables[i]->to_string());
+            if (_select_clause[i]->alias) {
+                selector_names.push_back(_select_clause[i]->alias->to_string());
+            } else {
+                selector_names.push_back(selectables[i]->to_string());
+            }
            selector_types.push_back(selectors[i]->get_type());
        }

--- a/cql3/update_parameters.hh
+++ b/cql3/update_parameters.hh
@@ -93,7 +93,7 @@ public:
    };
    // Note: value (mutation) only required to contain the rows we are interested in
 private:
-    const gc_clock::duration _ttl;
+    const std::optional<gc_clock::duration> _ttl;
    // For operations that require a read-before-write, stores prefetched cell values.
    // For CAS statements, stores values of conditioned columns.
    // Is a reference to an outside prefetch_data container since a CAS BATCH statement
@@ -106,7 +106,7 @@ public:
    const query_options& _options;

    update_parameters(const schema_ptr schema_, const query_options& options,
-            api::timestamp_type timestamp, gc_clock::duration ttl, const prefetch_data& prefetched)
+            api::timestamp_type timestamp, std::optional<gc_clock::duration> ttl, const prefetch_data& prefetched)
        : _ttl(ttl)
        , _prefetched(prefetched)
        , _timestamp(timestamp)
@@ -127,11 +127,7 @@ public:
    }

    atomic_cell make_cell(const abstract_type& type, const raw_value_view& value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
-        auto ttl = _ttl;
-
-        if (ttl.count() <= 0) {
-            ttl = _schema->default_time_to_live();
-        }
+        auto ttl = this->ttl();

        return value.with_value([&] (const FragmentedView auto& v) {
            if (ttl.count() > 0) {
@@ -143,11 +139,7 @@ public:
    };

    atomic_cell make_cell(const abstract_type& type, const managed_bytes_view& value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
-        auto ttl = _ttl;
-
-        if (ttl.count() <= 0) {
-            ttl = _schema->default_time_to_live();
-        }
+        auto ttl = this->ttl();

        if (ttl.count() > 0) {
            return atomic_cell::make_live(type, _timestamp, value, _local_deletion_time + ttl, ttl, cm);
@@ -169,7 +161,7 @@ public:
    }

    gc_clock::duration ttl() const {
-        return _ttl.count() > 0 ? _ttl : _schema->default_time_to_live();
+        return _ttl.value_or(_schema->default_time_to_live());
    }

    gc_clock::time_point expiry() const {
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -204,7 +204,7 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
                        std::move(strategy_options),
                        durable_writes,
                        std::move(cf_defs),
-                        user_types_metadata{},
+                        std::move(user_types),
                        storage_options{}) { }

 keyspace_metadata::keyspace_metadata(std::string_view name,
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -2031,7 +2031,7 @@ future<> db::commitlog::segment_manager::shutdown() {
        }
    }
    co_await _shutdown_promise->get_shared_future();
-    clogger.info("Commitlog shutdown complete");
+    clogger.debug("Commitlog shutdown complete");
 }

 void db::commitlog::segment_manager::add_file_to_dispose(named_file f, dispose_mode mode) {
@@ -2094,6 +2094,9 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
    clogger.debug("Discarding segments {}", ftd);

    for (auto& [f, mode] : ftd) {
+        // `f.remove_file()` resets known_size to 0, so remember the size here,
+        // in order to subtract it from total_size_on_disk accurately.
+        size_t size = f.known_size();
        try {
            if (f) {
                co_await f.close();
@@ -2110,7 +2113,6 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
                }
            }

-            auto size = f.known_size();
            auto usage = totals.total_size_on_disk;
            auto next_usage = usage - size;

@@ -2144,7 +2146,7 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
        // or had such an exception that we consider the file dead
        // anyway. In either case we _remove_ the file size from
        // footprint, because it is no longer our problem.
-        totals.total_size_on_disk -= f.known_size();
+        totals.total_size_on_disk -= size;
    }

    // #8376 - if we had an error in recycling (disk rename?), and no elements
--- a/db/config.cc
+++ b/db/config.cc
@@ -821,6 +821,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"mc", "md", "me"})
+    , table_digest_insensitive_to_expiry(this, "table_digest_insensitive_to_expiry", liveness::MustRestart, value_status::Used, true,
+            "When enabled, per-table schema digest calculation ignores empty partitions.")
    , enable_dangerous_direct_import_of_cassandra_counters(this, "enable_dangerous_direct_import_of_cassandra_counters", value_status::Used, false, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1."
        " It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.")
    , enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance")
@@ -899,6 +901,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Ignore truncation record stored in system tables as if tables were never truncated.")
    , force_schema_commit_log(this, "force_schema_commit_log", value_status::Used, false,
        "Use separate schema commit log unconditionally rater than after restart following discovery of cluster-wide support for it.")
+    , nodeops_watchdog_timeout_seconds(this, "nodeops_watchdog_timeout_seconds", liveness::LiveUpdate, value_status::Used, 120, "Time in seconds after which node operations abort when not hearing from the coordinator")
+    , nodeops_heartbeat_interval_seconds(this, "nodeops_heartbeat_interval_seconds", liveness::LiveUpdate, value_status::Used, 10, "Period of heartbeat ticks in node operations")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
--- a/db/config.hh
+++ b/db/config.hh
@@ -326,6 +326,7 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
+    named_value<bool> table_digest_insensitive_to_expiry;
    named_value<bool> enable_dangerous_direct_import_of_cassandra_counters;
    named_value<bool> enable_shard_aware_drivers;
    named_value<bool> enable_ipv6_dns_lookup;
@@ -379,6 +380,11 @@ public:
    named_value<bool> ignore_truncation_record;
    named_value<bool> force_schema_commit_log;

+    named_value<uint32_t> nodeops_watchdog_timeout_seconds;
+    named_value<uint32_t> nodeops_heartbeat_interval_seconds;
+
+    named_value<bool> cache_index_pages;
+
    seastar::logging_settings logging_settings(const log_cli::options&) const;

    const db::extensions& extensions() const;
--- a/db/schema_features.hh
+++ b/db/schema_features.hh
@@ -24,6 +24,10 @@ enum class schema_feature {
    PER_TABLE_PARTITIONERS,
    SCYLLA_KEYSPACES,
    SCYLLA_AGGREGATES,
+
+    // When enabled, schema_mutations::digest() will skip empty mutations (with only tombstones),
+    // so that the digest remains the same after schema tables are compacted.
+    TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
 };

 using schema_features = enum_set<super_enum<schema_feature,
@@ -33,7 +37,8 @@ using schema_features = enum_set<super_enum<schema_feature,
    schema_feature::CDC_OPTIONS,
    schema_feature::PER_TABLE_PARTITIONERS,
    schema_feature::SCYLLA_KEYSPACES,
-    schema_feature::SCYLLA_AGGREGATES
+    schema_feature::SCYLLA_AGGREGATES,
+    schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY
    >>;

 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -89,15 +89,18 @@ static bool is_extra_durable(const sstring& ks_name, const sstring& cf_name) {
 /** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
 namespace db {

-schema_ctxt::schema_ctxt(const db::config& cfg, std::shared_ptr<data_dictionary::user_types_storage> uts)
-    : _extensions(cfg.extensions())
+schema_ctxt::schema_ctxt(const db::config& cfg, std::shared_ptr<data_dictionary::user_types_storage> uts, 
+                         const gms::feature_service& features, replica::database* db)
+    : _db(db)
+    , _features(features)
+    , _extensions(cfg.extensions())
    , _murmur3_partitioner_ignore_msb_bits(cfg.murmur3_partitioner_ignore_msb_bits())
    , _schema_registry_grace_period(cfg.schema_registry_grace_period())
    , _user_types(std::move(uts))
 {}

-schema_ctxt::schema_ctxt(const replica::database& db)
-    : schema_ctxt(db.get_config(), db.as_user_types_storage())
+schema_ctxt::schema_ctxt(replica::database& db)
+    : schema_ctxt(db.get_config(), db.as_user_types_storage(), db.features(), &db)
 {}

 schema_ctxt::schema_ctxt(distributed<replica::database>& db)
@@ -144,7 +147,8 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
    std::map<utils::UUID, schema_mutations>&& tables_before,
    std::map<utils::UUID, schema_mutations>&& tables_after,
    std::map<utils::UUID, schema_mutations>&& views_before,
-    std::map<utils::UUID, schema_mutations>&& views_after);
+    std::map<utils::UUID, schema_mutations>&& views_after,
+    bool reload);

 struct [[nodiscard]] user_types_to_drop final {
    seastar::noncopyable_function<future<> ()> drop;
@@ -157,7 +161,7 @@ static future<user_types_to_drop> merge_types(distributed<service::storage_proxy
 static future<> merge_functions(distributed<service::storage_proxy>& proxy, schema_result before, schema_result after);
 static future<> merge_aggregates(distributed<service::storage_proxy>& proxy, schema_result before, schema_result after, schema_result scylla_before, schema_result scylla_after);

-static future<> do_merge_schema(distributed<service::storage_proxy>&, std::vector<mutation>, bool do_flush);
+static future<> do_merge_schema(distributed<service::storage_proxy>&, std::vector<mutation>, bool do_flush, bool reload);

 using computed_columns_map = std::unordered_map<bytes, column_computation_ptr>;
 static computed_columns_map get_computed_columns(const schema_mutations& sm);
@@ -937,18 +941,18 @@ future<> update_schema_version_and_announce(sharded<db::system_keyspace>& sys_ks
 * @throws ConfigurationException If one of metadata attributes has invalid value
 * @throws IOException If data was corrupted during transportation or failed to apply fs operations
 */
-future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations)
+future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations, bool reload)
 {
    if (this_shard_id() != 0) {
        // mutations must be applied on the owning shard (0).
        co_await smp::submit_to(0, [&, fmuts = freeze(mutations)] () mutable -> future<> {
-            return merge_schema(sys_ks, proxy, feat, unfreeze(fmuts));
+            return merge_schema(sys_ks, proxy, feat, unfreeze(fmuts), reload);
        });
        co_return;
    }
    co_await with_merge_lock([&] () mutable -> future<> {
        bool flush_schema = proxy.local().get_db().local().get_config().flush_schema_tables_after_modification();
-        co_await do_merge_schema(proxy, std::move(mutations), flush_schema);
+        co_await do_merge_schema(proxy, std::move(mutations), flush_schema, reload);
        co_await update_schema_version_and_announce(sys_ks, proxy, feat.cluster_schema_features());
    });
 }
@@ -1084,7 +1088,7 @@ future<> store_column_mapping(distributed<service::storage_proxy>& proxy, schema
    co_await proxy.local().mutate_locally(std::move(muts), tracing::trace_state_ptr());
 }

-static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
+static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush, bool reload)
 {
    slogger.trace("do_merge_schema: {}", mutations);
    schema_ptr s = keyspaces();
@@ -1099,6 +1103,12 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
        delete_schema_version(mutation);
    }

+    if (reload) {
+        for (auto&& ks : proxy.local().get_db().local().get_non_system_keyspaces()) {
+            keyspaces.emplace(ks);
+        }
+    }
+
    // current state of the schema
    auto&& old_keyspaces = co_await read_schema_for_keyspaces(proxy, KEYSPACES, keyspaces);
    auto&& old_column_families = co_await read_tables_for_keyspaces(proxy, keyspaces, tables());
@@ -1134,7 +1144,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
    auto types_to_drop = co_await merge_types(proxy, std::move(old_types), std::move(new_types));
    co_await merge_tables_and_views(proxy,
        std::move(old_column_families), std::move(new_column_families),
-        std::move(old_views), std::move(new_views));
+        std::move(old_views), std::move(new_views), reload);
    co_await merge_functions(proxy, std::move(old_functions), std::move(new_functions));
    co_await merge_aggregates(proxy, std::move(old_aggregates), std::move(new_aggregates), std::move(old_scylla_aggregates), std::move(new_scylla_aggregates));
    co_await types_to_drop.drop();
@@ -1244,6 +1254,7 @@ enum class schema_diff_side {
 static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
    std::map<utils::UUID, schema_mutations>&& before,
    std::map<utils::UUID, schema_mutations>&& after,
+    bool reload,
    noncopyable_function<schema_ptr (schema_mutations sm, schema_diff_side)> create_schema)
 {
    schema_diff d;
@@ -1264,6 +1275,13 @@ static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy
        slogger.info("Altering {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
        d.altered.emplace_back(schema_diff::altered_schema{s_before, s});
    }
+    if (reload) {
+        for (auto&& key: diff.entries_in_common) {
+            auto s = create_schema(std::move(after.at(key)), schema_diff_side::right);
+            slogger.info("Reloading {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
+            d.altered.emplace_back(schema_diff::altered_schema {s, s});
+        }
+    }
    return d;
 }

@@ -1276,12 +1294,13 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
    std::map<utils::UUID, schema_mutations>&& tables_before,
    std::map<utils::UUID, schema_mutations>&& tables_after,
    std::map<utils::UUID, schema_mutations>&& views_before,
-    std::map<utils::UUID, schema_mutations>&& views_after)
+    std::map<utils::UUID, schema_mutations>&& views_after,
+    bool reload)
 {
-    auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (schema_mutations sm, schema_diff_side) {
+    auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), reload, [&] (schema_mutations sm, schema_diff_side) {
        return create_table_from_mutations(proxy, std::move(sm));
    });
-    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm, schema_diff_side side) {
+    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), reload, [&] (schema_mutations sm, schema_diff_side side) {
        // The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
        // If we don't do it we are leaving a window where write commands to this schema are illegal.
        // There are 3 possibilities:
@@ -2020,6 +2039,33 @@ std::vector<shared_ptr<cql3::functions::user_function>> create_functions_from_sc
    return ret;
 }

+std::vector<shared_ptr<cql3::functions::user_aggregate>> create_aggregates_from_schema_partition(
+        replica::database& db, lw_shared_ptr<query::result_set> result, lw_shared_ptr<query::result_set> scylla_result) {
+    std::unordered_multimap<sstring, const query::result_set_row*> scylla_aggs;
+    if (scylla_result) {
+        for (const auto& scylla_row : scylla_result->rows()) {
+            auto scylla_agg_name = scylla_row.get_nonnull<sstring>("aggregate_name");
+            scylla_aggs.emplace(scylla_agg_name, &scylla_row);
+        }
+    }
+
+    std::vector<shared_ptr<cql3::functions::user_aggregate>> ret;
+    for (const auto& row : result->rows()) {
+        auto agg_name = row.get_nonnull<sstring>("aggregate_name");
+        auto agg_args = read_arg_types(db, row, row.get_nonnull<sstring>("keyspace_name"));
+        const query::result_set_row *scylla_row_ptr = nullptr;
+        for (auto [it, end] = scylla_aggs.equal_range(agg_name); it != end; ++it) {
+            auto scylla_agg_args = read_arg_types(db, *it->second, it->second->get_nonnull<sstring>("keyspace_name"));
+            if (agg_args == scylla_agg_args) {
+                scylla_row_ptr = it->second;
+                break;
+            }
+        }
+        ret.emplace_back(create_aggregate(db, row, scylla_row_ptr));
+    }
+    return ret;
+}
+
 /*
 * User type metadata serialization/deserialization
 */
@@ -2875,7 +2921,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
    if (version) {
        builder.with_version(*version);
    } else {
-        builder.with_version(sm.digest());
+        builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
    }

    if (auto partitioner = sm.partitioner()) {
@@ -3106,7 +3152,7 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm
    if (version) {
        builder.with_version(*version);
    } else {
-        builder.with_version(sm.digest());
+        builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
    }

    auto base_id = row.get_nonnull<utils::UUID>("base_table_id");
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -14,6 +14,7 @@
 #include "schema_fwd.hh"
 #include "schema_features.hh"
 #include "hashing.hh"
+#include "gms/feature_service.hh"
 #include "schema_mutations.hh"
 #include "types/map.hh"
 #include "query-result-set.hh"
@@ -66,8 +67,8 @@ class config;

 class schema_ctxt {
 public:
-    schema_ctxt(const config&, std::shared_ptr<data_dictionary::user_types_storage> uts);
-    schema_ctxt(const replica::database&);
+    schema_ctxt(const config&, std::shared_ptr<data_dictionary::user_types_storage> uts, const gms::feature_service&, replica::database* = nullptr);
+    schema_ctxt(replica::database&);
    schema_ctxt(distributed<replica::database>&);
    schema_ctxt(distributed<service::storage_proxy>&);

@@ -87,7 +88,17 @@ public:
        return *_user_types;
    }

+    const gms::feature_service& features() const {
+        return _features;
+    }
+
+    replica::database* get_db() const {
+        return _db;
+    }
+
 private:
+    replica::database* _db;
+    const gms::feature_service& _features;
    const db::extensions& _extensions;
    const unsigned _murmur3_partitioner_ignore_msb_bits;
    const uint32_t _schema_registry_grace_period;
@@ -184,7 +195,7 @@ future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, co
 // Must be called on shard 0.
 future<semaphore_units<>> hold_merge_lock() noexcept;

-future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations);
+future<> merge_schema(sharded<db::system_keyspace>& sys_ks, distributed<service::storage_proxy>& proxy, gms::feature_service& feat, std::vector<mutation> mutations, bool reload);

 // Recalculates the local schema version.
 //
@@ -209,6 +220,8 @@ std::vector<user_type> create_types_from_schema_partition(keyspace_metadata& ks,

 std::vector<shared_ptr<cql3::functions::user_function>> create_functions_from_schema_partition(replica::database& db, lw_shared_ptr<query::result_set> result);

+std::vector<shared_ptr<cql3::functions::user_aggregate>> create_aggregates_from_schema_partition(replica::database& db, lw_shared_ptr<query::result_set> result, lw_shared_ptr<query::result_set> scylla_result);
+
 std::vector<mutation> make_create_function_mutations(shared_ptr<cql3::functions::user_function> func, api::timestamp_type timestamp);

 std::vector<mutation> make_drop_function_mutations(shared_ptr<cql3::functions::user_function> func, api::timestamp_type timestamp);
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -3234,11 +3234,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
        using namespace std::chrono;
        assert(*gc_older_than >= gc_clock::duration{0});

-        auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
-        auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
-        assert(gc_older_than_millis < ts_millis);
+        auto ts_micros = microseconds{ts};
+        auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
+        assert(gc_older_than_micros < ts_micros);

-        auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
+        auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
        // We want to delete all entries with IDs smaller than `tomb_upper_bound`
        // but the deleted range is of the form (x, +inf) since the schema is reversed.
        auto range = query::clustering_range::make_starting_with({
--- a/db/view/build_progress_virtual_reader.hh
+++ b/db/view/build_progress_virtual_reader.hh
@@ -197,7 +197,7 @@ public:
            streamed_mutation::forwarding fwd,
            mutation_reader::forwarding fwd_mr) {
        return flat_mutation_reader_v2(std::make_unique<build_progress_reader>(
-                std::move(s),
+                s,
                std::move(permit),
                _db.find_column_family(s->ks_name(), system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
                range,
--- a/db/view/row_locking.cc
+++ b/db/view/row_locking.cc
@@ -10,8 +10,6 @@
 #include "log.hh"
 #include "utils/latency.hh"

-#include <seastar/core/when_all.hh>
-
 static logging::logger mylog("row_locking");

 row_locker::row_locker(schema_ptr s)
@@ -76,35 +74,32 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
 future<row_locker::lock_holder>
 row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
    mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
+    auto ck = cpk;
+    // Create a two-level lock entry for the partition if it doesn't exist already.
    auto i = _two_level_locks.try_emplace(pk, this).first;
+    // The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
+    // Initiating read locking in the background below ensures that even if the two-level lock is currently
+    // write-locked, releasing the write-lock will synchronously engage any waiting
+    // locks and will keep the entry alive.
    future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
-    auto j = i->second._row_locks.find(cpk);
-    if (j == i->second._row_locks.end()) {
-        // Not yet locked, need to create the lock. This makes a copy of cpk.
-        try {
-            j = i->second._row_locks.emplace(cpk, lock_type()).first;
-        } catch(...) {
-            // If this emplace() failed, e.g., out of memory, we fail. We
-            // could do nothing - the partition lock we already started
-            // taking will be unlocked automatically after being locked.
-            // But it's better form to wait for the work we started, and it
-            // will also allow us to remove the hash-table row we added.
-            return lock_partition.then([ex = std::current_exception()] (auto lock) {
-                // The lock is automatically released when "lock" goes out of scope.
-                // TODO: unlock (lock = {}) now, search for the partition in the
-                // hash table (we know it's still there, because we held the lock until
-                // now) and remove the unused lock from the hash table if still unused.
-                return make_exception_future<row_locker::lock_holder>(std::current_exception());
-            });
-        }
-    }
    single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
    single_lock_stats.operations_currently_waiting_for_lock++;
    utils::latency_counter waiting_latency;
    waiting_latency.start();
-    future<lock_type::holder> lock_row = exclusive ? j->second.hold_write_lock(timeout) : j->second.hold_read_lock(timeout);
-    return when_all_succeed(std::move(lock_partition), std::move(lock_row))
-    .then_unpack([this, pk = &i->first, cpk = &j->first, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency)] (auto lock1, auto lock2) mutable {
+    return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
+        auto j = row_locks->find(ck);
+        if (j == row_locks->end()) {
+            // Not yet locked, need to create the lock.
+            j = row_locks->emplace(std::move(ck), lock_type()).first;
+        }
+        auto* cpk = &j->first;
+        auto& row_lock = j->second;
+        // Like the two-level lock entry above, the row_lock entry we've just created
+        // is guaranteed to be kept alive as long as it's locked.
+        // Initiating read/write locking in the background below ensures that.
+        auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
+        return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
+        // FIXME: indentation
        lock1.release();
        lock2.release();
        waiting_latency.stop();
@@ -112,6 +107,7 @@ row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& c
        single_lock_stats.lock_acquisitions++;
        single_lock_stats.operations_currently_waiting_for_lock--;
        return lock_holder(this, pk, cpk, exclusive);
+        });
    });
 }

--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -123,6 +123,9 @@ const column_definition* view_info::view_column(const column_definition& base_de

 void view_info::set_base_info(db::view::base_info_ptr base_info) {
    _base_info = std::move(base_info);
+    // Forget the cached objects which may refer to the base schema.
+    _select_statement = nullptr;
+    _partition_slice = std::nullopt;
 }

 // A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -868,13 +871,18 @@ void view_updates::generate_update(
    bool same_row = true;
    for (auto col_id : col_ids) {
        auto* after = update.cells().find_cell(col_id);
-        // Note: multi-cell columns can't be part of the primary key.
        auto& cdef = _base->regular_column_at(col_id);
        if (existing) {
            auto* before = existing->cells().find_cell(col_id);
+            // Note that this cell is necessarily atomic, because col_ids are
+            // view key columns, and keys must be atomic.
            if (before && before->as_atomic_cell(cdef).is_live()) {
                if (after && after->as_atomic_cell(cdef).is_live()) {
-                    auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
+                    // We need to compare just the values of the keys, not
+                    // metadata like the timestamp. This is because below,
+                    // if the old and new view row have the same key, we need
+                    // to be sure to reach the update_entry() case.
+                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
                    if (cmp != 0) {
                        same_row = false;
                    }
@@ -894,7 +902,13 @@ void view_updates::generate_update(
            if (same_row) {
                update_entry(base_key, update, *existing, now);
            } else {
-                replace_entry(base_key, update, *existing, now);
+                // This code doesn't work if the old and new view row have the
+                // same key, because if they do we get both data and tombstone
+                // for the same timestamp (now) and the tombstone wins. This
+                // is why we need the "same_row" case above - it's not just a
+                // performance optimization.
+                delete_old_entry(base_key, *existing, update, now);
+                create_entry(base_key, update, now);
            }
        } else {
            delete_old_entry(base_key, *existing, update, now);
@@ -938,8 +952,12 @@ future<stop_iteration> view_update_builder::stop() const {
    return make_ready_future<stop_iteration>(stop_iteration::yes);
 }

-future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
+future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
    (void)co_await advance_all();
+    if (!_update && !_existing) {
+        // Tell the caller there is no more data to build.
+        co_return std::nullopt;
+    }
    bool do_advance_updates = false;
    bool do_advance_existings = false;
    if (_update && _update->is_partition_start()) {
@@ -1313,7 +1331,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_mv_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
@@ -1462,6 +1480,8 @@ future<> view_builder::start(service::migration_manager& mm) {
            (void)_build_step.trigger();
            return make_ready_future<>();
        });
+    }).handle_exception_type([] (const seastar::sleep_aborted& e) {
+        vlogger.debug("start aborted: {}", e.what());
    }).handle_exception([] (std::exception_ptr eptr) {
        vlogger.error("start failed: {}", eptr);
        return make_ready_future<>();
@@ -2056,15 +2076,20 @@ public:
 // Called in the context of a seastar::thread.
 void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    gc_clock::time_point now = gc_clock::now();
-    auto consumer = compact_for_query_v2<view_builder::consumer>(
+    auto compaction_state = make_lw_shared<compact_for_query_state_v2>(
            *step.reader.schema(),
            now,
            step.pslice,
            batch_size,
-            query::max_partitions,
-            view_builder::consumer{*this, step, now});
-    consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
+            query::max_partitions);
+    auto consumer = compact_for_query_v2<view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
    auto built = step.reader.consume_in_thread(std::move(consumer));
+    if (auto ds = std::move(*compaction_state).detach_state()) {
+        if (ds->current_tombstone) {
+            step.reader.unpop_mutation_fragment(mutation_fragment_v2(*step.reader.schema(), step.reader.permit(), std::move(*ds->current_tombstone)));
+        }
+        step.reader.unpop_mutation_fragment(mutation_fragment_v2(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
+    }

    _as.check();

@@ -2146,32 +2171,33 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
    return std::max(backlog, _max.load(std::memory_order_relaxed));
 }

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
-    return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
-        return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
-            return view_status == "STARTED";
+future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
+        const sstring& cf_name) {
+    using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
+    return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
+        return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
+            // Only consider status of known hosts.
+            return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
        });
    });
 }

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason) {
    if (is_internal_keyspace(t.schema()->ks_name())) {
        return make_ready_future<bool>(false);
    }
    if (reason == streaming::stream_reason::repair && !t.views().empty()) {
        return make_ready_future<bool>(true);
    }
-    return do_with(t.views(), [&sys_dist_ks] (auto& views) {
+    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
        return map_reduce(views,
-                [&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
+                [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
                false,
                std::logical_or<bool>());
    });
 }

-const size_t view_updating_consumer::buffer_size_soft_limit{1 * 1024 * 1024};
-const size_t view_updating_consumer::buffer_size_hard_limit{2 * 1024 * 1024};
-
 void view_updating_consumer::do_flush_buffer() {
    _staging_reader_handle.pause();

@@ -2194,6 +2220,10 @@ void view_updating_consumer::do_flush_buffer() {
 }

 void view_updating_consumer::flush_builder() {
+    _buffer.emplace_back(_mut_builder->flush());
+}
+
+void view_updating_consumer::end_builder() {
    _mut_builder->consume_end_of_partition();
    if (auto mut_opt = _mut_builder->consume_end_of_stream()) {
        _buffer.emplace_back(std::move(*mut_opt));
@@ -2202,11 +2232,9 @@ void view_updating_consumer::flush_builder() {
 }

 void view_updating_consumer::maybe_flush_buffer_mid_partition() {
-    if (_buffer_size >= buffer_size_hard_limit) {
+    if (_buffer_size >= _buffer_size_hard_limit) {
        flush_builder();
-        auto dk = _buffer.back().decorated_key();
        do_flush_buffer();
-        consume_new_partition(dk);
    }
 }

--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -154,10 +154,7 @@ private:
    void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
-    void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
-        create_entry(base_key, update, now);
-        delete_old_entry(base_key, existing, update, now);
-    }
+    void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
 };

 class view_update_builder {
@@ -188,7 +185,15 @@ public:
    }
    view_update_builder(view_update_builder&& other) noexcept = default;

-    future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
+
+    // build_some() works on batches of 100 (max_rows_for_view_updates)
+    // updated rows, but can_skip_view_updates() can decide that some of
+    // these rows do not affect the view, and as a result build_some() can
+    // return fewer than 100 rows - in extreme cases even zero (see issue #12297).
+    // So we can't use an empty returned vector to signify that the view
+    // update building is done - and we wrap the return value in an
+    // std::optional, which is disengaged when the iteration is done.
+    future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();

    future<> close() noexcept;

--- a/db/view/view_update_checks.hh
+++ b/db/view/view_update_checks.hh
@@ -22,9 +22,13 @@ class system_distributed_keyspace;

 }

+namespace locator {
+class token_metadata;
+}
+
 namespace db::view {

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason);

 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -14,6 +14,7 @@
 #include "db/view/view_updating_consumer.hh"
 #include "sstables/sstables.hh"
 #include "readers/evictable.hh"
+#include "dht/partition_filter.hh"

 static logging::logger vug_logger("view_update_generator");

@@ -84,10 +85,11 @@ future<> view_update_generator::start() {
                            service::get_local_streaming_priority(),
                            nullptr,
                            ::mutation_reader::forwarding::no);
+                    auto close_sr = deferred_close(staging_sstable_reader);

                    inject_failure("view_update_generator_consume_staging_sstable");
-                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
-                    staging_sstable_reader.close().get();
+                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle),
+                        dht::incremental_owned_ranges_checker::make_partition_filter(_db.get_keyspace_local_ranges(s->ks_name())));
                    if (result == stop_iteration::yes) {
                        break;
                    }
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -33,8 +33,17 @@ public:
    // We prefer flushing on partition boundaries, so at the end of a partition,
    // we flush on reaching the soft limit. Otherwise we continue accumulating
    // data. We flush mid-partition if we reach the hard limit.
-    static const size_t buffer_size_soft_limit;
-    static const size_t buffer_size_hard_limit;
+    static constexpr size_t buffer_size_soft_limit_default = 1 * 1024 * 1024;
+    static constexpr size_t buffer_size_hard_limit_default = 2 * 1024 * 1024;
+private:
+    size_t _buffer_size_soft_limit = buffer_size_soft_limit_default;
+    size_t _buffer_size_hard_limit = buffer_size_hard_limit_default;
+public:
+    // Meant only for usage in tests.
+    void set_buffer_size_limit_for_testing_purposes(size_t sz) {
+        _buffer_size_soft_limit = sz;
+        _buffer_size_hard_limit = sz;
+    }

 private:
    schema_ptr _schema;
@@ -49,6 +58,7 @@ private:
 private:
    void do_flush_buffer();
    void flush_builder();
+    void end_builder();
    void maybe_flush_buffer_mid_partition();

 public:
@@ -71,7 +81,11 @@ public:

    void consume_new_partition(const dht::decorated_key& dk) {
        _mut_builder.emplace(_schema);
-        _mut_builder->consume_new_partition(dk);
+        // Further accounting is inaccurate as we base it on the consumed
+        // mutation-fragments, not on their final form in the mutation.
+        // This is good enough, as long as the difference is small and mostly
+        // constant (per fragment).
+        _buffer_size += _mut_builder->consume_new_partition(dk).memory_usage(*_schema);
    }

    void consume(tombstone t) {
@@ -113,8 +127,8 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
-        flush_builder();
-        if (_buffer_size >= buffer_size_soft_limit) {
+        end_builder();
+        if (_buffer_size >= _buffer_size_soft_limit) {
            do_flush_buffer();
        }
        return stop_iteration::no;
--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -10,6 +10,7 @@
 #include "sharder.hh"
 #include <seastar/core/seastar.hh>
 #include "dht/token-sharding.hh"
+#include "dht/partition_filter.hh"
 #include "utils/class_registrator.hh"
 #include "types.hh"
 #include "utils/murmur_hash.hh"
@@ -362,4 +363,10 @@ split_range_to_shards(dht::partition_range pr, const schema& s) {
    return ret;
 }

+flat_mutation_reader_v2::filter incremental_owned_ranges_checker::make_partition_filter(const dht::token_range_vector& sorted_owned_ranges) {
+    return [checker = incremental_owned_ranges_checker(sorted_owned_ranges)] (const dht::decorated_key& dk) mutable {
+        return checker.belongs_to_current_node(dk.token());
+    };
+}
+
 }
--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -11,6 +11,7 @@

 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/util/optimized_optional.hh>
 #include "types.hh"
 #include "keys.hh"
 #include "utils/managed_bytes.hh"
@@ -317,6 +318,9 @@ class ring_position_view {
    const dht::token* _token; // always not nullptr
    const partition_key* _key; // Can be nullptr
    int8_t _weight;
+private:
+    ring_position_view() noexcept : _token(nullptr), _key(nullptr), _weight(0) { }
+    explicit operator bool() const noexcept { return bool(_token); }
 public:
    using token_bound = ring_position::token_bound;
    struct after_key_tag {};
@@ -404,9 +408,11 @@ public:
    after_key is_after_key() const { return after_key(_weight == 1); }

    friend std::ostream& operator<<(std::ostream&, ring_position_view);
+    friend class optimized_optional<ring_position_view>;
 };

 using ring_position_ext_view = ring_position_view;
+using ring_position_view_opt = optimized_optional<ring_position_view>;

 //
 // Represents position in the ring of partitions, where partitions are ordered
--- a/dht/partition_filter.hh
+++ b/dht/partition_filter.hh
@@ -0,0 +1,41 @@
+/*
+ * Modified by ScyllaDB
+ * Copyright (C) 2015-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
+ */
+
+#pragma once
+
+#include "dht/i_partitioner.hh"
+#include "readers/flat_mutation_reader_v2.hh"
+
+namespace dht {
+
+class incremental_owned_ranges_checker {
+    const dht::token_range_vector& _sorted_owned_ranges;
+    mutable dht::token_range_vector::const_iterator _it;
+public:
+    incremental_owned_ranges_checker(const dht::token_range_vector& sorted_owned_ranges)
+            : _sorted_owned_ranges(sorted_owned_ranges)
+            , _it(_sorted_owned_ranges.begin()) {
+    }
+
+    // Must be called with increasing token values.
+    bool belongs_to_current_node(const dht::token& t) {
+        // While token T is after a range Rn, advance the iterator.
+        // iterator will be stopped at a range which either overlaps with T (if T belongs to node),
+        // or at a range which is after T (if T doesn't belong to this node).
+        while (_it != _sorted_owned_ranges.end() && _it->after(t, dht::token_comparator())) {
+            _it++;
+        }
+
+        return _it != _sorted_owned_ranges.end() && _it->contains(t, dht::token_comparator());
+    }
+
+    static flat_mutation_reader_v2::filter make_partition_filter(const dht::token_range_vector& sorted_owned_ranges);
+};
+
+} // dht
--- a/direct_failure_detector/failure_detector.cc
+++ b/direct_failure_detector/failure_detector.cc
@@ -478,7 +478,15 @@ static future<bool> ping_with_timeout(pinger::endpoint_id id, clock::timepoint_t

    auto f = pinger.ping(id, timeout_as);
    auto sleep_and_abort = [] (clock::timepoint_t timeout, abort_source& timeout_as, clock& c) -> future<> {
-        co_await c.sleep_until(timeout, timeout_as);
+        co_await c.sleep_until(timeout, timeout_as).then_wrapped([&timeout_as] (auto&& f) {
+            // Avoid throwing if sleep was aborted.
+            if (f.failed() && timeout_as.abort_requested()) {
+                // Expected (if ping() resolved first or we were externally aborted).
+                f.ignore_ready_future();
+                return make_ready_future<>();
+            }
+            return std::move(f);
+        });
        if (!timeout_as.abort_requested()) {
            // We resolved before `f`. Abort the operation.
            timeout_as.request_abort();
@@ -501,8 +509,6 @@ static future<bool> ping_with_timeout(pinger::endpoint_id id, clock::timepoint_t
    // Wait on the sleep as well (it should return shortly, being aborted) so we don't discard the future.
    try {
        co_await std::move(sleep_and_abort);
-    } catch (const sleep_aborted&) {
-        // Expected (if `f` resolved first or we were externally aborted).
    } catch (...) {
        // There should be no other exceptions, but just in case... log it and discard,
        // we want to propagate exceptions from `f`, not from sleep.
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -42,7 +42,8 @@ if __name__ == '__main__':
        if systemd_unit.available('systemd-coredump@.service'):
            dropin = '''
 [Service]
-TimeoutStartSec=infinity
+RuntimeMaxSec=infinity
+TimeoutSec=infinity
 '''[1:-1]
            os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
            with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:
--- a/dist/common/scripts/scylla_cpuset_setup
+++ b/dist/common/scripts/scylla_cpuset_setup
@@ -36,6 +36,9 @@ if __name__ == '__main__':
    except:
        pass
    if cpuset != args.cpuset or smp != args.smp:
+        if os.path.exists('/etc/scylla.d/perftune.yaml'):
+            os.remove('/etc/scylla.d/perftune.yaml')
+
        cfg.set('CPUSET', '{cpuset}{smp}'.format( \
                cpuset='--cpuset {} '.format(args.cpuset) if args.cpuset else '', \
                smp='--smp {} '.format(args.smp) if args.smp else '' \
--- a/dist/common/scripts/scylla_fstrim_setup
+++ b/dist/common/scripts/scylla_fstrim_setup
@@ -16,7 +16,7 @@ if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
-    systemd_unit('scylla-fstrim.timer').unmask()
    systemd_unit('scylla-fstrim.timer').enable()
+    systemd_unit('scylla-fstrim.timer').start()
    if is_redhat_variant() or is_arch() or is_suse_variant():
        systemd_unit('fstrim.timer').disable()
--- a/dist/common/scripts/scylla_kernel_check
+++ b/dist/common/scripts/scylla_kernel_check
@@ -25,7 +25,7 @@ if __name__ == '__main__':
    run('dd if=/dev/zero of=/var/tmp/kernel-check.img bs=1M count=128', shell=True, check=True, stdout=DEVNULL, stderr=DEVNULL)
    run('mkfs.xfs /var/tmp/kernel-check.img', shell=True, check=True, stdout=DEVNULL, stderr=DEVNULL)
    run('mount /var/tmp/kernel-check.img /var/tmp/mnt -o loop', shell=True, check=True, stdout=DEVNULL, stderr=DEVNULL)
-    ret = run('iotune --fs-check --evaluation-directory /var/tmp/mnt', shell=True).returncode
+    ret = run('iotune --fs-check --evaluation-directory /var/tmp/mnt --default-log-level error', shell=True).returncode
    run('umount /var/tmp/mnt', shell=True, check=True)
    shutil.rmtree('/var/tmp/mnt')
    os.remove('/var/tmp/kernel-check.img')
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -16,41 +16,72 @@ import distro
 from scylla_util import *
 from subprocess import run

-def get_mode_cpuset(nic, mode):
-    mode_cpu_mask = out('/opt/scylladb/scripts/perftune.py --tune net --nic {} --mode {} --get-cpu-mask-quiet'.format(nic, mode))
-    return hex2list(mode_cpu_mask)
-
 def get_cur_cpuset():
    cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
    cpuset = cfg.get('CPUSET')
    return re.sub(r'^--cpuset (.+)$', r'\1', cpuset).strip()

-def get_tune_mode(nic):
+def cpu_mask_is_zero(cpu_mask):
+    """
+    The cpu_mask is a comma-separated list of 32-bit hex values with possibly omitted zero components,
+            e.g. 0xffff,,0xffff
+    We want to estimate if the whole mask is all-zeros.
+    :param cpu_mask: hwloc-calc generated CPU mask
+    :return: True if mask is zero, False otherwise
+    """
+    for cur_cpu_mask in cpu_mask.split(','):
+        if cur_cpu_mask and int(cur_cpu_mask, 16) != 0:
+            return False
+
+    return True
+
+def get_irq_cpu_mask():
+    """
+    Return the irq_cpu_mask that corresponds to the --cpuset value written in cpuset.conf.
+
+    This relies on the "CPU masks invariant": irq_cpu_mask | compute_cpu_mask == cpu_mask.
+
+    This function is called when we are generating a perftune.yaml, meaning that no restrictions on cpu_mask are
+    defined. So, in the context of this call, cpu_mask is "all CPUs" or, in hwloc-calc lingo, 'all'.
+    (For any "special" value of cpu_mask, users need to write their own perftune.yaml.)
+
+    Therefore, to calculate the irq_cpu_mask that corresponds to a compute_cpu_mask defined using --cpuset in
+    cpuset.conf with cpu_mask == 'all', we need to remove the compute_cpu_mask bits from the 'all' mask.
+
+    This can be achieved by running the following hwloc-calc command:
+
+    hwloc-calc --pi all ~PU:X ~PU:Y ~PU:Z ...
+
+    where X,Y,Z,... are either a single CPU index or a CPU range.
+
+    For example, if we have the following cpuset:
+
+    0,2-7,17-24,35
+
+    to get irq_cpu_mask we want to run the following command:
+
+    hwloc-calc --pi all ~PU:0 ~PU:2-7 ~PU:17-24 ~PU:35
+
+    If the resulting mask is all-zeros, the mask printed by "hwloc-calc all" is returned instead.
+    :return: irq_cpu_mask string as printed by hwloc-calc, with surrounding whitespace stripped
+    :raises Exception: if /etc/scylla.d/cpuset.conf does not exist
+    """
+
    if not os.path.exists('/etc/scylla.d/cpuset.conf'):
        raise Exception('/etc/scylla.d/cpuset.conf not found')
    cur_cpuset = get_cur_cpuset()
-    mq_cpuset = get_mode_cpuset(nic, 'mq')
-    sq_cpuset = get_mode_cpuset(nic, 'sq')
-    sq_split_cpuset = get_mode_cpuset(nic, 'sq_split')

-    if cur_cpuset == mq_cpuset:
-        return 'mq'
-    elif cur_cpuset == sq_cpuset:
-        return 'sq'
-    elif cur_cpuset == sq_split_cpuset:
-        return 'sq_split'
-    else:
-        raise Exception('tune mode not found')
+    hwloc_cmd = "/opt/scylladb/bin/hwloc-calc --pi all {}".\
+        format(" ".join(['~PU:{}'.format(c) for c in cur_cpuset.split(",")]))

-def config_updated():
-    perftune_mtime = os.path.getmtime('/etc/scylla.d/perftune.yaml')
-    cpuset_mtime = os.path.getmtime('/etc/scylla.d/cpuset.conf')
-    sysconfig_mtime = os.path.getmtime(sysconfdir_p() / 'scylla-server')
-    print("perftune_mtime < cpuset_mtime:{}".format(perftune_mtime < cpuset_mtime))
-    print("perftune_mtime < sysconfig_mtime:{}".format(perftune_mtime < sysconfig_mtime))
-    if perftune_mtime < cpuset_mtime or perftune_mtime < sysconfig_mtime:
-        return True
-    return False
+    irq_cpu_mask = out(hwloc_cmd).strip()
+
+    # If the generated mask turns out to be all-zeros then it means that all present CPUs are used in cpuset.conf.
+    # In such a case irq_cpu_mask has to be all-CPUs too, a.k.a. MQ mode.
+    if cpu_mask_is_zero(irq_cpu_mask):
+        irq_cpu_mask = out("/opt/scylladb/bin/hwloc-calc all").strip()
+
+    return irq_cpu_mask

 def create_perftune_conf(cfg):
    """
@@ -65,8 +96,10 @@ def create_perftune_conf(cfg):
        nic = cfg.get('IFNAME')
        if not nic:
            nic = 'eth0'
-        mode = get_tune_mode(nic)
-        params += '--tune net --nic "{nic}" --mode {mode}'.format(nic=nic, mode=mode)
+        irq_cpu_mask = get_irq_cpu_mask()
+        # Note that 'irq_cpu_mask' is a comma-separated list of 32-bit wide masks.
+        # Therefore, we need to put it in quotes.
+        params += '--tune net --nic "{nic}" --irq-cpu-mask "{irq_cpu_mask}"'.format(nic=nic, irq_cpu_mask=irq_cpu_mask)

    if cfg.has_option('SET_CLOCKSOURCE') and cfg.get('SET_CLOCKSOURCE') == 'yes':
        params += ' --tune system --tune-clock'
@@ -75,7 +108,7 @@ def create_perftune_conf(cfg):
        params += ' --write-back-cache=false'

    if len(params) > 0:
-        if os.path.exists('/etc/scylla.d/perftune.yaml') and not config_updated():
+        if os.path.exists('/etc/scylla.d/perftune.yaml'):
            return True

        params += ' --dump-options-file'
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -9,6 +9,7 @@

 import os
 import argparse
+import distutils.util
 import pwd
 import grp
 import sys
@@ -16,7 +17,7 @@ import stat
 import distro
 from pathlib import Path
 from scylla_util import *
-from subprocess import run
+from subprocess import run, SubprocessError

 if __name__ == '__main__':
    if os.getuid() > 0:
@@ -37,11 +38,14 @@ if __name__ == '__main__':
                        help='force constructing RAID when only one disk is specified')
    parser.add_argument('--raid-level', default='0',
                        help='specify RAID level')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default="True",
                        help='Enable XFS online discard (trim SSD cells after file deletion)')

    args = parser.parse_args()

+    # Allow args.online_discard to be used as a boolean value
+    args.online_discard = distutils.util.strtobool(args.online_discard)
+
    root = args.root.rstrip('/')
    if args.volume_role == 'all':
        mount_at=root
@@ -125,9 +129,12 @@ if __name__ == '__main__':
                procs.append(proc)
    for proc in procs:
        proc.wait()
+    for disk in disks:
+        run(f'wipefs -a {disk}', shell=True, check=True)
    if raid:
        run('udevadm settle', shell=True, check=True)
        run('mdadm --create --verbose --force --run {raid} --level={level} -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, level=args.raid_level, nr_disk=len(disks), disks=args.disks.replace(',', ' ')), shell=True, check=True)
+        run(f'wipefs -a {fsdev}', shell=True, check=True)
        run('udevadm settle', shell=True, check=True)

    major_minor = os.stat(fsdev).st_rdev
@@ -137,7 +144,9 @@ if __name__ == '__main__':
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
-    run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
+    run('udevadm settle', shell=True, check=True)
+    run(f'mkfs.xfs -b size={block_size} {fsdev} -K', shell=True, check=True)
+    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'
@@ -153,6 +162,11 @@ if __name__ == '__main__':
    os.makedirs(mount_at, exist_ok=True)

    uuid = out(f'blkid -s UUID -o value {fsdev}')
+    if not uuid:
+        raise Exception(f'Failed to get UUID of {fsdev}')
+
+    uuidpath = f'/dev/disk/by-uuid/{uuid}'
+
    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':
@@ -169,7 +183,7 @@ After={after}{wants}
 DefaultDependencies=no

 [Mount]
-What=/dev/disk/by-uuid/{uuid}
+What={uuidpath}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}
@@ -191,8 +205,16 @@ WantedBy=multi-user.target
    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
-    mount = systemd_unit(mntunit_bn)
-    mount.start()
+    try:
+        mount = systemd_unit(mntunit_bn)
+        mount.start()
+    except SubprocessError as e:
+        if not os.path.exists(uuidpath):
+            print(f'\nERROR: {uuidpath} is not found\n')
+        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
+            print(f'\nERROR: {uuidpath} is not block device\n')
+        raise e
+
    if args.enable_on_nextboot:
        mount.enable()
    uid = pwd.getpwnam('scylla').pw_uid
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -214,7 +214,7 @@ if __name__ == '__main__':
                        help='skip raid setup')
    parser.add_argument('--raid-level-5', action='store_true', default=False,
                        help='use RAID5 for RAID volume')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
                        help='Configure XFS to discard unused blocks as soon as files are deleted')
    parser.add_argument('--nic',
                        help='specify NIC')
@@ -458,7 +458,7 @@ if __name__ == '__main__':
        args.no_raid_setup = not raid_setup
        if raid_setup:
            level = '5' if raid_level_5 else '0'
-            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
+            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')

        coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
        args.no_coredump_setup = not coredump_setup
--- a/dist/docker/scyllasetup.py
+++ b/dist/docker/scyllasetup.py
@@ -68,7 +68,12 @@ class ScyllaSetup:

    def cqlshrc(self):
        home = os.environ['HOME']
-        hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
+        if self._rpcAddress:
+            hostname = self._rpcAddress
+        elif self._listenAddress:
+            hostname = self._listenAddress
+        else:
+            hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
        with open("%s/.cqlshrc" % home, "w") as cqlshrc:
            cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{reloc_pkg}
-Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
+Requires:       %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
 Obsoletes:	scylla-server < 1.1

 %global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
-Requires:       %{product}-conf  = %{version} %{product}-python3 = %{version}
+Requires:       %{product}-conf  = %{version}-%{release} %{product}-python3 = %{version}-%{release}
 Conflicts:      abrt
 AutoReqProv:    no

--- a/docs/_utils/redirects.yaml
+++ b/docs/_utils/redirects.yaml
@@ -1,58 +1,10 @@
 ### a dictionary of redirections
 #old path: new path

-# removing the old Monitoring Stack documentation from the ScyllaDB docs
-
-/stable/operating-scylla/monitoring/index.html: https://monitoring.docs.scylladb.com/stable/
-/stable/upgrade/upgrade-monitor/index.html: https://monitoring.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-monitor/upgrade-guide-from-monitoring-1.x-to-monitoring-2.x.html: https://monitoring.docs.scylladb.com/stable/upgrade/upgrade-guide-from-monitoring-1.x-to-monitoring-2.x.html
-/stable/upgrade/upgrade-monitor/upgrade-guide-from-monitoring-2.x-to-monitoring-2.y.html: https://monitoring.docs.scylladb.com/stable/upgrade/upgrade-guide-from-monitoring-2.x-to-monitoring-2.y.html
-/stable/upgrade/upgrade-monitor/upgrade-guide-from-monitoring-2.x-to-monitoring-3.y.html: https://monitoring.docs.scylladb.com/stable/upgrade/upgrade-guide-from-monitoring-2.x-to-monitoring-3.y.html
-/stable/upgrade/upgrade-monitor/upgrade-guide-from-monitoring-3.x-to-monitoring-3.y.html: https://monitoring.docs.scylladb.com/stable/upgrade/upgrade-guide-from-monitoring-3.x-to-monitoring-3.y.html
-
 # removing the old Operator documentation from the ScyllaDB docs

 /stable/operating-scylla/scylla-operator/index.html: https://operator.docs.scylladb.com/stable/

-### removing the old Scylla Manager documentation from the ScyllaDB docs
-
-/stable/operating-scylla/manager/index.html: https://manager.docs.scylladb.com/
-/stable/upgrade/upgrade-manager/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-maintenance-1.x.y-to-1.x.z/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-maintenance-1.x.y-to-1.x.z/upgrade-guide-from-manager-1.x.y-to-1.x.z-CentOS.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-maintenance-1.x.y-to-1.x.z/upgrade-guide-from-manager-1.x.y-to-1.x.z-ubuntu.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-manager-1.0.x-to-1.1.x.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-manager-1.1.x-to-1.2.x.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.2-to-1.3/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.2-to-1.3/upgrade-guide-from-manager-1.2.x-to-1.3.x-CentOS.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.2-to-1.3/upgrade-guide-from-manager-1.2.x-to-1.3.x-ubuntu.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.2-to-1.3/manager-metric-update-1.2-to-1.3.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.3-to-1.4/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.3-to-1.4/upgrade-guide-from-manager-1.3.x-to-1.4.x-CentOS.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.3-to-1.4/upgrade-guide-from-manager-1.3.x-to-1.4.x-ubuntu.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.3-to-1.4/manager-metric-update-1.3-to-1.4.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.4-to-2.0/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.4-to-2.0/upgrade-guide-from-manager-1.4.x-to-2.0.x.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-1.4-to-2.0/manager-metric-update-1.4-to-2.0.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-2.x.a-to-2.y.b/index.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-2.x.a-to-2.y.b/upgrade-2.x.a-to-2.y.b.html: https://manager.docs.scylladb.com/stable/upgrade/index.html
-/stable/upgrade/upgrade-manager/upgrade-guide-from-2.x.a-to-2.y.b/upgrade-row-level-repair.html: https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/
-stable/operating-scylla/manager/2.1/index.html: https://manager.docs.scylladb.com/
-/stable/operating-scylla/manager/2.1/architecture.html: https://manager.docs.scylladb.com/
-/stable/operating-scylla/manager/2.1/install.html: https://manager.docs.scylladb.com/stable/install-scylla-manager.html
-/stable/operating-scylla/manager/2.1/install-agent.html: https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html
-/stable/operating-scylla/manager/2.1/add-a-cluster.html: https://manager.docs.scylladb.com/stable/add-a-cluster.html
-/stable/operating-scylla/manager/2.1/repair.html: https://manager.docs.scylladb.com/stable/repair/index.html
-/stable/operating-scylla/manager/2.1/backup.html: https://manager.docs.scylladb.com/stable/backup/index.html
-/stable/operating-scylla/manager/2.1/extract-schema-from-backup.html: https://manager.docs.scylladb.com/stable/sctool/backup.html
-/stable/operating-scylla/manager/2.1/restore-a-backup.html: https://manager.docs.scylladb.com/stable/restore/index.html
-/stable/operating-scylla/manager/2.1/health-check.html: https://manager.docs.scylladb.com/stable/health-check.html
-/stable/operating-scylla/manager/2.1/sctool.html: https://manager.docs.scylladb.com/stable/sctool/index.html
-/stable/operating-scylla/manager/2.1/monitoring-manager-integration.html: https://manager.docs.scylladb.com/stable/scylla-monitoring.html
-/stable/operating-scylla/manager/2.1/use-a-remote-db.html: https://manager.docs.scylladb.com/
-/stable/operating-scylla/manager/2.1/configuration-file.html: https://manager.docs.scylladb.com/stable/config/scylla-manager-config.html
-/stable/operating-scylla/manager/2.1/agent-configuration-file.html: https://manager.docs.scylladb.com/stable/config/scylla-manager-agent-config.html
-
 ### moving the CQL reference files to the new cql folder

 /stable/getting-started/ddl.html: /stable/cql/ddl.html
@@ -1108,14 +1060,14 @@ tls-ssl/index.html: /stable/operating-scylla/security
 /using-scylla/integrations/integration_kairos/index.html: /stable/using-scylla/integrations/integration-kairos
 /upgrade/ami_upgrade/index.html: /stable/upgrade/ami-upgrade

-/scylla-cloud/cloud-setup/gcp-vpc-peering/index.html: /stable/scylla-cloud/cloud-setup/GCP/gcp-vpc-peering
-/scylla-cloud/cloud-setup/GCP/gcp-vcp-peering/index.html: /stable/scylla-cloud/cloud-setup/GCP/gcp-vpc-peering
+/scylla-cloud/cloud-setup/gcp-vpc-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/gcp-vpc-peering.html
+/scylla-cloud/cloud-setup/GCP/gcp-vcp-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/gcp-vpc-peering.html

 # move scylla cloud for AWS to dedicated directory
-/scylla-cloud/cloud-setup/aws-vpc-peering/index.html: /stable/scylla-cloud/cloud-setup/AWS/aws-vpc-peering
-/scylla-cloud/cloud-setup/cloud-prom-proxy/index.html: /stable/scylla-cloud/cloud-setup/AWS/cloud-prom-proxy
-/scylla-cloud/cloud-setup/outposts/index.html: /stable/scylla-cloud/cloud-setup/AWS/outposts
-/scylla-cloud/cloud-setup/scylla-cloud-byoa/index.html: /stable/scylla-cloud/cloud-setup/AWS/scylla-cloud-byoa
+/scylla-cloud/cloud-setup/aws-vpc-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/aws-vpc-peering.html
+/scylla-cloud/cloud-setup/cloud-prom-proxy/index.html: https://cloud.docs.scylladb.com/stable/monitoring/cloud-prom-proxy.html
+/scylla-cloud/cloud-setup/outposts/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/outposts.html
+/scylla-cloud/cloud-setup/scylla-cloud-byoa/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/scylla-cloud-byoa.html
 /scylla-cloud/cloud-services/scylla_cloud_costs/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-costs
 /scylla-cloud/cloud-services/scylla_cloud_managin_versions/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-managin-versions
 /scylla-cloud/cloud-services/scylla_cloud_support_alerts_sla/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-support-alerts-sla
--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -134,7 +134,7 @@ isolation policy for a specific table can be overridden by tagging the table
 This section provides only a very brief introduction to Alternator's
 design. A much more detailed document about the features of the DynamoDB
 API and how they are, or could be, implemented in Scylla can be found in:
-https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs
+<https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs>

 Almost all of Alternator's source code (except some initialization code)
 can be found in the alternator/ subdirectory of Scylla's source code.
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -26,7 +26,7 @@ request for this single URL to many different backend nodes. Such a
 load-balancing setup is *not* included inside Alternator. You should either
 set one up, or configure the client library to do the load balancing itself.
 Instructions for doing this can be found in:
-https://github.com/scylladb/alternator-load-balancing/
+<https://github.com/scylladb/alternator-load-balancing/>

 ## Write isolation policies

@@ -125,7 +125,7 @@ All of this is not yet implemented in Alternator.
 Scylla has an advanced and extensive monitoring framework for inspecting
 and graphing hundreds of different metrics of Scylla's usage and performance.
 Scylla's monitoring stack, based on Grafana and Prometheus, is described in
-https://docs.scylladb.com/operating-scylla/monitoring/.
+<https://docs.scylladb.com/operating-scylla/monitoring/>.
 This monitoring stack is different from DynamoDB's offering - but Scylla's
 is significantly more powerful and gives the user better insights on
 the internals of the database and its performance.
@@ -160,7 +160,7 @@ experimental:
  One thing that this implementation is still missing is that expiration
  events appear in the Streams API as normal deletions - without the
  distinctive marker on deletions which are really expirations.
-  https://github.com/scylladb/scylla/issues/5060
+  <https://github.com/scylladb/scylla/issues/5060>

 * The DynamoDB Streams API for capturing change is supported, but still
  considered experimental so needs to be enabled explicitly with the
@@ -172,12 +172,12 @@ experimental:
  * While in DynamoDB data usually appears in the stream less than a second
    after it was written, in Alternator Streams there is currently a 10
    second delay by default.
-    https://github.com/scylladb/scylla/issues/6929
+    <https://github.com/scylladb/scylla/issues/6929>
  * Some events are represented differently in Alternator Streams. For
    example, a single PutItem is represented by a REMOVE + MODIFY event,
    instead of just a single MODIFY or INSERT.
-    https://github.com/scylladb/scylla/issues/6930
-    https://github.com/scylladb/scylla/issues/6918
+    <https://github.com/scylladb/scylla/issues/6930>
+    <https://github.com/scylladb/scylla/issues/6918>

 ## Unimplemented API features

@@ -189,18 +189,18 @@ they should be easy to detect. Here is a list of these unimplemented features:
 * Currently in Alternator, a GSI (Global Secondary Index) can only be added
  to a table at table creation time. Unlike DynamoDB which also allows adding
  a GSI (but not an LSI) to an existing table using an UpdateTable operation.
-  https://github.com/scylladb/scylla/issues/5022
+  <https://github.com/scylladb/scylla/issues/5022>

 * GSI (Global Secondary Index) and LSI (Local Secondary Index) may be
  configured to project only a subset of the base-table attributes to the
  index. This option is not yet respected by Alternator - all attributes
  are projected. This wastes some disk space when it is not needed.
-  https://github.com/scylladb/scylla/issues/5036
+  <https://github.com/scylladb/scylla/issues/5036>

 * DynamoDB's new multi-item transaction feature (TransactWriteItems,
  TransactGetItems) is not supported. Note that the older single-item
  conditional updates feature are fully supported.
-  https://github.com/scylladb/scylla/issues/5064
+  <https://github.com/scylladb/scylla/issues/5064>

 * Alternator does not yet support the DynamoDB API calls that control which
  table is available in which data center (DC): CreateGlobalTable,
@@ -211,19 +211,19 @@ they should be easy to detect. Here is a list of these unimplemented features:
  If a DC is added after a table is created, the table won't be visible from
  the new DC and changing that requires a CQL "ALTER TABLE" statement to
  modify the table's replication strategy.
-  https://github.com/scylladb/scylla/issues/5062
+  <https://github.com/scylladb/scylla/issues/5062>

 * Recently DynamoDB added support, in addition to the DynamoDB Streams API,
  also for the similar Kinesis Streams. Alternator doesn't support this yet,
  and the related operations DescribeKinesisStreamingDestination,
  DisableKinesisStreamingDestination, and EnableKinesisStreamingDestination.
-  https://github.com/scylladb/scylla/issues/8786
+  <https://github.com/scylladb/scylla/issues/8786>

 * The on-demand backup APIs are not supported: CreateBackup, DescribeBackup,
  DeleteBackup, ListBackups, RestoreTableFromBackup.
  For now, users can use Scylla's existing backup solutions such as snapshots
  or Scylla Manager.
-  https://github.com/scylladb/scylla/issues/5063
+  <https://github.com/scylladb/scylla/issues/5063>

 * Continuous backup (the ability to restore any point in time) is also not
  supported: UpdateContinuousBackups, DescribeContinuousBackups,
@@ -237,28 +237,28 @@ they should be easy to detect. Here is a list of these unimplemented features:
  BillingMode option is ignored by Alternator, and if a provisioned throughput
  is specified, it is ignored. Requests which are asked to return the amount
  of provisioned throughput used by the request do not return it in Alternator.
-  https://github.com/scylladb/scylla/issues/5068
+  <https://github.com/scylladb/scylla/issues/5068>

 * DAX (DynamoDB Accelerator), an in-memory cache for DynamoDB, is not
  available in for Alternator. Anyway, it should not be necessary - Scylla's
  internal cache is already rather advanced and there is no need to place
  another cache in front of the it. We wrote more about this here:
-  https://www.scylladb.com/2017/07/31/database-caches-not-good/
+  <https://www.scylladb.com/2017/07/31/database-caches-not-good/>

 * The DescribeTable is missing information about creation data and size
  estimates, and also part of the information about indexes enabled on 
  the table.
-  https://github.com/scylladb/scylla/issues/5013
-  https://github.com/scylladb/scylla/issues/5026
-  https://github.com/scylladb/scylla/issues/7550
-  https://github.com/scylladb/scylla/issues/7551 
+  <https://github.com/scylladb/scylla/issues/5013>
+  <https://github.com/scylladb/scylla/issues/5026>
+  <https://github.com/scylladb/scylla/issues/7550>
+  <https://github.com/scylladb/scylla/issues/7551>

 * The recently-added PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE
  expressions) and the new operations ExecuteStatement, BatchExecuteStatement
  and ExecuteTransaction is not yet supported.
  A user that is interested in an SQL-like syntax can consider using Scylla's
  CQL protocol instead.
-  https://github.com/scylladb/scylla/issues/8787
+  <https://github.com/scylladb/scylla/issues/8787>

 * As mentioned above, Alternator has its own powerful monitoring framework,
  which is different from AWS's. In particular, the operations
@@ -266,8 +266,8 @@ they should be easy to detect. Here is a list of these unimplemented features:
  UpdateContributorInsights that configure Amazon's "CloudWatch Contributor
  Insights" are not yet supported. Scylla has different ways to retrieve the
  same information, such as which items were accessed most often.
-  https://github.com/scylladb/scylla/issues/8788
+  <https://github.com/scylladb/scylla/issues/8788>

 * Alternator does not support the new DynamoDB feature "export to S3",
  and its operations DescribeExport, ExportTableToPointInTime, ListExports.
-  https://github.com/scylladb/scylla/issues/8789
+  <https://github.com/scylladb/scylla/issues/8789>
--- a/docs/architecture/raft.rst
+++ b/docs/architecture/raft.rst
@@ -4,70 +4,65 @@ Raft Consensus Algorithm in ScyllaDB

 Introduction
 --------------
-ScyllaDB was originally designed, following Apache Cassandra, to use gossip for topology and schema updates and the Paxos consensus algorithm for 
-strong data consistency (:doc:`LWT </using-scylla/lwt>`). To achieve stronger consistency without performance penalty, ScyllaDB 5.0 is  turning to Raft - a consensus algorithm designed as an alternative to both gossip and Paxos.
+ScyllaDB was originally designed, following Apache Cassandra, to use gossip for topology and schema updates and the Paxos consensus algorithm for
+strong data consistency (:doc:`LWT </using-scylla/lwt>`). To achieve stronger consistency without performance penalty, ScyllaDB 5.x has turned to Raft - a consensus algorithm designed as an alternative to both gossip and Paxos.

 Raft is a consensus algorithm that implements a distributed, consistent, replicated log across members (nodes). Raft implements consensus by first electing a distinguished leader, then giving the leader complete responsibility for managing the replicated log. The leader accepts log entries from clients, replicates them on other servers, and tells servers when it is safe to apply log entries to their state machines.

 Raft uses a heartbeat mechanism to trigger a leader election. All servers start as followers and remain in the follower state as long as they receive valid RPCs (heartbeat) from a leader or candidate. A leader sends periodic heartbeats to all followers to maintain its authority (leadership). Suppose a follower receives no communication over a period called the election timeout. In that case, it assumes no viable leader and begins an election to choose a new leader.

-Leader selection is described in detail in the `raft paper <https://raft.github.io/raft.pdf>`_.
+Leader selection is described in detail in the `Raft paper <https://raft.github.io/raft.pdf>`_.

-Scylla 5.0 uses Raft to maintain schema updates in every node (see below). Any schema update, like ALTER, CREATE or DROP TABLE, is first committed as an entry in the replicated Raft log, and, once stored on most replicas, applied to all nodes **in the same order**, even in the face of a node or network failures.
+ScyllaDB 5.x may use Raft to maintain schema updates in every node (see below). Any schema update, like ALTER, CREATE or DROP TABLE, is first committed as an entry in the replicated Raft log, and, once stored on most replicas, applied to all nodes **in the same order**, even in the face of node or network failures.

-Following Scylla 5.x releases will use Raft to guarantee consistent topology updates similarly.
+Following ScyllaDB 5.x releases will use Raft to guarantee consistent topology updates similarly.

 .. _raft-quorum-requirement:

 Quorum Requirement
 -------------------

-Raft requires at least a quorum of nodes in a cluster to be available. If multiple nodes fail 
-and the quorum is lost, the cluster is unavailable for schema updates. See :ref:`Handling Failures <raft-handliing-failures>` 
+Raft requires at least a quorum of nodes in a cluster to be available. If multiple nodes fail
+and the quorum is lost, the cluster is unavailable for schema updates. See :ref:`Handling Failures <raft-handling-failures>`
 for information on how to handle failures.


 Upgrade Considerations for ScyllaDB 5.0 and Later
 ==================================================

-Note that when you have a two-DC cluster with the same number of nodes in each DC, the cluster will lose the quorum if one 
+Note that when you have a two-DC cluster with the same number of nodes in each DC, the cluster will lose the quorum if one
 of the DCs is down.
 **We recommend configuring three DCs per cluster to ensure that the cluster remains available and operational when one DC is down.**

 Enabling Raft
 ---------------

-Enabling Raft in ScyllaDB 5.0
-===============================
+Enabling Raft in ScyllaDB 5.0 and 5.1
+=====================================

-.. note:: 
-  In ScyllaDB 5.0:
+.. warning::
+  In ScyllaDB 5.0 and 5.1, Raft is an experimental feature.

-  * Raft is an experimental feature.
-  * Raft implementation only covers safe schema changes. See :ref:`Safe Schema Changes with Raft <raft-schema-changes>`.
+It is not possible to enable Raft in an existing cluster in ScyllaDB 5.0 and 5.1.
+In order to have a Raft-enabled cluster in these versions, you must create a new cluster with Raft enabled from the start.

-If you are creating a new cluster, add ``raft`` to the list of experimental features in your ``scylla.yaml`` file:
+.. warning::

-.. code-block:: yaml
-    
-    experimental_features:
-     - raft
+   **Do not** use Raft in production clusters in ScyllaDB 5.0 and 5.1. Such clusters won't be able to correctly upgrade to ScyllaDB 5.2.

-If you upgrade to ScyllaDB 5.0 from an earlier version, perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` 
-updating the ``scylla.yaml`` file for **each node** in the cluster to enable the experimental Raft feature:
-
-.. code-block:: yaml
-    
-    experimental_features:
-      - raft
-
-
-When all the nodes in the cluster and updated and restarted, the cluster will begin to use Raft for schema changes.
+   Use Raft only for testing and experimentation in clusters which can be thrown away.

 .. warning::
    Once enabled, Raft cannot be disabled on your cluster. The cluster nodes will fail to restart if you remove the Raft feature.

-Verifying that Raft Is Enabled
+When creating a new cluster, add ``raft`` to the list of experimental features in your ``scylla.yaml`` file:
+
+.. code-block:: yaml
+
+    experimental_features:
+     - raft
+
+Verifying that Raft is enabled
 ===============================
 You can verify that Raft is enabled on your cluster in one of the following ways:

@@ -100,23 +95,23 @@ Safe Schema Changes with Raft
 -------------------------------
 In ScyllaDB, schema is based on :doc:`Data Definition Language (DDL) </cql/ddl>`. In earlier ScyllaDB versions, schema changes were tracked via the gossip protocol, which might lead to schema conflicts if the updates are happening concurrently.

-Implementing Raft eliminates schema conflicts and allows full automation of DDL changes under any conditions, as long as a quorum 
+Implementing Raft eliminates schema conflicts and allows full automation of DDL changes under any conditions, as long as a quorum
 of nodes in the cluster is available. The following examples illustrate how Raft provides the solution to problems with schema changes.

 * A network partition may lead to a split-brain case, where each subset of nodes has a different version of the schema.
-     
+
     With Raft, after a network split, the majority of the cluster can continue performing schema changes, while the minority needs to wait until it can rejoin the majority. Data manipulation statements on the minority can continue unaffected, provided the :ref:`quorum requirement <raft-quorum-requirement>` is satisfied.

-* Two or more conflicting schema updates are happening at the same time. For example, two different columns with the same definition are simultaneously added to the cluster. There is no effective way to resolve the conflict - the cluster will employ the schema with the most recent timestamp, but changes related to the shadowed table will be lost. 
+* Two or more conflicting schema updates are happening at the same time. For example, two different columns with the same definition are simultaneously added to the cluster. There is no effective way to resolve the conflict - the cluster will employ the schema with the most recent timestamp, but changes related to the shadowed table will be lost.

-     With Raft, concurrent schema changes are safe. 
+     With Raft, concurrent schema changes are safe.



 In summary, Raft makes schema changes safe, but it requires that a quorum of nodes in the cluster is available.


-.. _raft-handliing-failures:
+.. _raft-handling-failures:

 Handling Failures
 ------------------
@@ -175,7 +170,7 @@ Examples
   * - 1-4 nodes
     - Schema updates are possible and safe.
     - Try restarting the nodes. If the nodes are dead, :doc:`replace them with new nodes </operating-scylla/procedures/cluster-management/replace-dead-node-or-more/>`.
-   * - 1 DC 
+   * - 1 DC
     - Schema updates are possible and safe.
     - When the DC comes back online, try restarting the nodes in the cluster. If the nodes are dead, :doc:`add 3 new nodes in a new region </operating-scylla/procedures/cluster-management/add-dc-to-existing-dc/>`.
   * - 2 DCs
--- a/docs/architecture/sstable/_common/sstable_what_is.rst
+++ b/docs/architecture/sstable/_common/sstable_what_is.rst
@@ -1,20 +1,25 @@
-:term:`Sorted Strings Table (SSTable)<SSTable>` is the persistent file format used by Scylla and Apache Cassandra. SSTable is saved as a persistent, ordered, immutable set of files on disk.
+:term:`Sorted Strings Table (SSTable)<SSTable>` is the persistent file format used by ScyllaDB and Apache Cassandra. SSTable is saved as a persistent, ordered, immutable set of files on disk.
 Immutable means SSTables are never modified; they are created by a MemTable flush and are deleted by a compaction.
-The location of Scylla SSTables is specified in scylla.yaml ``data_file_directories`` parameter (default location: ``/var/lib/scylla/data``).
+The location of ScyllaDB SSTables is specified in scylla.yaml ``data_file_directories`` parameter (default location: ``/var/lib/scylla/data``).

-SSTable 3.0 (mc format) is more efficient and requires less disk space than the SSTable 2.x. SSTable version support is as follows: 
+SSTable 3.x is more efficient and requires less disk space than the SSTable 2.x.

+SSTable Version Support
+------------------------

 .. list-table::
   :widths: 33 33 33
   :header-rows: 1

   * - SSTable Version
-     - Scylla Enterprise Version
-     - Scylla Open Source Version
+     - ScyllaDB Enterprise Version
+     - ScyllaDB Open Source Version
+   * - 3.x ('me')
+     - 2022.2
+     - 5.1 and above
   * - 3.x ('md')
     - 2021.1
-     - 4.3 and above
+     - 4.3, 4.4, 4.5, 4.6, 5.0
   * - 3.0 ('mc')
     - 2019.1, 2020.1
     - 3.x, 4.1, 4.2
--- a/docs/architecture/sstable/sstable3/index.rst
+++ b/docs/architecture/sstable/sstable3/index.rst
@@ -1,5 +1,5 @@
-Scylla SSTable - 3.x
-====================
+ScyllaDB SSTable - 3.x
+=======================

 .. toctree::
   :hidden:
@@ -12,21 +12,24 @@ Scylla SSTable - 3.x

 .. include:: ../_common/sstable_what_is.rst

-* In Scylla 3.1 and above, mc format is enabled by default. 
+* In ScyllaDB 5.1 and above, the ``me`` format is enabled by default.
+* In ScyllaDB 4.3 to 5.0, the ``md`` format is enabled by default.
+* In ScyllaDB 3.1 to 4.2, the ``mc`` format is enabled by default. 
+* In ScyllaDB 3.0, the ``mc`` format is disabled by default. You can enable it by adding the ``enable_sstables_mc_format`` parameter set to ``true`` in the ``scylla.yaml`` file. For example: 
+    
+    .. code-block:: shell
+    
+       enable_sstables_mc_format: true

-* In Scylla 3.0, mc format is disabled by default and can be enabled by adding the ``enable_sstables_mc_format`` parameter as 'true' in ``scylla.yaml`` file.
+.. REMOVE IN FUTURE VERSIONS - Remove the note above in version 5.2.

-For example: 
+Additional Information
+-------------------------

-.. code-block:: shell
-
-   enable_sstables_mc_format: true
-
-
-For more information on Scylla 3.x SSTable formats, see below:
+For more information on ScyllaDB 3.x SSTable formats, see below:

 * :doc:`SSTable 3.0 Data File Format <sstables-3-data-file-format>`
 * :doc:`SSTable 3.0 Statistics <sstables-3-statistics>` 
 * :doc:`SSTable 3.0 Summary <sstables-3-summary>`
 * :doc:`SSTable 3.0 Index <sstables-3-index>`
-* :doc:`SSTable 3.0 Format in Scylla <sstable-format>`
+* :doc:`SSTable 3.0 Format in ScyllaDB <sstable-format>`
--- a/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst
+++ b/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst
@@ -28,8 +28,13 @@ Table of contents                                             mc-1-big-TOC.txt

 This document focuses on the data file format but also refers to other components in parts where information stored in them affects the way we read/write the data file.

-Note that the file on-disk format applies both to the "mc" and "md" SSTable format versions.
-The "md" format only fixed the semantics of the (min|max)_clustering_key fields in the SSTable Statistics file, which are now valid for describing the accurate range of clustering prefixes present in the SSTable.
+Note that the file on-disk format applies to all "m*" SSTable format versions ("mc", "md", and "me").
+
+* The "md" format only fixed the semantics of the ``(min|max)_clustering_key`` fields in the SSTable Statistics file, 
+  which are now valid for describing the accurate range of clustering prefixes present in the SSTable.
+* The "me" format added the ``host_id`` of the host writing the SSTable to the SSTable Statistics file.
+  It is used to qualify the commit log replay position that is also stored in the SSTable Statistics file.
+
 See :doc:`SSTables 3.0 Statistics File Format </architecture/sstable/sstable3/sstables-3-statistics>` for more details.

 Overview
--- a/docs/architecture/sstable/sstable3/sstables-3-statistics.rst
+++ b/docs/architecture/sstable/sstable3/sstables-3-statistics.rst
@@ -175,6 +175,13 @@ Whole entry
       // It contains only one commit log position interval - [lower bound of commit log, upper bound of commit log].
    
       array<be32<int32_t>, commit_log_interval> commit_log_intervals;
+
+       // Versions MC and MD of SSTable 3.x format end here.
+
+       // UUID of the host that wrote the SSTable.
+       // Qualifies all commitlog positions in the SSTable Statistics file.
+       
+       UUID host_id;
   }

   using clustering_bound = array<be32<int32_t>, clustering_column>;
--- a/docs/cql/appendices.rst
+++ b/docs/cql/appendices.rst
@@ -21,8 +21,6 @@
 Appendices
 ----------

-.. include:: /rst_include/cql-version-index.rst
-
 .. _appendix-A:

 Appendix A: CQL Keywords
--- a/docs/cql/cql-extensions.md
+++ b/docs/cql/cql-extensions.md
@@ -1,6 +1,6 @@
 # ScyllaDB CQL Extensions

-Scylla extends the CQL language to provide a few extra features. This document
+ScyllaDB extends the CQL language to provide a few extra features. This document
 lists those extensions.

 ## BYPASS CACHE clause
@@ -109,7 +109,7 @@ Storage options can be inspected by checking the new system schema table: `syste
 A special statement is dedicated for pruning ghost rows from materialized views.
 Ghost row is an inconsistency issue which manifests itself by having rows
 in a materialized view which do not correspond to any base table rows.
-Such inconsistencies should be prevented altogether and Scylla is striving to avoid
+Such inconsistencies should be prevented altogether and ScyllaDB is striving to avoid
 them, but *if* they happen, this statement can be used to restore a materialized view
 to a fully consistent state without rebuilding it from scratch.

@@ -133,21 +133,35 @@ token ranges.

 ## Synchronous materialized views

-Materialized view updates can be applied synchronously (with errors propagated
-back to the user) or asynchronously, in the background. Historically, in order
-to use synchronous updates, the materialized view had to be local,
-which could be achieved e.g. by using the same partition key definition
-as the one present in the base table.
-Scylla also allows explicitly marking the view as synchronous, which forces
-all its view updates to be updated synchronously. Such views tend to reduce
-observed availability of the base table, because a base table write would only
-succeed if all synchronous view updates also succeed. On the other hand,
-failed view updates would be detected immediately, and appropriate action
-can be taken (e.g. pruning the materialized view, as mentioned in the paragraph
-above).
+Usually, when a table with materialized views is updated, the update to the
+views happens _asynchronously_, i.e., in the background. This means that
+the user cannot know when the view updates have all finished - or even be
+sure that they succeeded.

-In order to mark a materialized view as synchronous, one can use the following
-syntax:
+However, there are circumstances where ScyllaDB does view updates
+_synchronously_ - i.e., the user's write returns only after the views
+were updated. This happens when the materialized-view replica is on the
+same node as the base-table replica. For example, if the base table and
+the view have the same partition key. Note that only ScyllaDB guarantees
+synchronous view updates in this case - they are asynchronous in Cassandra.
+
+ScyllaDB also allows explicitly marking a view as synchronous. When a view
+is marked synchronous, base-table updates will wait for that view to be
+updated before returning. A base table may have multiple views marked
+synchronous, and will wait for all of them. The consistency level of a
+write applies to synchronous views as well as to the base table: For
+example, writing with QUORUM consistency level returns only after a
+quorum of the base-table replicas were updated *and* a quorum of
+each synchronous view table was also updated.
+
+Synchronous views tend to reduce the observed availability of the base table,
+because a base-table write would only succeed if enough synchronous view
+updates also succeed. On the other hand, failed view updates would be
+detected immediately, and appropriate action can be taken, such as retrying
+the write or pruning the materialized view (as mentioned in the previous
+section). This can improve the consistency of the base table with its views.
+
+To create a new materialized view with synchronous updates, use:

 ```cql
 CREATE MATERIALIZED VIEW main.mv
@@ -157,12 +171,18 @@ CREATE MATERIALIZED VIEW main.mv
  WITH synchronous_updates = true;
 ```

+To make an existing materialized view synchronous, use:
+
 ```cql
 ALTER MATERIALIZED VIEW main.mv WITH synchronous_updates = true;
 ```

-Synchronous updates can also be dynamically turned off by setting
-the value of `synchronous_updates` to `false`.
+To return a materialized view to the default behavior (which, as explained
+above, _usually_ means asynchronous updates), use:
+
+```cql
+ALTER MATERIALIZED VIEW main.mv WITH synchronous_updates = false;
+```

 ### Synchronous global secondary indexes

@@ -261,7 +281,7 @@ that the rate of requests exceeds configured limit, the cluster will start
 rejecting some of them in order to bring the throughput back to the configured
 limit. Rejected requests are less costly which can help reduce overload.

-_NOTE_: Due to Scylla's distributed nature, tracking per-partition request rates
+_NOTE_: Due to ScyllaDB's distributed nature, tracking per-partition request rates
 is not perfect and the actual rate of accepted requests may be higher up to
 a factor of keyspace's `RF`. This feature should not be used to enforce precise
 limits but rather serve as an overload protection feature.
--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -3,9 +3,6 @@
 Data Definition
 ===============

-
-.. include:: /rst_include/cql-version-index.rst
-
 CQL stores data in *tables*, whose schema defines the layout of said data in the table, and those tables are grouped in
 *keyspaces*. A keyspace defines a number of options that apply to all the tables it contains, most prominently of
 which is the replication strategy used by the keyspace. An application can have only one keyspace. However, it is also possible to 
@@ -634,7 +631,7 @@ A table supports the following options:
     - map
     - see below
     - :ref:`Compaction options <cql-compaction-options>`
-   * - ``compaction``
+   * - ``compression``
     - map
     - see below
     - :ref:`Compression options <cql-compression-options>`
@@ -749,9 +746,7 @@ CDC options

 .. versionadded:: 3.2 Scylla Open Source

-The following options are to be used with Change Data Capture. Available as an experimental feature from Scylla Open Source 3.2. 
-To use this feature, you must enable the :ref:`experimental tag <yaml_enabling_experimental_features>` in the scylla.yaml.
-
+The following options can be used with Change Data Capture.

 +---------------------------+-----------------+------------------------------------------------------------------------------------------------------------------------+
 | option                    |  default        | description                                                                                                            |
@@ -863,6 +858,18 @@ Other considerations:
 - Adding new columns (see ``ALTER TABLE`` below) is a constant time operation. There is thus no need to try to
  anticipate future usage when creating a table.

+.. _ddl-per-parition-rate-limit:
+
+Limiting the rate of requests per partition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can limit the rates of reads and writes to a partition by applying
+a ScyllaDB CQL extension to the CREATE TABLE or ALTER TABLE statements. 
+See `Per-partition rate limit <https://docs.scylladb.com/stable/cql/cql-extensions.html#per-partition-rate-limit>`_ 
+for details.
+
+.. REMOVE IN FUTURE VERSIONS - Remove the URL above (temporary solution) and replace it with a relative link (once the solution is applied).
+
 .. _alter-table-statement:

 ALTER TABLE
@@ -921,6 +928,7 @@ The ``ALTER TABLE`` statement can:
  The same note applies to the set of ``compression`` sub-options.
 - Change or add any of the ``Encryption options`` above.
 - Change or add any of the :ref:`CDC options <cdc-options>` above.
+- Change or add per-partition rate limits. See :ref:`Limiting the rate of requests per partition <ddl-per-parition-rate-limit>`.

 .. warning:: Dropping a column assumes that the timestamps used for the value of this column are "real" timestamp in
   microseconds. Using "real" timestamps in microseconds is the default and is **strongly** recommended, but as
@@ -930,7 +938,6 @@ The ``ALTER TABLE`` statement can:
 .. warning:: Once a column is dropped, it is allowed to re-add a column with the same name as the dropped one
   **unless** the type of the dropped column was a (non-frozen) column (due to an internal technical limitation).

-
 .. _drop-table-statement:

 DROP TABLE
--- a/docs/cql/definitions.rst
+++ b/docs/cql/definitions.rst
@@ -22,8 +22,6 @@
 Definitions
 -----------

-.. include:: /rst_include/cql-version-index.rst
-
 .. _conventions:

 Conventions
--- a/docs/cql/dml.rst
+++ b/docs/cql/dml.rst
@@ -3,8 +3,6 @@
 Data Manipulation
 -----------------

-.. include:: /rst_include/cql-version-index.rst
-
 This section describes the statements supported by CQL to insert, update, delete, and query data.

 :ref:`SELECT <select-statement>`
@@ -99,11 +97,12 @@ alternatively, of the wildcard character (``*``) to select all the columns defin
 Selectors
 `````````

-A :token:`selector` can be one of:
+A :token:`selector` can be one of the following:

 - A column name of the table selected to retrieve the values for that column.
 - A casting, which allows you to convert a nested selector to a (compatible) type.
 - A function call, where the arguments are selectors themselves.
+- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.

 Aliases
 ```````
@@ -606,7 +605,7 @@ of eventual consistency on an event of a timestamp collision:

 ``INSERT`` statements happening concurrently at different cluster
 nodes proceed without coordination. Eventually cell values
-supplied by a statement with the highest timestamp will prevail.
+supplied by a statement with the highest timestamp will prevail (see :ref:`update ordering <update-ordering>`).

 Unless a timestamp is provided by the client, Scylla will automatically
 generate a timestamp with microsecond precision for each
@@ -615,7 +614,7 @@ by the same node are unique. Timestamps assigned at different
 nodes are not guaranteed to be globally unique.
 With a steadily high write rate timestamp collision
 is not unlikely. If it happens, i.e. two ``INSERTS`` have the same
-timestamp, the lexicographically bigger value prevails:
+timestamp, a conflict resolution algorithm determines which of the inserted cells prevails (see :ref:`update ordering <update-ordering>`).

 Please refer to the :ref:`UPDATE <update-parameters>` section for more information on the :token:`update_parameter`.

@@ -723,8 +722,8 @@ Similarly to ``INSERT``, ``UPDATE`` statement happening concurrently at differen
 cluster nodes proceed without coordination. Cell values
 supplied by a statement with the highest timestamp will prevail.
 If two ``UPDATE`` statements or ``UPDATE`` and ``INSERT``
-statements have the same timestamp,
-lexicographically bigger value prevails.
+statements have the same timestamp, a conflict resolution algorithm determines which cells prevail
+(see :ref:`update ordering <update-ordering>`).

 Regarding the :token:`assignment`:

@@ -765,7 +764,7 @@ parameters:
  Scylla ensures that query timestamps created by the same coordinator node are unique (even across different shards
  on the same node). However, timestamps assigned at different nodes are not guaranteed to be globally unique.
  Note that with a steadily high write rate, timestamp collision is not unlikely. If it happens, e.g. two INSERTS
-  have the same timestamp, conflicting cell values are compared and the cells with the lexicographically bigger value prevail.
+  have the same timestamp, a conflict resolution algorithm determines which of the inserted cells prevails (see :ref:`update ordering <update-ordering>` for more information).
 - ``TTL``: specifies an optional Time To Live (in seconds) for the inserted values. If set, the inserted values are
  automatically removed from the database after the specified time. Note that the TTL concerns the inserted values, not
  the columns themselves. This means that any subsequent update of the column will also reset the TTL (to whatever TTL
@@ -775,6 +774,55 @@ parameters:
 - ``TIMEOUT``: specifies a timeout duration for the specific request.
  Please refer to the :ref:`SELECT <using-timeout>` section for more information.

+.. _update-ordering:
+
+Update ordering
+~~~~~~~~~~~~~~~
+
+:ref:`INSERT <insert-statement>`, :ref:`UPDATE <update-statement>`, and :ref:`DELETE <delete_statement>`
+operations are ordered by their ``TIMESTAMP``.
+
+Ordering of such changes is done at the cell level, where each cell carries a write ``TIMESTAMP``,
+other attributes related to its expiration when it has a non-zero time-to-live (``TTL``),
+and the cell value.
+
+The fundamental rule for ordering cells that insert, update, or delete data in a given row and column
+is that the cell with the highest timestamp wins.
+
+However, it is possible that multiple such cells will carry the same ``TIMESTAMP``.
+There could be several reasons for ``TIMESTAMP`` collision:
+
+* Benign collision can be caused by "replay" of a mutation, e.g., due to client retry, or due to internal processes.
+  In such cases, the cells are equivalent, and any of them can be selected arbitrarily.
+* ``TIMESTAMP`` collisions might be normally caused by parallel queries that are served
+  by different coordinator nodes. The coordinators might calculate the same write ``TIMESTAMP``
+  based on their local time in microseconds.
+* Collisions might also happen with user-provided timestamps if the application does not guarantee
+  unique timestamps with the ``USING TIMESTAMP`` parameter (see :ref:`Update parameters <update-parameters>` for more information).
+
+As said above, in the replay case, ordering of cells should not matter, as they carry the same value
+and same expiration attributes, so picking any of them will reach the same result.
+However, other ``TIMESTAMP`` conflicts must be resolved in a consistent way by all nodes.
+Otherwise, if nodes would have picked an arbitrary cell in case of a conflict and they would
+reach different results, reading from different replicas would detect the inconsistency and trigger
+read-repair that will generate yet another cell that would still conflict with the existing cells,
+with no guarantee for convergence.
+
+Therefore, Scylla implements an internal, consistent conflict-resolution algorithm
+that orders cells with conflicting ``TIMESTAMP`` values based on other properties, like:
+
+* whether the cell is a tombstone or a live cell,
+* whether the cell has an expiration time,
+* the cell ``TTL``,
+* and finally, what value the cell carries.
+
+The conflict-resolution algorithm is documented in `Scylla's internal documentation <https://github.com/scylladb/scylladb/blob/master/docs/dev/timestamp-conflict-resolution.md>`_
+and it may be subject to change.
+
+Reliable serialization can be achieved using unique write ``TIMESTAMP``
+and by using :doc:`Lightweight Transactions (LWT) </using-scylla/lwt>` to ensure atomicity of
+:ref:`INSERT <insert-statement>`, :ref:`UPDATE <update-statement>`, and :ref:`DELETE <delete_statement>`.
+
 .. _delete_statement:

 DELETE
@@ -814,7 +862,7 @@ For more information on the :token:`update_parameter` refer to the :ref:`UPDATE
 In a ``DELETE`` statement, all deletions within the same partition key are applied atomically,
 meaning either all columns mentioned in the statement are deleted or none.
 If ``DELETE`` statement has the same timestamp as ``INSERT`` or
-``UPDATE`` of the same primary key, delete operation prevails.
+``UPDATE`` of the same primary key, delete operation prevails (see :ref:`update ordering <update-ordering>`).

 A ``DELETE`` operation can be conditional through the use of an ``IF`` clause, similar to ``UPDATE`` and ``INSERT``
 statements. Each such ``DELETE`` gets a globally unique timestamp.
--- a/docs/cql/functions.rst
+++ b/docs/cql/functions.rst
@@ -21,7 +21,6 @@
 .. _cql-functions:

 .. Need some intro for UDF and native functions in general and point those to it.
-.. _udfs:
 .. _native-functions:

 Functions
@@ -33,13 +32,15 @@ CQL supports two main categories of functions:
 - The :ref:`aggregate functions <aggregate-functions>`, which are used to aggregate multiple rows of results from a
  ``SELECT`` statement.

-.. In both cases, CQL provides a number of native "hard-coded" functions as well as the ability to create new user-defined
-.. functions.
+In both cases, CQL provides a number of native "hard-coded" functions as well as the ability to create new user-defined
+functions.

-.. .. note:: By default, the use of user-defined functions is disabled by default for security concerns (even when
-..    enabled, the execution of user-defined functions is sandboxed and a "rogue" function should not be allowed to do
-..    evil, but no sandbox is perfect so using user-defined functions is opt-in). See the ``enable_user_defined_functions``
-..    in ``scylla.yaml`` to enable them.
+.. note:: Although user-defined functions are sandboxed, protecting the system from a "rogue" function, user-defined functions are disabled by default for extra security.
+   See the ``enable_user_defined_functions`` option in ``scylla.yaml`` to enable them.
+
+   Additionally, user-defined functions are still experimental and need to be explicitly enabled by adding ``udf`` to the list of
+   ``experimental_features`` configuration options in ``scylla.yaml``, or turning on the ``experimental`` flag.
+   See :ref:`Enabling Experimental Features <yaml_enabling_experimental_features>` for details.

 .. A function is identifier by its name:

@@ -60,11 +61,11 @@ Native functions
 Cast
 ````

-Supported starting from Scylla version 2.1 
+Supported starting from ScyllaDB version 2.1 

 The ``cast`` function can be used to convert one native datatype to another.

-The following table describes the conversions supported by the ``cast`` function. Scylla will silently ignore any cast converting a cast datatype into its own datatype.
+The following table describes the conversions supported by the ``cast`` function. ScyllaDB will silently ignore any cast converting a cast datatype into its own datatype.

 =============== =======================================================================================================
 From            To
@@ -228,6 +229,65 @@ A number of functions are provided to “convert” the native types into binary
 takes a 64-bit ``blob`` argument and converts it to a ``bigint`` value. For example, ``bigintAsBlob(3)`` is
 ``0x0000000000000003`` and ``blobAsBigint(0x0000000000000003)`` is ``3``.

+.. _udfs:
+
+User-defined functions :label-caution:`Experimental`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+User-defined functions (UDFs) execute user-provided code in ScyllaDB. Supported languages are currently Lua and WebAssembly.
+
+UDFs are part of the ScyllaDB schema and are automatically propagated to all nodes in the cluster.
+UDFs can be overloaded, so that multiple UDFs with different argument types can have the same function name, for example::
+
+   CREATE FUNCTION sample ( arg int ) ...;
+   CREATE FUNCTION sample ( arg text ) ...;
+
+When calling a user-defined function, arguments can be literals or terms. Prepared statement placeholders can be used, too.
+
+CREATE FUNCTION statement
+`````````````````````````
+
+Creating a new user-defined function uses the ``CREATE FUNCTION`` statement. For example::
+
+    CREATE OR REPLACE FUNCTION div(dividend double, divisor double)
+      RETURNS NULL ON NULL INPUT
+      RETURNS double
+      LANGUAGE LUA
+      AS 'return dividend/divisor;';
+
+``CREATE FUNCTION`` with the optional ``OR REPLACE`` keywords creates either a function
+or replaces an existing one with the same signature. A ``CREATE FUNCTION`` without ``OR REPLACE``
+fails if a function with the same signature already exists. If the optional ``IF NOT EXISTS``
+keywords are used, the function will be created only if another function with the same
+signature does not exist. ``OR REPLACE`` and ``IF NOT EXISTS`` cannot be used together.
+
+Behavior for null input values must be defined for each function:
+
+* ``RETURNS NULL ON NULL INPUT`` declares that the function will always return null (without being executed) if any of the input arguments is null.
+* ``CALLED ON NULL INPUT`` declares that the function will always be executed.
+
+Function Signature
+``````````````````
+
+Signatures are used to distinguish individual functions. The signature consists of a fully-qualified function name of the form ``<keyspace>.<function_name>`` and a concatenated list of all the argument types.
+
+Note that keyspace names, function names and argument types are subject to the default naming conventions and case-sensitivity rules.
+
+Functions belong to a keyspace; if no keyspace is specified, the current keyspace is used. User-defined functions are not allowed in the system keyspaces.
+
+DROP FUNCTION statement
+```````````````````````
+
+Dropping a function uses the ``DROP FUNCTION`` statement. For example::
+
+   DROP FUNCTION myfunction;
+   DROP FUNCTION mykeyspace.afunction;
+   DROP FUNCTION afunction ( int );
+   DROP FUNCTION afunction ( text );
+
+You must specify the argument types of the function, the arguments_signature, in the drop command if there are multiple overloaded functions with the same name but different signatures.
+``DROP FUNCTION`` with the optional ``IF EXISTS`` keywords drops a function if it exists, but does not throw an error if it doesn’t.
+
 .. _aggregate-functions:

 Aggregate functions
@@ -261,6 +321,10 @@ It also can be used to count the non-null value of a given column::

    SELECT COUNT (scores) FROM plays;

+.. note::
+    Counting all rows in a table may be time-consuming and exceed the default timeout. In such a case, 
+    see :doc:`Counting all rows in a table is slow </kb/count-all-rows>` for instructions.
+
 Max and Min
 ```````````

@@ -286,6 +350,59 @@ instance::

 .. _user-defined-aggregates-functions:

+User-defined aggregates (UDAs) :label-caution:`Experimental`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+User-defined aggregates allow the creation of custom aggregate functions. User-defined aggregates can be used in a ``SELECT`` statement.
+
+Each aggregate requires an initial state of type ``STYPE`` defined with the ``INITCOND`` value (default value: ``null``). The first argument of the state function must have type STYPE. The remaining arguments of the state function must match the types of the user-defined aggregate arguments. The state function is called once for each row, and the value returned by the state function becomes the new state. After all rows are processed, the optional FINALFUNC is executed with the last state value as its argument.
+
+The ``STYPE`` value is mandatory in order to distinguish possibly overloaded versions of the state and/or final function, since the overload can appear after creation of the aggregate.
+
+A complete working example for user-defined aggregates (assuming that a keyspace has been selected using the ``USE`` statement)::
+
+   CREATE FUNCTION accumulate_len(acc tuple<bigint,bigint>, a text)
+   	  RETURNS NULL ON NULL INPUT
+   	  RETURNS tuple<bigint,bigint>
+   	  LANGUAGE lua as 'return {acc[1] + 1, acc[2] + #a}';
+
+   CREATE OR REPLACE FUNCTION present(res tuple<bigint,bigint>)
+   	  RETURNS NULL ON NULL INPUT
+   	  RETURNS text
+   	  LANGUAGE lua as
+   	    'return "The average string length is " .. res[2]/res[1] .. "!"';
+
+   CREATE OR REPLACE AGGREGATE avg_length(text)
+      SFUNC accumulate_len
+      STYPE tuple<bigint,bigint>
+      FINALFUNC present
+      INITCOND (0,0);
+
+CREATE AGGREGATE statement
+``````````````````````````
+
+The ``CREATE AGGREGATE`` command with the optional ``OR REPLACE`` keywords creates either an aggregate or replaces an existing one with the same signature. A ``CREATE AGGREGATE`` without ``OR REPLACE`` fails if an aggregate with the same signature already exists. The ``CREATE AGGREGATE`` command with the optional ``IF NOT EXISTS`` keywords creates an aggregate if it does not already exist. The ``OR REPLACE`` and ``IF NOT EXISTS`` phrases cannot be used together.
+
+The ``STYPE`` value defines the type of the state value and must be specified. The optional ``INITCOND`` defines the initial state value for the aggregate; the default value is null. A non-null ``INITCOND`` must be specified for state functions that are declared with ``RETURNS NULL ON NULL INPUT``.
+
+The ``SFUNC`` value references an existing function to use as the state-modifying function. The first argument of the state function must have type ``STYPE``. The remaining arguments of the state function must match the types of the user-defined aggregate arguments. The state function is called once for each row, and the value returned by the state function becomes the new state. State is not updated for state functions declared with ``RETURNS NULL ON NULL INPUT`` and called with null. After all rows are processed, the optional ``FINALFUNC`` is executed with the last state value as its argument. It must take only one argument with type ``STYPE``, but the return type of the ``FINALFUNC`` may be a different type. A final function declared with ``RETURNS NULL ON NULL INPUT`` means that the aggregate’s return value will be null if the last state is null.
+
+If no ``FINALFUNC`` is defined, the overall return type of the aggregate function is ``STYPE``. If a ``FINALFUNC`` is defined, it is the return type of that function.
+
+DROP AGGREGATE statement
+````````````````````````
+
+Dropping a user-defined aggregate function uses the ``DROP AGGREGATE`` statement. For example::
+
+   DROP AGGREGATE myAggregate;
+   DROP AGGREGATE myKeyspace.anAggregate;
+   DROP AGGREGATE someAggregate ( int );
+   DROP AGGREGATE someAggregate ( text );
+
+The ``DROP AGGREGATE`` statement removes an aggregate created using ``CREATE AGGREGATE``. You must specify the argument types of the aggregate to drop if there are multiple overloaded aggregates with the same name but a different signature.
+
+The ``DROP AGGREGATE`` command with the optional ``IF EXISTS`` keywords drops an aggregate if it exists, and does nothing if an aggregate with the signature does not exist.
+
 .. include:: /rst_include/apache-cql-return-index.rst 

-.. include:: /rst_include/apache-copyrights.rst
+.. include:: /rst_include/apache-copyrights.rst
--- a/docs/cql/mv.rst
+++ b/docs/cql/mv.rst
@@ -24,9 +24,6 @@ Materialized Views
 ------------------
 Production ready in Scylla Open Source 3.0 and Scylla Enterprise 2019.1.x

-.. include:: /rst_include/cql-version-index.rst
-
-
 Materialized views names are defined by:

 .. code-block:: cql
--- a/docs/cql/types.rst
+++ b/docs/cql/types.rst
@@ -8,12 +8,6 @@ Data Types
 .. _UUID: https://en.wikipedia.org/wiki/Universally_unique_identifier


-
-
-.. include:: /rst_include/cql-version-index.rst
-
-
-
 CQL is a typed language and supports a rich set of data types, including :ref:`native types <native-types>` and
 :ref:`collection types <collections>`.

--- a/docs/dev/timestamp-conflict-resolution.md
+++ b/docs/dev/timestamp-conflict-resolution.md
@@ -0,0 +1,37 @@
+# Timestamp conflict resolution
+
+The fundamental rule for ordering cells that insert, update, or delete data in a given row and column
+is that the cell with the highest timestamp wins.
+
+However, it is possible that multiple such cells will carry the same `TIMESTAMP`.
+In this case, conflicts must be resolved in a consistent way by all nodes.
+Otherwise, if nodes picked an arbitrary cell in case of a conflict and
+reached different results, reading from different replicas would detect the inconsistency and trigger
+read-repair that would generate yet another cell that would still conflict with the existing cells,
+with no guarantee for convergence.
+
+The first tie-breaking rule when two cells have the same write timestamp is that
+dead cells win over live cells; and if both cells are deleted, the one with the later deletion time prevails.
+
+If both cells are alive, their expiration time is examined.
+Cells that are written with a non-zero TTL (either implicit, as determined by
+the table's default TTL, or explicit, `USING TTL`) are due to expire
+TTL seconds after the time they were written (as determined by the coordinator,
+and rounded to 1 second resolution). That time is the cell's expiration time.
+When cells expire, they become tombstones, shadowing any data written with a write timestamp
+less than or equal to the timestamp of the expiring cell.
+Therefore, cells that have an expiration time win over cells with no expiration time.
+
+If both cells have an expiration time, the one with the latest expiration time wins;
+and if they have the same expiration time (in whole second resolution),
+their write time is derived from the expiration time less the original time-to-live value
+and the one that was written at a later time prevails.
+
+Finally, if both cells are live and have no expiration, or have the same expiration time and time-to-live,
+the cell with the lexicographically bigger value prevails.
+
+Note that when multiple columns are INSERTed or UPDATEed using the same timestamp,
+SELECTing those columns might return a result that mixes cells from either upsert.
+This may happen when both upserts have no expiration time, or both their expiration time and TTL are the
+same, respectively (in whole second resolution). In such a case, cell selection would be based on the cell values
+in each column, independently of each other.
--- a/docs/getting-started/index.rst
+++ b/docs/getting-started/index.rst
@@ -8,47 +8,47 @@ Getting Started
   install-scylla/index
   configure
   requirements
-   Migrate to Scylla </using-scylla/migrate-scylla>
+   Migrate to ScyllaDB </using-scylla/migrate-scylla>
   Integration Solutions </using-scylla/integrations/index>
   tutorials

 .. panel-box::
-  :title: Scylla Requirements
+  :title: ScyllaDB Requirements
  :id: "getting-started"
  :class: my-panel
  
-  * :doc:`Scylla System Requirements Guide</getting-started/system-requirements/>`
+  * :doc:`ScyllaDB System Requirements Guide</getting-started/system-requirements/>`
  * :doc:`OS Support by Platform and Version</getting-started/os-support/>`
    
 .. panel-box::
-  :title: Install and Configure Scylla
+  :title: Install and Configure ScyllaDB
  :id: "getting-started"
  :class: my-panel

-  * `Install Scylla (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/>`_ - Links to the ScyllaDB Download Center
+  * `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
  
-  * :doc:`Configure Scylla</getting-started/system-configuration/>`
-  * :doc:`Run Scylla in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
-  * :doc:`Create a Scylla Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
-  * :doc:`Create a Scylla Cluster - Multi Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster-multidc/>`
+  * :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
+  * :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
+  * :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
+  * :doc:`Create a ScyllaDB Cluster - Multi Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster-multidc/>`

 .. panel-box::
-  :title: Develop Applications for Scylla
+  :title: Develop Applications for ScyllaDB
  :id: "getting-started"
  :class: my-panel

-  * :doc:`Scylla Drivers</using-scylla/drivers/index>`
-  * `Get Started Lesson on Scylla University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/quick-wins-install-and-run-scylla/>`_    
+  * :doc:`ScyllaDB Drivers</using-scylla/drivers/index>`
+  * `Get Started Lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/quick-wins-install-and-run-scylla/>`_    
  * :doc:`CQL Reference </cql/index>`
  * :doc:`cqlsh - the CQL shell </cql/cqlsh/>`

 .. panel-box::
-  :title: Use Scylla with Third-party Solutions
+  :title: Use ScyllaDB with Third-party Solutions
  :id: "getting-started"
  :class: my-panel
  
-  * :doc:`Migrate to Scylla </using-scylla/migrate-scylla>` - How to migrate your current database to Scylla
-  * :doc:`Integrate with Scylla </using-scylla/integrations/index>` - Integration solutions with Scylla
+  * :doc:`Migrate to ScyllaDB </using-scylla/migrate-scylla>` - How to migrate your current database to ScyllaDB
+  * :doc:`Integrate with ScyllaDB </using-scylla/integrations/index>` - Integration solutions with ScyllaDB
  

 .. panel-box::
--- a/docs/getting-started/install-scylla/index.rst
+++ b/docs/getting-started/install-scylla/index.rst
@@ -20,7 +20,7 @@ Install Scylla

  Keep your versions up-to-date. The two latest versions are supported. Also always install the latest patches for your version. 

-  * Download and install Scylla Server, Drivers and Tools in `Scylla Download Center <https://www.scylladb.com/download/#server/>`_
+  * Download and install ScyllaDB Server, Drivers and Tools in `ScyllaDB Download Center <https://www.scylladb.com/download/#core>`_
  * :doc:`ScyllaDB Web Installer for Linux <scylla-web-installer>`
  * :doc:`Scylla Unified Installer (relocatable executable) <unified-installer>`
  * :doc:`Air-gapped Server Installation <air-gapped-install>`
--- a/docs/getting-started/install-scylla/scylla-web-installer.rst
+++ b/docs/getting-started/install-scylla/scylla-web-installer.rst
@@ -4,7 +4,7 @@ ScyllaDB Web Installer for Linux

 ScyllaDB Web Installer is a platform-agnostic installation script you can run with ``curl`` to install ScyllaDB on Linux.

-See `ScyllaDB Download Center <https://www.scylladb.com/download/#server>`_ for information on manually installing ScyllaDB with platform-specific installation packages.
+See `ScyllaDB Download Center <https://www.scylladb.com/download/#core>`_ for information on manually installing ScyllaDB with platform-specific installation packages.

 Prerequisites
 --------------
--- a/docs/getting-started/os-support.rst
+++ b/docs/getting-started/os-support.rst
@@ -1,81 +1,93 @@
 OS Support by Platform and Version
 ==================================

-The following matrix shows which Operating Systems, Platforms, and Containers / Instance Engines are supported with which versions of Scylla.
+The following matrix shows which Operating Systems, Platforms, and Containers / Instance Engines are supported with which versions of ScyllaDB.

-Scylla requires a fix to the XFS append introduced in kernel 3.15 (back-ported to 3.10 in RHEL/CentOS).
-Scylla will not run with earlier kernel versions. Details in `Scylla issue 885 <https://github.com/scylladb/scylla/issues/885>`_.
+ScyllaDB requires a fix to the XFS append introduced in kernel 3.15 (back-ported to 3.10 in RHEL/CentOS).
+ScyllaDB will not run with earlier kernel versions. Details in `ScyllaDB issue 885 <https://github.com/scylladb/scylla/issues/885>`_.
+
+ .. REMOVE IN FUTURE VERSIONS - Remove information about versions from the notes below in version 5.2.

 .. note::
+   
+   **Supported Architecture**

-   Scylla Open Source supports x86_64 for all versions and aarch64 starting from Scylla 4.6 and nightly build. In particular, aarch64 support includes AWS EC2 Graviton.
-
-   For Scylla Open Source **4.5** and later, the recommended OS and Scylla AMI/IMage OS is Ubuntu 20.04.4 LTS.
+   ScyllaDB Open Source supports x86_64 for all versions and AArch64 starting from ScyllaDB 4.6 and nightly build. In particular, AArch64 support includes AWS EC2 Graviton.


-Scylla Open Source
-------------------
+ScyllaDB Open Source
+----------------------

-.. note:: For Enterprise versions **prior to** 4.6, the recommended OS and Scylla AMI/Image OS is CentOS 7.
+.. note:: 

-   For Scylla Open Source versions **4.6 and later**, the recommended OS and Scylla AMI/Image OS is Ubuntu 20.04.
+    Recommended OS and ScyllaDB AMI/Image OS for ScyllaDB Open Source:
+
+       - Ubuntu 20.04 for versions 4.6 and later.
+       - CentOS 7 for versions earlier than 4.6.


-
-+--------------------------+----------------------------------+-----------------------------+-------------+
-| Platform                 |       Ubuntu                     |    Debian                   | Centos/RHEL |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-| Scylla Version / Version | 14.04| 16.04| 18.04|20.04 |22.04 | 8    | 9    |  10   |  11   | 7    | 8    |
-+==========================+======+======+======+======+======+======+======+=======+=======+======+======+
-|   5.0                    | |x|  | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|   | |v|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.6                    | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.5                    | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.4                    | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.3                    | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.2                    | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.1                    | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|  | |v|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   4.0                    | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|   | |x|   | |v|  | |x|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   3.x                    | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|   | |x|   | |v|  | |x|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   2.3                    | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|  | |x|   | |x|   | |v|  | |x|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
-|   2.2                    | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|  | |x|   | |x|   | |v|  | |x|  |
-+--------------------------+------+------+------+------+------+------+------+-------+-------+------+------+
+----------------------------+----------------------------------+-----------------------------+---------+-------+
+| Platform                   |       Ubuntu                     |    Debian                   | CentOS /| Rocky/|
+|                            |                                  |                             | RHEL    | RHEL  |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+| ScyllaDB Version / Version | 14.04| 16.04| 18.04|20.04 |22.04 | 8    | 9    |  10   |  11   | 7       |   8   |
+============================+======+======+======+======+======+======+======+=======+=======+=========+=======+
+|   5.1                      | |x|  | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|   | |v|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   5.0                      | |x|  | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|   | |v|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.6                      | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.5                      | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.4                      | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.3                      | |x|  | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.2                      | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.1                      | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |v|   | |x|   | |v|     | |v|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   4.0                      | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|   | |x|   | |v|     | |x|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   3.x                      | |x|  | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|   | |x|   | |v|     | |x|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   2.3                      | |v|  | |v|  | |v|  | |x|  | |x|  | |v|  | |v|  | |x|   | |x|   | |v|     | |x|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
+|   2.2                      | |v|  | |v|  | |x|  | |x|  | |x|  | |v|  | |x|  | |x|   | |x|   | |v|     | |x|   |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+


 All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 4.3).


-Scylla Enterprise
-----------------
+ScyllaDB Enterprise
+--------------------

-.. note:: Enterprise versions **prior to** 2021.1, the recommended OS and Scylla AMI/IMage OS is CentOS 7.
+.. note:: 
+   Recommended OS and ScyllaDB AMI/Image OS for ScyllaDB Enterprise:

-   For Enterprise versions **2021.1 and later**, the recommended OS and Scylla AMI/IMage OS is Ubuntu 20.04.4 LTS.
+    - Ubuntu 20.04 for versions 2021.1 and later.
+    - CentOS 7 for versions earlier than 2021.1.

-   For Enterprise versions **2021.1 and later**, the recommended OS and Scylla AMI/Image OS is Ubuntu 20.04.
-
-+--------------------------+---------------------------+--------------------+------------+
-| Platform                 |  Ubuntu                   | Debian             | Centos/RHEL|
-+--------------------------+------+------+------+------+------+------+------+------+-----+
-| Scylla Version / Version | 14   | 16   |  18  |  20  | 8    | 9    | 10   |  7   | 8   |
-+==========================+======+======+======+======+======+======+======+======+=====+
-|   2021.1                 | |x|  | |v|  | |v|  | |v|  |  |x| |  |v| |  |v| | |v|  | |v| |
-+--------------------------+------+------+------+------+------+------+------+------+-----+
-|   2020.1                 | |x|  | |v|  | |v|  |  |x| |  |x| |  |v| |  |v| | |v|  | |v| |
-+--------------------------+------+------+------+------+------+------+------+------+-----+
-|   2019.1                 | |x|  | |v|  | |v|  |  |x| |  |x| |  |v| |  |x| | |v|  | |x| |
-+--------------------------+------+------+------+------+------+------+------+------+-----+
-|   2018.1                 | |v|  | |v|  | |x|  |  |x| | |v|  | |x|  |  |x| | |v|  | |x| |
-+--------------------------+------+------+------+------+------+------+------+------+-----+
+----------------------------+-----------------------------------+---------------------------+--------+-------+
+| Platform                   |  Ubuntu                           | Debian                    | CentOS/| Rocky/|
+|                            |                                   |                           | RHEL   | RHEL  |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+| ScyllaDB Version / Version | 14.04| 16.04| 18.04| 20.04| 22.04 | 8    | 9    | 10   | 11   |  7     | 8     |
+============================+======+======+======+======+=======+======+======+======+======+========+=======+
+|   2022.2                   | |x|  | |x|  | |v|  | |v|  | |v|   | |x|  | |x|  | |v|  | |v|  | |v|    | |v|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+|   2022.1                   | |x|  | |x|  | |v|  | |v|  | |v|   | |x|  | |x|  | |v|  | |v|  | |v|    | |v|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+|   2021.1                   | |x|  | |v|  | |v|  | |v|  | |x|   | |x|  | |v|  | |v|  | |x|  | |v|    | |v|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+|   2020.1                   | |x|  | |v|  | |v|  |  |x| | |x|   | |x|  | |v|  | |v|  | |x|  | |v|    | |v|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+|   2019.1                   | |x|  | |v|  | |v|  |  |x| | |x|   | |x|  | |v|  | |x|  | |x|  | |v|    | |x|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
+|   2018.1                   | |v|  | |v|  | |x|  |  |x| | |v|   | |x|  | |x|  | |x|  | |x|  | |v|    | |x|   |
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+


 All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 2021.1).
--- a/docs/getting-started/requirements.rst
+++ b/docs/getting-started/requirements.rst
@@ -1,7 +1,7 @@

-===================
-Scylla Requirements
-===================
+=====================
+ScyllaDB Requirements
+=====================

 .. toctree::
   :maxdepth: 2
@@ -22,9 +22,9 @@ Scylla Requirements
              </div>
              <div class="medium-9 columns">

-* :doc:`Scylla System Requirements Guide</getting-started/system-requirements/>`
+* :doc:`ScyllaDB System Requirements Guide</getting-started/system-requirements/>`
 * :doc:`OS Support by Platform and Version</getting-started/os-support/>`
-* :doc:`Running Scylla in a shared environment </getting-started/scylla-in-a-shared-environment>`
+* :doc:`Running ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`

 .. raw:: html

--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,7 +13,7 @@
  :image: /_static/img/mascots/scylla-docs.svg
  :search_box:

-  The most up-to-date documents for the fastest, best performing, high availability NoSQL database.
+  New to ScyllaDB? Start `here <https://cloud.docs.scylladb.com/stable/scylladb-basics/>`_!

 .. raw:: html

@@ -26,28 +26,29 @@
  <div class="grid-x grid-margin-x hs">

 .. topic-box::
-  :title: Let us manage your DB
+  :title: ScyllaDB Cloud
  :link: https://cloud.docs.scylladb.com
  :class: large-4
-  :anchor: Get Started with Scylla Cloud
+  :anchor: ScyllaDB Cloud Documentation

-  Take advantage of Scylla Cloud, a fully-managed database-as-a-service.
+  Simplify application development with ScyllaDB Cloud - a fully managed database-as-a-service.

 .. topic-box::
-  :title: Manage your own DB
+  :title: ScyllaDB Enterprise
+  :link: https://enterprise.docs.scylladb.com
+  :class: large-4
+  :anchor: ScyllaDB Enterprise Documentation
+
+  Deploy and manage ScyllaDB's most stable enterprise-grade database with premium features and 24/7 support.
+
+.. topic-box::
+  :title: ScyllaDB Open Source
  :link: getting-started
  :class: large-4
-  :anchor: Get Started with Scylla
+  :anchor: ScyllaDB Open Source Documentation

-  Provision and manage a Scylla cluster in your environment.
+  Deploy and manage your database in your environment.

-.. topic-box::
-  :title: Connect your application to Scylla
-  :link: using-scylla/drivers
-  :class: large-4
-  :anchor: Choose a Driver
-
-  Use high performance Scylla drivers to connect your application to a Scylla cluster.

 .. raw:: html

@@ -57,75 +58,50 @@

  <div class="topics-grid topics-grid--products">

-      <h2 class="topics-grid__title">Our Product List</h2>
-      <p class="topics-grid__text">To begin choose a product from the list below</p>
+      <h2 class="topics-grid__title">Other Products</h2>

      <div class="grid-container full">
          <div class="grid-x grid-margin-x">

 .. topic-box::
-  :title: Scylla Enterprise
-  :link: getting-started
-  :image: /_static/img/mascots/scylla-enterprise.svg
-  :class: topic-box--product,large-3,small-6
-
-  ScyllaDB’s most stable high-performance enterprise-grade NoSQL database.
-
-.. topic-box::
-  :title: Scylla Open Source
-  :link: getting-started
-  :image: /_static/img/mascots/scylla-opensource.svg
-  :class: topic-box--product,large-3,small-6
-
-  A high-performance NoSQL database with a close-to-the-hardware, shared-nothing approach.
-
-.. topic-box::
-  :title: Scylla Cloud
-  :link: https://cloud.docs.scylladb.com
-  :image: /_static/img/mascots/scylla-cloud.svg
-  :class: topic-box--product,large-3,small-6
-
-  A fully managed NoSQL database as a service powered by Scylla Enterprise.
-
-.. topic-box::
-  :title: Scylla Alternator
+  :title: ScyllaDB Alternator
  :link: https://docs.scylladb.com/stable/alternator/alternator.html
  :image: /_static/img/mascots/scylla-alternator.svg
-  :class: topic-box--product,large-3,small-6
+  :class: topic-box--product,large-4,small-6

  Open source Amazon DynamoDB-compatible API.

 .. topic-box::
-  :title: Scylla Monitoring Stack
+  :title: ScyllaDB Monitoring Stack
  :link: https://monitoring.docs.scylladb.com
  :image: /_static/img/mascots/scylla-monitor.svg
-  :class: topic-box--product,large-3,small-6
+  :class: topic-box--product,large-4,small-6

-  Complete open source monitoring solution for your Scylla clusters.
+  Complete open source monitoring solution for your ScyllaDB clusters.

 .. topic-box::
-  :title: Scylla Manager
+  :title: ScyllaDB Manager
  :link: https://manager.docs.scylladb.com
  :image: /_static/img/mascots/scylla-manager.svg
-  :class: topic-box--product,large-3,small-6
+  :class: topic-box--product,large-4,small-6

-  Hassle-free Scylla NoSQL database management for scale-out clusters.
+  Hassle-free ScyllaDB NoSQL database management for scale-out clusters.

 .. topic-box::
-  :title: Scylla Drivers
+  :title: ScyllaDB Drivers
  :link: https://docs.scylladb.com/stable/using-scylla/drivers/
  :image: /_static/img/mascots/scylla-drivers.svg
-  :class: topic-box--product,large-3,small-6
+  :class: topic-box--product,large-4,small-6

  Shard-aware drivers for superior performance. 

 .. topic-box::
-  :title: Scylla Operator
+  :title: ScyllaDB Operator
  :link: https://operator.docs.scylladb.com
  :image: /_static/img/mascots/scylla-enterprise.svg
-  :class: topic-box--product,large-3,small-6
+  :class: topic-box--product,large-4,small-6

-  Easily run and manage your Scylla Cluster on Kubernetes.
+  Easily run and manage your ScyllaDB cluster on Kubernetes.

 .. raw:: html

@@ -135,19 +111,19 @@

  <div class="topics-grid">

-      <h2 class="topics-grid__title">Learn More About Scylla</h2>
+      <h2 class="topics-grid__title">Learn More About ScyllaDB</h2>
      <p class="topics-grid__text"></p>
      <div class="grid-container full">
          <div class="grid-x grid-margin-x">

 .. topic-box::
-  :title: Attend Scylla University
+  :title: Attend ScyllaDB University
  :link: https://university.scylladb.com/
  :image: /_static/img/mascots/scylla-university.png
  :class: large-6,small-12
  :anchor: Find a Class

-  | Register to take a *free* class at Scylla University.
+  | Register to take a *free* class at ScyllaDB University.
  | There are several learning paths to choose from.

 .. topic-box::
@@ -178,9 +154,9 @@
  architecture/index
  troubleshooting/index
  kb/index
-  Scylla University <https://university.scylladb.com/>
+  ScyllaDB University <https://university.scylladb.com/>
  faq
-  Contribute to Scylla <contribute>
+  Contribute to ScyllaDB <contribute>
  glossary
  alternator/alternator

--- a/docs/kb/count-all-rows.rst
+++ b/docs/kb/count-all-rows.rst
@@ -2,7 +2,7 @@
 Counting all rows in a table is slow
 ====================================

-**Audience: Scylla users**
+**Audience: ScyllaDB users**

 Trying to count all rows in a table using

@@ -10,14 +10,21 @@ Trying to count all rows in a table using

   SELECT COUNT(1) FROM ks.table;

-often fails with **ReadTimeout** error.
+may fail with the **ReadTimeout** error.

-COUNT() is running a full-scan query on all nodes, which might take a long time to finish. Often the time is greater than Scylla query timeout. 
-One way to bypass this in Scylla 4.4 or later is increasing the timeout for this query using the :ref:`USING TIMEOUT <using-timeout>` directive, for example:
+COUNT() runs a full-scan query on all nodes, which might take a long time to finish. As a result, the count time may be greater than the ScyllaDB query timeout. 
+One way to prevent that issue in Scylla 4.4 or later is to increase the timeout for the query using the :ref:`USING TIMEOUT <using-timeout>` directive, for example:


 .. code-block:: cql

   SELECT COUNT(1) FROM ks.table USING TIMEOUT 120s;

-You can also get an *estimation* of the number **of partitions** (not rows) with :doc:`nodetool tablestats </operating-scylla/nodetool-commands/tablestats>`
+You can also get an *estimation* of the number **of partitions** (not rows) with :doc:`nodetool tablestats </operating-scylla/nodetool-commands/tablestats>`.
+
+.. note::
+    ScyllaDB 5.1 includes improvements to speed up the execution of SELECT COUNT(*) queries. 
+    To increase the count speed, we recommend upgrading to ScyllaDB 5.1 or later. 
+ 
+
+ .. REMOVE IN FUTURE VERSIONS - Remove the note above in version 5.1.
--- a/docs/kb/index.rst
+++ b/docs/kb/index.rst
@@ -55,6 +55,7 @@ Knowledge Base
  * :doc:`Map CPUs to Scylla Shards </kb/map-cpu>` - Mapping between CPUs and Scylla shards
  * :doc:`Recreate RAID devices </kb/raid-device>` - How to recreate your RAID devices without running scylla-setup
  * :doc:`Configure Scylla Networking with Multiple NIC/IP Combinations </kb/yaml-address>` - examples for setting the different IP addresses in scylla.yaml
+  * :doc:`Updating the Mode in perftune.yaml After a ScyllaDB Upgrade </kb/perftune-modes-sync>`
  * :doc:`Kafka Sink Connector Quickstart </using-scylla/integrations/kafka-connector>`
  * :doc:`Kafka Sink Connector Configuration </using-scylla/integrations/sink-config>`

--- a/docs/kb/perftune-modes-sync.rst
+++ b/docs/kb/perftune-modes-sync.rst
@@ -0,0 +1,48 @@
+==============================================================
+Updating the Mode in perftune.yaml After a ScyllaDB Upgrade
+==============================================================
+
+In versions 5.1 (ScyllaDB Open Source) and 2022.2 (ScyllaDB Enterprise), we improved ScyllaDB's performance by `removing the rx_queues_count from the mode 
+condition <https://github.com/scylladb/seastar/pull/949>`_. As a result, ScyllaDB operates in 
+the ``sq_split`` mode instead of the ``mq`` mode (see :doc:`Seastar Perftune </operating-scylla/admin-tools/perftune>` for information about the modes).
+If you upgrade from an earlier version of ScyllaDB, your cluster's existing nodes may use the ``mq`` mode, 
+while new nodes will use the ``sq_split`` mode. As using different modes across one cluster is not recommended, 
+you should change the configuration to ensure that the ``sq_split`` mode is used on all nodes.
+
+This section describes how to update the `perftune.yaml` file to configure the ``sq_split`` mode on all nodes. 
+
+Procedure
+------------
+The examples below assume that you are using the default locations for storing data and the `scylla.yaml` file, 
+and that your NIC is ``eth5``. 
+
+#. Back up your old configuration. 
+
+   .. code-block:: console
+
+     sudo mv /etc/scylla.d/cpuset.conf /etc/scylla.d/cpuset.conf.old
+     sudo mv /etc/scylla.d/perftune.yaml /etc/scylla.d/perftune.yaml.old
+
+#. Create a new configuration.
+
+   .. code-block:: console
+
+     sudo scylla_sysconfig_setup --nic eth5 --homedir /var/lib/scylla --confdir /etc/scylla
+
+   A new ``/etc/scylla.d/cpuset.conf`` will be generated on the output.
+
+#. Compare the contents of the newly generated ``/etc/scylla.d/cpuset.conf`` with ``/etc/scylla.d/cpuset.conf.old`` you created in step 1.
+    
+     - If they are exactly the same, rename ``/etc/scylla.d/perftune.yaml.old`` you created in step 1 back to ``/etc/scylla.d/perftune.yaml`` and continue to the next node.
+     - If they are different, move on to the next steps.
+
+#. Restart the ``scylla-server`` service.
+
+   .. code-block:: console
+
+     nodetool drain
+     sudo systemctl restart scylla-server
+
+#. Wait for the service to be up and running (similarly to how it is done during a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart>`). It may take a considerable amount of time before the node is in the UN state due to resharding.
+
+#. Continue to the next node.
--- a/docs/kb/tombstones-flush.rst
+++ b/docs/kb/tombstones-flush.rst
@@ -42,7 +42,7 @@ Steps:

 .. code-block:: sh
   
-   nodetool compact <keyspace>.<mytable>;
+   nodetool compact <keyspace> <mytable>;

 5. Alter the table and change the grace period back to the original ``gc_grace_seconds`` value.

--- a/docs/operating-scylla/_common/tools_index.rst
+++ b/docs/operating-scylla/_common/tools_index.rst
@@ -3,8 +3,8 @@
 * :doc:`REST - Scylla REST/HTTP Admin API</operating-scylla/rest>`.
 * :doc:`Tracing </using-scylla/tracing>` - a ScyllaDB tool for debugging and analyzing internal flows in the server. 
 * :doc:`SSTableloader </operating-scylla/admin-tools/sstableloader>` - Bulk load the sstables found in the directory to a Scylla cluster
-* :doc:`scylla-sstable </operating-scylla/admin-tools/scylla-sstable>` - Validates and dumps the content of SStables, generates a histogram, dumps the content of the SStable index.
-* :doc:`scylla-types </operating-scylla/admin-tools/scylla-types/>` - Examines raw values obtained from SStables, logs, coredumps, etc.
+* :doc:`Scylla SStable </operating-scylla/admin-tools/scylla-sstable>` - Validates and dumps the content of SStables, generates a histogram, dumps the content of the SStable index.
+* :doc:`Scylla Types </operating-scylla/admin-tools/scylla-types/>` - Examines raw values obtained from SStables, logs, coredumps, etc.
 * :doc:`cassandra-stress </operating-scylla/admin-tools/cassandra-stress/>` A tool for benchmarking and load testing a Scylla and Cassandra clusters.
 * :doc:`SSTabledump - Scylla 3.0, Scylla Enterprise 2019.1 and newer versions </operating-scylla/admin-tools/sstabledump>`
 * :doc:`SSTable2JSON - Scylla 2.3 and older </operating-scylla/admin-tools/sstable2json>`
--- a/docs/operating-scylla/admin-tools/index.rst
+++ b/docs/operating-scylla/admin-tools/index.rst
@@ -9,8 +9,8 @@ Admin Tools
   CQLSh </cql/cqlsh>
   REST </operating-scylla/rest>
   Tracing </using-scylla/tracing>
-   scylla-sstable
-   scylla-types </operating-scylla/admin-tools/scylla-types/>
+   Scylla SStable </operating-scylla/admin-tools/scylla-sstable/>
+   Scylla Types </operating-scylla/admin-tools/scylla-types/>
   sstableloader
   cassandra-stress </operating-scylla/admin-tools/cassandra-stress/>
   sstabledump
--- a/docs/operating-scylla/admin-tools/scylla-sstable.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst
@@ -1,4 +1,4 @@
-scylla-sstable
+Scylla SStable
 ==============

 .. versionadded:: 5.0
@@ -9,7 +9,17 @@ Introduction
 This tool allows you to examine the content of SStables by performing operations such as dumping the content of SStables,
 generating a histogram, validating the content of SStables, and more. See `Supported Operations`_ for the list of available operations.

-Run ``scylla-sstable --help`` for additional information about the tool and the operations.
+Run ``scylla sstable --help`` for additional information about the tool and the operations.
+
+This tool is similar to SStableDump_, with notable differences:
+
+* Built on the ScyllaDB C++ codebase, it supports all SStable formats and components that ScyllaDB supports.
+* Expanded scope: this tool supports much more than dumping SStable data components (see `Supported Operations`_).
+* More flexible on how schema is obtained and where SStables are located: SStableDump_ only supports dumping SStables located in their native data directory. To dump an SStable, one has to clone the entire ScyllaDB data directory tree, including system table directories and even config files. ``scylla sstable`` can dump sstables from any path with multiple choices on how to obtain the schema, see Schema_.
+
+Currently, SStableDump_ works better on production systems as it automatically loads the schema from the system tables, unlike ``scylla sstable``, which has to be provided with the schema explicitly. On the other hand, ``scylla sstable`` works better for off-line investigations, as it can be used with as little as just a schema definition file and a single sstable. In the future we plan on closing this gap -- adding support for automatic schema-loading for ``scylla sstable`` too -- and completely supplanting SStableDump_ with ``scylla sstable``.
+
+.. _SStableDump: /operating-scylla/admin-tools/sstabledump

 Usage
 ------
@@ -21,11 +31,82 @@ The command syntax is as follows:

 .. code-block:: console

-   scylla-sstable <operation> <path to SStable>
+   scylla sstable <operation> <path to SStable>


 You can specify more than one SStable.

+Schema
+^^^^^^
+All operations need a schema to interpret the SStables with.
+Currently, there are two ways to obtain the schema:
+
+* ``--schema-file FILENAME`` - Read the schema definition from a file.
+* ``--system-schema KEYSPACE.TABLE`` - Use the known definition of built-in tables (only works for system tables).
+
+By default, the tool uses the first method: ``--schema-file schema.cql``; i.e. it assumes there is a schema file named ``schema.cql`` in the working directory.
+If this fails, it will exit with an error.
+
+The schema file should contain all definitions needed to interpret data belonging to the table.
+
+Example ``schema.cql``:
+
+.. code-block:: cql
+
+    CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'mydc1': 1, 'mydc2': 4};
+
+    CREATE TYPE ks.mytype (
+        f1 int,
+        f2 text
+    );
+
+    CREATE TABLE ks.cf (
+        pk int,
+        ck text,
+        v1 int,
+        v2 mytype,
+        PRIMARY KEY (pk, ck)
+    );
+
+Note:
+
+* In addition to the table itself, the definition also has to include any user-defined types the table uses.
+* The keyspace definition is optional; if it is missing, one will be auto-generated.
+* The schema file doesn't have to be called ``schema.cql``, this is just the default name. Any file name is supported (with any extension).
+
+Dropped columns
+***************
+
+The examined sstable might have columns which were dropped from the schema definition. In this case, providing the up-to-date schema will not be enough: the tool will fail when attempting to process a cell for the dropped column.
+Dropped columns can be provided to the tool in the form of insert statements into the ``system_schema.dropped_columns`` system table, in the schema definition file. Example:
+
+.. code-block:: cql
+
+    INSERT INTO system_schema.dropped_columns (
+        keyspace_name,
+        table_name,
+        column_name,
+        dropped_time,
+        type
+    ) VALUES (
+        'ks',
+        'cf',
+        'v1',
+        1631011979170675,
+        'int'
+    );
+
+    CREATE TABLE ks.cf (pk int PRIMARY KEY, v2 int);
+
+System tables
+*************
+
+If the examined table is a system table -- it belongs to one of the system keyspaces (``system``, ``system_schema``, ``system_distributed`` or ``system_distributed_everywhere``) -- you can just tell the tool to use the known built-in definition of said table. This is possible with the ``--system-schema`` flag. Example:
+
+.. code-block:: console
+
+    scylla sstable dump-data --system-schema system.local ./path/to/md-123456-big-Data.db
+
 Supported Operations
 ^^^^^^^^^^^^^^^^^^^^^^^
 The ``dump-*`` operations output JSON. For ``dump-data``, you can specify another output format.
@@ -56,17 +137,17 @@ Dumping the content of the SStable:

 .. code-block:: console

-   scylla-sstable dump-data /path/to/md-123456-big-Data.db
+   scylla sstable dump-data /path/to/md-123456-big-Data.db

 Dumping the content of two SStables as a unified stream:

 .. code-block:: console

-   scylla-sstable dump-data --merge /path/to/md-123456-big-Data.db /path/to/md-123457-big-Data.db
+   scylla sstable dump-data --merge /path/to/md-123456-big-Data.db /path/to/md-123457-big-Data.db


 Validating the specified SStables:

 .. code-block:: console

-   scylla-sstable validate /path/to/md-123456-big-Data.db /path/to/md-123457-big-Data.db
+   scylla sstable validate /path/to/md-123456-big-Data.db /path/to/md-123457-big-Data.db
--- a/docs/operating-scylla/admin-tools/scylla-types.rst
+++ b/docs/operating-scylla/admin-tools/scylla-types.rst
@@ -1,4 +1,4 @@
-scylla-types
+Scylla Types
 ==============

 .. versionadded:: 5.0
@@ -26,7 +26,7 @@ The command syntax is as follows:
 * Provide the values in the hex form without a leading 0x prefix.
 * You must specify the type of the provided values. See :ref:`Specifying the Value Type <scylla-types-type>`.
 * The number of provided values depends on the operation. See :ref:`Supported Operations <scylla-types-operations>` for details.
-* The scylla-types operations come with additional options. See :ref:`Additional Options <scylla-types-options>` for the list of options.
+* The ``scylla types`` operations come with additional options. See :ref:`Additional Options <scylla-types-options>` for the list of options.

 .. _scylla-types-type:

--- a/docs/operating-scylla/admin-tools/sstabledump.rst
+++ b/docs/operating-scylla/admin-tools/sstabledump.rst
@@ -4,8 +4,10 @@ SSTabledump
 This tool allows you to converts SSTable into a JSON format file.
 SSTabledump supported when using Scylla 3.0, Scylla Enterprise 2019.1, and newer versions.
 In older versions, the tool is named SSTable2json_.
+If you need more flexibility or want to dump more than just the data-component, see scylla-sstable_.

 .. _SSTable2json: /operating-scylla/admin-tools/sstable2json
+.. _scylla-sstable: /operating-scylla/admin-tools/scylla-sstable

 Use the full path to the data file when executing the command.

--- a/docs/operating-scylla/index.rst
+++ b/docs/operating-scylla/index.rst
@@ -9,12 +9,9 @@ Scylla for Administrators
   Procedures <procedures/index>
   security/index
   admin-tools/index
-   manager/index
   ScyllaDB Monitoring Stack <https://monitoring.docs.scylladb.com/>
   ScyllaDB Operator <https://operator.docs.scylladb.com/>
   ScyllaDB Manager <https://manager.docs.scylladb.com/>
-   Scylla Monitoring Stack <monitoring/index>
-   Scylla Operator <scylla-operator/index>
   Upgrade Procedures </upgrade/index>
   System Configuration <system-configuration/index>
   benchmarking-scylla
@@ -36,15 +33,9 @@ Scylla for Administrators
  :class: my-panel
    
  * :doc:`Scylla Tools </operating-scylla/admin-tools/index>` - Tools for Administrating and integrating with Scylla
-<<<<<<< HEAD
-  * :doc:`Scylla Manager </operating-scylla/manager/index>` - Tool for cluster administration and automation
  * `ScyllaDB Monitoring Stack <https://monitoring.docs.scylladb.com/stable/>`_ - Tool for cluster monitoring and alerting
  * `ScyllaDB Operator <https://operator.docs.scylladb.com>`_ - Tool to run Scylla on Kubernetes
-=======
  * `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_ - Tool for cluster administration and automation
-  * :doc:`Scylla Monitoring Stack </operating-scylla/monitoring/index>` - Tool for cluster monitoring and alerting
-  * :doc:`Scylla Operator </operating-scylla/scylla-operator/index>` - Tool to run Scylla on Kubernetes
->>>>>>> 40050f951 (doc: add the link to manager.docs.scylladb.com to the toctree)
  * :doc:`Scylla Logs </getting-started/logging/>`

 .. panel-box::
--- a/docs/operating-scylla/nodetool-commands/refresh.rst
+++ b/docs/operating-scylla/nodetool-commands/refresh.rst
@@ -22,22 +22,4 @@ For example:
 ``nodetool refresh nba player_stats``


-Load and Stream
---------------
-
-.. versionadded:: 4.6
-
-.. code::
-
-   nodetool refresh <my_keyspace> <my_table> [--load-and-stream | -las]
-
-The Load and Stream feature extends nodetool refresh. The new ``-las`` option loads arbitrary sstables that do not belong to a node into the cluster. It loads the sstables from the disk and calculates the data's owning nodes, and streams automatically.
-For example, say the old cluster has 6 nodes and the new cluster has 3 nodes. We can copy the sstables from the old cluster to any of the new nodes and trigger the load and stream process.
-
-Load and Stream make restores and migrations much easier:
-
-* You can place sstable from every node to every node
-* No need to run nodetool cleanup to remove unused data
-
-
 .. include:: nodetool-index.rst
--- a/docs/operating-scylla/nodetool-commands/removenode.rst
+++ b/docs/operating-scylla/nodetool-commands/removenode.rst
@@ -25,7 +25,7 @@ Example:

 .. code-block:: console

-    nodetool removenode 192.168.1.3
+    nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy

 Note that all the nodes in the cluster participate in the ``removenode`` operation to sync data if needed. For this reason, the operation will fail if one or more nodes in the cluster are not available.
 In such a case, to ensure that the operation succeeds, you must explicitly specify a list of unavailable nodes with the ``--ignore-dead-nodes`` option.
@@ -41,8 +41,7 @@ Example:

 .. code-block:: console

-    nodetool removenode 192.168.1.3
-    nodetool removenode --ignore-dead-nodes 192.168.1.4,192.168.1.5 192.168.1.3
+    nodetool removenode --ignore-dead-nodes 192.168.1.4,192.168.1.5 675ed9f4-6564-6dbd-can8-43fddce952gy


 .. versionadded:: version 4.6 ``--ignore-dead-nodes`` option    
--- a/docs/operating-scylla/nodetool-commands/repair.rst
+++ b/docs/operating-scylla/nodetool-commands/repair.rst
@@ -41,14 +41,6 @@ Scylla nodetool repair command supports the following options:

     nodetool repair -et 90874935784
     nodetool repair --end-token 90874935784
-
- ``-seq``, ``--sequential`` Use *-seq* to carry out a sequential repair.
-
-  For example, a sequential repair of all keyspaces on a node:
-
-  ::
-
-     nodetool repair -seq
     
 - ``-hosts`` ``--in-hosts`` syncs the **repair master** data subset only between a list of nodes, using host ID or Address. The list *must* include the **repair master**.

--- a/Show More
+++ b/Show More