Merge 'repair: Fix finished ranges metrics for removenode' from Asias He

The skipped ranges should be multiplied by the number of tables. Otherwise the finished ranges ratio will not reach 100%. Fixes #21174 Closes scylladb/scylladb#21252 * github.com:scylladb/scylladb: test: Add test_node_ops_metrics.py repair: Make the ranges more consistent in the log repair: Fix finished ranges metrics for removenode (cherry picked from commit be70755f47)
[Backport 6.0] replica/table: check memtable before discarding tombstone during read
2024-10-29 09:50:07 +02:00 · 2024-10-25 11:20:24 +03:00 · 2024-10-25 11:18:58 +03:00 · 2024-10-25 11:18:32 +03:00 · 2024-10-25 11:18:12 +03:00 · 2024-10-23 11:48:45 +02:00
409 changed files with 10903 additions and 3035 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1 +0,0 @@
-**Please replace this line with justification for the backport/\* labels added to this PR**
--- a/.github/scripts/label_promoted_commits.py
+++ b/.github/scripts/label_promoted_commits.py
@@ -1,9 +1,9 @@
-import requests
-from github import Github
 import argparse
 import re
 import sys
 import os
+from github import Github
+from github.GithubException import UnknownObjectException

 try:
    github_token = os.environ["GITHUB_TOKEN"]
@@ -23,36 +23,68 @@ def parser():
                             'commit, exclusive).')
    parser.add_argument('--update_issue', type=bool, default=False, help='Set True to update issues when backport was '
                                                                         'done')
-    parser.add_argument('--label', type=str, required=True, help='Label to use')
+    parser.add_argument('--ref', type=str, required=True, help='PR target branch')
    return parser.parse_args()


+def add_comment_and_close_pr(pr, comment):
+    if pr.state == 'open':
+        pr.create_issue_comment(comment)
+        pr.edit(state="closed")
+
+
+def mark_backport_done(repo, ref_pr_number, branch):
+    pr = repo.get_pull(int(ref_pr_number))
+    label_to_remove = f'backport/{branch}'
+    label_to_add = f'{label_to_remove}-done'
+    current_labels = [label.name for label in pr.get_labels()]
+    if label_to_remove in current_labels:
+        pr.remove_from_labels(label_to_remove)
+    if label_to_add not in current_labels:
+        pr.add_to_labels(label_to_add)
+
+
 def main():
+    # This script is triggered by a push event to either the master branch or a branch named branch-x.y (where x and y represent version numbers). Based on the pushed branch, the script performs the following actions:
+    # - When ref branch is `master`, it will add the `promoted-to-master` label, which we need later for the auto backport process
+    # - When ref branch is `branch-x.y` (which means we backported a patch), it will replace in the original PR the `backport/x.y` label with `backport/x.y-done` and will close the backport PR (since GitHub automatically closes only PRs that target the default branch)
    args = parser()
    pr_pattern = re.compile(r'Closes .*#([0-9]+)')
+    target_branch = re.search(r'branch-(\d+\.\d+)', args.ref)
    g = Github(github_token)
    repo = g.get_repo(args.repository, lazy=False)
-
    commits = repo.compare(head=args.commit_after_merge, base=args.commit_before_merge)
+    processed_prs = set()
    # Print commit information
    for commit in commits.commits:
-        print(commit.sha)
+        print(f'Commit sha is: {commit.sha}')
        match = pr_pattern.search(commit.commit.message)
        if match:
-            pr_number = match.group(1)
-            url = f'https://api.github.com/repos/{args.repository}/issues/{pr_number}/labels'
-            data = {
-                "labels": [f'{args.label}']
-            }
-            headers = {
-                "Authorization": f"token {github_token}",
-                "Accept": "application/vnd.github.v3+json"
-            }
-            response = requests.post(url, headers=headers, json=data)
-            if response.ok:
-                print(f"Label added successfully to {url}")
+            pr_number = int(match.group(1))
+            if pr_number in processed_prs:
+                continue
+            if target_branch:
+                pr = repo.get_pull(pr_number)
+                branch_name = target_branch[1]
+                refs_pr = re.findall(r'Refs (?:#|https.*?)(\d+)', pr.body)
+                if refs_pr:
+                    print(f'branch-{target_branch.group(1)}, pr number is: {pr_number}')
+                    # 1. change the backport label of the parent PR to note that
+                    #    we've merged the corresponding backport PR
+                    # 2. close the backport PR and leave a comment on it to note
+                    #    that it has been merged with a certain git commit.
+                    ref_pr_number = refs_pr[0]
+                    mark_backport_done(repo, ref_pr_number, branch_name)
+                    comment = f'Closed via {commit.sha}'
+                    add_comment_and_close_pr(pr, comment)
            else:
-                print(f"No label was added to {url}")
+                try:
+                    pr = repo.get_pull(pr_number)
+                    pr.add_to_labels('promoted-to-master')
+                    print(f'master branch, pr number is: {pr_number}')
+                except UnknownObjectException:
+                    print(f'{pr_number} is not a PR but an issue, no need to add label')
+            processed_prs.add(pr_number)


 if __name__ == "__main__":
--- a/.github/workflows/add-label-when-promoted.yaml
+++ b/.github/workflows/add-label-when-promoted.yaml
@@ -4,6 +4,10 @@ on:
  push:
    branches:
      - master
+      - branch-*.*
+
+env:
+  DEFAULT_BRANCH: 'master'

 jobs:
  check-commit:
@@ -15,6 +19,8 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
+          repository: ${{ github.repository }}
+          ref: ${{ env.DEFAULT_BRANCH }}
          fetch-depth: 0  # Fetch all history for all tags and branches

      - name: Install dependencies
@@ -23,4 +29,4 @@ jobs:
      - name: Run python script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/label_promoted_commits.py --commit_before_merge ${{ github.event.before }} --commit_after_merge ${{ github.event.after }} --repository ${{ github.repository }} --label promoted-to-master
+        run: python .github/scripts/label_promoted_commits.py --commit_before_merge ${{ github.event.before }} --commit_after_merge ${{ github.event.after }} --repository ${{ github.repository }} --ref ${{ github.ref }}
--- a/.gitignore
+++ b/.gitignore
@@ -18,7 +18,7 @@ CMakeLists.txt.user
 *.egg-info
 __pycache__CMakeLists.txt.user
 .gdbinit
-resources
+/resources
 .pytest_cache
 /expressions.tokens
 tags
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.5.0-dev
+VERSION=6.0.5

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -9,6 +9,7 @@
 #include <fmt/ranges.h>
 #include <seastar/core/sleep.hh>
 #include "alternator/executor.hh"
+#include "cdc/log.hh"
 #include "db/config.hh"
 #include "log.hh"
 #include "schema/schema_builder.hh"
@@ -4439,8 +4440,10 @@ future<executor::request_return_type> executor::list_tables(client_state& client

    auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
    auto table_names = tables
-            | boost::adaptors::filtered([] (data_dictionary::table t) {
-                        return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 && !t.schema()->is_view();
+            | boost::adaptors::filtered([this] (data_dictionary::table t) {
+                        return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 &&
+                            !t.schema()->is_view() &&
+                            !cdc::is_log_for_some_table(_proxy.local_db(), t.schema()->ks_name(), t.schema()->cf_name());
                    })
            | boost::adaptors::transformed([] (data_dictionary::table t) {
                        return t.schema()->cf_name();
@@ -4576,7 +4579,7 @@ static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_vie
    // used by default on new Alternator tables. Change this initialization
    // to 0 enable tablets by default, with automatic number of tablets.
    std::optional<unsigned> initial_tablets;
-    if (sp.get_db().local().get_config().check_experimental(db::experimental_features_t::feature::TABLETS)) {
+    if (sp.get_db().local().get_config().enable_tablets()) {
        auto it = tags_map.find(INITIAL_TABLETS_TAG_KEY);
        if (it != tags_map.end()) {
            // Tag set. If it's a valid number, use it. If not - e.g., it's
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -211,7 +211,10 @@ protected:
        sstring local_dc = topology.get_datacenter();
        std::unordered_set<gms::inet_address> local_dc_nodes = topology.get_datacenter_endpoints().at(local_dc);
        for (auto& ip : local_dc_nodes) {
-            if (_gossiper.is_alive(ip)) {
+            // Note that it's not enough for the node to be is_alive() - a
+            // node joining the cluster is also "alive" but not responsive to
+            // requests. We need the node to be in normal state. See #19694.
+            if (_gossiper.is_normal(ip)) {
                rjson::push_back(results, rjson::from_string(fmt::to_string(ip)));
            }
        }
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -26,6 +26,7 @@
 #include "log.hh"
 #include "gc_clock.hh"
 #include "replica/database.hh"
+#include "service/client_state.hh"
 #include "service_permit.hh"
 #include "timestamp.hh"
 #include "service/storage_proxy.hh"
@@ -498,6 +499,7 @@ struct scan_ranges_context {
    bytes column_name;
    std::optional<std::string> member;

+    service::client_state internal_client_state;
    ::shared_ptr<cql3::selection::selection> selection;
    std::unique_ptr<service::query_state> query_state_ptr;
    std::unique_ptr<cql3::query_options> query_options;
@@ -507,6 +509,7 @@ struct scan_ranges_context {
        : s(s)
        , column_name(column_name)
        , member(member)
+        , internal_client_state(service::client_state::internal_tag())
    {
        // FIXME: don't read the entire items - read only parts of it.
        // We must read the key columns (to be able to delete) and also
@@ -525,10 +528,9 @@ struct scan_ranges_context {
        std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
        auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
        command = ::make_lw_shared<query::read_command>(s->id(), s->version(), partition_slice, proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
-        executor::client_state client_state{executor::client_state::internal_tag()};
        tracing::trace_state_ptr trace_state;
        // NOTICE: empty_service_permit is used because the TTL service has fixed parallelism
-        query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, empty_service_permit());
+        query_state_ptr = std::make_unique<service::query_state>(internal_client_state, trace_state, empty_service_permit());
        // FIXME: What should we do on multi-DC? Will we run the expiration on the same ranges on all
        // DCs or only once for each range? If the latter, we need to change the CLs in the
        // scanner and deleter.
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -63,6 +63,28 @@
                     "paramType":"path"
                  }
               ]
+            },
+            {
+               "method":"GET",
+               "summary":"Read the state of an injection from all shards",
+               "type":"array",
+               "items":{
+                  "type":"error_injection_info"
+               },
+               "nickname":"read_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
            }
         ]
      },
@@ -152,5 +174,39 @@
            }
         }
      }
+   },
+   "models":{
+      "mapper":{
+         "id":"mapper",
+         "description":"A key value mapping",
+         "properties":{
+            "key":{
+               "type":"string",
+               "description":"The key"
+            },
+            "value":{
+               "type":"string",
+               "description":"The value"
+            }
+         }
+      },
+       "error_injection_info":{
+         "id":"error_injection_info",
+         "description":"Information about an error injection",
+         "properties":{
+            "enabled":{
+               "type":"boolean",
+               "description":"Is the error injection enabled"
+            },
+            "parameters":{
+               "type":"array",
+               "items":{
+                  "type":"mapper"
+               },
+               "description":"The parameter values"
+            }
+         },
+         "required":["enabled"]
+      }
   }
 }
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1913,6 +1913,14 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"force",
+                     "description":"Enforce the source_dc option, even if it is unsafe to use for rebuild",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -194,6 +194,21 @@
               "parameters":[]
            }
         ]
+      },
+      {
+         "path":"/system/highest_supported_sstable_version",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get highest supported sstable version",
+               "type":"string",
+               "nickname":"get_highest_supported_sstable_version",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
      }
   ]
 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -366,6 +366,14 @@ ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared
    return ratio_holder(f + sst->filter_get_recent_true_positive(), f);
 }

+uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<uint64_t(replica::memtable& mt)> action) {
+    uint64_t ret = 0;
+    t.for_each_active_memtable([&] (replica::memtable& mt) {
+        ret += action(mt);
+    });
+    return ret;
+}
+
 void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace>& sys_ks) {
    cf::get_column_family_name.set(r, [&ctx] (const_req req){
        std::vector<sstring> res;
@@ -401,13 +409,13 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t{0}, [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
+            return accumulate_on_active_memtables(cf, std::mem_fn(&replica::memtable::partition_count));
        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, uint64_t{0}, [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
+            return accumulate_on_active_memtables(cf, std::mem_fn(&replica::memtable::partition_count));
        }, std::plus<>());
    });

@@ -421,33 +429,33 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

    cf::get_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
-                return active_memtable->region().occupancy().total_space();
-            }), uint64_t(0));
+            return accumulate_on_active_memtables(cf, [] (replica::memtable& active_memtable) {
+                return active_memtable.region().occupancy().total_space();
+            });
        }, std::plus<int64_t>());
    });

    cf::get_all_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
-                return active_memtable->region().occupancy().total_space();
-            }), uint64_t(0));
+            return accumulate_on_active_memtables(cf, [] (replica::memtable& active_memtable) {
+                return active_memtable.region().occupancy().total_space();
+            });
        }, std::plus<int64_t>());
    });

    cf::get_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
-                return active_memtable->region().occupancy().used_space();
-            }), uint64_t(0));
+            return accumulate_on_active_memtables(cf, [] (replica::memtable& active_memtable) {
+                return active_memtable.region().occupancy().used_space();
+            });
        }, std::plus<int64_t>());
    });

    cf::get_all_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
-                return active_memtable->region().occupancy().used_space();
-            }), uint64_t(0));
+            return accumulate_on_active_memtables(cf, [] (replica::memtable& active_memtable) {
+                return active_memtable.region().occupancy().used_space();
+            });
        }, std::plus<int64_t>());
    });

@@ -485,9 +493,9 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
    cf::get_all_cf_all_memtables_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
        warn(unimplemented::cause::INDEXES);
        return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
-            return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
-                return active_memtable->region().occupancy().used_space();
-            }), uint64_t(0));
+            return accumulate_on_active_memtables(cf, [] (replica::memtable& active_memtable) {
+                return active_memtable.region().occupancy().used_space();
+            });
        }, std::plus<int64_t>());
    });

--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -7,6 +7,7 @@
 */

 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/exception.hh>

 #include "compaction_manager.hh"
 #include "compaction/compaction_manager.hh"
@@ -153,10 +154,13 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    });

    cm::get_compaction_history.set(r, [&ctx] (std::unique_ptr<http::request> req) {
-        std::function<future<>(output_stream<char>&&)> f = [&ctx](output_stream<char>&& s) {
-            return do_with(output_stream<char>(std::move(s)), true, [&ctx] (output_stream<char>& s, bool& first){
-                return s.write("[").then([&ctx, &s, &first] {
-                    return ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable {
+        std::function<future<>(output_stream<char>&&)> f = [&ctx] (output_stream<char>&& out) -> future<> {
+            auto s = std::move(out);
+            bool first = true;
+            std::exception_ptr ex;
+            try {
+                co_await s.write("[");
+                co_await ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable -> future<> {
                        cm::history h;
                        h.id = fmt::to_string(entry.id);
                        h.ks = std::move(entry.ks);
@@ -170,18 +174,21 @@ void set_compaction_manager(http_context& ctx, routes& r) {
                            e.value = it.second;
                            h.rows_merged.push(std::move(e));
                        }
-                        auto fut = first ? make_ready_future<>() : s.write(", ");
+                        if (!first) {
+                            co_await s.write(", ");
+                        }
                        first = false;
-                        return fut.then([&s, h = std::move(h)] {
-                            return formatter::write(s, h);
-                        });
-                    }).then([&s] {
-                        return s.write("]").then([&s] {
-                            return s.close();
-                        });
+                        co_await formatter::write(s, h);
                    });
-                });
-            });
+                co_await s.write("]");
+                co_await s.flush();
+            } catch (...) {
+                ex = std::current_exception();
+            }
+            co_await s.close();
+            if (ex) {
+                co_await coroutine::return_exception_ptr(std::move(ex));
+            }
        };
        return make_ready_future<json::json_return_type>(std::move(f));
    });
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -64,6 +64,32 @@ void set_error_injection(http_context& ctx, routes& r) {
        });
    });

+    hf::read_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        const sstring injection = req->get_path_param("injection");
+
+        std::vector<error_injection_json::error_injection_info> error_injection_infos(smp::count, error_injection_json::error_injection_info{});
+
+        co_await smp::invoke_on_all([&] {
+            auto& info = error_injection_infos[this_shard_id()];
+            auto& errinj = utils::get_local_injector();
+            const auto enabled = errinj.is_enabled(injection);
+            info.enabled = enabled;
+            if (!enabled) {
+                return;
+            }
+            std::vector<error_injection_json::mapper> parameters;
+            for (const auto& p : errinj.get_injection_parameters(injection)) {
+                error_injection_json::mapper param;
+                param.key = p.first;
+                param.value = p.second;
+                parameters.push_back(std::move(param));
+            }
+            info.parameters = std::move(parameters);
+        });
+
+        co_return json::json_return_type(error_injection_infos);
+    });
+
    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
        auto& errinj = utils::get_local_injector();
        return errinj.disable_on_all().then([] {
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -61,17 +61,31 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis
        co_return json_void{};
    });
    r::get_leader_host.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
-        return smp::submit_to(0, [&] {
-            auto& srv = std::invoke([&] () -> raft::server& {
-                if (req->query_parameters.contains("group_id")) {
-                    raft::group_id id{utils::UUID{req->get_query_param("group_id")}};
-                    return raft_gr.local().get_server(id);
-                } else {
-                    return raft_gr.local().group0();
-                }
+        if (!req->query_parameters.contains("group_id")) {
+            const auto leader_id = co_await raft_gr.invoke_on(0, [] (service::raft_group_registry& raft_gr) {
+                auto& srv = raft_gr.group0();
+                return srv.current_leader();
            });
-            return json_return_type(srv.current_leader().to_sstring());
+            co_return json_return_type{leader_id.to_sstring()};
+        }
+
+        const raft::group_id gid{utils::UUID{req->get_query_param("group_id")}};
+
+        std::atomic<bool> found_srv{false};
+        std::atomic<raft::server_id> leader_id = raft::server_id::create_null_id();
+        co_await raft_gr.invoke_on_all([gid, &found_srv, &leader_id] (service::raft_group_registry& raft_gr) {
+            if (raft_gr.find_server(gid)) {
+                found_srv = true;
+                leader_id = raft_gr.get_server(gid).current_leader();
+            }
+            return make_ready_future<>();
        });
+
+        if (!found_srv) {
+            throw bad_param_exception{fmt::format("Server for group ID {} not found", gid)};
+        }
+
+        co_return json_return_type(leader_id.load().to_sstring());
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -36,6 +36,7 @@
 #include <seastar/http/exception.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/coroutine/exception.hh>
 #include "repair/row_level.hh"
 #include "locator/snitch_base.hh"
 #include "column_family.hh"
@@ -54,6 +55,7 @@
 #include "locator/abstract_replication_strategy.hh"
 #include "sstables_loader.hh"
 #include "db/view/view_builder.hh"
+#include "utils/user_provided_param.hh"

 using namespace seastar::httpd;
 using namespace std::chrono_literals;
@@ -1136,7 +1138,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::rebuild.set(r, [&ss](std::unique_ptr<http::request> req) {
-        auto source_dc = req->get_query_param("source_dc");
+        utils::optional_param source_dc;
+        if (auto source_dc_str = req->get_query_param("source_dc"); !source_dc_str.empty()) {
+            source_dc.emplace(std::move(source_dc_str)).set_user_provided();
+        }
+        if (auto force_str = req->get_query_param("force"); !force_str.empty() && service::loosen_constraints(validate_bool(force_str))) {
+            if (!source_dc) {
+                throw bad_param_exception("The `source_dc` option must be provided for using the `force` option");
+            }
+            source_dc.set_force();
+        }
        apilog.info("rebuild: source_dc={}", source_dc);
        return ss.local().rebuild(std::move(source_dc)).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -1685,32 +1696,41 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
    ss::get_snapshot_details.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto result = co_await snap_ctl.local().get_snapshot_details();
        co_return std::function([res = std::move(result)] (output_stream<char>&& o) -> future<> {
-            auto result = std::move(res);
+            std::exception_ptr ex;
            output_stream<char> out = std::move(o);
-            bool first = true;
+            try {
+                auto result = std::move(res);
+                bool first = true;

-            co_await out.write("[");
-            for (auto& [name, details] : result) {
-                if (!first) {
-                    co_await out.write(", ");
+                co_await out.write("[");
+                for (auto& [name, details] : result) {
+                    if (!first) {
+                        co_await out.write(", ");
+                    }
+                    std::vector<ss::snapshot> snapshot;
+                    for (auto& cf : details) {
+                        ss::snapshot snp;
+                        snp.ks = cf.ks;
+                        snp.cf = cf.cf;
+                        snp.live = cf.details.live;
+                        snp.total = cf.details.total;
+                        snapshot.push_back(std::move(snp));
+                    }
+                    ss::snapshots all_snapshots;
+                    all_snapshots.key = name;
+                    all_snapshots.value = std::move(snapshot);
+                    co_await all_snapshots.write(out);
+                    first = false;
                }
-                std::vector<ss::snapshot> snapshot;
-                for (auto& cf : details) {
-                    ss::snapshot snp;
-                    snp.ks = cf.ks;
-                    snp.cf = cf.cf;
-                    snp.live = cf.details.live;
-                    snp.total = cf.details.total;
-                    snapshot.push_back(std::move(snp));
-                }
-                ss::snapshots all_snapshots;
-                all_snapshots.key = name;
-                all_snapshots.value = std::move(snapshot);
-                co_await all_snapshots.write(out);
-                first = false;
+                co_await out.write("]");
+                co_await out.flush();
+            } catch (...) {
+              ex = std::current_exception();
            }
-            co_await out.write("]");
            co_await out.close();
+            if (ex) {
+                co_await coroutine::return_exception_ptr(std::move(ex));
+            }
        });
    });

--- a/api/system.cc
+++ b/api/system.cc
@@ -10,6 +10,7 @@
 #include "api/api-doc/system.json.hh"
 #include "api/api-doc/metrics.json.hh"
 #include "replica/database.hh"
+#include "sstables/sstables_manager.hh"

 #include <rapidjson/document.h>
 #include <seastar/core/reactor.hh>
@@ -182,6 +183,11 @@ void set_system(http_context& ctx, routes& r) {
        apilog.info("Profile dumped to {}", profile_dest);
        return make_ready_future<json::json_return_type>(json::json_return_type(json::json_void()));
    }) ;
+
+    hs::get_highest_supported_sstable_version.set(r, [&ctx] (const_req req) {
+        auto& table = ctx.db.local().find_column_family("system", "local");
+        return seastar::to_sstring(table.get_sstables_manager().get_highest_supported_format());
+    });
 }

 }
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -7,6 +7,7 @@
 */

 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/exception.hh>
 #include <seastar/http/exception.hh>

 #include "task_manager.hh"
@@ -23,6 +24,8 @@ namespace tm = httpd::task_manager_json;
 using namespace json;
 using namespace seastar::httpd;

+using task_variant = std::variant<tasks::task_manager::foreign_task_ptr, tasks::task_manager::task::task_essentials>;
+
 inline bool filter_tasks(tasks::task_manager::task_ptr task, std::unordered_map<sstring, sstring>& query_params) {
    return (!query_params.contains("keyspace") || query_params["keyspace"] == task->get_status().keyspace) &&
        (!query_params.contains("table") || query_params["table"] == task->get_status().table);
@@ -102,13 +105,14 @@ future<full_task_status> retrieve_status(const tasks::task_manager::foreign_task
    s.module = task->get_module_name();
    s.progress.completed = progress.completed;
    s.progress.total = progress.total;
-    std::vector<std::string> ct{task->get_children().size()};
-    boost::transform(task->get_children(), ct.begin(), [] (const auto& child) {
+    std::vector<std::string> ct = co_await task->get_children().map_each_task<std::string>([] (const tasks::task_manager::foreign_task_ptr& child) {
        return child->id().to_sstring();
+    }, [] (const tasks::task_manager::task::task_essentials& child) {
+        return child.task_status.id.to_sstring();
    });
    s.children_ids = std::move(ct);
    co_return s;
-}
+};

 void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>& tm, db::config& cfg) {
    tm::get_modules.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -138,19 +142,28 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>

        std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
            auto s = std::move(os);
-            auto res = std::move(r);
-            co_await s.write("[");
-            std::string delim = "";
-            for (auto& v: res) {
-                for (auto& stats: v) {
-                    co_await s.write(std::exchange(delim, ", "));
-                    tm::task_stats ts;
-                    ts = stats;
-                    co_await formatter::write(s, ts);
+            std::exception_ptr ex; // holds a failure from the write phase until after close()
+            try {
+                auto res = std::move(r); // NOTE(review): the lambda is not `mutable`, so `r` is const here and this presumably copies - confirm
+                co_await s.write("[");
+                std::string delim = ""; // empty before the first element, ", " afterwards
+                for (auto& v: res) {
+                    for (auto& stats: v) {
+                        co_await s.write(std::exchange(delim, ", "));
+                        tm::task_stats ts;
+                        ts = stats;
+                        co_await formatter::write(s, ts);
+                    }
                }
+                co_await s.write("]");
+                co_await s.flush(); // flush inside the try so a flush failure is captured as well
+            } catch (...) {
+                ex = std::current_exception(); // defer the error so the stream below is still closed
            }
-            co_await s.write("]");
            co_await s.close();
+            if (ex) {
+                co_await coroutine::return_exception_ptr(std::move(ex)); // report the deferred failure once the stream is closed
+            }
        };
        co_return std::move(f);
    });
@@ -179,7 +192,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
                if (!task->is_abortable()) {
                    co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
                }
-                co_await task->abort();
+                task->abort();
            });
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
@@ -193,7 +206,6 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        try {
            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) {
                return task->done().then_wrapped([task] (auto f) {
-                    task->unregister_task();
                    // done() is called only because we want the task to be complete before getting its status.
                    // The future should be ignored here as the result does not matter.
                    f.ignore_ready_future();
@@ -210,7 +222,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
    tm::get_task_status_recursively.set(r, [&_tm = tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& tm = _tm;
        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        std::queue<tasks::task_manager::foreign_task_ptr> q;
+        std::queue<task_variant> q;
        utils::chunked_vector<full_task_status> res;

        tasks::task_manager::foreign_task_ptr task;
@@ -230,10 +242,33 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        q.push(co_await task.copy());   // Task cannot be moved since we need it to be alive during whole loop execution.
        while (!q.empty()) {
            auto& current = q.front();
-            res.push_back(co_await retrieve_status(current));
-            for (auto& child: current->get_children()) {
-                q.push(co_await child.copy());
-            }
+            co_await std::visit(overloaded_functor { // dispatch on which alternative `current` holds
+                [&] (const tasks::task_manager::foreign_task_ptr& task) -> future<> { // case 1: the task is available through a foreign_task_ptr
+                    res.push_back(co_await retrieve_status(task));
+                    co_await task->get_children().for_each_task([&q] (const tasks::task_manager::foreign_task_ptr& child) -> future<> {
+                        q.push(co_await child.copy()); // pointer children are copied before being queued
+                    }, [&] (const tasks::task_manager::task::task_essentials& child) {
+                        q.push(child); // essentials children are queued as-is
+                        return make_ready_future();
+                    });
+                },
+                [&] (const tasks::task_manager::task::task_essentials& task) -> future<> { // case 2: only a task_essentials record is available
+                    res.push_back(full_task_status{ // the status is assembled directly from the record's fields
+                        .task_status = task.task_status,
+                        .type = task.type,
+                        .progress = task.task_progress,
+                        .parent_id = task.parent_id,
+                        .abortable = task.abortable,
+                        .children_ids = boost::copy_range<std::vector<std::string>>(task.failed_children | boost::adaptors::transformed([] (auto& child) { // child ids come from failed_children
+                            return child.task_status.id.to_sstring();
+                        }))
+                    });
+                    for (auto& child: task.failed_children) { // continue the walk through the same failed_children
+                        q.push(child);
+                    }
+                    return make_ready_future();
+                }
+            }, current);
            q.pop();
        }

--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -89,14 +89,13 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        std::string error = fail ? it->second : "";

        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) {
+            co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) -> future<> { // coroutine lambda so the finish calls below can be awaited
                tasks::test_task test_task{task};
                if (fail) {
-                    test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
+                    co_await test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error))); // fail the task with a runtime_error carrying `error`
                } else {
-                    test_task.finish();
+                    co_await test_task.finish(); // awaited before the handler completes
                }
-                return make_ready_future<>();
            });
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -76,7 +76,7 @@ auth::certificate_authenticator::certificate_authenticator(cql3::query_processor
                    continue;
                } catch (std::out_of_range&) {
                    // just fallthrough
-                } catch (std::regex_error&) {
+                } catch (boost::regex_error&) {
                    std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
                }
            }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -24,7 +24,6 @@
 #include "service/raft/group0_state_machine.hh"
 #include "timeout_config.hh"
 #include "db/config.hh"
-#include "db/system_auth_keyspace.hh"
 #include "utils/error_injection.hh"

 namespace auth {
@@ -41,14 +40,14 @@ constinit const std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.")
 static logging::logger auth_log("auth");

 bool legacy_mode(cql3::query_processor& qp) {
-    return qp.auth_version < db::system_auth_keyspace::version_t::v2;
+    return qp.auth_version < db::system_keyspace::auth_version_t::v2; // legacy means the auth version on the query processor is older than v2
 }

 std::string_view get_auth_ks_name(cql3::query_processor& qp) {
    if (legacy_mode(qp)) {
        return meta::legacy::AUTH_KS;
    }
-    return db::system_auth_keyspace::NAME;
+    return db::system_keyspace::NAME; // outside legacy mode the auth keyspace name is the system keyspace name
 }

 // Func must support being invoked more than once.
@@ -73,7 +72,7 @@ static future<> create_legacy_metadata_table_if_missing_impl(
    assert(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only

    auto db = qp.db();
-    auto parsed_statement = cql3::query_processor::parse_statement(cql);
+    auto parsed_statement = cql3::query_processor::parse_statement(cql, cql3::dialect{});
    auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);

    parsed_cf_statement.prepare_keyspace(meta::legacy::AUTH_KS);
@@ -123,7 +122,7 @@ static future<> announce_mutations_with_guard(
        ::service::raft_group0_client& group0_client,
        std::vector<canonical_mutation> muts,
        ::service::group0_guard group0_guard,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    auto group0_cmd = group0_client.prepare_command(
        ::service::write_mutations{
@@ -139,7 +138,7 @@ future<> announce_mutations_with_batching(
        ::service::raft_group0_client& group0_client,
        start_operation_func_t start_operation_func,
        std::function<mutations_generator(api::timestamp_type& t)> gen,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    // account for command's overhead, it's better to use smaller threshold than constantly bounce off the limit
    size_t memory_threshold = group0_client.max_command_size() * 0.75;
@@ -190,7 +189,7 @@ future<> announce_mutations(
        ::service::raft_group0_client& group0_client,
        const sstring query_string,
        std::vector<data_value_or_unset> values,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    auto group0_guard = co_await group0_client.start_operation(as, timeout);
    auto timestamp = group0_guard.write_timestamp();
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -84,7 +84,7 @@ future<> create_legacy_metadata_table_if_missing(
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 // Use this function when need to perform read before write on a single guard or if
 // you have more than one mutation and potentially exceed single command size limit.
-using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source*)>;
+using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source&)>;
 using mutations_generator = coroutine::experimental::generator<mutation>;
 future<> announce_mutations_with_batching(
        ::service::raft_group0_client& group0_client,
@@ -93,7 +93,7 @@ future<> announce_mutations_with_batching(
        // function here
        start_operation_func_t start_operation_func,
        std::function<mutations_generator(api::timestamp_type& t)> gen,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout);

 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
@@ -102,7 +102,7 @@ future<> announce_mutations(
        ::service::raft_group0_client& group0_client,
        const sstring query_string,
        std::vector<data_value_or_unset> values,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout);

 }
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -9,7 +9,7 @@
 */

 #include "auth/default_authorizer.hh"
-#include "db/system_auth_keyspace.hh"
+#include "db/system_keyspace.hh"

 extern "C" {
 #include <crypt.h>
@@ -203,7 +203,7 @@ default_authorizer::modify(
                cql3::query_processor::cache_internal::no).discard_result();
    }
    co_return co_await announce_mutations(_qp, _group0_client, query,
-        {permissions::to_strings(set), sstring(role_name), resource.name()}, &_as, ::service::raft_timeout{});
+        {permissions::to_strings(set), sstring(role_name), resource.name()}, _as, ::service::raft_timeout{});
 }


@@ -256,7 +256,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) {
                    {sstring(role_name)},
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, &_as, ::service::raft_timeout{});
+            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, _as, ::service::raft_timeout{});
        }
    } catch (exceptions::request_execution_exception& e) {
        alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
@@ -346,9 +346,9 @@ future<> default_authorizer::revoke_all(const resource& resource) {
        const auto timeout = ::service::raft_timeout{};
        co_await announce_mutations_with_batching(
                _group0_client,
-                [this, timeout](abort_source* as) { return _group0_client.start_operation(as, timeout); },
+                [this, timeout](abort_source& as) { return _group0_client.start_operation(as, timeout); },
                std::move(gen),
-                &_as,
+                _as,
            timeout);
    } catch (exceptions::request_execution_exception& e) {
        alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", name, e);
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -136,7 +136,7 @@ future<> password_authenticator::create_default_if_missing() {
        plogger.info("Created default superuser authentication record.");
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-            {salted_pwd, _superuser}, &_as, ::service::raft_timeout{});
+            {salted_pwd, _superuser}, _as, ::service::raft_timeout{});
        plogger.info("Created default superuser authentication record.");
    }
 }
@@ -271,7 +271,7 @@ future<> password_authenticator::create(std::string_view role_name, const authen
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-                {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}, &_as, ::service::raft_timeout{});
+                {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}, _as, ::service::raft_timeout{});
    }
 }

@@ -294,7 +294,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}, &_as, ::service::raft_timeout{});
+            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}, _as, ::service::raft_timeout{});
    }
 }

@@ -311,7 +311,7 @@ future<> password_authenticator::drop(std::string_view name) {
                {sstring(name)},
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
-        co_await announce_mutations(_qp, _group0_client, query, {sstring(name)}, &_as, ::service::raft_timeout{});
+        co_await announce_mutations(_qp, _group0_client, query, {sstring(name)}, _as, ::service::raft_timeout{});
    }
 }

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -28,7 +28,6 @@
 #include "db/config.hh"
 #include "db/consistency_level_type.hh"
 #include "db/functions/function_name.hh"
-#include "db/system_auth_keyspace.hh"
 #include "log.hh"
 #include "schema/schema_fwd.hh"
 #include <seastar/core/future.hh>
@@ -644,7 +643,7 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
                }
                auto muts = co_await qp.get_mutations_internal(
                        format("INSERT INTO {}.{} ({}) VALUES ({})",
-                                db::system_auth_keyspace::NAME,
+                                db::system_keyspace::NAME,
                                cf_name,
                                col_names_str,
                                val_binders_str),
@@ -659,12 +658,12 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
            }
        }
        co_yield co_await sys_ks.make_auth_version_mutation(ts,
-                db::system_auth_keyspace::version_t::v2);
+                db::system_keyspace::auth_version_t::v2);
    };
    co_await announce_mutations_with_batching(g0,
            start_operation_func,
            std::move(gen),
-            &as,
+            as,
            std::nullopt);
 }

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -190,7 +190,7 @@ future<> standard_role_manager::create_default_role_if_missing() {
                    {_superuser},
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, &_as, ::service::raft_timeout{});
+            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, _as, ::service::raft_timeout{});
        }
        log.info("Created default superuser role '{}'.", _superuser);
    } catch(const exceptions::unavailable_exception& e) {
@@ -285,7 +285,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
                {sstring(role_name), c.is_superuser, c.can_login},
                cql3::query_processor::cache_internal::yes).discard_result();
    } else {
-        co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name), c.is_superuser, c.can_login}, &_as, ::service::raft_timeout{});
+        co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name), c.is_superuser, c.can_login}, _as, ::service::raft_timeout{});
    }
 }

@@ -333,7 +333,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
                    {sstring(role_name)},
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
-            return announce_mutations(_qp, _group0_client, std::move(query), {sstring(role_name)}, &_as, ::service::raft_timeout{});
+            return announce_mutations(_qp, _group0_client, std::move(query), {sstring(role_name)}, _as, ::service::raft_timeout{});
        }
    });
 }
@@ -383,7 +383,7 @@ future<> standard_role_manager::drop(std::string_view role_name) {
            co_await _qp.execute_internal(query, {sstring(role_name)},
                cql3::query_processor::cache_internal::yes).discard_result();
        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, &_as, ::service::raft_timeout{});
+            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, _as, ::service::raft_timeout{});
        }
    };
    // Finally, delete the role itself.
@@ -401,7 +401,7 @@ future<> standard_role_manager::drop(std::string_view role_name) {
                    {sstring(role_name)},
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, &_as, ::service::raft_timeout{});
+            co_await announce_mutations(_qp, _group0_client, query, {sstring(role_name)}, _as, ::service::raft_timeout{});
        }
    };

@@ -434,7 +434,7 @@ standard_role_manager::modify_membership(
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
            co_await announce_mutations(_qp, _group0_client, std::move(query),
-                    {role_set{sstring(role_name)}, sstring(grantee_name)}, &_as, ::service::raft_timeout{});
+                    {role_set{sstring(role_name)}, sstring(grantee_name)}, _as, ::service::raft_timeout{});
        }
    };

@@ -453,7 +453,7 @@ standard_role_manager::modify_membership(
                            cql3::query_processor::cache_internal::no).discard_result();
                } else {
                    co_return co_await announce_mutations(_qp, _group0_client, insert_query,
-                            {sstring(role_name), sstring(grantee_name)}, &_as, ::service::raft_timeout{});
+                            {sstring(role_name), sstring(grantee_name)}, _as, ::service::raft_timeout{});
                }
            }

@@ -470,7 +470,7 @@ standard_role_manager::modify_membership(
                            cql3::query_processor::cache_internal::no).discard_result();
                } else {
                    co_return co_await announce_mutations(_qp, _group0_client, delete_query,
-                            {sstring(role_name), sstring(grantee_name)}, &_as, ::service::raft_timeout{});
+                            {sstring(role_name), sstring(grantee_name)}, _as, ::service::raft_timeout{});
                }
            }
        }
@@ -644,7 +644,7 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, cql3::query_processor::cache_internal::yes).discard_result();
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-                {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, &_as, ::service::raft_timeout{});
+                {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, _as, ::service::raft_timeout{});
    }
 }

@@ -659,7 +659,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes).discard_result();
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-                {sstring(role_name), sstring(attribute_name)}, &_as, ::service::raft_timeout{});
+                {sstring(role_name), sstring(attribute_name)}, _as, ::service::raft_timeout{});
    }
 }
 }
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -171,7 +171,8 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
 }

 static std::vector<shared_sstable> get_uncompacting_sstables(const table_state& table_s, std::vector<shared_sstable> sstables) {
-    auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*table_s.main_sstable_set().all());
+    auto sstable_set = table_s.sstable_set_for_tombstone_gc();
+    auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*sstable_set->all());
    auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
    all_sstables.insert(all_sstables.end(), compacted_undeleted.begin(), compacted_undeleted.end());
    boost::sort(all_sstables, [] (const shared_sstable& x, const shared_sstable& y) {
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -387,11 +387,26 @@ future<sstables::compaction_result> compaction_task_executor::compact_sstables_a

    co_return res;
 }
+
+future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombstone_gc(table_state& t) { // Copies t's tombstone-GC sstable set into one standalone partitioned set and returns it.
+    auto compound_set = t.sstable_set_for_tombstone_gc();
+    // Compound set will be linearized into a single set, since compaction might add or remove sstables
+    // to it for incremental compaction to work.
+    auto new_set = sstables::make_partitioned_sstable_set(t.schema(), false); // NOTE(review): the meaning of the `false` flag is not visible here - confirm against make_partitioned_sstable_set
+    co_await compound_set->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) { // visit every sstable of the source set
+        auto inserted = new_set.insert(sst);
+        if (!inserted) { // a rejected insert is reported as an internal error, not skipped
+            on_internal_error(cmlog, format("Unable to insert SSTable {} into set used for tombstone GC", sst->get_filename()));
+        }
+    });
+    co_return std::move(new_set); // the caller receives the newly built set
+}
+
 future<sstables::compaction_result> compaction_task_executor::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, compaction_manager::can_purge_tombstones can_purge,
                                                                               sstables::offstrategy offstrategy) {
    table_state& t = *_compacting_table;
    if (can_purge) {
-        descriptor.enable_garbage_collection(t.main_sstable_set());
+        descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
    }
    descriptor.creator = [&t] (shard_id dummy) {
        auto sst = t.make_sstable();
@@ -489,7 +504,7 @@ public:
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
    }

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override { // forwards to compaction_task_executor::abort() with this task's _as
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -514,7 +529,7 @@ public:
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
    }

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override { // forwards to compaction_task_executor::abort() with this task's _as
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -551,6 +566,14 @@ protected:
        // the exclusive lock can be freed to let regular compaction run in parallel to major
        lock_holder.return_all();

+        co_await utils::get_local_injector().inject("major_compaction_wait", [this] (auto& handler) -> future<> { // injection point placed just before the compaction itself is run
+            cmlog.info("major_compaction_wait: waiting");
+            while (!handler.poll_for_message() && !_compaction_data.is_stop_requested()) { // poll until a message arrives or a stop is requested
+                co_await sleep(std::chrono::milliseconds(5)); // 5 ms between polls
+            }
+            cmlog.info("major_compaction_wait: released");
+        });
+
        co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace);

        finish_compaction();
@@ -629,7 +652,7 @@ public:
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
    }

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override { // forwards to compaction_task_executor::abort() with this task's _as
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -855,12 +878,11 @@ void compaction_task_executor::finish_compaction(state finish_state) noexcept {
    _compaction_state.compaction_done.signal();
 }

-future<> compaction_task_executor::abort(abort_source& as) noexcept {
+void compaction_task_executor::abort(abort_source& as) noexcept { // Requests abort on `as` and stops the compaction; does nothing if abort was already requested.
    if (!as.abort_requested()) {
        as.request_abort();
        stop_compaction("user requested abort");
    }
-    return make_ready_future();
 }

 void compaction_task_executor::stop_compaction(sstring reason) noexcept {
@@ -1181,7 +1203,7 @@ public:
        , regular_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), mgr._task_manager_module->new_sequence_number(), t.schema()->ks_name(), t.schema()->cf_name(), "", tasks::task_id::create_null_id())
    {}

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override { // forwards to compaction_task_executor::abort() with this task's _as
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -1352,7 +1374,7 @@ public:
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
    }

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override { // forwards to compaction_task_executor::abort() with this task's _as
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -1379,13 +1401,20 @@ private:
                }));
        };

-        auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
-            auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), sstables::reshape_mode::strict);
-            return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
+        auto get_next_job = [&] () -> future<std::optional<sstables::compaction_descriptor>> { // yields the next reshape descriptor, or nullopt when there is nothing to reshape
+            auto candidates = get_reshape_candidates();
+            if (candidates.empty()) { // front() is used below, so the empty case must return first
+                co_return std::nullopt;
+            }
+            // all sstables added to maintenance set share the same underlying storage.
+            auto& storage = candidates.front()->get_storage();
+            sstables::reshape_config cfg = co_await sstables::make_reshape_config(storage, sstables::reshape_mode::strict); // config is derived from the storage in strict mode
+            auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), cfg); // NOTE(review): candidates are fetched again here instead of reusing `candidates` - confirm this is intentional
+            co_return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt; // a descriptor with no sstables counts as no job
        };

        std::exception_ptr err;
-        while (auto desc = get_next_job()) {
+        while (auto desc = co_await get_next_job()) {
            auto compacting = compacting_sstable_registration(_cm, _cm.get_compaction_state(&t), desc->sstables);
            auto on_replace = compacting.update_on_sstable_replacement();

@@ -1517,11 +1546,16 @@ protected:
        co_return stats;
    }

-    virtual sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const {
+    static sstables::compaction_descriptor // static: builds a descriptor without reading any executor state
+    make_descriptor(const sstables::shared_sstable& sst, const sstables::compaction_type_options& opt, owned_ranges_ptr owned_ranges = {}) { // owned_ranges defaults to an empty pointer
        auto sstable_level = sst->get_sstable_level();
        auto run_identifier = sst->run_identifier();
        return sstables::compaction_descriptor({ sst },
-            sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, _options, _owned_ranges_ptr);
+            sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, opt, owned_ranges); // single-sstable descriptor that reuses the sstable's own level and run identifier
+    }
+
+    virtual sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const { // instance overload: same descriptor, built from this executor's _options and _owned_ranges_ptr
+        return make_descriptor(sst, _options, _owned_ranges_ptr);
    }

    virtual future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) {
@@ -1574,19 +1608,30 @@ public:
                std::move(sstables), std::move(compacting), compaction_manager::can_purge_tombstones::yes)
            , _opt(options.as<sstables::compaction_type_options::split>())
    {
+        if (utils::get_local_injector().is_enabled("split_sstable_rewrite")) {
+            _do_throw_if_stopping = throw_if_stopping::yes;
+        }
+    }
+
+    static bool sstable_needs_split(const sstables::shared_sstable& sst, const sstables::compaction_type_options::split& opt) { // static form, usable without an executor instance
+        return opt.classifier(sst->get_first_decorated_key().token()) != opt.classifier(sst->get_last_decorated_key().token()); // true when the classifier gives different results for the sstable's first and last tokens
+    }
+
+    static sstables::compaction_descriptor // static: builds the split descriptor without an executor instance
+    make_descriptor(const sstables::shared_sstable& sst, const sstables::compaction_type_options::split& split_opt) {
+        auto opt = sstables::compaction_type_options::make_split(split_opt.classifier); // fresh split options carrying the same classifier
+        return rewrite_sstables_compaction_task_executor::make_descriptor(sst, std::move(opt)); // owned_ranges is left at the base helper's default
    }
 private:
    bool sstable_needs_split(const sstables::shared_sstable& sst) const {
-        return _opt.classifier(sst->get_first_decorated_key().token()) != _opt.classifier(sst->get_last_decorated_key().token());
+        return sstable_needs_split(sst, _opt); // delegates to the static overload with this executor's split options
    }
 protected:
    sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const override {
-        auto desc = rewrite_sstables_compaction_task_executor::make_descriptor(sst);
-        desc.options = sstables::compaction_type_options::make_split(_opt.classifier);
-        return desc;
+        return make_descriptor(sst, _opt); // delegates to the static split overload with this executor's split options
    }

-    future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) override {
+    future<sstables::compaction_result> do_rewrite_sstable(const sstables::shared_sstable sst) {
        if (sstable_needs_split(sst)) {
            return rewrite_sstables_compaction_task_executor::rewrite_sstable(std::move(sst));
        }
@@ -1599,6 +1644,20 @@ protected:
            return sstables::compaction_result{};
        });
    }
+
+    future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) override {
+        co_await utils::get_local_injector().inject("split_sstable_rewrite", [this] (auto& handler) -> future<> {
+            cmlog.info("split_sstable_rewrite: waiting");
+            while (!handler.poll_for_message() && !_compaction_data.is_stop_requested()) {
+                co_await sleep(std::chrono::milliseconds(5));
+            }
+            cmlog.info("split_sstable_rewrite: released");
+            if (_compaction_data.is_stop_requested()) {
+                throw make_compaction_stopped_exception();
+            }
+        }, false);
+        co_return co_await do_rewrite_sstable(std::move(sst));
+    }
 };

 }
@@ -1755,7 +1814,7 @@ public:
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
    }

-    virtual future<> abort() noexcept override {
+    virtual void abort() noexcept override {
        return compaction_task_executor::abort(_as);
    }
 protected:
@@ -2020,6 +2079,31 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
    return perform_task_on_all_files<split_compaction_task_executor>(info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables));
 }

+future<std::vector<sstables::shared_sstable>>
+compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt) {
+    if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
+        co_return std::vector<sstables::shared_sstable>{sst};
+    }
+    std::vector<sstables::shared_sstable> ret;
+
+        // FIXME: indentation.
+        auto gate = get_compaction_state(&t).gate.hold();
+        sstables::compaction_progress_monitor monitor;
+        sstables::compaction_data info = create_compaction_data();
+        sstables::compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
+        desc.creator = [&t] (shard_id _) {
+            return t.make_sstable();
+        };
+        desc.replacer = [&] (sstables::compaction_completion_desc d) {
+            std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));
+        };
+
+        co_await sstables::compact_sstables(std::move(desc), info, t, monitor);
+        co_await sst->unlink();
+
+    co_return ret;
+}
+
 // Submit a table to be scrubbed and wait for its termination.
 future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info) {
    auto scrub_mode = opts.operation_mode;
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -350,6 +350,11 @@ public:
    // or user aborted splitting using stop API.
    future<compaction_stats_opt> perform_split_compaction(compaction::table_state& t, sstables::compaction_type_options::split opt, std::optional<tasks::task_info> info = std::nullopt);

+    // Splits a single SSTable by segregating all its data according to the classifier.
+    // If SSTable doesn't need split, the same input SSTable is returned as output.
+    // If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
+    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt);
+
    // Run a custom job for a given table, defined by a function
    // it completes when future returned by job is ready or returns immediately
    // if manager was asked to stop.
@@ -589,12 +594,14 @@ private:
    future<compaction_manager::compaction_stats_opt> compaction_done() noexcept {
        return _compaction_done.get_future();
    }
+
+    future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::table_state& t);
 public:
    bool stopping() const noexcept {
        return _compaction_data.abort.abort_requested();
    }

-    future<> abort(abort_source& as) noexcept;
+    void abort(abort_source& as) noexcept;

    void stop_compaction(sstring reason) noexcept;

--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -83,7 +83,7 @@ reader_consumer_v2 compaction_strategy_impl::make_interposer_consumer(const muta
 }

 compaction_descriptor
-compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
    return compaction_descriptor();
 }

@@ -728,8 +728,8 @@ compaction_backlog_tracker compaction_strategy::make_backlog_tracker() const {
 }

 sstables::compaction_descriptor
-compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
-    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, mode);
+compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
+    return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, cfg);
 }

 uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) const {
@@ -767,6 +767,13 @@ compaction_strategy make_compaction_strategy(compaction_strategy_type strategy,
    return compaction_strategy(std::move(impl));
 }

+future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode) {
+    co_return sstables::reshape_config{
+        .mode = mode,
+        .free_storage_space = co_await storage.free_space() / smp::count,
+    };
+}
+
 }

 namespace compaction {
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -30,6 +30,7 @@ class compaction_strategy_impl;
 class sstable;
 class sstable_set;
 struct compaction_descriptor;
+class storage;

 class compaction_strategy {
    ::shared_ptr<compaction_strategy_impl> _compaction_strategy_impl;
@@ -121,11 +122,13 @@ public:
    //
    // The caller should also pass a maximum number of SSTables which is the maximum amount of
    // SSTables that can be added into a single job.
-    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
+    compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;

 };

 // Creates a compaction_strategy object from one of the strategies available.
 compaction_strategy make_compaction_strategy(compaction_strategy_type strategy, const std::map<sstring, sstring>& options);

+future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode);
+
 }
--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -76,6 +76,6 @@ public:
        return false;
    }

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;
 };
 }
--- a/compaction/compaction_strategy_type.hh
+++ b/compaction/compaction_strategy_type.hh
@@ -8,6 +8,8 @@

 #pragma once

+#include <cstdint>
+
 namespace sstables {

 enum class compaction_strategy_type {
@@ -18,4 +20,10 @@ enum class compaction_strategy_type {
 };

 enum class reshape_mode { strict, relaxed };
+
+struct reshape_config {
+    reshape_mode mode;
+    const uint64_t free_storage_space;
+};
+
 }
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -146,7 +146,8 @@ int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state&
 }

 compaction_descriptor
-leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
+    auto mode = cfg.mode;
    std::array<std::vector<shared_sstable>, leveled_manifest::MAX_LEVELS> level_info;

    auto is_disjoint = [schema] (const std::vector<shared_sstable>& sstables, unsigned tolerance) -> std::tuple<bool, unsigned> {
@@ -203,7 +204,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    if (level_info[0].size() > offstrategy_threshold) {
        size_tiered_compaction_strategy stcs(_stcs_options);
-        return stcs.get_reshaping_job(std::move(level_info[0]), schema, mode);
+        return stcs.get_reshaping_job(std::move(level_info[0]), schema, cfg);
    }

    for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -74,7 +74,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
 };

 }
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -298,8 +298,9 @@ size_tiered_compaction_strategy::most_interesting_bucket(const std::vector<sstab
 }

 compaction_descriptor
-size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const
+size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const
 {
+    auto mode = cfg.mode;
    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));

--- a/compaction/size_tiered_compaction_strategy.hh
+++ b/compaction/size_tiered_compaction_strategy.hh
@@ -96,7 +96,7 @@ public:

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;

    friend class ::size_tiered_backlog_tracker;
 };
--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -39,6 +39,7 @@ public:
    virtual bool compaction_enforce_min_threshold() const noexcept = 0;
    virtual const sstables::sstable_set& main_sstable_set() const = 0;
    virtual const sstables::sstable_set& maintenance_sstable_set() const = 0;
+    virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
    virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
    virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
    virtual sstables::compaction_strategy& get_compaction_strategy() const noexcept = 0;
--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -595,28 +595,35 @@ future<> table_reshaping_compaction_task_impl::run() {

 future<> shard_reshaping_compaction_task_impl::run() {
    auto& table = _db.local().find_column_family(_status.keyspace, _status.table);
+    auto holder = table.async_gate().hold();
    tasks::task_info info{_status.id, _status.shard};

-    std::unordered_map<size_t, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
+    std::unordered_map<compaction::table_state*, std::unordered_set<sstables::shared_sstable>> sstables_grouped_by_compaction_group;
    for (auto& sstable : _dir.get_unshared_local_sstables()) {
-        auto compaction_group_id = table.get_compaction_group_id_for_sstable(sstable);
-        sstables_grouped_by_compaction_group[compaction_group_id].insert(sstable);
+        auto& t = table.table_state_for_sstable(sstable);
+        sstables_grouped_by_compaction_group[&t].insert(sstable);
    }

    // reshape sstables individually within the compaction groups
    for (auto& sstables_in_cg : sstables_grouped_by_compaction_group) {
-        co_await reshape_compaction_group(sstables_in_cg.first, sstables_in_cg.second, table, info);
+        co_await reshape_compaction_group(*sstables_in_cg.first, sstables_in_cg.second, table, info);
    }
 }

-future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(size_t compaction_group_id, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {
+future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info) {

    while (true) {
        auto reshape_candidates = boost::copy_range<std::vector<sstables::shared_sstable>>(sstables_in_cg
                | boost::adaptors::filtered([&filter = _filter] (const auto& sst) {
            return filter(sst);
        }));
-        auto desc = table.get_compaction_strategy().get_reshaping_job(std::move(reshape_candidates), table.schema(), _mode);
+        if (reshape_candidates.empty()) {
+            break;
+        }
+        // all sstables were found in the same sstable_directory instance, so they share the same underlying storage.
+        auto& storage = reshape_candidates.front()->get_storage();
+        auto cfg = co_await sstables::make_reshape_config(storage, _mode);
+        auto desc = table.get_compaction_strategy().get_reshaping_job(std::move(reshape_candidates), table.schema(), cfg);
        if (desc.sstables.empty()) {
            break;
        }
@@ -635,7 +642,6 @@ future<> shard_reshaping_compaction_task_impl::reshape_compaction_group(size_t c
        desc.creator = _creator;

        try {
-            auto& t = table.get_compaction_group(compaction_group_id)->as_table_state();
            co_await table.get_compaction_manager().run_custom_job(t, sstables::compaction_type::Reshape, "Reshape compaction", [&dir = _dir, sstlist = std::move(sstlist), desc = std::move(desc), &sstables_in_cg, &t] (sstables::compaction_data& info, sstables::compaction_progress_monitor& progress_monitor) mutable -> future<> {
                sstables::compaction_result result = co_await sstables::compact_sstables(std::move(desc), info, t, progress_monitor);
                // update the sstables_in_cg set with new sstables and remove the reshaped ones
--- a/compaction/task_manager_module.hh
+++ b/compaction/task_manager_module.hh
@@ -606,7 +606,7 @@ private:
    std::function<bool (const sstables::shared_sstable&)> _filter;
    uint64_t& _total_shard_size;

-    future<> reshape_compaction_group(size_t compaction_group_id, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
+    future<> reshape_compaction_group(compaction::table_state& t, std::unordered_set<sstables::shared_sstable>& sstables_in_cg, replica::column_family& table, const tasks::task_info& info);
 public:
    shard_reshaping_compaction_task_impl(tasks::task_manager::module_ptr module,
            std::string keyspace,
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -226,12 +226,14 @@ reader_consumer_v2 time_window_compaction_strategy::make_interposer_consumer(con
 }

 compaction_descriptor
-time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
+time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
+    auto mode = cfg.mode;
    std::vector<shared_sstable> single_window;
    std::vector<shared_sstable> multi_window;

    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
+    const uint64_t target_job_size = cfg.free_storage_space * reshape_target_space_overhead;

    if (mode == reshape_mode::relaxed) {
        offstrategy_threshold = max_sstables;
@@ -263,22 +265,41 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
            multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
            single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);

-    auto need_trimming = [max_sstables, schema, &is_disjoint] (const std::vector<shared_sstable>& ssts) {
-        // All sstables can be compacted at once if they're disjoint, given that partitioned set
-        // will incrementally open sstables which translates into bounded memory usage.
-        return ssts.size() > max_sstables && !is_disjoint(ssts);
+    auto get_job_size = [] (const std::vector<shared_sstable>& ssts) {
+        return boost::accumulate(ssts | boost::adaptors::transformed(std::mem_fn(&sstable::bytes_on_disk)), uint64_t(0));
+    };
+
+    // Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as they won't
+    // cause an overhead above target. Otherwise, the job targets a maximum of #max_threshold sstables.
+    auto need_trimming = [&] (const std::vector<shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
+        const size_t min_sstables = 2;
+        auto is_above_target_size = job_size > target_job_size;
+
+        return (ssts.size() > max_sstables && !is_disjoint) ||
+               (ssts.size() > min_sstables && is_above_target_size);
+    };
+
+    auto maybe_trim_job = [&need_trimming] (std::vector<shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
+        while (need_trimming(ssts, job_size, is_disjoint)) {
+            auto sst = ssts.back();
+            ssts.pop_back();
+            job_size -= sst->bytes_on_disk();
+        }
    };

    if (!multi_window.empty()) {
+        auto disjoint = is_disjoint(multi_window);
+        auto job_size = get_job_size(multi_window);
        // Everything that spans multiple windows will need reshaping
-        if (need_trimming(multi_window)) {
+        if (need_trimming(multi_window, job_size, disjoint)) {
            // When trimming, let's keep sstables with overlapping time window, so as to reduce write amplification.
            // For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
            // in a single compaction round, removing the need to later compact W to reduce its number of files.
-            boost::partial_sort(multi_window, multi_window.begin() + max_sstables, [](const shared_sstable &a, const shared_sstable &b) {
+            auto sort_size = std::min(max_sstables, multi_window.size());
+            boost::partial_sort(multi_window, multi_window.begin() + sort_size, [](const shared_sstable &a, const shared_sstable &b) {
                return a->get_stats_metadata().max_timestamp < b->get_stats_metadata().max_timestamp;
            });
-            multi_window.resize(max_sstables);
+            maybe_trim_job(multi_window, job_size, disjoint);
        }
        compaction_descriptor desc(std::move(multi_window));
        desc.options = compaction_type_options::make_reshape();
@@ -297,15 +318,17 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
                std::copy(ssts.begin(), ssts.end(), std::back_inserter(single_window));
                continue;
            }
+
            // reuse STCS reshape logic which will only compact similar-sized files, to increase overall efficiency
            // when reshaping time buckets containing a huge amount of files
-            auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, mode);
+            auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, cfg);
            if (!desc.sstables.empty()) {
                return desc;
            }
        }
    }
    if (!single_window.empty()) {
+        maybe_trim_job(single_window, get_job_size(single_window), all_disjoint);
        compaction_descriptor desc(std::move(single_window));
        desc.options = compaction_type_options::make_reshape();
        return desc;
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -76,6 +76,7 @@ public:
    // To prevent an explosion in the number of sstables we cap it.
    // Better co-locate some windows into the same sstables than OOM.
    static constexpr uint64_t max_data_segregation_window_count = 100;
+    static constexpr float reshape_target_space_overhead = 0.1f;

    using bucket_t = std::vector<shared_sstable>;
    enum class bucket_compaction_mode { none, size_tiered, major };
@@ -168,7 +169,7 @@ public:
        return true;
    }

-    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
+    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
 };

 }
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -280,7 +280,6 @@ batch_size_fail_threshold_in_kb: 1024
 #     - alternator-streams
 #     - broadcast-tables
 #     - keyspace-storage-options
-#     - tablets

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
@@ -618,3 +617,17 @@ maintenance_socket: ignore
 # replication_strategy_warn_list:
 #  - SimpleStrategy
 # replication_strategy_fail_list:
+
+# Enables the tablets feature.
+# When enabled, newly created keyspaces will have tablets enabled by default.
+# That can be explicitly disabled in the CREATE KEYSPACE query
+# by using the `tablets = {'enabled': false}` replication option.
+#
+# When the tablets feature is disabled, there is no way to enable tablets
+# per keyspace.
+#
+# Note that creating keyspaces with tablets enabled is irreversible.
+# Disabling the tablets feature may impact existing keyspaces that were created with tablets.
+# For example, the tablets map would remain "frozen" and will not respond to topology changes
+# like adding, removing, or replacing nodes, or to replication factor changes.
+enable_tablets: true
--- a/configure.py
+++ b/configure.py
@@ -456,8 +456,6 @@ modes = {

 scylla_tests = set([
    'test/boost/UUID_test',
-    'test/boost/pretty_printers_test',
-    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
    'test/boost/alternator_unit_test',
@@ -467,7 +465,9 @@ scylla_tests = set([
    'test/boost/auth_test',
    'test/boost/batchlog_manager_test',
    'test/boost/big_decimal_test',
+    'test/boost/bptree_test',
    'test/boost/broken_sstable_test',
+    'test/boost/btree_test',
    'test/boost/bytes_ostream_test',
    'test/boost/cache_algorithm_test',
    'test/boost/cache_flat_mutation_reader_test',
@@ -476,13 +476,15 @@ scylla_tests = set([
    'test/boost/canonical_mutation_test',
    'test/boost/cartesian_product_test',
    'test/boost/castas_fcts_test',
+    'test/boost/cdc_generation_test',
    'test/boost/cdc_test',
    'test/boost/cell_locker_test',
    'test/boost/checksum_utils_test',
-    'test/boost/chunked_vector_test',
    'test/boost/chunked_managed_vector_test',
+    'test/boost/chunked_vector_test',
    'test/boost/clustering_ranges_walker_test',
    'test/boost/column_mapping_test',
+    'test/boost/commitlog_cleanup_test',
    'test/boost/commitlog_test',
    'test/boost/compaction_group_test',
    'test/boost/compound_test',
@@ -492,102 +494,124 @@ scylla_tests = set([
    'test/boost/counter_test',
    'test/boost/cql_auth_query_test',
    'test/boost/cql_auth_syntax_test',
-    'test/boost/cql_query_test',
+    'test/boost/cql_functions_test',
+    'test/boost/cql_query_group_test',
    'test/boost/cql_query_large_test',
    'test/boost/cql_query_like_test',
-    'test/boost/cql_query_group_test',
-    'test/boost/cql_functions_test',
+    'test/boost/cql_query_test',
    'test/boost/crc_test',
    'test/boost/data_listeners_test',
    'test/boost/database_test',
-    'test/boost/commitlog_cleanup_test',
    'test/boost/dirty_memory_manager_test',
+    'test/boost/double_decker_test',
    'test/boost/duration_test',
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
-    'test/boost/extensions_test',
    'test/boost/error_injection_test',
+    'test/boost/estimated_histogram_test',
+    'test/boost/exception_container_test',
+    'test/boost/exceptions_fallback_test',
+    'test/boost/exceptions_optimized_test',
+    'test/boost/expr_test',
+    'test/boost/extensions_test',
    'test/boost/filtering_test',
    'test/boost/flat_mutation_reader_test',
    'test/boost/flush_queue_test',
    'test/boost/fragmented_temporary_buffer_test',
    'test/boost/frozen_mutation_test',
+    'test/boost/generic_server_test',
    'test/boost/gossiping_property_file_snitch_test',
+    'test/boost/group0_cmd_merge_test',
+    'test/boost/group0_test',
    'test/boost/hash_test',
    'test/boost/hashers_test',
    'test/boost/hint_test',
    'test/boost/idl_test',
+    'test/boost/index_with_paging_test',
    'test/boost/input_stream_test',
+    'test/boost/intrusive_array_test',
    'test/boost/json_cql_query_test',
    'test/boost/json_test',
    'test/boost/keys_test',
    'test/boost/large_paging_state_test',
-    'test/boost/recent_entries_map_test',
    'test/boost/like_matcher_test',
    'test/boost/limiting_data_source_test',
    'test/boost/linearizing_input_stream_test',
+    'test/boost/lister_test',
    'test/boost/loading_cache_test',
+    'test/boost/locator_topology_test',
    'test/boost/log_heap_test',
-    'test/boost/estimated_histogram_test',
-    'test/boost/summary_test',
-    'test/boost/logalloc_test',
    'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
-    'test/boost/managed_vector_test',
+    'test/boost/logalloc_test',
    'test/boost/managed_bytes_test',
-    'test/boost/intrusive_array_test',
+    'test/boost/managed_vector_test',
    'test/boost/map_difference_test',
    'test/boost/memtable_test',
+    'test/boost/multishard_combining_reader_as_mutation_source_test',
    'test/boost/multishard_mutation_query_test',
    'test/boost/murmur_hash_test',
    'test/boost/mutation_fragment_test',
    'test/boost/mutation_query_test',
    'test/boost/mutation_reader_test',
-    'test/boost/multishard_combining_reader_as_mutation_source_test',
    'test/boost/mutation_test',
    'test/boost/mutation_writer_test',
    'test/boost/mvcc_test',
    'test/boost/network_topology_strategy_test',
-    'test/boost/token_metadata_test',
-    'test/boost/tablets_test',
-    'test/boost/sessions_test',
    'test/boost/nonwrapping_interval_test',
    'test/boost/observable_test',
    'test/boost/partitioner_test',
+    'test/boost/per_partition_rate_limit_test',
+    'test/boost/pretty_printers_test',
    'test/boost/querier_cache_test',
    'test/boost/query_processor_test',
-    'test/boost/wrapping_interval_test',
+    'test/boost/radix_tree_test',
    'test/boost/range_tombstone_list_test',
-    'test/boost/reusable_buffer_test',
-    'test/boost/restrictions_test',
+    'test/boost/rate_limiter_test',
+    'test/boost/reader_concurrency_semaphore_test',
+    'test/boost/recent_entries_map_test',
    'test/boost/repair_test',
+    'test/boost/restrictions_test',
+    'test/boost/result_utils_test',
+    'test/boost/reusable_buffer_test',
    'test/boost/role_manager_test',
    'test/boost/row_cache_test',
    'test/boost/rust_test',
+    'test/boost/s3_test',
    'test/boost/schema_change_test',
+    'test/boost/schema_changes_test',
+    'test/boost/schema_loader_test',
    'test/boost/schema_registry_test',
    'test/boost/secondary_index_test',
-    'test/boost/tracing_test',
-    'test/boost/index_with_paging_test',
    'test/boost/serialization_test',
    'test/boost/serialized_action_test',
+    'test/boost/service_level_controller_test',
+    'test/boost/sessions_test',
    'test/boost/small_vector_test',
    'test/boost/snitch_reset_test',
+    'test/boost/sorting_test',
    'test/boost/sstable_3_x_test',
+    'test/boost/sstable_compaction_test',
+    'test/boost/sstable_conforms_to_mutation_source_test',
    'test/boost/sstable_datafile_test',
+    'test/boost/sstable_directory_test',
    'test/boost/sstable_generation_test',
+    'test/boost/sstable_move_test',
    'test/boost/sstable_mutation_test',
    'test/boost/sstable_partition_index_cache_test',
-    'test/boost/schema_changes_test',
-    'test/boost/sstable_conforms_to_mutation_source_test',
-    'test/boost/sstable_compaction_test',
    'test/boost/sstable_resharding_test',
-    'test/boost/sstable_directory_test',
+    'test/boost/sstable_set_test',
    'test/boost/sstable_test',
-    'test/boost/sstable_move_test',
+    'test/boost/stall_free_test',
    'test/boost/statement_restrictions_test',
    'test/boost/storage_proxy_test',
+    'test/boost/string_format_test',
+    'test/boost/summary_test',
+    'test/boost/tablets_test',
+    'test/boost/tagged_integer_test',
+    'test/boost/token_metadata_test',
    'test/boost/top_k_test',
+    'test/boost/tracing_test',
    'test/boost/transport_test',
    'test/boost/types_test',
    'test/boost/user_function_test',
@@ -595,39 +619,16 @@ scylla_tests = set([
    'test/boost/utf8_test',
    'test/boost/view_build_test',
    'test/boost/view_complex_test',
-    'test/boost/view_schema_test',
-    'test/boost/view_schema_pkey_test',
    'test/boost/view_schema_ckey_test',
+    'test/boost/view_schema_pkey_test',
+    'test/boost/view_schema_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
    'test/boost/virtual_table_mutation_source_test',
    'test/boost/virtual_table_test',
-    'test/boost/wasm_test',
    'test/boost/wasm_alloc_test',
-    'test/boost/bptree_test',
-    'test/boost/btree_test',
-    'test/boost/radix_tree_test',
-    'test/boost/double_decker_test',
-    'test/boost/stall_free_test',
-    'test/boost/sstable_set_test',
-    'test/boost/reader_concurrency_semaphore_test',
-    'test/boost/service_level_controller_test',
-    'test/boost/schema_loader_test',
-    'test/boost/lister_test',
-    'test/boost/group0_test',
-    'test/boost/exception_container_test',
-    'test/boost/result_utils_test',
-    'test/boost/rate_limiter_test',
-    'test/boost/per_partition_rate_limit_test',
-    'test/boost/expr_test',
-    'test/boost/exceptions_optimized_test',
-    'test/boost/exceptions_fallback_test',
-    'test/boost/s3_test',
-    'test/boost/locator_topology_test',
-    'test/boost/string_format_test',
-    'test/boost/tagged_integer_test',
-    'test/boost/group0_cmd_merge_test',
-    'test/boost/sorting_test',
+    'test/boost/wasm_test',
+    'test/boost/wrapping_interval_test',
    'test/manual/ec2_snitch_test',
    'test/manual/enormous_table_scan_test',
    'test/manual/gce_snitch_test',
@@ -1015,7 +1016,6 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/result_set.cc',
                'cql3/prepare_context.cc',
                'db/consistency_level.cc',
-                'db/system_auth_keyspace.cc',
                'db/system_keyspace.cc',
                'db/virtual_table.cc',
                'db/virtual_tables.cc',
@@ -1358,6 +1358,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_simple_query.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
+                'test/perf/tablet_load_balancing.cc',
                'test/perf/perf.cc',
                'test/lib/alternator_test_env.cc',
                'test/lib/cql_test_env.cc',
@@ -1753,33 +1754,32 @@ def configure_seastar(build_dir, mode, mode_config):


 def configure_abseil(build_dir, mode, mode_config):
-    # for sanitizer cflags
-    seastar_flags = query_seastar_flags(f'{outdir}/{mode}/seastar/seastar.pc',
-                                        mode_config['build_seastar_shared_libs'],
-                                        args.staticcxx)
-    seastar_cflags = seastar_flags['seastar_cflags']
+    abseil_cflags = mode_config['lib_cflags']
+    cxx_flags = mode_config['cxxflags']
+    if '-DSANITIZE' in cxx_flags:
+        abseil_cflags += ' -fsanitize=address -fsanitize=undefined -fno-sanitize=vptr'

-    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
-
-    abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
    # We want to "undo" coverage for abseil if we have it enabled, as we are not
    # interested in the coverage of the abseil library. these flags were previously
    # added to cxx_ld_flags
    if args.coverage:
        for flag in COVERAGE_INST_FLAGS:
-            abseil_cflags = abseil_cflags.replace(f' {flag}', '')
+            cxx_flags = cxx_flags.replace(f' {flag}', '')
+
+    cxx_flags += ' ' + abseil_cflags.strip()
    cmake_mode = mode_config['cmake_build_type']
    abseil_cmake_args = [
        '-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
        '-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
        '-DCMAKE_C_COMPILER={}'.format(args.cc),
        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
-        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
+        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), cxx_flags),
        '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON',
        '-DCMAKE_CXX_STANDARD=20',
        '-DABSL_PROPAGATE_CXX_STD=ON',
    ]

+    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
    abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + abseil_cmake_args

    os.makedirs(abseil_build_dir, exist_ok=True)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -68,6 +68,7 @@ options {
 #include "cql3/statements/ks_prop_defs.hh"
 #include "cql3/selection/raw_selector.hh"
 #include "cql3/selection/selectable-expr.hh"
+#include "cql3/dialect.hh"
 #include "cql3/keyspace_element_name.hh"
 #include "cql3/constants.hh"
 #include "cql3/operation_impl.hh"
@@ -148,6 +149,8 @@ using uexpression = uninitialized<expression>;

    listener_type* listener;

+    dialect _dialect;
+
    // Keeps the names of all bind variables. For bind variables without a name ('?'), the name is nullptr.
    // Maps bind_index -> name.
    std::vector<::shared_ptr<cql3::column_identifier>> _bind_variable_names;
@@ -171,9 +174,14 @@ using uexpression = uninitialized<expression>;
        return s;
    }

+    void set_dialect(dialect d) {
+        _dialect = d;
+    }
+
    bind_variable new_bind_variables(shared_ptr<cql3::column_identifier> name)
    {
-        if (name && _named_bind_variables_indexes.contains(*name)) {
+        if (_dialect.duplicate_bind_variable_names_refer_to_same_variable
+                && name && _named_bind_variables_indexes.contains(*name)) {
            return bind_variable{_named_bind_variables_indexes[*name]};
        }
        auto marker = bind_variable{_bind_variable_names.size()};
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -454,7 +454,8 @@ sstring maybe_quote(const sstring& identifier) {
        // many keywords but allow keywords listed as "unreserved keywords".
        // So we can use any of them, for example cident.
        try {
-            cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
+            // In general it's not a good idea to use the default dialect, but for parsing an identifier, it's okay.
+            cql3::util::do_with_parser(identifier, dialect{}, std::mem_fn(&cql3_parser::CqlParser::cident));
            return identifier;
        } catch(exceptions::syntax_exception&) {
            // This alphanumeric string is not a valid identifier, so fall
--- a/cql3/dialect.hh
+++ b/cql3/dialect.hh
@@ -0,0 +1,34 @@
+// Copyright (C) 2024-present ScyllaDB
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+#pragma once
+
+#include <fmt/core.h>
+
+namespace cql3 {
+
+struct dialect {
+    bool duplicate_bind_variable_names_refer_to_same_variable = true;  // if :a is found twice in a query, the two references are to the same variable (see #15559)
+    bool operator==(const dialect&) const = default;
+};
+
+inline
+dialect
+internal_dialect() {
+    return dialect{
+        .duplicate_bind_variable_names_refer_to_same_variable = true,
+    };
+}
+
+}
+
+template <>
+struct fmt::formatter<cql3::dialect> {
+    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+
+    template <typename FormatContext>
+    auto format(const cql3::dialect& d, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "cql3::dialect{{duplicate_bind_variable_names_refer_to_same_variable={}}}",
+                d.duplicate_bind_variable_names_refer_to_same_variable);
+    }
+};
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -14,6 +14,7 @@
 #include "utils/hash.hh"
 #include "cql3/statements/prepared_statement.hh"
 #include "cql3/column_specification.hh"
+#include "cql3/dialect.hh"

 namespace cql3 {

@@ -38,15 +39,20 @@ typedef int32_t thrift_prepared_id_type;
 /// and for Thrift - {CQL_PREP_ID_TYPE(0), THRIFT_PREP_ID}. This way CQL and Thrift keys' values will never collide.
 class prepared_cache_key_type {
 public:
-    using cache_key_type = std::pair<cql_prepared_id_type, int64_t>;
+    // derive from std::pair<cql_prepared_id_type, int64_t> so we can customize
+    // the formatter of cache_key_type
+    struct cache_key_type : public std::pair<cql_prepared_id_type, int64_t> {
+        cache_key_type(std::pair<cql_prepared_id_type, int64_t>&& id, cql3::dialect d) : pair(std::move(id)), dialect(d) {}
+        cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
+        bool operator==(const cache_key_type& other) const = default;
+    };

 private:
    cache_key_type _key;

 public:
-    prepared_cache_key_type() = default;
-    explicit prepared_cache_key_type(cql_prepared_id_type cql_id) : _key(std::move(cql_id), std::numeric_limits<int64_t>::max()) {}
-    explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id) : _key(cql_prepared_id_type(), thrift_id) {}
+    explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key({std::move(cql_id), std::numeric_limits<int64_t>::max()}, d) {}
+    explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id, dialect d) : _key({cql_prepared_id_type(), thrift_id}, d) {}

    cache_key_type& key() { return _key; }
    const cache_key_type& key() const { return _key; }
@@ -173,7 +179,7 @@ struct hash<cql3::prepared_cache_key_type> final {
 template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
-        return fmt::format_to(ctx.out(), "{{cql_id: {}, thrift_id: {}}}", p.first, p.second);
+        return fmt::format_to(ctx.out(), "{{cql_id: {}, thrift_id: {}, dialect: {}}}", p.first, p.second, p.dialect);
    }
 };

--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -14,9 +14,11 @@
 #include <seastar/coroutine/parallel_for_each.hh>

 #include "service/storage_proxy.hh"
+#include "service/topology_mutation.hh"
 #include "service/migration_manager.hh"
 #include "service/forward_service.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "service/storage_service.hh"
 #include "cql3/CqlParser.hpp"
 #include "cql3/statements/batch_statement.hh"
 #include "cql3/statements/modification_statement.hh"
@@ -42,16 +44,22 @@ const sstring query_processor::CQL_VERSION = "3.3.1";
 const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);

 struct query_processor::remote {
-    remote(service::migration_manager& mm, service::forward_service& fwd, service::raft_group0_client& group0_client)
-            : mm(mm), forwarder(fwd), group0_client(group0_client) {}
+    remote(service::migration_manager& mm, service::forward_service& fwd,
+           service::storage_service& ss, service::raft_group0_client& group0_client)
+            : mm(mm), forwarder(fwd), ss(ss), group0_client(group0_client) {}

    service::migration_manager& mm;
    service::forward_service& forwarder;
+    service::storage_service& ss;
    service::raft_group0_client& group0_client;

    seastar::gate gate;
 };

+bool query_processor::topology_global_queue_empty() {
+    return remote().first.get().ss.topology_global_queue_empty();
+}
+
 static service::query_state query_state_for_internal_call() {
    return {service::client_state::for_internal_calls(), empty_service_permit()};
 }
@@ -498,8 +506,8 @@ query_processor::~query_processor() {
 }

 void query_processor::start_remote(service::migration_manager& mm, service::forward_service& forwarder,
-                                  service::raft_group0_client& group0_client) {
-    _remote = std::make_unique<struct remote>(mm, forwarder, group0_client);
+                                   service::storage_service& ss, service::raft_group0_client& group0_client) {
+    _remote = std::make_unique<struct remote>(mm, forwarder, ss, group0_client);
 }

 future<> query_processor::stop_remote() {
@@ -557,10 +565,10 @@ query_processor::execute_maybe_with_guard(service::query_state& query_state, ::s
 }

 future<::shared_ptr<result_message>>
-query_processor::execute_direct_without_checking_exception_message(const sstring_view& query_string, service::query_state& query_state, query_options& options) {
+query_processor::execute_direct_without_checking_exception_message(const sstring_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
    log.trace("execute_direct: \"{}\"", query_string);
    tracing::trace(query_state.get_trace_state(), "Parsing a statement");
-    auto p = get_statement(query_string, query_state.get_client_state());
+    auto p = get_statement(query_string, query_state.get_client_state(), d);
    auto statement = p->statement;
    const auto warnings = std::move(p->warnings);
    if (statement->get_bound_terms() != options.get_values_count()) {
@@ -644,24 +652,31 @@ query_processor::process_authorized_statement(const ::shared_ptr<cql_statement>
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(sstring query_string, service::query_state& query_state) {
+query_processor::prepare(sstring query_string, service::query_state& query_state, cql3::dialect d) {
    auto& client_state = query_state.get_client_state();
-    return prepare(std::move(query_string), client_state, client_state.is_thrift());
+    return prepare(std::move(query_string), client_state, client_state.is_thrift(), d);
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift) {
+query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift, cql3::dialect d) {
    using namespace cql_transport::messages;
    if (for_thrift) {
        return prepare_one<result_message::prepared::thrift>(
                std::move(query_string),
                client_state,
-                compute_thrift_id, prepared_cache_key_type::thrift_id);
+                            d,
+                [d] (const sstring& query_string, const sstring& keyspace) {
+                    return compute_thrift_id(query_string, keyspace, d);
+                },
+                prepared_cache_key_type::thrift_id);
    } else {
        return prepare_one<result_message::prepared::cql>(
                std::move(query_string),
                client_state,
-                compute_id,
+                d,
+                [d] (const sstring& query_string, const sstring& keyspace) {
+                    return compute_id(query_string, keyspace, d);
+                },
                prepared_cache_key_type::cql_id);
    }
 }
@@ -674,23 +689,25 @@ static std::string hash_target(std::string_view query_string, std::string_view k

 prepared_cache_key_type query_processor::compute_id(
        std::string_view query_string,
-        std::string_view keyspace) {
-    return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)));
+        std::string_view keyspace,
+        dialect d) {
+    return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)), d);
 }

 prepared_cache_key_type query_processor::compute_thrift_id(
        const std::string_view& query_string,
-        const sstring& keyspace) {
+        const sstring& keyspace,
+        cql3::dialect d) {
    uint32_t h = 0;
    for (auto&& c : hash_target(query_string, keyspace)) {
        h = 31*h + c;
    }
-    return prepared_cache_key_type(static_cast<int32_t>(h));
+    return prepared_cache_key_type(static_cast<int32_t>(h), d);
 }

 std::unique_ptr<prepared_statement>
-query_processor::get_statement(const sstring_view& query, const service::client_state& client_state) {
-    std::unique_ptr<raw::parsed_statement> statement = parse_statement(query);
+query_processor::get_statement(const sstring_view& query, const service::client_state& client_state, dialect d) {
+    std::unique_ptr<raw::parsed_statement> statement = parse_statement(query, d);

    // Set keyspace for statement that require login
    auto cf_stmt = dynamic_cast<raw::cf_statement*>(statement.get());
@@ -704,7 +721,7 @@ query_processor::get_statement(const sstring_view& query, const service::client_
 }

 std::unique_ptr<raw::parsed_statement>
-query_processor::parse_statement(const sstring_view& query) {
+query_processor::parse_statement(const sstring_view& query, dialect d) {
    try {
        {
            const char* error_injection_key = "query_processor-parse_statement-test_failure";
@@ -714,7 +731,7 @@ query_processor::parse_statement(const sstring_view& query) {
                }
            });
        }
-        auto statement = util::do_with_parser(query,  std::mem_fn(&cql3_parser::CqlParser::query));
+        auto statement = util::do_with_parser(query, d, std::mem_fn(&cql3_parser::CqlParser::query));
        if (!statement) {
            throw exceptions::syntax_exception("Parsing failed");
        }
@@ -730,9 +747,9 @@ query_processor::parse_statement(const sstring_view& query) {
 }

 std::vector<std::unique_ptr<raw::parsed_statement>>
-query_processor::parse_statements(std::string_view queries) {
+query_processor::parse_statements(std::string_view queries, dialect d) {
    try {
-        auto statements = util::do_with_parser(queries, std::mem_fn(&cql3_parser::CqlParser::queries));
+        auto statements = util::do_with_parser(queries, d, std::mem_fn(&cql3_parser::CqlParser::queries));
        if (statements.empty()) {
            throw exceptions::syntax_exception("Parsing failed");
        }
@@ -805,7 +822,7 @@ query_options query_processor::make_internal_options(
 statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
    auto& p = _internal_statements[query_string];
    if (p == nullptr) {
-        auto np = parse_statement(query_string)->prepare(_db, _cql_stats);
+        auto np = parse_statement(query_string, internal_dialect())->prepare(_db, _cql_stats);
        np->statement->raw_cql_statement = query_string;
        p = std::move(np); // inserts it into map
    }
@@ -835,7 +852,7 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const

 future<> query_processor::for_each_cql_result(
        cql3::internal_query_state& state,
-         noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)>&& f) {
+        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
    do {
        auto msg = co_await execute_paged_internal(state);
        for (auto& row : *msg) {
@@ -911,7 +928,8 @@ query_processor::execute_internal(
        auto p = prepare_internal(query_string);
        return execute_with_params(std::move(p), cl, query_state, values);
    } else {
-        auto p = parse_statement(query_string)->prepare(_db, _cql_stats);
+        // For internal queries, we want the default dialect, not the user provided one
+        auto p = parse_statement(query_string, dialect{})->prepare(_db, _cql_stats);
        p->statement->raw_cql_statement = query_string;
        auto checked_weak_ptr = p->checked_weak_from_this();
        return execute_with_params(std::move(checked_weak_ptr), cl, query_state, values).finally([p = std::move(p)] {});
@@ -1018,16 +1036,29 @@ query_processor::execute_schema_statement(const statements::schema_altering_stat

    cql3::cql_warnings_vec warnings;

+    auto request_id = guard->new_group0_state_id();
+    stmt.global_req_id = request_id;
+
    auto [ret, m, cql_warnings] = co_await stmt.prepare_schema_mutations(*this, options, guard->write_timestamp());
    warnings = std::move(cql_warnings);

+    ce = std::move(ret);
    if (!m.empty()) {
        auto description = format("CQL DDL statement: \"{}\"", stmt.raw_cql_statement);
-        co_await remote_.get().mm.announce(std::move(m), std::move(*guard), description);
+        if (ce && ce->target == cql_transport::event::schema_change::target_type::TABLET_KEYSPACE) {
+            co_await remote_.get().mm.announce<service::topology_change>(std::move(m), std::move(*guard), description);
+            // TODO: eliminate timeout from alter ks statement on the cqlsh/driver side
+            auto error = co_await remote_.get().ss.wait_for_topology_request_completion(request_id);
+            co_await remote_.get().ss.wait_for_topology_not_busy();
+            if (!error.empty()) {
+                log.error("CQL statement \"{}\" with topology request_id \"{}\" failed with error: \"{}\"", stmt.raw_cql_statement, request_id, error);
+                throw exceptions::request_execution_exception(exceptions::exception_code::INVALID, error);
+            }
+        } else {
+            co_await remote_.get().mm.announce<service::schema_change>(std::move(m), std::move(*guard), description);
+        }
    }

-    ce = std::move(ret);
-
    // If an IF [NOT] EXISTS clause was used, this may not result in an actual schema change.  To avoid doing
    // extra work in the drivers to handle schema changes, we return an empty message in this case. (CASSANDRA-7600)
    ::shared_ptr<messages::result_message> result;
@@ -1158,14 +1189,14 @@ future<> query_processor::query_internal(
        db::consistency_level cl,
        const data_value_list& values,
        int32_t page_size,
-        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
+        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
    auto query_state = create_paged_state(query_string, cl, values, page_size);
    co_return co_await for_each_cql_result(query_state, std::move(f));
 }

 future<> query_processor::query_internal(
        const sstring& query_string,
-        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
+        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
    return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
 }

--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -21,6 +21,7 @@
 #include "cql3/authorized_prepared_statements_cache.hh"
 #include "cql3/statements/prepared_statement.hh"
 #include "cql3/cql_statement.hh"
+#include "cql3/dialect.hh"
 #include "exceptions/exceptions.hh"
 #include "service/migration_listener.hh"
 #include "timestamp.hh"
@@ -31,7 +32,6 @@
 #include "lang/wasm.hh"
 #include "service/raft/raft_group0_client.hh"
 #include "types/types.hh"
-#include "db/system_auth_keyspace.hh"


 namespace service {
@@ -138,20 +138,23 @@ public:

    static prepared_cache_key_type compute_id(
            std::string_view query_string,
-            std::string_view keyspace);
+            std::string_view keyspace,
+            dialect d);

    static prepared_cache_key_type compute_thrift_id(
            const std::string_view& query_string,
-            const sstring& keyspace);
+            const sstring& keyspace,
+            dialect d);

-    static std::unique_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query);
-    static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries);
+    static std::unique_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query, dialect d);
+    static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries, dialect d);

    query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, wasm::manager& wasm);

    ~query_processor();

-    void start_remote(service::migration_manager&, service::forward_service&, service::raft_group0_client&);
+    void start_remote(service::migration_manager&, service::forward_service&,
+                      service::storage_service& ss, service::raft_group0_client&);
    future<> stop_remote();

    data_dictionary::database db() {
@@ -176,7 +179,7 @@ public:

    wasm::manager& wasm() { return _wasm; }

-    db::system_auth_keyspace::version_t auth_version;
+    db::system_keyspace::auth_version_t auth_version;

    statements::prepared_statement::checked_weak_ptr get_prepared(const std::optional<auth::authenticated_user>& user, const prepared_cache_key_type& key) {
        if (user) {
@@ -253,10 +256,12 @@ public:
    execute_direct(
            const std::string_view& query_string,
            service::query_state& query_state,
+            dialect d,
            query_options& options) {
        return execute_direct_without_checking_exception_message(
                query_string,
                query_state,
+                d,
                options)
                .then(cql_transport::messages::propagate_exception_as_future<::shared_ptr<cql_transport::messages::result_message>>);
    }
@@ -267,6 +272,7 @@ public:
    execute_direct_without_checking_exception_message(
            const std::string_view& query_string,
            service::query_state& query_state,
+            dialect d,
            query_options& options);

    future<::shared_ptr<cql_transport::messages::result_message>>
@@ -315,7 +321,7 @@ public:
            db::consistency_level cl,
            const data_value_list& values,
            int32_t page_size,
-            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
+            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);

    /*
     * \brief iterate over all cql results using paging
@@ -330,7 +336,7 @@ public:
     */
    future<> query_internal(
            const sstring& query_string,
-            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
+            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);

    class cache_internal_tag;
    using cache_internal = bool_class<cache_internal_tag>;
@@ -401,10 +407,10 @@ public:


    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(sstring query_string, service::query_state& query_state);
+    prepare(sstring query_string, service::query_state& query_state, dialect d);

    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(sstring query_string, const service::client_state& client_state, bool for_thrift);
+    prepare(sstring query_string, const service::client_state& client_state, bool for_thrift, dialect d);

    future<> stop();

@@ -451,7 +457,8 @@ public:

    std::unique_ptr<statements::prepared_statement> get_statement(
            const std::string_view& query,
-            const service::client_state& client_state);
+            const service::client_state& client_state,
+            dialect d);

    friend class migration_subscriber;

@@ -461,6 +468,8 @@ public:

    void reset_cache();

+    bool topology_global_queue_empty();
+
 private:
    // Keep the holder until you stop using the `remote` services.
    std::pair<std::reference_wrapper<remote>, gate::holder> remote();
@@ -499,7 +508,7 @@ private:
     */
    future<> for_each_cql_result(
            cql3::internal_query_state& state,
-             noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
+            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);

    /*!
     * \brief check, based on the state if there are additional results
@@ -533,14 +542,15 @@ private:
    prepare_one(
            sstring query_string,
            const service::client_state& client_state,
+            dialect d,
            PreparedKeyGenerator&& id_gen,
            IdGetter&& id_getter) {
        return do_with(
                id_gen(query_string, client_state.get_raw_keyspace()),
                std::move(query_string),
-                [this, &client_state, &id_getter](const prepared_cache_key_type& key, const sstring& query_string) {
-            return _prepared_cache.get(key, [this, &query_string, &client_state] {
-                auto prepared = get_statement(query_string, client_state);
+                [this, &client_state, &id_getter, d](const prepared_cache_key_type& key, const sstring& query_string) {
+            return _prepared_cache.get(key, [this, &query_string, &client_state, d] {
+                auto prepared = get_statement(query_string, client_state, d);
                auto bound_terms = prepared->statement->get_bound_terms();
                if (bound_terms > std::numeric_limits<uint16_t>::max()) {
                    throw exceptions::invalid_request_exception(
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -8,11 +8,16 @@
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */

+#include <boost/range/algorithm.hpp>
+#include <fmt/format.h>
 #include <seastar/core/coroutine.hh>
+#include <seastar/core/on_internal_error.hh>
+#include <stdexcept>
 #include "alter_keyspace_statement.hh"
 #include "prepared_statement.hh"
 #include "service/migration_manager.hh"
 #include "service/storage_proxy.hh"
+#include "service/topology_mutation.hh"
 #include "db/system_keyspace.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "data_dictionary/keyspace_metadata.hh"
@@ -21,6 +26,8 @@
 #include "create_keyspace_statement.hh"
 #include "gms/feature_service.hh"

+static logging::logger mylogger("alter_keyspace");
+
 bool is_system_keyspace(std::string_view keyspace);

 cql3::statements::alter_keyspace_statement::alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs)
@@ -36,6 +43,18 @@ future<> cql3::statements::alter_keyspace_statement::check_access(query_processo
    return state.has_keyspace_access(_name, auth::permission::ALTER);
 }

+static unsigned get_abs_rf_diff(const std::string& curr_rf, const std::string& new_rf) {
+    try {
+        return std::abs(std::stoi(curr_rf) - std::stoi(new_rf));
+    } catch (std::invalid_argument const& ex) {
+        on_internal_error(mylogger, fmt::format("get_abs_rf_diff expects integer arguments, "
+                                                "but got curr_rf:{} and new_rf:{}", curr_rf, new_rf));
+    } catch (std::out_of_range const& ex) {
+        on_internal_error(mylogger, fmt::format("get_abs_rf_diff expects integer arguments to fit into `int` type, "
+                                                "but got curr_rf:{} and new_rf:{}", curr_rf, new_rf));
+    }
+}
+
 void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, const service::client_state& state) const {
        auto tmp = _name;
        std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
@@ -61,6 +80,30 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
            }

            auto new_ks = _attrs->as_ks_metadata_update(ks.metadata(), *qp.proxy().get_token_metadata_ptr(), qp.proxy().features());
+
+            if (ks.get_replication_strategy().uses_tablets()) {
+                const std::map<sstring, sstring>& current_rf_per_dc = ks.metadata()->strategy_options();
+                auto new_rf_per_dc = _attrs->get_replication_options();
+                new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
+                unsigned total_abs_rfs_diff = 0;
+                for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
+                    sstring old_rf = "0";
+                    if (auto new_dc_in_current_mapping = current_rf_per_dc.find(new_dc);
+                             new_dc_in_current_mapping != current_rf_per_dc.end()) {
+                        old_rf = new_dc_in_current_mapping->second;
+                    } else if (!qp.proxy().get_token_metadata_ptr()->get_topology().get_datacenters().contains(new_dc)) {
+                        // This means that the DC listed in ALTER doesn't exist. This error will be reported later,
+                        // during validation in abstract_replication_strategy::validate_replication_strategy.
+                        // We can't report this error now, because it'd change the order of errors reported:
+                        // first we need to report non-existing DCs, then if RFs aren't changed by too much.
+                        continue;
+                    }
+                    if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2) {
+                        throw exceptions::invalid_request_exception("Only one DC's RF can be changed at a time and not by more than 1");
+                    }
+                }
+            }
+
            locator::replication_strategy_params params(new_ks->strategy_options(), new_ks->initial_tablets());
            auto new_rs = locator::abstract_replication_strategy::create_replication_strategy(new_ks->strategy_name(), params);
            if (new_rs->is_per_table() != ks.get_replication_strategy().is_per_table()) {
@@ -81,22 +124,133 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
 #endif
 }

+bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor& qp) const { // true iff the keyspace named _name uses tablets AND this statement carries replication options
+    auto ks = qp.db().find_keyspace(_name);
+    return ks.get_replication_strategy().uses_tablets() && !_attrs->get_replication_options().empty(); // only checks that options are present, not that they differ from the current ones
+}
+
+namespace {
+// These functions are used to flatten all the options in the keyspace definition into a single-level map<string, string>.
+// (Currently options are stored in a nested structure that looks more like a map<string, map<string, string>>).
+// Flattening is simply joining the keys of maps from both levels with a colon ':' character,
+// or in other words: prefixing the keys in the output map with the option type, e.g. 'replication', 'storage', etc.,
+// so that the output map contains entries like: "replication:dc1" -> "3".
+// This is done to avoid key conflicts and to be able to de-flatten the map back into the original structure.
+
+void add_prefixed_key(const sstring& prefix, const std::map<sstring, sstring>& in, std::map<sstring, sstring>& out) { // copies every entry of `in` into `out` under the key "<prefix>:<key>"
+    for (const auto& [in_key, in_value]: in) {
+        out[prefix + ":" + in_key] = in_value; // operator[] assignment: an existing entry with the same flattened key is overwritten
+    }
+};
+
+std::map<sstring, sstring> get_current_options_flattened(const shared_ptr<cql3::statements::ks_prop_defs>& ks, // `ks` holds the properties given in the statement being processed
+                                                         bool include_tablet_options,
+                                                         const gms::feature_service& feat) { // NOTE(review): `feat` is not referenced anywhere in this body
+    std::map<sstring, sstring> all_options;
+
+    add_prefixed_key(ks->KW_REPLICATION, ks->get_replication_options(), all_options);
+    add_prefixed_key(ks->KW_STORAGE, ks->get_storage_options().to_map(), all_options);
+    // if no tablet options are specified in the ALTER KEYSPACE statement,
+    // we want to preserve the old ones and hence cannot overwrite them with defaults
+    if (include_tablet_options) {
+        auto initial_tablets = ks->get_initial_tablets(std::nullopt);
+        add_prefixed_key(ks->KW_TABLETS,
+                         {{"enabled", initial_tablets ? "true" : "false"},
+                         {"initial", std::to_string(initial_tablets.value_or(0))}}, // a disengaged optional is emitted as "0"
+                         all_options);
+    }
+    add_prefixed_key(ks->KW_DURABLE_WRITES,
+                     {{sstring(ks->KW_DURABLE_WRITES), to_sstring(ks->get_boolean(ks->KW_DURABLE_WRITES, true))}}, // flattened key is "<KW_DURABLE_WRITES>:<KW_DURABLE_WRITES>"
+                     all_options);
+
+    return all_options;
+}
+
+std::map<sstring, sstring> get_old_options_flattened(const data_dictionary::keyspace& ks, bool include_tablet_options) { // same flattening, but read from the keyspace's existing metadata / replication strategy
+    std::map<sstring, sstring> all_options;
+
+    using namespace cql3::statements;
+    add_prefixed_key(ks_prop_defs::KW_REPLICATION, ks.get_replication_strategy().get_config_options(), all_options);
+    add_prefixed_key(ks_prop_defs::KW_STORAGE, ks.metadata()->get_storage_options().to_map(), all_options);
+    if (include_tablet_options) {
+        add_prefixed_key(ks_prop_defs::KW_TABLETS,
+                         {{"enabled", ks.metadata()->initial_tablets() ? "true" : "false"},
+                          {"initial", std::to_string(ks.metadata()->initial_tablets().value_or(0))}}, // a disengaged optional is emitted as "0"
+                         all_options);
+    }
+    add_prefixed_key(ks_prop_defs::KW_DURABLE_WRITES,
+                     {{sstring(ks_prop_defs::KW_DURABLE_WRITES), to_sstring(ks.metadata()->durable_writes())}},
+                     all_options);
+
+    return all_options;
+}
+} // <anonymous> namespace
+
 future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>
 cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, const query_options&, api::timestamp_type ts) const {
+    using namespace cql_transport;
    try {
-        auto old_ksm = qp.db().find_keyspace(_name).metadata();
+        event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
+        auto ks = qp.db().find_keyspace(_name);
+        auto ks_md = ks.metadata();
        const auto& tm = *qp.proxy().get_token_metadata_ptr();
        const auto& feat = qp.proxy().features();
+        auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, tm, feat);
+        std::vector<mutation> muts;
+        std::vector<sstring> warnings;
+        bool include_tablet_options = _attrs->get_map(_attrs->KW_TABLETS).has_value();
+        auto old_ks_options = get_old_options_flattened(ks, include_tablet_options);
+        auto ks_options = get_current_options_flattened(_attrs, include_tablet_options, feat);
+        ks_options.merge(old_ks_options);

-        auto m = service::prepare_keyspace_update_announcement(qp.db().real_database(), _attrs->as_ks_metadata_update(old_ksm, tm, feat), ts);
+        // we only want to run the tablets path if there are actually any tablets changes, not only schema changes
+        // TODO: the current `if (changes_tablets(qp))` is insufficient: someone may set the same RFs as before,
+        //       and we'll unnecessarily trigger the processing path for ALTER tablets KS,
+        //       when in reality nothing or only schema is being changed
+        if (changes_tablets(qp)) {
+            if (!qp.topology_global_queue_empty()) {
+                return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>(
+                        exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
+            }
+            if (_attrs->get_replication_options().contains(ks_prop_defs::REPLICATION_FACTOR_KEY)) {
+                return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>(
+                       exceptions::invalid_request_exception("'replication_factor' tag is not allowed when executing ALTER KEYSPACE with tablets, please list the DCs explicitly"));
+            }
+            qp.db().real_database().validate_keyspace_update(*ks_md_update);
+
+            service::topology_mutation_builder builder(ts);
+            builder.set_global_topology_request(service::global_topology_request::keyspace_rf_change);
+            builder.set_global_topology_request_id(this->global_req_id);
+            builder.set_new_keyspace_rf_change_data(_name, ks_options);
+            service::topology_change change{{builder.build()}};
+
+            auto topo_schema = qp.db().find_schema(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
+            boost::transform(change.mutations, std::back_inserter(muts), [topo_schema] (const canonical_mutation& cm) {
+                return cm.to_mutation(topo_schema);
+            });
+
+            service::topology_request_tracking_mutation_builder rtbuilder{utils::UUID{this->global_req_id}};
+            rtbuilder.set("done", false)
+                     .set("start_time", db_clock::now());
+            service::topology_change req_change{{rtbuilder.build()}};
+
+            auto topo_req_schema = qp.db().find_schema(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY_REQUESTS);
+            boost::transform(req_change.mutations, std::back_inserter(muts), [topo_req_schema] (const canonical_mutation& cm) {
+                return cm.to_mutation(topo_req_schema);
+            });
+
+            target_type = event::schema_change::target_type::TABLET_KEYSPACE;
+        } else {
+            auto schema_mutations = service::prepare_keyspace_update_announcement(qp.db().real_database(), ks_md_update, ts);
+            muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
+        }

-        using namespace cql_transport;
        auto ret = ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
-                event::schema_change::target_type::KEYSPACE,
+                target_type,
                keyspace());

-        return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), std::move(m), std::vector<sstring>()));
+        return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), std::move(muts), warnings));
    } catch (data_dictionary::no_such_keyspace& e) {
        return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
    }
@@ -107,7 +261,6 @@ cql3::statements::alter_keyspace_statement::prepare(data_dictionary::database db
    return std::make_unique<prepared_statement>(make_shared<alter_keyspace_statement>(*this));
 }

-static logging::logger mylogger("alter_keyspace");

 future<::shared_ptr<cql_transport::messages::result_message>>
 cql3::statements::alter_keyspace_statement::execute(query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const {
--- a/cql3/statements/alter_keyspace_statement.hh
+++ b/cql3/statements/alter_keyspace_statement.hh
@@ -25,6 +25,7 @@ class ks_prop_defs;
 class alter_keyspace_statement : public schema_altering_statement {
    sstring _name;
    ::shared_ptr<ks_prop_defs> _attrs;
+    bool changes_tablets(query_processor& qp) const;

 public:
    alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs);
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -384,7 +384,8 @@ std::pair<schema_builder, std::vector<view_ptr>> alter_table_statement::prepare_
                    auto new_where = util::rename_column_in_where_clause(
                            view->view_info()->where_clause(),
                            column_identifier::raw(view_from->text(), true),
-                            column_identifier::raw(view_to->text(), true));
+                            column_identifier::raw(view_to->text(), true),
+                            cql3::dialect{});
                    builder.with_view_info(view->view_info()->base_id(), view->view_info()->base_name(),
                            view->view_info()->include_all_columns(), std::move(new_where));

--- a/cql3/statements/create_service_level_statement.cc
+++ b/cql3/statements/create_service_level_statement.cc
@@ -6,6 +6,7 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include "exceptions/exceptions.hh"
 #include "seastarx.hh"
 #include "cql3/statements/create_service_level_statement.hh"
 #include "service/qos/service_level_controller.hh"
@@ -38,6 +39,10 @@ create_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
+    if (_service_level.starts_with('$')) {
+        throw exceptions::invalid_request_exception("Names starting with '$' are reserved for internal tenants. Use a different name.");
+    }
+
    qos::service_level_options slo = _slo.replace_defaults(qos::service_level_options{});
    return state.get_service_level_controller().add_distributed_service_level(_service_level, slo, _if_not_exists, std::move(guard)).then([] {
        using void_result_msg = cql_transport::messages::result_message::void_message;
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -24,7 +24,6 @@ static std::map<sstring, sstring> prepare_options(
        const sstring& strategy_class,
        const locator::token_metadata& tm,
        std::map<sstring, sstring> options,
-        std::optional<unsigned>& initial_tablets,
        const std::map<sstring, sstring>& old_options = {}) {
    options.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);

@@ -72,6 +71,35 @@ static std::map<sstring, sstring> prepare_options(
    return options;
 }

+ks_prop_defs::ks_prop_defs(std::map<sstring, sstring> options) { // builds the properties from a flat map whose keys start with an option-type keyword (KW_*) followed by one separator character
+    std::map<sstring, sstring> replication_opts, storage_opts, tablets_opts, durable_writes_opts;
+
+    auto read_property_into = [] (auto& map, const sstring& name, const sstring& value, const sstring& tag) {
+        map[name.substr(sstring(tag).size() + 1)] = value; // strip the tag plus the single character after it; that character is not checked here (presumably ':' — confirm against the producer)
+    };
+
+    for (const auto& [name, value] : options) {
+        if (name.starts_with(KW_DURABLE_WRITES)) {
+            read_property_into(durable_writes_opts, name, value, KW_DURABLE_WRITES);
+        } else if (name.starts_with(KW_REPLICATION)) {
+            read_property_into(replication_opts, name, value, KW_REPLICATION);
+        } else if (name.starts_with(KW_TABLETS)) {
+            read_property_into(tablets_opts, name, value, KW_TABLETS);
+        } else if (name.starts_with(KW_STORAGE)) {
+            read_property_into(storage_opts, name, value, KW_STORAGE);
+        } // names matching none of the four prefixes are silently ignored
+    }
+
+    if (!replication_opts.empty()) // groups that received no entries are not added as properties at all
+        add_property(KW_REPLICATION, replication_opts);
+    if (!storage_opts.empty())
+        add_property(KW_STORAGE, storage_opts);
+    if (!tablets_opts.empty())
+        add_property(KW_TABLETS, tablets_opts);
+    if (!durable_writes_opts.empty())
+        add_property(KW_DURABLE_WRITES, durable_writes_opts.begin()->second); // stored as a single value: only the first entry's value is used
+}
+
 void ks_prop_defs::validate() {
    // Skip validation if the strategy class is already set as it means we've already
    // prepared (and redoing it would set strategyClass back to null, which we don't want)
@@ -110,38 +138,31 @@ data_dictionary::storage_options ks_prop_defs::get_storage_options() const {
    return opts;
 }

-std::optional<unsigned> ks_prop_defs::get_initial_tablets(const sstring& strategy_class, bool enabled_by_default) const {
-    // FIXME -- this should be ignored somehow else
-    if (locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) != "org.apache.cassandra.locator.NetworkTopologyStrategy") {
-        return std::nullopt;
-    }
-
+std::optional<unsigned> ks_prop_defs::get_initial_tablets(std::optional<unsigned> default_value) const {
    auto tablets_options = get_map(KW_TABLETS);
    if (!tablets_options) {
-        return enabled_by_default ? std::optional<unsigned>(0) : std::nullopt;
+        return default_value;
    }

-    std::optional<unsigned> ret;
-
+    unsigned initial_count = 0;
    auto it = tablets_options->find("enabled");
    if (it != tablets_options->end()) {
        auto enabled = it->second;
        tablets_options->erase(it);

        if (enabled == "true") {
-            ret = 0; // even if 'initial' is not set, it'll start with auto-detection
+            // nothing
        } else if (enabled == "false") {
-            assert(!ret.has_value());
-            return ret;
+            return std::nullopt;
        } else {
-            throw exceptions::configuration_exception(sstring("Tablets enabled value must be true or false; found ") + it->second);
+            throw exceptions::configuration_exception(sstring("Tablets enabled value must be true or false; found: ") + enabled);
        }
    }

    it = tablets_options->find("initial");
    if (it != tablets_options->end()) {
        try {
-            ret = std::stol(it->second);
+            initial_count = std::stol(it->second);
        } catch (...) {
            throw exceptions::configuration_exception(sstring("Initial tablets value should be numeric; found ") + it->second);
        }
@@ -152,17 +173,22 @@ std::optional<unsigned> ks_prop_defs::get_initial_tablets(const sstring& strateg
        throw exceptions::configuration_exception(sstring("Unrecognized tablets option ") + tablets_options->begin()->first);
    }

-    return ret;
+    return initial_count;
 }

 std::optional<sstring> ks_prop_defs::get_replication_strategy_class() const {
    return _strategy_class;
 }

+bool ks_prop_defs::get_durable_writes() const { // reads the KW_DURABLE_WRITES property as a boolean
+    return get_boolean(KW_DURABLE_WRITES, true); // `true` is get_boolean's second argument — presumably the value used when the property is unset; confirm in property_definitions
+}
+
 lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat) {
    auto sc = get_replication_strategy_class().value();
-    std::optional<unsigned> initial_tablets = get_initial_tablets(sc, feat.tablets);
-    auto options = prepare_options(sc, tm, get_replication_options(), initial_tablets);
+    // if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for N.T.S. only
+    auto initial_tablets = get_initial_tablets(feat.tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy" ? std::optional<unsigned>(0) : std::nullopt);
+    auto options = prepare_options(sc, tm, get_replication_options());
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
            std::move(options), initial_tablets, get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }
@@ -171,16 +197,14 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
    std::map<sstring, sstring> options;
    const auto& old_options = old->strategy_options();
    auto sc = get_replication_strategy_class();
-    std::optional<unsigned> initial_tablets;
    if (sc) {
-        initial_tablets = get_initial_tablets(*sc, old->initial_tablets().has_value());
-        options = prepare_options(*sc, tm, get_replication_options(), initial_tablets, old_options);
+        options = prepare_options(*sc, tm, get_replication_options(), old_options);
    } else {
        sc = old->strategy_name();
        options = old_options;
-        initial_tablets = old->initial_tablets();
    }
-
+    // if tablets options have not been specified, inherit them if it's tablets-enabled KS
+    auto initial_tablets = get_initial_tablets(old->initial_tablets());
    return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }

--- a/cql3/statements/ks_prop_defs.hh
+++ b/cql3/statements/ks_prop_defs.hh
@@ -49,11 +49,15 @@ public:
 private:
    std::optional<sstring> _strategy_class;
 public:
+    ks_prop_defs() = default;
+    explicit ks_prop_defs(std::map<sstring, sstring> options);
+
    void validate();
    std::map<sstring, sstring> get_replication_options() const;
    std::optional<sstring> get_replication_strategy_class() const;
-    std::optional<unsigned> get_initial_tablets(const sstring& strategy_class, bool enabled_by_default) const;
+    std::optional<unsigned> get_initial_tablets(std::optional<unsigned> default_value) const;
    data_dictionary::storage_options get_storage_options() const;
+    bool get_durable_writes() const;
    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata(sstring ks_name, const locator::token_metadata&, const gms::feature_service&);
    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata_update(lw_shared_ptr<data_dictionary::keyspace_metadata> old, const locator::token_metadata&, const gms::feature_service&);
 };
--- a/cql3/statements/property_definitions.hh
+++ b/cql3/statements/property_definitions.hh
@@ -46,14 +46,14 @@ public:
 protected:
    std::optional<sstring> get_simple(const sstring& name) const;

-    std::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;
-
    void remove_from_map_if_exists(const sstring& name, const sstring& key) const;
 public:
    bool has_property(const sstring& name) const;

    std::optional<value_type> get(const sstring& name) const;

+    std::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;
+
    sstring get_string(sstring key, sstring default_value) const;

    // Return a property value, typed as a Boolean
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -63,6 +63,7 @@ protected:

 public:
    virtual future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type) const = 0;
+    mutable utils::UUID global_req_id;
 };

 }
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -1683,6 +1683,7 @@ schema_ptr mutation_fragments_select_statement::generate_output_schema(schema_pt

 future<exceptions::coordinator_result<service::storage_proxy_coordinator_query_result>>
 mutation_fragments_select_statement::do_query(
+        locator::effective_replication_map_ptr erm_keepalive,
        locator::host_id this_node,
        service::storage_proxy& sp,
        schema_ptr schema,
@@ -1690,7 +1691,7 @@ mutation_fragments_select_statement::do_query(
        dht::partition_range_vector partition_ranges,
        db::consistency_level cl,
        service::storage_proxy_coordinator_query_options optional_params) const {
-    auto res = co_await replica::mutation_dump::dump_mutations(sp.get_db(), schema, _underlying_schema, partition_ranges, *cmd, optional_params.timeout(sp));
+    auto res = co_await replica::mutation_dump::dump_mutations(sp.get_db(), std::move(erm_keepalive), schema, _underlying_schema, partition_ranges, *cmd, optional_params.timeout(sp));
    service::replicas_per_token_range last_replicas;
    if (this_node) {
        last_replicas.emplace(dht::token_range::make_open_ended_both_sides(), std::vector<locator::host_id>{this_node});
@@ -1762,7 +1763,7 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
    if (!aggregate && !_restrictions_need_filtering && (page_size <= 0
            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
-        return do_query({}, qp.proxy(), _schema, command, std::move(key_ranges), cl,
+        return do_query(erm_keepalive, {}, qp.proxy(), _schema, command, std::move(key_ranges), cl,
                {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state(), {}, {}})
        .then(wrap_result_to_error_message([this, erm_keepalive, now, slice = command->slice] (service::storage_proxy_coordinator_query_result&& qr) mutable {
            cql3::selection::result_set_builder builder(*_selection, now);
@@ -1801,8 +1802,8 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
            std::move(key_ranges),
            _restrictions_need_filtering ? _restrictions : nullptr,
            [this, erm_keepalive, this_node] (service::storage_proxy& sp, schema_ptr schema, lw_shared_ptr<query::read_command> cmd, dht::partition_range_vector partition_ranges,
-                    db::consistency_level cl, service::storage_proxy_coordinator_query_options optional_params) {
-                return do_query(this_node, sp, std::move(schema), std::move(cmd), std::move(partition_ranges), cl, std::move(optional_params));
+                    db::consistency_level cl, service::storage_proxy_coordinator_query_options optional_params) mutable {
+                return do_query(std::move(erm_keepalive), this_node, sp, std::move(schema), std::move(cmd), std::move(partition_ranges), cl, std::move(optional_params));
            });

    if (_selection->is_trivial() && !_restrictions_need_filtering && !_per_partition_limit) {
@@ -2032,7 +2033,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
            && !restrictions->need_filtering()  // No filtering
            && group_by_cell_indices->empty()   // No GROUP BY
            && db.get_config().enable_parallelized_aggregation()
-            && !is_local_table();
+            && !is_local_table()
+            && !( // Do not parallelize the request if it's single partition read
+                restrictions->partition_key_restrictions_is_all_eq() 
+                && restrictions->partition_key_restrictions_size() == schema->partition_key_size());
    };

    if (_parameters->is_prune_materialized_view()) {
@@ -2558,7 +2562,9 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
    if (!where_clause.empty()) {
        out << " WHERE " << where_clause << " ALLOW FILTERING";
    }
-    return do_with_parser(out.str(), std::mem_fn(&cql3_parser::CqlParser::selectStatement));
+    // In general it's not a good idea to use the default dialect, but here the database is talking to
+    // itself, so we can hope the dialects are mutually compatible here.
+    return do_with_parser(out.str(), dialect{}, std::mem_fn(&cql3_parser::CqlParser::selectStatement));
 }

 }
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -338,6 +338,7 @@ public:
 private:
    future<exceptions::coordinator_result<service::storage_proxy_coordinator_query_result>>
    do_query(
+            locator::effective_replication_map_ptr erm_keepalive,
            locator::host_id this_node,
            service::storage_proxy& sp,
            schema_ptr schema,
--- a/cql3/util.cc
+++ b/cql3/util.cc
@@ -20,7 +20,7 @@ void __sanitizer_finish_switch_fiber(void* fake_stack_save, const void** stack_b

 namespace cql3::util {

-static void do_with_parser_impl_impl(const sstring_view& cql, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
+static void do_with_parser_impl_impl(const sstring_view& cql, dialect d, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
    cql3_parser::CqlLexer::collector_type lexer_error_collector(cql);
    cql3_parser::CqlParser::collector_type parser_error_collector(cql);
    cql3_parser::CqlLexer::InputStreamType input{reinterpret_cast<const ANTLR_UINT8*>(cql.begin()), ANTLR_ENC_UTF8, static_cast<ANTLR_UINT32>(cql.size()), nullptr};
@@ -29,13 +29,14 @@ static void do_with_parser_impl_impl(const sstring_view& cql, noncopyable_functi
    cql3_parser::CqlParser::TokenStreamType tstream(ANTLR_SIZE_HINT, lexer.get_tokSource());
    cql3_parser::CqlParser parser{&tstream};
    parser.set_error_listener(parser_error_collector);
+    parser.set_dialect(d);
    f(parser);
 }

 #ifndef DEBUG

-void do_with_parser_impl(const sstring_view& cql, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
-    return do_with_parser_impl_impl(cql, std::move(f));
+void do_with_parser_impl(const sstring_view& cql, dialect d, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
+    return do_with_parser_impl_impl(cql, d, std::move(f));
 }

 #else
@@ -47,6 +48,7 @@ void do_with_parser_impl(const sstring_view& cql, noncopyable_function<void (cql
 struct thunk_args {
    // arguments to do_with_parser_impl_impl
    const sstring_view& cql;
+    dialect d;
    noncopyable_function<void (cql3_parser::CqlParser&)>&& func;
    // Exceptions can't be returned from another stack, so store
    // any thrown exception here
@@ -70,7 +72,7 @@ static void thunk(int p1, int p2) {
    // Complete stack switch started in do_with_parser_impl()
    __sanitizer_finish_switch_fiber(nullptr, &san.stack_bottom, &san.stack_size);
    try {
-        do_with_parser_impl_impl(args->cql, std::move(args->func));
+        do_with_parser_impl_impl(args->cql, args->d, std::move(args->func));
    } catch (...) {
        args->ex = std::current_exception();
    }
@@ -79,11 +81,12 @@ static void thunk(int p1, int p2) {
    setcontext(&args->caller_stack);
 };

-void do_with_parser_impl(const sstring_view& cql, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
+void do_with_parser_impl(const sstring_view& cql, dialect d, noncopyable_function<void (cql3_parser::CqlParser& parser)> f) {
    static constexpr size_t stack_size = 1 << 20;
    static thread_local std::unique_ptr<char[]> stack = std::make_unique<char[]>(stack_size);
    thunk_args args{
        .cql = cql,
+        .d = d,
        .func = std::move(f),
    };
    ucontext_t uc;
@@ -92,7 +95,7 @@ void do_with_parser_impl(const sstring_view& cql, noncopyable_function<void (cql
    if (stack.get() <= (char*)&uc && (char*)&uc < stack.get() + stack_size) {
        // We are already running on the large stack, so just call the
        // parser directly.
-        return do_with_parser_impl_impl(cql, std::move(f));
+        return do_with_parser_impl_impl(cql, d, std::move(f));
    }
    uc.uc_stack.ss_sp = stack.get();
    uc.uc_stack.ss_size = stack_size;
@@ -136,12 +139,12 @@ sstring relations_to_where_clause(const expr::expression& e) {
    return boost::algorithm::join(expressions, " AND ");
 }

-expr::expression where_clause_to_relations(const sstring_view& where_clause) {
-    return do_with_parser(where_clause, std::mem_fn(&cql3_parser::CqlParser::whereClause));
+expr::expression where_clause_to_relations(const sstring_view& where_clause, dialect d) {
+    return do_with_parser(where_clause, d, std::mem_fn(&cql3_parser::CqlParser::whereClause));
 }

-sstring rename_column_in_where_clause(const sstring_view& where_clause, column_identifier::raw from, column_identifier::raw to) {
-    std::vector<expr::expression> relations = boolean_factors(where_clause_to_relations(where_clause));
+sstring rename_column_in_where_clause(const sstring_view& where_clause, column_identifier::raw from, column_identifier::raw to, dialect d) {
+    std::vector<expr::expression> relations = boolean_factors(where_clause_to_relations(where_clause, d));
    std::vector<expr::expression> new_relations;
    new_relations.reserve(relations.size());

--- a/cql3/util.hh
+++ b/cql3/util.hh
@@ -21,18 +21,19 @@
 #include "cql3/CqlParser.hpp"
 #include "cql3/error_collector.hh"
 #include "cql3/statements/raw/select_statement.hh"
+#include "cql3/dialect.hh"

 namespace cql3 {

 namespace util {


-void do_with_parser_impl(const sstring_view& cql, noncopyable_function<void (cql3_parser::CqlParser& p)> func);
+void do_with_parser_impl(const sstring_view& cql, dialect d, noncopyable_function<void (cql3_parser::CqlParser& p)> func);

 template <typename Func, typename Result = cql3_parser::unwrap_uninitialized_t<std::result_of_t<Func(cql3_parser::CqlParser&)>>>
-Result do_with_parser(const sstring_view& cql, Func&& f) {
+Result do_with_parser(const sstring_view& cql, dialect d, Func&& f) {
    std::optional<Result> ret;
-    do_with_parser_impl(cql, [&] (cql3_parser::CqlParser& parser) {
+    do_with_parser_impl(cql, d, [&] (cql3_parser::CqlParser& parser) {
        ret.emplace(f(parser));
    });
    return std::move(*ret);
@@ -40,9 +41,9 @@ Result do_with_parser(const sstring_view& cql, Func&& f) {

 sstring relations_to_where_clause(const expr::expression& e);

-expr::expression where_clause_to_relations(const sstring_view& where_clause);
+expr::expression where_clause_to_relations(const sstring_view& where_clause, dialect d);

-sstring rename_column_in_where_clause(const sstring_view& where_clause, column_identifier::raw from, column_identifier::raw to);
+sstring rename_column_in_where_clause(const sstring_view& where_clause, column_identifier::raw from, column_identifier::raw to, dialect d);

 /// build a CQL "select" statement with the desired parameters.
 /// If select_all_columns==true, all columns are selected and the value of
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -390,6 +390,12 @@ struct fmt::formatter<data_dictionary::user_types_metadata> {
 };

 auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dictionary::keyspace_metadata& m, fmt::format_context& ctx) const -> decltype(ctx.out()) {
-    return fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, userTypes={}}}",
-            m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes(), m.user_types());
+    // Formats the keyspace metadata as one "KSMetaData{...}" record. The
+    // "tablets=" field is rendered as {"initial":N} when initial_tablets()
+    // holds a value and as {"enabled":false} otherwise.
+    //
+    // Each format_to() returns the iterator positioned after what it wrote;
+    // that iterator is fed into the next call (and finally returned) instead
+    // of asking the context for ctx.out() again, so the pieces are chained
+    // explicitly rather than relying on the context's iterator staying valid.
+    auto out = fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
+            m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes());
+    if (m.initial_tablets()) {
+        out = fmt::format_to(out, "{{\"initial\":{}}}", m.initial_tablets().value());
+    } else {
+        out = fmt::format_to(out, "{{\"enabled\":false}}");
+    }
+    return fmt::format_to(out, ", userTypes={}}}", m.user_types());
 }
--- a/db/CMakeLists.txt
+++ b/db/CMakeLists.txt
@@ -2,7 +2,6 @@ add_library(db STATIC)
 target_sources(db
  PRIVATE
    consistency_level.cc
-    system_auth_keyspace.cc
    system_keyspace.cc
    virtual_table.cc
    virtual_tables.cc
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -133,7 +133,7 @@ future<> db::batchlog_manager::stop() {
 }

 future<size_t> db::batchlog_manager::count_all_batches() const {
-    sstring query = format("SELECT count(*) FROM {}.{}", system_keyspace::NAME, system_keyspace::BATCHLOG);
+    sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG);
    return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> rs) {
       return size_t(rs->one().get_as<int64_t>("count"));
    });
@@ -152,26 +152,26 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

-    auto batch = [this, limiter](const cql3::untyped_result_set::row& row) {
+    auto batch = [this, limiter](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
        auto written_at = row.get_as<db_clock::time_point>("written_at");
        auto id = row.get_as<utils::UUID>("id");
        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
        auto timeout = get_batch_log_timeout();
        if (db_clock::now() < written_at + timeout) {
            blogger.debug("Skipping replay of {}, too fresh", id);
-            return make_ready_future<>();
+            return make_ready_future<stop_iteration>(stop_iteration::no);
        }

        // check version of serialization format
        if (!row.has("version")) {
            blogger.warn("Skipping logged batch because of unknown version");
-            return make_ready_future<>();
+            return make_ready_future<stop_iteration>(stop_iteration::no);
        }

        auto version = row.get_as<int32_t>("version");
        if (version != netw::messaging_service::current_version) {
            blogger.warn("Skipping logged batch because of incorrect version");
-            return make_ready_future<>();
+            return make_ready_future<stop_iteration>(stop_iteration::no);
        }

        auto data = row.get_blob("data");
@@ -253,49 +253,20 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-        });
+        }).then([] { return make_ready_future<stop_iteration>(stop_iteration::no); });
    };

-    return seastar::with_gate(_gate, [this, batch = std::move(batch)] {
+    return seastar::with_gate(_gate, [this, batch = std::move(batch)] () mutable {
        blogger.debug("Started replayAllFailedBatches (cpu {})", this_shard_id());
-
-        typedef ::shared_ptr<cql3::untyped_result_set> page_ptr;
-        sstring query = format("SELECT id, data, written_at, version FROM {}.{} LIMIT {:d}", system_keyspace::NAME, system_keyspace::BATCHLOG, page_size);
-        return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([this, batch = std::move(batch)](page_ptr page) {
-            return do_with(std::move(page), [this, batch = std::move(batch)](page_ptr & page) mutable {
-                return repeat([this, &page, batch = std::move(batch)]() mutable {
-                    if (page->empty()) {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    }
-                    auto id = page->back().get_as<utils::UUID>("id");
-                    return parallel_for_each(*page, batch).then([this, &page, id]() {
-                        if (page->size() < page_size) {
-                            return make_ready_future<stop_iteration>(stop_iteration::yes); // we've exhausted the batchlog, next query would be empty.
-                        }
-                        sstring query = format("SELECT id, data, written_at, version FROM {}.{} WHERE token(id) > token(?) LIMIT {:d}",
-                                system_keyspace::NAME,
-                                system_keyspace::BATCHLOG,
-                                page_size);
-                        return _qp.execute_internal(query, {id}, cql3::query_processor::cache_internal::yes).then([&page](auto res) {
-                                    page = std::move(res);
-                                    return make_ready_future<stop_iteration>(stop_iteration::no);
-                                });
-                    });
-                });
-            });
-        }).then([] {
-        // TODO FIXME : cleanup()
-#if 0
-            ColumnFamilyStore cfs = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHLOG);
-            cfs.forceBlockingFlush();
-            Collection<Descriptor> descriptors = new ArrayList<>();
-            for (SSTableReader sstr : cfs.getSSTables())
-            descriptors.add(sstr.descriptor);
-            if (!descriptors.isEmpty()) // don't pollute the logs if there is nothing to compact.
-            CompactionManager.instance.submitUserDefined(cfs, descriptors, Integer.MAX_VALUE).get();
-
-#endif
-
+        return _qp.query_internal(
+                format("SELECT id, data, written_at, version FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
+                db::consistency_level::ONE,
+                {},
+                page_size,
+                std::move(batch)).then([this] {
+            // Replaying batches could have generated tombstones, flush to disk,
+            // where they can be compacted away.
+            return replica::database::flush_table_on_all_shards(_qp.proxy().get_db(), system_keyspace::NAME, system_keyspace::BATCHLOG);
        }).then([] {
            blogger.debug("Finished replayAllFailedBatches");
        });
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1099,7 +1099,12 @@ public:
            write(out, uint64_t(0));
        }

-        buf.remove_suffix(buf.size_bytes() - size);
+        auto to_remove = buf.size_bytes() - size;
+        // #20862 - the usage counter is decremented below based on buf.size().
+        // Since the buffer is shrunk here, the counter must be decremented
+        // by the removed amount right away as well.
+        buf.remove_suffix(to_remove);
+        _segment_manager->totals.buffer_list_bytes -= to_remove;

        // Build sector checksums.
        auto id = net::hton(_desc.id);
@@ -3238,6 +3243,10 @@ uint64_t db::commitlog::get_total_size() const {
        ;
 }

+// Returns the segment manager's current totals.buffer_list_bytes counter
+// (presumably the number of bytes currently held in commitlog buffers --
+// confirm against the segment manager's accounting).
+uint64_t db::commitlog::get_buffer_size() const {
+    return _segment_manager->totals.buffer_list_bytes;
+}
+
 uint64_t db::commitlog::get_completed_tasks() const {
    return _segment_manager->totals.allocation_count;
 }
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -297,6 +297,7 @@ public:
    future<> delete_segments(std::vector<sstring>) const;

    uint64_t get_total_size() const;
+    uint64_t get_buffer_size() const;
    uint64_t get_completed_tasks() const;
    uint64_t get_flush_count() const;
    uint64_t get_pending_tasks() const;
--- a/db/config.cc
+++ b/db/config.cc
@@ -99,6 +99,21 @@ error_injection_list_to_json(const std::vector<db::config::error_injection_at_st
    return value_to_json("error_injection_list");
 }

+// bool specialization of config_from_string: parses the textual form of a
+// boolean config value. Throws boost::bad_lexical_cast for any other input.
+template <>
+bool
+config_from_string(std::string_view value) {
+    // boost::lexical_cast doesn't accept true/false, which are our output representations
+    // for bools. We want round-tripping, so we need to accept true/false. For backward
+    // compatibility, we also accept 1/0. #19791.
+    const bool is_true = value == "true" || value == "1";
+    const bool is_false = value == "false" || value == "0";
+    if (!is_true && !is_false) {
+        // Neither spelling matched: report it the same way lexical_cast would.
+        throw boost::bad_lexical_cast(typeid(std::string_view), typeid(bool));
+    }
+    return is_true;
+}
+
 template <>
 const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);

@@ -177,7 +192,7 @@ struct convert<seastar::log_level> {
        if (!convert<std::string>::decode(node, tmp)) {
            return false;
        }
-        rhs = boost::lexical_cast<seastar::log_level>(tmp);
+        rhs = utils::config_from_string<seastar::log_level>(tmp);
        return true;
    }
 };
@@ -991,7 +1006,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit.")
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default)"
        "bytes written to data file. Value must be between 0 and 1.")
-    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .1, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
+    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable.")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting.")
@@ -1031,6 +1046,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Start serializing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , reader_concurrency_semaphore_kill_limit_multiplier(this, "reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
            "Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
+    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 1,
+            "Admit new reads while there are less than this number of requests that need CPU.")
    , twcs_max_window_count(this, "twcs_max_window_count", liveness::LiveUpdate, value_status::Used, 50,
            "The maximum number of compaction windows allowed when making use of TimeWindowCompactionStrategy. A setting of 0 effectively disables the restriction.")
    , initial_sstable_loading_concurrency(this, "initial_sstable_loading_concurrency", value_status::Used, 4u,
@@ -1072,6 +1089,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Make the system.config table UPDATEable.")
    , enable_parallelized_aggregation(this, "enable_parallelized_aggregation", liveness::LiveUpdate, value_status::Used, true,
            "Use on a new, parallel algorithm for performing aggregate queries.")
+    , cql_duplicate_bind_variable_names_refer_to_same_variable(this, "cql_duplicate_bind_variable_names_refer_to_same_variable", liveness::LiveUpdate, value_status::Used, true,
+            "A bind variable that appears twice in a CQL query refers to a single variable (if false, no name matching is performed).")
    , alternator_port(this, "alternator_port", value_status::Used, 0, "Alternator API port.")
    , alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port.")
    , alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address.")
@@ -1157,6 +1176,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")
    , error_injections_at_startup(this, "error_injections_at_startup", error_injection_value_status, {}, "List of error injections that should be enabled on startup.")
    , topology_barrier_stall_detector_threshold_seconds(this, "topology_barrier_stall_detector_threshold_seconds", value_status::Used, 2, "Report sites blocking topology barrier if it takes longer than this.")
+    , enable_tablets(this, "enable_tablets", value_status::Used, false, "Enable tablets for newly created keyspaces")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
@@ -1347,7 +1367,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
        {"consistent-topology-changes", feature::UNUSED},
        {"broadcast-tables", feature::BROADCAST_TABLES},
        {"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
-        {"tablets", feature::TABLETS},
+        {"tablets", feature::UNUSED},
    };
 }

--- a/db/config.hh
+++ b/db/config.hh
@@ -111,7 +111,6 @@ struct experimental_features_t {
        ALTERNATOR_STREAMS,
        BROADCAST_TABLES,
        KEYSPACE_STORAGE_OPTIONS,
-        TABLETS,
    };
    static std::map<sstring, feature> map(); // See enum_option.
    static std::vector<enum_option<experimental_features_t>> all();
@@ -390,6 +389,7 @@ public:
    named_value<uint64_t> max_memory_for_unlimited_query_hard_limit;
    named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
+    named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
    named_value<uint32_t> twcs_max_window_count;
    named_value<unsigned> initial_sstable_loading_concurrency;
    named_value<bool> enable_3_1_0_compatibility_mode;
@@ -406,6 +406,7 @@ public:
    named_value<bool> enable_optimized_reversed_reads;
    named_value<bool> enable_cql_config_updates;
    named_value<bool> enable_parallelized_aggregation;
+    named_value<bool> cql_duplicate_bind_variable_names_refer_to_same_variable;

    named_value<uint16_t> alternator_port;
    named_value<uint16_t> alternator_https_port;
@@ -495,6 +496,7 @@ public:

    named_value<std::vector<error_injection_at_startup>> error_injections_at_startup;
    named_value<double> topology_barrier_stall_detector_threshold_seconds;
+    named_value<bool> enable_tablets;

    static const sstring default_tls_priority;
 private:
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -36,7 +36,7 @@ size_t quorum_for(const locator::effective_replication_map& erm) {
 size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
    using namespace locator;

-    auto& rs = erm.get_replication_strategy();
+    const auto& rs = erm.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
        const network_topology_strategy* nrs =
@@ -65,7 +65,7 @@ size_t block_for_local_serial(const locator::effective_replication_map& erm) {
 size_t block_for_each_quorum(const locator::effective_replication_map& erm) {
    using namespace locator;

-    auto& rs = erm.get_replication_strategy();
+    const auto& rs = erm.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
        const network_topology_strategy* nrs =
@@ -260,7 +260,7 @@ filter_for_query(consistency_level cl,
    size_t bf = block_for(erm, cl);

    if (read_repair == read_repair_decision::DC_LOCAL) {
-        bf = std::max(block_for(erm, cl), local_count);
+        bf = std::max(bf, local_count);
    }

    if (bf >= live_endpoints.size()) { // RRD.DC_LOCAL + CL.LOCAL or CL.ALL
@@ -334,7 +334,13 @@ filter_for_query(consistency_level cl,
        if (!old_node && ht_max - ht_min > 0.01) { // if there is old node or hit rates are close skip calculations
            // local node is always first if present (see storage_proxy::get_endpoints_for_reading)
            unsigned local_idx = erm.get_topology().is_me(epi[0].first) ? 0 : epi.size() + 1;
-            live_endpoints = boost::copy_range<inet_address_vector_replica_set>(miss_equalizing_combination(epi, local_idx, remaining_bf, bool(extra)));
+            auto weighted = boost::copy_range<inet_address_vector_replica_set>(miss_equalizing_combination(epi, local_idx, remaining_bf, bool(extra)));
+            // Workaround for https://github.com/scylladb/scylladb/issues/9285
+            auto last = std::adjacent_find(weighted.begin(), weighted.end());
+            if (last == weighted.end()) {
+                // No duplicates, so use the result based on hit rates
+                live_endpoints = std::move(weighted);
+            }
        }
    }

--- a/db/cql_type_parser.cc
+++ b/db/cql_type_parser.cc
@@ -20,7 +20,9 @@
 #include "utils/sorting.hh"

 static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
-    return cql3::util::do_with_parser(str,
+    // In general it's a bad idea to use the default dialect, but type parsing
+    // should be dialect-agnostic.
+    return cql3::util::do_with_parser(str, cql3::dialect{},
        [] (cql3_parser::CqlParser& parser) {
            return parser.comparator_type(true);
        });
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -167,6 +167,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
        return io_check([name = _hints_dir.c_str()] { return recursive_touch_directory(name); }).then([this] () {
            commitlog::config cfg;

+            cfg.sched_group = _shard_manager.local_db().commitlog()->active_config().sched_group;
            cfg.commit_log_location = _hints_dir.c_str();
            cfg.commitlog_segment_size_in_mb = resource_manager::hint_segment_size_in_mb;
            cfg.commitlog_total_space_in_mb = resource_manager::max_hints_per_ep_size_mb;
--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -76,23 +76,6 @@ future<timespec> hint_sender::get_last_file_modification(const sstring& fname) {
    });
 }

-future<> hint_sender::do_send_one_mutation(frozen_mutation_and_schema m, locator::effective_replication_map_ptr ermp, const inet_address_vector_replica_set& natural_endpoints) {
-    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints] () mutable -> future<> {
-        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
-        // to be generated as a result of hints sending.
-        const auto& tm = ermp->get_token_metadata();
-        const auto maybe_addr = tm.get_endpoint_for_host_id_if_known(end_point_key());
-
-        if (maybe_addr && boost::range::find(natural_endpoints, *maybe_addr) != natural_endpoints.end()) {
-            manager_logger.trace("Sending directly to {}", end_point_key());
-            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), *maybe_addr);
-        } else {
-            manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
-            return _proxy.send_hint_to_all_replicas(std::move(m));
-        }
-    });
-}
-
 bool hint_sender::can_send() noexcept {
    if (stopping() && !draining()) {
        return false;
@@ -274,11 +257,30 @@ void hint_sender::start() {
 }

 future<> hint_sender::send_one_mutation(frozen_mutation_and_schema m) {
-    auto erm = _db.find_column_family(m.s).get_effective_replication_map();
+    // Look up the table's effective replication map and use it to compute the
+    // replicas (natural endpoints) of the mutation's token.
+    auto ermp = _db.find_column_family(m.s).get_effective_replication_map();
     auto token = dht::get_token(*m.s, m.fm.key());
-    inet_address_vector_replica_set natural_endpoints = erm->get_natural_endpoints(std::move(token));
+    inet_address_vector_replica_set natural_endpoints = ermp->get_natural_endpoints(std::move(token));

-    return do_send_one_mutation(std::move(m), std::move(erm), std::move(natural_endpoints));
+    // natural_endpoints is captured by reference: the lambda only reads it in
+    // the replica-membership check below, before either send is started.
+    // NOTE(review): this relies on futurize_invoke running the lambda before
+    // this function returns -- confirm against the Seastar documentation.
+    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints] () mutable -> future<> {
+        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
+        // to be generated as a result of hints sending.
+        const auto& tm = ermp->get_token_metadata();
+        const auto maybe_addr = tm.get_endpoint_for_host_id_if_known(end_point_key());
+
+        // Send directly only if the destination's address is known, it is still
+        // one of the replicas, and it is not leaving; otherwise fall back to
+        // sending to all replicas.
+        if (maybe_addr && boost::range::find(natural_endpoints, *maybe_addr) != natural_endpoints.end() && !tm.is_leaving(end_point_key())) {
+            manager_logger.trace("Sending directly to {}", end_point_key());
+            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), *maybe_addr);
+        } else {
+            // is_leaving() is re-evaluated only when trace logging is enabled,
+            // just to pick the message that matches the reason for the fallback.
+            if (manager_logger.is_enabled(log_level::trace)) {
+                if (tm.is_leaving(end_point_key())) {
+                    manager_logger.trace("The original target endpoint {} is leaving. Mutating from scratch...", end_point_key());
+                } else {
+                    manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
+                }
+            }
+            return _proxy.send_hint_to_all_replicas(std::move(m));
+        }
+    });
 }

 future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fragmented_temporary_buffer buf, db::replay_position rp, gc_clock::duration secs_since_file_mod, const sstring& fname) {
--- a/db/hints/internal/hint_sender.hh
+++ b/db/hints/internal/hint_sender.hh
@@ -233,18 +233,14 @@ private:
    /// \return
    const column_mapping& get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr);

-    /// \brief Perform a single mutation send attempt.
+    /// \brief Send one mutation out.
    ///
    /// If the original destination end point is still a replica for the given mutation - send the mutation directly
    /// to it, otherwise execute the mutation "from scratch" with CL=ALL.
    ///
-    /// \param m mutation to send
-    /// \param ermp points to the effective_replication_map used to obtain \c natural_endpoints
-    /// \param natural_endpoints current replicas for the given mutation
-    /// \return future that resolves when the operation is complete
-    future<> do_send_one_mutation(frozen_mutation_and_schema m, locator::effective_replication_map_ptr ermp, const inet_address_vector_replica_set& natural_endpoints);
-
-    /// \brief Send one mutation out.
+    /// The mutation is also sent with CL=ALL semantics to all current replicas when the original destination
+    /// is leaving the cluster; otherwise the hint might be applied only on the leaving node and streaming
+    /// might miss it.
    ///
    /// \param m mutation to send
    /// \return future that resolves when the mutation sending processing is complete.
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -278,7 +278,7 @@ sync_point::shard_rps manager::calculate_current_sync_point(std::span<const gms:
        auto it = _ep_managers.find(*hid);
        if (it != _ep_managers.end()) {
            const hint_endpoint_manager& ep_man = it->second;
-            rps[addr] = ep_man.last_written_replay_position();
+            rps[*hid] = ep_man.last_written_replay_position();
        }
    }

@@ -316,10 +316,14 @@ future<> manager::wait_for_sync_point(abort_source& as, const sync_point::shard_
    hid_rps.reserve(rps.size());

    for (const auto& [addr, rp] : rps) {
-        const auto maybe_hid = tmptr->get_host_id_if_known(addr);
-        // Ignore the IPs we cannot map.
-        if (maybe_hid) [[likely]] {
-            hid_rps.emplace(*maybe_hid, rp);
+        if (std::holds_alternative<gms::inet_address>(addr)) {
+            const auto maybe_hid = tmptr->get_host_id_if_known(std::get<gms::inet_address>(addr));
+            // Ignore the IPs we cannot map.
+            if (maybe_hid) [[likely]] {
+                hid_rps.emplace(*maybe_hid, rp);
+            }
+        } else {
+            hid_rps.emplace(std::get<locator::host_id>(addr), rp);
        }
    }

@@ -409,6 +413,12 @@ bool manager::have_ep_manager(const std::variant<locator::host_id, gms::inet_add
 bool manager::store_hint(endpoint_id host_id, gms::inet_address ip, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm,
        tracing::trace_state_ptr tr_state) noexcept
 {
+    if (utils::get_local_injector().enter("reject_incoming_hints")) {
+        manager_logger.debug("Rejecting a hint to {} / {} due to an error injection", host_id, ip);
+        ++_stats.dropped;
+        return false;
+    }
+
    if (stopping() || draining_all() || !started() || !can_hint_for(host_id)) {
        manager_logger.trace("Can't store a hint to {}", host_id);
        ++_stats.dropped;
@@ -554,10 +564,16 @@ future<> manager::change_host_filter(host_filter filter) {
            const auto maybe_host_id_and_ip = std::invoke([&] () -> std::optional<pair_type> {
                try {
                    locator::host_id_or_endpoint hid_or_ep{de.name};
-                    if (hid_or_ep.has_host_id()) {
+
+                    // If hinted handoff is host-ID-based, hint directories representing IP addresses must've
+                    // been created by mistake and they're invalid. The same for pre-host-ID hinted handoff
+                    // -- hint directories representing host IDs are NOT valid.
+                    if (hid_or_ep.has_host_id() && _uses_host_id) {
                        return std::make_optional(pair_type{hid_or_ep.id(), hid_or_ep.resolve_endpoint(*tmptr)});
-                    } else {
+                    } else if (hid_or_ep.has_endpoint() && !_uses_host_id) {
                        return std::make_optional(pair_type{hid_or_ep.resolve_id(*tmptr), hid_or_ep.endpoint()});
+                    } else {
+                        return std::nullopt;
                    }
                } catch (...) {
                    return std::nullopt;
@@ -565,6 +581,8 @@ future<> manager::change_host_filter(host_filter filter) {
            });

            if (!maybe_host_id_and_ip) {
+                manager_logger.warn("Encountered a hint directory of invalid name while changing the host filter: {}. "
+                        "Hints stored in it won't be replayed.", de.name);
                co_return;
            }

@@ -618,12 +636,12 @@ bool manager::check_dc_for(endpoint_id ep) const noexcept {
    }
 }

-future<> manager::drain_for(endpoint_id endpoint) noexcept {
+future<> manager::drain_for(endpoint_id host_id, gms::inet_address ip) noexcept {
    if (!started() || stopping() || draining_all()) {
        co_return;
    }

-    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);
+    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", host_id);

    const auto holder = seastar::gate::holder{_draining_eps_gate};
    // As long as we hold on to this lock, no migration of hinted handoff to host IDs
@@ -642,7 +660,7 @@ future<> manager::drain_for(endpoint_id endpoint) noexcept {

    std::exception_ptr eptr = nullptr;

-    if (_proxy.local_db().get_token_metadata().get_topology().is_me(endpoint)) {
+    if (_proxy.local_db().get_token_metadata().get_topology().is_me(host_id)) {
        set_draining_all();

        try {
@@ -657,28 +675,45 @@ future<> manager::drain_for(endpoint_id endpoint) noexcept {
        _ep_managers.clear();
        _hint_directory_manager.clear();
    } else {
-        auto it = _ep_managers.find(endpoint);
-
-        if (it != _ep_managers.end()) {
-            try {
-                co_await drain_ep_manager(it->second);
-            } catch (...) {
-                eptr = std::current_exception();
+        const auto maybe_host_id = std::invoke([&] () -> std::optional<locator::host_id> {
+            if (_uses_host_id) {
+                return host_id;
            }
+            // Before the whole cluster is migrated to the host-ID-based hinted handoff,
+            // one hint directory may correspond to multiple target nodes. If *any* of them
+            // leaves the cluster, we should drain the hint directory. This is why we need
+            // to rely on this mapping here.
+            const auto maybe_mapping = _hint_directory_manager.get_mapping(host_id, ip);
+            if (maybe_mapping) {
+                return maybe_mapping->first;
+            }
+            return std::nullopt;
+        });

-            // We can't provide the function with `it` here because we co_await above,
-            // so iterators could have been invalidated.
-            // This never throws.
-            _ep_managers.erase(endpoint);
-            _hint_directory_manager.remove_mapping(endpoint);
+        if (maybe_host_id) {
+            auto it = _ep_managers.find(*maybe_host_id);
+
+            if (it != _ep_managers.end()) {
+                try {
+                    co_await drain_ep_manager(it->second);
+                } catch (...) {
+                    eptr = std::current_exception();
+                }
+
+                // We can't provide the function with `it` here because we co_await above,
+                // so iterators could have been invalidated.
+                // This never throws.
+                _ep_managers.erase(*maybe_host_id);
+                _hint_directory_manager.remove_mapping(*maybe_host_id);
+            }
        }
    }

    if (eptr) {
-        manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+        manager_logger.error("Exception when draining {}: {}", host_id, eptr);
    }

-    manager_logger.trace("drain_for: finished draining {}", endpoint);
+    manager_logger.trace("drain_for: finished draining {}", host_id);
 }

 void manager::update_backlog(size_t backlog, size_t max_backlog) {
@@ -700,8 +735,6 @@ future<> manager::with_file_update_mutex_for(const std::variant<locator::host_id
    return _ep_managers.at(host_id).with_file_update_mutex(std::move(func));
 }

-// The function assumes that if `_uses_host_id == true`, then there are no directories that represent IP addresses,
-// i.e. every directory is either valid and represents a host ID, or is invalid (so it should be ignored anyway).
 future<> manager::initialize_endpoint_managers() {
    auto maybe_create_ep_mgr = [this] (const locator::host_id& host_id, const gms::inet_address& ip) -> future<> {
        if (!check_dc_for(host_id)) {
@@ -729,16 +762,29 @@ future<> manager::initialize_endpoint_managers() {

        // The directory is invalid, so there's nothing more to do.
        if (!maybe_host_id_or_ep) {
+            manager_logger.warn("Encountered a hint directory of invalid name while initializing endpoint managers: {}. "
+                    "Hints stored in it won't be replayed", de.name);
            co_return;
        }

        if (_uses_host_id) {
+            // If hinted handoff is host-ID-based but the directory doesn't represent a host ID,
+            // it's invalid. Ignore it.
+            if (!maybe_host_id_or_ep->has_host_id()) {
+                co_return;
+            }
+
            // If hinted handoff is host-ID-based, `get_ep_manager` will NOT use the passed IP address,
            // so we simply pass the default value there.
            co_return co_await maybe_create_ep_mgr(maybe_host_id_or_ep->id(), gms::inet_address{});
        }

        // If we have got to this line, hinted handoff is still IP-based and we need to map the IP.
+        if (!maybe_host_id_or_ep->has_endpoint()) {
+            // If the directory name doesn't represent an IP, it's invalid. We ignore it.
+            co_return;
+        }
+
        const auto maybe_host_id = std::invoke([&] () -> std::optional<locator::host_id> {
            try {
                return maybe_host_id_or_ep->resolve_id(*tmptr);
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -317,11 +317,16 @@ public:
    /// In both cases - removes the corresponding hints' directories after all hints have been drained and erases the
    /// corresponding hint_endpoint_manager objects.
    ///
-    /// \param endpoint node that left the cluster
-    future<> drain_for(endpoint_id endpoint) noexcept;
+    /// \param host_id host ID of the node that left the cluster
+    /// \param ip the IP of the node that left the cluster
+    future<> drain_for(endpoint_id host_id, gms::inet_address ip) noexcept;

    void update_backlog(size_t backlog, size_t max_backlog);

+    bool uses_host_id() const noexcept {
+        return _uses_host_id;
+    }
+
 private:
    bool stopping() const noexcept {
        return _state.contains(state::stopping);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -148,10 +148,16 @@ void space_watchdog::on_timer() {
                auto maybe_variant = std::invoke([&] () -> std::optional<std::variant<locator::host_id, gms::inet_address>> {
                    try {
                        const auto hid_or_ep = locator::host_id_or_endpoint{de.name};
-                        if (hid_or_ep.has_host_id()) {
+
+                        // If hinted handoff is host-ID-based, hint directories representing IP addresses must have
+                        // been created by mistake, so they are invalid. Conversely, while hinted handoff is still
+                        // IP-based, hint directories representing host IDs are NOT valid.
+                        if (hid_or_ep.has_host_id() && shard_manager.uses_host_id()) {
                            return std::variant<locator::host_id, gms::inet_address>(hid_or_ep.id());
-                        } else {
+                        } else if (hid_or_ep.has_endpoint() && !shard_manager.uses_host_id()) {
                            return std::variant<locator::host_id, gms::inet_address>(hid_or_ep.endpoint());
+                        } else {
+                            return std::nullopt;
                        }
                    } catch (...) {
                        return std::nullopt;
@@ -173,6 +179,8 @@ void space_watchdog::on_timer() {
                // Case 3: The directory isn't managed by an endpoint manager, and it represents neither an IP address,
                //         nor a host ID.
                else {
+                    // We use trace here to prevent flooding logs with unnecessary information.
+                    resource_manager_logger.trace("Encountered a hint directory of invalid name while scanning: {}", de.name);
                    return scan_one_ep_dir(dir / de.name, shard_manager, {});
                }
            }).get();
--- a/db/hints/sync_point.cc
+++ b/db/hints/sync_point.cc
@@ -26,52 +26,63 @@ namespace hints {
 //
 // Format V1 (encoded in base64):
 //   uint8_t 0x01 - version of format
-//   sync_point_v1 - encoded using IDL
+//   sync_point_v1_or_v2 - encoded using IDL
 //
 // Format V2 (encoded in base64):
 //   uint8_t 0x02 - version of format
-//   sync_point_v1 - encoded using IDL
+//   sync_point_v1_or_v2 - encoded using IDL
 //   uint64_t - checksum computed using the xxHash algorithm
 //
-// sync_point_v1:
+// Format V3 (encoded in base64):
+//   uint8_t 0x03 - version of format
+//   sync_point_v3 - encoded using IDL
+//   uint64_t - checksum computed using the xxHash algorithm
+//
+// sync_point_v1_or_v2:
 //   UUID host_id - ID of the host which created the sync point
 //   uint16_t shard_count - the number of shards in this sync point
-//   per_manager_sync_point_v1 regular_sp - replay positions for regular mutation hint queues
-//   per_manager_sync_point_v1 mv_sp - replay positions for materialized view hint queues
+//   per_manager_sync_point_v1_or_v2 regular_sp - replay positions for regular mutation hint queues
+//   per_manager_sync_point_v1_or_v2 mv_sp - replay positions for materialized view hint queues
 //
-// per_manager_sync_point_v1:
-//   std::vector<gms::inet_address> addresses - addresses for which this sync point defines replay positions
+// per_manager_sync_point_v1_or_v2:
+//   std::vector<gms::inet_address> endpoints - addresses for which this sync point defines replay positions
 //   std::vector<db::replay_position> flattened_rps:
-//       A flattened collection of replay positions for all addresses and shards.
+//       A flattened collection of replay positions for all endpoints and shards.
 //       Replay positions are grouped by address, in the same order as in
-//       the `addresses` field, and there is one replay position for each of
+//       the `endpoints` field, and there is one replay position for each of
 //       the shards (shard count is defined by the `shard_count`) field.
 //       Flattened representation was chosen in order to save space on
 //       vector lengths etc.
+//
+// sync_point_v3:
+//   Same layout as sync_point_v1_or_v2, except that regular_sp and mv_sp are of type
+//   per_manager_sync_point_v3, whose `endpoints` field holds locator::host_id values
+//   instead of gms::inet_address values.

 static constexpr size_t version_size = sizeof(uint8_t);
 static constexpr size_t checksum_size = sizeof(uint64_t);

-static std::vector<sync_point::shard_rps> decode_one_type_v1(uint16_t shard_count, const per_manager_sync_point_v1& v1) {
+template <typename PerManagerType>
+static std::vector<sync_point::shard_rps> decode_one_type(uint16_t shard_count, const PerManagerType& v) {
    std::vector<sync_point::shard_rps> ret;

-    if (size_t(shard_count) * v1.addresses.size() != v1.flattened_rps.size()) {
+    if (size_t(shard_count) * v.endpoints.size() != v.flattened_rps.size()) {
        throw std::runtime_error(format("Could not decode the sync point - there should be {} rps in flattened_rps, but there are only {}",
-                size_t(shard_count) * v1.addresses.size(), v1.flattened_rps.size()));
+                size_t(shard_count) * v.endpoints.size(), v.flattened_rps.size()));
    }

    ret.resize(std::max(unsigned(shard_count), smp::count));

-    auto rps_it = v1.flattened_rps.begin();
-    for (const auto addr : v1.addresses) {
+    auto rps_it = v.flattened_rps.begin();
+    for (const auto ep : v.endpoints) {
        uint16_t shard;
        for (shard = 0; shard < shard_count; shard++) {
-            ret[shard].emplace(addr, *rps_it++);
+            ret[shard].emplace(ep, *rps_it++);
        }
        // Fill missing shards with zero replay positions so that segments
        // which were moved across shards will be correctly waited on
        for (; shard < smp::count; shard++) {
-            ret[shard].emplace(addr, db::replay_position());
+            ret[shard].emplace(ep, db::replay_position());
        }
    }

@@ -94,50 +105,62 @@ sync_point sync_point::decode(sstring_view s) {
    seastar::simple_memory_input_stream in{raw_s.data(), raw_s.size()};

    uint8_t version = ser::serializer<uint8_t>::read(in);
-    if (version == 2) {
+    if (version == 2 || version == 3) {
        if (raw_s.size() < version_size + checksum_size) {
-            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - serialized blob is too short");
+            throw std::runtime_error("Could not decode the sync point encoded in the V2/V3 format - serialized blob is too short");
        }

        seastar::simple_memory_input_stream in_checksum{raw_s.end() - checksum_size, checksum_size};
        uint64_t checksum = ser::serializer<uint64_t>::read(in_checksum);
        if (checksum != calculate_checksum(raw_s.substr(0, raw_s.size() - checksum_size))) {
-            throw std::runtime_error("Could not decode the sync point encoded in the V2 format - wrong checksum");
+            throw std::runtime_error("Could not decode the sync point encoded in the V2/V3 format - wrong checksum");
        }
    }
    else if (version != 1) {
        throw std::runtime_error(format("Unsupported sync point format version: {}", int(version)));
    }

-    sync_point_v1 v1 = ser::serializer<sync_point_v1>::read(in);
+    if (version == 1 || version == 2) {
+        sync_point_v1_or_v2 v = ser::serializer<sync_point_v1_or_v2>::read(in);
+
+        return sync_point{
+            v.host_id,
+            decode_one_type(v.shard_count, v.regular_sp),
+            decode_one_type(v.shard_count, v.mv_sp),
+        };
+    }
+
+    // version == 3
+    sync_point_v3 v3 = ser::serializer<sync_point_v3>::read(in);

    return sync_point{
-        v1.host_id,
-        decode_one_type_v1(v1.shard_count, v1.regular_sp),
-        decode_one_type_v1(v1.shard_count, v1.mv_sp),
+        v3.host_id,
+        decode_one_type(v3.shard_count, v3.regular_sp),
+        decode_one_type(v3.shard_count, v3.mv_sp),
    };
 }

-static per_manager_sync_point_v1 encode_one_type_v1(unsigned shards, const std::vector<sync_point::shard_rps>& rps) {
-    per_manager_sync_point_v1 ret;
+static per_manager_sync_point_v3 encode_one_type_v3(unsigned shards, const std::vector<sync_point::shard_rps>& rps) {
+    per_manager_sync_point_v3 ret;

-    // Gather all addresses, from all shards
-    std::unordered_set<gms::inet_address> all_addrs;
+    // Gather all endpoints, from all shards
+    std::unordered_set<locator::host_id> all_eps;
    for (const auto& shard_rps : rps) {
        for (const auto& p : shard_rps) {
-            all_addrs.insert(p.first);
+            // New sync points are created with host_id only
+            all_eps.insert(std::get<locator::host_id>(p.first));
        }
    }

-    ret.flattened_rps.reserve(size_t(shards) * all_addrs.size());
+    ret.flattened_rps.reserve(size_t(shards) * all_eps.size());

-    // Encode into v1 struct
-    // For each address, we encode a replay position for all shards.
+    // Encode into v3 struct
+    // For each endpoint, we encode a replay position for all shards.
    // If there is no replay position for a shard, we use a zero replay position.
-    for (const auto addr : all_addrs) {
-        ret.addresses.push_back(addr);
+    for (const auto ep : all_eps) {
+        ret.endpoints.push_back(ep);
        for (const auto& shard_rps : rps) {
-            auto it = shard_rps.find(addr);
+            auto it = shard_rps.find(ep);
            if (it != shard_rps.end()) {
                ret.flattened_rps.push_back(it->second);
            } else {
@@ -154,24 +177,24 @@ static per_manager_sync_point_v1 encode_one_type_v1(unsigned shards, const std::
 }

 sstring sync_point::encode() const {
-    // Encode as v1 structure
-    sync_point_v1 v1;
-    v1.host_id = this->host_id;
-    v1.shard_count = std::max(this->regular_per_shard_rps.size(), this->mv_per_shard_rps.size());
-    v1.regular_sp = encode_one_type_v1(v1.shard_count, this->regular_per_shard_rps);
-    v1.mv_sp = encode_one_type_v1(v1.shard_count, this->mv_per_shard_rps);
+    // Encode as v3 structure
+    sync_point_v3 v3;
+    v3.host_id = this->host_id;
+    v3.shard_count = std::max(this->regular_per_shard_rps.size(), this->mv_per_shard_rps.size());
+    v3.regular_sp = encode_one_type_v3(v3.shard_count, this->regular_per_shard_rps);
+    v3.mv_sp = encode_one_type_v3(v3.shard_count, this->mv_per_shard_rps);

    // Measure how much space we need
    seastar::measuring_output_stream measure;
-    ser::serializer<sync_point_v1>::write(measure, v1);
+    ser::serializer<sync_point_v3>::write(measure, v3);

    // Reserve version_size bytes for the version and checksum_size bytes for the checksum
    bytes serialized{bytes::initialized_later{}, version_size + measure.size() + checksum_size};

-    // Encode using V2 format
+    // Encode using V3 format
    seastar::simple_memory_output_stream out{reinterpret_cast<char*>(serialized.data()), serialized.size()};
-    ser::serializer<uint8_t>::write(out, 2);
-    ser::serializer<sync_point_v1>::write(out, v1);
+    ser::serializer<uint8_t>::write(out, 3);
+    ser::serializer<sync_point_v3>::write(out, v3);
    sstring_view serialized_s(reinterpret_cast<const char*>(serialized.data()), version_size + measure.size());
    uint64_t checksum = calculate_checksum(serialized_s);
    ser::serializer<uint64_t>::write(out, checksum);
--- a/db/hints/sync_point.hh
+++ b/db/hints/sync_point.hh
@@ -22,7 +22,8 @@ namespace hints {
 // A sync point is a collection of positions in hint queues which can be waited on.
 // The sync point encompasses one type of hints manager only.
 struct sync_point {
-    using shard_rps = std::unordered_map<gms::inet_address, db::replay_position>;
+    using host_id_or_addr = std::variant<locator::host_id, gms::inet_address>;
+    using shard_rps = std::unordered_map<host_id_or_addr, db::replay_position>;
    // ID of the host which created this sync point
    locator::host_id host_id;
    std::vector<shard_rps> regular_per_shard_rps;
@@ -40,21 +41,41 @@ struct sync_point {
 // IDL type
 // Contains per-endpoint and per-shard information about replay positions
 // for a particular type of hint queues (regular mutation hints or MV update hints)
-struct per_manager_sync_point_v1 {
-    std::vector<gms::inet_address> addresses;
+struct per_manager_sync_point_v1_or_v2 {
+    std::vector<gms::inet_address> endpoints;
    std::vector<db::replay_position> flattened_rps;
 };

 // IDL type
-struct sync_point_v1 {
+struct sync_point_v1_or_v2 {
    locator::host_id host_id;
    uint16_t shard_count;

    // Sync point information for regular mutation hints
-    db::hints::per_manager_sync_point_v1 regular_sp;
+    db::hints::per_manager_sync_point_v1_or_v2 regular_sp;

    // Sync point information for materialized view hints
-    db::hints::per_manager_sync_point_v1 mv_sp;
+    db::hints::per_manager_sync_point_v1_or_v2 mv_sp;
+};
+
+// IDL type
+// Same as per_manager_sync_point_v1_or_v2, except that the endpoints are
+// stored as locator::host_id instead of gms::inet_address.
+struct per_manager_sync_point_v3 {
+    std::vector<locator::host_id> endpoints;
+    std::vector<db::replay_position> flattened_rps;
+};
+
+// IDL type
+struct sync_point_v3 {
+    locator::host_id host_id;
+    uint16_t shard_count;
+
+    // Sync point information for regular mutation hints
+    db::hints::per_manager_sync_point_v3 regular_sp;
+
+    // Sync point information for materialized view hints
+    db::hints::per_manager_sync_point_v3 mv_sp;
 };

 }
--- a/db/paxos_grace_seconds_extension.hh
+++ b/db/paxos_grace_seconds_extension.hh
@@ -55,6 +55,10 @@ public:
        return ser::serialize_to_buffer<bytes>(_paxos_gc_sec);
    }

+    std::string options_to_string() const override {
+        return std::to_string(_paxos_gc_sec);
+    }
+
    static int32_t deserialize(const bytes_view& buffer) {
        return ser::deserialize_from_buffer(buffer, boost::type<int32_t>());
    }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -14,7 +14,6 @@
 #include "gms/feature_service.hh"
 #include "partition_slice_builder.hh"
 #include "dht/i_partitioner.hh"
-#include "system_auth_keyspace.hh"
 #include "system_keyspace.hh"
 #include "query-result-set.hh"
 #include "query-result-writer.hh"
@@ -235,7 +234,6 @@ future<> save_system_schema(cql3::query_processor& qp) {
    co_await save_system_schema_to_keyspace(qp, schema_tables::NAME);
    // #2514 - make sure "system" is written to system_schema.keyspaces.
    co_await save_system_schema_to_keyspace(qp, system_keyspace::NAME);
-    co_await save_system_schema_to_keyspace(qp, system_auth_keyspace::NAME);
 }

 namespace v3 {
@@ -793,40 +791,35 @@ redact_columns_for_missing_features(mutation&& m, schema_features features) {
 */
 future<table_schema_version> calculate_schema_digest(distributed<service::storage_proxy>& proxy, schema_features features, noncopyable_function<bool(std::string_view)> accept_keyspace)
 {
-    auto map = [&proxy, features, accept_keyspace = std::move(accept_keyspace)] (sstring table) mutable -> future<std::vector<mutation>> {
+    using mutations_generator = coroutine::experimental::generator<mutation>;
+
+    auto map = [&proxy, features, accept_keyspace = std::move(accept_keyspace)] (sstring table) mutable -> mutations_generator {
        auto& db = proxy.local().get_db();
        auto rs = co_await db::system_keyspace::query_mutations(db, NAME, table);
        auto s = db.local().find_schema(NAME, table);
-        std::vector<mutation> mutations;
        for (auto&& p : rs->partitions()) {
-            auto mut = co_await unfreeze_gently(p.mut(), s);
-            auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
+            auto partition_key = value_cast<sstring>(utf8_type->deserialize(::partition_key(p.mut().key()).get_component(*s, 0)));
            if (!accept_keyspace(partition_key)) {
                continue;
            }
-            mut = redact_columns_for_missing_features(std::move(mut), features);
-            mutations.emplace_back(std::move(mut));
-        }
-        co_return mutations;
-    };
-    auto reduce = [features] (auto& hash, auto&& mutations) {
-        for (const mutation& m : mutations) {
-            feed_hash_for_schema_digest(hash, m, features);
+            auto mut = co_await unfreeze_gently(p.mut(), s);
+            co_yield redact_columns_for_missing_features(std::move(mut), features);
        }
    };
    auto hash = md5_hasher();
    auto tables = all_table_names(features);
    {
        for (auto& table: tables) {
-            auto mutations = co_await map(table);
-            if (diff_logger.is_enabled(logging::log_level::trace)) {
-                for (const mutation& m : mutations) {
+            auto gen_mutations = map(table);
+            while (auto mut_opt = co_await gen_mutations()) {
+                auto& m = *mut_opt;
+                feed_hash_for_schema_digest(hash, m, features);
+                if (diff_logger.is_enabled(logging::log_level::trace)) {
                    md5_hasher h;
                    feed_hash_for_schema_digest(h, m, features);
                    diff_logger.trace("Digest {} for {}, compacted={}", h.finalize(), m, compact_for_schema_digest(m));
                }
            }
-            reduce(hash, mutations);
        }
        co_return utils::UUID_gen::get_name_UUID(hash.finalize());
    }
@@ -1296,7 +1289,6 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, shar
    schema_ptr s = keyspaces();
    // compare before/after schemas of the affected keyspaces only
    std::set<sstring> keyspaces;
-    std::set<table_id> column_families;
    std::unordered_map<keyspace_name, table_selector> affected_tables;
    bool has_tablet_mutations = false;
    for (auto&& mutation : mutations) {
@@ -1311,7 +1303,6 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, shar
        }

        keyspaces.emplace(std::move(keyspace_name));
-        column_families.emplace(mutation.column_family_id());
        // We must force recalculation of schema version after the merge, since the resulting
        // schema may be a mix of the old and new schemas, with the exception of entries
        // that originate from group 0.
@@ -2021,7 +2012,9 @@ static shared_ptr<cql3::functions::user_aggregate> create_aggregate(replica::dat

    bytes_opt initcond = std::nullopt;
    if (initcond_str) {
-        auto expr = cql3::util::do_with_parser(*initcond_str, std::mem_fn(&cql3_parser::CqlParser::term));
+        // In general using the default dialect is wrong, but here the database is communicating with itself,
+        // not the user, so any dialect should work.
+        auto expr = cql3::util::do_with_parser(*initcond_str, cql3::dialect{}, std::mem_fn(&cql3_parser::CqlParser::term));
        auto dummy_ident = ::make_shared<cql3::column_identifier>("", true);
        auto column_spec = make_lw_shared<cql3::column_specification>("", "", dummy_ident, state_type);
        auto raw = cql3::expr::evaluate(prepare_expression(expr, db.as_data_dictionary(), "", nullptr, {column_spec}), cql3::query_options::DEFAULT);
--- a/db/system_auth_keyspace.cc
+++ b/db/system_auth_keyspace.cc
@@ -1,141 +0,0 @@
-/*
- * Modified by ScyllaDB
- * Copyright (C) 2024-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
- */
-
-#include "system_auth_keyspace.hh"
-#include "system_keyspace.hh"
-#include "db/schema_tables.hh"
-#include "schema/schema_builder.hh"
-#include "types/set.hh"
-
-namespace db {
-
-// all system auth tables use schema commitlog
-namespace {
-    const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
-        if (ks_name == system_auth_keyspace::NAME) {
-            props.enable_schema_commitlog();
-        }
-    });
-} // anonymous namespace
-
-namespace system_auth_keyspace {
-
-// use the same gc setting as system_schema tables
-using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
-// FIXME: in some cases time-based gc may cause data resurrection,
-// for more info see https://github.com/scylladb/scylladb/issues/15607
-static constexpr auto auth_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
-
-schema_ptr roles() {
-    static thread_local auto schema = [] {
-        schema_builder builder(generate_legacy_id(NAME, ROLES), NAME, ROLES,
-        // partition key
-        {{"role", utf8_type}},
-        // clustering key
-        {},
-        // regular columns
-        {
-            {"can_login", boolean_type},
-            {"is_superuser", boolean_type},
-            {"member_of", set_type_impl::get_instance(utf8_type, true)},
-            {"salted_hash", utf8_type}
-        },
-        // static columns
-        {},
-        // regular column name type
-        utf8_type,
-        // comment
-        "roles for authentication and RBAC"
-        );
-        builder.set_gc_grace_seconds(auth_gc_grace);
-        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
-        return builder.build();
-    }();
-    return schema;
-}
-
-schema_ptr role_members() {
-    static thread_local auto schema = [] {
-        schema_builder builder(generate_legacy_id(NAME, ROLE_MEMBERS), NAME, ROLE_MEMBERS,
-        // partition key
-        {{"role", utf8_type}},
-        // clustering key
-        {{"member", utf8_type}},
-        // regular columns
-        {},
-        // static columns
-        {},
-        // regular column name type
-        utf8_type,
-        // comment
-        "joins users and their granted roles in RBAC"
-        );
-        builder.set_gc_grace_seconds(auth_gc_grace);
-        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
-        return builder.build();
-    }();
-    return schema;
-}
-
-schema_ptr role_attributes() {
-    static thread_local auto schema = [] {
-        schema_builder builder(generate_legacy_id(NAME, ROLE_ATTRIBUTES), NAME, ROLE_ATTRIBUTES,
-        // partition key
-        {{"role", utf8_type}},
-        // clustering key
-        {{"name", utf8_type}},
-        // regular columns
-        {
-            {"value", utf8_type}
-        },
-        // static columns
-        {},
-        // regular column name type
-        utf8_type,
-        // comment
-        "role permissions in RBAC"
-        );
-        builder.set_gc_grace_seconds(auth_gc_grace);
-        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
-        return builder.build();
-    }();
-    return schema;
-}
-
-schema_ptr role_permissions() {
-    static thread_local auto schema = [] {
-        schema_builder builder(generate_legacy_id(NAME, ROLE_PERMISSIONS), NAME, ROLE_PERMISSIONS,
-        // partition key
-        {{"role", utf8_type}},
-        // clustering key
-        {{"resource", utf8_type}},
-        // regular columns
-        {
-            {"permissions", set_type_impl::get_instance(utf8_type, true)}
-        },
-        // static columns
-        {},
-        // regular column name type
-        utf8_type,
-        // comment
-        "role permissions for CassandraAuthorizer"
-        );
-        builder.set_gc_grace_seconds(auth_gc_grace);
-        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
-        return builder.build();
-    }();
-    return schema;
-}
-
-std::vector<schema_ptr> all_tables() {
-    return {roles(), role_members(), role_attributes(), role_permissions()};
-}
-
-} // namespace system_auth_keyspace
-} // namespace db
--- a/db/system_auth_keyspace.hh
+++ b/db/system_auth_keyspace.hh
@@ -1,38 +0,0 @@
-/*
- * Modified by ScyllaDB
- * Copyright (C) 2024-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
- */
-
-#pragma once
-
-#include "schema/schema_fwd.hh"
-#include <vector>
-
-namespace db {
-
-namespace system_auth_keyspace {
-    enum class version_t: int64_t {
-        v1 = 1,
-        v2 = 2,
-    };
-    static constexpr auto NAME = "system_auth_v2";
-    // tables
-    static constexpr auto ROLES = "roles";
-    static constexpr auto ROLE_MEMBERS = "role_members";
-    static constexpr auto ROLE_ATTRIBUTES = "role_attributes";
-    static constexpr auto ROLE_PERMISSIONS = "role_permissions";
-
-
-    schema_ptr roles();
-    schema_ptr role_members();
-    schema_ptr role_attributes();
-    schema_ptr role_permissions();
-
-    std::vector<schema_ptr> all_tables();
-}; // namespace system_auth_keyspace
-
-} // namespace db
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -18,7 +18,6 @@
 #include <seastar/core/on_internal_error.hh>
 #include "system_keyspace.hh"
 #include "cql3/untyped_result_set.hh"
-#include "db/system_auth_keyspace.hh"
 #include "thrift/server.hh"
 #include "cql3/query_processor.hh"
 #include "partition_slice_builder.hh"
@@ -88,6 +87,10 @@ namespace {
            system_keyspace::SCYLLA_LOCAL,
            system_keyspace::COMMITLOG_CLEANUPS,
            system_keyspace::SERVICE_LEVELS_V2,
+            system_keyspace::ROLES,
+            system_keyspace::ROLE_MEMBERS,
+            system_keyspace::ROLE_ATTRIBUTES,
+            system_keyspace::ROLE_PERMISSIONS,
            system_keyspace::v3::CDC_LOCAL
        };
        if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
@@ -233,12 +236,15 @@ schema_ptr system_keyspace::topology() {
            .with_column("request_id", timeuuid_type)
            .with_column("ignore_nodes", set_type_impl::get_instance(uuid_type, true), column_kind::static_column)
            .with_column("new_cdc_generation_data_uuid", timeuuid_type, column_kind::static_column)
+            .with_column("new_keyspace_rf_change_ks_name", utf8_type, column_kind::static_column)
+            .with_column("new_keyspace_rf_change_data", map_type_impl::get_instance(utf8_type, utf8_type, false), column_kind::static_column)
            .with_column("version", long_type, column_kind::static_column)
            .with_column("fence_version", long_type, column_kind::static_column)
            .with_column("transition_state", utf8_type, column_kind::static_column)
            .with_column("committed_cdc_generations", set_type_impl::get_instance(cdc_generation_ts_id_type, true), column_kind::static_column)
            .with_column("unpublished_cdc_generations", set_type_impl::get_instance(cdc_generation_ts_id_type, true), column_kind::static_column)
            .with_column("global_topology_request", utf8_type, column_kind::static_column)
+            .with_column("global_topology_request_id", timeuuid_type, column_kind::static_column)
            .with_column("enabled_features", set_type_impl::get_instance(utf8_type, true), column_kind::static_column)
            .with_column("session", uuid_type, column_kind::static_column)
            .with_column("tablet_balancing_enabled", boolean_type, column_kind::static_column)
@@ -1139,6 +1145,103 @@ schema_ptr system_keyspace::service_levels_v2() {
    return schema;
 }

+schema_ptr system_keyspace::roles() {
+    static thread_local auto schema = [] {
+        schema_builder builder(generate_legacy_id(NAME, ROLES), NAME, ROLES,
+        // partition key
+        {{"role", utf8_type}},
+        // clustering key
+        {},
+        // regular columns
+        {
+            {"can_login", boolean_type},
+            {"is_superuser", boolean_type},
+            {"member_of", set_type_impl::get_instance(utf8_type, true)},
+            {"salted_hash", utf8_type}
+        },
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        "roles for authentication and RBAC"
+        );
+        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
+        return builder.build();
+    }();
+    return schema;
+}
+
+schema_ptr system_keyspace::role_members() {
+    static thread_local auto schema = [] {
+        schema_builder builder(generate_legacy_id(NAME, ROLE_MEMBERS), NAME, ROLE_MEMBERS,
+        // partition key
+        {{"role", utf8_type}},
+        // clustering key
+        {{"member", utf8_type}},
+        // regular columns
+        {},
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        "joins users and their granted roles in RBAC"
+        );
+        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
+        return builder.build();
+    }();
+    return schema;
+}
+
+schema_ptr system_keyspace::role_attributes() {
+    static thread_local auto schema = [] {
+        schema_builder builder(generate_legacy_id(NAME, ROLE_ATTRIBUTES), NAME, ROLE_ATTRIBUTES,
+        // partition key
+        {{"role", utf8_type}},
+        // clustering key
+        {{"name", utf8_type}},
+        // regular columns
+        {
+            {"value", utf8_type}
+        },
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        "role permissions in RBAC"
+        );
+        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
+        return builder.build();
+    }();
+    return schema;
+}
+
+schema_ptr system_keyspace::role_permissions() {
+    static thread_local auto schema = [] {
+        schema_builder builder(generate_legacy_id(NAME, ROLE_PERMISSIONS), NAME, ROLE_PERMISSIONS,
+        // partition key
+        {{"role", utf8_type}},
+        // clustering key
+        {{"resource", utf8_type}},
+        // regular columns
+        {
+            {"permissions", set_type_impl::get_instance(utf8_type, true)}
+        },
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        "role permissions for CassandraAuthorizer"
+        );
+        builder.with_version(system_keyspace::generate_schema_version(builder.uuid()));
+        return builder.build();
+    }();
+    return schema;
+}
+
 schema_ptr system_keyspace::legacy::hints() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, HINTS), NAME, HINTS,
@@ -2130,10 +2233,16 @@ future<> system_keyspace::set_bootstrap_state(bootstrap_state state) {
    });
 }

+// Returns the schemas of the auth tables hosted in this keyspace, in the
+// order: roles, role_members, role_attributes, role_permissions.
+std::vector<schema_ptr> system_keyspace::auth_tables() {
+    return {roles(), role_members(), role_attributes(), role_permissions()};
+}
+
 std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
    std::vector<schema_ptr> r;
    auto schema_tables = db::schema_tables::all_tables(schema_features::full());
    std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
+    auto auth_tables = system_keyspace::auth_tables();
+    std::copy(auth_tables.begin(), auth_tables.end(), std::back_inserter(r));
    r.insert(r.end(), { built_indexes(), hints(), batchlog(), paxos(), local(),
                    peers(), peer_events(), range_xfers(),
                    compactions_in_progress(), compaction_history(),
@@ -2149,14 +2258,11 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
                    topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(),
    });

-    auto auth_tables = db::system_auth_keyspace::all_tables();
-    std::copy(auth_tables.begin(), auth_tables.end(), std::back_inserter(r));
-
    if (cfg.check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
        r.insert(r.end(), {broadcast_kv_store()});
    }

-    if (cfg.check_experimental(db::experimental_features_t::feature::TABLETS)) {
+    if (cfg.enable_tablets()) {
        r.insert(r.end(), {tablets()});
    }

@@ -2691,17 +2797,17 @@ future<std::optional<mutation>> system_keyspace::get_group0_schema_version() {

 static constexpr auto AUTH_VERSION_KEY = "auth_version";

-future<system_auth_keyspace::version_t> system_keyspace::get_auth_version() {
+future<system_keyspace::auth_version_t> system_keyspace::get_auth_version() {
    auto str_opt = co_await get_scylla_local_param(AUTH_VERSION_KEY);
    if (!str_opt) {
-        co_return db::system_auth_keyspace::version_t::v1;
+        co_return auth_version_t::v1;
    }
    auto& str = *str_opt;
    if (str == "" || str == "1") {
-        co_return db::system_auth_keyspace::version_t::v1;
+        co_return auth_version_t::v1;
    }
    if (str == "2") {
-        co_return db::system_auth_keyspace::version_t::v2;
+        co_return auth_version_t::v2;
    }
    on_internal_error(slogger, fmt::format("unexpected auth_version in scylla_local got {}", str));
 }
@@ -2719,7 +2825,7 @@ static service::query_state& internal_system_query_state() {
    return qs;
 };

-future<mutation> system_keyspace::make_auth_version_mutation(api::timestamp_type ts, db::system_auth_keyspace::version_t version) {
+future<mutation> system_keyspace::make_auth_version_mutation(api::timestamp_type ts, db::system_keyspace::auth_version_t version) {
    static sstring query = format("INSERT INTO {}.{} (key, value) VALUES (?, ?);", db::system_keyspace::NAME, db::system_keyspace::SCYLLA_LOCAL);
    auto muts = co_await _qp.get_mutations_internal(query, internal_system_query_state(), ts, {AUTH_VERSION_KEY, std::to_string(int64_t(version))});
    if (muts.size() != 1) {
@@ -2967,6 +3073,11 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            ret.committed_cdc_generations = decode_cdc_generations_ids(deserialize_set_column(*topology(), some_row, "committed_cdc_generations"));
        }

+        if (some_row.has("new_keyspace_rf_change_data")) {
+            ret.new_keyspace_rf_change_ks_name = some_row.get_as<sstring>("new_keyspace_rf_change_ks_name");
+            ret.new_keyspace_rf_change_data = some_row.get_map<sstring,sstring>("new_keyspace_rf_change_data");
+        }
+
        if (!ret.committed_cdc_generations.empty()) {
            // Sanity check for CDC generation data consistency.
            auto gen_id = ret.committed_cdc_generations.back();
@@ -2998,6 +3109,10 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            ret.global_request.emplace(req);
        }

+        if (some_row.has("global_topology_request_id")) {
+            ret.global_request_id = some_row.get_as<utils::UUID>("global_topology_request_id");
+        }
+
        if (some_row.has("enabled_features")) {
            ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
        }
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -14,7 +14,6 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "db/system_auth_keyspace.hh"
 #include "gms/gossiper.hh"
 #include "schema/schema_fwd.hh"
 #include "utils/UUID.hh"
@@ -180,6 +179,12 @@ public:
    static constexpr auto TABLETS = "tablets";
    static constexpr auto SERVICE_LEVELS_V2 = "service_levels_v2";

+    // auth
+    static constexpr auto ROLES = "roles";
+    static constexpr auto ROLE_MEMBERS = "role_members";
+    static constexpr auto ROLE_ATTRIBUTES = "role_attributes";
+    static constexpr auto ROLE_PERMISSIONS = "role_permissions";
+
    struct v3 {
        static constexpr auto BATCHES = "batches";
        static constexpr auto PAXOS = "paxos";
@@ -267,6 +272,12 @@ public:
    static schema_ptr tablets();
    static schema_ptr service_levels_v2();

+    // auth
+    static schema_ptr roles();
+    static schema_ptr role_members();
+    static schema_ptr role_attributes();
+    static schema_ptr role_permissions();
+
    static table_schema_version generate_schema_version(table_id table_id, uint16_t offset = 0);

    future<> build_bootstrap_info();
@@ -310,7 +321,9 @@ public:
    template <typename T>
    future<std::optional<T>> get_scylla_local_param_as(const sstring& key);

+    static std::vector<schema_ptr> auth_tables();
    static std::vector<schema_ptr> all_tables(const db::config& cfg);
+
    future<> make(
            locator::effective_replication_map_factory&,
            replica::database&);
@@ -577,11 +590,16 @@ public:
    // returns the corresponding mutation. Otherwise returns nullopt.
    future<std::optional<mutation>> get_group0_schema_version();

+    enum class auth_version_t: int64_t {
+        v1 = 1,
+        v2 = 2,
+    };
+
    // If the `auth_version` key in `system.scylla_local` is present (either live or tombstone),
    // returns the corresponding mutation. Otherwise returns nullopt.
    future<std::optional<mutation>> get_auth_version_mutation();
-    future<mutation> make_auth_version_mutation(api::timestamp_type ts, db::system_auth_keyspace::version_t version);
-    future<system_auth_keyspace::version_t> get_auth_version();
+    future<mutation> make_auth_version_mutation(api::timestamp_type ts, auth_version_t version);
+    future<auth_version_t> get_auth_version();

    future<> sstables_registry_create_entry(sstring location, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc);
    future<> sstables_registry_update_entry_status(sstring location, sstables::generation_type gen, sstring status);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -1576,6 +1576,18 @@ bool needs_static_row(const mutation_partition& mp, const std::vector<view_and_b
    return mp.partition_tombstone() || !mp.static_row().empty();
 }

+// Returns true when this host is one of the replicas returned by
+// ermp->get_replicas(token) and this shard is the one reported by
+// ermp->shard_for_reads() for the base table and token.
+bool should_generate_view_updates_on_this_shard(const schema_ptr& base, const locator::effective_replication_map_ptr& ermp, dht::token token) {
+    // Based on the computation in get_view_natural_endpoint, this is used
+    // to detect beforehand the case that we're a "normal" replica, i.e. one
+    // which is paired with a view replica and sends view updates to it.
+    // For a pending replica, for example, this will return false.
+    // Also, for the case of intra-node migration, we check that this shard is ready for reads.
+    const auto my_host_id = ermp->get_token_metadata_ptr()->get_topology().my_host_id();
+    const auto replicas = ermp->get_replicas(token);
+    // Both conditions must hold: host membership and shard ownership for reads.
+    return std::find(replicas.begin(), replicas.end(), my_host_id) != replicas.end()
+        && ermp->shard_for_reads(*base, token) == this_shard_id();
+}
+
 // Calculate the node ("natural endpoint") to which this node should send
 // a view update.
 //
@@ -1611,7 +1623,8 @@ get_view_natural_endpoint(
        bool network_topology,
        const dht::token& base_token,
        const dht::token& view_token,
-        bool use_legacy_self_pairing) {
+        bool use_legacy_self_pairing,
+        replica::cf_stats& cf_stats) {
    auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
    auto me = topology.my_host_id();
    auto my_datacenter = topology.get_datacenter();
@@ -1625,25 +1638,26 @@ get_view_natural_endpoint(
        }
    }

+    auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
    for (auto&& view_endpoint : view_erm->get_replicas(view_token)) {
        if (use_legacy_self_pairing) {
+            auto it = std::find(base_endpoints.begin(), base_endpoints.end(),
+                view_endpoint);
            // If this base replica is also one of the view replicas, we use
            // ourselves as the view replica.
-            if (view_endpoint == me) {
+            if (view_endpoint == me && it != base_endpoints.end()) {
                return topology.my_address();
            }
            // We have to remove any endpoint which is shared between the base
            // and the view, as it will select itself and throw off the counts
            // otherwise.
-            auto it = std::find(base_endpoints.begin(), base_endpoints.end(),
-                view_endpoint);
            if (it != base_endpoints.end()) {
                base_endpoints.erase(it);
-            } else if (!network_topology || topology.get_datacenter(view_endpoint) == my_datacenter) {
+            } else if (!network_topology || view_topology.get_datacenter(view_endpoint) == my_datacenter) {
                view_endpoints.push_back(view_endpoint);
            }
        } else {
-            if (!network_topology || topology.get_datacenter(view_endpoint) == my_datacenter) {
+            if (!network_topology || view_topology.get_datacenter(view_endpoint) == my_datacenter) {
                view_endpoints.push_back(view_endpoint);
            }
        }
@@ -1654,11 +1668,27 @@ get_view_natural_endpoint(
    if (base_it == base_endpoints.end()) {
        // This node is not a base replica of this key, so we return empty
        // FIXME: This case shouldn't happen, and if it happens, a view update
-        // would be lost. We should reported or count this case.
+        // would be lost.
+        ++cf_stats.total_view_updates_on_wrong_node;
        return {};
    }
    auto replica = view_endpoints[base_it - base_endpoints.begin()];
-    return topology.get_node(replica).endpoint();
+
+    // https://github.com/scylladb/scylladb/issues/19439
+    // With tablets, a node being replaced might transition to "left" state
+    // but still be kept as a replica. In such case, the IP of the replaced
+    // node will be lost and `endpoint()` will return an empty IP here.
+    // As of writing this, storage proxy was not migrated to host IDs yet
+    // (#6403) and hints are not prepared to handle nodes that are left
+    // but are still replicas. Therefore, there is no other sensible option
+    // right now but to give up attempt to send the update or write a hint
+    // to the paired, permanently down replica.
+    const auto ep = view_topology.get_node(replica).endpoint();
+    if (ep != gms::inet_address{}) {
+        return ep;
+    } else {
+        return std::nullopt;
+    }
 }

 static future<> apply_to_remote_endpoints(service::storage_proxy& proxy, locator::effective_replication_map_ptr ermp,
@@ -1715,6 +1745,7 @@ future<> view_update_generator::mutate_MV(
 {
    auto base_ermp = base->table().get_effective_replication_map();
    static constexpr size_t max_concurrent_updates = 128;
+    co_await utils::get_local_injector().inject("delay_before_get_view_natural_endpoint", 8000ms);
    co_await max_concurrent_for_each(view_updates, max_concurrent_updates,
            [this, base_token, &stats, &cf_stats, tr_state, &pending_view_updates, allow_hints, wait_for_all, base_ermp] (frozen_mutation_and_schema mut) mutable -> future<> {
        auto view_token = dht::get_token(*mut.s, mut.fm.key());
@@ -1727,7 +1758,7 @@ future<> view_update_generator::mutate_MV(
        // TODO: Maybe allow users to set use_legacy_self_pairing explicitly
        // on a view, like we have the synchronous_updates_flag.
        bool use_legacy_self_pairing = !ks.uses_tablets();
-        auto target_endpoint = get_view_natural_endpoint(base_ermp, view_ermp, network_topology, base_token, view_token, use_legacy_self_pairing);
+        auto target_endpoint = get_view_natural_endpoint(base_ermp, view_ermp, network_topology, base_token, view_token, use_legacy_self_pairing, cf_stats);
        auto remote_endpoints = view_ermp->get_pending_endpoints(view_token);
        auto sem_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_updates.split(memory_usage_of(mut)));

@@ -2650,7 +2681,7 @@ future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_
    });
 }

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, locator::token_metadata_ptr tmptr, const replica::table& t,
        streaming::stream_reason reason) {
    if (is_internal_keyspace(t.schema()->ks_name())) {
        return make_ready_future<bool>(false);
@@ -2658,9 +2689,9 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
    if (reason == streaming::stream_reason::repair && !t.views().empty()) {
        return make_ready_future<bool>(true);
    }
-    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
+    return do_with(t.views(), [&sys_dist_ks, tmptr = std::move(tmptr)] (auto& views) {
        return map_reduce(views,
-                [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
+                [&sys_dist_ks, tmptr] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, *tmptr, view->ks_name(), view->cf_name()); },
                false,
                std::logical_or<bool>());
    });
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -14,6 +14,7 @@
 #include "readers/flat_mutation_reader_v2.hh"
 #include "mutation/frozen_mutation.hh"
 #include "data_dictionary/data_dictionary.hh"
+#include "locator/abstract_replication_strategy.hh"

 class frozen_mutation_and_schema;

@@ -315,6 +316,10 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(

 bool needs_static_row(const mutation_partition& mp, const std::vector<view_and_base>& views);

+// Whether this node and shard should generate and send view updates for the given token.
+// Checks that the node is one of the replicas (not a pending replica), and that this shard is ready for reads.
+bool should_generate_view_updates_on_this_shard(const schema_ptr& base, const locator::effective_replication_map_ptr& ermp, dht::token token);
+
 size_t memory_usage_of(const frozen_mutation_and_schema& mut);

 /**
--- a/db/view/view_update_checks.hh
+++ b/db/view/view_update_checks.hh
@@ -10,6 +10,7 @@

 #include <seastar/core/future.hh>
 #include "streaming/stream_reason.hh"
+#include "locator/token_metadata_fwd.hh"
 #include "seastarx.hh"

 namespace replica {
@@ -22,13 +23,9 @@ class system_distributed_keyspace;

 }

-namespace locator {
-class token_metadata;
-}
-
 namespace db::view {

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, locator::token_metadata_ptr tmptr, const replica::table& t,
        streaming::stream_reason reason);

 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -7,7 +7,7 @@
 */

 #include "db/view/view_update_backlog.hh"
-#include "exceptions/exceptions.hh"
+#include <seastar/core/timed_out_error.hh>
 #include "gms/inet_address.hh"
 #include <seastar/util/defer.hh>
 #include <boost/range/adaptor/map.hpp>
@@ -370,6 +370,17 @@ future<> view_update_generator::populate_views(const replica::table& table,
    }
 }

+
+// Generating view updates for a single client request can take a long time and might not finish before the timeout is
+// reached. In such a case this exception is thrown.
+// "Generating a view update" means creating a view update and scheduling it to be sent later.
+// This exception isn't thrown if the sending times out; it's only concerned with generating.
+struct view_update_generation_timeout_exception : public seastar::timed_out_error {
+    // Overrides the base class message with one specific to view update generation.
+    const char* what() const noexcept override {
+        return "Request timed out - couldn't prepare materialized view updates in time";
+    }
+};
+
 /**
 * Given some updates on the base table and the existing values for the rows affected by that update, generates the
 * mutations to be applied to the base table's views, and sends them to the paired view replicas.
@@ -446,7 +457,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
            }

            if (db::timeout_clock::now() > timeout) {
-                err = std::make_exception_ptr(exceptions::view_update_generation_timeout_exception());
+                err = std::make_exception_ptr(view_update_generation_timeout_exception());
                break;
            }
        }
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -551,10 +551,10 @@ public:
            return map_reduce_tables<stats>([] (replica::table& t) {
                logalloc::occupancy_stats s;
                uint64_t partition_count = 0;
-                for (replica::memtable* active_memtable : t.active_memtables()) {
-                    s += active_memtable->region().occupancy();
-                    partition_count += active_memtable->partition_count();
-                }
+                t.for_each_active_memtable([&] (replica::memtable& active_memtable) {
+                    s += active_memtable.region().occupancy();
+                    partition_count += active_memtable.partition_count();
+                });
                return stats{s.total_space(), s.free_space(), partition_count};
            }, stats::reduce).then([] (stats s) {
                return std::vector<std::pair<sstring, sstring>>{
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -28,6 +28,25 @@ if __name__ == '__main__':
                        help='enable compress on systemd-coredump')
    args = parser.parse_args()

+    # It seems that a specific version of the systemd package on RHEL9 has a
+    # bug in its SELinux configuration: it introduced the
+    # "systemd-container-coredump" module to provide rules for
+    # systemd-coredump, but the module is not enabled by default.
+    # We have to load it manually, otherwise it causes a permission error.
+    # (#19325)
+    if is_redhat_variant() and distro.major_version() == '9':
+        if not shutil.which('getenforce'):
+            pkg_install('libselinux-utils')
+        if not shutil.which('semodule'):
+            pkg_install('policycoreutils')
+        enforce = out('getenforce')
+        if enforce != "Disabled":
+            if os.path.exists('/usr/share/selinux/packages/targeted/systemd-container-coredump.pp.bz2'):
+                modules = out('semodule -l')
+                match = re.match(r'^systemd-container-coredump$', modules, re.MULTILINE)
+                if not match:
+                    run('semodule -v -i /usr/share/selinux/packages/targeted/systemd-container-coredump.pp.bz2', shell=True, check=True)
+                    run('semodule -v -e systemd-container-coredump', shell=True, check=True)
+
    # abrt-ccpp.service needs to stop before enabling systemd-coredump,
    # since both will try to install kernel coredump handler
    # (This will only requires for abrt < 2.14)
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -325,9 +325,27 @@ WantedBy=local-fs.target
        os.chown(dpath, uid, gid)

    if is_debian_variant():
+        if not shutil.which('update-initramfs'):
+            pkg_install('initramfs-tools')
        run('update-initramfs -u', shell=True, check=True)

    if not udev_info.uuid_link:
        LOGGER.error(f'Error detected, dumping udev env parameters on {fsdev}')
        udev_info.verify()
        udev_info.dump_variables()
+
+    # On Red Hat variants, give "{root}/coredump" the same SELinux type as
+    # systemd's default coredump directory.
+    if is_redhat_variant():
+        # Install the packages expected to provide the SELinux tools used
+        # below when the tools are not found on PATH.
+        if not shutil.which('getenforce'):
+            pkg_install('libselinux-utils')
+        if not shutil.which('restorecon'):
+            pkg_install('policycoreutils')
+        if not shutil.which('semanage'):
+            pkg_install('policycoreutils-python-utils')
+        selinux_status = out('getenforce')
+        # Look up the context configured for /var/lib/systemd/coredump.
+        selinux_context = out('matchpathcon -n /var/lib/systemd/coredump')
+        # Third ':'-separated field of the context.
+        # NOTE(review): presumably the SELinux type (user:role:type:level) - confirm.
+        selinux_type = selinux_context.split(':')[2]
+        # Register a file-context rule covering {root}/coredump and everything below it.
+        run(f'semanage fcontext -a -t {selinux_type} "{root}/coredump(/.*)?"', shell=True, check=True)
+        if selinux_status != 'Disabled':
+            # Apply the file contexts to the existing tree right away.
+            run(f'restorecon -F -v -R {root}', shell=True, check=True)
+        else:
+            # NOTE(review): creating /.autorelabel presumably requests a
+            # relabel on a later boot, since restorecon is skipped while
+            # SELinux is disabled - confirm.
+            Path('/.autorelabel').touch(exist_ok=True)
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -103,7 +103,7 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
-run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
 run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -158,33 +158,6 @@ Obsoletes:      scylla-server < 1.1
 %description conf
 This package contains the main scylla configuration file.

-# we need to refuse upgrade if current scylla < 1.7.3 && commitlog remains
-%pretrans conf
-ver=$(rpm -qi scylla-server | grep Version | awk '{print $3}')
-if [ -n "$ver" ]; then
-    ver_fmt=$(echo $ver | awk -F. '{printf "%d%02d%02d", $1,$2,$3}')
-    if [ $ver_fmt -lt 10703 ]; then
-        # for <scylla-1.2
-        if [ ! -f /opt/scylladb/lib/scylla/scylla_config_get.py ]; then
-            echo
-            echo "Error: Upgrading from scylla-$ver to scylla-%{version} is not supported."
-            echo "Please upgrade to scylla-1.7.3 or later, before upgrade to %{version}."
-            echo
-            exit 1
-        fi
-        commitlog_directory=$(/opt/scylladb/lib/scylla/scylla_config_get.py -g commitlog_directory)
-        commitlog_files=$(ls $commitlog_directory | wc -l)
-        if [ $commitlog_files -ne 0 ]; then
-            echo
-            echo "Error: Upgrading from scylla-$ver to scylla-%{version} is not supported when commitlog is not clean."
-            echo "Please upgrade to scylla-1.7.3 or later, before upgrade to %{version}."
-            echo "Also make sure $commitlog_directory is empty."
-            echo
-            exit 1
-        fi
-    fi
-fi
-
 %files conf
 %defattr(-,root,root)
 %attr(0755,root,root) %dir %{_sysconfdir}/scylla
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -85,7 +85,7 @@ redirects: setup
 # Preview commands
 .PHONY: preview
 preview: setup
-	$(POETRY) run sphinx-autobuild -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml --host $(PREVIEW_HOST) --port 5500 --ignore *.csv --ignore *.yaml
+	$(POETRY) run sphinx-autobuild -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml --host $(PREVIEW_HOST) --port 5500 --ignore *.csv --ignore *.json --ignore *.yaml

 .PHONY: multiversionpreview
 multiversionpreview: multiversion
--- a/docs/_ext/scylladb_cc_properties.py
+++ b/docs/_ext/scylladb_cc_properties.py
@@ -1,23 +1,19 @@
-import os
 import re
-import yaml
 from typing import Any, Dict, List

-import jinja2
-
 from sphinx import addnodes
 from sphinx.application import Sphinx
 from sphinx.directives import ObjectDescription
 from sphinx.util import logging, ws_re
-from sphinx.util.display import status_iterator
 from sphinx.util.docfields import Field
 from sphinx.util.docutils import switch_source_input, SphinxDirective
 from sphinx.util.nodes import make_id, nested_parse_with_titles
-from sphinx.jinja2glue import BuiltinTemplateLoader
 from docutils import nodes
 from docutils.parsers.rst import directives
 from docutils.statemachine import StringList

+from utils import maybe_add_filters
+
 logger = logging.getLogger(__name__)

 class DBConfigParser:
@@ -152,51 +148,6 @@ class DBConfigParser:
        return DBConfigParser.all_properties[name]


-def readable_desc(description: str) -> str:
-    """
-    This function is deprecated and maintained only for backward compatibility 
-    with previous versions. Use ``readable_desc_rst``instead.
-    """
-    return (
-        description.replace("\\n", "")
-        .replace('<', '&lt;')
-        .replace('>', '&gt;')
-        .replace("\n", "<br>")
-        .replace("\\t", "- ")
-        .replace('"', "")
-    )
-
-
-def readable_desc_rst(description):
-    indent = ' ' * 3
-    lines = description.split('\n')
-    cleaned_lines = []
-    
-    for line in lines:
-
-        cleaned_line = line.replace('\\n', '\n')
-
-        if line.endswith('"'):
-            cleaned_line = cleaned_line[:-1] + ' '
-
-        cleaned_line = cleaned_line.lstrip()
-        cleaned_line = cleaned_line.replace('"', '')
-        
-        if cleaned_line != '':
-            cleaned_line = indent + cleaned_line
-            cleaned_lines.append(cleaned_line)
-    
-    return ''.join(cleaned_lines)
-
-
-def maybe_add_filters(builder):
-    env = builder.templates.environment
-    if 'readable_desc' not in env.filters:
-        env.filters['readable_desc'] = readable_desc
-
-    if 'readable_desc_rst' not in env.filters:
-        env.filters['readable_desc_rst'] = readable_desc_rst
-

 class ConfigOption(ObjectDescription):
    has_content = True
--- a/docs/_ext/scylladb_metrics.py
+++ b/docs/_ext/scylladb_metrics.py
@@ -0,0 +1,188 @@
+import os
+import sys
+import json
+from sphinx import addnodes
+from sphinx.directives import ObjectDescription
+from sphinx.util.docfields import Field
+from sphinx.util.docutils import switch_source_input
+from sphinx.util.nodes import make_id
+from sphinx.util import logging, ws_re
+from docutils.parsers.rst import Directive, directives
+from docutils.statemachine import StringList
+from sphinxcontrib.datatemplates.directive import DataTemplateJSON
+from utils import maybe_add_filters
+
+sys.path.insert(0, os.path.abspath("../../scripts"))
+import scripts.get_description as metrics
+
+LOGGER = logging.getLogger(__name__)
+
+
+class MetricsProcessor:
+
+    MARKER = "::description"
+
+    def _create_output_directory(self, app, metrics_directory):
+        output_directory = os.path.join(app.builder.srcdir, metrics_directory)
+        os.makedirs(output_directory, exist_ok=True)
+        return output_directory
+
+    def _process_single_file(self, file_path, destination_path, metrics_config_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        if self.MARKER in content and not os.path.exists(destination_path):
+            try:
+                metrics_file = metrics.get_metrics_from_file(file_path, "scylla", metrics.get_metrics_information(metrics_config_path))
+                with open(destination_path, 'w+', encoding='utf-8') as f:
+                    json.dump(metrics_file, f, indent=4)
+            except SystemExit:
+                LOGGER.info(f'Skipping file: {file_path}')
+            except Exception as error:
+                LOGGER.info(error)
+
+    def _process_metrics_files(self, repo_dir, output_directory, metrics_config_path):
+        for root, _, files in os.walk(repo_dir):
+            for file in files:
+                if file.endswith(".cc"):
+                    file_path = os.path.join(root, file)
+                    file_name = os.path.splitext(file)[0] + ".json"
+                    destination_path = os.path.join(output_directory, file_name)
+                    self._process_single_file(file_path, destination_path, metrics_config_path)
+
+    def run(self, app, exception=None):
+        repo_dir = os.path.abspath(os.path.join(app.srcdir, ".."))
+        metrics_config_path = os.path.join(repo_dir, app.config.scylladb_metrics_config_path)
+        output_directory = self._create_output_directory(app, app.config.scylladb_metrics_directory)
+
+        self._process_metrics_files(repo_dir, output_directory, metrics_config_path)
+
+
+class MetricsTemplateDirective(DataTemplateJSON):
+    """``DataTemplateJSON`` variant that also passes a ``title`` option to the template."""
+    option_spec = DataTemplateJSON.option_spec.copy()
+    # Identity converter: the title is accepted verbatim.
+    option_spec["title"] = lambda x: x
+
+    def _make_context(self, data, config, env):
+        """Return the parent's template context extended with ``title`` (``None`` when unset)."""
+        context = super()._make_context(data, config, env)
+        context["title"] = self.options.get("title")
+        return context
+
+    def run(self):
+        # Delegates unchanged to the parent implementation.
+        return super().run()
+
+
+class MetricsOption(ObjectDescription):
+    """Object description directive for a single metric.
+
+    Takes the metric name as its only argument and the ``type``,
+    ``component``, ``key`` and ``source`` options; the body is produced by
+    rendering the template named by ``scylladb_metrics_option_template``.
+    """
+    has_content = True
+    required_arguments = 1
+    optional_arguments = 0
+    final_argument_whitespace = False
+    option_spec = {
+        'type': directives.unchanged,
+        'component': directives.unchanged,
+        'key': directives.unchanged,
+        'source': directives.unchanged,
+    }
+
+    doc_field_types = [
+        Field('type', label='Type', has_arg=False, names=('type',)),
+        Field('component', label='Component', has_arg=False, names=('component',)),
+        Field('key', label='Key', has_arg=False, names=('key',)),
+        Field('source', label='Source', has_arg=False, names=('source',)),
+    ]
+
+    def handle_signature(self, sig: str, signode: addnodes.desc_signature):
+        """Render ``sig`` as the description name and return it with whitespace runs collapsed to one space."""
+        signode.clear()
+        signode += addnodes.desc_name(sig, sig)
+        return ws_re.sub(' ', sig)
+
+    @property
+    def env(self):
+        """The build environment taken from the current document's settings."""
+        return self.state.document.settings.env
+
+    def _render(self, name, option_type, component, key, source):
+        """Render the configured option template with the metric's fields and return the text."""
+        item = {'name': name, 'type': option_type, 'component': component, 'key': key, 'source': source }
+        template = self.config.scylladb_metrics_option_template
+        return self.env.app.builder.templates.render(template, item)
+
+    def transform_content(self, contentnode: addnodes.desc_content) -> None:
+        """Render the option template and parse the result into ``contentnode``.
+
+        Options that were not given default to an empty string.
+        """
+        name = self.arguments[0]
+        option_type = self.options.get('type', '')
+        component = self.options.get('component', '')
+        key = self.options.get('key', '')
+        source_file = self.options.get('source', '')
+        _, lineno = self.get_source_info()
+        # Synthetic source label attached to the generated lines.
+        source = f'scylladb_metrics:{lineno}:<{name}>'
+        fields = StringList(self._render(name, option_type, component, key, source_file).splitlines(), source=source, parent_offset=lineno)
+        with switch_source_input(self.state, fields):
+            self.state.nested_parse(fields, 0, contentnode)
+
+    def add_target_and_index(self, name: str, sig: str, signode: addnodes.desc_signature) -> None:
+        """Give the signature node an id, add an index entry and register the object in the ``std`` domain."""
+        node_id = make_id(self.env, self.state.document, self.objtype, name)
+        signode['ids'].append(node_id)
+        self.state.document.note_explicit_target(signode)
+        entry = f'{name}; metrics option'
+        self.indexnode['entries'].append(('pair', entry, node_id, '', None))
+        self.env.get_domain('std').note_object(self.objtype, name, node_id, location=signode)
+
+class MetricsDirective(Directive):
+    TEMPLATE = 'metrics.tmpl'
+    required_arguments = 0
+    optional_arguments = 1
+    option_spec = {'template': directives.path}
+    has_content = True
+
+    def _process_file(self, file, relative_path_from_current_rst):
+        data_directive = MetricsTemplateDirective(
+            name=self.name,
+            arguments=[os.path.join(relative_path_from_current_rst, file)],
+            options=self.options,
+            content=self.content,
+            lineno=self.lineno,
+            content_offset=self.content_offset,
+            block_text=self.block_text,
+            state=self.state,
+            state_machine=self.state_machine,
+        )
+        data_directive.options["template"] = self.options.get('template', self.TEMPLATE)
+        data_directive.options["title"] = file.replace('_', ' ').replace('.json','').capitalize()
+        return data_directive.run()
+
+    def _get_relative_path(self, output_directory, app, docname):
+        current_rst_path = os.path.join(app.builder.srcdir, docname + ".rst")
+        return os.path.relpath(output_directory, os.path.dirname(current_rst_path))
+
+
+    def run(self):
+        maybe_add_filters(self.state.document.settings.env.app.builder)
+        app = self.state.document.settings.env.app
+        docname = self.state.document.settings.env.docname
+        metrics_directory = os.path.join(app.builder.srcdir, app.config.scylladb_metrics_directory)
+        output = []
+        try:
+            relative_path_from_current_rst = self._get_relative_path(metrics_directory, app, docname)
+            files = os.listdir(metrics_directory)
+            for _, file in enumerate(files):
+                output.extend(self._process_file(file, relative_path_from_current_rst))
+        except Exception as error:
+            LOGGER.info(error)
+        return output
+
+def setup(app):
+    """Sphinx extension entry point.
+
+    Registers the extension's config values, hooks ``MetricsProcessor.run``
+    to the ``builder-inited`` event, and registers the ``metrics_option``
+    object type and the ``metrics_option`` / ``scylladb_metrics`` directives.
+    Returns the extension metadata dict.
+    """
+    app.add_config_value("scylladb_metrics_directory", default="_data/metrics", rebuild="html")
+    app.add_config_value("scylladb_metrics_config_path", default='scripts/metrics-config.yml', rebuild="html")
+    app.add_config_value('scylladb_metrics_option_template', default='metrics_option.tmpl', rebuild='html', types=[str])
+    app.connect("builder-inited", MetricsProcessor().run)
+    app.add_object_type(
+        'metrics_option',
+        'metrics_option',
+        objname='metrics option')
+    # Replace the directive registered for the object type above with MetricsOption.
+    app.add_directive_to_domain('std', 'metrics_option', MetricsOption, override=True)
+    app.add_directive("metrics_option", MetricsOption)
+    app.add_directive("scylladb_metrics", MetricsDirective)
+
+   
+    return {
+        "version": "0.1",
+        "parallel_read_safe": True,
+        "parallel_write_safe": True,
+    }
+
--- a/docs/_ext/utils.py
+++ b/docs/_ext/utils.py
@@ -0,0 +1,44 @@
+def readable_desc(description: str) -> str:
+    """
+    This function is deprecated and maintained only for backward compatibility 
+    with previous versions. Use ``readable_desc_rst`` instead.
+    """
+    return (
+        description.replace("\\n", "")
+        .replace('<', '&lt;')
+        .replace('>', '&gt;')
+        .replace("\n", "<br>")
+        .replace("\\t", "- ")
+        .replace('"', "")
+    )
+
+
+def readable_desc_rst(description):
+    indent = ' ' * 3
+    lines = description.split('\n')
+    cleaned_lines = []
+    
+    for line in lines:
+
+        cleaned_line = line.replace('\\n', '\n')
+
+        if line.endswith('"'):
+            cleaned_line = cleaned_line[:-1] + ' '
+
+        cleaned_line = cleaned_line.lstrip()
+        cleaned_line = cleaned_line.replace('"', '')
+        
+        if cleaned_line != '':
+            cleaned_line = indent + cleaned_line
+            cleaned_lines.append(cleaned_line)
+    
+    return ''.join(cleaned_lines)
+
+
+def maybe_add_filters(builder):
+    env = builder.templates.environment
+    if 'readable_desc' not in env.filters:
+        env.filters['readable_desc'] = readable_desc
+
+    if 'readable_desc_rst' not in env.filters:
+        env.filters['readable_desc_rst'] = readable_desc_rst
--- a/docs/_static/css/custom.css
+++ b/docs/_static/css/custom.css
@@ -41,6 +41,6 @@ dl dt:hover > a.headerlink {
    visibility: visible;
 }

-dl.confval {
+dl.confval, dl.metrics_option {
    border-bottom: 1px solid #cacaca;
 }
--- a/docs/_templates/metrics.tmpl
+++ b/docs/_templates/metrics.tmpl
@@ -0,0 +1,19 @@
+.. -*- mode: rst -*-
+
+{{title}}
+{{ '-' * title|length }}
+
+{% if data  %}
+{% for key, value in data.items() %}
+.. _metricsprop_{{ key }}:
+
+.. metrics_option:: {{ key }}
+  :type: {{value[0]}}
+  :source: {{value[4]}}
+  :component: {{value[2]}}
+  :key: {{value[3]}}
+
+  {{value[1] | readable_desc_rst}}
+
+{% endfor %}
+{% endif %}
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`-**Please replace this line with justification for the backport/\* labels added to this PR**`