Compare commits

...

91 Commits

Author SHA1 Message Date
Anna Stuchlik
ec9e5b82a0 doc: remove wrong image upgrade info (5.2-to-2023.1)
This commit removes the information about the recommended way of upgrading
ScyllaDB images - by updating ScyllaDB and OS packages in one step. This upgrade
procedure is not supported (it was implemented, but then reverted).

Refs https://github.com/scylladb/scylladb/issues/15733

Closes scylladb/scylladb#21876
Fixes https://github.com/scylladb/scylla-enterprise/issues/5041
Fixes https://github.com/scylladb/scylladb/issues/21898

(cherry picked from commit 98860905d8)
2024-12-12 15:27:24 +02:00
Aleksandra Martyniuk
4640b3efd3 repair: use find_column_family in insert_repair_meta
repair_service::insert_repair_meta gets the reference to a table
and passes it to continuations. If the table is dropped in the meantime,
the reference becomes invalid.

Use find_column_family at each table occurrence in insert_repair_meta
instead.
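
A minimal standalone C++ sketch of the pattern (illustrative only; the registry and the `find_table` helper are hypothetical stand-ins for `database::find_column_family`):

```cpp
#include <map>
#include <memory>

struct table {};

// Stand-in registry, playing the role of database::find_column_family().
std::map<int, std::shared_ptr<table>> tables;

table* find_table(int id) {
    auto it = tables.find(id);
    return it == tables.end() ? nullptr : it->second.get();
}

// Each step re-resolves the table instead of caching a reference across
// suspension points; a concurrently dropped table is then seen as
// "not found" rather than as a dangling reference.
void repair_step(int table_id) {
    if (table* t = find_table(table_id)) {
        (void)t; // use *t for this step only
    } else {
        // table was dropped concurrently; abort this repair step
    }
}
```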

Fixes: #20057

(cherry picked from commit 719999b34c)

Refs #19953

Closes scylladb/scylladb#20078
2024-08-14 22:20:38 +03:00
Michał Chojnowski
136ccff353 cql_test_env: ensure shutdown() before stop() for system_keyspace
If system_keyspace::stop() is called before system_keyspace::shutdown(),
it will never finish, because the uncleared shared pointers will keep
it alive indefinitely.

Currently this can happen if an exception is thrown before the construction
of the shutdown() defer. This patch moves the shutdown() call to immediately
before stop(). I see no reason why it should be elsewhere.

Fixes scylladb/scylla-enterprise#4380

(cherry picked from commit eeaf4c3443)

Closes scylladb/scylladb#20146
2024-08-14 20:15:50 +03:00
Michael Litvak
29c352d9c8 db: fix waiting for counter update operations on table stop
When a table is dropped it should wait for all pending operations in the
table before the table is destroyed, because the operations may use the
table's resources.
With counter update operations, currently this is not the case. The
table may be destroyed while there is a counter update operation in
progress, causing an assert to be triggered due to a resource being
destroyed while it's in use.
The reason the operation is not waited for is a mistake in the lifetime
management of the object representing the write in progress. The commit
fixes it so the object lives for the duration of the entire counter
update operation, by moving it to the `do_with` list.
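
A standard-C++ analogue of the lifetime fix (illustrative only; `write_op` and `start_write` are hypothetical, and `std::async` stands in for `do_with`): the operation state is moved into the asynchronous task itself, so it outlives the caller's frame.

```cpp
#include <future>
#include <iostream>
#include <string>
#include <utility>

struct write_op { std::string payload; };

// Moving the state into the task keeps it alive for the duration of the
// entire asynchronous operation - the effect the patch achieves by
// moving the object into the do_with list.
std::future<std::size_t> start_write(write_op op) {
    return std::async(std::launch::async,
                      [op = std::move(op)] { return op.payload.size(); });
}

int main() {
    auto fut = start_write(write_op{"counter update"});
    std::cout << fut.get() << "\n"; // 14
}
```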

Fixes scylladb/scylla-enterprise#4475

(cherry picked from commit ff86c864ff)

Closes scylladb/scylladb#19980
2024-08-07 10:52:39 +02:00
Kamil Braun
888d5fe1a3 Merge '[Backport 5.4] raft: fix the shutdown phase being stuck' from Emil Maskovsky
Some of the calls inside the raft_group0_client::start_operation() method were missing the abort source parameter. This caused the repair test to be stuck in the shutdown phase - the abort source has been triggered, but the operations were not checking it.

This was in particular the case of operations that try to take the ownership of the raft group semaphore (get_units(semaphore)) - these waits should be cancelled when the abort source is triggered.

This should fix the following tests that were failing in some percentage of dtest runs (about 1-3 of 100):
* TestRepairAdditional::test_repair_kill_1
* TestRepairAdditional::test_repair_kill_3

Fixes #19223

(cherry picked from commit 2dbe9ef2f2)

(cherry picked from commit 5dfc50d354)

Refs #19860

Closes scylladb/scylladb#19986

* github.com:scylladb/scylladb:
  raft: fix the shutdown phase being stuck
  raft: use the abort source reference in raft group0 client interface
2024-08-05 16:28:19 +02:00
Emil Maskovsky
6e8911ed51 raft: fix the shutdown phase being stuck
Some of the calls inside the `raft_group0_client::start_operation()`
method were missing the abort source parameter. This caused the repair
test to be stuck in the shutdown phase - the abort source has been
triggered, but the operations were not checking it.

This was in particular the case of operations that try to take the
ownership of the raft group semaphore (`get_units(semaphore)`) - these
waits should be cancelled when the abort source is triggered.
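
A standard-C++20 sketch of an abortable wait (illustrative only; Seastar's `abort_source` mechanism differs in detail): the wait observes a stop signal instead of blocking forever on a semaphore that is never signalled.

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>

int main() {
    std::mutex m;
    std::condition_variable_any cv;
    int available = 0; // never signalled, as during a stuck shutdown

    std::jthread waiter([&](std::stop_token st) {
        std::unique_lock lk(m);
        // Abortable wait: returns false when stop is requested instead
        // of waiting forever for units that will never arrive.
        bool got = cv.wait(lk, st, [&] { return available > 0; });
        std::cout << (got ? "acquired\n" : "aborted\n");
    });
    // jthread's destructor requests stop and joins: prints "aborted".
}
```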

This should fix the following tests that were failing in some percentage
of dtest runs (about 1-3 of 100):
* TestRepairAdditional::test_repair_kill_1
* TestRepairAdditional::test_repair_kill_3

Fixes scylladb/scylladb#19223

(cherry picked from commit 5dfc50d354)
2024-08-02 11:00:08 +02:00
Emil Maskovsky
9c4fa2652c raft: use the abort source reference in raft group0 client interface
Most callers of the raft group0 client interface are passing a real
abort source instance, so we can use the abort source reference in the client
interface. This change makes the code simpler and more consistent.

(cherry picked from commit 2dbe9ef2f2)
2024-08-02 10:59:44 +02:00
Avi Kivity
58377036b0 Merge '[Backport 5.4] sstables: fix some mixups between the writer's schema and the sstable's schema ' from Michał Chojnowski
There are two schemas associated with a sstable writer:
the sstable's schema (i.e. the schema of the table at the time when the
sstable object was created), and the writer's schema (equal to the schema
of the reader which is feeding into the writer).

It's easy to mix up the two and break something as a result.

The writer's schema is needed to correctly interpret and serialize the data
passing through the writer, and to populate the on-disk metadata about the
on-disk schema.

The sstable's schema is used to configure some parameters for the newly created
sstable, such as the bloom filter false positive ratio, or compression.

This series fixes the known mixups between the two — when setting up compression,
and when setting up the bloom filters.

Fixes scylladb/scylladb#16065

The bug is present in all supported versions, so the patch has to be backported to all of them.

(cherry picked from commit a1834efd82)

(cherry picked from commit d10b38ba5b)

(cherry picked from commit 1a8ee69a43)

Refs scylladb/scylladb#19695

Closes scylladb/scylladb#19878

* github.com:scylladb/scylladb:
  sstables/mx/writer: when creating local_compression, use the sstable's schema, not the writer's
  sstables/mx/writer: when creating filter, use the sstable's schema, not the writer's
  sstables: for i_filter downcasts, use dynamic_cast instead of static_cast
2024-07-28 18:15:27 +03:00
Michał Chojnowski
5b29da123f sstables/mx/writer: when creating local_compression, use the sstable's schema, not the writer's
There are two schemas associated with a sstable writer:
the sstable's schema (i.e. the schema of the table at the time when the
sstable object was created), and the writer's schema (equal to the schema
of the reader which is feeding into the writer).

It's easy to mix up the two and break something as a result.

The writer's schema is needed to correctly interpret and serialize the data
passing through the writer, and to populate the on-disk metadata about the
on-disk schema.

The sstable's schema is used to configure some parameters for the newly created
sstable, such as the bloom filter false positive ratio, or compression.

The problem fixed by this patch is that the writer was wrongly creating
the compressor objects based on its own schema, but using them based
on the sstable's schema.
This patch forces the writer to use the sstable's schema for both.
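
A sketch of the rule with hypothetical types (not the actual writer code):

```cpp
#include <memory>
#include <string>

struct schema {
    std::string compressor; // per-table compression parameters
    double bloom_fp_chance; // bloom filter false-positive ratio
};

struct writer {
    std::shared_ptr<const schema> writer_schema;  // interprets incoming data
    std::shared_ptr<const schema> sstable_schema; // table params at sstable creation

    // Per-sstable parameters such as compression (and, in the sibling
    // patch, the bloom filter) come from the sstable's schema, never
    // from the writer's schema.
    const std::string& pick_compressor() const {
        return sstable_schema->compressor;
    }
};
```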

(cherry picked from commit 1a8ee69a43)
2024-07-25 12:28:00 +02:00
Michał Chojnowski
92ee525f22 sstables/mx/writer: when creating filter, use the sstable's schema, not the writer's
There are two schemas associated with a sstable writer:
the sstable's schema (i.e. the schema of the table at the time when the
sstable object was created), and the writer's schema (equal to the schema
of the reader which is feeding into the writer).

It's easy to mix up the two and break something as a result.

The writer's schema is needed to correctly interpret and serialize the data
passing through the writer, and to populate the on-disk metadata about the
on-disk schema.

The sstable's schema is used to configure some parameters for the newly created
sstable, such as the bloom filter false positive ratio, or compression.

The problem fixed by this patch is that the writer was wrongly creating
the filter based on its own schema, while the layer outside the writer
was interpreting it as if it was created with the sstable's schema.

This patch forces the writer to pick the filter's parameters based on the
sstable's schema instead.

(cherry picked from commit d10b38ba5b)
2024-07-25 12:28:00 +02:00
Michał Chojnowski
bc1c6275a4 sstables: for i_filter downcasts, use dynamic_cast instead of static_cast
As of this patch, those static_casts are actually invalid in some cases
(they cast to the wrong type) because of an oversight.
A later patch will fix that. But to even write a reliable reproducer
for the problem, we must force the invalid casts to manifest as a crash
(instead of weird results).

This patch both allows writing a reproducer for the bug and serves
as a bit of defensive programming for the future.
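
A minimal illustration of the general idea (the filter types are hypothetical):

```cpp
#include <cassert>
#include <iostream>
#include <memory>

struct i_filter { virtual ~i_filter() = default; };
struct murmur_filter : i_filter { int hash_count = 3; };
struct always_present_filter : i_filter {};

int main() {
    std::unique_ptr<i_filter> f = std::make_unique<always_present_filter>();

    // static_cast<murmur_filter*>(f.get()) would compile, but reading
    // through the result is undefined behaviour: "weird results".

    // dynamic_cast makes the mismatch observable: it returns nullptr,
    // which an assert turns into a deterministic crash - a reproducer.
    auto* downcast = dynamic_cast<murmur_filter*>(f.get());
    assert(downcast == nullptr);
    std::cout << "wrong-type downcast detected\n";
}
```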

(cherry picked from commit a1834efd82)

# Conflicts:
#	sstables/sstables.cc
2024-07-25 12:28:00 +02:00
Nadav Har'El
79629a80cd alternator: fix "/localnodes" to not return nodes still joining
Alternator's "/localnodes" HTTP request is supposed to return the list of
nodes in the local DC to which the user can send requests.

The existing implementation incorrectly used gossiper::is_alive() to check
for which nodes to return - but "alive" nodes include nodes which are still
joining the cluster and not really usable. These nodes can remain in the
JOINING state for a long time while they are copying data, and an attempt
to send requests to them will fail.

The fix for this bug is trivial: change the call to is_alive() to a call
to is_normal().

But the hard part of this fix is the testing:

1. An existing multi-node test for "/localnodes" assumed that right after
   a new node was created, it appears on "/localnodes". But after this
   patch, it may take a bit more time for the bootstrapping to complete
   and the new node to appear in /localnodes - so I had to add a retry loop.

2. I added a test that reproduces the bug fixed here, and verifies its
   fix. The test is in the multi-node topology framework. It adds an
   injection which delays the bootstrap, which leaves a new node in JOINING
   state for a long time. The test then verifies that the new node is
   alive (as checked by the REST API), but is not returned by "/localnodes".

3. The new injection for delaying the bootstrap is unfortunately not
   very pretty - I had to do it in three places because we have several
   code paths of how bootstrap works without repair, with repair, without
   Raft and with Raft - and I wanted to delay all of them.

Fixes #19694.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#19725

(cherry picked from commit bac7c33313)
(deleted test for cherry-pick)
(cherry picked from commit af39675c38)
2024-07-24 11:58:16 +03:00
Lakshmi Narayanan Sreethar
9f0b75bcd2 [Backport 5.4]: sstables: do not reload components of unlinked sstables
The SSTable is removed from the reclaimed memory tracking logic only
when its object is deleted. However, there is a risk that the Bloom
filter reloader may attempt to reload the SSTable after it has been
unlinked but before the SSTable object is destroyed. Prevent this by
removing the SSTable from the reclaimed list maintained by the manager
as soon as it is unlinked.

The original logic that updated the memory tracking in
`sstables_manager::deactivate()` is left in place as (a) the variables
have to be updated only when the SSTable object is actually deleted, as
the memory used by the filter is not freed as long as the SSTable is
alive, and (b) the `_reclaimed.erase(*sst)` is still useful during
shutdown, for example, when the SSTable is not unlinked but just
destroyed.

Fixes https://github.com/scylladb/scylladb/issues/19722

Closes scylladb/scylladb#19717

* github.com:scylladb/scylladb:
  boost/bloom_filter_test: add testcase to verify unlinked sstables are not reloaded
  sstables: do not reload components of unlinked sstables
  sstables/sstables_manager: introduce on_unlink method

(cherry picked from commit 591876b44e)

Backported from #19717 to 5.4

Closes scylladb/scylladb#19831
2024-07-23 21:31:23 +03:00
Kamil Braun
0fbec200e9 Merge '[Backport 5.4] Fix lwt semaphore guard accounting' from ScyllaDB
Currently the guard does not account correctly for an ongoing operation if semaphore acquisition fails: it may signal the semaphore when it is not held.

Should be backported to all supported versions.

(cherry picked from commit 87beebeed0)

(cherry picked from commit 4178589826)

Refs #19699

Closes scylladb/scylladb#19795

* github.com:scylladb/scylladb:
  test: add test to check that coordinator lwt semaphore continues functioning after locking failures
  paxos: do not signal semaphore if it was not acquired
2024-07-22 10:34:47 +02:00
Gleb Natapov
972b799773 test: add test to check that coordinator lwt semaphore continues functioning after locking failures
(cherry picked from commit 4178589826)
2024-07-19 19:18:35 +02:00
Gleb Natapov
68c581314a paxos: do not signal semaphore if it was not acquired
The guard signals a semaphore during destruction if it is marked as
locked, but currently it may be marked as locked even if locking failed.
Fix this by using semaphore_units instead of managing the locked flag
manually.
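
A standard-C++ sketch of the same RAII idea (illustrative only; Seastar's `semaphore_units` differs in detail):

```cpp
#include <optional>
#include <semaphore>
#include <utility>

// RAII units: the semaphore is released in the destructor only if it was
// actually acquired, so a failed acquisition can never cause a spurious
// release - unlike a manually managed "locked" flag.
class units {
    std::counting_semaphore<>* _sem = nullptr;
public:
    explicit units(std::counting_semaphore<>& sem) : _sem(&sem) {}
    units(units&& o) noexcept : _sem(std::exchange(o._sem, nullptr)) {}
    ~units() { if (_sem) { _sem->release(); } }
};

std::optional<units> try_get_units(std::counting_semaphore<>& sem) {
    if (!sem.try_acquire()) {
        return std::nullopt; // nothing acquired, nothing to release later
    }
    return units(sem);
}
```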

Fixes: https://github.com/scylladb/scylladb/issues/19698
(cherry picked from commit 87beebeed0)
2024-07-18 15:34:16 +00:00
Michael Litvak
380ce9a6d8 storage_proxy: remove response handler if no targets
When writing a mutation, it might happen that there are no live targets
to send the mutation to, yet the request can be satisfied. For example,
when writing with CL=ANY to a dead node, the request is completed by
storing a local hint.

Currently, in that case, a write response handler is created for the
request and it remains active until it times out, because it is not
removed anywhere, even though the write is completed successfully after
storing the hint. The response handler should usually be removed when
receiving responses from all targets, but in this case there are no
targets to trigger the removal.

In this commit we check whether there are no live targets to send the
mutation to; if so, we remove the response handler immediately.

Fixes scylladb/scylladb#19529

(cherry picked from commit a9fdd0a93a)

Closes scylladb/scylladb#19679
2024-07-15 16:43:28 +02:00
Wojciech Mitros
2c01dfe12b test: account for multiple flushes of commitlog segments
Currently, when we calculate the number of deactivated segments
in test_commitlog_delete_when_over_disk_limit, we only count the
segments that were active during the first flush. However, during
the test, there may have been more than one flush, and a segment
could have been created between them. This segment would sometimes
get deactivated and even destroyed, and as a result, the count of
destroyed segments would appear larger than the count of deactivated
ones.

This patch fixes this behavior by accounting for all segments that
were active during any flush instead of just segments active during
the first flush.

Fixes #10527

(cherry picked from commit 39a8f4310d)

Closes scylladb/scylladb#19706
2024-07-14 23:21:02 +02:00
Jenkins Promoter
ab22cb7253 Update ScyllaDB version to: 5.4.10 2024-07-11 15:43:24 +03:00
Tomasz Grabiec
0e02128d28 Merge '[Backport 5.4] mutation_partition_v2: in apply_monotonically(), avoid bad_alloc on sentinel insertion' from ScyllaDB
apply_monotonically() is run with reclaim disabled. So with some bad luck,
sentinel insertion might fail with bad_alloc even on a perfectly healthy node.
We can't deal with the failure of sentinel insertion, so this will result in a
crash.

This patch prevents the spurious OOM by reserving some memory (1 LSA segment)
and only making it available right before the critical allocations.

Fixes https://github.com/scylladb/scylladb/issues/19552

(cherry picked from commit f784be6a7e)

(cherry picked from commit 7b3f55a65f)

(cherry picked from commit 78d6471ce4)

Refs #19617

Closes scylladb/scylladb#19676

* github.com:scylladb/scylladb:
  mutation_partition_v2: in apply_monotonically(), avoid bad_alloc on sentinel insertion
  logalloc: add hold_reserve
  logalloc: generalize refill_emergency_reserve()
2024-07-10 14:29:39 +02:00
Botond Dénes
1e548770cf Merge '[Backport 5.4] reader_concurrency_semaphore: make CPU concurrency configurable' from Botond Dénes
The reader concurrency semaphore restricts the concurrency of reads that require CPU (intention: they read from the cache) to 1, meaning that if there is even a single active read which declares that it needs just CPU to proceed, no new read is admitted. This is meant to keep the concurrency of reads in the cache at 1. The idea is that concurrency in the cache is not useful: it just leads to the reactor rotating between these reads, all of them finishing later than they could if each were the only active read in the cache.
This was observed to backfire in the case where reads from a single table are mostly very fast, but reads on some keys are very slow (hint: a collection full of tombstones). In this case a slow read holds up the fast reads in the queue, increasing the 99th percentile latencies significantly.

This series proposes to fix this by making the CPU concurrency configurable. We don't like tunables like this, and this is not a proper fix but a workaround. The proper fix would be to allow cutting any page early, but we cannot cut a page in the middle of a row. We could maybe have a way of detecting slow reads and excluding them from the CPU concurrency, but that would be a heuristic and hard to get right. So this series offers a robust and simple config option, which can be used on the few clusters that do suffer from the too-strict concurrency limit. We have seen it in very few cases so far, so it doesn't seem to be widespread.

Fixes: https://github.com/scylladb/scylladb/issues/19017

This PR backports https://github.com/scylladb/scylladb/pull/19018 and its follow-up https://github.com/scylladb/scylladb/pull/19600.

Closes scylladb/scylladb#19646

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: execution_loop(): move maybe_admit_waiters() to the inner loop
  test/boost/reader_concurrency_semaphore_test: add test for live-configurable cpu concurrency
  test/boost/reader_concurrency_semaphore_test: hoist require_can_admit
  reader_concurrency_semaphore: wire in the configurable cpu concurrency
  reader_concurrency_semaphore: add cpu_concurrency constructor parameter
  db/config: introduce reader_concurrency_semahore_cpu_concurrency
2024-07-10 13:32:07 +03:00
Wojciech Przytuła
3e879c1bfa storage_proxy: fix uninitialized LWT contention counter
When debugging the issue of a high LWT contention metric, we (the
drivers team) discovered that at least 3 drivers (Go, Java, Rust)
cause high numbers in that metric in LWT workloads - we doubted that
all those drivers route LWT queries badly. We tried to understand that
metric and its semantics. It took 3 people over 10 hours to figure out
what it is supposed to count.

People from core team suspected that it was the drivers sending
requests to different shards, causing contention. Then we ran the
workload against a single node single shard cluster... and observed
contention. Finally, we looked into the Scylla code and saw it.

**Uninitialized stack value.**

The core member was shocked. But we, the drivers people, felt we always
knew it. It's yet another time that we are blamed for a server-side
issue. We rebuilt scylla with the variable initialized to 0 and the
metric kept being 0.

To prevent such errors in the future, let's consider some lints that
warn against uninitialized variables. This is such an obvious feature
of e.g. Rust, and yet it has been shown to cause a painful bug in 2024.
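
A minimal illustration with a hypothetical struct (compilers can also flag some of these cases with warnings such as -Wuninitialized):

```cpp
#include <iostream>

struct cas_stats {
    // int contention;    // BAD: reads before any assignment are
    //                    // undefined; the metric reports garbage.
    int contention = 0;   // GOOD: the metric starts at a known zero.
};

int main() {
    cas_stats s;
    std::cout << s.contention << "\n"; // deterministically 0
}
```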

Fixes: #19654
(cherry picked from commit 36a125bf97)

Closes scylladb/scylladb#19656
2024-07-10 13:29:31 +03:00
Michał Chojnowski
5e9a2193db mutation_partition_v2: in apply_monotonically(), avoid bad_alloc on sentinel insertion
apply_monotonically() is run with reclaim disabled. So with some bad luck,
sentinel insertion might fail with bad_alloc even on a perfectly healthy node.
We can't deal with the failure of sentinel insertion, so this will result in a
crash.

This patch prevents the spurious OOM by reserving some memory (1 LSA segment)
and only making it available right before the critical allocations.

Fixes scylladb/scylladb#19552

(cherry picked from commit 78d6471ce4)
2024-07-10 08:36:12 +00:00
Michał Chojnowski
c2e5d9e726 logalloc: add hold_reserve
mutation_partition_v2::apply_monotonically() needs to perform some allocations
in a destructor, to ensure that the invariants of the data structure are
restored before returning. But it is usually called with reclaiming disabled,
so the allocations might fail even in a perfectly healthy node with plenty of
reclaimable memory.

This patch adds a mechanism which allows reserving some LSA memory (by
asking the allocator to keep it unused) and making it available for allocation
right when we need to guarantee allocation success.
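
A rough standard-C++ analogue of the mechanism (illustrative only; the real reserve is an LSA segment held inside the log-structured allocator, not a heap buffer):

```cpp
#include <cstddef>
#include <memory>

// Pre-reserve memory while failure is still tolerable, and hand it back
// to the allocator only right before allocations that must not fail
// (e.g. restoring invariants in a destructor).
class hold_reserve {
    std::unique_ptr<std::byte[]> _reserve;
public:
    explicit hold_reserve(std::size_t bytes)
        : _reserve(std::make_unique<std::byte[]>(bytes)) {}
    // Make the reserved memory available for the critical allocations.
    void release() noexcept { _reserve.reset(); }
};
```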

(cherry picked from commit 7b3f55a65f)
2024-07-10 08:36:12 +00:00
Michał Chojnowski
80ff0688b1 logalloc: generalize refill_emergency_reserve()
In the next patch, we will want to do the same thing as
refill_emergency_reserve() does, just with a quantity different
from _emergency_reserve_max. So we split off the shareable part
into a new function, and use it to implement refill_emergency_reserve().

(cherry picked from commit f784be6a7e)
2024-07-10 08:36:10 +00:00
Raphael S. Carvalho
a319085870 compaction: Check for key presence in memtable when calculating max purgeable timestamp
It was observed that some use cases might constantly append old data to
the memtable, blocking GC of expired tombstones.

That's because the timestamp of the memtable is unconditionally used for
calculating the max purgeable timestamp, even when the memtable doesn't contain the
key of the tombstone we're trying to GC.

The idea is to treat memtable as we treat L0 sstables, i.e. it will
only prevent GC if it contains data that is possibly shadowed by the
expired tombstone (after checking for key presence and timestamp).

The memtable will usually hold a small subset of the keys in the largest tier,
so after this change, a large fraction of keys containing expired
tombstones can be GCed even when the memtable contains old data.
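
A sketch of the key-presence check with toy types (illustrative only):

```cpp
#include <limits>
#include <map>
#include <string>

using api_timestamp = long;

struct memtable_like {
    std::map<std::string, api_timestamp> min_live_timestamp; // per key
};

// The memtable only holds back tombstone GC if it actually contains the
// tombstone's key; otherwise it imposes no upper bound at all.
api_timestamp max_purgeable(const memtable_like& mt, const std::string& key) {
    auto it = mt.min_live_timestamp.find(key);
    return it == mt.min_live_timestamp.end()
        ? std::numeric_limits<api_timestamp>::max() // key absent: no limit
        : it->second;                               // key present: limit GC
}
```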

Fixes #17599.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 38699f6c3d)

Closes scylladb/scylladb#19551
2024-07-10 07:30:40 +03:00
Botond Dénes
b24bd4d176 Merge '[Backport 5.4] Reduce TWCS off-strategy space overhead' from Raphael "Raph" Carvalho
Normally, the space overhead for TWCS is 1/N, where N is the number of windows. But during off-strategy, the overhead is 100% because input sstables cannot be released early.

Reshaping a TWCS table that takes ~50% of available space can result in system running out of space.

That's fixed by restricting every TWCS off-strategy job to 10% of the free space on disk. Tables that aren't big will not be penalized with increased write amplification, as all input (disjoint) sstables can still be compacted in a single round.

Fixes https://github.com/scylladb/scylladb/issues/16514.

(cherry picked from commit b8bd4c51c2)

(cherry picked from commit 51c7ee889e)

(cherry picked from commit 0ce8ee03f1)

(cherry picked from commit ace4e5111e)

Refs https://github.com/scylladb/scylladb/pull/18137

note for maintainer: first cleanup (conflicting) patch was removed and doesn't change anything.

Closes scylladb/scylladb#19549

* github.com:scylladb/scylladb:
  compaction: Reduce twcs off-strategy space overhead to 10% of free space
  compaction: wire storage free space into reshape procedure
  sstables: Allow to get free space from underlying storage
2024-07-09 14:36:27 +03:00
Botond Dénes
f628e7439c reader_concurrency_semaphore: execution_loop(): move maybe_admit_waiters() to the inner loop
Now that the CPU concurrency limit is configurable, new reads might be
ready to execute right after the current one finishes. So move the
poll for admitting new reads into the inner loop, to prevent the
situation where the inner loop yields and a concurrent
do_wait_admission() finds that there are waiters (queued because at the
time they arrived at the semaphore, the _ready_list was not empty) but it
is possible to admit a new read. When this happens the semaphore will
dump diagnostics to help debug the apparent contradiction, which can
generate a lot of log spam. Moving the poll into the inner loop prevents
the false-positive contradiction detection from firing.

Refs: scylladb/scylladb#19017

Closes scylladb/scylladb#19600

(cherry picked from commit 155acbb306)
2024-07-09 13:11:25 +03:00
Botond Dénes
42da43b5b4 test/boost/reader_concurrency_semaphore_test: add test for live-configurable cpu concurrency
(cherry picked from commit b4f3809ad2)
2024-07-09 13:11:25 +03:00
Botond Dénes
679fa0f72a test/boost/reader_concurrency_semaphore_test: hoist require_can_admit
This is currently a lambda in a test; hoist it into the global scope and
make it into a function, so other tests can use it too (in the next
patch).

(cherry picked from commit 9cbdd8ef92)
2024-07-09 11:40:13 +03:00
Botond Dénes
89733a1f18 reader_concurrency_semaphore: wire in the configurable cpu concurrency
Before this patch, the semaphore was hard-wired to stop admission if
there was even a single permit in the need_cpu state, thereby keeping
the CPU concurrency at 1.
This patch makes use of the new cpu_concurrency parameter, which was
wired in by the previous patches, allowing for a configurable number of
concurrent need_cpu permits. This is to address workloads where some
small subset of reads are expected to be slow, and can hold up faster
reads behind them in the semaphore queue.
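
The shape of the admission check, as a sketch (names are illustrative):

```cpp
// Previously the check was effectively `need_cpu_permits < 1`, pinning
// the concurrency of cache-bound reads at one.
bool can_admit_more(unsigned need_cpu_permits, unsigned cpu_concurrency) {
    return need_cpu_permits < cpu_concurrency;
}
```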

(cherry picked from commit 07c0a8a6f8)
2024-07-09 11:40:13 +03:00
Botond Dénes
f185a227a2 reader_concurrency_semaphore: add cpu_concurrency constructor parameter
In the case of the user semaphore, this receives the new
reader_concurrency_semaphore_cpu_limit config item.
Not used yet.

(cherry picked from commit 59faa6d4ff)
2024-07-09 11:40:12 +03:00
Botond Dénes
89fd08b955 db/config: introduce reader_concurrency_semahore_cpu_concurrency
To allow increasing the semaphore's CPU concurrency, which is currently
hard-limited to 1. Not wired yet.

(cherry picked from commit c7317be09a)
2024-07-08 08:41:02 +03:00
Nadav Har'El
f29d51d9c3 cql: don't crash when creating a view during a truncate
The test dtest materialized_views_test.py::TestMaterializedViews::
test_mv_populating_from_existing_data_during_truncate reproduces an
assertion failure, and crash, while doing a CREATE MATERIALIZED VIEW
during a TRUNCATE operation.

This patch fixes the crash by removing the assert() call for a view
(replacing it with a warning message) - we'll explain below why this is fine.
Also, for base tables, we change the assertion to an on_internal_error
(Refs #7871).
This makes the test stop crashing Scylla, but it still fails due to
issue #17635.

Let's explain the crash, and the fix:

The test starts TRUNCATE on a table that doesn't yet have a view.
truncate_table_on_all_shards() begins by disabling compaction on
the table and all its views (of which there are none, at this
point). At this point, the test creates a new view on this table.
The new view has, by default, compaction enabled. Later, TRUNCATE
calls discard_sstables() on this new view, asserts that it has
compaction disabled - and this assertion fails.

The fix in this patch is to not do the assert() for views. In other words,
we acknowledge that in this use case, the view *will* have compactions
enabled while being truncated. I claim that this is "good enough", if we
remember *why* we disable compaction in the first place: It's important
to disable compaction while truncating because truncating during compaction
can lead us to data resurrection when the old sstable is deleted during
truncation but the result of the compaction is written back. True,
this can now happen in a new view (a view created *DURING* the
truncation). But I claim that worse things can happen for this
new view: Notably, we may truncate a view and then the ongoing
view building (which happens in a new view) might copy data from
the base to the view and only then truncate the base - ending up
with an empty base and non-empty view. This problem - issue #17635 -
is more likely, and more serious, than the compaction problem, so
will need to be solved in a separate patch.

Fixes #17543.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#17634

(cherry picked from commit 8df2ea3f95)

Closes scylladb/scylladb#19580
2024-07-05 11:19:33 +03:00
Avi Kivity
01d5169593 Merge '[Backport 5.4] Close output_stream in get_compaction_history() API handler' from ScyllaDB
If an httpd body writer is called with output_stream<>, it must close the stream on its own regardless of any exceptions it may generate while working; otherwise the stream destructor may trip the not-closed assertion. Stepped on with a different handler, see #19541

Coroutinize the handler as the first step while at it (though the fix would have been notably shorter if done with a .finally() lambda)

(cherry picked from commit acb351f4ee)

(cherry picked from commit 6d4ba98796)

(cherry picked from commit b4f9387a9d)

Refs #19543

Closes scylladb/scylladb#19602

* github.com:scylladb/scylladb:
  api: Close response stream of get_compaction_history()
  api: Flush output stream in get_compaction_history() call
  api: Coroutinize get_compaction_history inner function
2024-07-04 15:06:53 +03:00
Pavel Emelyanov
3116ea7d8e api: Close response stream of get_compaction_history()
The function must close the stream even if it throws along the way.
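
The general shape of such a fix, sketched in plain C++ (illustrative; the real code is a Seastar coroutine over `output_stream<char>`):

```cpp
#include <exception>

// The body and the explicit flush() may throw, but close() runs on every
// path, and any captured error is rethrown only after the stream has
// been closed.
template <typename Stream, typename Body>
void write_body(Stream& s, Body body) {
    std::exception_ptr ex;
    try {
        body(s);
        s.flush(); // flushed explicitly, so close() below need not throw
    } catch (...) {
        ex = std::current_exception();
    }
    s.close();
    if (ex) {
        std::rethrow_exception(ex);
    }
}
```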

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit b4f9387a9d)
2024-07-03 18:30:16 +00:00
Pavel Emelyanov
36ccf67bee api: Flush output stream in get_compaction_history() call
It's currently implicitly flushed on close, but in that case close
can throw while flushing. The next patch wants close not to throw, and that's
possible only if the stream is flushed in advance.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 6d4ba98796)
2024-07-03 18:30:15 +00:00
Pavel Emelyanov
bae47ca197 api: Coroutinize get_compaction_history inner function
The handler returns a function which is then invoked with an output_stream
argument to render the json into. This function is converted into a
coroutine. It has yet another inner lambda that's passed into
compaction_manager::get_compaction_history() as a consumer lambda. It's
coroutinized too.

The indentation looks weird as preparation for future patching.
Hopefully it's still possible to understand what's going on.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit acb351f4ee)
2024-07-03 18:30:15 +00:00
Tzach Livyatan
fdcbbb85ad Docs: Fix a typo in sstable-corruption.rst
(cherry picked from commit a7115124ce)

Closes scylladb/scylladb#19590
2024-07-03 10:24:15 +02:00
Piotr Dulikowski
65daae0fbe Merge '[Backport 5.4] cql3/statement/select_statement: do not parallelize single-partition aggregations #19414' from Michał Jadwiszczak
This patch adds a check for whether an aggregation query is doing a single-partition read; if so, the query does not use forward_service and the request is not parallelized.

Fixes scylladb/scylladb#19349

(cherry picked from commit e9ace7c203)

(cherry picked from commit 8eb5ca8202)

Refs scylladb/scylladb#19350

Closes scylladb/scylladb#19500

* github.com:scylladb/scylladb:
  test/boost/cql_query_test: add test for single-partition aggregation
  cql3/select_statement: do not parallelize single-partition aggregations
2024-07-03 05:57:49 +02:00
Amnon Heiman
f07fbcf929 replica/table.cc: Add metrics per-table-per-node
This patch adds metrics that will be reported per-table per-node.
The added metrics (that are part of the per-table per-shard metrics)
are:
scylla_column_family_cache_hit_rate
scylla_column_family_read_latency
scylla_column_family_write_latency
scylla_column_family_live_disk_space

Fixes #18642

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit bc3cc6777b)

Closes scylladb/scylladb#19553
2024-07-02 10:49:00 +02:00
Pavel Emelyanov
1d0a6672d6 Merge '[Backport 5.4] Close output stream in task manager's API get_tasks handler' from Pavel Emelyanov
If the client stops reading the response early, the server-side stream throws but must be closed anyway. Seen in another endpoint and fixed by https://github.com/scylladb/scylladb/pull/19541

Original commit is 0ce00ebfbd
Had simple conflict with ffb5ad494f

closes: #19560

Closes scylladb/scylladb#19571

* https://github.com/scylladb/scylladb:
  api: Fix indentation after previous patch
  api: Close response stream on error
  api: Flush response output stream before closing
2024-07-01 18:02:35 +03:00
Pavel Emelyanov
e16327034c [Backport 5.4] Close output_stream in get_snapshot_details() API handler
All streams used by httpd handlers are to be closed by the handler itself,
the caller doesn't take care of that.

On master this place is coroutinized, thus the original fix doesn't fit
as is. Original commit is 0ce00ebfbd

refs: #19494
closes: #19561

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#19570
2024-07-01 18:02:26 +03:00
Pavel Emelyanov
3510ff3179 api: Fix indentation after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 1be8b2fd25)
2024-07-01 10:58:57 +03:00
Pavel Emelyanov
e9b8d08b74 api: Close response stream on error
The handler's lambda is called with an rvalue (&&) stream object and must close
the stream on its own no matter what.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 986a04cb11)
2024-07-01 10:58:47 +03:00
Pavel Emelyanov
d67af8da1c api: Flush response output stream before closing
The .close() method flushes the stream, but it may throw while doing so.
The next patch will want .close() not to throw; for that, the stream must
be flushed explicitly before closing.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 4897d8f145)
2024-07-01 10:58:14 +03:00
Jenkins Promoter
66fc7c0494 Update ScyllaDB version to: 5.4.9 2024-06-30 16:23:57 +03:00
Raphael S. Carvalho
67be26ff7d compaction: Reduce twcs off-strategy space overhead to 10% of free space
TWCS off-strategy suffers from 100% space overhead, so a big TWCS table
can cause scylla to run out of disk space during node ops.

To avoid penalizing TWCS tables that take a small percentage of the disk
with increased write amplification, TWCS off-strategy will be restricted to
10% of free disk space. Small tables can still compact all their
disjoint sstables in a single round.
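
A sketch of the budgeting idea with hypothetical types (the real logic lives in the TWCS reshape code):

```cpp
#include <cstdint>
#include <vector>

struct sstable_desc { std::uint64_t size_bytes; };

// Greedily split the off-strategy input into jobs of at most 10% of the
// free disk space, so reshape never needs ~100% temporary headroom.
std::vector<std::vector<sstable_desc>>
plan_reshape(const std::vector<sstable_desc>& input, std::uint64_t free_space) {
    const std::uint64_t budget = free_space / 10;
    std::vector<std::vector<sstable_desc>> jobs(1);
    std::uint64_t used = 0;
    for (const auto& sst : input) {
        if (!jobs.back().empty() && used + sst.size_bytes > budget) {
            jobs.emplace_back(); // start a new round within the budget
            used = 0;
        }
        jobs.back().push_back(sst);
        used += sst.size_bytes;
    }
    return jobs;
}
```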

Fixes #16514.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit ace4e5111e)
2024-06-29 11:29:59 -03:00
Raphael S. Carvalho
97893a4f6d compaction: wire storage free space into reshape procedure
After this, TWCS reshape procedure can be changed to limit job
to 10% of available space.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 0ce8ee03f1)
2024-06-29 11:29:59 -03:00
Raphael S. Carvalho
ab9683d182 sstables: Allow to get free space from underlying storage
That will be used in turn to restrict reshape to 10% of available space
in underlying storage.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 51c7ee889e)
2024-06-29 11:29:57 -03:00
Yaron Kaikov
892ffa966d .github/scripts/label_promoted_commits.py: fix adding labels when PR is closed
`prs = response.json().get("items", [])` will return an empty list when there are no merged PRs, and this will just skip the whole label replacement process.

This is a regression following the work done in #19442

Adding another part to handle closed PRs (which are the majority of the cases we have in Scylla core)

Fixes: https://github.com/scylladb/scylladb/issues/19441
(cherry picked from commit efa94b06c2)

Closes scylladb/scylladb#19526
2024-06-27 20:58:56 +03:00
Botond Dénes
4b6e462266 Merge 'alternator: fix REST API access to an Alternator LSI' from Nadav Har'El
The name of the Scylla table backing an Alternator LSI looks like `basename:!lsiname`. Some REST API clients (including Scylla Manager), when they send a "!" character in the REST API request path, may decide to "URL encode" it - converting it to `%21`.

Because of a Seastar bug (https://github.com/scylladb/seastar/issues/725) Scylla's REST API server forgets to do the URL decoding on the path part of the request, which leads to the REST API request failing to address the LSI table.

The first patch in this PR fixes the bug by using a new Seastar API introduced in https://github.com/scylladb/seastar/pull/2125 that does the URL decoding as appropriate. The second patch in the PR is a new test for this bug, which fails without the fix, and passes afterwards.

Fixes #5883.

Closes scylladb/scylladb#18286

* github.com:scylladb/scylladb:
  test/alternator: test addressing LSI using REST API
  REST API: stop using deprecated, buggy, path parameter

(cherry picked from commit 0438febdc9)

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2024-06-27 20:58:56 +03:00
Nadav Har'El
ce22d0071b Update Seastar submodule
Update Seastar submodule with cherry-picked commit:

  > Merge 'http: path parameter parsing convenience methods' from Gellért Peresztegi-Nagy

This is required for cherry-picking the fix for #5883.
2024-06-27 20:58:56 +03:00
Aleksandra Martyniuk
a5d34b62ac repair: drop timeout from table_sync_and_check
Delete 10s timeout from read barrier in table_sync_and_check,
so that the function always considers all previous group0 changes.

Fixes: #18490.
(cherry picked from commit f947cc5477)

Closes scylladb/scylladb#19513
2024-06-27 14:58:09 +03:00
Botond Dénes
f121720898 Merge '[Backport 5.4] batchlog replay: bypass tombstones generated by past replays' from ScyllaDB
The `system.batchlog` table has a partition for each batch that failed to complete. After finally applying the batch, the partition is deleted. Although the table has gc_grace_seconds = 0, tombstones can still accumulate in memory, because we don't purge partition tombstones from either the memtable or the cache. This can cause the cache and memtable of this table to accumulate many thousands or even millions of tombstones, making batchlog replay very slow. We didn't notice this before, because we would only replay all failed batches on unbootstrap, which is rare, and a heavy and slow operation in its own right already.
With repair-based tombstone-gc however, we do a full batchlog replay at the beginning of each repair, and now this extra delay is noticeable.
Fix this by making sure batchlog replays don't have to scan through all the tombstones generated by previous replays:
* flush the `system.batchlog` memtable at the end of each batchlog replay, so it is cleared of tombstones
* bypass the cache

Fixes: https://github.com/scylladb/scylladb/issues/19376

Although this is not a regression -- replay has been like this forever -- now that repair calls into batchlog replay, every release which uses repair-based tombstone-gc should get this fix

(cherry picked from commit 4e96e320b4)

(cherry picked from commit 2dd057c96d)

(cherry picked from commit 29f610d861)

(cherry picked from commit 31c0fa07d8)

Refs #19377

Closes scylladb/scylladb#19501

* github.com:scylladb/scylladb:
  db/batchlog_manager: bypass cache when scanning batchlog table
  db/batchlog_manager: replace open-coded paging with internal one
  db/batchlog_manager: implement cleanup after all batchlog replay
  cql3/query_processor: for_each_cql_result(): move func to the coro frame
2024-06-27 14:57:19 +03:00
Botond Dénes
d217ab9cc7 Merge ' [Backport 5.4] schema: Make "describe" use extensions to string #19358 ' from Calle Wilund
Fixes https://github.com/scylladb/scylladb/issues/19334

The current impl uses hardcoded printing of a few extensions.
Instead, use the extensions' option-to-string conversion and print them all.

Note: required to make enterprise CI happy again.

(cherry picked from commit d27620e146)

(cherry picked from commit 73abc56d79)

Refs https://github.com/scylladb/scylladb/pull/19337

(Manually re-done from mergify fail #19358)

Closes scylladb/scylladb#19487

* github.com:scylladb/scylladb:
  schema: Make "describe" use extensions to string
  schema_extensions: Add an option to string method
2024-06-27 14:52:17 +03:00
Kamil Braun
6eac67628e Merge '[Backport 5.4] join_token_ring, gossip topology: recalculate sync nodes in wait_alive' from Patryk Jędrzejczak
The node booting in gossip topology waits until all NORMAL
nodes are UP. If we removed a different node just before,
the booting node could still see it as NORMAL and wait for
it to be UP, which would time out and fail the bootstrap.

This issue caused scylladb/scylladb#17526.

Fix it by recalculating the nodes to wait for in every step of the
`wait_alive` loop.

Although the issue fixed by this PR caused only test flakiness,
it could also manifest in real clusters. It's best to backport this
PR to 5.4 and 6.0.

Fixes scylladb/scylladb#17526

The first patch of the original PR merged to master,
017134fd38, differs because
the conflicting patch, e4c3c07510,
hasn't been backported to 5.4. However, we don't have to
additionally backport that patch. These changes are orthogonal.

Closes scylladb/scylladb#19506

* github.com:scylladb/scylladb:
  join_token_ring, gossip topology: update obsolete comment
  join_token_ring, gossip topology: fix indentation after previous patch
  join_token_ring, gossip topology: recalculate sync nodes in wait_alive
2024-06-26 18:38:40 +02:00
Patryk Jędrzejczak
a28b38d0a9 join_token_ring, gossip topology: update obsolete comment
The code mentioned in the comment has already been added. We change
the comment to prevent confusion.
2024-06-26 16:09:31 +02:00
Patryk Jędrzejczak
614cabf9cd join_token_ring, gossip topology: fix indentation after previous patch 2024-06-26 16:09:31 +02:00
Patryk Jędrzejczak
193fda6dfb join_token_ring, gossip topology: recalculate sync nodes in wait_alive
Before this patch, if we booted a node just after removing
a different node, the booting node may still see the removed node
as NORMAL and wait for it to be UP, which would time out and fail
the bootstrap.

This issue caused scylladb/scylladb#17526.

Fix it by recalculating the nodes to wait for in every step of the
`wait_alive` loop.

The original patch merged to master,
017134fd38, differs because
the conflicting patch, e4c3c07510,
hasn't been backported to 5.4. However, we don't have to
additionally backport that patch. These changes are orthogonal.
2024-06-26 16:09:30 +02:00
Yaron Kaikov
50f3f3d1a3 .github/workflow: close and replace label when backport promoted
Today, after Mergify opens a backport PR, it stays open until someone manually closes the backport PR. Also, we can't track with labels which backports were done, since there is no indication for that except digging into the PR and looking for a comment or a commit ref.

The following changes were made in this PR:
* trigger add-label-when-promoted.yaml also when the push was made to `branch-x.y`
* Replace label `backport/x.y` with `backport/x.y-done` in the original PR (this will automatically update the original Issue as well)
* Add a comment on the backport PR and close it

Fixes: https://github.com/scylladb/scylladb/issues/19441
(cherry picked from commit 394cba3e4b)

Closes scylladb/scylladb#19495
2024-06-26 12:44:27 +03:00
Botond Dénes
9ce5c2e6ce db/batchlog_manager: bypass cache when scanning batchlog table
Scans should not pollute the cache with cold data, in general. In the
case of the batchlog table, there is another reason to bypass the cache:
this table can have a lot of partition tombstones, which currently are
not purged from the cache. So in certain cases, using the cache can make
batch replay very slow, because it has to scan past tombstones of
already replayed batches.

(cherry picked from commit 31c0fa07d8)
2024-06-26 09:05:13 +00:00
Botond Dénes
2f2bc18376 db/batchlog_manager: replace open-coded paging with internal one
query_processor has built-in paging support, no need to open-code paging
in batchlog manager code.

(cherry picked from commit 29f610d861)
2024-06-26 09:05:13 +00:00
Botond Dénes
c795275675 db/batchlog_manager: implement cleanup after all batchlog replay
We have a commented code snippet from Origin with cleanup and a FIXME to
implement it. Origin flushes the memtables and kicks a compaction. We
only implement the flush here -- the flush will trigger a compaction
check and we leave it up to the compaction manager to decide when a
compaction is worthwhile.
This method used to be called only from unbootstrap, so a cleanup was
not really needed. Now it is also called at the end of repair, if the
table is using repair-based tombstone-gc. If the memtable is filled with
tombstones, this can add a lot of time to the runtime of each repair. So
flush the memtable at the end, so the tombstones can be purged (they
aren't purged from memtables yet).

(cherry picked from commit 2dd057c96d)
2024-06-26 09:05:12 +00:00
Botond Dénes
b13cee4c7c cql3/query_processor: for_each_cql_result(): move func to the coro frame
Said method has a func parameter (called just f), which it receives as
an rvalue ref and just uses as a reference. This means that if the caller
doesn't keep the func alive, for_each_cql_result() will run into a
use-after-free after the first suspension point. This is unexpected for
callers, who don't expect to have to keep something alive which they
passed in with std::move().
Adjust the signature to take a value instead; value parameters are moved
to the coro frame and survive suspension points.
Adjust internal callers (query_internal()) the same way.

There are no known vulnerable external callers.
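
A self-contained C++20 illustration of the pitfall and the fix (the `task` type and `for_each_result` are minimal stand-ins, not the actual query_processor code):

```cpp
#include <coroutine>
#include <functional>
#include <iostream>

struct task {
    struct promise_type {
        task get_return_object() { return {}; }
        std::suspend_never initial_suspend() noexcept { return {}; }
        std::suspend_never final_suspend() noexcept { return {}; }
        void return_void() {}
        void unhandled_exception() {}
    };
};

// With `std::function<void(int)>&& f` the coroutine frame would hold only
// a reference into the caller's frame; a temporary passed by the caller
// dies at the first suspension point, and f dangles.
// Taking f by value moves it into the coroutine frame, where it survives
// every suspension point.
task for_each_result(std::function<void(int)> f) {
    for (int i = 0; i < 3; ++i) {
        f(i);
        co_await std::suspend_never{}; // stand-in for a real suspension
    }
}

int main() {
    for_each_result([](int i) { std::cout << i << "\n"; });
}
```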

(cherry picked from commit 4e96e320b4)
2024-06-26 09:05:12 +00:00
Michał Jadwiszczak
bea1f4891d test/boost/cql_query_test: add test for single-partition aggregation
(cherry picked from commit 8eb5ca8202)
2024-06-26 08:57:50 +02:00
Michał Jadwiszczak
9535abf552 cql3/select_statement: do not parallelize single-partition aggregations
Currently, reads with a WHERE clause which limits them to be
single-partition reads are unnecessarily parallelized.

This commit checks this condition, and the query doesn't use
forward_service for single-partition reads.

(cherry picked from commit e9ace7c203)
2024-06-26 08:56:34 +02:00
Calle Wilund
abb4751e00 schema: Make "describe" use extensions to string
Fixes #19334

The current impl uses hardcoded printing of a few extensions.
Instead, use the extensions' option-to-string conversion and print them all.

(cherry picked from commit 73abc56d79)
2024-06-25 23:51:52 +00:00
Calle Wilund
39ec136a09 schema_extensions: Add an option to string method
Allow an extension to describe itself as the CQL property
string that created it (and is serialized to schema tables)

Only the paxos extension requires an override.

(cherry picked from commit d27620e146)
2024-06-25 12:54:13 +00:00
Nadav Har'El
b7ef9652fb test: unflake test test_alternator_ttl_scheduling_group
This test in topology_experimental_raft/test_alternator.py wants to
check that during Alternator TTL's expiration scans, ALL of the CPU was
used in the "streaming" scheduling group and not in the "statement"
scheduling group. But to allow for some fluke requests (e.g., from the
driver), the test actually allows work in the statement group to be
up to 1% of the work.

Unfortunately, in one test run - a very slow debug+aarch64 run - we
saw the work on the statement group reach 1.4%, failing the test.
I don't know exactly where this work comes from, perhaps the driver,
but before this bug was fixed we saw more than 58% of the work in the
wrong scheduling group, so neither 1% nor 1.4% is a sign that the bug
came back. In fact, let's just change the threshold in the test to 10%,
which is also much lower than the pre-fix value of 58%, so it is still a
valid regression test.

Fixes #19307

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#19323

(cherry picked from commit 9fc70a28ca)
2024-06-24 14:15:57 +03:00
Botond Dénes
d3b2702be1 replica/database: fix live-update enable_compacting_data_for_streaming_and_repair
This config item is propagated to the table object via table::config.
Although the field in table::config used to propagate the value was a
utils::updateable_value<T>, it was assigned a constant, and so the
live-update chain was broken.
This patch fixes this.
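
A toy illustration of how assigning a constant breaks a live-update chain (this `updateable_value` is a simplified stand-in for `utils::updateable_value`):

```cpp
#include <iostream>
#include <memory>
#include <utility>

// Copies share the underlying storage, so a config change is visible
// through every copy - unless some link in the chain is constructed from
// a plain constant, which snapshots the value.
template <typename T>
class updateable_value {
    std::shared_ptr<T> _v;
public:
    explicit updateable_value(T v) : _v(std::make_shared<T>(std::move(v))) {}
    void set(T v) { *_v = std::move(v); }
    const T& get() const { return *_v; }
};

int main() {
    updateable_value<bool> cfg(false);
    updateable_value<bool> table_cfg = cfg;   // chain preserved
    updateable_value<bool> broken(cfg.get()); // chain broken: a snapshot

    cfg.set(true);                            // live update
    std::cout << table_cfg.get() << " " << broken.get() << "\n"; // "1 0"
}
```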

(cherry picked from commit dbccb61636)

Closes scylladb/scylladb#19239
2024-06-21 20:14:57 +03:00
Wojciech Mitros
4ef4893f7e mv: replicate the gossiped backlog to all shards
On each shard of each node we store the view update backlogs of
other nodes in order to, depending on their size, delay responses to incoming
writes, lowering the load on these nodes and helping them get their
backlog back to normal if it grows too high.

These backlogs are propagated between nodes in two ways: the first
one is adding them to replica write responses. The second one
is gossiping any changes to the node's backlog every 1s. The gossip
becomes useful when writes stop to some node for some time and we
stop getting the backlog using the first method, but we still want
to be able to select a proper delay for new writes coming to this
node. It will also be needed for the mv admission control.

Currently, the backlog is gossiped from shard 0, as expected.
However, we also receive the backlog only on shard 0 and only
update this shard's backlogs for the other node. Instead, we'd
want to have the backlogs updated on all shards, allowing us
to use proper delays also when requests are received on shards
other than 0.

This patch changes the backlog update code, so that the backlogs
on all shards are updated instead. This will only be performed
up to once per second for each other node, and is done with
a lower priority, so it won't severely impact other work.

Fixes: scylladb/scylladb#19232
(cherry picked from commit d31437b589)

Closes scylladb/scylladb#19306
2024-06-21 09:58:40 +02:00
Calle Wilund
19999554e7 main/minio_server.py: Respect any preexisting AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY vars
Fixes scylladb/scylla-pkg#3845

Don't overwrite (or rather change) AWS credentials variables if already set in
enclosing environment. Ensures EAR tests for AWS KMS can run properly in CI.

v2:
* Allow environment variables when reading the object storage config - allows CI to
  use real credentials in the env without risking putting them into less secure
  files
* Don't write credentials info from the minio server into the config; instead use said
  environment vars to propagate creds.

v3:
* Fix python launch scripts to not clear the environment, thus retaining the above AWS env vars.

(cherry picked from commit 5056a98289)

Closes scylladb/scylladb#19336
2024-06-20 13:23:40 +03:00
Botond Dénes
43f77c71c7 [Backport 5.4] : Merge 'Fix usage of utils/chunked_vector::reserve_partial' from Lakshmi Narayanan Sreethar
utils/chunked_vector::reserve_partial: fix usage in callers

The method reserve_partial(), when used as documented, quits before the
intended capacity can be reserved fully. This can lead to overallocation
of memory in the last chunk when data is inserted into the chunked vector.
The method itself doesn't have any bug but the way it is being used by
the callers needs to be updated to get the desired behaviour.

Instead of calling it repeatedly with the value returned from the
previous call until it returns zero, it should be repeatedly called with
the intended size until the vector's capacity reaches that size.

This PR updates the method comment and all the callers to use it the
right way.
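
A toy illustration of the corrected call pattern (`toy_chunked_vector` is hypothetical):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

// Stand-in whose reserve_partial() grows capacity by at most one chunk
// per call, like the real chunked vector.
struct toy_chunked_vector {
    static constexpr std::size_t chunk = 8;
    std::size_t _capacity = 0;
    std::size_t capacity() const { return _capacity; }
    void reserve_partial(std::size_t n) {
        if (_capacity < n) {
            _capacity += std::min(chunk, n - _capacity);
        }
    }
};

int main() {
    toy_chunked_vector v;
    const std::size_t target = 20;
    // Corrected usage: keep asking for the full target size until the
    // capacity actually reaches it (not: feed the return value back in).
    while (v.capacity() < target) {
        v.reserve_partial(target);
    }
    std::cout << v.capacity() << "\n"; // 20
}
```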

Fixes #19254

Closes scylladb/scylladb#19279

* github.com:scylladb/scylladb:
  utils/large_bitset: remove unused includes identified by clangd
  utils/large_bitset: use thread::maybe_yield()
  test/boost/chunked_managed_vector_test: fix testcase tests_reserve_partial
  utils/lsa/chunked_managed_vector: fix reserve_partial()
  utils/chunked_vector: return void from reserve_partial and make_room
  test/boost/chunked_vector_test: fix testcase tests_reserve_partial
  utils/chunked_vector::reserve_partial: fix usage in callers

(cherry picked from commit b2ebc172d0)

Backported from #19308 to 5.4

Closes scylladb/scylladb#19355
2024-06-19 14:34:29 +03:00
Botond Dénes
4aa0b84ba7 Merge '[Backport 5.4] sstables_manager: use maintenance scheduling group to run components reload fiber' from Lakshmi Narayanan Sreethar
PR https://github.com/scylladb/scylladb/pull/18186 introduced a fiber that reloads reclaimed bloom filters when memory becomes available. Use maintenance scheduling group to run that fiber instead of running it in the main scheduling group.

Fixes https://github.com/scylladb/scylladb/issues/18675

(cherry picked from commit 79f6746298)

(cherry picked from commit 6f58768c46)

Backported from https://github.com/scylladb/scylladb/pull/18721 to 5.4.

Closes scylladb/scylladb#19354

* github.com:scylladb/scylladb:
  sstables_manager: use maintenance scheduling group to run components reload fiber
  sstables_manager: add member to store maintenance scheduling group
2024-06-18 16:29:07 +03:00
Botond Dénes
427127de57 Merge ' [Backport 5.4] alternator: keep TTL work in the maintenance scheduling group' from Nadav Har'El
This is a fairly elaborate backport of commit b2a500a9a1 to branch 5.4
The code patch itself is trivial, and backported cleanly. The big problem was the test, which was written using the "topology" test framework - because it needs to test a cluster, not a single node, because the scheduling group problem only happened when sending requests between different Scylla nodes.

I had to fix in the backport the following problems:
1. The test used a library function add_servers() which didn't exist in branch 5.4, so needed to switch to making three individual add_server() calls.
2. The test was randomly placed in the topology_experimental_raft directory, which runs with the tablets experimental flag enabled. In 5.4, the tablets code was broken with Alternator, and CreateTable fails (it fails in the callback to create tablets, and doesn't even get to check that tablets weren't requested). So I needed to move the test file to a different directory.
3. Even after moving the file, it still ran with the tablets experimental feature! It turns out that test.py enabled the tablets experimental feature unconditionally. This is a mistake, and I'm sure it was never intended (tablets were never meant to be supported in 5.4), so I removed the enabling of this feature. It's still enabled explicitly in the topology_experimental_raft directory.

After all that, the test passes with the patch, showing that the code fix is correct also for 5.4.

Closes scylladb/scylladb#19321

* github.com:scylladb/scylladb:
  alternator, scheduler: test reproducing RPC scheduling group bug
  test.py: don't enable "tablets" experimental feature
  main: add maintenance tenant to messaging_service's scheduling config
2024-06-18 12:29:20 +03:00
Lakshmi Narayanan Sreethar
d7b1116170 sstables_manager: use maintenance scheduling group to run components reload fiber
Fixes #18675

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 6f58768c46)
2024-06-18 14:41:46 +05:30
Lakshmi Narayanan Sreethar
72155312e5 sstables_manager: add member to store maintenance scheduling group
Store that maintenance scheduling group inside the sstables_manager. The
next patch will use this to run the components reloader fiber.

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 79f6746298)
2024-06-18 14:39:45 +05:30
Nadav Har'El
f8dcbc6037 alternator, scheduler: test reproducing RPC scheduling group bug
This patch adds a test for issue #18719: Although the Alternator TTL
work is supposedly done in the "streaming" scheduling group, it turned
out we had a bug where work sent on behalf of that code to other nodes
failed to inherit the correct scheduling group, and was done in the
normal ("statement") group.

Because this problem only happens when more than one node is involved,
the test is in the multi-node test framework test/topology_experimental_raft.

The test uses the Alternator API. We already had in that framework a
test using the Alternator API (a test for alternator+tablets), so in
this patch we move the common Alternator utility functions to a common
file, test_alternator.py, where I also put the new test.

The test is based on metrics: We write expiring data, wait for it to expire,
and then check the metrics on how much CPU work was done in the wrong
scheduling group ("statement"). Before #18719 was fixed, a lot of work
was done there (more than half of the work done in the right group).
After the issue was fixed in the previous patch, the work on the wrong
scheduling group went down to zero.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 1fe8f22d89)

Modifications in the cherry-pick:
 * Moved test to topology_custom directory, so it runs without tablets
 * Use the server_add() function instead of the newer add_servers(), which
   didn't yet exist in this branch.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2024-06-18 10:17:45 +03:00
Nadav Har'El
dc1968cb9e test.py: don't enable "tablets" experimental feature
This branch (5.4) does NOT support tablets, and we don't want to run
any tests with the "tablets" experimental feature. When we made test.py
enable that feature by default, it was probably considered harmless -
the partial implementation we had in this branch won't do anything if
tablets aren't actually enabled for a specific keyspace.

But unfortunately, Alternator doesn't work with tablets enabled (there
was a bug in the callback during table creation), so we can't run any
Alternator tests from test.py (like the one we want to backport for
Alternator TTL scheduling groups) unless we drop that experimental
feature.

Note that one specific test subdirectory,
test/topology_experimental_raft, does enable this experimental
flag. The others shouldn't.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2024-06-18 10:14:53 +03:00
Botond Dénes
7e40b658c8 main: add maintenance tenant to messaging_service's scheduling config
Currently only the user tenant (statement scheduling group) and the
system tenant (default scheduling group) exist, as we used to have only
user-initiated operations and system (internal) ones. Now there is a
need to distinguish between two kinds of system operation: foreground
and background. The former should use the system tenant while the
latter will use the new maintenance tenant (streaming scheduling group).
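
As a rough illustration only (the struct and names below are assumptions, not the actual messaging_service API), the tenant configuration described above boils down to a mapping like this:
```
#include <string>
#include <vector>

// Stand-in for seastar::scheduling_group, to keep the sketch self-contained.
struct scheduling_group {};

struct tenant {
    scheduling_group sg;
    std::string name;
};

// Hypothetical: user -> "statement" group, system -> default group,
// and the new maintenance tenant -> "streaming" group.
std::vector<tenant> make_tenants(scheduling_group statement_sg,
                                 scheduling_group default_sg,
                                 scheduling_group streaming_sg) {
    return {
        {statement_sg, "user"},        // user-initiated operations
        {default_sg,   "system"},      // foreground internal operations
        {streaming_sg, "maintenance"}, // background internal operations
    };
}
```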

(cherry picked from commit 5d3f7c13f9)
2024-06-18 10:08:46 +03:00
Tomasz Grabiec
58671274d8 test: pylib: Fetch all pages by default in run_async
Fetching only the first page is not the intuitive behavior expected by users.

This causes flakiness in some tests that generate a variable number of
keys depending on execution speed and later verify, using a single
SELECT statement, that all keys were written. When the number of keys
exceeds the page size, the test fails.

Fixes #18774

(cherry picked from commit 43b907b499)

Closes scylladb/scylladb#19129
2024-06-17 10:41:20 +02:00
Botond Dénes
c18f14cd78 Merge '[Backport 5.4] test: memtable_test: increase unspooled_dirty_soft_limit ' from ScyllaDB
Before this change, memtable_test expected that the memtables of ks.cf
were the only memtables being flushed. We inject 4 failures into the
flush code path and wait until all 4 are triggered. But in the
background, `dirty_memory_manager` flushes all tables when necessary,
so the total number of triggered failures is not necessarily the number
triggered while flushing ks.cf - some of them could be triggered when
flushing system tables. That's why we had sporadic failures from this
test: we might check `t.min_memtable_timestamp()` too soon.

After this change, we increase the `unspooled_dirty_soft_limit` setting
in order to disable `dirty_memory_manager`, so that the only flush is
the one performed by the test.

Fixes https://github.com/scylladb/scylladb/issues/19034

---

The issue applies to both 5.4 and 6.0, and it hurts CI stability, hence we should backport it.

(cherry picked from commit 2df4e9cfc2)

(cherry picked from commit 223fba3243)

Refs #19252

Closes scylladb/scylladb#19256

* github.com:scylladb/scylladb:
  test: memtable_test: increase unspooled_dirty_soft_limit
  test: memtable_test: replace BOOST_ASSERT with BOOST_REQUIRE
2024-06-14 15:50:58 +03:00
Michał Chojnowski
c19f980802 storage_proxy: avoid infinite growth of _throttled_writes
storage_proxy has a throttling mechanism which attempts to limit the number
of background writes by forcefully raising CL to ALL
(it's not implemented exactly like that, but that's the effect) when
the number of background and queued writes is above some fixed threshold.
If this is applied to a write, it becomes "throttled",
and its ID is appended to _throttled_writes.

Whenever the number of background and queued writes falls below the threshold,
writes are "unthrottled" — some IDs are popped from _throttled_writes
and the writes represented by these IDs — if their handlers still exist —
have their CL lowered back.

The problem here is that IDs are only ever removed from _throttled_writes
if the number of queued and background writes falls below the threshold.
But under constant write pressure this doesn't have to happen in any
finite time. In fact, in one load test it didn't happen for 3 hours,
eventually causing the buffer to grow to gigabytes and trigger an OOM.

This patch is intended to be a good-enough-in-practice fix for the problem.
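
The patch itself isn't shown here, but the essence of such a fix can be sketched as bounding the ID buffer (the names and the cap below are assumptions, not the actual storage_proxy code):
```
#include <cstddef>
#include <cstdint>
#include <deque>

// Hypothetical sketch: cap the buffer of throttled-write IDs. Dropping
// the oldest IDs is safe because unthrottling already tolerates IDs
// whose handlers no longer exist.
class throttled_write_ids {
    std::deque<uint64_t> _ids;
    static constexpr size_t max_entries = 100'000;  // assumed cap
public:
    void push(uint64_t id) {
        if (_ids.size() >= max_entries) {
            _ids.pop_front();  // oldest handler has most likely finished
        }
        _ids.push_back(id);
    }
    bool pop(uint64_t& id) {
        if (_ids.empty()) {
            return false;
        }
        id = _ids.front();
        _ids.pop_front();
        return true;
    }
};
```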

Fixes #17476
Fixes #1834

(cherry picked from commit 97e1518eb9)

Closes scylladb/scylladb#19179
2024-06-14 15:49:34 +03:00
Kamil Braun
0abccd212d raft: fsm: add details to on_internal_error_noexcept message
If we receive a message in the same term but from a different leader
than we expect, we print:
```
Got append request/install snapshot/read_quorum from an unexpected leader
```
For some reason the message did not include the details (who the
expected leader was and who the sender was), even though adding them
requires almost zero effort and they might be useful for debugging. So
let's include them.
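
A minimal sketch of the enriched message (the helper and the exact format string are assumptions, not the actual fsm code):
```
#include <fmt/core.h>
#include <string>

// Hypothetical helper: fold the operation name, the leader we expected,
// and the actual sender into the error message.
std::string unexpected_leader_msg(const std::string& op,
                                  const std::string& expected_leader,
                                  const std::string& sender) {
    return fmt::format("Got {} from an unexpected leader: expected {}, got {}",
                       op, expected_leader, sender);
}
```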

Ref: scylladb/scylla-enterprise#4276
(cherry picked from commit 99a0599e1e)

Closes scylladb/scylladb#19264
2024-06-13 11:24:28 +02:00
Kefu Chai
b8d0df24ed test: memtable_test: increase unspooled_dirty_soft_limit
Before this change, memtable_test expected that the memtables of ks.cf
were the only memtables being flushed. We inject 4 failures into the
flush code path and wait until all 4 are triggered. But in the
background, `dirty_memory_manager` flushes all tables when necessary,
so the total number of triggered failures is not necessarily the number
triggered while flushing ks.cf - some of them could be triggered when
flushing system tables. That's why we had sporadic failures from this
test: we might check `t.min_memtable_timestamp()` too soon.

After this change, we increase the `unspooled_dirty_soft_limit` setting
in order to disable `dirty_memory_manager`, so that the only flush is
the one performed by the test.

Fixes #19034
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 223fba3243)
2024-06-12 15:43:58 +00:00
Kefu Chai
b3de65a8fb test: memtable_test: replace BOOST_ASSERT with BOOST_REQUIRE
Before this change, we verified the behavior of the design under test
using `BOOST_ASSERT()`, which is a wrapper around `assert()`, so if a
test failed, it simply aborted. This is not very helpful for postmortem
debugging.

After this change, we use the `BOOST_REQUIRE` macro instead, so that
Boost.Test prints out the condition if it does not hold when we test it.

Refs #19034
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 2df4e9cfc2)
2024-06-12 15:43:58 +00:00
Kefu Chai
3eb15e841a docs: correct the link pointing to Scylla U
Before this change, the link pointed to
https://university.scylladb.com/courses/scylla-operations/lessons/change-data-capture-cdc/
which then redirects the browser to
https://university.scylladb.com/courses/scylla-operations/,
but it should have pointed to
https://university.scylladb.com/courses/data-modeling/lessons/change-data-capture-cdc/

This change corrects the hyperlink.

Fixes #19163
Refs 6e97b83b60
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit b5dce7e3d0)

Closes scylladb/scylladb#19197
2024-06-11 18:13:58 +03:00
Jenkins Promoter
017524c7d8 Update ScyllaDB version to: 5.4.8 2024-06-10 12:22:35 +03:00
Wojciech Mitros
1680bc2902 mv gossip: check errno instead of value returned by strtoull
Currently, when a view update backlog is changed and sent
using gossip, we check whether the strtoll/strtoull
function used for reading the backlog returned
LLONG_MAX/ULLONG_MAX, signaling that the value exceeded
the type's limit, and if so, we do not store it as the new
value for the node.

However, ULLONG_MAX can also be a legitimate value: it is
used as the max backlog size when sending empty backlogs
that were never updated. In theory, we could avoid sending
the default backlog, because each node has its real backlog
(based on the node's memory, different from the ULLONG_MAX
used in the default backlog). In practice, if a node's
backlog changed to 0, the backlog it sends will likely be
the default one: when selecting the biggest backlog across
the node's shards, we use operator<=>(), which treats the
default backlog as equal to an empty backlog, so we may get
the default backlog during the comparison if the backlog of
some shard was never changed (it is also the initial max
value we compare the shards' backlogs against).

This patch removes the (U)LLONG_MAX check and replaces it
with a check of errno, which strtoll/strtoull set to ERANGE
on overflow, and which won't prevent empty backlogs from
being read.
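
As a minimal sketch of the errno-based pattern (illustrative only; the actual gossip parsing code differs):
```
#include <cerrno>
#include <cstdlib>
#include <optional>
#include <string>

// ULLONG_MAX is a legitimate backlog value here, so the only reliable
// overflow signal is errno == ERANGE after the call.
std::optional<unsigned long long> parse_backlog(const std::string& s) {
    errno = 0;  // strtoull sets errno on error but never clears it
    char* end = nullptr;
    unsigned long long v = std::strtoull(s.c_str(), &end, 10);
    if (errno == ERANGE || end == s.c_str()) {
        return std::nullopt;  // overflow, or no digits were parsed
    }
    return v;  // ULLONG_MAX passes through as a valid value
}
```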

Fixes: #18462
(cherry picked from commit 5154429713)

Closes scylladb/scylladb#18697
2024-06-05 09:16:07 +03:00
Lakshmi Narayanan Sreethar
2e836fa077 db/config.cc: increment components_memory_reclaim_threshold config default
Incremented the components_memory_reclaim_threshold config's default
value to 0.2 as the previous value was too strict and caused unnecessary
eviction in otherwise healthy clusters.

Fixes #18607

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
(cherry picked from commit 3d7d1fa72a)

Closes scylladb/scylladb#19013
2024-06-04 07:11:43 +03:00
101 changed files with 1752 additions and 542 deletions

.github/scripts/label_promoted_commits.py vendored Executable file
View File

@@ -0,0 +1,87 @@
from github import Github
import argparse
import re
import sys
import os

try:
    github_token = os.environ["GITHUB_TOKEN"]
except KeyError:
    print("Please set the 'GITHUB_TOKEN' environment variable")
    sys.exit(1)


def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--repository', type=str, required=True,
                        help='Github repository name (e.g., scylladb/scylladb)')
    parser.add_argument('--commit_before_merge', type=str, required=True,
                        help='Git commit ID to start labeling from (newest commit).')
    parser.add_argument('--commit_after_merge', type=str, required=True,
                        help='Git commit ID to end labeling at (oldest commit, exclusive).')
    parser.add_argument('--update_issue', type=bool, default=False,
                        help='Set True to update issues when backport was done')
    parser.add_argument('--ref', type=str, required=True, help='PR target branch')
    return parser.parse_args()


def add_comment_and_close_pr(pr, comment):
    if pr.state == 'open':
        pr.create_issue_comment(comment)
        pr.edit(state="closed")


def mark_backport_done(repo, ref_pr_number, branch):
    pr = repo.get_pull(int(ref_pr_number))
    label_to_remove = f'backport/{branch}'
    label_to_add = f'{label_to_remove}-done'
    current_labels = [label.name for label in pr.get_labels()]
    if label_to_remove in current_labels:
        pr.remove_from_labels(label_to_remove)
    if label_to_add not in current_labels:
        pr.add_to_labels(label_to_add)


def main():
    # This script is triggered by a push event to either the master branch or a
    # branch named branch-x.y (where x and y represent version numbers). Based on
    # the pushed branch, the script performs the following actions:
    # - When the ref branch is `master`, it adds the `promoted-to-master` label,
    #   which we need later for the auto-backport process.
    # - When the ref branch is `branch-x.y` (which means we backported a patch),
    #   it replaces the `backport/x.y` label on the original PR with
    #   `backport/x.y-done` and closes the backport PR (since GitHub only
    #   auto-closes the PR referring to the default branch).
    args = parser()
    pr_pattern = re.compile(r'Closes .*#([0-9]+)')
    target_branch = re.search(r'branch-(\d+\.\d+)', args.ref)
    g = Github(github_token)
    repo = g.get_repo(args.repository, lazy=False)
    commits = repo.compare(head=args.commit_after_merge, base=args.commit_before_merge)
    processed_prs = set()
    # Print commit information
    for commit in commits.commits:
        print(f'Commit sha is: {commit.sha}')
        match = pr_pattern.search(commit.commit.message)
        if match:
            pr_number = int(match.group(1))
            if pr_number in processed_prs:
                continue
            if target_branch:
                pr = repo.get_pull(pr_number)
                branch_name = target_branch[1]
                refs_pr = re.findall(r'Refs (?:#|https.*?)(\d+)', pr.body)
                if refs_pr:
                    print(f'branch-{target_branch.group(1)}, pr number is: {pr_number}')
                    # 1. change the backport label of the parent PR to note that
                    #    we've merged the corresponding backport PR
                    # 2. close the backport PR and leave a comment on it to note
                    #    that it has been merged with a certain git commit
                    ref_pr_number = refs_pr[0]
                    mark_backport_done(repo, ref_pr_number, branch_name)
                    comment = f'Closed via {commit.sha}'
                    add_comment_and_close_pr(pr, comment)
            else:
                print(f'master branch, pr number is: {pr_number}')
                pr = repo.get_pull(pr_number)
                pr.add_to_labels('promoted-to-master')
            processed_prs.add(pr_number)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,36 @@
name: Check if commits are promoted
on:
  push:
    branches:
      - master
      - branch-*.*
env:
  DEFAULT_BRANCH: 'master'
jobs:
  check-commit:
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
      issues: write
    steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ env.DEFAULT_BRANCH }}
          fetch-depth: 0  # Fetch all history for all tags and branches
      - name: Install dependencies
        run: sudo apt-get install -y python3-github
      - name: Run python script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: python .github/scripts/label_promoted_commits.py --commit_before_merge ${{ github.event.before }} --commit_after_merge ${{ github.event.after }} --repository ${{ github.repository }} --ref ${{ github.ref }}

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.4.7
VERSION=5.4.10
if test -f version
then

View File

@@ -208,7 +208,10 @@ protected:
sstring local_dc = topology.get_datacenter();
std::unordered_set<gms::inet_address> local_dc_nodes = topology.get_datacenter_endpoints().at(local_dc);
for (auto& ip : local_dc_nodes) {
if (_gossiper.is_alive(ip)) {
// Note that it's not enough for the node to be is_alive() - a
// node joining the cluster is also "alive" but not responsive to
// requests. We need the node to be in normal state. See #19694.
if (_gossiper.is_normal(ip)) {
rjson::push_back(results, rjson::from_string(ip.to_sstring()));
}
}

View File

@@ -314,7 +314,7 @@ void req_params::process(const request& req) {
continue;
}
try {
ent.value = req.param[name];
ent.value = req.get_path_param(name);
} catch (std::out_of_range&) {
throw httpd::bad_param_exception(fmt::format("Mandatory parameter '{}' was not provided", name));
}

View File

@@ -54,7 +54,7 @@ static const char* str_to_regex(const sstring& v) {
void set_collectd(http_context& ctx, routes& r) {
cd::get_collectd.set(r, [](std::unique_ptr<request> req) {
auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
auto id = ::make_shared<scollectd::type_instance_id>(req->get_path_param("pluginid"),
req->get_query_param("instance"), req->get_query_param("type"),
req->get_query_param("type_instance"));
@@ -91,7 +91,7 @@ void set_collectd(http_context& ctx, routes& r) {
});
cd::enable_collectd.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
std::regex plugin(req->param["pluginid"].c_str());
std::regex plugin(req->get_path_param("pluginid").c_str());
std::regex instance(str_to_regex(req->get_query_param("instance")));
std::regex type(str_to_regex(req->get_query_param("type")));
std::regex type_instance(str_to_regex(req->get_query_param("type_instance")));

View File

@@ -333,7 +333,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t{0}, [](replica::column_family& cf) {
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed(std::mem_fn(&replica::memtable::partition_count)), uint64_t(0));
}, std::plus<>());
});
@@ -353,7 +353,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_memtable_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().total_space();
}), uint64_t(0));
@@ -369,7 +369,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_memtable_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
return boost::accumulate(cf.active_memtables() | boost::adaptors::transformed([] (replica::memtable* active_memtable) {
return active_memtable->region().occupancy().used_space();
}), uint64_t(0));
@@ -394,7 +394,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_cf_all_memtables_off_heap_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
warn(unimplemented::cause::INDEXES);
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
return cf.occupancy().total_space();
}, std::plus<int64_t>());
});
@@ -410,7 +410,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_cf_all_memtables_live_data_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
warn(unimplemented::cause::INDEXES);
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
return cf.occupancy().used_space();
}, std::plus<int64_t>());
});
@@ -425,7 +425,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats(ctx,req->param["name"] ,&replica::column_family_stats::memtable_switch_count);
return get_cf_stats(ctx,req->get_path_param("name") ,&replica::column_family_stats::memtable_switch_count);
});
cf::get_all_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -434,7 +434,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// FIXME: this refers to partitions, not rows.
cf::get_estimated_row_size_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
utils::estimated_histogram res(0);
for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
res.merge(i->get_stats_metadata().estimated_partition_size);
@@ -446,7 +446,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// FIXME: this refers to partitions, not rows.
cf::get_estimated_row_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
uint64_t res = 0;
for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
res += i->get_stats_metadata().estimated_partition_size.count();
@@ -457,7 +457,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_estimated_column_count_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
utils::estimated_histogram res(0);
for (auto sstables = cf.get_sstables(); auto& i : *sstables) {
res.merge(i->get_stats_metadata().estimated_cells_count);
@@ -474,7 +474,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_pending_flushes.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats(ctx,req->param["name"] ,&replica::column_family_stats::pending_flushes);
return get_cf_stats(ctx,req->get_path_param("name") ,&replica::column_family_stats::pending_flushes);
});
cf::get_all_pending_flushes.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -482,7 +482,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_read.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats_count(ctx,req->param["name"] ,&replica::column_family_stats::reads);
return get_cf_stats_count(ctx,req->get_path_param("name") ,&replica::column_family_stats::reads);
});
cf::get_all_read.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -490,7 +490,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_write.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats_count(ctx, req->param["name"] ,&replica::column_family_stats::writes);
return get_cf_stats_count(ctx, req->get_path_param("name") ,&replica::column_family_stats::writes);
});
cf::get_all_write.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -498,19 +498,19 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::reads);
return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::reads);
});
cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_rate_and_histogram(ctx, req->param["name"], &replica::column_family_stats::reads);
return get_cf_rate_and_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::reads);
});
cf::get_read_latency.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats_sum(ctx,req->param["name"] ,&replica::column_family_stats::reads);
return get_cf_stats_sum(ctx,req->get_path_param("name") ,&replica::column_family_stats::reads);
});
cf::get_write_latency.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats_sum(ctx, req->param["name"] ,&replica::column_family_stats::writes);
return get_cf_stats_sum(ctx, req->get_path_param("name") ,&replica::column_family_stats::writes);
});
cf::get_all_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -522,11 +522,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::writes);
return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::writes);
});
cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_rate_and_histogram(ctx, req->param["name"], &replica::column_family_stats::writes);
return get_cf_rate_and_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::writes);
});
cf::get_all_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -538,7 +538,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), [](replica::column_family& cf) {
return cf.estimate_pending_compactions();
}, std::plus<int64_t>());
});
@@ -550,7 +550,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_stats(ctx, req->param["name"], &replica::column_family_stats::live_sstable_count);
return get_cf_stats(ctx, req->get_path_param("name"), &replica::column_family_stats::live_sstable_count);
});
cf::get_all_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -558,11 +558,11 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_unleveled_sstables.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_unleveled_sstables(ctx, req->param["name"]);
return get_cf_unleveled_sstables(ctx, req->get_path_param("name"));
});
cf::get_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return sum_sstable(ctx, req->param["name"], false);
return sum_sstable(ctx, req->get_path_param("name"), false);
});
cf::get_all_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -570,7 +570,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return sum_sstable(ctx, req->param["name"], true);
return sum_sstable(ctx, req->get_path_param("name"), true);
});
cf::get_all_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
@@ -579,7 +579,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// FIXME: this refers to partitions, not rows.
cf::get_min_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], INT64_MAX, min_partition_size, min_int64);
return map_reduce_cf(ctx, req->get_path_param("name"), INT64_MAX, min_partition_size, min_int64);
});
// FIXME: this refers to partitions, not rows.
@@ -589,7 +589,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// FIXME: this refers to partitions, not rows.
cf::get_max_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], int64_t(0), max_partition_size, max_int64);
return map_reduce_cf(ctx, req->get_path_param("name"), int64_t(0), max_partition_size, max_int64);
});
// FIXME: this refers to partitions, not rows.
@@ -600,7 +600,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// FIXME: this refers to partitions, not rows.
cf::get_mean_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
// Cassandra 3.x mean values are truncated as integrals.
return map_reduce_cf(ctx, req->param["name"], integral_ratio_holder(), mean_partition_size, std::plus<integral_ratio_holder>());
return map_reduce_cf(ctx, req->get_path_param("name"), integral_ratio_holder(), mean_partition_size, std::plus<integral_ratio_holder>());
});
// FIXME: this refers to partitions, not rows.
@@ -610,7 +610,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
auto sstables = cf.get_sstables();
return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return s + sst->filter_get_false_positive();
@@ -628,7 +628,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_recent_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
auto sstables = cf.get_sstables();
return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return s + sst->filter_get_recent_false_positive();
@@ -646,7 +646,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), ratio_holder(), [] (replica::column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
@@ -658,7 +658,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), ratio_holder(), [] (replica::column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
@@ -670,7 +670,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
auto sstables = cf.get_sstables();
return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return s + sst->filter_size();
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
auto sstables = cf.get_sstables();
return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return s + sst->filter_memory_size();
@@ -706,7 +706,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), uint64_t(0), [] (replica::column_family& cf) {
auto sstables = cf.get_sstables();
return std::accumulate(sstables->begin(), sstables->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return s + sst->get_summary().memory_footprint();
@@ -729,7 +729,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
// We are missing the off heap memory calculation
// Return 0 is the wrong value. It's a work around
// until the memory calculation will be available
//auto id = get_uuid(req->param["name"], ctx.db.local());
//auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
return make_ready_future<json::json_return_type>(0);
});
@@ -742,7 +742,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_speculative_retries.set(r, [] (std::unique_ptr<http::request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
//auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
return make_ready_future<json::json_return_type>(0);
});
@@ -755,7 +755,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_key_cache_hit_rate.set(r, [] (std::unique_ptr<http::request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
//auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
return make_ready_future<json::json_return_type>(0);
});
@@ -780,7 +780,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_row_cache_hit_out_of_range.set(r, [] (std::unique_ptr<http::request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
//auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
return make_ready_future<json::json_return_type>(0);
});
@@ -791,7 +791,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_row_cache_hit.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const replica::column_family& cf) {
return map_reduce_cf_raw(ctx, req->get_path_param("name"), utils::rate_moving_average(), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().hits.rate();
}, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
return make_ready_future<json::json_return_type>(meter_to_json(m));
@@ -807,7 +807,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_row_cache_miss.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const replica::column_family& cf) {
return map_reduce_cf_raw(ctx, req->get_path_param("name"), utils::rate_moving_average(), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().misses.rate();
}, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
return make_ready_future<json::json_return_type>(meter_to_json(m));
@@ -824,57 +824,57 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_cas_prepare.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
return cf.get_stats().cas_prepare.histogram();
});
});
cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
return cf.get_stats().cas_accept.histogram();
});
});
cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
return cf.get_stats().cas_learn.histogram();
});
});
cf::get_sstables_per_read_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, req->get_path_param("name"), utils::estimated_histogram(0), [](replica::column_family& cf) {
return cf.get_stats().estimated_sstable_per_read;
},
utils::estimated_histogram_merge, utils_json::estimated_histogram());
});
cf::get_tombstone_scanned_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::tombstone_scanned);
return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::tombstone_scanned);
});
cf::get_live_scanned_histogram.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return get_cf_histogram(ctx, req->param["name"], &replica::column_family_stats::live_scanned);
return get_cf_histogram(ctx, req->get_path_param("name"), &replica::column_family_stats::live_scanned);
});
cf::get_col_update_time_delta_histogram.set(r, [] (std::unique_ptr<http::request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
//auto id = get_uuid(req->get_path_param("name"), ctx.db.local());
std::vector<double> res;
return make_ready_future<json::json_return_type>(res);
});
cf::get_auto_compaction.set(r, [&ctx] (const_req req) {
auto uuid = get_uuid(req.param["name"], ctx.db.local());
auto uuid = get_uuid(req.get_path_param("name"), ctx.db.local());
replica::column_family& cf = ctx.db.local().find_column_family(uuid);
return !cf.is_auto_compaction_disabled_by_user();
});
cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/enable_auto_compaction: name={}", req->param["name"]);
apilog.info("column_family/enable_auto_compaction: name={}", req->get_path_param("name"));
return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
return foreach_column_family(ctx, req->get_path_param("name"), [](replica::column_family &cf) {
cf.enable_auto_compaction();
}).then([g = std::move(g)] {
return make_ready_future<json::json_return_type>(json_void());
@@ -883,10 +883,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/disable_auto_compaction: name={}", req->param["name"]);
apilog.info("column_family/disable_auto_compaction: name={}", req->get_path_param("name"));
return ctx.db.invoke_on(0, [&ctx, req = std::move(req)] (replica::database& db) {
auto g = replica::database::autocompaction_toggle_guard(db);
return foreach_column_family(ctx, req->param["name"], [](replica::column_family &cf) {
return foreach_column_family(ctx, req->get_path_param("name"), [](replica::column_family &cf) {
return cf.disable_auto_compaction();
}).then([g = std::move(g)] {
return make_ready_future<json::json_return_type>(json_void());
@@ -895,14 +895,14 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_tombstone_gc.set(r, [&ctx] (const_req req) {
auto uuid = get_uuid(req.param["name"], ctx.db.local());
auto uuid = get_uuid(req.get_path_param("name"), ctx.db.local());
replica::table& t = ctx.db.local().find_column_family(uuid);
return t.tombstone_gc_enabled();
});
cf::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/enable_tombstone_gc: name={}", req->param["name"]);
return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
apilog.info("column_family/enable_tombstone_gc: name={}", req->get_path_param("name"));
return foreach_column_family(ctx, req->get_path_param("name"), [](replica::table& t) {
t.set_tombstone_gc_enabled(true);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
@@ -910,8 +910,8 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
apilog.info("column_family/disable_tombstone_gc: name={}", req->param["name"]);
return foreach_column_family(ctx, req->param["name"], [](replica::table& t) {
apilog.info("column_family/disable_tombstone_gc: name={}", req->get_path_param("name"));
return foreach_column_family(ctx, req->get_path_param("name"), [](replica::table& t) {
t.set_tombstone_gc_enabled(false);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
@@ -919,7 +919,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_built_indexes.set(r, [&ctx, &sys_ks](std::unique_ptr<http::request> req) {
auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
auto ks_cf = parse_fully_qualified_cf_name(req->get_path_param("name"));
auto&& ks = std::get<0>(ks_cf);
auto&& cf_name = std::get<1>(ks_cf);
return sys_ks.local().load_view_build_progress().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace_view_build_progress>& vb) mutable {
@@ -957,7 +957,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_compression_ratio.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto uuid = get_uuid(req->param["name"], ctx.db.local());
auto uuid = get_uuid(req->get_path_param("name"), ctx.db.local());
return ctx.db.map_reduce(sum_ratio<double>(), [uuid](replica::database& db) {
replica::column_family& cf = db.find_column_family(uuid);
@@ -968,21 +968,21 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_read_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
return cf.get_stats().reads.histogram();
});
});
cf::get_write_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<http::request> req) {
return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const replica::column_family& cf) {
return map_reduce_cf_time_histogram(ctx, req->get_path_param("name"), [](const replica::column_family& cf) {
return cf.get_stats().writes.histogram();
});
});
cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<http::request> req) {
sstring strategy = req->get_query_param("class_name");
apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->param["name"], strategy);
return foreach_column_family(ctx, req->param["name"], [strategy](replica::column_family& cf) {
apilog.info("column_family/set_compaction_strategy_class: name={} strategy={}", req->get_path_param("name"), strategy);
return foreach_column_family(ctx, req->get_path_param("name"), [strategy](replica::column_family& cf) {
cf.set_compaction_strategy(sstables::compaction_strategy::type(strategy));
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
@@ -990,7 +990,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_compaction_strategy_class.set(r, [&ctx](const_req req) {
return ctx.db.local().find_column_family(get_uuid(req.param["name"], ctx.db.local())).get_compaction_strategy().name();
return ctx.db.local().find_column_family(get_uuid(req.get_path_param("name"), ctx.db.local())).get_compaction_strategy().name();
});
cf::set_compression_parameters.set(r, [](std::unique_ptr<http::request> req) {
@@ -1006,7 +1006,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
});
cf::get_sstable_count_per_level.set(r, [&ctx](std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(ctx, req->param["name"], std::vector<uint64_t>(), [](const replica::column_family& cf) {
return map_reduce_cf_raw(ctx, req->get_path_param("name"), std::vector<uint64_t>(), [](const replica::column_family& cf) {
return cf.sstable_count_per_level();
}, concat_sstable_count_per_level).then([](const std::vector<uint64_t>& res) {
return make_ready_future<json::json_return_type>(res);
@@ -1015,7 +1015,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::get_sstables_for_key.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto key = req->get_query_param("key");
auto uuid = get_uuid(req->param["name"], ctx.db.local());
auto uuid = get_uuid(req->get_path_param("name"), ctx.db.local());
return ctx.db.map_reduce0([key, uuid] (replica::database& db) -> future<std::unordered_set<sstring>> {
auto sstables = co_await db.find_column_family(uuid).get_sstables_by_partition_key(key);
@@ -1031,7 +1031,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
cf::toppartitions.set(r, [&ctx] (std::unique_ptr<http::request> req) {
auto name = req->param["name"];
auto name = req->get_path_param("name");
auto [ks, cf] = parse_fully_qualified_cf_name(name);
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
@@ -1058,7 +1058,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
}
auto [ks, cf] = parse_fully_qualified_cf_name(*params.get("name"));
auto flush = params.get_as<bool>("flush_memtables").value_or(true);
apilog.info("column_family/force_major_compaction: name={} flush={}", req->param["name"], flush);
apilog.info("column_family/force_major_compaction: name={} flush={}", req->get_path_param("name"), flush);
auto keyspace = validate_keyspace(ctx, ks);
std::vector<table_info> table_infos = {table_info{

View File

@@ -7,6 +7,7 @@
*/
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/exception.hh>
#include "compaction_manager.hh"
#include "compaction/compaction_manager.hh"
@@ -109,7 +110,7 @@ void set_compaction_manager(http_context& ctx, routes& r) {
});
cm::stop_keyspace_compaction.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto ks_name = validate_keyspace(ctx, req->param);
auto ks_name = validate_keyspace(ctx, req);
auto table_names = parse_tables(ks_name, ctx, req->query_parameters, "tables");
if (table_names.empty()) {
table_names = map_keys(ctx.db.local().find_keyspace(ks_name).metadata().get()->cf_meta_data());
@@ -152,10 +153,13 @@ void set_compaction_manager(http_context& ctx, routes& r) {
});
cm::get_compaction_history.set(r, [&ctx] (std::unique_ptr<http::request> req) {
std::function<future<>(output_stream<char>&&)> f = [&ctx](output_stream<char>&& s) {
return do_with(output_stream<char>(std::move(s)), true, [&ctx] (output_stream<char>& s, bool& first){
return s.write("[").then([&ctx, &s, &first] {
return ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable {
std::function<future<>(output_stream<char>&&)> f = [&ctx] (output_stream<char>&& out) -> future<> {
auto s = std::move(out);
bool first = true;
std::exception_ptr ex;
try {
co_await s.write("[");
co_await ctx.db.local().get_compaction_manager().get_compaction_history([&s, &first](const db::compaction_history_entry& entry) mutable -> future<> {
cm::history h;
h.id = entry.id.to_sstring();
h.ks = std::move(entry.ks);
@@ -169,18 +173,21 @@ void set_compaction_manager(http_context& ctx, routes& r) {
e.value = it.second;
h.rows_merged.push(std::move(e));
}
auto fut = first ? make_ready_future<>() : s.write(", ");
if (!first) {
co_await s.write(", ");
}
first = false;
return fut.then([&s, h = std::move(h)] {
return formatter::write(s, h);
});
}).then([&s] {
return s.write("]").then([&s] {
return s.close();
});
co_await formatter::write(s, h);
});
});
});
co_await s.write("]");
co_await s.flush();
} catch (...) {
ex = std::current_exception();
}
co_await s.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
};
return make_ready_future<json::json_return_type>(std::move(f));
});

View File

@@ -91,7 +91,7 @@ void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx
});
cs::find_config_id.set(r, [&cfg] (const_req r) {
auto id = r.param["id"];
auto id = r.get_path_param("id");
for (auto&& cfg_ref : cfg.values()) {
auto&& cfg = cfg_ref.get();
if (id == cfg.name()) {

View File

@@ -24,7 +24,7 @@ namespace hf = httpd::error_injection_json;
void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->param["injection"];
sstring injection = req->get_path_param("injection");
bool one_shot = req->get_query_param("one_shot") == "True";
auto params = req->content;
@@ -56,7 +56,7 @@ void set_error_injection(http_context& ctx, routes& r) {
});
hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->param["injection"];
sstring injection = req->get_path_param("injection");
auto& errinj = utils::get_local_injector();
return errinj.disable_on_all(injection).then([] {
@@ -72,7 +72,7 @@ void set_error_injection(http_context& ctx, routes& r) {
});
hf::message_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->param["injection"];
sstring injection = req->get_path_param("injection");
auto& errinj = utils::get_local_injector();
return errinj.receive_message_on_all(injection).then([] {
return make_ready_future<json::json_return_type>(json::json_void());

View File

@@ -80,9 +80,9 @@ void set_failure_detector(http_context& ctx, routes& r, gms::gossiper& g) {
fd::get_endpoint_state.set(r, [&g] (std::unique_ptr<request> req) {
return g.container().invoke_on(0, [req = std::move(req)] (gms::gossiper& g) {
auto state = g.get_endpoint_state_ptr(gms::inet_address(req->param["addr"]));
auto state = g.get_endpoint_state_ptr(gms::inet_address(req->get_path_param("addr")));
if (!state) {
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->param["addr"]));
return make_ready_future<json::json_return_type>(format("unknown endpoint {}", req->get_path_param("addr")));
}
std::stringstream ss;
g.append_endpoint_state(ss, *state);

View File

@@ -31,21 +31,21 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
});
httpd::gossiper_json::get_endpoint_downtime.set(r, [&g] (std::unique_ptr<request> req) -> future<json::json_return_type> {
gms::inet_address ep(req->param["addr"]);
gms::inet_address ep(req->get_path_param("addr"));
// synchronize unreachable_members on all shards
co_await g.get_unreachable_members_synchronized();
co_return g.get_endpoint_downtime(ep);
});
httpd::gossiper_json::get_current_generation_number.set(r, [&g] (std::unique_ptr<http::request> req) {
gms::inet_address ep(req->param["addr"]);
gms::inet_address ep(req->get_path_param("addr"));
return g.get_current_generation_number(ep).then([] (gms::generation_type res) {
return make_ready_future<json::json_return_type>(res.value());
});
});
httpd::gossiper_json::get_current_heart_beat_version.set(r, [&g] (std::unique_ptr<http::request> req) {
gms::inet_address ep(req->param["addr"]);
gms::inet_address ep(req->get_path_param("addr"));
return g.get_current_heart_beat_version(ep).then([] (gms::version_type res) {
return make_ready_future<json::json_return_type>(res.value());
});
@@ -53,17 +53,17 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
httpd::gossiper_json::assassinate_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
if (req->get_query_param("unsafe") != "True") {
return g.assassinate_endpoint(req->param["addr"]).then([] {
return g.assassinate_endpoint(req->get_path_param("addr")).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
}
return g.unsafe_assassinate_endpoint(req->param["addr"]).then([] {
return g.unsafe_assassinate_endpoint(req->get_path_param("addr")).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
gms::inet_address ep(req->param["addr"]);
gms::inet_address ep(req->get_path_param("addr"));
return g.force_remove_endpoint(ep, gms::null_permit_id).then([] {
return make_ready_future<json::json_return_type>(json_void());
});

View File

@@ -24,7 +24,7 @@ using namespace json;
void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_registry>& raft_gr) {
r::trigger_snapshot.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
raft::group_id gid{utils::UUID{req->param["group_id"]}};
raft::group_id gid{utils::UUID{req->get_path_param("group_id")}};
auto timeout_dur = std::invoke([timeout_str = req->get_query_param("timeout")] {
if (timeout_str.empty()) {
return std::chrono::seconds{60};

View File

@@ -58,15 +58,19 @@ namespace ss = httpd::storage_service_json;
namespace sp = httpd::storage_proxy_json;
using namespace json;
sstring validate_keyspace(http_context& ctx, sstring ks_name) {
sstring validate_keyspace(const http_context& ctx, sstring ks_name) {
if (ctx.db.local().has_keyspace(ks_name)) {
return ks_name;
}
throw bad_param_exception(replica::no_such_keyspace(ks_name).what());
}
sstring validate_keyspace(http_context& ctx, const parameters& param) {
return validate_keyspace(ctx, param["keyspace"]);
sstring validate_keyspace(const http_context& ctx, const std::unique_ptr<http::request>& req) {
return validate_keyspace(ctx, req->get_path_param("keyspace"));
}
sstring validate_keyspace(const http_context& ctx, const http::request& req) {
return validate_keyspace(ctx, req.get_path_param("keyspace"));
}
locator::host_id validate_host_id(const sstring& param) {
@@ -171,7 +175,7 @@ using ks_cf_func = std::function<future<json::json_return_type>(http_context&, s
static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
return [&ctx, f = std::move(f)](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
return f(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
};
@@ -338,7 +342,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair) {
// returns immediately, not waiting for the repair to finish. The user
// then has other mechanisms to track the ongoing repair's progress,
// or stop it.
return repair_start(repair, validate_keyspace(ctx, req->param),
return repair_start(repair, validate_keyspace(ctx, req),
options_map).then([] (int i) {
return make_ready_future<json::json_return_type>(i);
});
@@ -421,7 +425,7 @@ void unset_repair(http_context& ctx, routes& r) {
void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>& sst_loader) {
ss::load_new_ss_tables.set(r, [&ctx, &sst_loader](std::unique_ptr<http::request> req) {
auto ks = validate_keyspace(ctx, req->param);
auto ks = validate_keyspace(ctx, req);
auto cf = req->get_query_param("cf");
auto stream = req->get_query_param("load_and_stream");
auto primary_replica = req->get_query_param("primary_replica_only");
@@ -452,8 +456,8 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb) {
ss::view_build_statuses.set(r, [&ctx, &vb] (std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto view = req->param["view"];
auto keyspace = validate_keyspace(ctx, req);
auto view = req->get_path_param("view");
return vb.local().view_build_statuses(std::move(keyspace), std::move(view)).then([] (std::unordered_map<sstring, sstring> status) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
@@ -590,7 +594,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::get_range_to_endpoint_map.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
std::vector<ss::maplist_mapper> res;
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace),
[](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
@@ -615,7 +619,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::get_pending_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
std::vector<ss::maplist_mapper> res;
return make_ready_future<json::json_return_type>(res);
});
@@ -631,7 +635,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::describe_ring.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) {
return describe_ring_as_json(ss, validate_keyspace(ctx, req->param));
return describe_ring_as_json(ss, validate_keyspace(ctx, req));
});
ss::get_host_id_map.set(r, [&ss](const_req req) {
@@ -664,7 +668,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::get_natural_endpoints.set(r, [&ctx, &ss](const_req req) {
auto keyspace = validate_keyspace(ctx, req.param);
auto keyspace = validate_keyspace(ctx, req);
return container_to_vec(ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"),
req.get_query_param("key")));
});
@@ -733,7 +737,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
@@ -796,7 +800,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
auto& db = ctx.db;
@@ -905,7 +909,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::truncate.set(r, [&ctx](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto column_family = req->get_query_param("cf");
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1039,14 +1043,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::bulk_load.set(r, [](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
auto path = req->param["path"];
auto path = req->get_path_param("path");
return make_ready_future<json::json_return_type>(json_void());
});
ss::bulk_load_async.set(r, [](std::unique_ptr<http::request> req) {
//TBD
unimplemented();
auto path = req->param["path"];
auto path = req->get_path_param("path");
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1134,7 +1138,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
@@ -1142,7 +1146,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
@@ -1150,7 +1154,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::enable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
@@ -1158,7 +1162,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::disable_tombstone_gc.set(r, [&ctx](std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto keyspace = validate_keyspace(ctx, req);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_tombstone_gc: keyspace={} tables={}", keyspace, tables);
@@ -1254,7 +1258,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::get_effective_ownership.set(r, [&ctx, &ss] (std::unique_ptr<http::request> req) {
auto keyspace_name = req->param["keyspace"] == "null" ? "" : validate_keyspace(ctx, req->param);
auto keyspace_name = req->get_path_param("keyspace") == "null" ? "" : validate_keyspace(ctx, req);
return ss.local().effective_ownership(keyspace_name).then([] (auto&& ownership) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
@@ -1542,8 +1546,10 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
}).then([&s] {
return s.write("]").then([&s] {
return s.close();
return s.flush();
});
}).finally([&s] {
return s.close();
});
});
};
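In isolation, the pattern this hunk establishes (a sketch assuming a seastar::output_stream<char>): flush on the success path, and close unconditionally in finally(), since close() must run even when a write fails:
future<> write_array(output_stream<char>& s) {
    return s.write("[").then([&s] {
        return s.write("]");
    }).then([&s] {
        return s.flush();   // push buffered bytes out on success
    }).finally([&s] {
        return s.close();   // always release the stream, error or not
    });
}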

View File

@@ -37,11 +37,11 @@ namespace api {
// verify that the keyspace is found, otherwise a bad_param_exception is thrown
// containing the description of the respective keyspace error.
sstring validate_keyspace(http_context& ctx, sstring ks_name);
sstring validate_keyspace(const http_context& ctx, sstring ks_name);
// verify that the keyspace parameter is found, otherwise a bad_param_exception is thrown
// containing the description of the respective keyspace error.
sstring validate_keyspace(http_context& ctx, const httpd::parameters& param);
sstring validate_keyspace(const http_context& ctx, const std::unique_ptr<http::request>& req);
// splits a request parameter assumed to hold a comma-separated list of table names
// verify that the tables are found, otherwise a bad_param_exception exception is thrown

View File

@@ -106,7 +106,7 @@ void set_stream_manager(http_context& ctx, routes& r, sharded<streaming::stream_
});
hs::get_total_incoming_bytes.set(r, [&sm](std::unique_ptr<request> req) {
gms::inet_address peer(req->param["peer"]);
gms::inet_address peer(req->get_path_param("peer"));
return sm.map_reduce0([peer](streaming::stream_manager& sm) {
return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
return sbytes.bytes_received;
@@ -127,7 +127,7 @@ void set_stream_manager(http_context& ctx, routes& r, sharded<streaming::stream_
});
hs::get_total_outgoing_bytes.set(r, [&sm](std::unique_ptr<request> req) {
gms::inet_address peer(req->param["peer"]);
gms::inet_address peer(req->get_path_param("peer"));
return sm.map_reduce0([peer] (streaming::stream_manager& sm) {
return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
return sbytes.bytes_sent;

View File

@@ -119,9 +119,9 @@ void set_system(http_context& ctx, routes& r) {
hs::get_logger_level.set(r, [](const_req req) {
try {
return logging::level_name(logging::logger_registry().get_logger_level(req.param["name"]));
return logging::level_name(logging::logger_registry().get_logger_level(req.get_path_param("name")));
} catch (std::out_of_range& e) {
throw bad_param_exception("Unknown logger name " + req.param["name"]);
throw bad_param_exception("Unknown logger name " + req.get_path_param("name"));
}
// just to keep the compiler happy
return sstring();
@@ -130,9 +130,9 @@ void set_system(http_context& ctx, routes& r) {
hs::set_logger_level.set(r, [](const_req req) {
try {
logging::log_level level = boost::lexical_cast<logging::log_level>(std::string(req.get_query_param("level")));
logging::logger_registry().set_logger_level(req.param["name"], level);
logging::logger_registry().set_logger_level(req.get_path_param("name"), level);
} catch (std::out_of_range& e) {
throw bad_param_exception("Unknown logger name " + req.param["name"]);
throw bad_param_exception("Unknown logger name " + req.get_path_param("name"));
} catch (boost::bad_lexical_cast& e) {
throw bad_param_exception("Unknown logging level " + req.get_query_param("level"));
}

View File

@@ -7,6 +7,7 @@
*/
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/exception.hh>
#include "task_manager.hh"
#include "api/api-doc/task_manager.json.hh"
@@ -124,7 +125,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
chunked_stats local_res;
tasks::task_manager::module_ptr module;
try {
module = tm.find_module(req->param["module"]);
module = tm.find_module(req->get_path_param("module"));
} catch (...) {
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}
@@ -139,25 +140,34 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
auto s = std::move(os);
auto res = std::move(r);
co_await s.write("[");
std::string delim = "";
for (auto& v: res) {
for (auto& stats: v) {
co_await s.write(std::exchange(delim, ", "));
tm::task_stats ts;
ts = stats;
co_await formatter::write(s, ts);
std::exception_ptr ex;
try {
auto res = std::move(r);
co_await s.write("[");
std::string delim = "";
for (auto& v: res) {
for (auto& stats: v) {
co_await s.write(std::exchange(delim, ", "));
tm::task_stats ts;
ts = stats;
co_await formatter::write(s, ts);
}
}
co_await s.write("]");
co_await s.flush();
} catch (...) {
ex = std::current_exception();
}
co_await s.write("]");
co_await s.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
};
co_return std::move(f);
});
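// Note: the coroutine above applies the same stream discipline as the snapshot
// handler: exceptions from write()/flush() are stashed in an exception_ptr so
// that close() still runs, and only then is the error rethrown via
// coroutine::return_exception_ptr.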
tm::get_task_status.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
tasks::task_manager::foreign_task_ptr task;
try {
task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
@@ -174,7 +184,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
});
tm::abort_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
try {
co_await tasks::task_manager::invoke_on_task(ctx.tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
if (!task->is_abortable()) {
@@ -189,7 +199,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
});
tm::wait_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
tasks::task_manager::foreign_task_ptr task;
try {
task = co_await tasks::task_manager::invoke_on_task(ctx.tm, id, std::function([] (tasks::task_manager::task_ptr task) {
@@ -210,7 +220,7 @@ void set_task_manager(http_context& ctx, routes& r, db::config& cfg) {
tm::get_task_status_recursively.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& _ctx = ctx;
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
std::queue<tasks::task_manager::foreign_task_ptr> q;
utils::chunked_vector<full_task_status> res;

View File

@@ -83,7 +83,7 @@ void set_task_manager_test(http_context& ctx, routes& r) {
});
tmt::finish_test_task.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->param["task_id"]}};
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
auto it = req->query_parameters.find("error");
bool fail = it != req->query_parameters.end();
std::string error = fail ? it->second : "";

View File

@@ -144,12 +144,21 @@ std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::quara
}
static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
const api::timestamp_type compacting_max_timestamp) {
if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
return api::min_timestamp;
}
auto timestamp = table_s.min_memtable_timestamp();
auto timestamp = api::max_timestamp;
auto memtable_min_timestamp = table_s.min_memtable_timestamp();
// Use the memtable timestamp if the memtable contains data older than the sstables being
// compacted, and if it also contains the key we're calculating the max purgeable timestamp for.
// The first condition avoids penalizing the common scenario where the memtable only contains
// newer data.
if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
timestamp = memtable_min_timestamp;
}
std::optional<utils::hashed_key> hk;
for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
if (compacting_set.contains(sst)) {
@@ -441,6 +450,7 @@ protected:
uint64_t _end_size = 0;
// fully expired files, which are skipped, aren't taken into account.
uint64_t _compacting_data_file_size = 0;
api::timestamp_type _compacting_max_timestamp = api::min_timestamp;
uint64_t _estimated_partitions = 0;
double _estimated_droppable_tombstone_ratio = 0;
uint64_t _bloom_filter_checks = 0;
@@ -739,6 +749,7 @@ private:
auto gc_before = sst->get_gc_before_for_drop_estimation(gc_clock::now(), _table_s.get_tombstone_gc_state(), _schema);
sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_before);
_compacting_data_file_size += sst->ondisk_data_size();
// TODO:
// Note that this is not fully correct. Since we might be merging sstables that originated on
// another shard (#cpu changed), we might be comparing RP:s with differing shard ids,
@@ -747,6 +758,8 @@ private:
// this is kind of ok, esp. since we will hopefully not be trying to recover based on
// compacted sstables anyway (CL should be clean by then).
_rp = std::max(_rp, sst_stats.position);
_compacting_max_timestamp = std::max(_compacting_max_timestamp, sst->get_stats_metadata().max_timestamp);
}
log_info("{} {}", report_start_desc(), formatted_msg);
if (ssts->size() < _sstables.size()) {
@@ -869,7 +882,7 @@ private:
};
}
return [this] (const dht::decorated_key& dk) {
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp);
};
}

View File

@@ -1327,13 +1327,20 @@ private:
}));
};
auto get_next_job = [&] () -> std::optional<sstables::compaction_descriptor> {
auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), sstables::reshape_mode::strict);
return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
auto get_next_job = [&] () -> future<std::optional<sstables::compaction_descriptor>> {
auto candidates = get_reshape_candidates();
if (candidates.empty()) {
co_return std::nullopt;
}
// all sstables added to maintenance set share the same underlying storage.
auto& storage = candidates.front()->get_storage();
sstables::reshape_config cfg = co_await sstables::make_reshape_config(storage, sstables::reshape_mode::strict);
auto desc = t.get_compaction_strategy().get_reshaping_job(get_reshape_candidates(), t.schema(), cfg);
co_return desc.sstables.size() ? std::make_optional(std::move(desc)) : std::nullopt;
};
std::exception_ptr err;
while (auto desc = get_next_job()) {
while (auto desc = co_await get_next_job()) {
auto compacting = compacting_sstable_registration(_cm, _cm.get_compaction_state(&t), desc->sstables);
auto on_replace = compacting.update_on_sstable_replacement();

View File

@@ -75,7 +75,7 @@ reader_consumer_v2 compaction_strategy_impl::make_interposer_consumer(const muta
}
compaction_descriptor
compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
compaction_strategy_impl::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
return compaction_descriptor();
}
@@ -700,8 +700,8 @@ compaction_backlog_tracker compaction_strategy::make_backlog_tracker() const {
}
sstables::compaction_descriptor
compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, mode);
compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
return _compaction_strategy_impl->get_reshaping_job(std::move(input), schema, cfg);
}
uint64_t compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate, schema_ptr schema) const {
@@ -739,6 +739,13 @@ compaction_strategy make_compaction_strategy(compaction_strategy_type strategy,
return compaction_strategy(std::move(impl));
}
future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode) {
co_return sstables::reshape_config{
.mode = mode,
.free_storage_space = co_await storage.free_space() / smp::count,
};
}
}
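As a worked example (illustrative numbers): with 8 shards and 800 GiB free on the sstable storage, make_reshape_config() yields free_storage_space = 100 GiB per shard, and the time-window strategy below caps a single reshape job at reshape_target_space_overhead (10%) of that, i.e. roughly 10 GiB.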
namespace compaction {

View File

@@ -31,6 +31,7 @@ class sstable;
class sstable_set;
struct compaction_descriptor;
struct resharding_descriptor;
class storage;
class compaction_strategy {
::shared_ptr<compaction_strategy_impl> _compaction_strategy_impl;
@@ -122,11 +123,13 @@ public:
//
// The caller should also pass the maximum number of SSTables that can be added into a
// single job.
compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;
};
// Creates a compaction_strategy object from one of the strategies available.
compaction_strategy make_compaction_strategy(compaction_strategy_type strategy, const std::map<sstring, sstring>& options);
future<reshape_config> make_reshape_config(const sstables::storage& storage, reshape_mode mode);
}

View File

@@ -76,6 +76,6 @@ public:
return false;
}
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const;
};
}

View File

@@ -8,6 +8,8 @@
#pragma once
#include <cstdint>
namespace sstables {
enum class compaction_strategy_type {
@@ -18,4 +20,10 @@ enum class compaction_strategy_type {
};
enum class reshape_mode { strict, relaxed };
struct reshape_config {
reshape_mode mode;
const uint64_t free_storage_space;
};
}

View File

@@ -146,7 +146,8 @@ int64_t leveled_compaction_strategy::estimated_pending_compactions(table_state&
}
compaction_descriptor
leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
auto mode = cfg.mode;
std::array<std::vector<shared_sstable>, leveled_manifest::MAX_LEVELS> level_info;
auto is_disjoint = [schema] (const std::vector<shared_sstable>& sstables, unsigned tolerance) -> std::tuple<bool, unsigned> {
@@ -203,7 +204,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
if (level_info[0].size() > offstrategy_threshold) {
size_tiered_compaction_strategy stcs(_stcs_options);
return stcs.get_reshaping_job(std::move(level_info[0]), schema, mode);
return stcs.get_reshaping_job(std::move(level_info[0]), schema, cfg);
}
for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {

View File

@@ -74,7 +74,7 @@ public:
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
};
}

View File

@@ -297,8 +297,9 @@ size_tiered_compaction_strategy::most_interesting_bucket(const std::vector<sstab
}
compaction_descriptor
size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const
size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const
{
auto mode = cfg.mode;
size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));

View File

@@ -96,7 +96,7 @@ public:
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
friend class ::size_tiered_backlog_tracker;
};

View File

@@ -48,6 +48,7 @@ public:
virtual sstables::shared_sstable make_sstable() const = 0;
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
virtual api::timestamp_type min_memtable_timestamp() const = 0;
virtual bool memtable_has_key(const dht::decorated_key& key) const = 0;
virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
virtual bool tombstone_gc_enabled() const noexcept = 0;

View File

@@ -555,7 +555,13 @@ future<> shard_reshaping_compaction_task_impl::run() {
| boost::adaptors::filtered([&filter = _filter] (const auto& sst) {
return filter(sst);
}));
auto desc = table.get_compaction_strategy().get_reshaping_job(std::move(reshape_candidates), table.schema(), _mode);
if (reshape_candidates.empty()) {
break;
}
// all sstables were found in the same sstable_directory instance, so they share the same underlying storage.
auto& storage = reshape_candidates.front()->get_storage();
auto cfg = co_await sstables::make_reshape_config(storage, _mode);
auto desc = table.get_compaction_strategy().get_reshaping_job(std::move(reshape_candidates), table.schema(), cfg);
if (desc.sstables.empty()) {
break;
}

View File

@@ -223,12 +223,14 @@ reader_consumer_v2 time_window_compaction_strategy::make_interposer_consumer(con
}
compaction_descriptor
time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const {
time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
auto mode = cfg.mode;
std::vector<shared_sstable> single_window;
std::vector<shared_sstable> multi_window;
size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
const uint64_t target_job_size = cfg.free_storage_space * reshape_target_space_overhead;
if (mode == reshape_mode::relaxed) {
offstrategy_threshold = max_sstables;
@@ -260,22 +262,40 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);
auto need_trimming = [max_sstables, schema, &is_disjoint] (const std::vector<shared_sstable>& ssts) {
// All sstables can be compacted at once if they're disjoint, given that partitioned set
// will incrementally open sstables which translates into bounded memory usage.
return ssts.size() > max_sstables && !is_disjoint(ssts);
auto get_job_size = [] (const std::vector<shared_sstable>& ssts) {
return boost::accumulate(ssts | boost::adaptors::transformed(std::mem_fn(&sstable::bytes_on_disk)), uint64_t(0));
};
// Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as
// they don't push the overhead above the target. Otherwise, the job is capped at max_sstables.
auto need_trimming = [&] (const std::vector<shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
const size_t min_sstables = 2;
auto is_above_target_size = job_size > target_job_size;
return (ssts.size() > max_sstables && !is_disjoint) ||
(ssts.size() > min_sstables && is_above_target_size);
};
auto maybe_trim_job = [&need_trimming] (std::vector<shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
while (need_trimming(ssts, job_size, is_disjoint)) {
auto sst = ssts.back();
ssts.pop_back();
job_size -= sst->bytes_on_disk();
}
};
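// Worked example (illustrative numbers): with free_storage_space = 100 GiB,
// target_job_size is 10 GiB. Forty disjoint 1 GiB sstables (job_size = 40 GiB)
// exceed the target, so maybe_trim_job() pops sstables until the job fits,
// though never below min_sstables = 2; a non-disjoint bucket is additionally
// capped at max_sstables, as before.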
if (!multi_window.empty()) {
auto disjoint = is_disjoint(multi_window);
auto job_size = get_job_size(multi_window);
// Everything that spans multiple windows will need reshaping
if (need_trimming(multi_window)) {
if (need_trimming(multi_window, job_size, disjoint)) {
// When trimming, let's keep sstables with overlapping time window, so as to reduce write amplification.
// For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
// in a single compaction round, removing the need to later compact W to reduce its number of files.
boost::partial_sort(multi_window, multi_window.begin() + max_sstables, [](const shared_sstable &a, const shared_sstable &b) {
return a->get_stats_metadata().max_timestamp < b->get_stats_metadata().max_timestamp;
});
multi_window.resize(max_sstables);
maybe_trim_job(multi_window, job_size, disjoint);
}
compaction_descriptor desc(std::move(multi_window));
desc.options = compaction_type_options::make_reshape();
@@ -294,15 +314,17 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
std::copy(ssts.begin(), ssts.end(), std::back_inserter(single_window));
continue;
}
// reuse STCS reshape logic, which will only compact similar-sized files, to increase overall
// efficiency when reshaping time buckets containing a huge number of files
auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, mode);
auto desc = size_tiered_compaction_strategy(_stcs_options).get_reshaping_job(std::move(ssts), schema, cfg);
if (!desc.sstables.empty()) {
return desc;
}
}
}
if (!single_window.empty()) {
maybe_trim_job(single_window, get_job_size(single_window), all_disjoint);
compaction_descriptor desc(std::move(single_window));
desc.options = compaction_type_options::make_reshape();
return desc;

View File

@@ -78,6 +78,7 @@ public:
// To prevent an explosion in the number of sstables we cap it.
// Better to co-locate some windows into the same sstables than to OOM.
static constexpr uint64_t max_data_segregation_window_count = 100;
static constexpr float reshape_target_space_overhead = 0.1f;
using bucket_t = std::vector<shared_sstable>;
enum class bucket_compaction_mode { none, size_tiered, major };
@@ -170,7 +171,7 @@ public:
return true;
}
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_mode mode) const override;
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;
};
}

View File

@@ -815,7 +815,7 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const
future<> query_processor::for_each_cql_result(
cql3::internal_query_state& state,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)>&& f) {
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
do {
auto msg = co_await execute_paged_internal(state);
for (auto& row : *msg) {
@@ -1116,14 +1116,14 @@ future<> query_processor::query_internal(
db::consistency_level cl,
const std::initializer_list<data_value>& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
auto query_state = create_paged_state(query_string, cl, values, page_size);
co_return co_await for_each_cql_result(query_state, std::move(f));
}
future<> query_processor::query_internal(
const sstring& query_string,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
}
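Passing the continuation by value rather than by rvalue reference matters here: these functions are coroutines, and a reference parameter can dangle once the caller's temporary is destroyed at the first suspension point, whereas a by-value parameter is moved into the coroutine frame and stays alive for the whole paged iteration.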

View File

@@ -307,7 +307,7 @@ public:
db::consistency_level cl,
const std::initializer_list<data_value>& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);
/*
* \brief iterate over all cql results using paging
@@ -322,7 +322,7 @@ public:
*/
future<> query_internal(
const sstring& query_string,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);
class cache_internal_tag;
using cache_internal = bool_class<cache_internal_tag>;
@@ -479,7 +479,7 @@ private:
*/
future<> for_each_cql_result(
cql3::internal_query_state& state,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);
/*!
* \brief check, based on the state if there are additional results

View File

@@ -2004,7 +2004,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
)
&& !restrictions->need_filtering() // No filtering
&& group_by_cell_indices->empty() // No GROUP BY
&& db.get_config().enable_parallelized_aggregation();
&& db.get_config().enable_parallelized_aggregation()
&& !( // Do not parallelize the request if it's single partition read
restrictions->partition_key_restrictions_is_all_eq()
&& restrictions->partition_key_restrictions_size() == schema->partition_key_size());
};
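// Example: "SELECT count(*) FROM t WHERE pk = 42" names the full partition key,
// so it reads a single partition and forking it across shards would only add
// coordination overhead; a whole-table aggregation still parallelizes as before.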
if (_parameters->is_prune_materialized_view()) {

View File

@@ -135,7 +135,7 @@ future<> db::batchlog_manager::stop() {
}
future<size_t> db::batchlog_manager::count_all_batches() const {
sstring query = format("SELECT count(*) FROM {}.{}", system_keyspace::NAME, system_keyspace::BATCHLOG);
sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG);
return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> rs) {
return size_t(rs->one().get_as<int64_t>("count"));
});
@@ -154,26 +154,26 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
auto batch = [this, limiter](const cql3::untyped_result_set::row& row) {
auto batch = [this, limiter](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
auto written_at = row.get_as<db_clock::time_point>("written_at");
auto id = row.get_as<utils::UUID>("id");
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
auto timeout = get_batch_log_timeout();
if (db_clock::now() < written_at + timeout) {
blogger.debug("Skipping replay of {}, too fresh", id);
return make_ready_future<>();
return make_ready_future<stop_iteration>(stop_iteration::no);
}
// check version of serialization format
if (!row.has("version")) {
blogger.warn("Skipping logged batch because of unknown version");
return make_ready_future<>();
return make_ready_future<stop_iteration>(stop_iteration::no);
}
auto version = row.get_as<int32_t>("version");
if (version != netw::messaging_service::current_version) {
blogger.warn("Skipping logged batch because of incorrect version");
return make_ready_future<>();
return make_ready_future<stop_iteration>(stop_iteration::no);
}
auto data = row.get_blob("data");
@@ -255,49 +255,20 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
});
}).then([] { return make_ready_future<stop_iteration>(stop_iteration::no); });
};
return seastar::with_gate(_gate, [this, batch = std::move(batch)] {
return seastar::with_gate(_gate, [this, batch = std::move(batch)] () mutable {
blogger.debug("Started replayAllFailedBatches (cpu {})", this_shard_id());
typedef ::shared_ptr<cql3::untyped_result_set> page_ptr;
sstring query = format("SELECT id, data, written_at, version FROM {}.{} LIMIT {:d}", system_keyspace::NAME, system_keyspace::BATCHLOG, page_size);
return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([this, batch = std::move(batch)](page_ptr page) {
return do_with(std::move(page), [this, batch = std::move(batch)](page_ptr & page) mutable {
return repeat([this, &page, batch = std::move(batch)]() mutable {
if (page->empty()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
auto id = page->back().get_as<utils::UUID>("id");
return parallel_for_each(*page, batch).then([this, &page, id]() {
if (page->size() < page_size) {
return make_ready_future<stop_iteration>(stop_iteration::yes); // we've exhausted the batchlog, next query would be empty.
}
sstring query = format("SELECT id, data, written_at, version FROM {}.{} WHERE token(id) > token(?) LIMIT {:d}",
system_keyspace::NAME,
system_keyspace::BATCHLOG,
page_size);
return _qp.execute_internal(query, {id}, cql3::query_processor::cache_internal::yes).then([&page](auto res) {
page = std::move(res);
return make_ready_future<stop_iteration>(stop_iteration::no);
});
});
});
});
}).then([] {
// TODO FIXME : cleanup()
#if 0
ColumnFamilyStore cfs = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHLOG);
cfs.forceBlockingFlush();
Collection<Descriptor> descriptors = new ArrayList<>();
for (SSTableReader sstr : cfs.getSSTables())
descriptors.add(sstr.descriptor);
if (!descriptors.isEmpty()) // don't pollute the logs if there is nothing to compact.
CompactionManager.instance.submitUserDefined(cfs, descriptors, Integer.MAX_VALUE).get();
#endif
return _qp.query_internal(
format("SELECT id, data, written_at, version FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
db::consistency_level::ONE,
{},
page_size,
std::move(batch)).then([this] {
// Replaying batches could have generated tombstones, flush to disk,
// where they can be compacted away.
return replica::database::flush_table_on_all_shards(_qp.proxy().get_db(), system_keyspace::NAME, system_keyspace::BATCHLOG);
}).then([] {
blogger.debug("Finished replayAllFailedBatches");
});
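For reference, a minimal sketch of how the paging helper the replay loop now delegates to is driven (the query text is illustrative):
co_await _qp.query_internal(
    "SELECT id FROM system.batchlog BYPASS CACHE",
    db::consistency_level::ONE,
    {},
    page_size,
    [] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
        // handle one row; stop_iteration::no requests the next row/page
        co_return stop_iteration::no;
    });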

View File

@@ -951,7 +951,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit")
, sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default) "
"bytes written to data file. Value must be between 0 and 1.")
, components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .1, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
, components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
, large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable")
, enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
, enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting")
@@ -991,6 +991,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Start serializing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
, reader_concurrency_semaphore_kill_limit_multiplier(this, "reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
"Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
, reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 1,
"Admit new reads while there are less than this number of requests that need CPU.")
, twcs_max_window_count(this, "twcs_max_window_count", liveness::LiveUpdate, value_status::Used, 50,
"The maximum number of compaction windows allowed when making use of TimeWindowCompactionStrategy. A setting of 0 effectively disables the restriction.")
, initial_sstable_loading_concurrency(this, "initial_sstable_loading_concurrency", value_status::Used, 4u,

View File

@@ -373,6 +373,7 @@ public:
named_value<uint64_t> max_memory_for_unlimited_query_hard_limit;
named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
named_value<uint32_t> twcs_max_window_count;
named_value<unsigned> initial_sstable_loading_concurrency;
named_value<bool> enable_3_1_0_compatibility_mode;

View File

@@ -55,6 +55,10 @@ public:
return ser::serialize_to_buffer<bytes>(_paxos_gc_sec);
}
std::string options_to_string() const override {
return std::to_string(_paxos_gc_sec);
}
static int32_t deserialize(const bytes_view& buffer) {
return ser::deserialize_from_buffer(buffer, boost::type<int32_t>());
}

View File

@@ -21,7 +21,7 @@ For example:
In this scenario, a missing ``TOC`` file will prevent the Scylla node from starting.
The SSTable corporation problem can be different, for example, other missing or unreadable files. The following solution apply for all of the scenarios.
The SSTable corruption problem can be different, for example, other missing or unreadable files. The following solution applies to all scenarios.
Solution
^^^^^^^^

View File

@@ -167,54 +167,27 @@ Download and install the new release
.. group-tab:: EC2/GCP/Azure Ubuntu Image
Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
There are two alternative upgrade procedures: upgrading ScyllaDB and simultaneously updating 3rd party and OS packages - recommended if you
are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04, and upgrading ScyllaDB without updating
any external packages.
If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for upgrade instructions. If you're using your
own image and have installed ScyllaDB packages for Ubuntu or Debian,
you need to apply an extended upgrade procedure:
#. Update the ScyllaDB deb repo (see above).
#. Configure Java 1.8 (see above).
#. Install the new ScyllaDB version with the additional
``scylla-enterprise-machine-image`` package:
**To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Load the new repo:
.. code:: sh
sudo apt-get update
#. Run the following command to update the manifest file:
.. code:: sh
cat scylla-enterprise-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
Where:
* ``<version>`` - The ScyllaDB Enterprise version to which you are upgrading ( |NEW_VERSION| ).
* ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
The file is included in the ScyllaDB Enterprise packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
Example:
.. code:: sh
cat scylla-enterprise-packages-2022.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
.. note::
Alternatively, you can update the manifest file with the following command:
``sudo apt-get install $(awk '{print $1}' scylla-enterprise-packages-<version>-<arch>.txt) -y``
To upgrade ScyllaDB without updating any external packages, follow the :ref:`download and installation instructions for Debian/Ubuntu <upgrade-debian-ubuntu-5.2-to-enterprise-2023.1>`.
.. code::
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla-enterprise
sudo apt-get dist-upgrade scylla-enterprise-machine-image
#. Run ``scylla_setup`` without running ``io_setup``.
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
Start the node
--------------

View File

@@ -87,7 +87,7 @@ The following libraries are available:
More information
----------------
`Scylla University: Change Data Capture (CDC) lesson <https://university.scylladb.com/courses/scylla-operations/lessons/change-data-capture-cdc/>`_ - Learn how to use CDC. Some of the topics covered are:
`Scylla University: Change Data Capture (CDC) lesson <https://university.scylladb.com/courses/data-modeling/lessons/change-data-capture-cdc/>`_ - Learn how to use CDC. Some of the topics covered are:
* An overview of Change Data Capture, what exactly is it, what are some common use cases, what does it do, and an overview of how it works
* How can that data be consumed? Different options for consuming the data changes including normal CQL, a layered approach, and integrators

View File

@@ -2343,8 +2343,13 @@ bool gossiper::is_alive(inet_address ep) const {
}
future<> gossiper::wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout) {
return wait_alive([nodes = std::move(nodes)] { return nodes; }, timeout);
}
future<> gossiper::wait_alive(noncopyable_function<std::vector<gms::inet_address>()> get_nodes, std::chrono::milliseconds timeout) {
auto start_time = std::chrono::steady_clock::now();
for (;;) {
auto nodes = get_nodes();
std::vector<gms::inet_address> live_nodes;
for (const auto& node: nodes) {
size_t nr_alive = co_await container().map_reduce0([node] (gossiper& g) -> size_t {

View File

@@ -500,6 +500,7 @@ public:
bool is_dead_state(const endpoint_state& eps) const;
// Wait for nodes to be alive on all shards
future<> wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout);
future<> wait_alive(noncopyable_function<std::vector<gms::inet_address>()> get_nodes, std::chrono::milliseconds timeout);
// Wait for `n` live nodes to show up in gossip (including ourself).
future<> wait_for_live_nodes_to_show_up(size_t n);
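A sketch of the new overload's intent: the node list is re-evaluated on every polling iteration, so a caller can pass a generator that tracks topology changes rather than a one-shot snapshot (the getter below is hypothetical):
co_await gossiper.wait_alive(
    [&] { return current_token_owners(); },   // hypothetical getter, re-read each iteration
    std::chrono::milliseconds(60000));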

main.cc
View File

@@ -165,12 +165,29 @@ struct convert<::object_storage_endpoint_param> {
ep.endpoint = node["name"].as<std::string>();
ep.config.port = node["port"].as<unsigned>();
ep.config.use_https = node["https"].as<bool>(false);
if (node["aws_region"]) {
if (node["aws_region"] || std::getenv("AWS_DEFAULT_REGION")) {
ep.config.aws.emplace();
ep.config.aws->region = node["aws_region"].as<std::string>();
ep.config.aws->access_key_id = node["aws_access_key_id"].as<std::string>();
ep.config.aws->secret_access_key = node["aws_secret_access_key"].as<std::string>();
ep.config.aws->session_token = node["aws_session_token"].as<std::string>("");
// https://github.com/scylladb/scylla-pkg/issues/3845
// Allow picking up aws values via standard env vars as well.
// Values in the config take priority; we fall back to the env otherwise.
// This has the added benefit of potentially reducing the amount of
// sensitive data in config files (e.g., credentials).
auto get_node_value_or_env = [&](const char* key, const char* var) {
auto child = node[key];
if (child) {
return child.as<std::string>();
}
auto val = std::getenv(var);
if (val) {
return std::string(val);
}
return std::string{};
};
ep.config.aws->region = get_node_value_or_env("aws_region", "AWS_DEFAULT_REGION");
ep.config.aws->access_key_id = get_node_value_or_env("aws_access_key_id", "AWS_ACCESS_KEY_ID");
ep.config.aws->secret_access_key = get_node_value_or_env("aws_secret_access_key", "AWS_SECRET_ACCESS_KEY");
ep.config.aws->session_token = get_node_value_or_env("aws_session_token", "AWS_SESSION_TOKEN");
}
return true;
}
@@ -1242,7 +1259,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
}
netw::messaging_service::scheduling_config scfg;
scfg.statement_tenants = { {dbcfg.statement_scheduling_group, "$user"}, {default_scheduling_group(), "$system"} };
scfg.statement_tenants = {
{dbcfg.statement_scheduling_group, "$user"},
{default_scheduling_group(), "$system"},
{dbcfg.streaming_scheduling_group, "$maintenance"}
};
scfg.streaming = dbcfg.streaming_scheduling_group;
scfg.gossip = dbcfg.gossip_scheduling_group;

View File

@@ -221,6 +221,12 @@ stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, const
alloc_strategy_unique_ptr<rows_entry> p_sentinel;
alloc_strategy_unique_ptr<rows_entry> this_sentinel;
auto insert_sentinel_back = defer([&] {
// Note: this lambda will be run by a destructor (of the `defer` guard),
// so it mustn't throw, or else it will crash the node.
//
// To prevent a `bad_alloc` during the tree insertion, we have to preallocate
// some memory for the new tree nodes. This is done by the `hold_reserve`
// constructed after the lambda.
if (this_sentinel) {
assert(p_i != p._rows.end());
auto rt = this_sentinel->range_tombstone();
@@ -254,6 +260,15 @@ stop_iteration mutation_partition_v2::apply_monotonically(const schema& s, const
}
});
// This guard will ensure that LSA reserves one free segment more than it
// needs for internal reasons.
//
// It will be destroyed immediately before the sentinel-inserting `defer`
// happens, ensuring that the sentinel insertion has at least one free LSA segment
// to work with. This should be enough, since we only need to allocate a few
// B-tree nodes.
auto memory_reserve_for_sentinel_inserts = hold_reserve(logalloc::segment_size);
while (p_i != p._rows.end()) {
rows_entry& src_e = *p_i;

View File

@@ -637,7 +637,9 @@ void fsm::step(server_id from, Message&& msg) {
_last_election_time = _clock.now();
if (current_leader() != from) {
on_internal_error_noexcept(logger, "Got append request/install snapshot/read_quorum from an unexpected leader");
on_internal_error_noexcept(logger, format(
"Got append request/install snapshot/read_quorum from an unexpected leader,"
" expected leader: {}, message from: {}", current_leader(), from));
}
}
}

View File

@@ -926,12 +926,15 @@ future<> reader_concurrency_semaphore::execution_loop() noexcept {
e.pr.set_exception(std::current_exception());
}
// We may now be back under the CPU concurrency limit, so even if the above
// read didn't release any resources, just dequeueing it from the
// _ready_list could allow us to admit new reads.
maybe_admit_waiters();
if (need_preempt()) {
co_await coroutine::maybe_yield();
}
}
maybe_admit_waiters();
}
}
@@ -968,14 +971,21 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
maybe_admit_waiters();
}
reader_concurrency_semaphore::reader_concurrency_semaphore(int count, ssize_t memory, sstring name, size_t max_queue_length,
utils::updateable_value<uint32_t> serialize_limit_multiplier, utils::updateable_value<uint32_t> kill_limit_multiplier)
reader_concurrency_semaphore::reader_concurrency_semaphore(
int count,
ssize_t memory,
sstring name,
size_t max_queue_length,
utils::updateable_value<uint32_t> serialize_limit_multiplier,
utils::updateable_value<uint32_t> kill_limit_multiplier,
utils::updateable_value<uint32_t> cpu_concurrency)
: _initial_resources(count, memory)
, _resources(count, memory)
, _name(std::move(name))
, _max_queue_length(max_queue_length)
, _serialize_limit_multiplier(std::move(serialize_limit_multiplier))
, _kill_limit_multiplier(std::move(kill_limit_multiplier))
, _cpu_concurrency(cpu_concurrency)
{ }
reader_concurrency_semaphore::reader_concurrency_semaphore(no_limits, sstring name)
@@ -985,7 +995,8 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(no_limits, sstring na
std::move(name),
std::numeric_limits<size_t>::max(),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value(std::numeric_limits<uint32_t>::max())) {}
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value(uint32_t(1))) {}
reader_concurrency_semaphore::~reader_concurrency_semaphore() {
assert(!_stats.waiters);
@@ -1186,8 +1197,8 @@ bool reader_concurrency_semaphore::has_available_units(const resources& r) const
return (_resources.non_zero() && _resources.count >= r.count && _resources.memory >= r.memory) || _resources.count == _initial_resources.count;
}
bool reader_concurrency_semaphore::all_need_cpu_permits_are_awaiting() const {
return _stats.need_cpu_permits == _stats.awaits_permits;
bool reader_concurrency_semaphore::cpu_concurrency_limit_reached() const {
return (_stats.need_cpu_permits - _stats.awaits_permits) >= _cpu_concurrency();
}
std::exception_ptr reader_concurrency_semaphore::check_queue_size(std::string_view queue_name) {
@@ -1270,7 +1281,7 @@ reader_concurrency_semaphore::can_admit_read(const reader_permit::impl& permit)
return {can_admit::no, reason::ready_list};
}
if (!all_need_cpu_permits_are_awaiting()) {
if (cpu_concurrency_limit_reached()) {
return {can_admit::no, reason::need_cpu_permits};
}
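With the default reader_concurrency_semaphore_cpu_concurrency of 1 this reduces to the old rule: a new read is admitted only when every need-CPU permit is awaiting (the former all_need_cpu_permits_are_awaiting() check); raising it to, say, 2 keeps admitting reads while fewer than two permits actively need CPU.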

View File

@@ -186,6 +186,7 @@ private:
size_t _max_queue_length = std::numeric_limits<size_t>::max();
utils::updateable_value<uint32_t> _serialize_limit_multiplier;
utils::updateable_value<uint32_t> _kill_limit_multiplier;
utils::updateable_value<uint32_t> _cpu_concurrency;
stats _stats;
bool _stopped = false;
bool _evicting = false;
@@ -201,7 +202,7 @@ private:
bool has_available_units(const resources& r) const;
bool all_need_cpu_permits_are_awaiting() const;
bool cpu_concurrency_limit_reached() const;
[[nodiscard]] std::exception_ptr check_queue_size(std::string_view queue_name);
@@ -274,7 +275,19 @@ public:
sstring name,
size_t max_queue_length,
utils::updateable_value<uint32_t> serialize_limit_multiplier,
utils::updateable_value<uint32_t> kill_limit_multiplier);
utils::updateable_value<uint32_t> kill_limit_multiplier,
utils::updateable_value<uint32_t> cpu_concurrency);
reader_concurrency_semaphore(
int count,
ssize_t memory,
sstring name,
size_t max_queue_length,
utils::updateable_value<uint32_t> serialize_limit_multiplier,
utils::updateable_value<uint32_t> kill_limit_multiplier)
: reader_concurrency_semaphore(count, memory, std::move(name), max_queue_length,
std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), utils::updateable_value<uint32_t>(1))
{ }
/// Create a semaphore with practically unlimited count and memory.
///
@@ -291,8 +304,10 @@ public:
ssize_t memory = std::numeric_limits<ssize_t>::max(),
size_t max_queue_length = std::numeric_limits<size_t>::max(),
utils::updateable_value<uint32_t> serialize_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value<uint32_t> kill_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()))
: reader_concurrency_semaphore(count, memory, std::move(name), max_queue_length, std::move(serialize_limit_multipler), std::move(kill_limit_multipler))
utils::updateable_value<uint32_t> kill_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value<uint32_t> cpu_concurrency = utils::updateable_value<uint32_t>(1))
: reader_concurrency_semaphore(count, memory, std::move(name), max_queue_length, std::move(serialize_limit_multipler),
std::move(kill_limit_multipler), std::move(cpu_concurrency))
{}
virtual ~reader_concurrency_semaphore();

View File

@@ -3211,9 +3211,7 @@ repair_service::insert_repair_meta(
reason,
compaction_time] (schema_ptr s) {
auto& db = get_db();
auto& cf = db.local().find_column_family(s->id());
return db.local().obtain_reader_permit(cf, "repair-meta", db::no_timeout, {}).then([s = std::move(s),
&cf,
return db.local().obtain_reader_permit(db.local().find_column_family(s->id()), "repair-meta", db::no_timeout, {}).then([s = std::move(s),
this,
from,
repair_meta_id,
@@ -3226,7 +3224,7 @@ repair_service::insert_repair_meta(
compaction_time] (reader_permit permit) mutable {
node_repair_meta_id id{from, repair_meta_id};
auto rm = seastar::make_shared<repair_meta>(*this,
cf,
get_db().local().find_column_family(s->id()),
s,
std::move(permit),
range,

View File

@@ -14,16 +14,8 @@ namespace repair {
future<table_dropped> table_sync_and_check(replica::database& db, service::migration_manager& mm, const table_id& uuid) {
if (mm.use_raft()) {
abort_on_expiry aoe(lowres_clock::now() + std::chrono::seconds{10});
auto& as = aoe.abort_source();
auto sub = mm.get_abort_source().subscribe([&as] () noexcept {
if (!as.abort_requested()) {
as.request_abort();
}
});
// Trigger read barrier to synchronize schema.
co_await mm.get_group0_barrier().trigger(as);
co_await mm.get_group0_barrier().trigger(mm.get_abort_source());
}
co_return !db.column_family_exists(uuid);

View File

@@ -101,6 +101,8 @@ public:
size_t memtable_count() const noexcept;
// Returns minimum timestamp from memtable list
api::timestamp_type min_memtable_timestamp() const;
// Returns true if memtable(s) contains key.
bool memtable_has_key(const dht::decorated_key& key) const;
// Add sstable to main set
void add_sstable(sstables::shared_sstable sstable);
// Add sstable to maintenance set

View File

@@ -332,7 +332,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
"_read_concurrency_sem",
max_inactive_queue_length(),
_cfg.reader_concurrency_semaphore_serialize_limit_multiplier,
_cfg.reader_concurrency_semaphore_kill_limit_multiplier)
_cfg.reader_concurrency_semaphore_kill_limit_multiplier,
_cfg.reader_concurrency_semaphore_cpu_concurrency)
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
, _streaming_concurrency_sem(
@@ -341,7 +342,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
"_streaming_concurrency_sem",
std::numeric_limits<size_t>::max(),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
-utils::updateable_value(std::numeric_limits<uint32_t>::max()))
+utils::updateable_value(std::numeric_limits<uint32_t>::max()),
+utils::updateable_value(uint32_t(1)))
// No limits, just for accounting.
, _compaction_concurrency_sem(reader_concurrency_semaphore::no_limits{}, "compaction")
, _system_read_concurrency_sem(
@@ -367,8 +369,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
_cfg.compaction_rows_count_warning_threshold,
_cfg.compaction_collection_elements_count_warning_threshold))
, _nop_large_data_handler(std::make_unique<db::nop_large_data_handler>())
-, _user_sstables_manager(std::make_unique<sstables::sstables_manager>(*_large_data_handler, _cfg, feat, _row_cache_tracker, dbcfg.available_memory, sst_dir_sem.local(), [&stm]{ return stm.get()->get_my_id(); }, &sstm))
-, _system_sstables_manager(std::make_unique<sstables::sstables_manager>(*_nop_large_data_handler, _cfg, feat, _row_cache_tracker, dbcfg.available_memory, sst_dir_sem.local(), [&stm]{ return stm.get()->get_my_id(); }))
+, _user_sstables_manager(std::make_unique<sstables::sstables_manager>(*_large_data_handler, _cfg, feat, _row_cache_tracker, dbcfg.available_memory, sst_dir_sem.local(), [&stm]{ return stm.get()->get_my_id(); }, dbcfg.streaming_scheduling_group, &sstm))
+, _system_sstables_manager(std::make_unique<sstables::sstables_manager>(*_nop_large_data_handler, _cfg, feat, _row_cache_tracker, dbcfg.available_memory, sst_dir_sem.local(), [&stm]{ return stm.get()->get_my_id(); }, dbcfg.streaming_scheduling_group))
, _result_memory_limiter(dbcfg.available_memory / 10)
, _data_listeners(std::make_unique<db::data_listeners>())
, _mnotifier(mn)
@@ -1388,7 +1390,7 @@ keyspace::make_column_family_config(const schema& s, const database& db) const {
cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
cfg.view_update_concurrency_semaphore_limit = _config.view_update_concurrency_semaphore_limit;
cfg.data_listeners = &db.data_listeners();
-cfg.enable_compacting_data_for_streaming_and_repair = db_config.enable_compacting_data_for_streaming_and_repair();
+cfg.enable_compacting_data_for_streaming_and_repair = db_config.enable_compacting_data_for_streaming_and_repair;
return cfg;
}
@@ -1904,8 +1906,8 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
auto slice = query::partition_slice(std::move(cr_ranges), std::move(static_columns),
std::move(regular_columns), { }, { }, query::max_rows);
-return do_with(std::move(slice), std::move(m), std::vector<locked_cell>(),
-[this, &cf, timeout, trace_state = std::move(trace_state), op = cf.write_in_progress()] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks) mutable {
+return do_with(std::move(slice), std::move(m), cf.write_in_progress(), std::vector<locked_cell>(),
+[this, &cf, timeout, trace_state = std::move(trace_state)] (const query::partition_slice& slice, mutation& m, const utils::phased_barrier::operation& op, std::vector<locked_cell>& locks) mutable {
tracing::trace(trace_state, "Acquiring counter locks");
return cf.lock_counter_cells(m, timeout).then([&, m_schema = cf.schema(), trace_state = std::move(trace_state), timeout, this] (std::vector<locked_cell> lcs) mutable {
locks = std::move(lcs);

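The counter-update fix relies on seastar::do_with keeping its bound objects alive until the returned future resolves, unlike a lambda capture, which dies with the lambda object. A minimal sketch (not part of the diff), with op_guard as a hypothetical stand-in for utils::phased_barrier::operation:

#include <chrono>
#include <seastar/core/do_with.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sleep.hh>

// Hypothetical stand-in for utils::phased_barrier::operation: its destructor
// marks the write as no longer in progress.
struct op_guard {
    ~op_guard() { /* signal "write finished" */ }
};

seastar::future<> counter_update_like() {
    // do_with keeps `op` alive until the future returned by the lambda
    // resolves, so the table cannot consider the write finished mid-update.
    // Capturing the guard in the lambda instead would destroy it as soon as
    // the lambda object is destroyed.
    return seastar::do_with(op_guard{}, [] (op_guard& op) {
        (void)op;
        return seastar::sleep(std::chrono::milliseconds(1));   // the actual update
    });
}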

@@ -239,6 +239,11 @@ memtable::find_or_create_partition(const dht::decorated_key& key) {
return i->partition();
}
+bool
+memtable::contains_partition(const dht::decorated_key& key) const {
+return partitions.find(key, dht::ring_position_comparator(*_schema)) != partitions.end();
+}
boost::iterator_range<memtable::partitions_type::const_iterator>
memtable::slice(const dht::partition_range& range) const {
if (query::is_single_partition(range)) {

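contains_partition probes the memtable's partition tree with a comparator-based find; compaction_group::memtable_has_key (further down) then asks every active memtable via std::ranges::any_of. A minimal self-contained sketch (not part of the diff) of that lookup idiom, with memtable_like as a hypothetical stand-in:

#include <algorithm>
#include <functional>
#include <memory>
#include <vector>

// Hypothetical stand-in for a memtable keyed by sorted partition keys.
struct memtable_like {
    std::vector<int> keys;
    bool contains_partition(const int& key) const {
        return std::ranges::binary_search(keys, key);
    }
};

// Same shape as compaction_group::memtable_has_key in the diff below.
bool memtable_has_key(const std::vector<std::shared_ptr<memtable_like>>& memtables, const int& key) {
    return std::ranges::any_of(memtables,
        std::bind(&memtable_like::contains_partition, std::placeholders::_1, std::cref(key)));
}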

@@ -216,6 +216,8 @@ public:
mutation_cleaner& cleaner() noexcept {
return _cleaner;
}
+bool contains_partition(const dht::decorated_key& key) const;
public:
memtable_list* get_memtable_list() noexcept {
return _memtable_list;


@@ -384,6 +384,14 @@ api::timestamp_type compaction_group::min_memtable_timestamp() const {
);
}
+bool compaction_group::memtable_has_key(const dht::decorated_key& key) const {
+if (_memtables->empty()) {
+return false;
+}
+return std::ranges::any_of(*_memtables,
+std::bind(&memtable::contains_partition, std::placeholders::_1, std::ref(key)));
+}
api::timestamp_type table::min_memtable_timestamp() const {
return *boost::range::min_element(compaction_groups() | boost::adaptors::transformed(std::mem_fn(&compaction_group::min_memtable_timestamp)));
}
@@ -1154,9 +1162,15 @@ void table::set_metrics() {
ms::make_counter("memtable_row_hits", _stats.memtable_app_stats.row_hits, ms::description("Number of rows overwritten by write operations in memtables"))(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}),
ms::make_counter("read_latency_count", ms::description("Number of reads"), [this] {return _stats.reads.histogram().count();})(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
ms::make_counter("write_latency_count", ms::description("Number of writes"), [this] {return _stats.writes.histogram().count();})(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty()
ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}),
ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return to_metrics_histogram(_stats.reads.histogram());})(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return to_metrics_histogram(_stats.writes.histogram());})(cf)(ks)(node_table_metrics).aggregate({seastar::metrics::shard_label}).set_skip_when_empty()
});
+if (this_shard_id() == 0) {
+_metrics.add_group("column_family", {
+ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)(ms::shard_label(""))
+});
+}
}
}
}
@@ -2007,9 +2021,18 @@ future<> table::clear() {
// NOTE: does not need to be futurized, but might eventually, depending on
// if we implement notifications, whatnot.
future<db::replay_position> table::discard_sstables(db_clock::time_point truncated_at) {
-assert(std::ranges::all_of(compaction_groups(), [this] (const compaction_group_ptr& cg) {
-return _compaction_manager.compaction_disabled(cg->as_table_state());
-}));
+// truncate_table_on_all_shards() disables compaction for the truncated
+// tables and views, so we normally expect compaction to be disabled on
+// this table. But as shown in issue #17543, it is possible that a new
+// materialized view was created right after truncation started, and it
+// would not have compaction disabled when this function is called on it.
+if (!schema()->is_view()) {
+if (!std::ranges::all_of(compaction_groups(), [this] (const compaction_group_ptr& cg) {
+return _compaction_manager.compaction_disabled(cg->as_table_state()); })) {
+utils::on_internal_error(fmt::format("compaction not disabled on table {}.{} during TRUNCATE",
+schema()->ks_name(), schema()->cf_name()));
+}
+}
struct pruner {
column_family& cf;
@@ -2670,7 +2693,7 @@ table::disable_auto_compaction() {
// - there are major compactions that additionally uses constant
// size backlog of shares,
// - sstables rewrites tasks that do the same.
-//
+//
// Setting NullCompactionStrategy is not an option due to the
// following reasons:
// - it will 0 backlog if suspending current compactions is not an
@@ -2939,6 +2962,9 @@ public:
api::timestamp_type min_memtable_timestamp() const override {
return _cg.min_memtable_timestamp();
}
+bool memtable_has_key(const dht::decorated_key& key) const override {
+return _cg.memtable_has_key(key);
+}
future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) override {
if (offstrategy) {
co_await _cg.update_sstable_lists_on_off_strategy_completion(std::move(desc));

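The truncate path now tolerates the #17543 race for views and reports the broken invariant on base tables through utils::on_internal_error instead of assert(), so the condition is logged with table names rather than aborting silently. A minimal sketch (not part of the diff) of that assert-to-error shape; internal_error is a hypothetical stand-in for utils::on_internal_error:

#include <stdexcept>
#include <string>

// Hypothetical stand-in for utils::on_internal_error: surface an invariant
// violation as a loggable, catchable error instead of an assert() abort.
[[noreturn]] void internal_error(const std::string& msg) {
    throw std::runtime_error(msg);
}

void check_compaction_disabled(bool disabled, const std::string& ks, const std::string& cf) {
    if (!disabled) {
        internal_error("compaction not disabled on table " + ks + "." + cf + " during TRUNCATE");
    }
}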

@@ -768,6 +768,16 @@ static std::ostream& map_as_cql_param(std::ostream& os, const std::map<sstring,
return os;
}
+// default impl assumes options are in a map.
+// implementations should override if not
+std::string schema_extension::options_to_string() const {
+std::ostringstream ss;
+ss << '{';
+map_as_cql_param(ss, ser::deserialize_from_buffer(serialize(), boost::type<default_map_type>(), 0));
+ss << '}';
+return ss.str();
+}
static std::ostream& column_definition_as_cql_key(std::ostream& os, const column_definition & cd) {
os << cd.name_as_cql_string();
os << " " << cd.type->cql3_type_name();
@@ -922,23 +932,19 @@ std::ostream& schema::describe(replica::database& db, std::ostream& os, bool wit
os << "}";
os << "\n AND crc_check_chance = " << crc_check_chance();
os << "\n AND dclocal_read_repair_chance = " << dc_local_read_repair_chance();
os << "\n AND dclocal_read_repair_chance = " << dc_local_read_repair_chance();
os << "\n AND default_time_to_live = " << default_time_to_live().count();
os << "\n AND gc_grace_seconds = " << gc_grace_seconds().count();
os << "\n AND max_index_interval = " << max_index_interval();
os << "\n AND memtable_flush_period_in_ms = " << memtable_flush_period();
os << "\n AND min_index_interval = " << min_index_interval();
os << "\n AND read_repair_chance = " << read_repair_chance();
os << "\n AND read_repair_chance = " << read_repair_chance();
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
os << "\n AND paxos_grace_seconds = " << paxos_grace_seconds().count();
auto tombstone_gc_str = tombstone_gc_options().to_sstring();
std::replace(tombstone_gc_str.begin(), tombstone_gc_str.end(), '"', '\'');
os << "\n AND tombstone_gc = " << tombstone_gc_str;
-if (cdc_options().enabled()) {
-os << "\n AND cdc = " << cdc_options().to_sstring();
+for (auto& [type, ext] : extensions()) {
+os << "\n AND " << type << " = " << ext->options_to_string();
}
if (is_view() && !is_index(db, view_info()->base_id(), *this)) {
auto is_sync_update = db::find_tag(*this, db::SYNCHRONOUS_VIEW_UPDATES_TAG_KEY);
if (is_sync_update.has_value()) {

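options_to_string() gives every schema extension a printable form for DESCRIBE, with a default that assumes map-shaped options. A minimal sketch (not part of the diff) of that virtual-with-default pattern, using std::string instead of sstring and a plain map instead of the serialized buffer:

#include <map>
#include <sstream>
#include <string>

struct schema_extension_like {
    virtual ~schema_extension_like() = default;
    using default_map_type = std::map<std::string, std::string>;
    virtual default_map_type options() const = 0;
    // Default impl assumes options are map-shaped; extensions whose options
    // are not a map override this, as the added comment says.
    virtual std::string options_to_string() const {
        std::ostringstream ss;
        ss << '{';
        bool first = true;
        for (const auto& [k, v] : options()) {
            ss << (first ? "" : ", ") << '\'' << k << "': '" << v << '\'';
            first = false;
        }
        ss << '}';
        return ss.str();
    }
};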

@@ -551,6 +551,10 @@ public:
virtual bool is_placeholder() const {
return false;
}
+using default_map_type = std::map<sstring, sstring>;
+// default impl assumes options are in a map.
+// implementations should override if not
+virtual std::string options_to_string() const;
};
struct schema_static_props {

Submodule seastar updated: cfb015d0b4...154a0c0ff2


@@ -948,7 +948,7 @@ future<> migration_manager::announce_with_raft(std::vector<mutation> schema, gro
},
guard, std::move(description));
-co_return co_await _group0_client.add_entry(std::move(group0_cmd), std::move(guard), &_as);
+co_return co_await _group0_client.add_entry(std::move(group0_cmd), std::move(guard), _as);
}
future<> migration_manager::announce_without_raft(std::vector<mutation> schema, group0_guard guard) {
@@ -983,7 +983,7 @@ future<> migration_manager::announce(std::vector<mutation> schema, group0_guard
future<group0_guard> migration_manager::start_group0_operation() {
assert(this_shard_id() == 0);
-return _group0_client.start_operation(&_as);
+return _group0_client.start_operation(_as);
}
/**


@@ -276,24 +276,29 @@ future<> view_update_backlog_broker::on_change(gms::inet_address endpoint, gms::
const char* start_bound = value.value().data();
char* end_bound;
for (auto* ptr : {&current, &max}) {
+errno = 0;
*ptr = std::strtoull(start_bound, &end_bound, 10);
-if (*ptr == ULLONG_MAX) {
-return make_ready_future();;
+if (errno == ERANGE) {
+return make_ready_future();
}
start_bound = end_bound + 1;
}
if (max == 0) {
return make_ready_future();
}
+errno = 0;
ticks = std::strtoll(start_bound, &end_bound, 10);
-if (ticks == 0 || ticks == LLONG_MAX || end_bound != value.value().data() + value.value().size()) {
+if (ticks == 0 || errno == ERANGE || end_bound != value.value().data() + value.value().size()) {
return make_ready_future();
}
auto backlog = view_update_backlog_timestamped{db::view::update_backlog{current, max}, ticks};
-auto[it, inserted] = _sp.local()._view_update_backlogs.try_emplace(endpoint, std::move(backlog));
-if (!inserted && it->second.ts < backlog.ts) {
-it->second = std::move(backlog);
-}
+return _sp.invoke_on_all([endpoint, backlog] (service::storage_proxy& sp) {
+auto[it, inserted] = sp._view_update_backlogs.try_emplace(endpoint, backlog);
+if (!inserted && it->second.ts < backlog.ts) {
+it->second = backlog;
+}
+return make_ready_future();
+});
}
return make_ready_future();
}

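The parsing fix exists because ULLONG_MAX is a legal strtoull result, so overflow must be detected through errno == ERANGE, with errno cleared before each call. A minimal sketch (not part of the diff) of the corrected idiom:

#include <cerrno>
#include <cstdlib>
#include <optional>
#include <string>

std::optional<unsigned long long> parse_u64(const std::string& s) {
    const char* start = s.c_str();
    char* end = nullptr;
    errno = 0;                                     // clear stale errno first
    unsigned long long v = std::strtoull(start, &end, 10);
    if (errno == ERANGE || end == start) {         // overflow, or no digits at all
        return std::nullopt;
    }
    return v;                                      // v == ULLONG_MAX is a valid parse
}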

@@ -7,6 +7,7 @@
* SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
*/
#pragma once
#include "seastar/core/semaphore.hh"
#include "service/paxos/proposal.hh"
#include "log.hh"
#include "utils/digest_algorithm.hh"
@@ -32,6 +33,7 @@ private:
class key_lock_map {
using semaphore = basic_semaphore<semaphore_default_exception_factory, clock_type>;
+using semaphore_units = semaphore_units<semaphore_default_exception_factory, clock_type>;
using map = std::unordered_map<dht::token, semaphore>;
semaphore& get_semaphore_for_key(const dht::token& key);
@@ -82,22 +84,15 @@ public:
key_lock_map& _map;
dht::token _key;
clock_type::time_point _timeout;
-bool _locked = false;
+key_lock_map::semaphore_units _units;
public:
-future<> lock() {
-auto f = _map.get_semaphore_for_key(_key).wait(_timeout, 1);
-_locked = true;
-return f;
+future<> lock () {
+return get_units(_map.get_semaphore_for_key(_key), 1, _timeout).then([this] (auto&& u) { _units = std::move(u); });
}
guard(key_lock_map& map, const dht::token& key, clock_type::time_point timeout) : _map(map), _key(key), _timeout(timeout) {};
-guard(guard&& o) noexcept : _map(o._map), _key(std::move(o._key)), _timeout(o._timeout), _locked(o._locked) {
-o._locked = false;
-}
+guard(guard&& o) = default;
~guard() {
-if (_locked) {
-_map.get_semaphore_for_key(_key).signal(1);
-_map.release_semaphore_for_key(_key);
-}
+_map.release_semaphore_for_key(_key);
}
};

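The guard rewrite swaps hand-rolled _locked bookkeeping for semaphore_units, whose destructor releases the semaphore and whose move constructor leaves the source empty, which is what makes the defaulted move constructor safe. A minimal sketch (not part of the diff) of the RAII shape with a plain seastar::semaphore:

#include <seastar/core/coroutine.hh>
#include <seastar/core/semaphore.hh>

seastar::future<> with_key_lock(seastar::semaphore& sem) {
    // get_units() waits for one unit and returns RAII units; the destructor
    // releases the semaphore even if the code below throws.
    auto units = co_await seastar::get_units(sem, 1);
    // ... critical section ...
    co_return;
}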

@@ -152,7 +152,7 @@ semaphore& raft_group0_client::operation_mutex() {
return _operation_mutex;
}
-future<> raft_group0_client::add_entry(group0_command group0_cmd, group0_guard guard, seastar::abort_source* as) {
+future<> raft_group0_client::add_entry(group0_command group0_cmd, group0_guard guard, seastar::abort_source& as) {
if (this_shard_id() != 0) {
// This should not happen since all places which construct `group0_guard` also check that they are on shard 0.
// Note: `group0_guard::impl` is private to this module, making this easy to verify.
@@ -172,7 +172,7 @@ future<> raft_group0_client::add_entry(group0_command group0_cmd, group0_guard g
do {
retry = false;
try {
-co_await _raft_gr.group0().add_entry(cmd, raft::wait_type::applied, as);
+co_await _raft_gr.group0().add_entry(cmd, raft::wait_type::applied, &as);
} catch (const raft::dropped_entry& e) {
logger.warn("add_entry: returned \"{}\". Retrying the command (prev_state_id: {}, new_state_id: {})",
e, group0_cmd.prev_state_id, group0_cmd.new_state_id);
@@ -234,7 +234,7 @@ static utils::UUID generate_group0_state_id(utils::UUID prev_state_id) {
return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
}
-future<group0_guard> raft_group0_client::start_operation(seastar::abort_source* as) {
+future<group0_guard> raft_group0_client::start_operation(seastar::abort_source& as) {
if (this_shard_id() != 0) {
on_internal_error(logger, "start_group0_operation: must run on shard 0");
}
@@ -242,12 +242,12 @@ future<group0_guard> raft_group0_client::start_operation(seastar::abort_source*
auto [upgrade_lock_holder, upgrade_state] = co_await get_group0_upgrade_state();
switch (upgrade_state) {
case group0_upgrade_state::use_post_raft_procedures: {
-auto operation_holder = co_await get_units(_operation_mutex, 1);
-co_await _raft_gr.group0().read_barrier(as);
+auto operation_holder = co_await get_units(_operation_mutex, 1, as);
+co_await _raft_gr.group0().read_barrier(&as);
// Take `_group0_read_apply_mutex` *after* read barrier.
// Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
-auto read_apply_holder = co_await hold_read_apply_mutex();
+auto read_apply_holder = co_await hold_read_apply_mutex(as);
auto observed_group0_state_id = co_await _sys_ks.get_last_group0_state_id();
auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);

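Changing abort_source* (defaulting to nullptr) into abort_source& is what fixes the stuck shutdown: callers can no longer forget to pass a source, and the semaphore wait itself becomes abortable via the get_units overload used above. A minimal sketch (not part of the diff) of the after-shape; start_operation_like is a hypothetical stand-in:

#include <seastar/core/abort_source.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/semaphore.hh>

// Before: `seastar::abort_source* as = nullptr` let callers skip cancellation,
// and a null source made the wait below unabortable at shutdown.
// After: the reference forces a source, and the wait is cancelled when it fires.
seastar::future<> start_operation_like(seastar::semaphore& operation_mutex, seastar::abort_source& as) {
    auto holder = co_await seastar::get_units(operation_mutex, 1, as);
    // ... proceed under the operation mutex; holder releases it on scope exit ...
}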

@@ -105,7 +105,7 @@ public:
// Call after `system_keyspace` is initialized.
future<> init();
-future<> add_entry(group0_command group0_cmd, group0_guard guard, seastar::abort_source* as = nullptr);
+future<> add_entry(group0_command group0_cmd, group0_guard guard, seastar::abort_source& as);
future<> add_entry_unguarded(group0_command group0_cmd, seastar::abort_source* as = nullptr);
@@ -129,7 +129,7 @@ public:
// FIXME?: this is kind of annoying for the user.
// we could forward the call to shard 0, have group0_guard keep a foreign_ptr to the internal data structures on shard 0,
// and add_entry would again forward to shard 0.
-future<group0_guard> start_operation(seastar::abort_source* as = nullptr);
+future<group0_guard> start_operation(seastar::abort_source& as);
template<typename Command>
requires std::same_as<Command, broadcast_table_query> || std::same_as<Command, write_mutations>


@@ -1453,6 +1453,17 @@ public:
timeout_cb();
}
}
+void no_targets() {
+// We don't have any live targets and we should complete the handler now.
+// Either we already stored sufficient hints to achieve CL and the handler
+// is completed successfully (see hint_to_dead_endpoints), or we don't achieve
+// CL because we didn't store sufficient hints and we don't have live targets,
+// so the handler is completed with error.
+if (!_cl_achieved) {
+_error = error::FAILURE;
+}
+_proxy->remove_response_handler(_id);
+}
void expire_at(storage_proxy::clock_type::time_point timeout) {
_expire_timer.arm(timeout);
}
@@ -2329,6 +2340,21 @@ bool storage_proxy::need_throttle_writes() const {
}
void storage_proxy::unthrottle() {
+// Here, we garbage-collect (from _throttled_writes) the response IDs which are no longer
+// relevant, because their handlers are gone.
+//
+// need_throttle_writes() may remain true for an indefinite amount of time, so without this piece of code,
+// _throttled_writes might also grow without any limit. We saw this happen in a throughput test once.
+//
+// Note that we only remove the irrelevant entries which are in front of the list.
+// We don't touch the middle of the list, so an irrelevant ID will still remain in the list if there is some
+// earlier ID which is still relevant. But since writes should have some reasonable finite timeout,
+// we assume that it's not a problem.
+//
+while (!_throttled_writes.empty() && !_response_handlers.contains(_throttled_writes.front())) {
+_throttled_writes.pop_front();
+}
while(!need_throttle_writes() && !_throttled_writes.empty()) {
auto id = _throttled_writes.front();
_throttled_writes.pop_front();
@@ -3924,6 +3950,16 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
auto& stats = handler_ptr->stats();
auto& handler = *handler_ptr;
auto& global_stats = handler._proxy->_global_stats;
+if (handler.get_targets().size() == 0) {
+// Usually we remove the response handler when receiving responses from all targets.
+// Here we don't have any live targets to get responses from, so we should complete
+// the write response handler immediately. Otherwise, it will remain active
+// until it timeouts.
+handler.no_targets();
+return;
+}
if (handler.get_targets().size() != 1 || !fbu::is_me(handler.get_targets()[0])) {
auto& topology = handler_ptr->_effective_replication_map_ptr->get_topology();
auto local_dc = topology.get_datacenter();
@@ -6088,7 +6124,7 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
db::consistency_level cl = cl_for_paxos == db::consistency_level::LOCAL_SERIAL ?
db::consistency_level::LOCAL_QUORUM : db::consistency_level::QUORUM;
-unsigned contentions;
+unsigned contentions = 0;
dht::token token = partition_ranges[0].start()->value().as_decorated_key().token();
utils::latency_counter lc;
@@ -6107,6 +6143,8 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
paxos::paxos_state::guard l = co_await paxos::paxos_state::get_cas_lock(token, write_timeout);
co_await utils::get_local_injector().inject("cas_timeout_after_lock", write_timeout + std::chrono::milliseconds(100));
while (true) {
// Finish the previous PAXOS round, if any, and, as a side effect, compute
// a ballot (round identifier) which is a) unique b) has good chances of being

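A minimal sketch (not part of the diff) of the garbage-collection loop added to unthrottle(): ids whose handlers are gone are dropped from the front of the FIFO, so the throttled-writes list stays bounded even while need_throttle_writes() remains true:

#include <deque>
#include <unordered_map>

struct handler {};

void gc_throttled_front(std::deque<int>& throttled,
                        const std::unordered_map<int, handler>& handlers) {
    // Only the stale prefix is removed; a stale id behind a live one stays,
    // which is fine because writes time out eventually.
    while (!throttled.empty() && !handlers.contains(throttled.front())) {
        throttled.pop_front();
    }
}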

@@ -954,7 +954,7 @@ class topology_coordinator {
};
future<group0_guard> start_operation() {
-auto guard = co_await _group0.client().start_operation(&_as);
+auto guard = co_await _group0.client().start_operation(_as);
if (_term != _raft.get_current_term()) {
throw term_changed_error{};
@@ -996,7 +996,7 @@ class topology_coordinator {
slogger.trace("raft topology: do update {} reason {}", updates, reason);
topology_change change{std::move(updates)};
group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
-co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard));
+co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: race while changing state: {}. Retrying", reason);
throw;
@@ -2642,7 +2642,7 @@ future<> storage_service::raft_initialize_discovery_leader(raft::server& raft_se
}
slogger.info("raft topology: adding myself as the first node to the topology");
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
auto insert_join_request_mutation = build_mutation_from_join_params(params, guard);
@@ -2656,7 +2656,7 @@ future<> storage_service::raft_initialize_discovery_leader(raft::server& raft_se
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
"bootstrap: adding myself as the first node to the topology");
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: bootstrap: concurrent operation is detected, retrying.");
}
@@ -2703,7 +2703,7 @@ future<> storage_service::update_topology_with_local_metadata(raft::server& raft
while (true) {
slogger.info("raft topology: refreshing topology to check if it's synchronized with local metadata");
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
if (synchronized()) {
break;
@@ -2738,7 +2738,7 @@ future<> storage_service::update_topology_with_local_metadata(raft::server& raft
std::move(change), guard, ::format("{}: update topology with local metadata", raft_server.id()));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: update topology with local metadata:"
" concurrent operation is detected, retrying.");
@@ -2947,10 +2947,8 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
// NORMAL doesn't necessarily mean UP (#14042). Wait for these nodes to be UP as well
// to reduce flakiness (we need them to be UP to perform CDC generation write and for repair/streaming).
//
-// This could be done in Raft topology mode as well, but the calculation of nodes to sync with
-// has to be done based on topology state machine instead of gossiper as it is here;
-// furthermore, the place in the code where we do this has to be different (it has to be coordinated
-// by the topology coordinator after it joins the node to the cluster).
+// We do it in Raft topology mode as well in join_node_response_handler. The calculation of nodes to
+// sync with is done based on topology state machine instead of gossiper as it is here.
//
// We calculate nodes to wait for based on token_metadata. Previously we would use gossiper
// directly for this, but gossiper may still contain obsolete entries from 1. replaced nodes
@@ -2958,23 +2956,29 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
// but here they may still be present if we're performing topology changes in quick succession.
// `token_metadata` has all host ID / token collisions resolved so in particular it doesn't contain
// these obsolete IPs. Refs: #14487, #14468
-auto& tm = get_token_metadata();
-auto ignore_nodes = ri
-? parse_node_list(_db.local().get_config().ignore_dead_nodes_for_replace(), tm)
-// TODO: specify ignore_nodes for bootstrap
-: std::unordered_set<gms::inet_address>{};
+//
+// We recalculate nodes in every step of the loop in wait_alive. For example, if we booted a new node
+// just after removing a different node, other nodes could still see the removed node as NORMAL. Then,
+// the joining node would wait for it to be UP, and wait_alive would time out. Recalculation fixes
+// this problem. Ref: #17526
+auto get_sync_nodes = [&] {
+auto ignore_nodes = ri
+? parse_node_list(_db.local().get_config().ignore_dead_nodes_for_replace(), get_token_metadata())
+// TODO: specify ignore_nodes for bootstrap
+: std::unordered_set<gms::inet_address>{};
+std::vector<gms::inet_address> sync_nodes;
+get_token_metadata().get_topology().for_each_node([&] (const locator::node* np) {
+auto ep = np->endpoint();
+if (!ignore_nodes.contains(ep) && (!ri || ep != ri->address)) {
+sync_nodes.push_back(ep);
+}
+});
+return sync_nodes;
+};
-std::vector<gms::inet_address> sync_nodes;
-tm.get_topology().for_each_node([&] (const locator::node* np) {
-auto ep = np->endpoint();
-if (!ignore_nodes.contains(ep) && (!ri || ep != ri->address)) {
-sync_nodes.push_back(ep);
-}
-});
-slogger.info("Waiting for nodes {} to be alive", sync_nodes);
-co_await _gossiper.wait_alive(sync_nodes, wait_for_live_nodes_timeout);
-slogger.info("Nodes {} are alive", sync_nodes);
+slogger.info("Waiting for other nodes to be alive. Current nodes: {}", get_sync_nodes());
+co_await _gossiper.wait_alive(get_sync_nodes, wait_for_live_nodes_timeout);
+slogger.info("Nodes {} are alive", get_sync_nodes());
}
assert(_group0);
@@ -4492,7 +4496,7 @@ future<> storage_service::raft_decomission() {
});
while (true) {
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
auto it = _topology_state_machine._topology.find(raft_server.id());
if (!it) {
@@ -4519,7 +4523,7 @@ future<> storage_service::raft_decomission() {
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard, ::format("decomission: request decomission for {}", raft_server.id()));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: decomission: concurrent operation is detected, retrying.");
continue;
@@ -4814,7 +4818,7 @@ future<> storage_service::raft_removenode(locator::host_id host_id, std::list<lo
auto id = raft::server_id{host_id.uuid()};
while (true) {
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
auto it = _topology_state_machine._topology.find(id);
@@ -4854,7 +4858,7 @@ future<> storage_service::raft_removenode(locator::host_id host_id, std::list<lo
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard, ::format("removenode: request remove for {}", id));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: removenode: concurrent operation is detected, retrying.");
continue;
@@ -5375,7 +5379,7 @@ future<> storage_service::raft_rebuild(sstring source_dc) {
auto& raft_server = _group0->group0_server();
while (true) {
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
auto it = _topology_state_machine._topology.find(raft_server.id());
if (!it) {
@@ -5401,7 +5405,7 @@ future<> storage_service::raft_rebuild(sstring source_dc) {
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard, ::format("rebuild: request rebuild for {} ({})", raft_server.id(), source_dc));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: rebuild: concurrent operation is detected, retrying.");
continue;
@@ -5420,7 +5424,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
while (true) {
slogger.info("raft topology: request check_and_repair_cdc_streams, refreshing topology");
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
auto curr_req = _topology_state_machine._topology.global_request;
if (curr_req && *curr_req != global_topology_request::new_cdc_generation) {
// FIXME: replace this with a queue
@@ -5446,7 +5450,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
::format("request check+repair CDC generation from {}", _group0->group0_server().id()));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: request check+repair CDC: concurrent operation is detected, retrying.");
continue;
@@ -6495,7 +6499,7 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
}
while (true) {
-auto guard = co_await _group0->client().start_operation(&_abort_source);
+auto guard = co_await _group0->client().start_operation(_abort_source);
if (const auto *p = _topology_state_machine._topology.find(params.host_id)) {
const auto& rs = p->second;
@@ -6526,7 +6530,7 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
format("raft topology: placing join request for {}", params.host_id));
try {
-co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_abort_source);
+co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _abort_source);
break;
} catch (group0_concurrent_modification&) {
slogger.info("raft topology: join_node_request: concurrent operation is detected, retrying.");

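Wrapping the node-set computation in get_sync_nodes and handing the lambda to wait_alive means every retry re-reads the current topology instead of a stale snapshot (ref: #17526). A minimal sketch (not part of the diff) of that recalculation shape; current_members is a hypothetical stand-in for reading token_metadata:

#include <functional>
#include <vector>

std::vector<int> current_members() { return {1, 2, 3}; }   // hypothetical: reads live topology

void wait_alive_like(const std::function<std::vector<int>()>& get_nodes) {
    for (;;) {
        auto nodes = get_nodes();    // recalculated on every retry, never stale
        if (!nodes.empty()) {        // stand-in for "all nodes are UP"
            break;
        }
    }
}

int main() {
    auto get_sync_nodes = [] { return current_members(); };
    wait_alive_like(get_sync_nodes);
}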

@@ -796,7 +796,7 @@ public:
_sst._shards = { shard };
_cfg.monitor->on_write_started(_data_writer->offset_tracker());
-_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance(), utils::filter_format::m_format);
+_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _sst._schema->bloom_filter_fp_chance(), utils::filter_format::m_format);
_pi_write_m.promoted_index_block_size = cfg.promoted_index_block_size;
_pi_write_m.promoted_index_auto_scale_threshold = cfg.promoted_index_auto_scale_threshold;
_index_sampling_state.summary_byte_cost = _cfg.summary_byte_cost;
@@ -884,7 +884,7 @@ void writer::init_file_writers() {
make_compressed_file_m_format_output_stream(
output_stream<char>(std::move(out)),
&_sst._components->compression,
-_schema.get_compressor_params()), _sst.filename(component_type::Data));
+_sst._schema->get_compressor_params()), _sst.filename(component_type::Data));
}
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Index).get0();
@@ -1454,7 +1454,7 @@ void writer::consume_end_of_stream() {
_sst._components->statistics.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(_sst_schema.header));
seal_statistics(_sst.get_version(), _sst._components->statistics, _collector,
-_sst._schema->get_partitioner().name(), _schema.bloom_filter_fp_chance(),
+_sst._schema->get_partitioner().name(), _sst._schema->bloom_filter_fp_chance(),
_sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key(), _enc_stats);
close_data_writer();
_sst.write_summary();


@@ -105,6 +105,25 @@ thread_local utils::updateable_value<bool> global_cache_index_pages(true);
logging::logger sstlog("sstable");
+template <typename T>
+const char* nullsafe_typename(T* x) noexcept {
+try {
+return typeid(*x).name();
+} catch (const std::bad_typeid&) {
+return "nullptr";
+}
+}
+// dynamic_cast, but calls on_internal_error on failure.
+template <typename Derived, typename Base>
+Derived* downcast_ptr(Base* x) {
+if (auto casted = dynamic_cast<Derived*>(x)) {
+return casted;
+} else {
+on_internal_error(sstlog, fmt::format("Bad downcast: expected {}, but got {}", typeid(Derived*).name(), nullsafe_typename(x)));
+}
+}
// Because this is a noop and won't hold any state, it is better to use a global than a
// thread_local. It will be faster, specially on non-x86.
struct noop_write_monitor final : public write_monitor {
@@ -1396,7 +1415,7 @@ void sstable::write_filter() {
return;
}
-auto f = static_cast<utils::filter::murmur3_bloom_filter *>(_components->filter.get());
+auto f = downcast_ptr<utils::filter::murmur3_bloom_filter>(_components->filter.get());
auto&& bs = f->bits();
auto filter_ref = sstables::filter_ref(f->num_hashes(), bs.get_storage());
@@ -2872,6 +2891,7 @@ sstable::unlink(storage::sync_dir sync) noexcept {
co_await std::move(remove_fut);
_stats.on_delete();
+_manager.on_unlink(this);
}
thread_local sstables_stats::stats sstables_stats::_shard_stats;

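downcast_ptr turns a silent static_cast into a checked dynamic_cast that reports type confusion loudly. A minimal self-contained sketch (not part of the diff) of using such a checked downcast, throwing instead of calling on_internal_error:

#include <stdexcept>
#include <string>
#include <typeinfo>

struct filter { virtual ~filter() = default; };
struct bloom_filter : filter { int num_hashes = 3; };

template <typename Derived, typename Base>
Derived* checked_downcast(Base* x) {
    if (auto p = dynamic_cast<Derived*>(x)) {
        return p;
    }
    // static_cast here would silently misinterpret memory on a type mismatch
    throw std::runtime_error(std::string("bad downcast from ") + (x ? typeid(*x).name() : "nullptr"));
}

int main() {
    bloom_filter bf;
    filter* f = &bf;
    return checked_downcast<bloom_filter>(f)->num_hashes == 3 ? 0 : 1;
}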

@@ -6,6 +6,7 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
+#include <seastar/coroutine/switch_to.hh>
#include "log.hh"
#include "sstables/sstables_manager.hh"
#include "sstables/partition_index_cache.hh"
@@ -21,7 +22,7 @@ namespace sstables {
logging::logger smlogger("sstables_manager");
sstables_manager::sstables_manager(
-db::large_data_handler& large_data_handler, const db::config& dbcfg, gms::feature_service& feat, cache_tracker& ct, size_t available_memory, directory_semaphore& dir_sem, noncopyable_function<locator::host_id()>&& resolve_host_id, storage_manager* shared)
+db::large_data_handler& large_data_handler, const db::config& dbcfg, gms::feature_service& feat, cache_tracker& ct, size_t available_memory, directory_semaphore& dir_sem, noncopyable_function<locator::host_id()>&& resolve_host_id, scheduling_group maintenance_sg, storage_manager* shared)
: _storage(shared)
, _available_memory(available_memory)
, _large_data_handler(large_data_handler), _db_config(dbcfg), _features(feat), _cache_tracker(ct)
@@ -34,6 +35,7 @@ sstables_manager::sstables_manager(
utils::updateable_value(std::numeric_limits<uint32_t>::max()))
, _dir_semaphore(dir_sem)
, _resolve_host_id(std::move(resolve_host_id))
+, _maintenance_sg(std::move(maintenance_sg))
{
_components_reloader_status = components_reloader_fiber();
}
@@ -169,6 +171,8 @@ size_t sstables_manager::get_memory_available_for_reclaimable_components() {
}
future<> sstables_manager::components_reloader_fiber() {
+co_await coroutine::switch_to(_maintenance_sg);
sstlog.trace("components_reloader_fiber start");
while (true) {
co_await _sstable_deleted_event.when();
@@ -278,4 +282,9 @@ void sstables_manager::unplug_system_keyspace() noexcept {
_sys_ks = nullptr;
}
+void sstables_manager::on_unlink(sstable* sst) {
+// Remove the sst from manager's reclaimed list to prevent any attempts to reload its components.
+_reclaimed.erase(*sst);
+}
} // namespace sstables

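components_reloader_fiber() now hops onto the maintenance scheduling group at its start, so the reload work is accounted and prioritized there rather than in whichever group constructed the manager. A minimal sketch (not part of the diff) of that pattern, assuming Seastar's coroutine::switch_to as included above:

#include <seastar/core/coroutine.hh>
#include <seastar/core/scheduling.hh>
#include <seastar/coroutine/switch_to.hh>

seastar::future<> background_fiber(seastar::scheduling_group maintenance_sg) {
    // Everything after this co_await runs in maintenance_sg, so a long-lived
    // reload loop is charged to the maintenance group, not the caller's.
    co_await seastar::coroutine::switch_to(maintenance_sg);
    // while (...) { co_await reload_one(); }
}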

@@ -124,8 +124,10 @@ private:
// after system_keyspace initialization.
noncopyable_function<locator::host_id()> _resolve_host_id;
+scheduling_group _maintenance_sg;
public:
-explicit sstables_manager(db::large_data_handler& large_data_handler, const db::config& dbcfg, gms::feature_service& feat, cache_tracker&, size_t available_memory, directory_semaphore& dir_sem, noncopyable_function<locator::host_id()>&& resolve_host_id, storage_manager* shared = nullptr);
+explicit sstables_manager(db::large_data_handler& large_data_handler, const db::config& dbcfg, gms::feature_service& feat, cache_tracker&, size_t available_memory, directory_semaphore& dir_sem, noncopyable_function<locator::host_id()>&& resolve_host_id, scheduling_group maintenance_sg = current_scheduling_group(), storage_manager* shared = nullptr);
virtual ~sstables_manager();
shared_sstable make_sstable(schema_ptr schema, sstring table_dir,
@@ -177,6 +179,9 @@ public:
future<> delete_atomically(std::vector<shared_sstable> ssts);
+// To be called by the sstable to signal its unlinking
+void on_unlink(sstable* sst);
private:
void add(sstable* sst);
// Transition the sstable to the "inactive" state. It has no


@@ -74,6 +74,9 @@ public:
return sstable_directory::delete_with_pending_deletion_log;
}
virtual future<> remove_by_registry_entry(utils::UUID uuid, entry_descriptor desc) override;
+virtual future<uint64_t> free_space() const override {
+return seastar::fs_avail(prefix());
+}
virtual sstring prefix() const override { return _dir; }
};
@@ -471,6 +474,10 @@ public:
return delete_with_system_keyspace;
}
virtual future<> remove_by_registry_entry(utils::UUID uuid, entry_descriptor desc) override;
+virtual future<uint64_t> free_space() const override {
+// assumes infinite space on s3 (https://aws.amazon.com/s3/faqs/#How_much_data_can_I_store).
+return make_ready_future<uint64_t>(std::numeric_limits<uint64_t>::max());
+}
virtual sstring prefix() const override { return _location; }
};


@@ -64,6 +64,8 @@ public:
virtual future<> destroy(const sstable& sst) = 0;
virtual noncopyable_function<future<>(std::vector<shared_sstable>)> atomic_deleter() const = 0;
virtual future<> remove_by_registry_entry(utils::UUID uuid, entry_descriptor desc) = 0;
+// Free space available in the underlying storage.
+virtual future<uint64_t> free_space() const = 0;
virtual sstring prefix() const = 0;
};


@@ -10,6 +10,7 @@
import pytest
import time
import requests
+from botocore.exceptions import ClientError
from util import create_test_table, new_test_table, random_string, full_scan, full_query, multiset, list_tables
@@ -538,3 +539,35 @@ def test_lsi_and_gsi_same_name(dynamodb):
}
])
table.delete()
# Test that the LSI table can be addressed in Scylla's REST API (obviously,
# since this test is for the REST API, it is Scylla-only and can't be run on
# DynamoDB).
# At the time this test was written, the LSI's name has a "!" in it, so this
# test reproduces a bug in URL decoding (#5883). But the goal of this test
# isn't to insist that a table backing an LSI must have a specific name,
# but rather that whatever name it does have - it can be addressed.
def test_lsi_name_rest_api(test_table_lsi_1, rest_api):
# See that the LSI is listed in list of tables. It will be a table
# whose CQL name contains the Alternator table's name, and the
# LSI's name ('hello'). As of this writing, it will actually be
# alternator_<name>:<name>!:<lsi> - but the test doesn't enshrine this.
resp = requests.get(f'{rest_api}/column_family/name')
resp.raise_for_status()
lsi_rest_name = None
for name in resp.json():
if test_table_lsi_1.name in name and 'hello' in name:
lsi_rest_name = name
break
assert lsi_rest_name
# Attempt to run a request on this LSI's table name "lsi_rest_name".
# We'll use the compaction_strategy request here, but if for some
# reason in the future we decide to drop that request, any other
# request will be fine.
resp = requests.get(f'{rest_api}/column_family/compaction_strategy/{lsi_rest_name}')
resp.raise_for_status()
# Let's make things difficult for the server by URL encoding the
# lsi_rest_name - exposing issue #5883.
encoded_lsi_rest_name = requests.utils.quote(lsi_rest_name)
resp = requests.get(f'{rest_api}/column_family/compaction_strategy/{encoded_lsi_rest_name}')
resp.raise_for_status()


@@ -190,16 +190,18 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
with_allocator(region.allocator(), [&] {
as(region, [&] {
auto rand = std::default_random_engine();
-auto size_dist = std::uniform_int_distribution<unsigned>(1, 1 << 12);
+// use twice the max_chunk_capacity() as upper limit to test if
+// reserve_partial() can reserve capacity across multiple chunks.
+auto max_test_size = lsa::chunked_managed_vector<uint8_t>::max_chunk_capacity() * 2;
+auto size_dist = std::uniform_int_distribution<unsigned>(1, max_test_size);
for (int i = 0; i < 100; ++i) {
lsa::chunked_managed_vector<uint8_t> v;
-const auto orig_size = size_dist(rand);
-auto size = orig_size;
-while (size) {
-size = v.reserve_partial(size);
+const auto size = size_dist(rand);
+while (v.capacity() != size) {
+v.reserve_partial(size);
}
-BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
+BOOST_REQUIRE_EQUAL(v.capacity(), size);
}
});
});


@@ -180,13 +180,12 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
auto size_dist = std::uniform_int_distribution<unsigned>(1, 1 << 12);
for (int i = 0; i < 100; ++i) {
-utils::chunked_vector<uint8_t> v;
-const auto orig_size = size_dist(rand);
-auto size = orig_size;
-while (size) {
-size = v.reserve_partial(size);
+utils::chunked_vector<uint8_t, 512> v;
+const auto size = size_dist(rand);
+while (v.capacity() != size) {
+v.reserve_partial(size);
}
-BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
+BOOST_REQUIRE_EQUAL(v.capacity(), size);
}
}

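Both tests now exercise the changed reserve_partial contract: each call grows capacity by at most one chunk, and callers loop until capacity() reaches the target (the old version instead returned the remaining count). A minimal sketch (not part of the diff) of that contract, with chunked_vector_like as a hypothetical stand-in:

#include <cstddef>

struct chunked_vector_like {
    std::size_t cap = 0;
    static constexpr std::size_t chunk = 512;
    std::size_t capacity() const { return cap; }
    void reserve_partial(std::size_t target) {
        // grow by at most one chunk per call, keeping each step cheap
        if (cap < target) {
            cap += (target - cap < chunk) ? (target - cap) : chunk;
        }
    }
};

int main() {
    chunked_vector_like v;
    const std::size_t size = 1300;
    while (v.capacity() != size) {
        v.reserve_partial(size);     // several calls may be needed
    }
}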

@@ -304,24 +304,21 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
cfg.commitlog_sync_period_in_ms = 1;
return cl_test(cfg, [](commitlog& log) {
auto sem = make_lw_shared<semaphore>(0);
-auto segments = make_lw_shared<segment_names>();
+auto segments = make_lw_shared<std::set<sstring>>();
// add a flush handler that simply says we're done with the range.
auto r = log.add_flush_handler([&log, sem, segments](cf_id_type id, replay_position pos) {
-auto f = make_ready_future<>();
-// #6195 only get segment list at first callback. We can (not often)
-// be called again, but reading segment list at that point might (will)
-// render same list as in the diff check below.
-if (segments->empty()) {
-*segments = log.get_active_segment_names();
-// Verify #5899 - file size should not exceed the config max.
-f = parallel_for_each(*segments, [](sstring filename) {
-return file_size(filename).then([](uint64_t size) {
-BOOST_REQUIRE_LE(size, max_size_mb * 1024 * 1024);
-});
-});
+auto active_segments = log.get_active_segment_names();
+for (auto&& s : active_segments) {
+segments->insert(s);
}
-return f.then([&log, sem, id] {
+// Verify #5899 - file size should not exceed the config max.
+return parallel_for_each(active_segments, [](sstring filename) {
+return file_size(filename).then([](uint64_t size) {
+BOOST_REQUIRE_LE(size, max_size_mb * 1024 * 1024);
+});
+}).then([&log, sem, id] {
log.discard_completed_segments(id);
sem->signal();
});
@@ -339,7 +336,8 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
set->insert(h.release().id);
});
}).then([&log, segments]() {
-auto diff = segment_diff(log, *segments);
+segment_names names(segments->begin(), segments->end());
+auto diff = segment_diff(log, names);
auto nn = diff.size();
auto dn = log.get_num_segments_destroyed();


@@ -26,6 +26,8 @@
#include <seastar/core/future-util.hh>
#include <seastar/core/sleep.hh>
#include "transport/messages/result_message.hh"
#include "transport/messages/result_message_base.hh"
#include "types/types.hh"
#include "utils/big_decimal.hh"
#include "types/user.hh"
#include "types/map.hh"
@@ -4125,7 +4127,7 @@ SEASTAR_TEST_CASE(test_describe_simple_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"
},
{"cf1", "CREATE TABLE ks.cf1 (\n"
" pk blob,\n"
@@ -4149,7 +4151,7 @@ SEASTAR_TEST_CASE(test_describe_simple_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"
},
{"CF2", "CREATE TABLE ks.\"CF2\" (\n"
" pk blob,\n"
@@ -4173,7 +4175,7 @@ SEASTAR_TEST_CASE(test_describe_simple_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"
},
{"Cf3", "CREATE TABLE ks.\"Cf3\" (\n"
" pk blob,\n"
@@ -4198,7 +4200,7 @@ SEASTAR_TEST_CASE(test_describe_simple_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"
},
{"cf4", "CREATE TABLE ks.cf4 (\n"
" pk blob,\n"
@@ -4222,7 +4224,7 @@ SEASTAR_TEST_CASE(test_describe_simple_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"
}
};
@@ -4267,7 +4269,7 @@ SEASTAR_TEST_CASE(test_describe_view_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n";
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n";
std::unordered_map<std::string, std::string> cql_create_tables {
{"cf_view", "CREATE MATERIALIZED VIEW \"KS\".cf_view AS\n"
@@ -4291,7 +4293,7 @@ SEASTAR_TEST_CASE(test_describe_view_schema) {
" AND read_repair_chance = 0\n"
" AND speculative_retry = '99.0PERCENTILE'\n"
" AND paxos_grace_seconds = 43200\n"
" AND tombstone_gc = {'mode':'timeout','propagation_delay_in_seconds':'3600'};\n"},
" AND tombstone_gc = {'mode': 'timeout','propagation_delay_in_seconds': '3600'};\n"},
{"cf_index_index", "CREATE INDEX cf_index ON \"KS\".\"cF\"(col2);"},
{"cf_index1_index", "CREATE INDEX cf_index1 ON \"KS\".\"cF\"(pk);"},
{"cf_index2_index", "CREATE INDEX cf_index2 ON \"KS\".\"cF\"(pk1);"},
@@ -5361,6 +5363,72 @@ SEASTAR_TEST_CASE(test_parallelized_select_counter_type) {
});
}
SEASTAR_TEST_CASE(test_single_partition_aggregation_is_not_parallelized) {
// It's pointless from performance pov to parallelize
// aggregation queries which reads only single partition.
return with_parallelized_aggregation_enabled_thread([](cql_test_env& e) {
auto& qp = e.local_qp();
const auto stat_parallelized = qp.get_cql_stats().select_parallelized;
e.execute_cql("CREATE TABLE tbl (pk int, ck int, col int, PRIMARY KEY (pk, ck));").get();
const int value_count = 10;
for (int pk = 0; pk < 2; pk++) {
for (int c = 0; c < value_count; c++) {
e.execute_cql(format("INSERT INTO tbl (pk, ck, col) VALUES ({:d}, {:d}, {:d});", pk, c, c)).get();
}
}
const auto result1 = e.execute_cql("SELECT COUNT(*) FROM tbl WHERE pk = 1;").get();
assert_that(result1).is_rows().with_rows({
{long_type->decompose(int64_t(value_count))}
});
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
const auto result2 = e.execute_cql("SELECT COUNT(*) FROM tbl WHERE pk = 1 AND ck = 1;").get();
assert_that(result2).is_rows().with_rows({
{long_type->decompose(int64_t(1))}
});
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
const auto result3 = e.execute_cql("SELECT COUNT(*) FROM tbl WHERE token(pk) = 1;").get();
// We don't check value of count(*) here but only if it wasn't parallelized
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
const auto result4 = e.execute_cql("SELECT COUNT(*) FROM tbl WHERE pk = 1 AND pk = 2;").get();
assert_that(result4).is_rows().with_rows({
{long_type->decompose(int64_t(0))}
});
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
e.execute_cql("CREATE TABLE tbl2 (pk1 int, pk2 int, ck int, col int, PRIMARY KEY((pk1, pk2), ck));").get();
for (int pk1 = 0; pk1 < 2; pk1++) {
for (int pk2 = 0; pk2 < 2; pk2++) {
for (int c = 0; c < value_count; c++) {
e.execute_cql(format("INSERT INTO tbl2 (pk1, pk2, ck, col) VALUES ({:d}, {:d}, {:d}, {:d});", pk1, pk2, c, c)).get();
}
}
}
const auto result_pk12 = e.execute_cql("SELECT COUNT(*) FROM tbl2 WHERE pk1 = 1 AND pk2 = 0;").get();
assert_that(result_pk12).is_rows().with_rows({
{long_type->decompose(int64_t(value_count))}
});
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
// Query with only partly restricted partition key requires `ALLOW FILTERING` clause
// and we doesn't parallelize queries which need filtering.
// See issue #19369.
const auto result_pk1 = e.execute_cql("SELECT COUNT(*) FROM tbl2 WHERE pk1 = 1 ALLOW FILTERING;").get();
// This query contains also column for pk1
assert_that(result_pk1).is_rows().with_rows({
{long_type->decompose(int64_t(value_count * 2)), int32_type->decompose(int32_t(1))}
});
BOOST_CHECK_EQUAL(stat_parallelized, qp.get_cql_stats().select_parallelized);
});
}
static future<> with_udf_and_parallel_aggregation_enabled_thread(std::function<void(cql_test_env&)>&& func) {
auto db_cfg_ptr = make_shared<db::config>();
auto& db_cfg = *db_cfg_ptr;


@@ -520,6 +520,73 @@ SEASTAR_TEST_CASE(test_zone_reclaiming_preserves_free_size) {
});
}
// Tests the intended usage of hold_reserve.
//
// Sets up a reserve, exhausts memory, opens the reserve,
// checks that this allows us to do multiple additional allocations
// without failing.
SEASTAR_THREAD_TEST_CASE(test_hold_reserve) {
logalloc::region region;
logalloc::allocating_section as;
// We will fill LSA with an intrusive list of small entries.
// We make it intrusive to avoid any containers which do std allocations,
// since it could make the test imprecise.
struct entry {
using link = boost::intrusive::list_member_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
link _link;
// We are going to fill the entire memory with this.
// Padding makes the entries bigger to speed up the test.
std::array<char, 8192> _padding;
};
using list = boost::intrusive::list<entry,
boost::intrusive::member_hook<entry, entry::link, &entry::_link>,
boost::intrusive::constant_time_size<false>>;
as.with_reserve(region, [&] {
with_allocator(region.allocator(), [&] {
assert(sizeof(entry) + 128 < current_allocator().preferred_max_contiguous_allocation());
logalloc::reclaim_lock rl(region);
// Reserve a segment.
auto guard = std::make_optional<hold_reserve>(128*1024);
// Fill the entire available memory with LSA objects.
list entries;
auto clean_up = defer([&entries] {
entries.clear_and_dispose([] (entry *e) {current_allocator().destroy(e);});
});
auto alloc_entry = [] () {
return current_allocator().construct<entry>();
};
try {
while (true) {
entries.push_back(*alloc_entry());
}
} catch (const std::bad_alloc&) {
// expected
}
// Sanity check. We should be OOM at this point.
BOOST_REQUIRE_THROW(hold_reserve(128*1024), std::bad_alloc);
BOOST_REQUIRE_THROW(alloc_entry(), std::bad_alloc);
// Release the reserve.
guard.reset();
// Sanity check.
BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
// Freeing up a segment should be enough to allocate multiple small entries;
for (int i = 0; i < 10; ++i) {
entries.push_back(*alloc_entry());
}
});
});
}
// No point in testing contiguous memory allocation in debug mode
#ifndef SEASTAR_DEFAULT_ALLOCATOR
SEASTAR_THREAD_TEST_CASE(test_can_reclaim_contiguous_memory_with_mixed_allocations) {


@@ -1058,6 +1058,9 @@ SEASTAR_TEST_CASE(failed_flush_prevents_writes) {
std::cerr << "Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n";
return make_ready_future<>();
#else
+auto db_config = make_shared<db::config>();
+db_config->unspooled_dirty_soft_limit.set(1.0);
return do_with_cql_env_thread([](cql_test_env& env) {
replica::database& db = env.local_db();
service::migration_manager& mm = env.migration_manager().local();
@@ -1090,22 +1093,22 @@ SEASTAR_TEST_CASE(failed_flush_prevents_writes) {
// Trigger flush
auto f = t.flush();
-BOOST_ASSERT(eventually_true([&] {
+BOOST_REQUIRE(eventually_true([&] {
return db.cf_stats()->failed_memtables_flushes_count - failed_memtables_flushes_count >= 4;
}));
// The flush failed, make sure there is still data in memtable.
-BOOST_ASSERT(t.min_memtable_timestamp() < api::max_timestamp);
+BOOST_REQUIRE_LT(t.min_memtable_timestamp(), api::max_timestamp);
utils::get_local_injector().disable("table_seal_active_memtable_reacquire_write_permit");
-BOOST_ASSERT(eventually_true([&] {
+BOOST_REQUIRE(eventually_true([&] {
// The error above is no longer being injected, so
// seal_active_memtable retry loop should eventually succeed
return t.min_memtable_timestamp() == api::max_timestamp;
}));
std::move(f).get();
-});
+}, db_config);
#endif
}


@@ -611,6 +611,35 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_waits_on_permits
}
}
static void require_can_admit(schema_ptr schema, reader_concurrency_semaphore& semaphore, bool expected_can_admit, const char* description,
seastar::compat::source_location sl = seastar::compat::source_location::current()) {
testlog.trace("Running admission scenario {}, with exepcted_can_admit={}", description, expected_can_admit);
const auto stats_before = semaphore.get_stats();
auto admit_fut = semaphore.obtain_permit(schema, "require_can_admit", 1024, db::timeout_clock::now(), {});
admit_fut.wait();
const bool can_admit = !admit_fut.failed();
if (can_admit) {
admit_fut.ignore_ready_future();
} else {
// Make sure we have a timeout exception, not something else
BOOST_REQUIRE_THROW(std::rethrow_exception(admit_fut.get_exception()), semaphore_timed_out);
}
const auto stats_after = semaphore.get_stats();
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + uint64_t(can_admit));
// Deliberately not checking `reads_enqueued_for_admission`, a read can be enqueued temporarily during the admission process.
if (can_admit == expected_can_admit) {
testlog.trace("admission scenario '{}' with expected_can_admit={} passed at {}:{}", description, expected_can_admit, sl.file_name(),
sl.line());
} else {
BOOST_FAIL(fmt::format("admission scenario '{}' with expected_can_admit={} failed at {}:{}\ndiagnostics: {}", description,
expected_can_admit, sl.file_name(), sl.line(), semaphore.dump_diagnostics()));
}
};
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {
simple_schema s;
const auto schema = s.schema();
@@ -620,30 +649,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {
auto require_can_admit = [&] (bool expected_can_admit, const char* description,
seastar::compat::source_location sl = seastar::compat::source_location::current()) {
testlog.trace("Running admission scenario {}, with exepcted_can_admit={}", description, expected_can_admit);
const auto stats_before = semaphore.get_stats();
auto admit_fut = semaphore.obtain_permit(schema, get_name(), 1024, db::timeout_clock::now(), {});
admit_fut.wait();
const bool can_admit = !admit_fut.failed();
if (can_admit) {
admit_fut.ignore_ready_future();
} else {
// Make sure we have a timeout exception, not something else
BOOST_REQUIRE_THROW(std::rethrow_exception(admit_fut.get_exception()), semaphore_timed_out);
}
const auto stats_after = semaphore.get_stats();
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + uint64_t(can_admit));
// Deliberately not checking `reads_enqueued_for_admission`, a read can be enqueued temporarily during the admission process.
if (can_admit == expected_can_admit) {
testlog.trace("admission scenario '{}' with expected_can_admit={} passed at {}:{}", description, expected_can_admit, sl.file_name(),
sl.line());
} else {
BOOST_FAIL(fmt::format("admission scenario '{}' with expected_can_admit={} failed at {}:{}\ndiagnostics: {}", description,
expected_can_admit, sl.file_name(), sl.line(), semaphore.dump_diagnostics()));
}
::require_can_admit(schema, semaphore, expected_can_admit, description, sl);
};
require_can_admit(true, "semaphore in initial state");
@@ -1944,3 +1950,57 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_execution_stage_wakeu
permit2_fut.get();
}
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_cpu_concurrency) {
simple_schema s;
const auto schema = s.schema();
utils::updateable_value_source<uint32_t> cpu_concurrency{2};
const int32_t initial_count = 4;
const uint32_t initial_memory = 4 * 1024;
const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
reader_concurrency_semaphore semaphore(
utils::updateable_value<int>(initial_count),
initial_memory,
get_name(),
100,
utils::updateable_value<uint32_t>(serialize_multiplier),
utils::updateable_value<uint32_t>(kill_multiplier),
utils::updateable_value(cpu_concurrency));
auto stop_sem = deferred_stop(semaphore);
auto require_can_admit = [&] (bool expected_can_admit, const char* description,
seastar::compat::source_location sl = seastar::compat::source_location::current()) {
::require_can_admit(schema, semaphore, expected_can_admit, description, sl);
};
auto permit1 = semaphore.obtain_permit(schema, get_name(), 1024, db::timeout_clock::now(), {}).get();
require_can_admit(true, "!need_cpu");
{
reader_permit::need_cpu_guard ncpu_guard{permit1};
require_can_admit(true, "need_cpu < cpu_concurrency");
auto permit2 = semaphore.obtain_permit(schema, get_name(), 1024, db::timeout_clock::now(), {}).get();
// no change
require_can_admit(true, "need_cpu < cpu_concurrency");
{
reader_permit::need_cpu_guard ncpu_guard{permit2};
require_can_admit(false, "need_cpu == cpu_concurrency");
cpu_concurrency.set(3);
require_can_admit(true, "after set(3): need_cpu < cpu_concurrency");
cpu_concurrency.set(2);
require_can_admit(false, "after set(2): need_cpu == cpu_concurrency");
}
require_can_admit(true, "need_cpu < cpu_concurrency");
}
require_can_admit(true, "!need_cpu");
}
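The live update exercised here goes through the utils::updateable_value observer pair: a utils::updateable_value_source<T> owns the value, utils::updateable_value<T> instances track it, and set() on the source is visible to readers without reconstructing the semaphore. A minimal sketch, relying only on the calls visible in the test plus the assumption that an updateable_value is read through operator():

utils::updateable_value_source<uint32_t> source{2};
utils::updateable_value<uint32_t> observer(source);
// observer() == 2 at this point
source.set(3);
// observer() == 3 on the next read; this is how the semaphore picks up
// the new cpu_concurrency while permits are outstanding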

View File

@@ -5504,3 +5504,80 @@ SEASTAR_TEST_CASE(test_compression_premature_eof) {
}
});
}
// A reproducer for scylladb/scylladb#16065.
// Creates an sstable with a newer schema, and populates
// it with a reader created with an older schema.
//
// Before the fixes, it would have resulted in an assert violation.
SEASTAR_TEST_CASE(test_alter_bloom_fp_chance_during_write) {
return test_env::do_with_async([] (test_env& env) {
auto s1 = schema_builder("ks", "t")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v", utf8_type, column_kind::regular_column)
.set_bloom_filter_fp_chance(1.0)
.build();
auto s2 = schema_builder(s1)
.set_bloom_filter_fp_chance(0.01)
.build();
auto ts = api::new_timestamp();
auto m = mutation(s1, partition_key::from_single_value(*s1, serialized(0)));
auto val = std::string(1000, '0');
m.set_clustered_cell(clustering_key::make_empty(), "v", val, ts);
auto mt = make_lw_shared<replica::memtable>(s1);
mt->apply(m);
auto sst = env.make_sstable(s2, sstable_version_types::me);
sst->write_components(mt->make_flat_reader(s1, env.make_reader_permit()), 1, s1, env.manager().configure_writer(), mt->get_encoding_stats()).get();
sstable_assertions sa(env, sst);
sa.load();
m.upgrade(s2);
auto assertions = assert_that(sa.make_reader());
assertions.produces(m);
assertions.produces_end_of_stream();
});
}
// Reproducer for scylladb/scylladb#16065.
// Creates an sstable with a newer schema, and populates
// it with a reader created with an older schema.
//
// Before the fixes, it would result in a "compress is not supported" error.
SEASTAR_TEST_CASE(test_alter_compression_during_write) {
return test_env::do_with_async([] (test_env& env) {
auto s1 = schema_builder("ks", "t")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v", utf8_type, column_kind::regular_column)
.set_compressor_params(std::map<sstring, sstring>{
})
.build();
auto s2 = schema_builder(s1)
.set_compressor_params(std::map<sstring, sstring>{
{"sstable_compression", "org.apache.cassandra.io.compress.ZstdCompressor"}
})
.build();
auto ts = api::new_timestamp();
auto m = mutation(s1, partition_key::from_single_value(*s1, serialized(0)));
auto val = std::string(1000, '0');
m.set_clustered_cell(clustering_key::make_empty(), "v", val, ts);
auto mt = make_lw_shared<replica::memtable>(s1);
mt->apply(m);
auto sst = env.make_sstable(s2, sstable_version_types::me);
sst->write_components(mt->make_flat_reader(s1, env.make_reader_permit()), 1, s1, env.manager().configure_writer(), mt->get_encoding_stats()).get();
sstable_assertions sa(env, sst);
sa.load();
m.upgrade(s2);
auto assertions = assert_that(sa.make_reader());
assertions.produces(m);
assertions.produces_end_of_stream();
});
}

View File

@@ -3503,6 +3503,15 @@ SEASTAR_TEST_CASE(test_twcs_partition_estimate) {
});
}
static compaction_descriptor get_reshaping_job(sstables::compaction_strategy& cs, const std::vector<shared_sstable>& input,
const schema_ptr& s, reshape_mode mode, uint64_t free_storage_space = std::numeric_limits<uint64_t>::max()) {
reshape_config cfg {
.mode = mode,
.free_storage_space = free_storage_space,
};
return cs.get_reshaping_job(input, s, cfg);
}
SEASTAR_TEST_CASE(stcs_reshape_test) {
return test_env::do_with_async([] (test_env& env) {
simple_schema ss;
@@ -3520,8 +3529,8 @@ SEASTAR_TEST_CASE(stcs_reshape_test) {
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered,
s->compaction_strategy_options());
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size());
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::relaxed).sstables.size());
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size());
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::relaxed).sstables.size());
});
}
@@ -3543,7 +3552,7 @@ SEASTAR_TEST_CASE(lcs_reshape_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size() == 256);
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size() == 256);
}
// all overlapping
{
@@ -3555,7 +3564,7 @@ SEASTAR_TEST_CASE(lcs_reshape_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
}
// single sstable
{
@@ -3563,7 +3572,7 @@ SEASTAR_TEST_CASE(lcs_reshape_test) {
auto key = keys[0].key();
sstables::test(sst).set_values_for_leveled_strategy(1 /* size */, 0 /* level */, 0 /* max ts */, key, key);
BOOST_REQUIRE(cs.get_reshaping_job({ sst }, s, reshape_mode::strict).sstables.size() == 0);
BOOST_REQUIRE(get_reshaping_job(cs, { sst }, s, reshape_mode::strict).sstables.size() == 0);
}
});
}
@@ -3780,7 +3789,7 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size(), disjoint_sstable_count);
BOOST_REQUIRE_EQUAL(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size(), disjoint_sstable_count);
}
{
@@ -3793,7 +3802,7 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
sstables.push_back(std::move(sst));
}
auto reshaping_count = cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size();
auto reshaping_count = get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size();
BOOST_REQUIRE_GE(reshaping_count, disjoint_sstable_count - min_threshold + 1);
BOOST_REQUIRE_LE(reshaping_count, disjoint_sstable_count);
}
@@ -3811,7 +3820,7 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size(), 0);
BOOST_REQUIRE_EQUAL(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size(), 0);
}
{
@@ -3824,7 +3833,7 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE_EQUAL(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size(), uint64_t(s->max_compaction_threshold()));
BOOST_REQUIRE_EQUAL(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size(), uint64_t(s->max_compaction_threshold()));
}
{
@@ -3859,7 +3868,7 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
}
auto check_mode_correctness = [&] (reshape_mode mode) {
auto ret = cs.get_reshaping_job(sstables, s, mode);
auto ret = get_reshaping_job(cs, sstables, s, mode);
BOOST_REQUIRE_EQUAL(ret.sstables.size(), uint64_t(s->max_compaction_threshold()));
// fail if any file doesn't belong to set of small files
bool has_big_sized_files = boost::algorithm::any_of(ret.sstables, [&] (const sstables::shared_sstable& sst) {
@@ -3871,6 +3880,45 @@ SEASTAR_TEST_CASE(twcs_reshape_with_disjoint_set_test) {
check_mode_correctness(reshape_mode::strict);
check_mode_correctness(reshape_mode::relaxed);
}
{
// create set of 256 disjoint ssts that spans multiple windows (essentially what happens in off-strategy during node op)
std::vector<sstables::shared_sstable> sstables;
sstables.reserve(disjoint_sstable_count);
for (auto i = 0U; i < disjoint_sstable_count; i++) {
std::vector<mutation> muts;
muts.reserve(5);
for (auto j = 0; j < 5; j++) {
muts.push_back(make_row(i, std::chrono::hours(j * 8)));
}
auto sst = make_sstable_containing(sst_gen, std::move(muts));
sstables.push_back(std::move(sst));
}
auto job_size = [] (auto&& sst_range) {
return boost::accumulate(sst_range | boost::adaptors::transformed(std::mem_fn(&sstable::bytes_on_disk)), uint64_t(0));
};
auto free_space_for_reshaping_sstables = [&job_size] (auto&& sst_range) {
return job_size(std::move(sst_range)) * (time_window_compaction_strategy::reshape_target_space_overhead * 100);
};
// all sstables can be reshaped in a single round if there's enough space
{
uint64_t free_space = free_space_for_reshaping_sstables(boost::make_iterator_range(sstables));
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict, free_space).sstables.size() == sstables.size());
}
// only a subset can be reshaped in a single round to respect the 10% space overhead
{
const size_t sstables_that_fit_in_target_overhead = 10;
uint64_t free_space = free_space_for_reshaping_sstables(boost::make_iterator_range(sstables.begin(), sstables.begin() + sstables_that_fit_in_target_overhead));
auto target_space_overhead = free_space * time_window_compaction_strategy::reshape_target_space_overhead;
auto job = get_reshaping_job(cs, sstables, s, reshape_mode::strict, free_space);
BOOST_REQUIRE(job.sstables.size() < sstables.size());
BOOST_REQUIRE(job_size(boost::make_iterator_range(job.sstables)) <= target_space_overhead);
}
}
});
}
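To make the space-overhead arithmetic in the last two checks concrete, assume reshape_target_space_overhead is 0.1 (the "10% space overhead" the comment refers to):

// free_space = job_size(ssts) * (0.1 * 100) = 10 * job_size(ssts)
// target_space_overhead = free_space * 0.1 = job_size(ssts)
// So when free_space is derived from the first 10 sstables, the selected
// job's bytes on disk must stay within those 10 sstables' total size,
// which is exactly what the final BOOST_REQUIREs assert.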
@@ -3913,7 +3961,7 @@ SEASTAR_TEST_CASE(stcs_reshape_overlapping_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size() == disjoint_sstable_count);
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size() == disjoint_sstable_count);
}
{
@@ -3926,7 +3974,7 @@ SEASTAR_TEST_CASE(stcs_reshape_overlapping_test) {
sstables.push_back(std::move(sst));
}
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
BOOST_REQUIRE(get_reshaping_job(cs, sstables, s, reshape_mode::strict).sstables.size() == uint64_t(s->max_compaction_threshold()));
}
});
}

View File

@@ -3085,6 +3085,10 @@ future<> test_sstable_bytes_correctness(sstring tname, test_env_config cfg) {
auto sst = make_sstable_containing(env.make_sstable(schema), muts);
auto free_space = sst->get_storage().free_space().get();
BOOST_REQUIRE(free_space > 0);
testlog.info("prefix: {}, free space: {}", sst->get_storage().prefix(), free_space);
auto get_bytes_on_disk_from_storage = [&] (const sstables::shared_sstable& sst) {
uint64_t bytes_on_disk = 0;
auto& underlying_storage = const_cast<sstables::storage&>(sst->get_storage());
@@ -3231,6 +3235,11 @@ std::pair<shared_sstable, size_t> create_sstable_with_bloom_filter(test_env& env
return {sst, sst_bf_memory};
}
void dispose_and_stop_tracking_bf_memory(shared_sstable&& sst, test_env_sstables_manager& mgr) {
mgr.remove_sst_from_reclaimed(sst.get());
shared_sstable::dispose(sst.release().release());
}
SEASTAR_TEST_CASE(test_sstable_manager_auto_reclaim_and_reload_of_bloom_filter) {
return test_env::do_with_async([] (test_env& env) {
simple_schema ss;
@@ -3268,7 +3277,7 @@ SEASTAR_TEST_CASE(test_sstable_manager_auto_reclaim_and_reload_of_bloom_filter)
// Test auto reload - disposing sst3 should trigger reload of the
// smallest filter in the reclaimed list, which is sst1's bloom filter.
shared_sstable::dispose(sst3.release().release());
dispose_and_stop_tracking_bf_memory(std::move(sst3), sst_mgr);
REQUIRE_EVENTUALLY_EQUAL(sst1->filter_memory_size(), sst1_bf_memory);
// only sst4's bloom filter memory should be reported as reclaimed
REQUIRE_EVENTUALLY_EQUAL(sst_mgr.get_total_memory_reclaimed(), sst4_bf_memory);
@@ -3278,7 +3287,7 @@ SEASTAR_TEST_CASE(test_sstable_manager_auto_reclaim_and_reload_of_bloom_filter)
}, {
// limit available memory to the sstables_manager to test reclaiming.
// this will set the reclaim threshold to 100 bytes.
.available_memory = 1000
.available_memory = 500
});
}
@@ -3333,7 +3342,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_during_reload) {
utils::get_local_injector().enable("reload_reclaimed_components/pause", true);
// dispose sst2 to trigger reload of sst1's bloom filter
shared_sstable::dispose(sst2.release().release());
dispose_and_stop_tracking_bf_memory(std::move(sst2), sst_mgr);
// _total_reclaimable_memory will be updated when the reload begins; wait for it.
REQUIRE_EVENTUALLY_EQUAL(sst_mgr.get_total_reclaimable_memory(), sst1_bf_memory);
@@ -3356,6 +3365,60 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_during_reload) {
}, {
// limit available memory to the sstables_manager to test reclaiming.
// this will set the reclaim threshold to 100 bytes.
.available_memory = 1000
.available_memory = 500
});
}
SEASTAR_TEST_CASE(test_bloom_filter_reload_after_unlink) {
return test_env::do_with_async([] (test_env& env) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
fmt::print("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
return;
#endif
simple_schema ss;
auto schema = ss.schema();
auto mut = mutation(schema, ss.make_pkey(1));
mut.partition().apply_insert(*schema, ss.make_ckey(1), ss.new_timestamp());
// bloom filter will be reclaimed automatically due to low memory
auto sst = make_sstable_containing(env.make_sstable(schema), {mut});
auto& sst_mgr = env.manager();
BOOST_REQUIRE_EQUAL(sst->filter_memory_size(), 0);
auto memory_reclaimed = sst_mgr.get_total_memory_reclaimed();
// manager's reclaimed set has the sst now
auto& reclaimed_set = sst_mgr.get_reclaimed_set();
BOOST_REQUIRE_EQUAL(reclaimed_set.size(), 1);
BOOST_REQUIRE_EQUAL(reclaimed_set.begin()->get_filename(), sst->get_filename());
// hold a copy of shared sst object in async thread to test reload after unlink
utils::get_local_injector().enable("test_bloom_filter_reload_after_unlink");
auto async_sst_holder = seastar::async([sst] {
// do nothing just hold a copy of sst and wait for message signalling test completion
utils::get_local_injector().inject_with_handler("test_bloom_filter_reload_after_unlink", [] (auto& handler) {
auto ret = handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{5});
return ret;
}).get();
});
// unlink the sst and release the object
sst->unlink().get();
sst.release();
// reclaimed set should be now empty but the total memory reclaimed should
// be still the same as the sst object is not deactivated yet due to a copy
// being alive in the async thread.
BOOST_REQUIRE_EQUAL(sst_mgr.get_reclaimed_set().size(), 0);
BOOST_REQUIRE_EQUAL(sst_mgr.get_total_memory_reclaimed(), memory_reclaimed);
// message async thread to complete waiting and thus release its copy of sst, triggering deactivation
utils::get_local_injector().receive_message("test_bloom_filter_reload_after_unlink");
async_sst_holder.get();
REQUIRE_EVENTUALLY_EQUAL(sst_mgr.get_total_memory_reclaimed(), 0);
}, {
// set available memory = 0 to force reclaim the bloom filter
.available_memory = 0
});
}

View File

@@ -615,7 +615,10 @@ private:
_sl_controller.invoke_on_all(&qos::service_level_controller::start).get();
_sys_ks.start(std::ref(_qp), std::ref(_db)).get();
auto stop_sys_kd = defer([this] { _sys_ks.stop().get(); });
auto stop_sys_kd = defer([this] {
_sys_ks.invoke_on_all(&db::system_keyspace::shutdown).get();
_sys_ks.stop().get();
});
replica::distributed_loader::init_system_keyspace(_sys_ks, _erm_factory, _db).get();
_db.local().maybe_init_schema_commitlog();
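In the updated code, shutdown() and stop() run inside a single deferred action, back-to-back and in that order. For reference, a minimal sketch of how deferred actions unwind (illustrative only, not code from this patch):

// seastar::defer runs its callbacks at scope exit, newest first (LIFO):
auto created_first = seastar::defer([] { /* runs last  */ });
auto created_later = seastar::defer([] { /* runs first */ });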
@@ -785,9 +788,6 @@ private:
}
group0_client.init().get();
auto stop_system_keyspace = defer([this] {
_sys_ks.invoke_on_all(&db::system_keyspace::shutdown).get();
});
auto shutdown_db = defer([this] {
_db.invoke_on_all(&replica::database::shutdown).get();

View File

@@ -54,6 +54,14 @@ public:
size_t get_total_reclaimable_memory() {
return _total_reclaimable_memory;
}
void remove_sst_from_reclaimed(sstable* sst) {
_reclaimed.erase(*sst);
}
auto& get_reclaimed_set() {
return _reclaimed;
}
};
struct test_env_config {

View File

@@ -110,6 +110,7 @@ public:
api::timestamp_type min_memtable_timestamp() const override {
return table().min_memtable_timestamp();
}
bool memtable_has_key(const dht::decorated_key& key) const override { return false; }
future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) override {
return table().as_table_state().on_compaction_completion(std::move(desc), offstrategy);
}
@@ -201,7 +202,7 @@ test_env::impl::impl(test_env_config cfg, sstables::storage_manager* sstm)
, feature_service(gms::feature_config_from_db_config(*db_config))
, mgr(cfg.large_data_handler == nullptr ? nop_ld_handler : *cfg.large_data_handler, *db_config,
feature_service, cache_tracker, cfg.available_memory, dir_sem,
[host_id = locator::host_id::create_random_id()]{ return host_id; }, sstm)
[host_id = locator::host_id::create_random_id()]{ return host_id; }, current_scheduling_group(), sstm)
, semaphore(reader_concurrency_semaphore::no_limits{}, "sstables::test_env")
, storage(std::move(cfg.storage))
{ }

View File

@@ -77,7 +77,7 @@ def _wrap_future(driver_response_future: ResponseFuture, all_pages: bool = False
# TODO: paged result query handling (iterable?)
def run_async(self, *args, all_pages = False, **kwargs) -> asyncio.Future:
def run_async(self, *args, all_pages = True, **kwargs) -> asyncio.Future:
"""Execute a CQL query asynchronously by wrapping the driver's future"""
# The default timeouts should have been more than enough, but in some
# extreme cases with a very slow debug build running on a slow or very busy

View File

@@ -52,9 +52,10 @@ class MinioServer:
self.default_user = 'minioadmin'
self.default_pass = 'minioadmin'
self.bucket_name = 'testbucket'
self.access_key = ''.join(random.choice(string.hexdigits) for i in range(16))
self.secret_key = ''.join(random.choice(string.hexdigits) for i in range(32))
self.access_key = os.environ.get(self.ENV_ACCESS_KEY, ''.join(random.choice(string.hexdigits) for i in range(16)))
self.secret_key = os.environ.get(self.ENV_SECRET_KEY, ''.join(random.choice(string.hexdigits) for i in range(32)))
self.log_filename = (self.tempdir / 'minio').with_suffix(".log")
self.old_env = dict()
def check_server(self, port):
s = socket.socket()
@@ -154,8 +155,11 @@ class MinioServer:
with open(path, 'w', encoding='ascii') as config_file:
endpoint = {'name': address,
'port': port,
'aws_access_key_id': acc_key,
'aws_secret_access_key': secret_key,
# don't put credentials here. We're exporting env vars, which should
# be picked up properly by scylla.
# https://github.com/scylladb/scylla-pkg/issues/3845
#'aws_access_key_id': acc_key,
#'aws_secret_access_key': secret_key,
'aws_region': region,
}
yaml.dump({'endpoints': [endpoint]}, config_file)
@@ -184,6 +188,37 @@ class MinioServer:
return cmd
def _set_environ(self):
self.old_env = dict(os.environ)
os.environ[self.ENV_CONFFILE] = f'{self.config_file}'
os.environ[self.ENV_ADDRESS] = f'{self.address}'
os.environ[self.ENV_PORT] = f'{self.port}'
os.environ[self.ENV_BUCKET] = f'{self.bucket_name}'
os.environ[self.ENV_ACCESS_KEY] = f'{self.access_key}'
os.environ[self.ENV_SECRET_KEY] = f'{self.secret_key}'
def _get_environs(self):
return [self.ENV_CONFFILE,
self.ENV_ADDRESS,
self.ENV_PORT,
self.ENV_BUCKET,
self.ENV_ACCESS_KEY,
self.ENV_SECRET_KEY]
def _unset_environ(self):
for env in self._get_environs():
if self.old_env[env] is not None:
os.environ[env] = self.old_env[env]
else:
del os.environ[env]
def print_environ(self):
msgs = []
for key in self._get_environs():
value = os.environ[key]
msgs.append(f'export {key}={value}')
print('\n'.join(msgs))
async def start(self):
if self.srv_exe is None:
self.logger.info("Minio not installed, get it from https://dl.minio.io/server/minio/release/linux-amd64/minio and put into PATH")
@@ -206,13 +241,7 @@ class MinioServer:
return
self.create_conf_file(self.address, self.port, self.access_key, self.secret_key, self.DEFAULT_REGION, self.config_file)
os.environ[self.ENV_CONFFILE] = f'{self.config_file}'
os.environ[self.ENV_ADDRESS] = f'{self.address}'
os.environ[self.ENV_PORT] = f'{self.port}'
os.environ[self.ENV_BUCKET] = f'{self.bucket_name}'
os.environ[self.ENV_ACCESS_KEY] = f'{self.access_key}'
os.environ[self.ENV_SECRET_KEY] = f'{self.secret_key}'
self._set_environ()
try:
alias = 'local'
self.log_to_file(f'Configuring access to {self.address}:{self.port}')
@@ -238,6 +267,7 @@ class MinioServer:
if not self.cmd:
return
self._unset_environ()
try:
self.cmd.kill()
except ProcessLookupError:

View File

@@ -75,8 +75,7 @@ def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str
'alternator-streams',
'consistent-topology-changes',
'broadcast-tables',
'keyspace-storage-options',
'tablets'],
'keyspace-storage-options'],
'consistent_cluster_management': True,
@@ -396,7 +395,9 @@ class ScyllaServer:
"""Start an installed server. May be used for restarts."""
env = os.environ.copy()
env.clear() # pass empty env to make sure user's SCYLLA_HOME has no impact
# remove from env to make sure user's SCYLLA_HOME has no impact
env.pop('SCYLLA_HOME', None)
self.cmd = await asyncio.create_subprocess_exec(
self.exe,
*self.cmdline_options,

View File

@@ -115,7 +115,7 @@ async def test_paged_result(manager, random_tables):
# Check only 1 page
stmt = SimpleStatement(f"SELECT * FROM {table} ALLOW FILTERING", fetch_size = fetch_size)
res = await cql.run_async(stmt)
res = await cql.run_async(stmt, all_pages=False)
assert len(res) == fetch_size
# Check all pages

View File

@@ -0,0 +1,182 @@
#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Multi-node tests for Alternator.
#
# Please note that most tests for Alternator are single-node tests and can
# be found in the test/alternator directory. Most functional testing of the
# many different syntax features that Alternator provides doesn't need more
# than a single node, and should also be able to run on DynamoDB
# - not just on Alternator, which the test/alternator framework makes possible.
# So only the minority of tests that do need a bigger cluster should be here.
import pytest
import asyncio
import logging
import time
import boto3
import botocore
logger = logging.getLogger(__name__)
# Convenience function to open a connection to Alternator usable by the
# AWS SDK.
alternator_config = {
'alternator_port': 8000,
'alternator_write_isolation': 'only_rmw_uses_lwt',
'alternator_ttl_period_in_seconds': '0.5',
}
def get_alternator(ip):
url = f"http://{ip}:{alternator_config['alternator_port']}"
return boto3.resource('dynamodb', endpoint_url=url,
region_name='us-east-1',
aws_access_key_id='alternator',
aws_secret_access_key='secret_pass',
config=botocore.client.Config(
retries={"max_attempts": 0},
read_timeout=300)
)
# Alternator convenience function for fetching the entire result set of a
# query into an array of items.
def full_query(table, ConsistentRead=True, **kwargs):
response = table.query(ConsistentRead=ConsistentRead, **kwargs)
items = response['Items']
while 'LastEvaluatedKey' in response:
response = table.query(ExclusiveStartKey=response['LastEvaluatedKey'],
ConsistentRead=ConsistentRead, **kwargs)
items.extend(response['Items'])
return items
# FIXME: boto3 is NOT async. So all tests that use it are not really async.
# We could use the aioboto3 library to write a really asynchronous test, or
# implement an async wrapper around the boto3 functions (e.g., run them
# in a separate thread) ourselves.
@pytest.fixture(scope="module")
async def alternator3(manager_internal):
"""A fixture with a 3-node Alternator cluster that can be shared between
multiple tests. These tests should not modify the cluster's topology,
and should each use unique table names and/or unique keys to avoid
being confused by other tests.
Returns the manager object and 3 boto3 resource objects for making
DynamoDB API requests to each of the nodes in the Alternator cluster.
"""
manager = manager_internal
servers = [await manager.server_add(config=alternator_config) for _ in range(3)]
yield [manager] + [get_alternator(server.ip_addr) for server in servers]
await manager.stop()
test_table_prefix = 'alternator_Test_'
def unique_table_name():
current_ms = int(round(time.time() * 1000))
# If unique_table_name() is called twice in the same millisecond...
if unique_table_name.last_ms >= current_ms:
current_ms = unique_table_name.last_ms + 1
unique_table_name.last_ms = current_ms
return test_table_prefix + str(current_ms)
unique_table_name.last_ms = 0
async def test_alternator_ttl_scheduling_group(alternator3):
"""A reproducer for issue #18719: The expiration scans and deletions
initiated by the Alternator TTL feature are supposed to run entirely in
the "streaming" scheduling group. But because of a bug in inheritence
of scheduling groups through RPC, some of the work ended up being done
on the "statement" scheduling group.
This test verifies that Alternator TTL work is done on the right
scheduling group.
This test assumes that the cluster is not concurrently busy with
running any other workload - so we won't see any work appearing
in the wrong scheduling group. We can assume this because we don't
run multiple tests in parallel on the same cluster.
"""
manager, alternator, *_ = alternator3
table = alternator.create_table(TableName=unique_table_name(),
BillingMode='PAY_PER_REQUEST',
KeySchema=[
{'AttributeName': 'p', 'KeyType': 'HASH' },
],
AttributeDefinitions=[
{'AttributeName': 'p', 'AttributeType': 'N' },
])
# Enable expiration (TTL) on attribute "expiration"
table.meta.client.update_time_to_live(TableName=table.name, TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': True})
# Insert N rows, setting them all to expire 3 seconds from now.
N = 100
expiration = int(time.time())+3
with table.batch_writer() as batch:
for p in range(N):
batch.put_item(Item={'p': p, 'expiration': expiration})
# Unfortunately, Alternator has no way of doing the writes above with
# CL=ALL, only CL=QUORUM. So at this point we're not sure all the writes
# above have completed. We want to wait until they are over, so that we
# won't measure any of those writes in the statement scheduling group.
# Let's do it by checking the metrics of background writes and wait for
# them to drop to zero.
ips = [server.ip_addr for server in await manager.running_servers()]
timeout = time.time() + 60
while True:
if time.time() > timeout:
pytest.fail("timed out waiting for background writes to complete")
bg_writes = 0
for ip in ips:
metrics = await manager.metrics.query(ip)
bg_writes += metrics.get('scylla_storage_proxy_coordinator_background_writes')
if bg_writes == 0:
break # done waiting for the background writes to finish
await asyncio.sleep(0.1)
# Get the current amount of work (in CPU ms) done across all nodes and
# shards in different scheduling groups. We expect this to increase
# considerably for the streaming group while expiration scanning is
# proceeding, but not increase at all for the statement group because
# there are no requests being executed.
async def get_cpu_metrics():
ms_streaming = 0
ms_statement = 0
for ip in ips:
metrics = await manager.metrics.query(ip)
ms_streaming += metrics.get('scylla_scheduler_runtime_ms', {'group': 'streaming'})
ms_statement += metrics.get('scylla_scheduler_runtime_ms', {'group': 'statement'})
return (ms_streaming, ms_statement)
ms_streaming_before, ms_statement_before = await get_cpu_metrics()
# Wait until all rows expire, and get the CPU metrics again. All items
# were set to expire in 3 seconds, and the expiration thread is set up
# in alternator_config to scan the whole table in 0.5 seconds, and the
# whole table is just 100 rows, so we expect all the data to be gone in
# 4 seconds. Let's wait 5 seconds just in case. Even if not all the data
# will have been deleted by then, we do expect some deletions to have
# happened, and certainly several scans, all taking CPU which we expect
# to be in the right scheduling group.
await asyncio.sleep(5)
ms_streaming_after, ms_statement_after = await get_cpu_metrics()
# As a sanity check, verify some of the data really expired, so there
# was some TTL work actually done. We actually expect all of the data
# to have been expired by now, but in some extremely slow builds and
# test machines, this may not be the case.
assert N > table.scan(ConsistentRead=True, Select='COUNT')['Count']
# Between the calls to get_cpu_metrics() above, several expiration scans
# took place (we configured scans to happen every 0.5 seconds), and also
# a lot of deletes when the expiration time was reached. We expect all
# that work to have happened in the streaming group, not statement group,
# so "ratio" calculate below should be tiny, even exactly zero. Before
# issue #18719 was fixed, it was not tiny at all - 0.58.
# Just in case there are other unknown things happening, let's assert it
# is <0.1 instead of zero.
ms_streaming = ms_streaming_after - ms_streaming_before
ms_statement = ms_statement_after - ms_statement_before
ratio = ms_statement / ms_streaming
assert ratio < 0.1
table.delete()

View File

@@ -0,0 +1,51 @@
#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import asyncio
import pytest
import time
import logging
import requests
import re
from cassandra.cluster import ConnectionException, NoHostAvailable # type: ignore
from cassandra.query import SimpleStatement, ConsistencyLevel
from test.pylib.manager_client import ManagerClient
logger = logging.getLogger(__name__)
# Write with RF=1 and CL=ANY to a dead node should write hints and succeed
@pytest.mark.asyncio
async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient):
node_count = 2
servers = [await manager.server_add() for _ in range(node_count)]
cql = manager.get_cql()
await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)")
await manager.server_stop_gracefully(servers[1].server_id)
def get_hints_written_count(server):
c = 0
metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text
pattern = re.compile("^scylla_hints_manager_written")
for metric in metrics.split('\n'):
if pattern.match(metric) is not None:
c += int(float(metric.split()[1]))
return c
hints_before = get_hints_written_count(servers[0])
# Some of the inserts will be targeted to the dead node.
# The coordinator doesn't have live targets to send the write to, but it should write a hint.
for i in range(100):
await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
# Verify hints are written
hints_after = get_hints_written_count(servers[0])
assert hints_after > hints_before

View File

@@ -0,0 +1,37 @@
#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import asyncio
import time
from test.pylib.rest_client import inject_error
from test.pylib.util import wait_for_cql_and_get_hosts
import pytest
from cassandra.protocol import WriteTimeout
@pytest.mark.asyncio
async def test_cas_semaphore(manager):
""" This is a regression test for scylladb/scylladb#19698 """
servers = [await manager.server_add(cmdline=['--smp', '1', '--write-request-timeout-in-ms', '500'])]
host = await wait_for_cql_and_get_hosts(manager.cql, {servers[0]}, time.time() + 60)
await manager.cql.run_async("CREATE KEYSPACE test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}")
await manager.cql.run_async("CREATE TABLE test.test (a int PRIMARY KEY, b int)")
async with inject_error(manager.api, servers[0].ip_addr, 'cas_timeout_after_lock'):
res = [manager.cql.run_async(f"INSERT INTO test.test (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)]
try:
await asyncio.gather(*res)
except WriteTimeout:
pass
res = [manager.cql.run_async(f"INSERT INTO test.test (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)]
await asyncio.gather(*res)
metrics = await manager.metrics.query(servers[0].ip_addr)
contention = metrics.get(name="scylla_storage_proxy_coordinator_cas_write_contention_count")
assert contention is None

View File

@@ -0,0 +1,49 @@
#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import asyncio
import pytest
import time
import logging
import requests
import re
from cassandra.cluster import ConnectionException, NoHostAvailable # type: ignore
from test.pylib.manager_client import ManagerClient
from test.topology.conftest import skip_mode
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
# Reproduces issue #19529
# Write to a table with MV while one node is stopped, and verify
# it doesn't cause MV write timeouts or prevent topology changes.
# The writes that are targeted to the stopped node use CL=ANY, so
# they should store a hint and then complete successfully.
# If the MV write handler is not completed after storing the hint, as in
# issue #19529, it remains active until it times out, preventing topology
# changes during this time.
@pytest.mark.asyncio
async def test_mv_write_to_dead_node(manager: ManagerClient):
servers = [await manager.server_add() for _ in range(4)]
cql = manager.get_cql()
await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}")
await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)")
await cql.run_async("CREATE materialized view ks.t_view AS select pk, v from ks.t where v is not null primary key (v, pk)")
await manager.server_stop_gracefully(servers[-1].server_id)
# Do inserts; some should generate MV writes to the stopped node
for i in range(100):
await cql.run_async(f"insert into ks.t (pk, v) values ({i}, {i+1})")
# Remove the node to trigger a topology change.
# If the MV write is not completed, as in issue #19529, the topology change
# will be held up for a long time until the write times out.
# Otherwise, it is expected to complete quickly.
await manager.remove_node(servers[0].server_id, servers[-1].server_id)

View File

@@ -892,6 +892,7 @@ public:
virtual sstables::shared_sstable make_sstable() const override { return do_make_sstable(); }
virtual sstables::sstable_writer_config configure_writer(sstring origin) const override { return do_configure_writer(std::move(origin)); }
virtual api::timestamp_type min_memtable_timestamp() const override { return api::min_timestamp; }
virtual bool memtable_has_key(const dht::decorated_key& key) const override { return false; }
virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) override { return make_ready_future<>(); }
virtual bool is_auto_compaction_disabled_by_user() const noexcept override { return false; }
virtual bool tombstone_gc_enabled() const noexcept override { return false; }

View File

@@ -188,6 +188,24 @@ public:
void invalidate_references() noexcept {
++_invalidate_counter;
}
// Asks the allocator to set aside some free memory,
// preventing it from being allocated until the matching
// unreserve() call. Can be used to preallocate some memory
// for a critical section where allocations can't fail.
//
// This is a hack designed with the implementation details of the
// log-structured allocator in mind. In other allocators,
// it doesn't do anything useful.
//
// Don't use this unless you understand exactly what you are doing.
virtual uintptr_t reserve(size_t memory) {
return 0;
}
// As the argument to this function, you must pass the *return value* of the matching reserve().
virtual void unreserve(uintptr_t opaque) noexcept {
}
};
class standard_allocation_strategy : public allocation_strategy {
@@ -257,6 +275,16 @@ struct alloc_strategy_deleter {
}
};
// RAII for allocation_strategy::reserve().
class hold_reserve {
uintptr_t _opaque;
public:
hold_reserve(size_t memory) : _opaque(current_allocator().reserve(memory)) {}
~hold_reserve() { current_allocator().unreserve(_opaque); }
// Disallow copying and moving. They *could* be implemented, but I just didn't bother.
hold_reserve(hold_reserve&&) = delete;
};
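A usage sketch for the RAII wrapper (the call site and size are hypothetical; the pattern follows the reserve()/unreserve() contract documented above): reserve well ahead of the critical section, then drop the reservation exactly where the infallible allocations begin.

std::optional<hold_reserve> reservation;
reservation.emplace(256 * 1024); // set ~256 KiB of segments aside early
// ... fallible preparation work; the reserved memory stays untouched ...
reservation.reset();             // unreserve(): the segments become usable
// critical LSA allocations here can draw on the just-released segments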
// std::unique_ptr which can be used for owning an object allocated using allocation_strategy.
// Must be destroyed before the pointer is invalidated. For compacting allocators, that
// means it must not escape outside allocating_section or reclaim lock.

View File

@@ -81,7 +81,7 @@ private:
}
}
void do_reserve_for_push_back();
size_t make_room(size_t n, bool stop_after_one);
void make_room(size_t n, bool stop_after_one);
chunk_ptr new_chunk(size_t n);
T* addr(size_t i) const {
return &_chunks[i / max_chunk_capacity()][i % max_chunk_capacity()];
@@ -177,22 +177,19 @@ public:
///
/// Allows reserving the memory chunk-by-chunk, avoiding stalls when a lot of
/// chunks are needed. To drive the reservation to completion, call this
/// repeatedly with the value returned from the previous call until it
/// returns 0, yielding between calls when necessary. Example usage:
/// repeatedly until the vector's capacity reaches the expected size, yielding
/// between calls when necessary. Example usage:
///
/// return do_until([&size] { return !size; }, [&my_vector, &size] () mutable {
/// size = my_vector.reserve_partial(size);
/// return do_until([&my_vector, size] { return my_vector.capacity() == size; }, [&my_vector, size] () mutable {
/// my_vector.reserve_partial(size);
/// });
///
/// Here, `do_until()` takes care of yielding between iterations when
/// necessary.
///
/// \returns the memory that remains to be reserved
size_t reserve_partial(size_t n) {
void reserve_partial(size_t n) {
if (n > _capacity) {
return make_room(n, true);
make_room(n, true);
}
return 0;
}
size_t memory_size() const {
@@ -402,7 +399,7 @@ chunked_vector<T, max_contiguous_allocation>::migrate(T* begin, T* end, T* resul
}
template <typename T, size_t max_contiguous_allocation>
size_t
void
chunked_vector<T, max_contiguous_allocation>::make_room(size_t n, bool stop_after_one) {
// First, if the last chunk is below max_chunk_capacity(), enlarge it
@@ -434,7 +431,6 @@ chunked_vector<T, max_contiguous_allocation>::make_room(size_t n, bool stop_afte
_capacity += now;
stop = stop_after_one;
}
return (n - _capacity);
}
template <typename T, size_t max_contiguous_allocation>

View File

@@ -7,31 +7,23 @@
*/
#include "large_bitset.hh"
#include <algorithm>
#include <seastar/core/align.hh>
#include <seastar/core/thread.hh>
#include "seastarx.hh"
using namespace seastar;
large_bitset::large_bitset(size_t nr_bits) : _nr_bits(nr_bits) {
assert(thread::running_in_thread());
const size_t orig_nr_ints = align_up(nr_bits, bits_per_int()) / bits_per_int();
auto nr_ints = orig_nr_ints;
while (nr_ints) {
nr_ints = _storage.reserve_partial(nr_ints);
if (need_preempt()) {
thread::yield();
}
size_t nr_ints = align_up(nr_bits, bits_per_int()) / bits_per_int();
while (_storage.capacity() != nr_ints) {
_storage.reserve_partial(nr_ints);
thread::maybe_yield();
}
nr_ints = orig_nr_ints;
while (nr_ints) {
_storage.push_back(0);
--nr_ints;
if (need_preempt()) {
thread::yield();
}
thread::maybe_yield();
}
}
@@ -40,8 +32,6 @@ large_bitset::clear() {
assert(thread::running_in_thread());
for (auto&& pos: _storage) {
pos = 0;
if (need_preempt()) {
thread::yield();
}
thread::maybe_yield();
}
}

View File

@@ -1007,7 +1007,17 @@ class segment_pool {
utils::dynamic_bitset _lsa_owned_segments_bitmap; // owned by this
utils::dynamic_bitset _lsa_free_segments_bitmap; // owned by this, but not in use
size_t _free_segments = 0;
// Invariant: _free_segments > _current_emergency_reserve_goal.
// Used to ensure that some critical allocations won't fail.
// (We grow _current_emergency_reserve_goal in advance and shrink it right
// before the critical allocations, which allows them to utilize the pre-reserved
// segments).
size_t _current_emergency_reserve_goal = 1;
// Used by allocating_section to request a certain number of free segments
// to be prepared for usage when the section is entered.
// This is more of a side-channel argument to refill_emergency_reserve() than a real piece of state.
// Passing it via a variable makes it easier to debug.
size_t _emergency_reserve_max = 30;
bool _allocation_failure_flag = false;
bool _allocation_enabled = true;
@@ -1088,6 +1098,7 @@ public:
void clear_allocation_failure_flag() noexcept { _allocation_failure_flag = false; }
bool allocation_failure_flag() const noexcept { return _allocation_failure_flag; }
void refill_emergency_reserve();
void ensure_free_segments(size_t n_segments);
void add_non_lsa_memory_in_use(size_t n) noexcept {
_non_lsa_memory_in_use += n;
}
@@ -1330,10 +1341,18 @@ void segment_pool::deallocate_segment(segment* seg) noexcept
}
void segment_pool::refill_emergency_reserve() {
while (_free_segments < _emergency_reserve_max) {
auto seg = allocate_segment(_emergency_reserve_max);
try {
ensure_free_segments(_emergency_reserve_max);
} catch (const std::bad_alloc&) {
throw bad_alloc(format("failed to refill emergency reserve of {} (have {} free segments)", _emergency_reserve_max, _free_segments));
}
}
void segment_pool::ensure_free_segments(size_t n_segments) {
while (_free_segments < n_segments) {
auto seg = allocate_segment(n_segments);
if (!seg) {
throw bad_alloc(format("failed to refill emergency reserve of {} (have {} free segments)", _emergency_reserve_max, _free_segments));
throw std::bad_alloc();
}
++_segments_in_use;
free_segment(seg);
@@ -2337,6 +2356,44 @@ public:
return _eviction_fn;
}
// LSA holds an internal "emergency reserve" of free segments that
// is only "opened" for usage before some critical allocations
// (in particular: the ones performed during memory compaction)
// to ensure that they won't fail.
//
// Here we hijack this mechanism to let the rest of the application implement
// some critical sections with infallible LSA allocations.
//
// reserve() increments the size of the internal emergency reserve,
// unreserve() decrements it.
//
// When you want to have some critical section that has to do some LSA
// allocations infallibly (e.g. to restore some invariants
// of a LSA-managed data structure in a destructor), you can call reserve()
// beforehand to ensure that some extra memory will be held unused,
// and then call unreserve() (with reserve()'s return value as the argument)
// to make the reserved free segments available to the critical section.
//
uintptr_t reserve(size_t memory) override {
// We round up the requested reserve to full segments.
size_t n_segments = (memory + segment::size - 1) >> segment::size_shift;
auto& pool = segment_pool();
size_t new_goal = pool.current_emergency_reserve_goal() + n_segments;
pool.ensure_free_segments(new_goal);
pool.set_current_emergency_reserve_goal(new_goal);
static_assert(sizeof(uintptr_t) >= sizeof(size_t));
return n_segments;
}
void unreserve(uintptr_t n_segments) noexcept override {
auto& pool = segment_pool();
assert(pool.current_emergency_reserve_goal() >= n_segments);
size_t new_goal = pool.current_emergency_reserve_goal() - n_segments;
pool.set_current_emergency_reserve_goal(new_goal);
}
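// Worked example of the round-up in reserve() above, assuming
// segment::size == 128 KiB, i.e. size_shift == 17 (illustrative values
// only, not a claim about the configured segment size):
//   reserve(1)      -> (1 + 131071) >> 17 == 1 segment
//   reserve(131072) -> 1 segment (exact fit)
//   reserve(131073) -> 2 segments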
friend class region;
friend class lsa_buffer;
friend class region_evictable_occupancy_ascending_less_comparator;

Some files were not shown because too many files have changed in this diff.