compaction_manager: cancel submission timer on drain

The `drain` method cancels all running compactions and moves the compaction manager into the disabled state. To move it back to the enabled state, the `enable` method must be called. This, however, triggers an assertion failure, because the submission timer is not cancelled and re-enabling the manager tries to arm the already-armed timer. Thus, cancel the timer when calling the `drain` method to disable the compaction manager. Fixes https://github.com/scylladb/scylladb/issues/24504 All versions are affected, so it's a good candidate for a backport. Closes scylladb/scylladb#24505 (cherry picked from commit a9a53d9178) Closes scylladb/scylladb#24585
Merge '[Backport 6.2] cql: create default superuser if it doesn't exist' from Marcin Maliszkiewicz
2025-06-29 14:40:43 +03:00 · 2025-06-29 14:34:55 +03:00 · 2025-06-28 09:40:37 +03:00 · 2025-06-27 17:50:15 +02:00 · 2025-06-27 17:50:08 +02:00 · 2025-06-27 17:50:01 +02:00
203 changed files with 4260 additions and 1251 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -47,6 +47,11 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
        )
        logging.info(f"Pull request created: {backport_pr.html_url}")
        backport_pr.add_to_assignees(pr.user)
+        if is_draft:
+            backport_pr.add_to_labels("conflicts")
+            pr_comment = f"@{pr.user} - This PR was marked as draft because it has conflicts\n"
+            pr_comment += "Please resolve them and mark this PR as ready for review"
+            backport_pr.create_issue_comment(pr_comment)
        logging.info(f"Assigned PR to original author: {pr.user}")
        return backport_pr
    except GithubException as e:
--- a/.github/workflows/add-label-when-promoted.yaml
+++ b/.github/workflows/add-label-when-promoted.yaml
@@ -49,7 +49,7 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
        run: python .github/scripts/label_promoted_commits.py  --commits ${{ github.event.before }}..${{ github.sha }} --repository ${{ github.repository }} --ref ${{ github.ref }}
      - name: Run auto-backport.py when promotion completed
-        if: github.event_name == 'push' && github.ref == 'refs/heads/${{ env.DEFAULT_BRANCH }}'
+        if: ${{ github.event_name == 'push' && github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH) }}
        env:
          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
        run: python .github/scripts/auto-backport.py --repo ${{ github.repository }} --base-branch ${{ github.ref }} --commits ${{ github.event.before }}..${{ github.sha }}
@@ -65,7 +65,7 @@ jobs:
            echo "backport_label=false" >> $GITHUB_OUTPUT
          fi
      - name: Run auto-backport.py when label was added
-        if: github.event_name == 'pull_request_target' && steps.check_label.outputs.backport_label == 'true' && (github.event.pull_request.state == 'closed' && github.event.pull_request.merged == true)
+        if: ${{ github.event_name == 'pull_request_target' && steps.check_label.outputs.backport_label == 'true' && github.event.pull_request.state == 'closed' }}
        env:
          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
        run: python .github/scripts/auto-backport.py --repo ${{ github.repository }} --base-branch ${{ github.ref }} --pull-request ${{ github.event.pull_request.number }} --head-commit ${{ github.event.pull_request.base.sha }}
--- a/.github/workflows/make-pr-ready-for-review.yaml
+++ b/.github/workflows/make-pr-ready-for-review.yaml
@@ -0,0 +1,27 @@
+name: Mark PR as Ready When Conflicts Label is Removed
+
+on:
+  pull_request_target:
+    types:
+      - unlabeled
+
+env:
+  DEFAULT_BRANCH: 'master'
+
+jobs:
+  mark-ready:
+    if: github.event.label.name == 'conflicts'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.repository }}
+          ref: ${{ env.DEFAULT_BRANCH }}
+          token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+          fetch-depth: 1
+      - name: Mark pull request as ready for review
+        run:  gh pr ready "${{ github.event.pull_request.number }}"
+        env:
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=6.2.2
+VERSION=6.2.4

 if test -f version
 then
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -218,6 +218,30 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/task_manager/drain/{module}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Drain finished local tasks",
+               "type":"void",
+               "nickname":"drain_tasks",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"module",
+                     "description":"The module to drain",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
      }
   ],
   "models":{
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -218,6 +218,32 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        uint32_t ttl = cfg.task_ttl_seconds();
        co_return json::json_return_type(ttl);
    });
+
+    tm::drain_tasks.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        co_await tm.invoke_on_all([&req] (tasks::task_manager& tm) -> future<> {
+            tasks::task_manager::module_ptr module;
+            try {
+                module = tm.find_module(req->get_path_param("module"));
+            } catch (...) {
+                throw bad_param_exception(fmt::format("{}", std::current_exception()));
+            }
+
+            const auto& local_tasks = module->get_local_tasks();
+            std::vector<tasks::task_id> ids;
+            ids.reserve(local_tasks.size());
+            std::transform(begin(local_tasks), end(local_tasks), std::back_inserter(ids), [] (const auto& task) {
+                return task.second->is_complete() ? task.first : tasks::task_id::create_null_id();
+            });
+
+            for (auto&& id : ids) {
+                if (id) {
+                    module->unregister_task(id);
+                }
+                co_await maybe_yield();
+            }
+        });
+        co_return json_void();
+    });
 }

 void unset_task_manager(http_context& ctx, routes& r) {
@@ -229,6 +255,7 @@ void unset_task_manager(http_context& ctx, routes& r) {
    tm::get_task_status_recursively.unset(r);
    tm::get_and_update_ttl.unset(r);
    tm::get_ttl.unset(r);
+    tm::drain_tasks.unset(r);
 }

 }
--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -43,6 +43,10 @@ future<> maintenance_socket_role_manager::stop() {
    return make_ready_future<>();
 }

+future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
+    return make_ready_future<>();
+}
+
 template<typename T = void>
 future<T> operation_not_supported_exception(std::string_view operation) {
    return make_exception_future<T>(
--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -39,6 +39,8 @@ public:

    virtual future<> stop() override;

+    virtual future<> ensure_superuser_is_created() override;
+
    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -106,6 +106,13 @@ public:

    virtual future<> stop() = 0;

+    ///
+    /// Ensure that superuser role exists.
+    ///
+    /// \returns a future once it is ensured that the superuser role exists.
+    ///
+    virtual future<> ensure_superuser_is_created() = 0;
+
    ///
    /// \returns an exceptional future with \ref role_already_exists for a role that has previously been created.
    ///
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -257,6 +257,10 @@ future<> service::stop() {
    });
 }

+future<> service::ensure_superuser_is_created() {
+    return _role_manager->ensure_superuser_is_created();
+}
+
 void service::update_cache_config() {
    auto db = _qp.db();

--- a/auth/service.hh
+++ b/auth/service.hh
@@ -131,6 +131,8 @@ public:

    future<> stop();

+    future<> ensure_superuser_is_created();
+
    void update_cache_config();

    void reset_authorization_cache();
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -241,35 +241,39 @@ future<> standard_role_manager::migrate_legacy_metadata() {
 }

 future<> standard_role_manager::start() {
-    return once_among_shards([this] {
-        return futurize_invoke([this] () {
-            if (legacy_mode(_qp)) {
-                return create_legacy_metadata_tables_if_missing();
-            }
-            return make_ready_future<>();
-        }).then([this] {
-            _stopped = auth::do_after_system_ready(_as, [this] {
-                return seastar::async([this] {
-                    if (legacy_mode(_qp)) {
-                        _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
+    return once_among_shards([this] () -> future<> {
+        if (legacy_mode(_qp)) {
+            co_await create_legacy_metadata_tables_if_missing();
+        }

-                        if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get()) {
-                            if (legacy_metadata_exists()) {
-                                log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
-                            }
+        auto handler = [this] () -> future<> {
+            const bool legacy = legacy_mode(_qp);
+            if (legacy) {
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
+                co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);

-                            return;
-                        }
-
-                        if (legacy_metadata_exists()) {
-                            migrate_legacy_metadata().get();
-                            return;
-                        }
+                if (co_await any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
+                    if (legacy_metadata_exists()) {
+                        log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
                    }
-                    create_default_role_if_missing().get();
-                });
-            });
-        });
+                    co_return;
+                }
+
+                if (legacy_metadata_exists()) {
+                    co_await migrate_legacy_metadata();
+                    co_return;
+                }
+            }
+            co_await create_default_role_if_missing();
+            if (!legacy) {
+                _superuser_created_promise.set_value();
+            }
+        };
+
+        _stopped = auth::do_after_system_ready(_as, handler);
+        co_return;
    });
 }

@@ -278,6 +282,11 @@ future<> standard_role_manager::stop() {
    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
 }

+future<> standard_role_manager::ensure_superuser_is_created() {
+    SCYLLA_ASSERT(this_shard_id() == 0);
+    return _superuser_created_promise.get_shared_future();
+}
+
 future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
    const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
            get_auth_ks_name(_qp),
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -37,6 +37,7 @@ class standard_role_manager final : public role_manager {
    future<> _stopped;
    abort_source _as;
    std::string _superuser;
+    shared_promise<> _superuser_created_promise;

 public:
    standard_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
@@ -49,6 +50,8 @@ public:

    virtual future<> stop() override;

+    virtual future<> ensure_superuser_is_created() override;
+
    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
--- a/cache_mutation_reader.hh
+++ b/cache_mutation_reader.hh
@@ -122,6 +122,9 @@ class cache_mutation_reader final : public mutation_reader::impl {
    gc_clock::time_point _read_time;
    gc_clock::time_point _gc_before;

+    api::timestamp_type _max_purgeable_timestamp = api::missing_timestamp;
+    api::timestamp_type _max_purgeable_timestamp_shadowable = api::missing_timestamp;
+
    future<> do_fill_buffer();
    future<> ensure_underlying();
    void copy_from_cache_to_buffer();
@@ -200,12 +203,17 @@ class cache_mutation_reader final : public mutation_reader::impl {
    gc_clock::time_point get_gc_before(const schema& schema, dht::decorated_key dk, const gc_clock::time_point query_time) {
        auto gc_state = _read_context.tombstone_gc_state();
        if (gc_state) {
-            return gc_state->get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
+            return gc_state->with_commitlog_check_disabled().get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
        }

        return gc_clock::time_point::min();
    }

+    bool can_gc(tombstone t, is_shadowable is) const {
+        const auto max_purgeable = is ? _max_purgeable_timestamp_shadowable : _max_purgeable_timestamp;
+        return t.timestamp < max_purgeable;
+    }
+
 public:
    cache_mutation_reader(schema_ptr s,
                               dht::decorated_key dk,
@@ -227,8 +235,19 @@ public:
        , _read_time(get_read_time())
        , _gc_before(get_gc_before(*_schema, dk, _read_time))
    {
-        clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
-                      fmt::ptr(&*_snp));
+        _max_purgeable_timestamp = ctx.get_max_purgeable(dk, is_shadowable::no);
+        _max_purgeable_timestamp_shadowable = ctx.get_max_purgeable(dk, is_shadowable::yes);
+
+        clogger.trace("csm {}: table={}.{}, dk={}, gc-before={}, max-purgeable-regular={}, max-purgeable-shadowable={}, reversed={}, snap={}",
+                fmt::ptr(this),
+                _schema->ks_name(),
+                _schema->cf_name(),
+                dk,
+                _gc_before,
+                _max_purgeable_timestamp,
+                _max_purgeable_timestamp_shadowable,
+                _read_context.is_reversed(),
+                fmt::ptr(&*_snp));
        push_mutation_fragment(*_schema, _permit, partition_start(std::move(dk), _snp->partition_tombstone()));
    }
    cache_mutation_reader(schema_ptr s,
@@ -786,12 +805,12 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {
            t.apply(range_tomb);

            auto row_tomb_expired = [&](row_tombstone tomb) {
-                return (tomb && tomb.max_deletion_time() < _gc_before);
+                return (tomb && tomb.max_deletion_time() < _gc_before && can_gc(tomb.tomb(), tomb.is_shadowable()));
            };

            auto is_row_dead = [&](const deletable_row& row) {
                auto& m = row.marker();
-                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
+                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before && can_gc(tombstone(m.timestamp(), m.deletion_time()), is_shadowable::no));
            };

            if (row_tomb_expired(t) || is_row_dead(row)) {
@@ -799,9 +818,11 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {

                _read_context.cache()._tracker.on_row_compacted();

+                auto mutation_can_gc = can_gc_fn([this] (tombstone t, is_shadowable is) { return can_gc(t, is); });
+
                with_allocator(_snp->region().allocator(), [&] {
                    deletable_row row_copy(row_schema, row);
-                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
+                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, mutation_can_gc, _gc_before, nullptr);
                    std::swap(row, row_copy);
                });
                remove_row = row.empty();
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -364,6 +364,9 @@ cdc::topology_description make_new_generation_description(
        const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
        const locator::token_metadata_ptr tmptr) {
    const auto tokens = get_tokens(bootstrap_tokens, tmptr);
+    if (tokens.empty()) {
+        on_internal_error(cdc_log, "Attempted to create a CDC generation from an empty list of tokens");
+    }

    utils::chunked_vector<token_range_description> vnode_descriptions;
    vnode_descriptions.reserve(tokens.size());
@@ -1111,7 +1114,9 @@ future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation
    auto sys_dist_ks = get_sys_dist_ks();
    auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
    if (!gen) {
-        throw std::runtime_error(fmt::format(
+        // This may happen during raft upgrade when a node gossips about a generation that
+        // was propagated through raft and we didn't apply it yet.
+        throw generation_handling_nonfatal_exception(fmt::format(
            "Could not find CDC generation {} in distributed system tables (current time: {}),"
            " even though some node gossiped about it.",
            gen_id, db_clock::now()));
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -186,7 +186,7 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
    }

    auto ts = to_ts(tp);
-    auto emplaced = _gens.emplace(to_ts(tp), std::nullopt).second;
+    auto [it, emplaced] = _gens.emplace(to_ts(tp), std::nullopt);

    if (_last_stream_timestamp != api::missing_timestamp) {
        auto last_correct_gen = gen_used_at(_last_stream_timestamp);
@@ -201,5 +201,5 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
        }
    }

-    return emplaced;
+    return !it->second;
 }
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1120,7 +1120,10 @@ future<> compaction_manager::drain() {
    cmlog.info("Asked to drain");
    if (*_early_abort_subscription) {
        _state = state::disabled;
+        _compaction_submission_timer.cancel();
        co_await stop_ongoing_compactions("drain");
+        // Trigger a signal to properly exit from postponed_compactions_reevaluation() fiber
+        reevaluate_postponed_compactions();
    }
    cmlog.info("Drained");
 }
--- a/compress.cc
+++ b/compress.cc
@@ -14,7 +14,9 @@
 #include "exceptions/exceptions.hh"
 #include "utils/class_registrator.hh"

-const sstring compressor::namespace_prefix = "org.apache.cassandra.io.compress.";
+sstring compressor::make_name(std::string_view short_name) {
+    return seastar::format("org.apache.cassandra.io.compress.{}", short_name);
+}

 class lz4_processor: public compressor {
 public:
@@ -66,7 +68,7 @@ compressor::ptr_type compressor::create(const sstring& name, const opt_getter& o
        return {};
    }

-    qualified_name qn(namespace_prefix, name);
+    qualified_name qn(make_name(""), name);

    for (auto& c : { lz4, snappy, deflate }) {
        if (c->name() == static_cast<const sstring&>(qn)) {
@@ -91,9 +93,9 @@ shared_ptr<compressor> compressor::create(const std::map<sstring, sstring>& opti
    return {};
 }

-thread_local const shared_ptr<compressor> compressor::lz4 = ::make_shared<lz4_processor>(namespace_prefix + "LZ4Compressor");
-thread_local const shared_ptr<compressor> compressor::snappy = ::make_shared<snappy_processor>(namespace_prefix + "SnappyCompressor");
-thread_local const shared_ptr<compressor> compressor::deflate = ::make_shared<deflate_processor>(namespace_prefix + "DeflateCompressor");
+thread_local const shared_ptr<compressor> compressor::lz4 = ::make_shared<lz4_processor>(make_name("LZ4Compressor"));
+thread_local const shared_ptr<compressor> compressor::snappy = ::make_shared<snappy_processor>(make_name("SnappyCompressor"));
+thread_local const shared_ptr<compressor> compressor::deflate = ::make_shared<deflate_processor>(make_name("DeflateCompressor"));

 const sstring compression_parameters::SSTABLE_COMPRESSION = "sstable_compression";
 const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_in_kb";
--- a/compress.hh
+++ b/compress.hh
@@ -69,7 +69,7 @@ public:
    static thread_local const ptr_type snappy;
    static thread_local const ptr_type deflate;

-    static const sstring namespace_prefix;
+    static sstring make_name(std::string_view short_name);
 };

 template<typename BaseType, typename... Args>
--- a/configure.py
+++ b/configure.py
@@ -1113,7 +1113,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
-                'repair/table_check.cc',
+                'streaming/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1323,6 +1323,7 @@ scylla_tests_generic_dependencies = [
    'test/lib/test_utils.cc',
    'test/lib/tmpdir.cc',
    'test/lib/sstable_run_based_compaction_strategy_for_tests.cc',
+    'test/lib/eventually.cc',
 ]

 scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_generic_dependencies + [
@@ -1363,6 +1364,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/lib/key_utils.cc',
                'test/lib/random_schema.cc',
                'test/lib/data_model.cc',
+                'test/lib/eventually.cc',
                'seastar/tests/perf/linux_perf_event.cc']

 deps = {
@@ -1470,7 +1472,7 @@ deps['test/boost/bytes_ostream_test'] = [
    "test/lib/log.cc",
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
-deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'utils/hashers.cc', 'utils/on_internal_error.cc']
+deps['test/boost/UUID_test'] = ['clocks-impl.cc', 'utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'utils/hashers.cc', 'utils/on_internal_error.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
@@ -1501,11 +1503,11 @@ deps['test/boost/rust_test'] += ['rust/inc/src/lib.rs']

 deps['test/boost/group0_cmd_merge_test'] += ['test/lib/expr_test_utils.cc']

-deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
-deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
+deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
+deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
 deps['test/raft/randomized_nemesis_test'] = ['test/raft/randomized_nemesis_test.cc', 'direct_failure_detector/failure_detector.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
 deps['test/raft/failure_detector_test'] = ['test/raft/failure_detector_test.cc', 'direct_failure_detector/failure_detector.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
-deps['test/raft/many_test'] = ['test/raft/many_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
+deps['test/raft/many_test'] = ['test/raft/many_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
 deps['test/raft/fsm_test'] =  ['test/raft/fsm_test.cc', 'test/raft/helpers.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
 deps['test/raft/etcd_test'] =  ['test/raft/etcd_test.cc', 'test/raft/helpers.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
 deps['test/raft/raft_sys_table_storage_test'] = ['test/raft/raft_sys_table_storage_test.cc'] + \
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -27,6 +27,8 @@
 #include "gms/feature_service.hh"
 #include "replica/database.hh"

+using namespace std::string_literals;
+
 static logging::logger mylogger("alter_keyspace");

 bool is_system_keyspace(std::string_view keyspace);
@@ -207,6 +209,25 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
        auto ts = mc.write_timestamp();
        auto global_request_id = mc.new_group0_state_id();

+        // #22688 - filter out any dc*:0 entries - consider these
+        // null and void (removed). Migration planning will treat it
+        // as dc*=0 still.
+        std::erase_if(ks_options, [](const auto& i) {
+            static constexpr std::string replication_prefix = ks_prop_defs::KW_REPLICATION + ":"s;
+            // Flattened map, replication entries starts with "replication:".
+            // Only valid options are replication_factor, class and per-dc rf:s. We want to
+            // filter out any dcN=0 entries.
+            auto& [key, val] = i;
+            if (key.starts_with(replication_prefix) && val == "0") {
+                std::string_view v(key);
+                v.remove_prefix(replication_prefix.size());
+                return v != ks_prop_defs::REPLICATION_FACTOR_KEY 
+                    && v != ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY
+                    ;
+            }
+            return false;
+        });
+
        // we only want to run the tablets path if there are actually any tablets changes, not only schema changes
        // TODO: the current `if (changes_tablets(qp))` is insufficient: someone may set the same RFs as before,
        //       and we'll unnecessarily trigger the processing path for ALTER tablets KS,
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -69,6 +69,16 @@ static std::map<sstring, sstring> prepare_options(
        }
    }

+    // #22688 / #20039 - check for illegal, empty options (after above expand)
+    // moved to here. We want to be able to remove dc:s once rf=0, 
+    // in which case, the options actually serialized in result mutations
+    // will in extreme cases in fact be empty -> cannot do this check in 
+    // verify_options. We only want to apply this constraint on the input
+    // provided by the user
+    if (options.empty() && !tm.get_topology().get_datacenters().empty()) {
+        throw exceptions::configuration_exception("Configuration for at least one datacenter must be present");
+    }
+
    return options;
 }

--- a/cql3/statements/list_service_level_statement.cc
+++ b/cql3/statements/list_service_level_statement.cc
@@ -54,7 +54,7 @@ list_service_level_statement::execute(query_processor& qp,

    return make_ready_future().then([this, &state] () {
                                  if (_describe_all) {
-                                      return state.get_service_level_controller().get_distributed_service_levels();
+                                      return state.get_service_level_controller().get_distributed_service_levels(qos::query_context::user);
                                  } else {
                                      return state.get_service_level_controller().get_distributed_service_level(_service_level);
                                  }
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1588,7 +1588,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ

    std::vector<std::pair<sseg_ptr, uint64_t>> maybe_clear;

-    assert(_request_controller.available_units() <= ssize_t(max_request_controller_units()));
+    SCYLLA_ASSERT(_request_controller.available_units() <= ssize_t(max_request_controller_units()));
    auto fut = get_units(_request_controller, max_request_controller_units(), timeout);
    if (_request_controller.waiters()) {
        totals.requests_blocked_memory++;
@@ -1597,7 +1597,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
    scope_increment_counter allocating(totals.active_allocations);

    auto permit = co_await std::move(fut);
-    assert(_request_controller.available_units() == 0);
+    SCYLLA_ASSERT(_request_controller.available_units() == 0);

    decltype(permit) fake_permit; // can't have allocate+sync release semaphore.
    bool failed = false;
@@ -1858,10 +1858,13 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
            }
        }
    }
-    assert(_request_controller.available_units() == 0);
-
+    SCYLLA_ASSERT(_request_controller.available_units() == 0);
+    SCYLLA_ASSERT(permit.count() == max_request_controller_units());
+    auto nw = _request_controller.waiters();
    permit.return_all();
-    assert(_request_controller.available_units() == ssize_t(max_request_controller_units()));
+    // #20633 cannot guarantee controller avail is now full, since we could have had waiters when doing
+    // return all -> now will be less avail
+    SCYLLA_ASSERT(nw > 0 || _request_controller.available_units() == ssize_t(max_request_controller_units()));

    if (!failed) {
        clogger.trace("Oversized allocation succeeded.");
--- a/db/config.cc
+++ b/db/config.cc
@@ -27,6 +27,7 @@
 #include "cdc/cdc_extension.hh"
 #include "tombstone_gc_extension.hh"
 #include "db/per_partition_rate_limit_extension.hh"
+#include "db/paxos_grace_seconds_extension.hh"
 #include "db/tags/extension.hh"
 #include "config.hh"
 #include "extensions.hh"
@@ -1192,6 +1193,18 @@ void db::config::add_tombstone_gc_extension() {
    _extensions->add_schema_extension<tombstone_gc_extension>(tombstone_gc_extension::NAME);
 }

+void db::config::add_paxos_grace_seconds_extension() {
+    _extensions->add_schema_extension<db::paxos_grace_seconds_extension>(db::paxos_grace_seconds_extension::NAME);
+}
+
+void db::config::add_all_default_extensions() {
+    add_cdc_extension();
+    add_per_partition_rate_limit_extension();
+    add_tags_extension();
+    add_tombstone_gc_extension();
+    add_paxos_grace_seconds_extension();
+}
+
 void db::config::setup_directories() {
    maybe_in_workdir(commitlog_directory, "commitlog");
    if (!schema_commitlog_directory.is_set()) {
--- a/db/config.hh
+++ b/db/config.hh
@@ -140,6 +140,9 @@ public:
    void add_per_partition_rate_limit_extension();
    void add_tags_extension();
    void add_tombstone_gc_extension();
+    void add_paxos_grace_seconds_extension();
+
+    void add_all_default_extensions();

    /// True iff the feature is enabled.
    bool check_experimental(experimental_features_t::feature f) const;
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -741,8 +741,8 @@ system_distributed_keyspace::get_cdc_desc_v1_timestamps(context ctx) {
    co_return res;
 }

-future<qos::service_levels_info> system_distributed_keyspace::get_service_levels() const {
-    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE);
+future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
+    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
 }

 future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -112,7 +112,7 @@ public:

    future<db_clock::time_point> cdc_current_generation_timestamp(context);

-    future<qos::service_levels_info> get_service_levels() const;
+    future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
    future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
    future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
    future<> drop_service_level(sstring service_level_name) const;
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -2044,7 +2044,6 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        // the view build information.
        fail.cancel();
        co_await barrier.arrive_and_wait();
-        units.return_all();

        co_await calculate_shard_build_step(vbi);
        _mnotifier.register_listener(this);
@@ -2349,7 +2348,7 @@ static future<> announce_with_raft(

 future<> view_builder::mark_view_build_started(sstring ks_name, sstring view_name) {
    co_await write_view_build_status(
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            co_await utils::get_local_injector().inject("view_builder_pause_add_new_view",
                    [] (auto& handler) { return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5)); });
            const sstring query_string = format("INSERT INTO {}.{} (keyspace_name, view_name, host_id, status) VALUES (?, ?, ?, ?)",
@@ -2359,7 +2358,7 @@ future<> view_builder::mark_view_build_started(sstring ks_name, sstring view_nam
                    {std::move(ks_name), std::move(view_name), host_id.uuid(), "STARTED"},
                    "view builder: mark view build STARTED");
        },
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            co_await utils::get_local_injector().inject("view_builder_pause_add_new_view",
                    [] (auto& handler) { return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5)); });
            co_await _sys_dist_ks.start_view_build(std::move(ks_name), std::move(view_name));
@@ -2369,7 +2368,7 @@ future<> view_builder::mark_view_build_started(sstring ks_name, sstring view_nam

 future<> view_builder::mark_view_build_success(sstring ks_name, sstring view_name) {
    co_await write_view_build_status(
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            co_await utils::get_local_injector().inject("view_builder_pause_mark_success",
                    [] (auto& handler) { return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5)); });
            const sstring query_string = format("UPDATE {}.{} SET status = ? WHERE keyspace_name = ? AND view_name = ? AND host_id = ?",
@@ -2379,7 +2378,7 @@ future<> view_builder::mark_view_build_success(sstring ks_name, sstring view_nam
                    {"SUCCESS", std::move(ks_name), std::move(view_name), host_id.uuid()},
                    "view builder: mark view build SUCCESS");
        },
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            co_await utils::get_local_injector().inject("view_builder_pause_mark_success",
                    [] (auto& handler) { return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5)); });
            co_await _sys_dist_ks.finish_view_build(std::move(ks_name), std::move(view_name));
@@ -2389,14 +2388,14 @@ future<> view_builder::mark_view_build_success(sstring ks_name, sstring view_nam

 future<> view_builder::remove_view_build_status(sstring ks_name, sstring view_name) {
    co_await write_view_build_status(
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            const sstring query_string = format("DELETE FROM {}.{} WHERE keyspace_name = ? AND view_name = ?",
                    db::system_keyspace::NAME, db::system_keyspace::VIEW_BUILD_STATUS_V2);
            co_await announce_with_raft(_qp, _group0_client, _as, std::move(query_string),
                    {std::move(ks_name), std::move(view_name)},
                    "view builder: delete view build status");
        },
-        [&] () -> future<> {
+        [this, ks_name, view_name] () -> future<> {
            co_await _sys_dist_ks.remove_view(std::move(ks_name), std::move(view_name));
        }
    );
@@ -2444,11 +2443,11 @@ view_builder::view_build_statuses(sstring keyspace, sstring view_name) const {

 future<> view_builder::add_new_view(view_ptr view, build_step& step) {
    vlogger.info0("Building view {}.{}, starting at token {}", view->ks_name(), view->cf_name(), step.current_token());
+    if (this_shard_id() == 0) {
+        co_await mark_view_build_started(view->ks_name(), view->cf_name());
+    }
+    co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());
    step.build_status.emplace(step.build_status.begin(), view_build_status{view, step.current_token(), std::nullopt});
-    auto f = this_shard_id() == 0 ? mark_view_build_started(view->ks_name(), view->cf_name()) : make_ready_future<>();
-    return when_all_succeed(
-            std::move(f),
-            _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token())).discard_result();
 }

 static future<> flush_base(lw_shared_ptr<replica::column_family> base, abort_source& as) {
@@ -2990,6 +2989,12 @@ public:
                    _step.build_status.pop_back();
                }
            }
+
+            // before going back to the minimum token, advance current_key to the end
+            // and check for built views in that range.
+            _step.current_key = {_step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
+            check_for_built_views();
+
            _step.current_key = {dht::minimum_token(), partition_key::make_empty()};
            for (auto&& vs : _step.build_status) {
                vs.next_token = dht::minimum_token();
--- a/dist/common/systemd/scylla-server.service
+++ b/dist/common/systemd/scylla-server.service
@@ -20,7 +20,6 @@ ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $MEM_CONF
 ExecStopPost=+/opt/scylladb/scripts/scylla_stop
 TimeoutStartSec=1y
 TimeoutStopSec=900
-KillMode=process
 Restart=on-abnormal
 User=scylla
 OOMScoreAdjust=-950
--- a/docs/architecture/index.rst
+++ b/docs/architecture/index.rst
@@ -12,6 +12,7 @@ ScyllaDB Architecture
   SSTable <sstable/index/>
   Compaction Strategies <compaction/compaction-strategies>
   Raft Consensus Algorithm in ScyllaDB </architecture/raft>
+   Zero-token Nodes </architecture/zero-token-nodes>
   
              
 * :doc:`Data Distribution with Tablets </architecture/tablets/>` - Tablets in ScyllaDB
@@ -22,5 +23,6 @@ ScyllaDB Architecture
 * :doc:`SSTable </architecture/sstable/index/>` - ScyllaDB SSTable 2.0 and 3.0 Format Information
 * :doc:`Compaction Strategies </architecture/compaction/compaction-strategies>` - High-level analysis of different compaction strategies
 * :doc:`Raft Consensus Algorithm in ScyllaDB </architecture/raft>` - Overview of how Raft is implemented in ScyllaDB.
+* :doc:`Zero-token Nodes </architecture/zero-token-nodes>` - Nodes that do not replicate any data.

 Learn more about these topics in the `ScyllaDB University: Architecture lesson <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/architecture/>`_.
--- a/docs/architecture/zero-token-nodes.rst
+++ b/docs/architecture/zero-token-nodes.rst
@@ -0,0 +1,23 @@
+=========================
+Zero-token Nodes
+=========================
+
+By default, all nodes in a cluster own a set of token ranges and are used to
+replicate data. In certain circumstances, you may choose to add a node that
+doesn't own any token. Such nodes are referred to as zero-token nodes. They
+do not have a copy of the data but only participate in Raft quorum voting.
+
+To configure a zero-token node, set the ``join_ring`` parameter to ``false``.
+
+You can use zero-token nodes in multi-DC deployments to reduce the risk of
+losing a quorum of nodes.
+See :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters </operating-scylla/procedures/cluster-management/arbiter-dc>` for details.
+
+Note that:
+
+* Zero-token nodes are ignored by drivers, so there is no need to change
+  the load balancing policy on the clients after adding zero-token nodes
+  to the cluster.
+* Zero-token nodes never store replicated data, so running ``nodetool rebuild``,
+  ``nodetool repair``, and ``nodetool cleanup`` can be skipped as they do not
+  affect zero-token nodes.
--- a/docs/dev/task_manager.md
+++ b/docs/dev/task_manager.md
@@ -62,16 +62,18 @@ Briefly:
 - `/task_manager/list_module_tasks/{module}` -
        lists (by default non-internal) tasks in the module;
 - `/task_manager/task_status/{task_id}` -
-        gets the task's status, unregisters the task if it's finished;
+        gets the task's status;
 - `/task_manager/abort_task/{task_id}` -
        aborts the task if it's abortable;
 - `/task_manager/wait_task/{task_id}` -
        waits for the task and gets its status;
 - `/task_manager/task_status_recursive/{task_id}` -
        gets statuses of the task and all its descendants in BFS
-        order, unregisters the task;
+        order;
 - `/task_manager/ttl` -
        gets or sets new ttl.
+- `/task_manager/drain/{module}` -
+        unregisters all finished local tasks in the module.

 # Virtual tasks

--- a/docs/getting-started/cloud-instance-recommendations.rst
+++ b/docs/getting-started/cloud-instance-recommendations.rst
@@ -175,7 +175,7 @@ Recommended instances types are `n1-highmem <https://cloud.google.com/compute/do
   * - n2-highmem-32
     - 32
     - 256
-     - 6,000
+     - 9,000
   * - n2-highmem-48
     - 48
     - 384
--- a/docs/operating-scylla/admin-tools/scylla-sstable.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst
@@ -430,8 +430,8 @@ The content is dumped in JSON, using the following schema:
        "estimated_tombstone_drop_time": $STREAMING_HISTOGRAM,
        "sstable_level": Uint,
        "repaired_at": Uint64,
-        "min_column_names": [Uint, ...],
-        "max_column_names": [Uint, ...],
+        "min_column_names": [String, ...],
+        "max_column_names": [String, ...],
        "has_legacy_counter_shards": Bool,
        "columns_count": Int64, // >= MC only
        "rows_count": Int64, // >= MC only
--- a/docs/operating-scylla/admin-tools/task-manager.rst
+++ b/docs/operating-scylla/admin-tools/task-manager.rst
@@ -73,17 +73,19 @@ API calls
 	- *keyspace* - if set, tasks are filtered to contain only the ones working on this keyspace;
 	- *table* - if set, tasks are filtered to contain only the ones working on this table;

-* ``/task_manager/task_status/{task_id}`` - gets the task's status, unregisters the task if it's finished;
+* ``/task_manager/task_status/{task_id}`` - gets the task's status;
 * ``/task_manager/abort_task/{task_id}`` - aborts the task if it's abortable, otherwise 403 status code is returned;
-* ``/task_manager/wait_task/{task_id}`` - waits for the task and gets its status (does not unregister the tasks); query params:
+* ``/task_manager/wait_task/{task_id}`` - waits for the task and gets its status; query params:

 	- *timeout* - timeout in seconds; if set - 408 status code is returned if waiting times out;

-* ``/task_manager/task_status_recursive/{task_id}`` - gets statuses of the task and all its descendants in BFS order, unregisters the root task;
+* ``/task_manager/task_status_recursive/{task_id}`` - gets statuses of the task and all its descendants in BFS order;
 * ``/task_manager/ttl`` - gets or sets new ttl; query params (if setting):

 	- *ttl* - new ttl value.

+* ``/task_manager/drain/{module}`` - unregisters all finished local tasks in the module.
+
 Cluster tasks are not unregistered from task manager with API calls.

 Tasks API
--- a/docs/operating-scylla/nodetool-commands/tasks/drain.rst
+++ b/docs/operating-scylla/nodetool-commands/tasks/drain.rst
@@ -0,0 +1,21 @@
+Nodetool tasks drain
+====================
+**tasks drain** - Unregisters all finished local tasks from the module.
+If a module is not specified, finished tasks in all modules are unregistered.
+
+Syntax
+-------
+.. code-block:: console
+
+   nodetool tasks drain [--module <module>]
+
+Options
+-------
+
+* ``--module`` - if set, only the specified module is drained.
+
+For example:
+
+.. code-block:: shell
+
+   > nodetool tasks drain --module repair
--- a/docs/operating-scylla/nodetool-commands/tasks/index.rst
+++ b/docs/operating-scylla/nodetool-commands/tasks/index.rst
@@ -5,6 +5,7 @@ Nodetool tasks
   :hidden:

   abort <abort>
+   drain <drain>
   list <list>
   modules <modules>
   status <status>
@@ -17,10 +18,17 @@ Nodetool tasks
 Task manager is an API-based tool for tracking long-running background operations, such as repair or compaction,
 which makes them observable and controllable. Task manager operates per node.

+Task Status Retention
+---------------------
+
+* When a task completes, its status is temporarily stored on the executing node
+* Status information is retained for up to :confval:`task_ttl_in_seconds` seconds
+
 Supported tasks suboperations
 -----------------------------

 * :doc:`abort </operating-scylla/nodetool-commands/tasks/abort>` - Aborts the task.
+* :doc:`drain </operating-scylla/nodetool-commands/tasks/drain>` - Unregisters all finished local tasks.
 * :doc:`list </operating-scylla/nodetool-commands/tasks/list>` - Lists tasks in the module.
 * :doc:`modules </operating-scylla/nodetool-commands/tasks/modules>` - Lists supported modules.
 * :doc:`status </operating-scylla/nodetool-commands/tasks/status>` - Gets status of the task.
--- a/docs/operating-scylla/nodetool-commands/tasks/status.rst
+++ b/docs/operating-scylla/nodetool-commands/tasks/status.rst
@@ -1,6 +1,6 @@
 Nodetool tasks status
 =========================
-**tasks status** - Gets the status of a task manager task. If the task was finished it is unregistered.
+**tasks status** - Gets the status of a task manager task.

 Syntax
 -------
@@ -23,10 +23,10 @@ Example output
   type: repair
   kind: node
   scope: keyspace
-   state: done
+   state: running
   is_abortable: true
   start_time: 2024-07-29T15:48:55Z
-   end_time: 2024-07-29T15:48:55Z
+   end_time:
   error:
   parent_id: none
   sequence_number: 5
--- a/docs/operating-scylla/nodetool-commands/tasks/tree.rst
+++ b/docs/operating-scylla/nodetool-commands/tasks/tree.rst
@@ -1,7 +1,7 @@
 Nodetool tasks tree
 =======================
 **tasks tree** - Gets the statuses of a task manager task and all its descendants.
-The statuses are listed in BFS order. If the task was finished it is unregistered.
+The statuses are listed in BFS order.

 If task_id isn't specified, trees of all non-internal tasks are printed
 (internal tasks are the ones that have a parent or cover an operation that
--- a/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
+++ b/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
@@ -201,6 +201,7 @@ Add New DC

 #. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.

+.. _add-dc-to-existing-dc-not-connect-clients:

 Configure the Client not to Connect to the New DC
 -------------------------------------------------
--- a/docs/operating-scylla/procedures/cluster-management/arbiter-dc.rst
+++ b/docs/operating-scylla/procedures/cluster-management/arbiter-dc.rst
@@ -0,0 +1,57 @@
+=========================================================
+Preventing Quorum Loss in Symmetrical Multi-DC Clusters
+=========================================================
+
+ScyllaDB requires at least a quorum (majority) of nodes in a cluster to be up
+and communicate with each other. A cluster that loses a quorum can handle reads
+and writes of user data, but cluster management operations, such as schema and
+topology updates, are impossible.
+
+In clusters that are symmetrical, i.e., have two datacenters (DCs) with the same number of
+nodes, losing a quorum may occur if one of the DCs becomes unavailable.
+For example, if one DC fails in a 2-DC cluster where each DC has three nodes,
+only three out of six nodes are available, and the quorum is lost.
+
+Adding another DC would mitigate the risk of losing a quorum, but it comes
+with network and storage costs. To prevent the quorum loss with minimum costs,
+you can configure an arbiter (tie-breaker) DC.
+
+An arbiter DC is a datacenter with a :doc:`zero-token node </architecture/zero-token-nodes>`
+-- a node that doesn't replicate any data but is only used for Raft quorum
+voting. An arbiter DC maintains the cluster quorum if one of the other DCs
+fails, while it doesn't incur extra network and storage costs as it has no
+user data.
+
+Adding an Arbiter DC
+-----------------------
+
+To set up an arbiter DC, follow the procedure to
+:doc:`add a new datacenter to an existing cluster </operating-scylla/procedures/cluster-management/add-dc-to-existing-dc/>`.
+When editing the *scylla.yaml* file, set the ``join_ring`` parameter to
+``false`` following these guidelines:
+
+* Set ``join_ring=false`` before you start the node(s). If you set that
+  parameter on a node that has already been bootstrapped and owns a token
+  range, the node startup will fail. In such a case, you'll need to
+  :doc:`decommission </operating-scylla/procedures/cluster-management/decommissioning-data-center>`
+  the node, :doc:`wipe it clean </operating-scylla/procedures/cluster-management/clear-data>`,
+  and add it back to the arbiter DC properly following
+  the :doc:`procedure </operating-scylla/procedures/cluster-management/add-dc-to-existing-dc/>`.
+* As a rule, one node is sufficient for an arbiter to serve as a tie-breaker.
+  In case you add more than one node to the arbiter DC, ensure that you set
+  ``join_ring=false`` on all the nodes in that DC.
+
+Follow-up steps:
+^^^^^^^^^^^^^^^^^^^
+* An arbiter DC has a replication factor of 0 (RF=0) for all keyspaces. You
+  need to ``ALTER`` the keyspaces to update their RF.
+* Since zero-token nodes are ignored by drivers, you can skip
+  :ref:`configuring the client not to connect to the new DC <add-dc-to-existing-dc-not-connect-clients>`.
+
+References
+----------------
+
+* :doc:`Zero-token Nodes </architecture/zero-token-nodes>`
+* :doc:`Raft Consensus Algorithm in ScyllaDB </architecture/raft>`
+* :doc:`Handling Node Failures </troubleshooting/handling-node-failures>`
+* :doc:`Adding a New Data Center Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-dc-to-existing-dc/>`
--- a/docs/operating-scylla/procedures/cluster-management/create-cluster-multidc.rst
+++ b/docs/operating-scylla/procedures/cluster-management/create-cluster-multidc.rst
@@ -209,6 +209,17 @@ In this example, we will show how to install a nine nodes cluster.
   UN   54.187.142.201  109.54 KB       256     ?               d99967d6-987c-4a54-829d-86d1b921470f    RACK1
   UN   54.187.168.20   109.54 KB       256     ?               2329c2e0-64e1-41dc-8202-74403a40f851    RACK1

-See also:
+--------------------------
+Preventing Quorum Loss
+--------------------------

+If your cluster is symmetrical, i.e., it has an even number of datacenters
+with the same number of nodes, consider adding an arbiter DC to mitigate
+the risk of losing a quorum at a minimum cost.
+See :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters </operating-scylla/procedures/cluster-management/arbiter-dc>`
+for details.
+
+------------
+See also:
+------------
 :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster>`
--- a/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
+++ b/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
@@ -55,7 +55,7 @@ Procedure
      cqlsh> DESCRIBE <KEYSPACE_NAME>
      cqlsh> CREATE KEYSPACE <KEYSPACE_NAME> WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', '<DC_NAME1>' : 3, '<DC_NAME2>' : 3, '<DC_NAME3>' : 3};

-      cqlsh> ALTER KEYSPACE <KEYSPACE_NAME> WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', '<DC_NAME1>' : 3, '<DC_NAME2>' : 3};
+      cqlsh> ALTER KEYSPACE <KEYSPACE_NAME> WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', '<DC_NAME1>' : 3, '<DC_NAME2>' : 3, '<DC_NAME3>' : 0};

   For example:

@@ -71,7 +71,7 @@ Procedure

   .. code-block:: shell

-      cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'EUROPE-DC' : 3};
+      cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};

 #. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
   Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
--- a/docs/operating-scylla/procedures/cluster-management/index.rst
+++ b/docs/operating-scylla/procedures/cluster-management/index.rst
@@ -26,6 +26,8 @@ Cluster Management Procedures
   Safely Restart Your Cluster <safe-start>
   Handling Membership Change Failures <handling-membership-change-failures>
   repair-based-node-operation
+   Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
+

 .. panel-box::
  :title: Cluster and DC Creation
@@ -84,6 +86,8 @@ Cluster Management Procedures

  * :doc:`Repair Based Node Operations (RBNO) </operating-scylla/procedures/cluster-management/repair-based-node-operation>`

+  * :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
+
 .. panel-box::
  :title: Topology Changes
  :id: "getting-started"
--- a/docs/troubleshooting/autoremove-ubuntu.rst
+++ b/docs/troubleshooting/autoremove-ubuntu.rst
@@ -0,0 +1,38 @@
+Removing ScyllaDB with the "--autoremove" option on Ubuntu breaks system packages
+======================================================================================
+
+Problem
+^^^^^^^
+
+Running ``apt purge scylla --autoremove`` marks most system packages for
+removal.
+
+.. code::
+
+   root@myserv:~# apt purge scylla --autoremove
+   Reading package lists... Done
+   Building dependency tree... Done
+   Reading state information... Done
+   The following packages will be REMOVED:
+     apport-symptoms* bc* bcache-tools* bolt* btrfs-progs* byobu* cloud-guest-utils* cloud-init* cloud-initramfs-copymods* cloud-initramfs-dyn-netconf* cryptsetup* cryptsetup-initramfs* dmeventd* eatmydata* ethtool* fdisk* fonts-ubuntu-console* fwupd* fwupd-signed* gdisk* gir1.2-packagekitglib-1.0* git* git-man* kpartx* landscape-common* libaio1* libappstream4* libatasmart4* libblockdev-crypto2* libblockdev-fs2*
+     libblockdev-loop2* libblockdev-part-err2* libblockdev-part2* libblockdev-swap2* libblockdev-utils2* libblockdev2* libdevmapper-event1.02.1* libeatmydata1* liberror-perl* libfdisk1* libfwupd2* libfwupdplugin5* libgcab-1.0-0* libgpgme11* libgstreamer1.0-0* libgusb2* libinih1* libintl-perl* libintl-xs-perl* libjcat1* libjson-glib-1.0-0* libjson-glib-1.0-common* liblvm2cmd2.03* libmbim-glib4* libmbim-proxy*
+     libmm-glib0* libmodule-find-perl* libmodule-scandeps-perl* libmspack0* libpackagekit-glib2-18* libparted-fs-resize0* libproc-processtable-perl* libqmi-glib5* libqmi-proxy* libsgutils2-2* libsmbios-c2* libsort-naturally-perl* libstemmer0d* libtcl8.6* libterm-readkey-perl* libudisks2-0* liburcu8* libutempter0* libvolume-key1* libxmlb2* libxmlsec1* libxmlsec1-openssl* libxslt1.1* lvm2* lxd-agent-loader* mdadm*
+     modemmanager* motd-news-config* multipath-tools* needrestart* open-vm-tools* overlayroot* packagekit* packagekit-tools* pastebinit* patch* pollinate* python3-apport* python3-certifi* python3-chardet* python3-configobj* python3-debconf* python3-debian* python3-json-pointer* python3-jsonpatch* python3-jsonschema* python3-magic* python3-newt* python3-packaging* python3-pexpect* python3-problem-report*
+     python3-ptyprocess* python3-pyrsistent* python3-requests* python3-software-properties* python3-systemd* python3-xkit* run-one* sbsigntool* screen* scylla* scylla-conf* scylla-cqlsh* scylla-kernel-conf* scylla-node-exporter* scylla-python3* scylla-server* secureboot-db* sg3-utils* sg3-utils-udev* software-properties-common* sosreport* tcl* tcl8.6* thin-provisioning-tools* tmux* ubuntu-drivers-common* udisks2*
+     unattended-upgrades* update-notifier-common* usb-modeswitch* usb-modeswitch-data* xfsprogs* zerofree*
+   0 upgraded, 0 newly installed, 139 to remove and 0 not upgraded.
+
+Cause
+^^^^^^^
+
+This problem may occur on Ubuntu 22.04 or earlier. It is caused by
+the ``systemd-coredump`` package installed with the ``scylla_setup`` script.
+Installing ``systemd-coredump`` results in removing ``apport`` and ``ubuntu-server``.
+In turn, the ``--autoremove`` option marks for removal all packages installed
+as dependencies of ``ubuntu-server``.
+
+
+Solution
+^^^^^^^^^^
+
+Do not use the ``--autoremove`` option when removing ScyllaDB.
--- a/docs/troubleshooting/drop-table-space-up.rst
+++ b/docs/troubleshooting/drop-table-space-up.rst
@@ -16,6 +16,4 @@ Solution
 2. If you are deleting an entire keyspace, repeat the procedure above for every table inside the keyspace.
 3. This behavior is controlled by the ``auto_snapshot`` flag in ``/etc/scylla/scylla.yaml``, which set to true by default. To stop taking snapshots on deletion, set that flag to false and restart all your scylla nodes.

-.. note:: Alternatively you can use the``rm`` Linux utility to remove the files. If you do, keep in mind that the ``rm`` Linux utility is not aware if some snapshots are still associated with existing keyspaces, but nodetool is. 
- 
-
+.. note:: Alternatively you can use the ``rm`` Linux utility to remove the files. If you do, keep in mind that the ``rm`` Linux utility is not aware if some snapshots are still associated with existing keyspaces, but nodetool is.
--- a/docs/troubleshooting/index.rst
+++ b/docs/troubleshooting/index.rst
@@ -14,6 +14,7 @@ Troubleshooting ScyllaDB
   storage/index
   CQL/index
   monitor/index
+   install-remove/index


 ScyllaDB's troubleshooting section contains articles which are targeted to pinpoint and answer problems with ScyllaDB. For broader issues and workarounds, consult the :doc:`Knowledge base </kb/index>`.
@@ -33,6 +34,7 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
  * :doc:`Data Storage and SSTables <storage/index>`
  * :doc:`CQL errors <CQL/index>`
  * :doc:`ScyllaDB Monitoring and ScyllaDB Manager <monitor/index>`
+  * :doc:`Installation and Removal <install-remove/index>`

 Also check out the `Monitoring lesson <https://university.scylladb.com/courses/scylla-operations/lessons/scylla-monitoring/>`_ on ScyllaDB University, which covers how to troubleshoot different issues when running a ScyllaDB cluster.

--- a/docs/troubleshooting/install-remove/index.rst
+++ b/docs/troubleshooting/install-remove/index.rst
@@ -0,0 +1,13 @@
+Installation and Removal
+===========================
+
+.. toctree::
+   :hidden:
+   :maxdepth: 2 
+
+   Removing ScyllaDB on Ubuntu breaks system packages </troubleshooting/autoremove-ubuntu/>
+
+
+
+* :doc:`Removing ScyllaDB with the "--autoremove" option on Ubuntu breaks system packages </troubleshooting/autoremove-ubuntu/>`
+  
--- a/docs/troubleshooting/report-scylla-problem.rst
+++ b/docs/troubleshooting/report-scylla-problem.rst
@@ -279,17 +279,12 @@ Once you have collected and compressed your reports, send them to ScyllaDB for a
   curl -X PUT https://upload.scylladb.com/$report_uuid/yourfile -T yourfile


-For example with the health check report and node health check report:
-
-
-.. code-block:: shell
-
-   curl -X PUT https://upload.scylladb.com/$report_uuid/output_files.tgz -T output_files.tgz
+For example, with Scylla Doctor's vitals:

  
 .. code-block:: shell
 
-   curl -X PUT https://upload.scylladb.com/$report_uuid/192.0.2.0-health-check-report.txt -T 192.0.2.0-health-check-report.txt
+   curl -X PUT https://upload.scylladb.com/$report_uuid/my_cluster_123_vitals.tgz -T my_cluster_123_vitals.tgz


 The **UUID** you generated replaces the variable ``$report_uuid`` at runtime. ``yourfile`` is any file you need to send to ScyllaDB support.
--- a/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
+++ b/docs/upgrade/upgrade-to-enterprise/upgrade-guide-from-5.2-to-2023.1/upgrade-guide-from-5.2-to-2023.1-generic.rst
@@ -162,54 +162,27 @@ Download and install the new release

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

-        Before upgrading, check what version you are running now using ``scylla --version``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
+      Before upgrading, check what version you are running now using ``scylla --version``. Note this version, because you should use the same version if you need to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.

-        There are two alternative upgrade procedures: upgrading ScyllaDB and simultaneously updating 3rd party and OS packages - recommended if you 
-        are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04, and upgrading ScyllaDB without updating 
-        any external packages.
+      If you’re using the ScyllaDB official image (recommended), see
+      the **Debian/Ubuntu** tab for upgrade instructions. If you’re using your
+      own image and have installed ScyllaDB packages for Ubuntu or Debian,
+      you need to apply an extended upgrade procedure:
+      
+      #. Update the ScyllaDB deb repo (see above).
+      #. Configure Java 1.8 (see above).
+      #. Install the new ScyllaDB version with the additional 
+         ``scylla-enterprise-machine-image`` package:

-        **To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
-
-        Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
-
-        #. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
-
-        #. Load the new repo:
-
-            .. code:: sh
-
-               sudo apt-get update
-
-        #. Run the following command to update the manifest file:
-
-            .. code:: sh
-
-               cat scylla-enterprise-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
-
-            Where:
-
-              * ``<version>`` - The ScyllaDB Enterprise version to which you are upgrading ( |NEW_VERSION| ).
-              * ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
-
-            The file is included in the ScyllaDB Enterprise packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
-
-            Example:
-
-                .. code:: sh
-
-                   cat scylla-enterprise-packages-2022.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
-
-
-                .. note::
-
-                   Alternatively, you can update the manifest file with the following command:
-
-                   ``sudo apt-get install $(awk '{print $1'} scylla-enterprise-packages-<version>-<arch>.txt) -y``
-
-
-
-        To upgrade ScyllaDB without updating any external packages, follow the :ref:`download and installation instructions for Debian/Ubuntu <upgrade-debian-ubuntu-5.2-to-enterprise-2023.1>`.
+          .. code::
+         
+           sudo apt-get clean all
+           sudo apt-get update
+           sudo apt-get dist-upgrade scylla-enterprise
+           sudo apt-get dist-upgrade scylla-enterprise-machine-image

+      #. Run ``scylla_setup`` without running ``io_setup``.
+      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

 Start the node
 --------------
--- a/generic_server.cc
+++ b/generic_server.cc
@@ -38,12 +38,13 @@ connection::~connection()
    _server._connections_list.erase(iter);
 }

-future<> server::for_each_gently(noncopyable_function<future<>(connection&)> fn) {
+future<> server::for_each_gently(noncopyable_function<void(connection&)> fn) {
    _gentle_iterators.emplace_front(*this);
    std::list<gentle_iterator>::iterator gi = _gentle_iterators.begin();
    return seastar::do_until([ gi ] { return gi->iter == gi->end; },
        [ gi, fn = std::move(fn) ] {
-            return fn(*(gi->iter++));
+            fn(*(gi->iter++));
+            return make_ready_future<>();
        }
    ).finally([ this, gi ] { _gentle_iterators.erase(gi); });
 }
--- a/generic_server.hh
+++ b/generic_server.hh
@@ -118,7 +118,7 @@ protected:

    virtual future<> unadvertise_connection(shared_ptr<connection> conn);

-    future<> for_each_gently(noncopyable_function<future<>(connection&)>);
+    future<> for_each_gently(noncopyable_function<void(connection&)>);
 };

 }
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -149,7 +149,7 @@ public:
    // the "features_enable_test_feature" injection is enabled.
    // This feature MUST NOT be advertised in release mode!
    gms::feature test_only_feature { *this, "TEST_ONLY_FEATURE"sv };
-
+    gms::feature enforced_raft_rpc_scheduling_group { *this, "ENFORCED_RAFT_RPC_SCHEDULING_GROUP"sv };
 public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
--- a/gms/gossip_digest_syn.cc
+++ b/gms/gossip_digest_syn.cc
@@ -13,8 +13,8 @@
 auto fmt::formatter<gms::gossip_digest_syn>::format(const gms::gossip_digest_syn& syn, fmt::format_context& ctx) const
        -> decltype(ctx.out()) {
    auto out = ctx.out();
-    // out = fmt::format_to(out, "cluster_id:{},partioner:{},group0_id{},"
-    //                      syn._cluster_id, syn._partioner, syn._group0_id);
+    out = fmt::format_to(out, "cluster_id:{},partioner:{},group0_id{},",
+                         syn._cluster_id, syn._partioner, syn._group0_id);
    out = fmt::format_to(out, "digests:{{");
    for (auto& d : syn._digests) {
        out = fmt::format_to(out, "{} ", d);
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -356,31 +356,30 @@ future<> gossiper::handle_ack_msg(msg_addr id, gossip_digest_ack ack_msg) {
 }

 future<> gossiper::do_send_ack2_msg(msg_addr from, utils::chunked_vector<gossip_digest> ack_msg_digest) {
-    return futurize_invoke([this, from, ack_msg_digest = std::move(ack_msg_digest)] () mutable {
-        /* Get the state required to send to this gossipee - construct GossipDigestAck2Message */
-        std::map<inet_address, endpoint_state> delta_ep_state_map;
-        for (auto g_digest : ack_msg_digest) {
-            inet_address addr = g_digest.get_endpoint();
-            const auto es = get_endpoint_state_ptr(addr);
-            if (!es || es->get_heart_beat_state().get_generation() < g_digest.get_generation()) {
-                continue;
-            }
-            // Local generation for addr may have been increased since the
-            // current node sent an initial SYN. Comparing versions across
-            // different generations in get_state_for_version_bigger_than
-            // could result in losing some app states with smaller versions.
-            const auto version = es->get_heart_beat_state().get_generation() > g_digest.get_generation()
-                ? version_type(0)
-                : g_digest.get_max_version();
-            auto local_ep_state_ptr = this->get_state_for_version_bigger_than(addr, version);
-            if (local_ep_state_ptr) {
-                delta_ep_state_map.emplace(addr, *local_ep_state_ptr);
-            }
+    /* Get the state required to send to this gossipee - construct GossipDigestAck2Message */
+    std::map<inet_address, endpoint_state> delta_ep_state_map;
+    for (auto g_digest : ack_msg_digest) {
+        inet_address addr = g_digest.get_endpoint();
+        const auto es = get_endpoint_state_ptr(addr);
+        if (!es || es->get_heart_beat_state().get_generation() < g_digest.get_generation()) {
+            continue;
        }
-        gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
-        logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
-        return ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
-    });
+        // Local generation for addr may have been increased since the
+        // current node sent an initial SYN. Comparing versions across
+        // different generations in get_state_for_version_bigger_than
+        // could result in losing some app states with smaller versions.
+        const auto version = es->get_heart_beat_state().get_generation() > g_digest.get_generation()
+            ? version_type(0)
+            : g_digest.get_max_version();
+        auto local_ep_state_ptr = get_state_for_version_bigger_than(addr, version);
+        if (local_ep_state_ptr) {
+            delta_ep_state_map.emplace(addr, *local_ep_state_ptr);
+        }
+    }
+    gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
+    logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
+    co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
+    logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
 }

 // Depends on
@@ -683,6 +682,10 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
                // If there is no host id in the new state there should be one locally
                hid = get_host_id(ep);
            }
+            if (hid == my_host_id()) {
+                 logger.trace("Ignoring gossip for {} because it maps to local id, but is not local address", ep);
+                 return make_ready_future<>();
+            }
            if (_topo_sm->_topology.left_nodes.contains(raft::server_id(hid.uuid()))) {
                logger.trace("Ignoring gossip for {} because it left", ep);
                return make_ready_future<>();
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -333,8 +333,10 @@ public:

    void set_topology_state_machine(service::topology_state_machine* m) {
        _topo_sm = m;
-        // In raft topology mode the coodinator maintains banned nodes list
-        _just_removed_endpoints.clear();
+        if (m) {
+            // In raft topology mode the coordinator maintains banned nodes list
+            _just_removed_endpoints.clear();
+        }
    }

 private:
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -677,6 +677,8 @@ future<> global_vnode_effective_replication_map::get_keyspace_erms(sharded<repli
        // all under the lock.
        auto lk = co_await db.get_shared_token_metadata().get_lock();
        auto erm = db.find_keyspace(keyspace_name).get_vnode_effective_replication_map();
+        utils::get_local_injector().inject("get_keyspace_erms_throw_no_such_keyspace",
+                [&keyspace_name] { throw data_dictionary::no_such_keyspace{keyspace_name}; });
        auto ring_version = erm->get_token_metadata().get_ring_version();
        _erms[0] = make_foreign(std::move(erm));
        co_await coroutine::parallel_for_each(boost::irange(1u, smp::count), [this, &sharded_db, keyspace_name, ring_version] (unsigned shard) -> future<> {
--- a/locator/network_topology_strategy.cc
+++ b/locator/network_topology_strategy.cc
@@ -269,9 +269,9 @@ network_topology_strategy::calculate_natural_endpoints(
 }

 void network_topology_strategy::validate_options(const gms::feature_service& fs) const {
-    if(_config_options.empty()) {
-        throw exceptions::configuration_exception("Configuration for at least one datacenter must be present");
-    }
+    // #22688 / #20039 - we want to remove dc:s once rf=0, and we
+    // also want to allow fully setting rf=0 in _all_ dc:s (hello data loss)
+    // so empty options here are in fact ok. Removed check for it
    validate_tablet_options(*this, fs, _config_options);
    auto tablet_opts = recognized_tablet_options();
    for (auto& c : _config_options) {
@@ -376,7 +376,10 @@ future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_
        ++nodes_per_dc[node.dc_rack().dc];
    }

-    for (const auto& [dc, dc_rf] : _dc_rep_factor) {
+    // #22688 - take all dcs in topology into account when determining migration.
+    // Any change should still have been pre-checked to never exceed rf factor one.
+    for (const auto& dc : tm->get_topology().get_datacenters()) {
+        auto dc_rf = get_replication_factor(dc);
        auto dc_node_count = nodes_per_dc[dc];
        if (dc_rf == dc_node_count) {
            continue;
@@ -430,8 +433,8 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
            continue;
        }
        const auto& existing = replicas_per_rack[rack];
-        auto& candidate = existing.empty() ?
-                new_racks.emplace_back(rack) : existing_racks.emplace_back(rack);
+        candidates_list& rack_list = existing.empty() ? new_racks : existing_racks;
+        auto& candidate = rack_list.emplace_back(rack);
        for (const auto& node : nodes) {
            if (!node->is_normal()) {
                continue;
@@ -442,7 +445,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
            }
        }
        if (candidate.nodes.empty()) {
-            existing_racks.pop_back();
+            rack_list.pop_back();
            tablet_logger.trace("allocate_replica {}.{}: no candidate nodes left on rack={}", s->ks_name(), s->cf_name(), rack);
            // Note that this rack can't be in new_racks since
            // those had no existing replicas and if current rack has no nodes
--- a/locator/tablets.cc
+++ b/locator/tablets.cc
@@ -136,6 +136,18 @@ std::optional<tablet_replica> get_leaving_replica(const tablet_info& tinfo, cons
    return *leaving.begin();
 }

+bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo) {
+    if (replica == locator::get_leaving_replica(tinfo, trinfo)) {
+        // we do tablet cleanup on the leaving replica in the `cleanup` stage, after which there is only the `end_migration` stage.
+        return trinfo.stage == locator::tablet_transition_stage::end_migration;
+    }
+    if (replica == trinfo.pending_replica) {
+        // we do tablet cleanup on the pending replica in the `cleanup_target` stage, after which there is only the `revert_migration` stage.
+        return trinfo.stage == locator::tablet_transition_stage::revert_migration;
+    }
+    return false;
+}
+
 tablet_replica_set get_new_replicas(const tablet_info& tinfo, const tablet_migration_info& mig) {
    return replace_replica(tinfo.replicas, mig.src, mig.dst);
 }
--- a/locator/tablets.hh
+++ b/locator/tablets.hh
@@ -222,6 +222,10 @@ struct tablet_transition_info {
 // Returns the leaving replica for a given transition.
 std::optional<tablet_replica> get_leaving_replica(const tablet_info&, const tablet_transition_info&);

+// True if the tablet is transitioning and it's in a stage that follows the stage
+// where we clean up the tablet on the given replica.
+bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo);
+
 /// Represents intention to move a single tablet replica from src to dst.
 struct tablet_migration_info {
    locator::tablet_transition_kind kind;
--- a/main.cc
+++ b/main.cc
@@ -91,11 +91,7 @@

 #include "redis/controller.hh"
 #include "cdc/log.hh"
-#include "cdc/cdc_extension.hh"
 #include "cdc/generation_service.hh"
-#include "tombstone_gc_extension.hh"
-#include "db/tags/extension.hh"
-#include "db/paxos_grace_seconds_extension.hh"
 #include "service/qos/standard_service_level_distributed_data_accessor.hh"
 #include "service/storage_proxy.hh"
 #include "service/mapreduce_service.hh"
@@ -103,7 +99,6 @@
 #include "alternator/ttl.hh"
 #include "tools/entry_point.hh"
 #include "test/perf/entry_point.hh"
-#include "db/per_partition_rate_limit_extension.hh"
 #include "lang/manager.hh"
 #include "sstables/sstables_manager.hh"
 #include "db/virtual_tables.hh"
@@ -635,15 +630,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    app_template app(std::move(app_cfg));

    auto ext = std::make_shared<db::extensions>();
-    ext->add_schema_extension<db::tags_extension>(db::tags_extension::NAME);
-    ext->add_schema_extension<cdc::cdc_extension>(cdc::cdc_extension::NAME);
-    ext->add_schema_extension<db::paxos_grace_seconds_extension>(db::paxos_grace_seconds_extension::NAME);
-    ext->add_schema_extension<tombstone_gc_extension>(tombstone_gc_extension::NAME);
-    ext->add_schema_extension<db::per_partition_rate_limit_extension>(db::per_partition_rate_limit_extension::NAME);
-
    auto cfg = make_lw_shared<db::config>(ext);
    auto init = app.get_options_description().add_options();

+    cfg->add_all_default_extensions();
+
    init("version", bpo::bool_switch(), "print version number and exit");
    init("build-id", bpo::bool_switch(), "print build-id and exit");
    init("build-mode", bpo::bool_switch(), "print build mode and exit");
@@ -1658,18 +1649,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::stop).get();
            });

-            group0_service.start().get();
-            auto stop_group0_service = defer_verbose_shutdown("group 0 service", [&group0_service] {
-                sl_controller.local().abort_group0_operations();
-                group0_service.abort().get();
-            });
-
-            utils::get_local_injector().inject("stop_after_starting_group0_service",
-                [] { std::raise(SIGSTOP); });
-
-            // Set up group0 service earlier since it is needed by group0 setup just below
-            ss.local().set_group0(group0_service);
-
            supervisor::notify("starting view update generator");
            view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(stop_signal.as_sharded_abort_source())).get();
            auto stop_view_update_generator = defer_verbose_shutdown("view update generator", [] {
@@ -1936,6 +1915,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
             */
            db.local().enable_autocompaction_toggle();

+            group0_service.start().get();
+            auto stop_group0_service = defer_verbose_shutdown("group 0 service", [&group0_service] {
+                sl_controller.local().abort_group0_operations();
+                group0_service.abort().get();
+            });
+
+            utils::get_local_injector().inject("stop_after_starting_group0_service",
+                [] { std::raise(SIGSTOP); });
+
+            // Set up group0 service earlier since it is needed by group0 setup just below
+            ss.local().set_group0(group0_service);
+
            const auto generation_number = gms::generation_type(sys_ks.local().increment_and_get_generation().get());

            // Load address_map from system.peers and subscribe to gossiper events to keep it updated.
@@ -2019,6 +2010,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                api::unset_server_authorization_cache(ctx).get();
            });

+            // update the service level cache after the SL data accessor and auth service are initialized.
+            if (sl_controller.local().is_v2()) {
+                sl_controller.local().update_cache(qos::update_both_cache_levels::yes).get();
+            }
+
            sl_controller.invoke_on_all([&lifecycle_notifier] (qos::service_level_controller& controller) {
                lifecycle_notifier.local().register_subscriber(&controller);
            }).get();
@@ -2090,6 +2086,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            if (cfg->view_building()) {
                view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier()).get();
            }
+            auto drain_view_builder = defer_verbose_shutdown("draining view builders", [&] {
+                view_builder.invoke_on_all(&db::view::view_builder::drain).get();
+            });

            api::set_server_view_builder(ctx, view_builder).get();
            auto stop_vb_api = defer_verbose_shutdown("view builder API", [&ctx] {
@@ -2134,6 +2133,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                ss.local().drain_on_shutdown().get();
            });

+            auth_service.local().ensure_superuser_is_created().get();
            ss.local().register_protocol_server(cql_server_ctl, cfg->start_native_transport()).get();
            api::set_transport_controller(ctx, cql_server_ctl).get();
            auto stop_transport_controller = defer_verbose_shutdown("transport controller API", [&ctx] {
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -584,6 +584,18 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::JOIN_NODE_RESPONSE:
    case messaging_verb::JOIN_NODE_QUERY:
    case messaging_verb::TASKS_GET_CHILDREN:
+    case messaging_verb::RAFT_SEND_SNAPSHOT:
+    case messaging_verb::RAFT_APPEND_ENTRIES:
+    case messaging_verb::RAFT_APPEND_ENTRIES_REPLY:
+    case messaging_verb::RAFT_VOTE_REQUEST:
+    case messaging_verb::RAFT_VOTE_REPLY:
+    case messaging_verb::RAFT_TIMEOUT_NOW:
+    case messaging_verb::RAFT_READ_QUORUM:
+    case messaging_verb::RAFT_READ_QUORUM_REPLY:
+    case messaging_verb::RAFT_EXECUTE_READ_BARRIER_ON_LEADER:
+    case messaging_verb::RAFT_ADD_ENTRY:
+    case messaging_verb::RAFT_MODIFY_CONFIG:
+    case messaging_verb::RAFT_PULL_SNAPSHOT:
        // See comment above `TOPOLOGY_INDEPENDENT_IDX`.
        // DO NOT put any 'hot' (e.g. data path) verbs in this group,
        // only verbs which are 'rare' and 'cheap'.
@@ -637,19 +649,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::PAXOS_ACCEPT:
    case messaging_verb::PAXOS_LEARN:
    case messaging_verb::PAXOS_PRUNE:
-    case messaging_verb::RAFT_SEND_SNAPSHOT:
-    case messaging_verb::RAFT_APPEND_ENTRIES:
-    case messaging_verb::RAFT_APPEND_ENTRIES_REPLY:
-    case messaging_verb::RAFT_VOTE_REQUEST:
-    case messaging_verb::RAFT_VOTE_REPLY:
-    case messaging_verb::RAFT_TIMEOUT_NOW:
-    case messaging_verb::RAFT_READ_QUORUM:
-    case messaging_verb::RAFT_READ_QUORUM_REPLY:
-    case messaging_verb::RAFT_EXECUTE_READ_BARRIER_ON_LEADER:
-    case messaging_verb::RAFT_ADD_ENTRY:
-    case messaging_verb::RAFT_MODIFY_CONFIG:
    case messaging_verb::DIRECT_FD_PING:
-    case messaging_verb::RAFT_PULL_SNAPSHOT:
        return 2;
    case messaging_verb::MUTATION_DONE:
    case messaging_verb::MUTATION_FAILED:
--- a/mutation/frozen_mutation.hh
+++ b/mutation/frozen_mutation.hh
@@ -134,9 +134,7 @@ public:

    auto on_end_of_partition() {
        flush_rows_and_tombstones(position_in_partition::after_all_clustered_rows());
-        if (_consumer.consume_end_of_partition()) {
-            _stop_consuming = stop_iteration::yes;
-        }
+        _stop_consuming = _consumer.consume_end_of_partition();
        using consume_res_type = decltype(_consumer.consume_end_of_stream());
        if constexpr (std::is_same_v<consume_res_type, void>) {
            _consumer.consume_end_of_stream();
--- a/mutation/mutation.cc
+++ b/mutation/mutation.cc
@@ -356,7 +356,13 @@ void mutation_partition_json_writer::write_atomic_cell_value(const atomic_cell_v
 }

 void mutation_partition_json_writer::write_collection_value(const collection_mutation_view_description& mv, data_type type) {
-    write_each_collection_cell(mv, type, [&] (atomic_cell_view v, data_type t) { write_atomic_cell_value(v, t); });
+    write_each_collection_cell(mv, type, [&] (atomic_cell_view v, data_type t) {
+        if (v.is_live()) {
+            write_atomic_cell_value(v, t);
+        } else {
+            writer().Null();
+        }
+    });
 }

 void mutation_partition_json_writer::write(gc_clock::duration ttl, gc_clock::time_point expiry) {
--- a/mutation/mutation_compactor.hh
+++ b/mutation/mutation_compactor.hh
@@ -164,8 +164,7 @@ class compact_mutation_state {
    uint32_t _current_partition_limit;
    bool _empty_partition{};
    bool _empty_partition_in_gc_consumer{};
-    const dht::decorated_key* _dk{};
-    dht::decorated_key _last_dk;
+    std::optional<dht::decorated_key> _dk;
    bool _return_static_content_on_partition_with_no_rows{};

    std::optional<static_row> _last_static_row;
@@ -310,7 +309,6 @@ public:
        , _partition_limit(partition_limit)
        , _partition_row_limit(_slice.options.contains(query::partition_slice::option::distinct) ? 1 : slice.partition_row_limit())
        , _tombstone_gc_state(nullptr)
-        , _last_dk({dht::token(), partition_key::make_empty()})
        , _last_pos(position_in_partition::for_partition_end())
        , _validator("mutation_compactor for read", _schema, validation_level)
    {
@@ -326,7 +324,6 @@ public:
        , _can_gc([this] (tombstone t, is_shadowable is_shadowable) { return can_gc(t, is_shadowable); })
        , _slice(s.full_slice())
        , _tombstone_gc_state(gc_state)
-        , _last_dk({dht::token(), partition_key::make_empty()})
        , _last_pos(position_in_partition::for_partition_end())
        , _collector(std::make_unique<mutation_compactor_garbage_collector>(_schema))
        // We already have a validator for compaction in the sstable writer, no need to validate twice
@@ -337,10 +334,10 @@ public:

    void consume_new_partition(const dht::decorated_key& dk) {
        _validator(mutation_fragment_v2::kind::partition_start, position_in_partition_view::for_partition_start(), {});
-        _validator(dk);
        _stop = stop_iteration::no;
-        auto& pk = dk.key();
-        _dk = &dk;
+        _dk = dk;
+        auto& pk = _dk->key();
+        _validator(*_dk);
        _return_static_content_on_partition_with_no_rows =
            _slice.options.contains(query::partition_slice::option::always_return_static_content) ||
            !has_ck_selector(_slice.row_ranges(_schema, pk));
@@ -531,10 +528,6 @@ public:
    requires CompactedFragmentsConsumerV2<Consumer> && CompactedFragmentsConsumerV2<GCConsumer>
    auto consume_end_of_stream(Consumer& consumer, GCConsumer& gc_consumer) {
        _validator.on_end_of_stream();
-        if (_dk) {
-            _last_dk = *_dk;
-            _dk = &_last_dk;
-        }
        if constexpr (std::is_void_v<std::invoke_result_t<decltype(&GCConsumer::consume_end_of_stream), GCConsumer&>>) {
            gc_consumer.consume_end_of_stream();
            return consumer.consume_end_of_stream();
@@ -546,7 +539,7 @@ public:
    /// The decorated key of the partition the compaction is positioned in.
    /// Can be null if the compaction wasn't started yet.
    const dht::decorated_key* current_partition() const {
-        return _dk;
+        return _dk ? &*_dk : nullptr;
    }

    // Only updated when SSTableCompaction == compact_for_sstables::no.
@@ -629,7 +622,7 @@ public:
        if (!_stop) {
            return {};
        }
-        partition_start ps(std::move(_last_dk), _partition_tombstone);
+        partition_start ps(*std::exchange(_dk, std::nullopt), _partition_tombstone);
        if (_effective_tombstone) {
            return detached_compaction_state{std::move(ps), std::move(_last_static_row),
                    range_tombstone_change(position_in_partition::after_key(_schema, _last_pos), _effective_tombstone)};
--- a/mutation_writer/multishard_writer.cc
+++ b/mutation_writer/multishard_writer.cc
@@ -224,7 +224,7 @@ future<uint64_t> distribute_reader_and_consume_on_shards(schema_ptr s,
    std::function<future<> (mutation_reader)> consumer,
    utils::phased_barrier::operation&& op) {
    return do_with(multishard_writer(std::move(s), sharder, std::move(producer), std::move(consumer)), std::move(op), [] (multishard_writer& writer, utils::phased_barrier::operation&) {
-        return writer().finally([&writer] {
+        return seastar::futurize_invoke(writer).finally([&writer] {
            return writer.close();
        });
    });
--- a/mutation_writer/token_group_based_splitting_writer.cc
+++ b/mutation_writer/token_group_based_splitting_writer.cc
@@ -35,6 +35,19 @@ private:
        }
    }

+    // Keeps the previous writer alive while it is being closed
+    // and then allocates a new writer, if needed.
+    future<> do_switch_to_new_writer() {
+        _current_writer->consume_end_of_stream();
+        // reset _current_writer while closing the previous one
+        // to prevent race with close() after abort()
+        auto wr = std::exchange(_current_writer, std::nullopt);
+        co_await wr->close();
+        allocate_new_writer_if_needed();
+    }
+
+    // Called frequently, hence yields (and allocates)
+    // only on the unlikely slow path.
    future<> maybe_switch_to_new_writer(dht::token t) {
        auto prev_group_id = _current_group_id;
        _current_group_id = _classify(t);
@@ -44,11 +57,7 @@ private:
        }

        if (_current_writer && _current_group_id > prev_group_id) [[unlikely]] {
-            _current_writer->consume_end_of_stream();
-            return _current_writer->close().then([this] {
-                _current_writer = std::nullopt;
-                allocate_new_writer_if_needed();
-            });
+            return do_switch_to_new_writer();
        }
        allocate_new_writer_if_needed();
        return make_ready_future<>();
--- a/node_ops/task_manager_module.cc
+++ b/node_ops/task_manager_module.cc
@@ -78,7 +78,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_help
        .entity = "",
        .progress_units = "",
        .progress = tasks::task_manager::task::progress{},
-        .children = started ? co_await get_children(get_module(), id) : std::vector<tasks::task_identity>{}
+        .children = started ? co_await get_children(get_module(), id, [&gossiper = _ss.gossiper()] (gms::inet_address addr) { return gossiper.is_alive(addr); }) : std::vector<tasks::task_identity>{}
    };
 }

--- a/raft/server.cc
+++ b/raft/server.cc
@@ -589,7 +589,9 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
    check_not_aborted();

    if (as && as->abort_requested()) {
-        throw request_aborted(format("Abort requested before waiting for entry with idx: {}, term: {}", eid.idx, eid.term));
+        throw request_aborted(format(
+                "Abort requested before waiting for entry with idx: {}, term: {}; last committed entry: {}, last applied entry: {}",
+                eid.idx, eid.term, _fsm->commit_idx(), _applied_idx));
    }

    auto& container = type == wait_type::committed ? _awaited_commits : _awaited_applies;
@@ -637,9 +639,11 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
    }
    SCYLLA_ASSERT(inserted);
    if (as) {
-        it->second.abort = as->subscribe([it = it, &container] noexcept {
+        it->second.abort = as->subscribe([this, it = it, &container] noexcept {
            it->second.done.set_exception(
-                request_aborted(format("Abort requested while waiting for entry with idx: {}, term: {}", it->first, it->second.term)));
+                request_aborted(format(
+                        "Abort requested while waiting for entry with idx: {}, term: {}; last committed entry: {}, last applied entry: {}",
+                        it->first, it->second.term, _fsm->commit_idx(), _applied_idx)));
            container.erase(it);
        });
        SCYLLA_ASSERT(it->second.abort);
@@ -854,6 +858,10 @@ future<add_entry_reply> server_impl::execute_modify_config(server_id from,
 }

 future<> server_impl::modify_config(std::vector<config_member> add, std::vector<server_id> del, seastar::abort_source* as) {
+    utils::get_local_injector().inject("raft/throw_commit_status_unknown_in_modify_config", [] {
+        throw raft::commit_status_unknown();
+    });
+
    if (!_config.enable_forwarding) {
        const auto leader = _fsm->current_leader();
        if (leader != _id) {
@@ -1451,7 +1459,9 @@ term_t server_impl::get_current_term() const {

 future<> server_impl::wait_for_apply(index_t idx, abort_source* as) {
    if (as && as->abort_requested()) {
-        throw request_aborted(format("Aborted before waiting for applying entry: {}, last applied entry: {}", idx, _applied_idx));
+        throw request_aborted(format(
+                "Aborted before waiting for applying entry: {}, last committed entry: {}, last applied entry: {}",
+                idx, _fsm->commit_idx(), _applied_idx));
    }

    check_not_aborted();
@@ -1463,7 +1473,9 @@ future<> server_impl::wait_for_apply(index_t idx, abort_source* as) {
        if (as) {
            it->second.abort = as->subscribe([this, it] noexcept {
                it->second.promise.set_exception(
-                    request_aborted(format("Aborted while waiting to apply entry: {}, last applied entry: {}", it->first, _applied_idx)));
+                    request_aborted(format(
+                            "Aborted while waiting to apply entry: {}, last committed entry: {}, last applied entry: {}",
+                            it->first, _fsm->commit_idx(), _applied_idx)));
                _awaited_indexes.erase(it);
            });
            SCYLLA_ASSERT(it->second.abort);
--- a/read_context.hh
+++ b/read_context.hh
@@ -126,6 +126,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
    mutation_reader::forwarding _fwd_mr;
    bool _range_query;
    const tombstone_gc_state* _tombstone_gc_state;
+    max_purgeable_fn _get_max_purgeable;
    // When reader enters a partition, it must be set up for reading that
    // partition from the underlying mutation source (_underlying) in one of two ways:
    //
@@ -149,6 +150,7 @@ public:
            const dht::partition_range& range,
            const query::partition_slice& slice,
            const tombstone_gc_state* gc_state,
+            max_purgeable_fn get_max_purgeable,
            tracing::trace_state_ptr trace_state,
            mutation_reader::forwarding fwd_mr)
        : _cache(cache)
@@ -160,6 +162,7 @@ public:
        , _fwd_mr(fwd_mr)
        , _range_query(!query::is_single_partition(range))
        , _tombstone_gc_state(gc_state)
+        , _get_max_purgeable(std::move(get_max_purgeable))
        , _underlying(_cache, *this)
    {
        ++_cache._tracker._stats.reads;
@@ -195,6 +198,7 @@ public:
    void on_underlying_created() { ++_underlying_created; }
    bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
    const tombstone_gc_state* tombstone_gc_state() const { return _tombstone_gc_state; }
+    api::timestamp_type get_max_purgeable(const dht::decorated_key& dk, is_shadowable is) const { return _get_max_purgeable(dk, is); }
 public:
    future<> ensure_underlying() {
        if (_underlying_snapshot) {
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -146,9 +146,6 @@ public:
        promise<> pr;
        std::optional<shared_future<>> fut;
        reader_concurrency_semaphore::read_func func;
-        // Self reference to keep the permit alive while queued for execution.
-        // Must be cleared on all code-paths, otherwise it will keep the permit alive in perpetuity.
-        reader_permit_opt permit_keepalive;
        std::optional<reader_concurrency_semaphore::inactive_read> ir;
    };

@@ -224,8 +221,6 @@ private:
    }

    void on_timeout() {
-        auto keepalive = std::exchange(_aux_data.permit_keepalive, std::nullopt);
-
        _ex = std::make_exception_ptr(timed_out_error{});

        if (_state == state::waiting_for_admission || _state == state::waiting_for_memory || _state == state::waiting_for_execution) {
@@ -487,7 +482,11 @@ public:
        _trace_ptr = std::move(trace_ptr);
    }

-    void check_abort() {
+    bool aborted() const { // true once an abort exception has been stored in _ex
+        return _ex != nullptr;
+    }
+
+    void check_abort() const {
        if (_ex) {
            std::rethrow_exception(_ex);
        }
@@ -636,7 +635,7 @@ void reader_permit::set_trace_state(tracing::trace_state_ptr trace_ptr) noexcept
    _impl->set_trace_state(std::move(trace_ptr));
 }

-void reader_permit::check_abort() {
+void reader_permit::check_abort() const {
    return _impl->check_abort();
 }

@@ -1089,6 +1088,11 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
 reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(mutation_reader reader,
        const dht::partition_range* range) noexcept {
    auto& permit = reader.permit();
+    if (permit->aborted()) {
+        permit->release_base_resources();
+        close_reader(std::move(reader));
+        return inactive_read_handle();
+    }
    if (permit->get_state() == reader_permit::state::waiting_for_memory) {
        // Kill all outstanding memory requests, the read is going to be evicted.
        permit->aux_data().pr.set_exception(std::make_exception_ptr(std::bad_alloc{}));
@@ -1126,6 +1130,7 @@ void reader_concurrency_semaphore::set_notify_handler(inactive_read_handle& irh,
    auto& ir = *(*irh._permit)->aux_data().ir;
    ir.notify_handler = std::move(notify_handler);
    if (ttl_opt) {
+        irh._permit->set_timeout(db::no_timeout);
        ir.ttl_timer.set_callback([this, permit = *irh._permit] () mutable {
            evict(*permit, evict_reason::time);
        });
@@ -1579,10 +1584,10 @@ reader_permit reader_concurrency_semaphore::make_tracking_only_permit(schema_ptr
 }

 future<> reader_concurrency_semaphore::with_permit(schema_ptr schema, const char* const op_name, size_t memory,
-        db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr, read_func func) {
-    auto permit = reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout, std::move(trace_ptr));
+        db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr, reader_permit_opt& permit_holder, read_func func) {
+    permit_holder = reader_permit(*this, std::move(schema), std::string_view(op_name), {1, static_cast<ssize_t>(memory)}, timeout, std::move(trace_ptr));
+    auto permit = *permit_holder; // the permit is now held through the caller's permit_holder instead of a self-reference in aux_data
     permit->aux_data().func = std::move(func);
-    permit->aux_data().permit_keepalive = permit;
     return do_wait_admission(*permit);
 }

@@ -1635,6 +1640,7 @@ void reader_concurrency_semaphore::foreach_permit(noncopyable_function<void(cons
    boost::for_each(_wait_list._admission_queue, std::ref(func));
    boost::for_each(_wait_list._memory_queue, std::ref(func));
    boost::for_each(_ready_list, std::ref(func));
+    boost::for_each(_inactive_reads, std::ref(func));
 }

 void reader_concurrency_semaphore::foreach_permit(noncopyable_function<void(const reader_permit&)> func) const {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -456,7 +456,8 @@ public:
    ///
    /// Some permits cannot be associated with any table, so passing nullptr as
    /// the schema parameter is allowed.
-    future<> with_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr, read_func func);
+    future<> with_permit(schema_ptr schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout,
+            tracing::trace_state_ptr trace_ptr, reader_permit_opt& permit_holder, read_func func);

    /// Run the function through the semaphore's execution stage with a pre-admitted permit
    ///
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -174,7 +174,7 @@ public:

    // If the read was aborted, throw the exception the read was aborted with.
    // Otherwise no-op.
-    void check_abort();
+    void check_abort() const;

    query::max_result_size max_result_size() const;
    void set_max_result_size(query::max_result_size);
--- a/repair/CMakeLists.txt
+++ b/repair/CMakeLists.txt
@@ -2,8 +2,7 @@ add_library(repair STATIC)
 target_sources(repair
  PRIVATE
    repair.cc
-    row_level.cc
-    table_check.cc)
+    row_level.cc)
 target_include_directories(repair
  PUBLIC
    ${CMAKE_SOURCE_DIR})
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -15,7 +15,7 @@
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
 #include "message/messaging_service.hh"
-#include "repair/table_check.hh"
+#include "streaming/table_check.hh"
 #include "replica/database.hh"
 #include "service/migration_manager.hh"
 #include "service/storage_service.hh"
@@ -536,9 +536,10 @@ size_t repair::task_manager_module::nr_running_repair_jobs() {
 }

 future<bool> repair::task_manager_module::is_aborted(const tasks::task_id& uuid, shard_id shard) {
-    return smp::submit_to(shard, [&] () {
-        auto it = get_local_tasks().find(uuid);
-        return it != get_local_tasks().end() && it->second->abort_requested();
+    return get_task_manager().container().invoke_on(shard, [name = get_name(), uuid] (tasks::task_manager& tm) {
+        auto module = tm.find_module(name); // resolve the module by name on the target shard's task_manager; name and uuid are captured by value
+        auto it = module->get_local_tasks().find(uuid);
+        return it != module->get_local_tasks().end() && it->second->abort_requested();
     });
 }

@@ -741,7 +742,7 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra
        co_return;
    }
    try {
-        auto dropped = co_await with_table_drop_silenced(db.local(), mm, table.id, [&] (const table_id& uuid) {
+        auto dropped = co_await streaming::with_table_drop_silenced(db.local(), mm, table.id, [&] (const table_id& uuid) {
            return repair_cf_range_row_level(*this, table.name, table.id, range, neighbors, _small_table_optimization);
        });
        if (dropped) {
@@ -1487,7 +1488,16 @@ future<> repair::data_sync_repair_task_impl::run() {
    auto& keyspace = _status.keyspace;
    auto& sharded_db = rs.get_db();
    auto& db = sharded_db.local();
-    auto germs = make_lw_shared(co_await locator::make_global_effective_replication_map(sharded_db, keyspace));
+    auto germs_fut = co_await coroutine::as_future(locator::make_global_effective_replication_map(sharded_db, keyspace));
+    if (germs_fut.failed()) {
+        auto ex = germs_fut.get_exception();
+        if (try_catch<data_dictionary::no_such_keyspace>(ex)) {
+            rlogger.warn("sync data: keyspace {} does not exist, skipping", keyspace);
+            co_return;
+        }
+        co_await coroutine::return_exception_ptr(std::move(ex));
+    }
+    auto germs = make_lw_shared(germs_fut.get());

    auto id = get_repair_uniq_id();
    rlogger.info("repair[{}]: sync data for keyspace={}, status=started", id.uuid(), keyspace);
@@ -2215,7 +2225,7 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
        }
        table_id tid = t->schema()->id();
        // Invoke group0 read barrier before obtaining erm pointer so that it sees all prior metadata changes
-        auto dropped = co_await repair::table_sync_and_check(_db.local(), _mm, tid);
+        auto dropped = co_await streaming::table_sync_and_check(_db.local(), _mm, tid);
        if (dropped) {
            rlogger.debug("repair[{}] Table {}.{} does not exist anymore", rid.uuid(), keyspace_name, table_name);
            continue;
@@ -2372,6 +2382,16 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
    auto task = co_await _repair_module->make_and_start_task<repair::tablet_repair_task_impl>({}, rid, keyspace_name, table_names, streaming::stream_reason::repair, std::move(task_metas), ranges_parallelism);
 }

+void repair::tablet_repair_task_impl::release_resources() noexcept {
+    _tables = {};
+    _metas_size = _metas.size(); // remember the count first: get_metas_size() falls back to it once _metas is empty
+    _metas = {};
+}
+
+size_t repair::tablet_repair_task_impl::get_metas_size() const noexcept {
+    return _metas.empty() ? _metas_size : _metas.size(); // falls back to the cached _metas_size when _metas is empty
+}
+
 future<> repair::tablet_repair_task_impl::run() {
    auto m = dynamic_pointer_cast<repair::task_manager_module>(_module);
    auto& rs = m->get_repair_service();
@@ -2478,7 +2498,7 @@ future<> repair::tablet_repair_task_impl::run() {
                    auto ep = res.get_exception();
                    sstring ignore_msg;
                    // Ignore the error if the keyspace and/or table were dropped
-                    auto ignore = co_await repair::table_sync_and_check(rs.get_db().local(), rs.get_migration_manager(), m.tid);
+                    auto ignore = co_await streaming::table_sync_and_check(rs.get_db().local(), rs.get_migration_manager(), m.tid);
                    if (ignore) {
                        ignore_msg = format("{} does not exist any more, ignoring it, ",
                                rs.get_db().local().has_keyspace(m.keyspace_name) ? "table" : "keyspace");
@@ -2507,12 +2527,12 @@ future<> repair::tablet_repair_task_impl::run() {
 }

 future<std::optional<double>> repair::tablet_repair_task_impl::expected_total_workload() const {
-    auto sz = _metas.size();
+    auto sz = get_metas_size(); // still reports the count after release_resources() has emptied _metas
     co_return sz ? std::make_optional<double>(sz) : std::nullopt;
 }

 std::optional<double> repair::tablet_repair_task_impl::expected_children_number() const {
-    return _metas.size();
+    return get_metas_size(); // uses the cached count when _metas has already been released
 }

 node_ops_cmd_category categorize_node_ops_cmd(node_ops_cmd cmd) noexcept {
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -3184,6 +3184,7 @@ future<> repair_service::stop() {
        rlogger.debug("Unregistering gossiper helper");
        co_await _gossiper.local().unregister_(_gossip_helper);
    }
+    co_await async_gate().close();
    _stopped = true;
    rlogger.info("Stopped repair_service");
  } catch (...) {
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -109,6 +109,8 @@ class repair_service : public seastar::peering_sharded_service<repair_service> {
    shared_ptr<row_level_repair_gossip_helper> _gossip_helper;
    bool _stopped = false;

+    gate _gate;
+
    size_t _max_repair_memory;
    seastar::semaphore _memory_sem;
    seastar::named_semaphore _load_parallelism_semaphore = {16, named_semaphore_exception_factory{"Load repair history parallelism"}};
@@ -191,6 +193,7 @@ public:
    size_t max_repair_memory() const { return _max_repair_memory; }
    seastar::semaphore& memory_sem() { return _memory_sem; }
    gms::inet_address my_address() const noexcept;
+    gate& async_gate() noexcept { return _gate; }

    repair::task_manager_module& get_repair_module() noexcept {
        return *_repair_module;
--- a/repair/task_manager_module.hh
+++ b/repair/task_manager_module.hh
@@ -108,6 +108,7 @@ private:
    std::vector<tablet_repair_task_meta> _metas;
    optimized_optional<abort_source::subscription> _abort_subscription;
    std::optional<int> _ranges_parallelism;
+    size_t _metas_size = 0;
 public:
    tablet_repair_task_impl(tasks::task_manager::module_ptr module, repair_uniq_id id, sstring keyspace, std::vector<sstring> tables, streaming::stream_reason reason, std::vector<tablet_repair_task_meta> metas, std::optional<int> ranges_parallelism)
        : repair_task_impl(module, id.uuid(), id.id, "keyspace", keyspace, "", "", tasks::task_id::create_null_id(), reason)
@@ -121,6 +122,10 @@ public:
    virtual tasks::is_abortable is_abortable() const noexcept override {
        return tasks::is_abortable(!_abort_subscription);
    }
+
+    virtual void release_resources() noexcept override;
+private:
+    size_t get_metas_size() const noexcept;
 protected:
    future<> run() override;

--- a/replica/compaction_group.hh
+++ b/replica/compaction_group.hh
@@ -150,15 +150,10 @@ public:
    void add_maintenance_sstable(sstables::shared_sstable sst);
    api::timestamp_type max_seen_timestamp() const { return _max_seen_timestamp; }

-    // Update main sstable set based on info in completion descriptor, where input sstables
-    // will be replaced by output ones, row cache ranges are possibly invalidated and
-    // statistics are updated.
-    future<> update_main_sstable_list_on_compaction_completion(sstables::compaction_completion_desc desc);
-
-    // This will update sstable lists on behalf of off-strategy compaction, where
-    // input files will be removed from the maintenance set and output files will
-    // be inserted into the main set.
-    future<> update_sstable_lists_on_off_strategy_completion(sstables::compaction_completion_desc desc);
+    // Update main and/or maintenance sstable sets based on info in completion descriptor,
+    // where input sstables will be replaced by output ones, row cache ranges are possibly
+    // invalidated and statistics are updated.
+    future<> update_sstable_sets_on_compaction_completion(sstables::compaction_completion_desc desc);

    const lw_shared_ptr<sstables::sstable_set>& main_sstables() const noexcept;
    void set_main_sstables(lw_shared_ptr<sstables::sstable_set> new_main_sstables);
@@ -340,13 +335,13 @@ public:
    // new tablet replica is allocated).
    virtual future<> update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;

-    virtual compaction_group& compaction_group_for_token(dht::token token) const noexcept = 0;
+    virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
    virtual utils::chunked_vector<compaction_group*> compaction_groups_for_token_range(dht::token_range tr) const = 0;
-    virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept = 0;
-    virtual compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept = 0;
+    virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
+    virtual compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const = 0;

    virtual size_t log2_storage_groups() const = 0;
-    virtual storage_group& storage_group_for_token(dht::token) const noexcept = 0;
+    virtual storage_group& storage_group_for_token(dht::token) const = 0;

    virtual locator::table_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const noexcept = 0;
    virtual bool all_storage_groups_split() = 0;
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -907,8 +907,15 @@ db::commitlog* database::commitlog_for(const schema_ptr& schema) {
 }

 future<> database::add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg, is_new_cf is_new) {
+    if (schema->is_view()) {
+        try {
+            auto base_schema = find_schema(schema->view_info()->base_id());
+            schema->view_info()->set_base_info(schema->view_info()->make_base_dependent_view_info(*base_schema));
+        } catch (no_such_column_family&) {
+            throw std::invalid_argument("The base table " + schema->view_info()->base_name() + " was already dropped");
+        }
+    }
    schema = local_schema_registry().learn(schema);
-    schema->registry_entry()->mark_synced();
    auto&& rs = ks.get_replication_strategy();
    locator::effective_replication_map_ptr erm;
    if (auto pt_rs = rs.maybe_as_per_table()) {
@@ -940,6 +947,8 @@ future<> database::add_column_family(keyspace& ks, schema_ptr schema, column_fam
        co_await cf->stop();
        co_await coroutine::return_exception_ptr(f.get_exception());
    }
+    // Table must be added before entry is marked synced.
+    schema->registry_entry()->mark_synced();
 }

 future<> database::add_column_family_and_make_directory(schema_ptr schema, is_new_cf is_new) {
@@ -951,6 +960,14 @@ future<> database::add_column_family_and_make_directory(schema_ptr schema, is_ne
 }

 bool database::update_column_family(schema_ptr new_schema) {
+    if (new_schema->is_view()) {
+        try {
+            auto base_schema = find_schema(new_schema->view_info()->base_id());
+            new_schema->view_info()->set_base_info(new_schema->view_info()->make_base_dependent_view_info(*base_schema));
+        } catch (no_such_column_family&) {
+            throw std::invalid_argument("The base table " + new_schema->view_info()->base_name() + " was already dropped");
+        }
+    }
    column_family& cfm = find_column_family(new_schema->id());
    bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
    auto s = local_schema_registry().learn(new_schema);
@@ -958,11 +975,8 @@ bool database::update_column_family(schema_ptr new_schema) {
    cfm.set_schema(s);
    find_keyspace(s->ks_name()).metadata()->add_or_update_column_family(s);
    if (s->is_view()) {
-        try {
-            find_column_family(s->view_info()->base_id()).add_or_update_view(view_ptr(s));
-        } catch (no_such_column_family&) {
-            // Update view mutations received after base table drop.
-        }
+        // We already tested that the base table exists
+        find_column_family(s->view_info()->base_id()).add_or_update_view(view_ptr(s));
    }
    cfm.get_index_manager().reload();
    return columns_changed;
@@ -1499,7 +1513,9 @@ database::query(schema_ptr query_schema, const query::read_command& cmd, query::
            querier_opt->permit().set_trace_state(trace_state);
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(query_schema, "data-query", cf.estimate_read_memory_cost(), timeout, trace_state, read_func));
+            reader_permit_opt permit_holder;
+            f = co_await coroutine::as_future(semaphore.with_permit(query_schema, "data-query", cf.estimate_read_memory_cost(), timeout,
+                        trace_state, permit_holder, read_func));
        }

        if (!f.failed()) {
@@ -1561,7 +1577,9 @@ database::query_mutations(schema_ptr query_schema, const query::read_command& cm
            querier_opt->permit().set_trace_state(trace_state);
            f = co_await coroutine::as_future(semaphore.with_ready_permit(querier_opt->permit(), read_func));
        } else {
-            f = co_await coroutine::as_future(semaphore.with_permit(query_schema, "mutation-query", cf.estimate_read_memory_cost(), timeout, trace_state, read_func));
+            reader_permit_opt permit_holder;
+            f = co_await coroutine::as_future(semaphore.with_permit(query_schema, "mutation-query", cf.estimate_read_memory_cost(), timeout,
+                        trace_state, permit_holder, read_func));
        }

        if (!f.failed()) {
@@ -1712,6 +1730,38 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
    co_return m;
 }

+api::timestamp_type memtable_list::min_live_timestamp(const dht::decorated_key& dk, is_shadowable is, api::timestamp_type max_seen_timestamp) const noexcept {
+    // Per-memtable lower bound: the row-marker minimum when `is` is set, the overall live minimum otherwise.
+    // See get_max_purgeable_timestamp() in compaction.cc for comments on choosing min timestamp.
+    const auto min_ts_of = [is] (const memtable& m) {
+        return is ? m.get_min_live_row_marker_timestamp() : m.get_min_live_timestamp();
+    };
+
+    auto result = api::max_timestamp;
+
+    for (const auto& candidate : _memtables) {
+        const auto candidate_ts = min_ts_of(*candidate);
+        // A memtable whose minimum is above max_seen_timestamp is ignored outright.
+        // Otherwise it counts when it holds the key, or when it is merging to
+        // cache: lookups are unreliable then, because keys that were already
+        // merged will not be seen, so such a memtable is counted unconditionally.
+        const bool relevant = candidate_ts <= max_seen_timestamp
+                && (candidate->is_merging_to_cache() || candidate->contains_partition(dk));
+        if (relevant) {
+            result = std::min(result, candidate_ts);
+        }
+    }
+
+    for (const auto& flushed : _flushed_memtables_with_active_reads) {
+        // A flushed memtable becomes empty once its merge to cache completes,
+        // so a key lookup is meaningless here; only the min ts metadata
+        // is consulted.
+        result = std::min(result, min_ts_of(flushed));
+    }
+
+    return result;
+}
+
 future<> memtable_list::flush() {
    if (!may_flush()) {
        return make_ready_future<>();
@@ -1907,6 +1957,12 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
    // assume failure until proven otherwise
    auto update_writes_failed = defer([&] { ++_stats->total_writes_failed; });

+    utils::get_local_injector().inject("database_apply", [&s] () {
+        if (!is_system_keyspace(s->ks_name())) {
+            throw std::runtime_error("injected error");
+        }
+    });
+
    // I'm doing a nullcheck here since the init code path for db etc
    // is a little in flux and commitlog is created only when db is
    // initied from datadir.
@@ -2271,7 +2327,10 @@ future<> database::flush(const sstring& ksname, const sstring& cfname) {

 future<> database::flush_table_on_all_shards(sharded<database>& sharded_db, table_id id) {
    return sharded_db.invoke_on_all([id] (replica::database& db) {
-        return db.find_column_family(id).flush();
+        if (!db.column_family_exists(id)) {
+            return make_ready_future(); // the table no longer exists on this shard: nothing to flush
+        }
+        return db.find_column_family(id).flush();
    });
 }

@@ -2282,7 +2341,12 @@ future<> database::drop_cache_for_table_on_all_shards(sharded<database>& sharded
 }

 future<> database::flush_table_on_all_shards(sharded<database>& sharded_db, std::string_view ks_name, std::string_view table_name) {
-    return flush_table_on_all_shards(sharded_db, sharded_db.local().find_uuid(ks_name, table_name));
+    try {
+        return flush_table_on_all_shards(sharded_db, sharded_db.local().find_uuid(ks_name, table_name));
+    } catch (no_such_column_family&) {
+        // No such table (presumably thrown by find_uuid()): nothing to flush, report success.
+        return make_ready_future();
+    }
 }

 static future<> force_new_commitlog_segments(std::unique_ptr<db::commitlog>& cl1, std::unique_ptr<db::commitlog>& cl2) {
@@ -2301,6 +2365,9 @@ future<> database::flush_tables_on_all_shards(sharded<database>& sharded_db, std
     * to discard the currently active segment, This ensures we get 
     * as sstable-ish a universe as we can, as soon as we can.
    */
+    if (utils::get_local_injector().enter("flush_tables_on_all_shards_table_drop")) {
+        table_names.push_back("");
+    }
    return sharded_db.invoke_on_all([] (replica::database& db) {
        return force_new_commitlog_segments(db._commitlog, db._schema_commitlog);
    }).then([&, ks_name, table_names = std::move(table_names)] {
@@ -2452,6 +2519,12 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
        });
    });

+    co_await utils::get_local_injector().inject("truncate_compaction_disabled_wait", [] (auto& handler) -> future<> {
+        dblog.info("truncate_compaction_disabled_wait: wait");
+        co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
+        dblog.info("truncate_compaction_disabled_wait: done");
+    }, false);
+
    const auto should_flush = with_snapshot && cf.can_flush();
    dblog.trace("{} {}.{} and views on all shards", should_flush ? "Flushing" : "Clearing", s->ks_name(), s->cf_name());
    std::function<future<>(replica::table&)> flush_or_clear = should_flush ?
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -77,6 +77,8 @@ class compaction_manager;
 class frozen_mutation;
 class reconcilable_result;

+namespace bi = boost::intrusive;
+
 namespace tracing { class trace_state_ptr; }
 namespace s3 { struct endpoint_config; }

@@ -173,8 +175,13 @@ class global_table_ptr;
 class memtable_list {
 public:
    using seal_immediate_fn_type = std::function<future<> (flush_permit&&)>;
+    using intrusive_memtable_list = bi::list<
+            memtable,
+            bi::base_hook<bi::list_base_hook<bi::link_mode<bi::auto_unlink>>>,
+            bi::constant_time_size<false>>;
 private:
    std::vector<shared_memtable> _memtables;
+    intrusive_memtable_list _flushed_memtables_with_active_reads;
    seal_immediate_fn_type _seal_immediate_fn;
    std::function<schema_ptr()> _current_schema;
    replica::dirty_memory_manager* _dirty_memory_manager;
@@ -230,6 +237,15 @@ public:
        return _memtables.back();
    }

+    // Returns the minimum live timestamp. Considers all memtables, including
+    // those that were flushed and removed with erase() but are still in use
+    // by an in-progress read.
+    // Memtables whose min live timestamp > max_seen_timestamp are ignored as we
+    // consider that their content is more recent than any potential tombstone in
+    // other mutation sources.
+    // Returns api::max_timestamp if the key is not in any of the memtables.
+    api::timestamp_type min_live_timestamp(const dht::decorated_key& dk, is_shadowable is, api::timestamp_type max_seen_timestamp) const noexcept;
+
    // # 8904 - this method is akin to std::set::erase(key_type), not
    // erase(iterator). Should be tolerant against non-existing.
    void erase(const shared_memtable& element) noexcept {
@@ -237,6 +253,7 @@ public:
        if (i != _memtables.end()) {
            _memtables.erase(i);
        }
+        _flushed_memtables_with_active_reads.push_back(*element);
    }

    // Synchronously swaps the active memtable with a new, empty one,
@@ -563,8 +580,15 @@ public:
        sstable_list_builder& operator=(const sstable_list_builder&) = delete;
        sstable_list_builder(const sstable_list_builder&) = delete;

+        // Struct to return the newly built sstable set and the removed sstables
+        struct result {
+            lw_shared_ptr<sstables::sstable_set> new_sstable_set; // the updated set returned by build_new_list()
+            std::vector<sstables::shared_sstable> removed_sstables; // sstables that build_new_list() removed
+        };
+
        // Builds new sstable set from existing one, with new sstables added to it and old sstables removed from it.
-        future<lw_shared_ptr<sstables::sstable_set>>
+        // Returns the updated sstable set and a list of removed sstables.
+        future<result>
        build_new_list(const sstables::sstable_set& current_sstables,
                       sstables::sstable_set new_sstable_list,
                       const std::vector<sstables::shared_sstable>& new_sstables,
@@ -593,19 +617,19 @@ private:
    future<> handle_tablet_split_completion(size_t old_tablet_count, const locator::tablet_map& new_tmap);

    // Select a storage group from a given token.
-    storage_group& storage_group_for_token(dht::token token) const noexcept;
+    storage_group& storage_group_for_token(dht::token token) const;
    storage_group& storage_group_for_id(size_t i) const;

    std::unique_ptr<storage_group_manager> make_storage_group_manager();
-    compaction_group* get_compaction_group(size_t id) const noexcept;
+    compaction_group* get_compaction_group(size_t id) const;
    // Select a compaction group from a given token.
-    compaction_group& compaction_group_for_token(dht::token token) const noexcept;
+    compaction_group& compaction_group_for_token(dht::token token) const;
    // Return compaction groups, present in this shard, that own a particular token range.
    utils::chunked_vector<compaction_group*> compaction_groups_for_token_range(dht::token_range tr) const;
    // Select a compaction group from a given key.
-    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept;
+    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const;
    // Select a compaction group from a given sstable based on its token range.
-    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept;
+    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const;
    // Safely iterate through compaction groups, while performing async operations on them.
    future<> parallel_foreach_compaction_group(std::function<future<>(compaction_group&)> action);
    void for_each_compaction_group(std::function<void(compaction_group&)> action);
@@ -613,13 +637,6 @@ private:
    // Unsafe reference to all storage groups. Don't use it across preemption points.
    const storage_group_map& storage_groups() const;

-    // Safely iterate through SSTables, with deletion guard taken to make sure they're not
-    // removed during iteration.
-    // WARNING: Be careful that the action doesn't perform an operation that will itself
-    // take the deletion guard, as that will cause a deadlock. For example, memtable flush
-    // can wait on compaction (backpressure) which in turn takes deletion guard on completion.
-    future<> safe_foreach_sstable(const sstables::sstable_set&, noncopyable_function<future<>(const sstables::shared_sstable&)> action);
-
    // Returns a sstable set that can be safely used for purging any expired tombstone in a compaction group.
    // Only the sstables in the compaction group is not sufficient, since there might be other compaction
    // groups during tablet split with overlapping token range, and we need to include them all in a single
@@ -1627,6 +1644,8 @@ public:

    seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; }

+    seastar::scheduling_group get_gossip_scheduling_group() const { return _dbcfg.gossip_scheduling_group; }
+
    compaction_manager& get_compaction_manager() {
        return _compaction_manager;
    }
--- a/replica/dirty_memory_manager.hh
+++ b/replica/dirty_memory_manager.hh
@@ -100,7 +100,7 @@ struct reclaim_config {
 // A container for memtables. Called "region_group" for historical
 // reasons. Receives updates about memtable size change via the
 // LSA region_listener interface.
-class region_group : public logalloc::region_listener {
+class region_group {
    using region_heap = dirty_memory_manager_logalloc::region_heap;
 public:
    struct allocating_function {
@@ -238,8 +238,6 @@ private:
    future<> release_queued_allocations();
    void notify_unspooled_pressure_relieved();
    friend void region_group_binomial_group_sanity_check(const region_group::region_heap& bh);
-private: // from region_listener
-    virtual void moved(logalloc::region* old_address, logalloc::region* new_address) override;
 public:
    // When creating a region_group, one can specify an optional throttle_threshold parameter. This
    // parameter won't affect normal allocations, but an API is provided, through the region_group's
@@ -268,30 +266,24 @@ public:
    }
    void update_unspooled(ssize_t delta);

-    // It would be easier to call update, but it is unfortunately broken in boost versions up to at
-    // least 1.59.
-    //
-    // One possibility would be to just test for delta sigdness, but we adopt an explicit call for
-    // two reasons:
-    //
-    // 1) it save us a branch
-    // 2) some callers would like to pass delta = 0. For instance, when we are making a region
-    //    evictable / non-evictable. Because the evictable occupancy changes, we would like to call
-    //    the full update cycle even then.
-    virtual void increase_usage(logalloc::region* r, ssize_t delta) override { // From region_listener
+    void increase_usage(logalloc::region* r) { // Called by memtable's region_listener
+        // It would be easier to call update, but it is unfortunately broken in boost versions up to at
+        // least 1.59.
+        //
+        // One possibility would be to just test for delta signedness, but we adopt an explicit call for
+        // two reasons:
+        //
+        // 1) it saves us a branch
+        // 2) some callers would like to pass delta = 0. For instance, when we are making a region
+        //    evictable / non-evictable. Because the evictable occupancy changes, we would like to call
+        //    the full update cycle even then.
        _regions.increase(*static_cast<size_tracked_region*>(r)->_heap_handle);
-        update_unspooled(delta);
    }

-    virtual void decrease_evictable_usage(logalloc::region* r) override { // From region_listener
+    void decrease_usage(logalloc::region* r) { // Called by memtable's region_listener
        _regions.decrease(*static_cast<size_tracked_region*>(r)->_heap_handle);
    }

-    virtual void decrease_usage(logalloc::region* r, ssize_t delta) override { // From region_listener
-        decrease_evictable_usage(r);
-        update_unspooled(delta);
-    }
-
    //
    // Make sure that the function specified by the parameter func only runs when this region_group,
    // as well as each of its ancestors have a memory_used() amount of memory that is lesser or
@@ -332,10 +324,11 @@ private:
    bool execution_permitted() noexcept;

    uint64_t top_region_evictable_space() const noexcept;
-
-    virtual void add(logalloc::region* child) override; // from region_listener
-    virtual void del(logalloc::region* child) override; // from region_listener
-
+public:
+    void add(logalloc::region* child); // Called by memtable's region_listener
+    void del(logalloc::region* child); // Called by memtable's region_listener
+    void moved(logalloc::region* old_address, logalloc::region* new_address); // Called by memtable's region_listener
+private:
    friend class ::test_region_group;
 };

--- a/replica/memtable.cc
+++ b/replica/memtable.cc
@@ -124,14 +124,13 @@ memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm,
    memtable_list* memtable_list, seastar::scheduling_group compaction_scheduling_group)
        : dirty_memory_manager_logalloc::size_tracked_region()
        , _dirty_mgr(dmm)
-        , _cleaner(*this, no_cache_tracker, table_stats.memtable_app_stats, compaction_scheduling_group,
-                   [this] (size_t freed) { remove_flushed_memory(freed); })
+        , _cleaner(*this, no_cache_tracker, table_stats.memtable_app_stats, compaction_scheduling_group)
        , _memtable_list(memtable_list)
        , _schema(std::move(schema))
        , _table_shared_data(table_shared_data)
        , partitions(dht::raw_token_less_comparator{})
        , _table_stats(table_stats) {
-    logalloc::region::listen(&dmm.region_group());
+    logalloc::region::listen(this);
 }

 static thread_local dirty_memory_manager mgr_for_tests;
@@ -149,23 +148,17 @@ memtable::~memtable() {
    logalloc::region::unlisten();
 }

-uint64_t memtable::dirty_size() const {
-    return occupancy().total_space();
-}
-
 void memtable::evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcept {
    e.partition().evict(cleaner);
    nr_partitions--;
 }

 void memtable::clear() noexcept {
-    auto dirty_before = dirty_size();
    with_allocator(allocator(), [this] {
        partitions.clear_and_dispose([this] (memtable_entry* e) noexcept {
            evict_entry(*e, _cleaner);
        });
    });
-    remove_flushed_memory(dirty_before - dirty_size());
 }

 future<> memtable::clear_gently() noexcept {
@@ -176,7 +169,6 @@ future<> memtable::clear_gently() noexcept {
            auto p = std::move(partitions);
            nr_partitions = 0;
            while (!p.empty()) {
-                auto dirty_before = dirty_size();
                with_allocator(alloc, [&] () noexcept {
                    while (!p.empty()) {
                        if (p.begin()->clear_gently() == stop_iteration::no) {
@@ -188,7 +180,6 @@ future<> memtable::clear_gently() noexcept {
                        }
                    }
                });
-                remove_flushed_memory(dirty_before - dirty_size());
                seastar::thread::yield();
            }

@@ -283,6 +274,11 @@ memtable::slice(const dht::partition_range& range) const {
 }

 class iterator_reader {
+    // DO NOT RELEASE the memtable! Keep a reference to it, so it stays in
+    // memtable_list::_flushed_memtables_with_active_reads and so that it keeps
+    // blocking tombstone GC of tombstones in the cache, which cover data that
+    // used to be in this memtable, and which will possibly be produced by this
+    // reader later on.
    lw_shared_ptr<memtable> _memtable;
    schema_ptr _schema;
    const dht::partition_range* _range;
@@ -381,7 +377,6 @@ protected:
                                    streamed_mutation::forwarding fwd,
                                    mutation_reader::forwarding fwd_mr) {
        auto ret = _memtable->_underlying->make_reader_v2(_schema, std::move(permit), delegate, slice, nullptr, fwd, fwd_mr);
-        _memtable = {};
        _last = {};
        return ret;
    }
@@ -525,13 +520,16 @@ public:

 void memtable::add_flushed_memory(uint64_t delta) {
    _flushed_memory += delta;
-    _dirty_mgr.account_potentially_cleaned_up_memory(this, delta);
+    if (_flushed_memory > 0) {
+        _dirty_mgr.account_potentially_cleaned_up_memory(this, std::min<int64_t>(delta, _flushed_memory));
+    }
 }

 void memtable::remove_flushed_memory(uint64_t delta) {
-    delta = std::min(_flushed_memory, delta);
+    if (_flushed_memory > 0) {
+        _dirty_mgr.revert_potentially_cleaned_up_memory(this, std::min<int64_t>(delta, _flushed_memory));
+    }
    _flushed_memory -= delta;
-    _dirty_mgr.revert_potentially_cleaned_up_memory(this, delta);
 }

 void memtable::on_detach_from_region_group() noexcept {
@@ -540,8 +538,11 @@ void memtable::on_detach_from_region_group() noexcept {
 }

 void memtable::revert_flushed_memory() noexcept {
-    _dirty_mgr.revert_potentially_cleaned_up_memory(this, _flushed_memory);
+    if (_flushed_memory > 0) {
+        _dirty_mgr.revert_potentially_cleaned_up_memory(this, _flushed_memory);
+    }
    _flushed_memory = 0;
+    _total_memory_low_watermark_during_flush = _total_memory;
 }

 class flush_memory_accounter {
@@ -554,7 +555,7 @@ public:
        : _mt(mt)
 	{}
    ~flush_memory_accounter() {
-        SCYLLA_ASSERT(_mt._flushed_memory <= _mt.occupancy().total_space());
+        SCYLLA_ASSERT(_mt._flushed_memory <= static_cast<int64_t>(_mt.occupancy().total_space()));
    }
    uint64_t compute_size(memtable_entry& e, partition_snapshot& snp) {
        return e.size_in_allocator_without_rows(_mt.allocator())
@@ -747,6 +748,7 @@ memtable::make_flat_reader_opt(schema_ptr query_schema,
 mutation_reader
 memtable::make_flush_reader(schema_ptr s, reader_permit permit) {
    if (!_merged_into_cache) {
+        revert_flushed_memory();
        return make_mutation_reader<flush_reader>(std::move(s), std::move(permit), shared_from_this());
    } else {
        auto& full_slice = s->full_slice();
@@ -834,6 +836,10 @@ void memtable::mark_flushed(mutation_source underlying) noexcept {
    _underlying = std::move(underlying);
 }

+bool memtable::is_merging_to_cache() const noexcept {
+    return _merging_into_cache;
+}
+
 bool memtable::is_flushed() const noexcept {
    return bool(_underlying);
 }
@@ -871,3 +877,35 @@ auto fmt::formatter<replica::memtable>::format(replica::memtable& mt,
    logalloc::reclaim_lock rl(mt);
    return fmt::format_to(ctx.out(), "{{memtable: [{}]}}", fmt::join(mt.partitions, ",\n"));
 }
+
+void replica::memtable::increase_usage(logalloc::region* r, ssize_t delta) {
+    SCYLLA_ASSERT(delta >= 0);
+    _dirty_mgr.region_group().increase_usage(r);
+    _dirty_mgr.region_group().update_unspooled(delta);
+    _total_memory += delta;
+}
+
+void replica::memtable::decrease_evictable_usage(logalloc::region* r) {
+    _dirty_mgr.region_group().decrease_usage(r);
+}
+
+void replica::memtable::decrease_usage(logalloc::region* r, ssize_t delta) {
+    SCYLLA_ASSERT(delta <= 0);
+    _dirty_mgr.region_group().decrease_usage(r);
+    _dirty_mgr.region_group().update_unspooled(delta);
+    _total_memory += delta;
+    if (_total_memory < _total_memory_low_watermark_during_flush) {
+        remove_flushed_memory(_total_memory_low_watermark_during_flush - _total_memory);
+        _total_memory_low_watermark_during_flush = _total_memory;
+    }
+}
+
+void replica::memtable::add(logalloc::region* r) {
+    _dirty_mgr.region_group().add(r);
+}
+void replica::memtable::del(logalloc::region* r) {
+    _dirty_mgr.region_group().del(r);
+}
+void replica::memtable::moved(logalloc::region* old_address, logalloc::region* new_address) {
+    _dirty_mgr.region_group().moved(old_address, new_address);
+}
--- a/replica/memtable.hh
+++ b/replica/memtable.hh
@@ -104,7 +104,11 @@ class dirty_memory_manager;
 struct table_stats;

 // Managed by lw_shared_ptr<>.
-class memtable final : public enable_lw_shared_from_this<memtable>, private dirty_memory_manager_logalloc::size_tracked_region {
+class memtable final
+    : public enable_lw_shared_from_this<memtable>
+    , public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>
+    , private dirty_memory_manager_logalloc::size_tracked_region
+    , public logalloc::region_listener {
 public:
    using partitions_type = double_decker<int64_t, memtable_entry,
                            dht::raw_token_less_comparator, dht::ring_position_comparator,
@@ -126,7 +130,25 @@ private:
    // mutation_source is necessary for the combined mutation source to be
    // monotonic. That combined source in this case is cache + memtable.
    mutation_source_opt _underlying;
-    uint64_t _flushed_memory = 0;
+    bool _merging_into_cache = false;
+    // Tracks the difference between the amount of memory "spooled" during the flush
+    // and the memory freed during the flush.
+    //
+    // If positive, this is equal to the difference between the amount of "spooled" memory
+    // registered in dirty_memory_manager with account_potentially_cleaned_up_memory
+    // and unregistered with revert_potentially_cleaned_up_memory.
+    // If negative, the above difference is 0.
+    int64_t _flushed_memory = 0;
+    // Most of the time, this is equal to occupancy().total_space().
+    // But we want to know the current memory usage in our logalloc::region_listener
+    // handlers, and at that point in time occupancy() is undefined. (LSA can choose to
+    // update it before or after the handler). So we track it ourselves, based on the deltas
+    // passed to the handlers.
+    uint64_t _total_memory = 0;
+    // During LSA compaction, _total_memory can fluctuate up and down.
+    // But we are only interested in the maximal total decrease since the beginning of flush.
+    // This tracks the lowest value of _total_memory seen during the flush.
+    uint64_t _total_memory_low_watermark_during_flush = 0;
    bool _merged_into_cache = false;
    replica::table_stats& _table_stats;

@@ -194,7 +216,6 @@ private:
    void add_flushed_memory(uint64_t);
    void remove_flushed_memory(uint64_t);
    void clear() noexcept;
-    uint64_t dirty_size() const;
 public:
    explicit memtable(schema_ptr schema, dirty_memory_manager&,
            memtable_table_shared_data& shared_data,
@@ -304,6 +325,7 @@ public:

    bool empty() const noexcept { return partitions.empty(); }
    void mark_flushed(mutation_source) noexcept;
+    bool is_merging_to_cache() const noexcept;
    bool is_flushed() const noexcept;
    void on_detach_from_region_group() noexcept;
    void revert_flushed_memory() noexcept;
@@ -326,6 +348,14 @@ public:
        return _dirty_mgr;
    }

+    // Implementation of region_listener.
+    virtual void increase_usage(logalloc::region* r, ssize_t delta) override;
+    virtual void decrease_evictable_usage(logalloc::region* r) override;
+    virtual void decrease_usage(logalloc::region* r, ssize_t delta) override;
+    virtual void add(logalloc::region* r) override;
+    virtual void del(logalloc::region* r) override;
+    virtual void moved(logalloc::region* old_address, logalloc::region* new_address) override;
+
    friend fmt::formatter<memtable>;
 };

--- a/replica/mutation_dump.cc
+++ b/replica/mutation_dump.cc
@@ -291,7 +291,12 @@ private:
            auto& cdef = _underlying_schema->column_at(kind, id);
            writer.writer().Key(cdef.name_as_text());
            if (cdef.is_atomic()) {
-                writer.write_atomic_cell_value(cell.as_atomic_cell(cdef), cdef.type);
+                auto acv = cell.as_atomic_cell(cdef);
+                if (acv.is_live()) {
+                    writer.write_atomic_cell_value(acv, cdef.type);
+                } else {
+                    writer.writer().Null();
+                }
            } else if (cdef.type->is_collection() || cdef.type->is_user_type()) {
                cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mv) {
                    writer.write_collection_value(mv, cdef.type);
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -247,7 +247,8 @@ table::make_reader_v2(schema_ptr s,

    const auto bypass_cache = slice.options.contains(query::partition_slice::option::bypass_cache);
    if (cache_enabled() && !bypass_cache) {
-        if (auto reader_opt = _cache.make_reader_opt(s, permit, range, slice, &_compaction_manager.get_tombstone_gc_state(), std::move(trace_state), fwd, fwd_mr)) {
+        if (auto reader_opt = _cache.make_reader_opt(s, permit, range, slice, &_compaction_manager.get_tombstone_gc_state(),
+                    get_max_purgeable_fn_for_cache_underlying_reader(), std::move(trace_state), fwd, fwd_mr)) {
            readers.emplace_back(std::move(*reader_opt));
        }
    } else {
@@ -692,7 +693,7 @@ public:

    future<> update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override { return make_ready_future(); }

-    compaction_group& compaction_group_for_token(dht::token token) const noexcept override {
+    compaction_group& compaction_group_for_token(dht::token token) const override {
        return get_compaction_group();
    }
    utils::chunked_vector<compaction_group*> compaction_groups_for_token_range(dht::token_range tr) const override {
@@ -700,16 +701,16 @@ public:
        ret.push_back(&get_compaction_group());
        return ret;
    }
-    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept override {
+    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const override {
        return get_compaction_group();
    }
-    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept override {
+    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const override {
        return get_compaction_group();
    }
    size_t log2_storage_groups() const override {
        return 0;
    }
-    storage_group& storage_group_for_token(dht::token token) const noexcept override {
+    storage_group& storage_group_for_token(dht::token token) const override {
        return *_single_sg;
    }

@@ -805,27 +806,34 @@ public:
        auto local_replica = locator::tablet_replica{_my_host_id, this_shard_id()};

        for (auto tid : tmap.tablet_ids()) {
-            auto range = tmap.get_token_range(tid);
-
-            if (tmap.has_replica(tid, local_replica)) {
-                tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
-                ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
+            if (!tmap.has_replica(tid, local_replica)) {
+                continue;
            }
+
+            // if the tablet was cleaned up already on this replica, don't allocate a storage group for it.
+            auto trinfo = tmap.get_tablet_transition_info(tid);
+            if (trinfo && locator::is_post_cleanup(local_replica, tmap.get_tablet_info(tid), *trinfo)) {
+                continue;
+            }
+
+            auto range = tmap.get_token_range(tid);
+            tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
+            ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
        }
        _storage_groups = std::move(ret);
    }

    future<> update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;

-    compaction_group& compaction_group_for_token(dht::token token) const noexcept override;
+    compaction_group& compaction_group_for_token(dht::token token) const override;
    utils::chunked_vector<compaction_group*> compaction_groups_for_token_range(dht::token_range tr) const override;
-    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept override;
-    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept override;
+    compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const override;
+    compaction_group& compaction_group_for_sstable(const sstables::shared_sstable& sst) const override;

    size_t log2_storage_groups() const override {
        return log2ceil(tablet_map().tablet_count());
    }
-    storage_group& storage_group_for_token(dht::token token) const noexcept override {
+    storage_group& storage_group_for_token(dht::token token) const override {
        return storage_group_for_id(storage_group_of(token).first);
    }

@@ -952,8 +960,10 @@ bool tablet_storage_group_manager::all_storage_groups_split() {
        return true;
    }

-    auto split_ready = std::ranges::all_of(_storage_groups | boost::adaptors::map_values,
-        std::mem_fn(&storage_group::set_split_mode));
+    bool split_ready = true;
+    for (const storage_group_ptr& sg : _storage_groups | boost::adaptors::map_values) {
+        split_ready &= sg->set_split_mode();
+    }

    // The table replica will say to coordinator that its split status is ready by
    // mirroring the sequence number from tablet metadata into its local state,
@@ -981,6 +991,12 @@ sstables::compaction_type_options::split tablet_storage_group_manager::split_com
 future<> tablet_storage_group_manager::split_all_storage_groups() {
    sstables::compaction_type_options::split opt = split_compaction_options();

+    co_await utils::get_local_injector().inject("split_storage_groups_wait", [] (auto& handler) -> future<> {
+        dblog.info("split_storage_groups_wait: waiting");
+        co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
+        dblog.info("split_storage_groups_wait: done");
+    }, false);
+
    co_await for_each_storage_group_gently([opt] (storage_group& storage_group) {
        return storage_group.split(opt);
    });
@@ -1036,11 +1052,11 @@ std::unique_ptr<storage_group_manager> table::make_storage_group_manager() {
    return ret;
 }

-compaction_group* table::get_compaction_group(size_t id) const noexcept {
+compaction_group* table::get_compaction_group(size_t id) const {
    return storage_group_for_id(id).main_compaction_group().get();
 }

-storage_group& table::storage_group_for_token(dht::token token) const noexcept {
+storage_group& table::storage_group_for_token(dht::token token) const {
    return _sg_manager->storage_group_for_token(token);
 }

@@ -1048,13 +1064,13 @@ storage_group& table::storage_group_for_id(size_t i) const {
    return _sg_manager->storage_group_for_id(_schema, i);
 }

-compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const noexcept {
+compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const {
    auto [idx, range_side] = storage_group_of(token);
    auto& sg = storage_group_for_id(idx);
    return *sg.select_compaction_group(range_side);
 }

-compaction_group& table::compaction_group_for_token(dht::token token) const noexcept {
+compaction_group& table::compaction_group_for_token(dht::token token) const {
    return _sg_manager->compaction_group_for_token(token);
 }

@@ -1085,15 +1101,15 @@ utils::chunked_vector<compaction_group*> table::compaction_groups_for_token_rang
    return _sg_manager->compaction_groups_for_token_range(tr);
 }

-compaction_group& tablet_storage_group_manager::compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept {
+compaction_group& tablet_storage_group_manager::compaction_group_for_key(partition_key_view key, const schema_ptr& s) const {
    return compaction_group_for_token(dht::get_token(*s, key));
 }

-compaction_group& table::compaction_group_for_key(partition_key_view key, const schema_ptr& s) const noexcept {
+compaction_group& table::compaction_group_for_key(partition_key_view key, const schema_ptr& s) const {
    return _sg_manager->compaction_group_for_key(key, s);
 }

-compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept {
+compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
    auto [first_id, first_range_side] = storage_group_of(sst->get_first_decorated_key().token());
    auto [last_id, last_range_side] = storage_group_of(sst->get_last_decorated_key().token());

@@ -1111,7 +1127,7 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
    return *sg.select_compaction_group(first_range_side);
 }

-compaction_group& table::compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept {
+compaction_group& table::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
    return _sg_manager->compaction_group_for_sstable(sst);
 }

@@ -1153,14 +1169,6 @@ const storage_group_map& table::storage_groups() const {
    return _sg_manager->storage_groups();
 }

-future<> table::safe_foreach_sstable(const sstables::sstable_set& set, noncopyable_function<future<>(const sstables::shared_sstable&)> action) {
-    auto deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
-
-    co_await set.for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
-        return action(sst);
-    });
-}
-
 future<utils::chunked_vector<sstables::sstable_files_snapshot>> table::take_storage_snapshot(dht::token_range tr) {
    utils::chunked_vector<sstables::sstable_files_snapshot> ret;

@@ -1173,9 +1181,11 @@ future<utils::chunked_vector<sstables::sstable_files_snapshot>> table::take_stor

        co_await cg->flush();

-        auto set = cg->make_sstable_set();
-
-        co_await safe_foreach_sstable(*set, [&] (const sstables::shared_sstable& sst) -> future<> {
+        // The sstable set must be obtained *after* the deletion lock is taken,
+        // otherwise components of sstables in the set might be unlinked from the filesystem
+        // by compaction while we are waiting for the lock.
+        auto deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
+        co_await cg->make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
           ret.push_back({
               .sst = sst,
               .files = co_await sst->readable_file_for_all_components(),
@@ -1194,8 +1204,11 @@ table::clone_tablet_storage(locator::tablet_id tid) {
    auto& sg = storage_group_for_id(tid.value());
    auto sg_holder = sg.async_gate().hold();
    co_await sg.flush();
-    auto set = sg.make_sstable_set();
-    co_await safe_foreach_sstable(*set, [&] (const sstables::shared_sstable& sst) -> future<> {
+    // The sstable set must be obtained *after* the deletion lock is taken,
+    // otherwise components of sstables in the set might be unlinked from the filesystem
+    // by compaction while we are waiting for the lock.
+    auto deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
+    co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
        ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
    });
    co_return ret;
@@ -1561,6 +1574,17 @@ table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtabl
                co_await with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, old, &newtabs, &cg] {
                    return update_cache(cg, old, newtabs);
                });
+
+                co_await utils::get_local_injector().inject("replica_post_flush_after_update_cache", [this] (auto& handler) -> future<> {
+                    const auto this_table_name = format("{}.{}", _schema->ks_name(), _schema->cf_name());
+                    if (this_table_name == handler.get("table_name")) {
+                        tlogger.info("error injection handler replica_post_flush_after_update_cache: suspending flush for table {}", this_table_name);
+                        handler.set("suspended", true);
+                        co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
+                        tlogger.info("error injection handler replica_post_flush_after_update_cache: resuming flush for table {}", this_table_name);
+                    }
+                });
+
                cg.memtables()->erase(old);
                tlogger.debug("Memtable for {}.{} replaced, into {} sstables", old->schema()->ks_name(), old->schema()->cf_name(), newtabs.size());
                co_return;
@@ -1744,23 +1768,30 @@ void table::subtract_compaction_group_from_stats(const compaction_group& cg) noe
    _stats.live_sstable_count -= cg.live_sstable_count();
 }

-future<lw_shared_ptr<sstables::sstable_set>>
+future<table::sstable_list_builder::result>
 table::sstable_list_builder::build_new_list(const sstables::sstable_set& current_sstables,
                              sstables::sstable_set new_sstable_list,
                              const std::vector<sstables::shared_sstable>& new_sstables,
                              const std::vector<sstables::shared_sstable>& old_sstables) {
    std::unordered_set<sstables::shared_sstable> s(old_sstables.begin(), old_sstables.end());

-    // this might seem dangerous, but "move" here just avoids constness,
-    // making the two ranges compatible when compiling with boost 1.55.
-    // No one is actually moving anything...
-    for (auto all = current_sstables.all(); auto&& tab : boost::range::join(new_sstables, std::move(*all))) {
-        if (!s.contains(tab)) {
+    // add sstables from the current list into the new list except the ones that are in the old list
+    std::vector<sstables::shared_sstable> removed_sstables;
+    co_await current_sstables.for_each_sstable_gently([&s, &removed_sstables, &new_sstable_list] (const sstables::shared_sstable& tab) {
+        if (s.contains(tab)) {
+            removed_sstables.push_back(tab);
+        } else {
            new_sstable_list.insert(tab);
        }
+    });
+
+    // add new sstables into the new list
+    for (auto& tab : new_sstables) {
+        new_sstable_list.insert(tab);
        co_await coroutine::maybe_yield();
    }
-    co_return make_lw_shared<sstables::sstable_set>(std::move(new_sstable_list));
+    co_return table::sstable_list_builder::result {
+        make_lw_shared<sstables::sstable_set>(std::move(new_sstable_list)), std::move(removed_sstables)};
 }

 future<>
@@ -1788,54 +1819,7 @@ compaction_group::delete_unused_sstables(sstables::compaction_completion_desc de
 }

 future<>
-compaction_group::update_sstable_lists_on_off_strategy_completion(sstables::compaction_completion_desc desc) {
-    class sstable_lists_updater : public row_cache::external_updater_impl {
-        using sstables_t = std::vector<sstables::shared_sstable>;
-        table& _t;
-        compaction_group& _cg;
-        table::sstable_list_builder _builder;
-        const sstables_t& _old_maintenance;
-        const sstables_t& _new_main;
-        lw_shared_ptr<sstables::sstable_set> _new_maintenance_list;
-        lw_shared_ptr<sstables::sstable_set> _new_main_list;
-    public:
-        explicit sstable_lists_updater(compaction_group& cg, table::sstable_list_builder::permit_t permit, const sstables_t& old_maintenance, const sstables_t& new_main)
-                : _t(cg._t), _cg(cg), _builder(std::move(permit)), _old_maintenance(old_maintenance), _new_main(new_main) {
-        }
-        virtual future<> prepare() override {
-            sstables_t empty;
-            // adding new sstables, created by off-strategy operation, to main set
-            _new_main_list = co_await _builder.build_new_list(*_cg.main_sstables(), _t._compaction_strategy.make_sstable_set(_t._schema), _new_main, empty);
-            // removing old sstables, used as input by off-strategy, from the maintenance set
-            _new_maintenance_list = co_await _builder.build_new_list(*_cg.maintenance_sstables(), std::move(*_t.make_maintenance_sstable_set()), empty, _old_maintenance);
-        }
-        virtual void execute() override {
-            _cg.set_main_sstables(std::move(_new_main_list));
-            _cg.set_maintenance_sstables(std::move(_new_maintenance_list));
-            // FIXME: the following is not exception safe
-            _t.refresh_compound_sstable_set();
-            // Input sstables aren't not removed from backlog tracker because they come from the maintenance set.
-            _cg.backlog_tracker_adjust_charges({}, _new_main);
-        }
-        static std::unique_ptr<row_cache::external_updater_impl> make(compaction_group& cg, table::sstable_list_builder::permit_t permit, const sstables_t& old_maintenance, const sstables_t& new_main) {
-            return std::make_unique<sstable_lists_updater>(cg, std::move(permit), old_maintenance, new_main);
-        }
-    };
-    auto permit = co_await seastar::get_units(_t._sstable_set_mutation_sem, 1);
-    auto updater = row_cache::external_updater(sstable_lists_updater::make(*this, std::move(permit), desc.old_sstables, desc.new_sstables));
-
-    // row_cache::invalidate() is only used to synchronize sstable list updates, to prevent race conditions from occurring,
-    // meaning nothing is actually invalidated.
-    dht::partition_range_vector empty_ranges = {};
-    co_await _t.get_row_cache().invalidate(std::move(updater), std::move(empty_ranges));
-    _t.get_row_cache().refresh_snapshot();
-    _t.rebuild_statistics();
-
-    co_await delete_unused_sstables(std::move(desc));
-}
-
-future<>
-compaction_group::update_main_sstable_list_on_compaction_completion(sstables::compaction_completion_desc desc) {
+compaction_group::update_sstable_sets_on_compaction_completion(sstables::compaction_completion_desc desc) {
    // Build a new list of _sstables: We remove from the existing list the
    // tables we compacted (by now, there might be more sstables flushed
    // later), and we add the new tables generated by the compaction.
@@ -1884,7 +1868,8 @@ compaction_group::update_main_sstable_list_on_compaction_completion(sstables::co
        const sstables::compaction_completion_desc& _desc;
        struct replacement_desc {
            sstables::compaction_completion_desc desc;
-            lw_shared_ptr<sstables::sstable_set> new_sstables;
+            table::sstable_list_builder::result main_sstable_set_builder_result;
+            std::optional<lw_shared_ptr<sstables::sstable_set>> new_maintenance_sstables;
        };
        std::unordered_map<compaction_group*, replacement_desc> _cg_desc;
    public:
@@ -1900,18 +1885,34 @@ compaction_group::update_main_sstable_list_on_compaction_completion(sstables::co
            // The group that triggered compaction is the only one to have sstables removed from it.
            _cg_desc[&_cg].desc.old_sstables = _desc.old_sstables;
            for (auto& [cg, d] : _cg_desc) {
-                d.new_sstables = co_await _builder.build_new_list(*cg->main_sstables(), _t._compaction_strategy.make_sstable_set(_t._schema),
+                d.main_sstable_set_builder_result = co_await _builder.build_new_list(*cg->main_sstables(), _t._compaction_strategy.make_sstable_set(_t._schema),
                                                                  d.desc.new_sstables, d.desc.old_sstables);
+
+                if (!d.desc.old_sstables.empty()
+                        && d.main_sstable_set_builder_result.removed_sstables.size() != d.desc.old_sstables.size()) {
+                    // Not all old_sstables were removed from the main sstable set, which implies that
+                    // they don't exist there. This can happen if the input sstables were picked up from
+                    // the maintenance set during an offstrategy or scrub compaction. So, remove the old
+                    // sstables from the maintenance set. No need to add any new sstables to the maintenance
+                    // set though, as they are always added to the main set.
+                    auto builder_result = co_await _builder.build_new_list(
+                            *cg->maintenance_sstables(), std::move(*_t.make_maintenance_sstable_set()), {}, d.desc.old_sstables);
+                    d.new_maintenance_sstables = std::move(builder_result.new_sstable_set);
+                }
            }
        }
        virtual void execute() override {
            for (auto&& [cg, d] : _cg_desc) {
-                cg->set_main_sstables(std::move(d.new_sstables));
+                cg->set_main_sstables(std::move(d.main_sstable_set_builder_result.new_sstable_set));
+                if (d.new_maintenance_sstables) {
+                    // offstrategy or scrub compaction - replace the maintenance set
+                    cg->set_maintenance_sstables(std::move(d.new_maintenance_sstables.value()));
+                }
            }
            // FIXME: the following is not exception safe
            _t.refresh_compound_sstable_set();
            for (auto& [cg, d] : _cg_desc) {
-                cg->backlog_tracker_adjust_charges(d.desc.old_sstables, d.desc.new_sstables);
+                cg->backlog_tracker_adjust_charges(d.main_sstable_set_builder_result.removed_sstables, d.desc.new_sstables);
            }
        }
        static std::unique_ptr<row_cache::external_updater_impl> make(compaction_group& cg, table::sstable_list_builder::permit_t permit, sstables::compaction_completion_desc& d) {
@@ -2245,12 +2246,10 @@ public:
        return _cg.memtable_has_key(key);
    }
    future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) override {
+        co_await _cg.update_sstable_sets_on_compaction_completion(std::move(desc)); // single update path, now used for both regular and off-strategy completions
        if (offstrategy) {
-            co_await _cg.update_sstable_lists_on_off_strategy_completion(std::move(desc));
            _cg.trigger_compaction();
-            co_return;
        }
-        co_await _cg.update_main_sstable_list_on_compaction_completion(std::move(desc));
    }
    bool is_auto_compaction_disabled_by_user() const noexcept override {
        return _t.is_auto_compaction_disabled_by_user();
@@ -2441,7 +2440,7 @@ future<> tablet_storage_group_manager::update_effective_replication_map(const lo
        co_return;
    }

-    // Allocate storage group if tablet is migrating in.
+    // Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
    auto this_replica = locator::tablet_replica{
        .host = erm.get_token_metadata().get_my_id(),
        .shard = this_shard_id()
@@ -2457,6 +2456,8 @@ future<> tablet_storage_group_manager::update_effective_replication_map(const lo
            auto range = new_tablet_map->get_token_range(tid);
            _storage_groups[tid.value()] = allocate_storage_group(*new_tablet_map, tid, std::move(range));
            tablet_migrating_in = true;
+        } else if (_storage_groups.contains(tid.value()) && locator::is_post_cleanup(this_replica, new_tablet_map->get_tablet_info(tid), transition_info)) {
+            remove_storage_group(tid.value());
        }
    }

@@ -2536,20 +2537,7 @@ max_purgeable_fn table::get_max_purgeable_fn_for_cache_underlying_reader() const
        auto max_purgeable_timestamp = api::max_timestamp;

        sg.for_each_compaction_group([&dk, is_shadowable, &max_purgeable_timestamp] (const compaction_group_ptr& cg) {
-            const auto& mt = cg->memtables()->active_memtable();
-            // see get_max_purgeable_timestamp() in compaction.cc for comments on choosing min timestamp
-            api::timestamp_type memtable_min_timestamp = is_shadowable ? mt.get_min_live_row_marker_timestamp() : mt.get_min_live_timestamp();
-            if (memtable_min_timestamp > cg->max_seen_timestamp()) {
-                // All the entries in the memtable are newer than the entries in the
-                // SSTable within this compaction group. So, no need to check further.
-                return;
-            }
-
-            // If a memtable with a minimum timestamp lower than the current maximum
-            // purgeable timestamp has the given key, the tombstone should not be purged.
-            if (memtable_min_timestamp < max_purgeable_timestamp && mt.contains_partition(dk)) {
-                max_purgeable_timestamp = memtable_min_timestamp;
-            }
+            max_purgeable_timestamp = std::min(cg->memtables()->min_live_timestamp(dk, is_shadowable, cg->max_seen_timestamp()), max_purgeable_timestamp);
        });

        return max_purgeable_timestamp;
@@ -2572,7 +2560,7 @@ table::sstables_as_snapshot_source() {
                std::move(reader),
                gc_clock::now(),
                get_max_purgeable_fn_for_cache_underlying_reader(),
-                _compaction_manager.get_tombstone_gc_state(),
+                _compaction_manager.get_tombstone_gc_state().with_commitlog_check_disabled(),
                fwd);
        }, [this, sst_set] {
            return make_partition_presence_checker(sst_set);
@@ -2983,8 +2971,15 @@ void table::set_schema(schema_ptr s) {
    _schema = std::move(s);

    for (auto&& v : _views) {
-        v->view_info()->set_base_info(
-            v->view_info()->make_base_dependent_view_info(*_schema));
+        auto base_info = v->view_info()->make_base_dependent_view_info(*_schema);
+        v->view_info()->set_base_info(base_info);
+        if (v->registry_entry()) {
+            v->registry_entry()->update_base_schema(_schema);
+        }
+        if (auto reverse_schema = local_schema_registry().get_or_null(reversed(v->version()))) {
+            reverse_schema->view_info()->set_base_info(base_info);
+            reverse_schema->registry_entry()->update_base_schema(_schema);
+        }
    }

    set_compaction_strategy(_schema->compaction_strategy());
@@ -2998,8 +2993,6 @@ static std::vector<view_ptr>::iterator find_view(std::vector<view_ptr>& views, c
 }

 void table::add_or_update_view(view_ptr v) {
-    v->view_info()->set_base_info(
-        v->view_info()->make_base_dependent_view_info(*_schema));
    auto existing = find_view(_views, v);
    if (existing != _views.end()) {
        *existing = std::move(v);
@@ -3174,10 +3167,6 @@ db::replay_position table::set_low_replay_position_mark() {

 template<typename... Args>
 void table::do_apply(compaction_group& cg, db::rp_handle&& h, Args&&... args) {
-    if (cg.async_gate().is_closed()) [[unlikely]] {
-        on_internal_error(tlogger, "async_gate of table's compaction group is closed");
-    }
-
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
    db::replay_position rp = h;
@@ -3823,7 +3812,6 @@ future<> table::cleanup_tablet(database& db, db::system_keyspace& sys_ks, locato
    co_await stop_compaction_groups(sg);
    co_await utils::get_local_injector().inject("delay_tablet_compaction_groups_cleanup", std::chrono::seconds(5));
    co_await cleanup_compaction_groups(db, sys_ks, tid, sg);
-    _sg_manager->remove_storage_group(tid.value());
 }

 future<> table::cleanup_tablet_without_deallocation(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid) {
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -774,12 +774,13 @@ row_cache::make_reader_opt(schema_ptr s,
                       const dht::partition_range& range,
                       const query::partition_slice& slice,
                       const tombstone_gc_state* gc_state,
+                       max_purgeable_fn get_max_purgeable,
                       tracing::trace_state_ptr trace_state,
                       streamed_mutation::forwarding fwd,
                       mutation_reader::forwarding fwd_mr)
 {
    auto make_context = [&] {
-        return std::make_unique<read_context>(*this, s, permit, range, slice, gc_state, trace_state, fwd_mr);
+        return std::make_unique<read_context>(*this, s, permit, range, slice, gc_state, get_max_purgeable, trace_state, fwd_mr);
    };

    if (query::is_single_partition(range) && !fwd_mr) {
@@ -1108,6 +1109,7 @@ future<> row_cache::do_update(external_updater eu, replica::memtable& m, Updater
 }

 future<> row_cache::update(external_updater eu, replica::memtable& m, preemption_source& preempt_src) {
+    m._merging_into_cache = true;
    return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
            row_cache::partitions_type::iterator cache_i, replica::memtable_entry& mem_e, partition_presence_checker& is_present,
            real_dirty_memory_accounter& acc, const partitions_type::bound_hint& hint, preemption_source& preempt_src) mutable {
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -24,6 +24,7 @@
 #include "db/cache_tracker.hh"
 #include "readers/empty_v2.hh"
 #include "readers/mutation_source.hh"
+#include "compaction/compaction_garbage_collector.hh"

 namespace bi = boost::intrusive;

@@ -376,8 +377,9 @@ public:
                                     tracing::trace_state_ptr trace_state = nullptr,
                                     streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
                                     mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no,
-                                     const tombstone_gc_state* gc_state = nullptr) {
-        if (auto reader_opt = make_reader_opt(s, permit, range, slice, gc_state, std::move(trace_state), fwd, fwd_mr)) {
+                                     const tombstone_gc_state* gc_state = nullptr,
+                                     max_purgeable_fn get_max_purgeable = can_never_purge) {
+        if (auto reader_opt = make_reader_opt(s, permit, range, slice, gc_state, std::move(get_max_purgeable), std::move(trace_state), fwd, fwd_mr)) {
            return std::move(*reader_opt);
        }
        [[unlikely]] return make_empty_flat_reader_v2(std::move(s), std::move(permit));
@@ -389,6 +391,7 @@ public:
                                     const dht::partition_range&,
                                     const query::partition_slice&,
                                     const tombstone_gc_state*,
+                                     max_purgeable_fn get_max_purgeable,
                                     tracing::trace_state_ptr trace_state = nullptr,
                                     streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
                                     mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
@@ -396,10 +399,11 @@ public:
    mutation_reader make_reader(schema_ptr s,
                                    reader_permit permit,
                                    const dht::partition_range& range = query::full_partition_range,
-                                    const tombstone_gc_state* gc_state = nullptr) {
+                                    const tombstone_gc_state* gc_state = nullptr,
+                                    max_purgeable_fn get_max_purgeable = can_never_purge) {
        auto& full_slice = s->full_slice();
        return make_reader(std::move(s), std::move(permit), range, full_slice, nullptr,
-                streamed_mutation::forwarding::no, mutation_reader::forwarding::no, gc_state);
+                streamed_mutation::forwarding::no, mutation_reader::forwarding::no, gc_state, std::move(get_max_purgeable));
    }

    // Only reads what is in the cache, doesn't populate.
--- a/schema/schema.cc
+++ b/schema/schema.cc
@@ -1837,8 +1837,16 @@ schema_ptr schema::make_reversed() const {
 }

 schema_ptr schema::get_reversed() const {
-    return local_schema_registry().get_or_load(reversed(_raw._version), [this] (table_schema_version) {
-        return frozen_schema(make_reversed());
+    return local_schema_registry().get_or_load(reversed(_raw._version), [this] (table_schema_version) -> base_and_view_schemas { // loader: builds the reversed schema and, for views, pairs it with the base schema
+        auto s = make_reversed();
+
+        if (s->is_view()) { // a view must hand its base schema to the registry together with the frozen schema
+            if (!s->view_info()->base_info()) { // base info was never set on this view: reported through on_internal_error
+                on_internal_error(dblog, format("Tried to make a reverse schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
+            }
+            return {frozen_schema(s), s->view_info()->base_info()->base_schema()};
+        }
+        return {frozen_schema(s)}; // not a view: base_schema is left disengaged
    });
 }

--- a/schema/schema_registry.cc
+++ b/schema/schema_registry.cc
@@ -169,8 +169,11 @@ void schema_registry::clear() {
    _entries.clear();
 }

-schema_ptr schema_registry_entry::load(frozen_schema fs) {
-    _frozen_schema = std::move(fs);
+schema_ptr schema_registry_entry::load(base_and_view_schemas fs) {
+    _frozen_schema = std::move(fs.schema);
+    if (fs.base_schema) {
+        _base_schema = std::move(fs.base_schema);
+    }
    auto s = get_schema();
    if (_state == state::LOADING) {
        _schema_promise.set_value(s);
@@ -183,6 +186,9 @@ schema_ptr schema_registry_entry::load(frozen_schema fs) {

 schema_ptr schema_registry_entry::load(schema_ptr s) {
    _frozen_schema = frozen_schema(s);
+    if (s->is_view()) {
+        _base_schema = s->view_info()->base_info()->base_schema();
+    }
    _schema = &*s;
    _schema->_registry_entry = this;
    _erase_timer.cancel();
@@ -202,7 +208,7 @@ future<schema_ptr> schema_registry_entry::start_loading(async_schema_loader load
    _state = state::LOADING;
    slogger.trace("Loading {}", _version);
    // Move to background.
-    (void)f.then_wrapped([self = shared_from_this(), this] (future<frozen_schema>&& f) {
+    (void)f.then_wrapped([self = shared_from_this(), this] (future<base_and_view_schemas>&& f) {
        _loader = {};
        if (_state != state::LOADING) {
            slogger.trace("Loading of {} aborted", _version);
@@ -231,6 +237,10 @@ schema_ptr schema_registry_entry::get_schema() {
        if (s->version() != _version) {
            throw std::runtime_error(format("Unfrozen schema version doesn't match entry version ({}): {}", _version, *s));
        }
+        if (s->is_view()) {
+            // We may encounter a no_such_column_family here, which means that the base table was deleted and we should fail the request
+            s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(**_base_schema));
+        }
        _erase_timer.cancel();
        s->_registry_entry = this;
        _schema = &*s;
@@ -251,6 +261,10 @@ frozen_schema schema_registry_entry::frozen() const {
    return *_frozen_schema;
 }

+void schema_registry_entry::update_base_schema(schema_ptr s) { // replaces the base schema kept by this entry (used for view schemas)
+    _base_schema = std::move(s); // s is a by-value sink parameter, so move it into place instead of copying the pointer
+}
+
 future<> schema_registry_entry::maybe_sync(std::function<future<>()> syncer) {
    switch (_sync_state) {
        case schema_registry_entry::sync_state::SYNCED:
@@ -324,17 +338,17 @@ schema_ptr global_schema_ptr::get() const {
    if (this_shard_id() == _cpu_of_origin) {
        return _ptr;
    } else {
-        auto registered_schema = [](const schema_registry_entry& e) {
+        auto registered_schema = [](const schema_registry_entry& e, std::optional<schema_ptr> base_schema = std::nullopt) -> schema_ptr {
            schema_ptr ret = local_schema_registry().get_or_null(e.version());
            if (!ret) {
-                ret = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
-                    return e.frozen();
+                ret = local_schema_registry().get_or_load(e.version(), [&e, &base_schema](table_schema_version) -> base_and_view_schemas {
+                    return {e.frozen(), base_schema};
                });
            }
            return ret;
        };

-        schema_ptr registered_bs;
+        std::optional<schema_ptr> registered_bs;
        // the following code contains registry entry dereference of a foreign shard
        // however, it is guaranteed to succeed since we made sure in the constructor
        // that _bs_schema and _ptr will have a registry on the foreign shard where this
@@ -343,16 +357,10 @@ schema_ptr global_schema_ptr::get() const {
        if (_base_schema) {
            registered_bs = registered_schema(*_base_schema->registry_entry());
            if (_base_schema->registry_entry()->is_synced()) {
-                registered_bs->registry_entry()->mark_synced();
-            }
-        }
-        schema_ptr s = registered_schema(*_ptr->registry_entry());
-        if (s->is_view()) {
-            if (!s->view_info()->base_info()) {
-                // we know that registered_bs is valid here because we make sure of it in the constructors.
-                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*registered_bs));
+                registered_bs.value()->registry_entry()->mark_synced();
            }
        }
+        schema_ptr s = registered_schema(*_ptr->registry_entry(), registered_bs);
        if (_ptr->registry_entry()->is_synced()) {
            s->registry_entry()->mark_synced();
        }
@@ -369,25 +377,22 @@ global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
        if (e) {
            return s;
        } else {
-            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) {
-                return frozen_schema(s);
+            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> base_and_view_schemas {
+                if (s->is_view()) {
+                    if (!s->view_info()->base_info()) {
+                        on_internal_error(slogger, format("Tried to build a global schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
+                    }
+                    return {frozen_schema(s), s->view_info()->base_info()->base_schema()};
+                } else {
+                    return {frozen_schema(s)};
+                }
            });
        }
    };

    schema_ptr s = ensure_registry_entry(ptr);
    if (s->is_view()) {
-        if (s->view_info()->base_info()) {
-            _base_schema = ensure_registry_entry(s->view_info()->base_info()->base_schema());
-        } else if (ptr->view_info()->base_info()) {
-            _base_schema = ensure_registry_entry(ptr->view_info()->base_info()->base_schema());
-        } else {
-            on_internal_error(slogger, format("Tried to build a global schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
-        }
-
-        if (!s->view_info()->base_info() || !s->view_info()->base_info()->base_schema()->registry_entry()) {
-            s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*_base_schema));
-        }
+        _base_schema = ensure_registry_entry(s->view_info()->base_info()->base_schema());
    }
    _ptr = s;
 }
--- a/schema/schema_registry.hh
+++ b/schema/schema_registry.hh
@@ -22,8 +22,13 @@ class schema_ctxt;

 class schema_registry;

-using async_schema_loader = std::function<future<frozen_schema>(table_schema_version)>;
-using schema_loader = std::function<frozen_schema(table_schema_version)>;
+struct base_and_view_schemas {
+    frozen_schema schema; // frozen form of the schema being loaded into the registry
+    std::optional<schema_ptr> base_schema; // base table schema; loaders are expected to supply it when `schema` is a view
+};
+
+using async_schema_loader = std::function<future<base_and_view_schemas>(table_schema_version)>;
+using schema_loader = std::function<base_and_view_schemas(table_schema_version)>;

 class schema_version_not_found : public std::runtime_error {
 public:
@@ -61,6 +66,8 @@ class schema_registry_entry : public enable_lw_shared_from_this<schema_registry_
    shared_promise<schema_ptr> _schema_promise; // valid when state == LOADING

    std::optional<frozen_schema> _frozen_schema; // engaged when state == LOADED
+    std::optional<schema_ptr> _base_schema;// engaged when state == LOADED for view schemas
+
    // valid when state == LOADED
    // This is != nullptr when there is an alive schema_ptr associated with this entry.
    const ::schema* _schema = nullptr;
@@ -77,7 +84,7 @@ public:
    schema_registry_entry(schema_registry_entry&&) = delete;
    schema_registry_entry(const schema_registry_entry&) = delete;
    ~schema_registry_entry();
-    schema_ptr load(frozen_schema);
+    schema_ptr load(base_and_view_schemas);
    schema_ptr load(schema_ptr);
    future<schema_ptr> start_loading(async_schema_loader);
    schema_ptr get_schema(); // call only when state >= LOADED
@@ -87,6 +94,9 @@ public:
    future<> maybe_sync(std::function<future<>()> sync);
    // Marks this schema version as synced. Syncing cannot be in progress.
    void mark_synced();
+    // Updates the base schema stored for a view; should be called whenever the view's base info is updated.
+    // Not needed when the base info is set for the first time, since that means this schema is not in the registry.
+    void update_base_schema(schema_ptr);
    // Can be called from other shards
    frozen_schema frozen() const;
    // Can be called from other shards
@@ -108,6 +118,7 @@ public:
 // alive the registry will keep its entry. To ensure remote nodes can query current node
 // for schema version, make sure that schema_ptr for the request is alive around the call.
 //
+// Schemas of views returned by this registry always have base_info set.
 class schema_registry {
    std::unordered_map<table_schema_version, lw_shared_ptr<schema_registry_entry>> _entries;
    std::unique_ptr<db::schema_ctxt> _ctxt;
@@ -125,6 +136,7 @@ public:
    void init(const db::schema_ctxt&);

    // Looks up schema by version or loads it using supplied loader.
+    // If the schema refers to a view, the loader must return both view and base schemas.
    schema_ptr get_or_load(table_schema_version, const schema_loader&);

    // Looks up schema by version or returns an empty pointer if not available.
@@ -134,6 +146,7 @@ public:
    // deferring. The loader is copied must be alive only until this method
    // returns. If the loader fails, the future resolves with
    // schema_version_loading_failed.
+    // If the schema refers to a view, the loader must return both view and base schemas.
    future<schema_ptr> get_or_load(table_schema_version, const async_schema_loader&);

    // Looks up schema version. Throws schema_version_not_found when not found
@@ -149,6 +162,7 @@ public:
    // the schema which was passed as argument.
    // The schema instance pointed to by the argument will be attached to the registry
    // entry and will keep it alive.
+    // If the schema refers to a view, it must have base_info set.
    schema_ptr learn(const schema_ptr&);

    // Removes all entries from the registry. This in turn removes all dependencies
--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -5012,8 +5012,10 @@ class scylla_small_objects(gdb.Command):
    [2019] 0x635002ecbc60
    """
    class small_object_iterator():
-        def __init__(self, small_pool, resolve_symbols):
-            self._small_pool = small_pool
+        def __init__(self, small_pools, resolve_symbols):
+            self._small_pools = small_pools
+            self._small_pool_addresses = [small_pool.address for small_pool in small_pools]
+            self._object_size = int(small_pools[0]['_object_size'])
            self._resolve_symbols = resolve_symbols

            self._text_ranges = get_text_ranges()
@@ -5023,8 +5025,9 @@ class scylla_small_objects(gdb.Command):
            self._free_in_pool = set()
            self._free_in_span = set()

-            pool_next_free = self._small_pool['_free']
-            while pool_next_free:
+            for small_pool in self._small_pools:
+              pool_next_free = small_pool['_free']
+              while pool_next_free:
                self._free_in_pool.add(int(pool_next_free))
                pool_next_free = pool_next_free.reinterpret_cast(self._free_object_ptr).dereference()

@@ -5035,7 +5038,7 @@ class scylla_small_objects(gdb.Command):
            # Let any StopIteration bubble up, as it signals we are done with
            # all spans.
            span = next(self._span_it)
-            while span.pool() != self._small_pool.address:
+            while span.pool() not in self._small_pool_addresses:
                span = next(self._span_it)

            self._free_in_span = set()
@@ -5058,7 +5061,7 @@ class scylla_small_objects(gdb.Command):
                pass

            span_start, span_end = self._next_span()
-            self._obj_it = iter(range(span_start, span_end, int(self._small_pool['_object_size'])))
+            self._obj_it = iter(range(span_start, span_end, int(self._object_size)))
            return next(self._obj_it)

        def __next__(self):
@@ -5094,16 +5097,14 @@ class scylla_small_objects(gdb.Command):
        return [int(small_pools['_u']['a'][i]['_object_size']) for i in range(nr)]

    @staticmethod
-    def find_small_pool(object_size):
+    def find_small_pools(object_size):  # returns a list of all small pools whose _object_size equals object_size (empty list if none)
        cpu_mem = gdb.parse_and_eval('\'seastar::memory::cpu_mem\'')
        small_pools = cpu_mem['small_pools']
+        small_pools_a = small_pools['_u']['a']  # the pool array, looked up once and reused by the comprehension below
        nr = int(small_pools['nr_small_pools'])
-        for i in range(nr):
-            sp = small_pools['_u']['a'][i]
-            if object_size == int(sp['_object_size']):
-                return sp
-
-        return None
+        return [small_pools_a[i]  # collect every match instead of stopping at the first one
+                for i in range(nr)
+                if int(small_pools_a[i]['_object_size']) == object_size]

    def init_parser(self):
        parser = argparse.ArgumentParser(description="scylla small-objects")
@@ -5120,10 +5121,10 @@ class scylla_small_objects(gdb.Command):

        self._parser = parser

-    def get_objects(self, small_pool, offset=0, count=0, resolve_symbols=False, verbose=False):
-        if self._last_object_size != int(small_pool['_object_size']) or offset < self._last_pos:
+    def get_objects(self, small_pools, offset=0, count=0, resolve_symbols=False, verbose=False):
+        if self._last_object_size != int(small_pools[0]['_object_size']) or offset < self._last_pos:
            self._last_pos = 0
-            self._iterator = scylla_small_objects.small_object_iterator(small_pool, resolve_symbols)
+            self._iterator = scylla_small_objects.small_object_iterator(small_pools, resolve_symbols)

        skip = offset - self._last_pos
        if verbose:
@@ -5153,15 +5154,15 @@ class scylla_small_objects(gdb.Command):
        except SystemExit:
            return

-        small_pool = scylla_small_objects.find_small_pool(args.object_size)
-        if small_pool is None:
+        small_pools = scylla_small_objects.find_small_pools(args.object_size)
+        if not small_pools:
            raise ValueError("{} is not a valid object size for any small pools, valid object sizes are: {}", scylla_small_objects.get_object_sizes())

        if args.summarize:
            if self._last_object_size != args.object_size:
                if args.verbose:
                    gdb.write("Object size changed ({} -> {}), scanning pool.\n".format(self._last_object_size, args.object_size))
-                self._num_objects = len(self.get_objects(small_pool, verbose=args.verbose))
+                self._num_objects = len(self.get_objects(small_pools, verbose=args.verbose))
                self._last_object_size = args.object_size
            gdb.write("number of objects: {}\n"
                      "page size        : {}\n"
@@ -5176,7 +5177,7 @@ class scylla_small_objects(gdb.Command):
            if self._last_object_size != args.object_size:
                if args.verbose:
                    gdb.write("Object size changed ({} -> {}), scanning pool.\n".format(self._last_object_size, args.object_size))
-                self._num_objects = len(self.get_objects(small_pool, verbose=args.verbose))
+                self._num_objects = len(self.get_objects(small_pools, verbose=args.verbose))
                self._last_object_size = args.object_size
            page = random.randint(0, int(self._num_objects / args.page_size) - 1)
        else:
@@ -5184,7 +5185,7 @@ class scylla_small_objects(gdb.Command):

        offset = page * args.page_size
        gdb.write("page {}: {}-{}\n".format(page, offset, offset + args.page_size - 1))
-        for i, (obj, sym) in enumerate(self.get_objects(small_pool, offset, args.page_size, resolve_symbols=True, verbose=args.verbose)):
+        for i, (obj, sym) in enumerate(self.get_objects(small_pools, offset, args.page_size, resolve_symbols=True, verbose=args.verbose)):
            if sym is None:
                sym_text = ""
            else:
--- a/2
+++ b/2
--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -243,27 +243,33 @@ future<> service::client_state::maybe_update_per_service_level_params() {
        if (!slo_opt) {
            co_return;
        }
-        auto slo_timeout_or = [&] (const lowres_clock::duration& default_timeout) {
-            return std::visit(overloaded_functor{
-                [&] (const qos::service_level_options::unset_marker&) -> lowres_clock::duration {
-                    return default_timeout;
-                },
-                [&] (const qos::service_level_options::delete_marker&) -> lowres_clock::duration {
-                    return default_timeout;
-                },
-                [&] (const lowres_clock::duration& d) -> lowres_clock::duration {
-                    return d;
-                },
-            }, slo_opt->timeout);
-        };
-        _timeout_config.read_timeout = slo_timeout_or(_default_timeout_config.read_timeout);
-        _timeout_config.write_timeout = slo_timeout_or(_default_timeout_config.write_timeout);
-        _timeout_config.range_read_timeout = slo_timeout_or(_default_timeout_config.range_read_timeout);
-        _timeout_config.counter_write_timeout = slo_timeout_or(_default_timeout_config.counter_write_timeout);
-        _timeout_config.truncate_timeout = slo_timeout_or(_default_timeout_config.truncate_timeout);
-        _timeout_config.cas_timeout = slo_timeout_or(_default_timeout_config.cas_timeout);
-        _timeout_config.other_timeout = slo_timeout_or(_default_timeout_config.other_timeout);
-
-        _workload_type = slo_opt->workload;
+        
+        update_per_service_level_params(*slo_opt);
    }
 }
+
+void service::client_state::update_per_service_level_params(qos::service_level_options& slo) {
+    auto slo_timeout_or = [&] (const lowres_clock::duration& default_timeout) {
+        return std::visit(overloaded_functor{
+            [&] (const qos::service_level_options::unset_marker&) -> lowres_clock::duration {
+                return default_timeout;
+            },
+            [&] (const qos::service_level_options::delete_marker&) -> lowres_clock::duration {
+                return default_timeout;
+            },
+            [&] (const lowres_clock::duration& d) -> lowres_clock::duration {
+                return d;
+            },
+        }, slo.timeout);
+    };
+
+    _timeout_config.read_timeout = slo_timeout_or(_default_timeout_config.read_timeout);
+    _timeout_config.write_timeout = slo_timeout_or(_default_timeout_config.write_timeout);
+    _timeout_config.range_read_timeout = slo_timeout_or(_default_timeout_config.range_read_timeout);
+    _timeout_config.counter_write_timeout = slo_timeout_or(_default_timeout_config.counter_write_timeout);
+    _timeout_config.truncate_timeout = slo_timeout_or(_default_timeout_config.truncate_timeout);
+    _timeout_config.cas_timeout = slo_timeout_or(_default_timeout_config.cas_timeout);
+    _timeout_config.other_timeout = slo_timeout_or(_default_timeout_config.other_timeout);
+
+    _workload_type = slo.workload;
+}
--- a/service/client_state.hh
+++ b/service/client_state.hh
@@ -346,6 +346,7 @@ public:
    future<bool> check_has_permission(auth::command_desc) const;
    future<> ensure_has_permission(auth::command_desc) const;
    future<> maybe_update_per_service_level_params();
+    void update_per_service_level_params(qos::service_level_options& slo);

    /**
     * Returns an exceptional future with \ref exceptions::invalid_request_exception if the resource does not exist.
--- a/service/mapreduce_service.cc
+++ b/service/mapreduce_service.cc
@@ -577,7 +577,7 @@ future<query::mapreduce_result> mapreduce_service::dispatch(query::mapreduce_req
    co_await coroutine::parallel_for_each(vnodes_per_addr.begin(), vnodes_per_addr.end(),
            [&] (std::pair<const netw::messaging_service::msg_addr, dht::partition_range_vector>& vnodes_with_addr) -> future<> {
        netw::messaging_service::msg_addr addr = vnodes_with_addr.first;
-        query::mapreduce_result& result_ = result;
+        query::mapreduce_result& shared_accumulator = result;
        tracing::trace_state_ptr& tr_state_ = tr_state;
        retrying_dispatcher& dispatcher_ = dispatcher;

@@ -598,9 +598,21 @@ future<query::mapreduce_result> mapreduce_service::dispatch(query::mapreduce_req
        flogger.debug("received mapreduce_result={} from {}", partial_printer, addr);

        auto aggrs = mapreduce_aggregates(req);
-        co_return co_await aggrs.with_thread_if_needed([&result_, &aggrs, partial_result = std::move(partial_result)] () mutable {
-            aggrs.merge(result_, std::move(partial_result));
-        });
+
+        // Anytime this coroutine yields, other coroutines may want to write to `shared_accumulator`.
+        // As merging can yield internally, merging directly to `shared_accumulator` would result in race condition.
+        // We can safely write to `shared_accumulator` only when it is empty.
+        while (!shared_accumulator.query_results.empty()) {
+            // Move `shared_accumulator` content to local variable. Leave `shared_accumulator` empty - now other coroutines can safely write to it.
+            query::mapreduce_result previous_results = std::exchange(shared_accumulator, {});
+            // Merge two local variables - it can yield.
+            co_await aggrs.with_thread_if_needed([&previous_results, &aggrs, &partial_result] () mutable {
+                aggrs.merge(partial_result, std::move(previous_results));
+            });
+            // `partial_result` now contains results merged by this coroutine, but `shared_accumulator` might have been updated by others.
+        }
+        // `shared_accumulator` is empty, we can atomically write results merged by this coroutine.
+        shared_accumulator = std::move(partial_result);
    });

    mapreduce_aggregates aggrs(req);
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -1053,29 +1053,20 @@ static future<schema_ptr> get_schema_definition(table_schema_version v, netw::me
            // with TTL (refresh TTL in case column mapping already existed prior to that).
            auto us = s.unfreeze(db::schema_ctxt(proxy));
            // if this is a view - sanity check that its schema doesn't need fixing.
+            schema_ptr base_schema;
            if (us->is_view()) {
                auto& db = proxy.local().local_db();
-                schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
+                base_schema = db.find_schema(us->view_info()->base_id());
                db::schema_tables::check_no_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema);
            }
-            return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
-                return frozen_schema{us};
+            return db::schema_tables::store_column_mapping(proxy, us, true).then([us, base_schema] -> base_and_view_schemas {
+                if (us->is_view()) {
+                    return {frozen_schema(us), base_schema};
+                } else {
+                    return {frozen_schema(us)};
+                }
            });
        });
-    }).then([&storage_proxy] (schema_ptr s) {
-        // If this is a view so this schema also needs a reference to the base
-        // table.
-        if (s->is_view()) {
-            if (!s->view_info()->base_info()) {
-                auto& db = storage_proxy.local_db();
-                // This line might throw a no_such_column_family
-                // It should be fine since if we tried to register a view for which
-                // we don't know the base table, our registry is broken.
-                schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
-                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
-            }
-        }
-        return s;
    });
 }

@@ -1099,6 +1090,8 @@ future<schema_ptr> migration_manager::get_schema_for_write(table_schema_version
    }

    if (!s) {
+        // The schema returned by get_schema_definition comes (eventually) from the schema registry,
+        // so if it is a view, it already has base info and we don't need to set it later
        s = co_await get_schema_definition(v, dst, ms, _storage_proxy);
    }

@@ -1111,23 +1104,6 @@ future<schema_ptr> migration_manager::get_schema_for_write(table_schema_version
            co_await maybe_sync(s, dst);
        }
    }
-    // here s is guaranteed to be valid and synced
-    if (s->is_view() && !s->view_info()->base_info()) {
-        // The way to get here is if the view schema was deactivated
-        // and reactivated again, or if we loaded it from the schema
-        // history.
-        auto& db = _storage_proxy.local_db();
-        // This line might throw a no_such_column_family but
-        // that is fine, if the schema is synced, it means that if
-        // we failed to get the base table, we learned about the base
-        // table not existing (which means that the view also doesn't exist
-        // any more), which means that this schema is actually useless for either
-        // read or write so we better throw than return an incomplete useless
-        // schema
-        schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
-        s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
-    }
-
    co_return s;
 }

--- a/service/pager/query_pagers.cc
+++ b/service/pager/query_pagers.cc
@@ -383,18 +383,14 @@ void query_pager::handle_result(
    auto view = query::result_view(*results);

    _last_pos = position_in_partition::for_partition_start();
-    uint64_t row_count;
+    uint64_t replica_row_count, row_count;
    if constexpr(!std::is_same_v<std::decay_t<Visitor>, noop_visitor>) {
        query_result_visitor<Visitor> v(std::forward<Visitor>(visitor));
        view.consume(_cmd->slice, v);

-        if (_last_pkey) {
-            update_slice(*_last_pkey);
-        }
-
        row_count = v.total_rows - v.dropped_rows;
-        _max = _max - row_count;
-        _exhausted = (v.total_rows < page_size && !results->is_short_read() && v.dropped_rows == 0) || _max == 0;
+        replica_row_count = v.total_rows;
+
        // If per partition limit is defined, we need to accumulate rows fetched for last partition key if the key matches
        if (_cmd->slice.partition_row_limit() < query::max_rows_if_set) {
            if (_last_pkey && v.last_pkey && _last_pkey->equal(*_query_schema, *v.last_pkey)) {
@@ -403,32 +399,30 @@ void query_pager::handle_result(
                _rows_fetched_for_last_partition = v.last_partition_row_count;
            }
        }
-        const auto& last_pos = results->last_position();
-        if (last_pos && !v.dropped_rows) {
-            _last_pkey = last_pos->partition;
-            _last_pos = last_pos->position;
-        } else {
-            _last_pkey = v.last_pkey;
-            if (v.last_ckey) {
-                _last_pos = position_in_partition::for_key(*v.last_ckey);
-            }
-        }
    } else {
        row_count = results->row_count() ? *results->row_count() : std::get<1>(view.count_partitions_and_rows());
-        _max = _max - row_count;
-        _exhausted = (row_count < page_size && !results->is_short_read()) || _max == 0;
+        replica_row_count = row_count;
+    }

-        if (!_exhausted) {
-            if (_last_pkey) {
-                update_slice(*_last_pkey);
-            }
+    {
+        _max = _max - row_count;
+        _exhausted = (replica_row_count < page_size && !results->is_short_read()) || _max == 0;
+
+        if (_last_pkey) {
+            update_slice(*_last_pkey);
+        }
+
+        // The last page can be truly empty -- with unset last-position and no data to calculate it based on.
+        if (!replica_row_count && !results->is_short_read()) {
+            _last_pkey = {};
+        } else {
            auto last_pos = results->get_or_calculate_last_position();
            _last_pkey = std::move(last_pos.partition);
            _last_pos = std::move(last_pos.position);
        }
    }

-    qlogger.debug("Fetched {} rows, max_remain={} {}", row_count, _max, _exhausted ? "(exh)" : "");
+    qlogger.debug("Fetched {} rows (kept {}), max_remain={} {}", replica_row_count, row_count, _max, _exhausted ? "(exh)" : "");

    if (_last_pkey) {
        qlogger.debug("Last partition key: {}", *_last_pkey);
--- a/Show More
+++ b/Show More