Compare commits
171 Commits
copilot/pr
...
SCYLLADB-6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3a901a1bf3 | ||
|
|
cc695bc3f7 | ||
|
|
4bfcd035ae | ||
|
|
c4a0f6f2e6 | ||
|
|
712ba5a31f | ||
|
|
961fc9e041 | ||
|
|
0a8dc4532b | ||
|
|
bb5c328a16 | ||
|
|
ea2a214959 | ||
|
|
65032877d4 | ||
|
|
de0bdf1a65 | ||
|
|
97430e2df5 | ||
|
|
5573c3b18e | ||
|
|
34473302b0 | ||
|
|
9898e5700b | ||
|
|
10c4b9b5b0 | ||
|
|
f9adbc7548 | ||
|
|
6b18d95dec | ||
|
|
89388510a0 | ||
|
|
6b259babeb | ||
|
|
062751fcec | ||
|
|
969dddb630 | ||
|
|
de21572b31 | ||
|
|
20b1531e6d | ||
|
|
c591b9ebe2 | ||
|
|
06006a6328 | ||
|
|
67d8cde42d | ||
|
|
04f046d2d8 | ||
|
|
e8b37d1a89 | ||
|
|
5d51501a0b | ||
|
|
4deeb7ebfc | ||
|
|
2a03c634c0 | ||
|
|
81c4e717e2 | ||
|
|
6b1df5202c | ||
|
|
171504c84f | ||
|
|
5e7fb08bf3 | ||
|
|
4981e72607 | ||
|
|
517bb8655d | ||
|
|
9b24d9ee7d | ||
|
|
86d7c82993 | ||
|
|
399260a6c0 | ||
|
|
f27dc12b7c | ||
|
|
3143134968 | ||
|
|
2e47fd9f56 | ||
|
|
a2ad57062f | ||
|
|
31d339e54a | ||
|
|
ad87eda835 | ||
|
|
a0da07e5b7 | ||
|
|
24379acc76 | ||
|
|
a9d0211a64 | ||
|
|
e7c3942d43 | ||
|
|
d69f7eb0ee | ||
|
|
65cd0b5639 | ||
|
|
b7bdb1010a | ||
|
|
8bd3bd7e2a | ||
|
|
caf5aa47c2 | ||
|
|
6ddb7a4d13 | ||
|
|
bd66edee5c | ||
|
|
489efca47c | ||
|
|
21db4f3ed8 | ||
|
|
37c485e3d1 | ||
|
|
31aefdc07d | ||
|
|
1231fafb46 | ||
|
|
17cb173e18 | ||
|
|
1da1bb9d99 | ||
|
|
b78cc787a6 | ||
|
|
600ec82bec | ||
|
|
009fc3757a | ||
|
|
b3293f8579 | ||
|
|
5a16980845 | ||
|
|
bc9fc96579 | ||
|
|
719f7cca57 | ||
|
|
521fca5c92 | ||
|
|
99c3b1998a | ||
|
|
ddd72a16b0 | ||
|
|
08bea860ef | ||
|
|
28f820eb1c | ||
|
|
5f649dd39f | ||
|
|
a521bcbcee | ||
|
|
1ae1f37ec1 | ||
|
|
2128b1b15c | ||
|
|
9172cc172e | ||
|
|
0b1343747f | ||
|
|
27fd0c119f | ||
|
|
ed852a2af2 | ||
|
|
88b98fac3a | ||
|
|
46a6f8e1d3 | ||
|
|
d6c01be09b | ||
|
|
4410e9c61a | ||
|
|
32f8609b89 | ||
|
|
6017688445 | ||
|
|
f55bb154ec | ||
|
|
1452e92567 | ||
|
|
75e6412b1c | ||
|
|
50dc7c6dd8 | ||
|
|
5e228a8387 | ||
|
|
2d77e4fc28 | ||
|
|
e9c98274b5 | ||
|
|
0e0f9f41b3 | ||
|
|
b6bfdeb111 | ||
|
|
3775593e53 | ||
|
|
6ee9bc63eb | ||
|
|
38d130d9d0 | ||
|
|
5ee61f067d | ||
|
|
2d16083ba6 | ||
|
|
1fbf3a4ba1 | ||
|
|
d4fdeb4839 | ||
|
|
0013f22374 | ||
|
|
ae17596c2a | ||
|
|
8b1ca6dcd6 | ||
|
|
d68c92ec04 | ||
|
|
b1d4fc5e6e | ||
|
|
21c603a79e | ||
|
|
34f3916e7d | ||
|
|
04bf631d7f | ||
|
|
cf578fd81a | ||
|
|
06d16b6ea2 | ||
|
|
7fdb1118f5 | ||
|
|
fca11c5a21 | ||
|
|
6f682f7eb1 | ||
|
|
61952cd985 | ||
|
|
c4cfb278bc | ||
|
|
c2a6d1e930 | ||
|
|
6dc4ea766b | ||
|
|
b09d45b89a | ||
|
|
580cc309d2 | ||
|
|
78c817f71e | ||
|
|
71e6918f28 | ||
|
|
278535e4e3 | ||
|
|
2e4b72c6b9 | ||
|
|
172c786079 | ||
|
|
5d868dcc55 | ||
|
|
f4a6bb1885 | ||
|
|
95bc8911dd | ||
|
|
a8dd13731f | ||
|
|
318aa07158 | ||
|
|
7f597aca67 | ||
|
|
dbe70cddca | ||
|
|
0fd51c4adb | ||
|
|
9fe19ec9d9 | ||
|
|
1a6a7647c6 | ||
|
|
9318c80203 | ||
|
|
edf0148bee | ||
|
|
ee87b66033 | ||
|
|
b508f3dd38 | ||
|
|
7659a5b878 | ||
|
|
5474cc6cc2 | ||
|
|
60aaea8547 | ||
|
|
1256a9faa7 | ||
|
|
7706c9e8c4 | ||
|
|
582a4abeb6 | ||
|
|
279fcdd5ff | ||
|
|
92ee959e9b | ||
|
|
6ac1f1333f | ||
|
|
16e7a88a02 | ||
|
|
147b355326 | ||
|
|
419e9aa323 | ||
|
|
3f70611504 | ||
|
|
7cdd979158 | ||
|
|
949fc85217 | ||
|
|
6b413e3959 | ||
|
|
b89840c4b9 | ||
|
|
9280a039ee | ||
|
|
cd13a911cc | ||
|
|
f375aae257 | ||
|
|
44b8cad3df | ||
|
|
afac984632 | ||
|
|
1a20877afe | ||
|
|
d763bdabc2 | ||
|
|
24e70b30c8 | ||
|
|
329c156600 |
18
.github/copilot-instructions.md
vendored
18
.github/copilot-instructions.md
vendored
@@ -55,22 +55,26 @@ ninja build/<mode>/test/boost/<test_name>
|
||||
ninja build/<mode>/scylla
|
||||
|
||||
# Run all tests in a file
|
||||
./test.py --mode=<mode> <test_path>
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py
|
||||
|
||||
# Run a single test case from a file
|
||||
./test.py --mode=<mode> <test_path>::<test_function_name>
|
||||
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
|
||||
|
||||
# Run all tests in a directory
|
||||
./test.py --mode=<mode> test/<suite>/
|
||||
|
||||
# Examples
|
||||
./test.py --mode=dev alternator/
|
||||
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
|
||||
./test.py --mode=dev test/alternator/
|
||||
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
|
||||
./test.py --mode=dev test/cqlpy/test_json.py
|
||||
|
||||
# Optional flags
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
|
||||
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
|
||||
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
|
||||
```
|
||||
|
||||
**Important:**
|
||||
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
|
||||
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
|
||||
- To run a single test case, append `::<test_function_name>` to the file path
|
||||
- Add `-v` for verbose output
|
||||
- Add `--repeat <num>` to repeat a test multiple times
|
||||
|
||||
@@ -1295,6 +1295,45 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_compaction",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger compaction of the key-value storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_compaction",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"major",
|
||||
"description":"When true, perform a major compaction",
|
||||
"required":false,
|
||||
"allowMultiple":false,
|
||||
"type":"boolean",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_flush",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Trigger flush of logstor storage",
|
||||
"type":"void",
|
||||
"nickname":"logstor_flush",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/active_repair/",
|
||||
"operations":[
|
||||
@@ -3229,6 +3268,38 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/logstor_info",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Logstor segment information for one table",
|
||||
"type":"table_logstor_info",
|
||||
"nickname":"logstor_info",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"keyspace",
|
||||
"description":"The keyspace",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"table",
|
||||
"description":"table name",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/retrain_dict",
|
||||
"operations":[
|
||||
@@ -3637,6 +3708,47 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"logstor_hist_bucket":{
|
||||
"id":"logstor_hist_bucket",
|
||||
"properties":{
|
||||
"bucket":{
|
||||
"type":"long"
|
||||
},
|
||||
"count":{
|
||||
"type":"long"
|
||||
},
|
||||
"min_data_size":{
|
||||
"type":"long"
|
||||
},
|
||||
"max_data_size":{
|
||||
"type":"long"
|
||||
}
|
||||
}
|
||||
},
|
||||
"table_logstor_info":{
|
||||
"id":"table_logstor_info",
|
||||
"description":"Per-table logstor segment distribution",
|
||||
"properties":{
|
||||
"keyspace":{
|
||||
"type":"string"
|
||||
},
|
||||
"table":{
|
||||
"type":"string"
|
||||
},
|
||||
"compaction_groups":{
|
||||
"type":"long"
|
||||
},
|
||||
"segments":{
|
||||
"type":"long"
|
||||
},
|
||||
"data_size_histogram":{
|
||||
"type":"array",
|
||||
"items":{
|
||||
"$ref":"logstor_hist_bucket"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tablet_repair_result":{
|
||||
"id":"tablet_repair_result",
|
||||
"description":"Tablet repair result",
|
||||
|
||||
@@ -209,6 +209,21 @@
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/system/chosen_sstable_version",
|
||||
"operations":[
|
||||
{
|
||||
"method":"GET",
|
||||
"summary":"Get sstable version currently chosen for use in new sstables",
|
||||
"type":"string",
|
||||
"nickname":"get_chosen_sstable_version",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -18,7 +18,9 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/estimated_histogram.hh"
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "db/data_listeners.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "storage_service.hh"
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "unimplemented.hh"
|
||||
@@ -342,6 +344,56 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
|
||||
return ret;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
cf::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
|
||||
cf::get_column_family_name.set(r, [&db] (const_req req){
|
||||
std::vector<sstring> res;
|
||||
@@ -1047,6 +1099,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
|
||||
});
|
||||
});
|
||||
|
||||
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
|
||||
return rest_toppartitions_generic(db, std::move(req));
|
||||
});
|
||||
|
||||
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
if (!req->get_query_param("split_output").empty()) {
|
||||
fail(unimplemented::cause::API);
|
||||
@@ -1213,6 +1269,7 @@ void unset_column_family(http_context& ctx, routes& r) {
|
||||
cf::get_sstable_count_per_level.unset(r);
|
||||
cf::get_sstables_for_key.unset(r);
|
||||
cf::toppartitions.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
cf::force_major_compaction.unset(r);
|
||||
ss::get_load.unset(r);
|
||||
ss::get_metrics_load.unset(r);
|
||||
|
||||
@@ -17,9 +17,7 @@
|
||||
#include "gms/feature_service.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "sstables/sstables_manager.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <time.h>
|
||||
#include <algorithm>
|
||||
@@ -612,56 +610,6 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
|
||||
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool filters_provided = false;
|
||||
|
||||
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
|
||||
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
table_filters.emplace(parse_fully_qualified_cf_name(filter));
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> keyspace_filters {};
|
||||
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
|
||||
filters_provided = true;
|
||||
std::stringstream ss { filters };
|
||||
std::string filter;
|
||||
while (!filters.empty() && ss.good()) {
|
||||
std::getline(ss, filter, ',');
|
||||
keyspace_filters.emplace(std::move(filter));
|
||||
}
|
||||
}
|
||||
|
||||
// when the query is empty return immediately
|
||||
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
|
||||
apilog.debug("toppartitions query: processing results");
|
||||
httpd::column_family_json::toppartitions_query_results results;
|
||||
|
||||
results.read_cardinality = 0;
|
||||
results.write_cardinality = 0;
|
||||
|
||||
return make_ready_future<json::json_return_type>(results);
|
||||
}
|
||||
|
||||
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
|
||||
api::req_param<unsigned> capacity(*req, "capacity", 256);
|
||||
api::req_param<unsigned> list_size(*req, "list_size", 10);
|
||||
|
||||
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
|
||||
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
|
||||
|
||||
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
|
||||
return run_toppartitions_query(q);
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
json::json_return_type
|
||||
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
|
||||
@@ -833,6 +781,28 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
bool major = false;
|
||||
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
|
||||
major = validate_bool(major_param);
|
||||
}
|
||||
apilog.info("logstor_compaction: major={}", major);
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
apilog.info("logstor_flush");
|
||||
auto& db = ctx.db;
|
||||
co_await replica::database::flush_logstor_separator_on_all_shards(db);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
|
||||
@@ -1553,6 +1523,54 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
|
||||
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
|
||||
auto table = api::req_param<sstring>(*req, "table", {}).value;
|
||||
if (table.empty()) {
|
||||
table = api::req_param<sstring>(*req, "cf", {}).value;
|
||||
}
|
||||
|
||||
if (keyspace.empty()) {
|
||||
throw bad_param_exception("The query parameter 'keyspace' is required");
|
||||
}
|
||||
if (table.empty()) {
|
||||
throw bad_param_exception("The query parameter 'table' is required");
|
||||
}
|
||||
|
||||
keyspace = validate_keyspace(ctx, keyspace);
|
||||
auto tid = validate_table(ctx.db.local(), keyspace, table);
|
||||
|
||||
auto& cf = ctx.db.local().find_column_family(tid);
|
||||
if (!cf.uses_logstor()) {
|
||||
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
|
||||
}
|
||||
|
||||
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
|
||||
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
|
||||
merged_stats += shard_stats;
|
||||
}, [tid](const replica::database& db) {
|
||||
return db.get_logstor_table_segment_stats(tid);
|
||||
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
|
||||
ss::table_logstor_info result;
|
||||
result.keyspace = keyspace;
|
||||
result.table = table;
|
||||
result.compaction_groups = merged_stats.compaction_group_count;
|
||||
result.segments = merged_stats.segment_count;
|
||||
|
||||
for (const auto& bucket : merged_stats.histogram) {
|
||||
ss::logstor_hist_bucket hist;
|
||||
hist.count = bucket.count;
|
||||
hist.max_data_size = bucket.max_data_size;
|
||||
result.data_size_histogram.push(std::move(hist));
|
||||
}
|
||||
|
||||
return make_ready_future<json::json_return_type>(stream_object(result));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
|
||||
@@ -1784,7 +1802,6 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
@@ -1800,6 +1817,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
@@ -1848,6 +1867,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
@@ -1864,7 +1884,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_token_endpoint.unset(r);
|
||||
ss::toppartitions_generic.unset(r);
|
||||
ss::get_release_version.unset(r);
|
||||
ss::get_scylla_release_version.unset(r);
|
||||
ss::get_schema_version.unset(r);
|
||||
@@ -1878,6 +1897,8 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::reset_cleanup_needed.unset(r);
|
||||
ss::force_flush.unset(r);
|
||||
ss::force_keyspace_flush.unset(r);
|
||||
ss::logstor_compaction.unset(r);
|
||||
ss::logstor_flush.unset(r);
|
||||
ss::decommission.unset(r);
|
||||
ss::move.unset(r);
|
||||
ss::remove_node.unset(r);
|
||||
@@ -1925,6 +1946,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
|
||||
ss::get_ownership.unset(r);
|
||||
ss::get_effective_ownership.unset(r);
|
||||
ss::sstable_info.unset(r);
|
||||
ss::logstor_info.unset(r);
|
||||
ss::reload_raft_topology_state.unset(r);
|
||||
ss::upgrade_to_raft_topology.unset(r);
|
||||
ss::raft_topology_upgrade_status.unset(r);
|
||||
|
||||
@@ -190,6 +190,13 @@ void set_system(http_context& ctx, routes& r) {
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
|
||||
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return smp::submit_to(0, [&ctx] {
|
||||
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
|
||||
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
|
||||
_permission_loader = std::move(loader);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
|
||||
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
|
||||
auto it = _roles.find(role);
|
||||
if (it == _roles.end()) {
|
||||
return {};
|
||||
@@ -55,6 +55,16 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
|
||||
for (const auto& [name, record] : _roles) {
|
||||
func(name, *record);
|
||||
}
|
||||
}
|
||||
|
||||
size_t cache::roles_count() const noexcept {
|
||||
return _roles.size();
|
||||
}
|
||||
|
||||
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
|
||||
std::unordered_map<resource, permission_set>* perms_cache;
|
||||
lw_shared_ptr<role_record> role_ptr;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <string_view>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -19,7 +20,7 @@
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
#include "absl-flat_hash_map.hh"
|
||||
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/common.hh"
|
||||
@@ -42,8 +43,8 @@ public:
|
||||
std::unordered_set<role_name_t> member_of;
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring> attributes;
|
||||
std::unordered_map<sstring, permission_set> permissions;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
@@ -52,7 +53,7 @@ public:
|
||||
};
|
||||
|
||||
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
|
||||
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
|
||||
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
|
||||
void set_permission_loader(permission_loader_func loader);
|
||||
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
|
||||
future<> prune(const resource& r);
|
||||
@@ -61,8 +62,15 @@ public:
|
||||
future<> load_roles(std::unordered_set<role_name_t> roles);
|
||||
static bool includes_table(const table_id&) noexcept;
|
||||
|
||||
// Returns the number of roles in the cache.
|
||||
size_t roles_count() const noexcept;
|
||||
|
||||
// The callback doesn't suspend (no co_await) so it observes the state
|
||||
// of the cache atomically.
|
||||
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
|
||||
|
||||
private:
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
|
||||
roles_map _roles;
|
||||
// anonymous permissions map exists mainly due to compatibility with
|
||||
// higher layers which use role_or_anonymous to get permissions.
|
||||
|
||||
37
auth/maintenance_socket_authorizer.hh
Normal file
37
auth/maintenance_socket_authorizer.hh
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/permission.hh"
|
||||
|
||||
namespace auth {
|
||||
|
||||
// maintenance_socket_authorizer is used for clients connecting to the
|
||||
// maintenance socket. It grants all permissions unconditionally (like
|
||||
// AllowAllAuthorizer) while still supporting grant/revoke operations
|
||||
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
|
||||
class maintenance_socket_authorizer : public default_authorizer {
|
||||
public:
|
||||
using default_authorizer::default_authorizer;
|
||||
|
||||
~maintenance_socket_authorizer() override = default;
|
||||
|
||||
future<> start() override {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
|
||||
return make_ready_future<permission_set>(permissions::ALL);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace auth
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "auth/default_authorizer.hh"
|
||||
#include "auth/ldap_role_manager.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_authorizer.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include "auth/password_authenticator.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
@@ -866,6 +867,12 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
};
|
||||
}
|
||||
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
|
||||
return [&qp] {
|
||||
return std::make_unique<maintenance_socket_authorizer>(qp.local());
|
||||
};
|
||||
}
|
||||
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
sharded<cql3::query_processor>& qp,
|
||||
::service::raft_group0_client& g0,
|
||||
|
||||
@@ -434,6 +434,11 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
|
||||
sharded<::service::migration_manager>& mm,
|
||||
sharded<cache>& cache);
|
||||
|
||||
/// Creates a factory for the maintenance socket authorizer.
|
||||
/// This authorizer is not config-selectable and is only used for the maintenance socket.
|
||||
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
|
||||
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
|
||||
|
||||
/// Creates a factory for the maintenance socket role manager.
|
||||
/// This role manager is not config-selectable and is only used for the maintenance socket.
|
||||
role_manager_factory make_maintenance_socket_role_manager_factory(
|
||||
|
||||
@@ -44,13 +44,12 @@ namespace auth {
|
||||
static logging::logger log("standard_role_manager");
|
||||
|
||||
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
|
||||
auto name = sstring(role_name);
|
||||
auto role = _cache.get(name);
|
||||
auto role = _cache.get(role_name);
|
||||
if (!role) {
|
||||
return make_ready_future<std::optional<record>>(std::nullopt);
|
||||
}
|
||||
return make_ready_future<std::optional<record>>(std::make_optional(record{
|
||||
.name = std::move(name),
|
||||
.name = sstring(role_name),
|
||||
.is_superuser = role->is_superuser,
|
||||
.can_login = role->can_login,
|
||||
.member_of = role->member_of
|
||||
@@ -393,51 +392,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
|
||||
}
|
||||
|
||||
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT * FROM {}.{}",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_MEMBERS_CF);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_to_directly_granted_map roles_map;
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles_map, roles_map.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
|
||||
);
|
||||
|
||||
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
for (const auto& granted_role : record.member_of) {
|
||||
roles_map.emplace(name, granted_role);
|
||||
}
|
||||
});
|
||||
co_return roles_map;
|
||||
}
|
||||
|
||||
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT {} FROM {}.{}",
|
||||
meta::roles_table::role_col_name,
|
||||
db::system_keyspace::NAME,
|
||||
meta::roles_table::name);
|
||||
|
||||
// To avoid many copies of a view.
|
||||
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
|
||||
|
||||
const auto results = co_await _qp.execute_internal(
|
||||
query,
|
||||
db::consistency_level::LOCAL_ONE,
|
||||
qs,
|
||||
cql3::query_processor::cache_internal::yes);
|
||||
|
||||
role_set roles;
|
||||
std::transform(
|
||||
results->begin(),
|
||||
results->end(),
|
||||
std::inserter(roles, roles.begin()),
|
||||
[] (const cql3::untyped_result_set_row& row) {
|
||||
return row.get_as<sstring>(role_col_name_string);}
|
||||
);
|
||||
roles.reserve(_cache.roles_count());
|
||||
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
|
||||
roles.insert(name);
|
||||
});
|
||||
co_return roles;
|
||||
}
|
||||
|
||||
@@ -460,31 +429,26 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
|
||||
}
|
||||
|
||||
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
|
||||
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
|
||||
db::system_keyspace::NAME,
|
||||
ROLE_ATTRIBUTES_CF);
|
||||
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
|
||||
if (!result_set->empty()) {
|
||||
const cql3::untyped_result_set_row &row = result_set->one();
|
||||
co_return std::optional<sstring>(row.get_as<sstring>("value"));
|
||||
auto role = _cache.get(role_name);
|
||||
if (!role) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
co_return std::optional<sstring>{};
|
||||
auto it = role->attributes.find(attribute_name);
|
||||
if (it != role->attributes.end()) {
|
||||
co_return it->second;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
|
||||
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
|
||||
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
|
||||
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
|
||||
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
|
||||
if (att_val) {
|
||||
role_to_att_val.emplace(std::move(role), std::move(*att_val));
|
||||
}
|
||||
});
|
||||
}).then([&role_to_att_val] () {
|
||||
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
|
||||
});
|
||||
});
|
||||
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
|
||||
attribute_vals result;
|
||||
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
|
||||
auto it = record.attributes.find(attribute_name);
|
||||
if (it != record.attributes.end()) {
|
||||
result.emplace(name, it->second);
|
||||
}
|
||||
});
|
||||
co_return result;
|
||||
}
|
||||
|
||||
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
|
||||
|
||||
@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
|
||||
if (dsm && (this_shard_id() == 0)) {
|
||||
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
|
||||
if (threshold_reached) {
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = true;
|
||||
return cm.drain();
|
||||
});
|
||||
}
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = false;
|
||||
cm.enable();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2348,6 +2354,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
|
||||
}
|
||||
|
||||
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
|
||||
std::exception_ptr ex;
|
||||
if (_in_critical_disk_utilization_mode) {
|
||||
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
|
||||
} else {
|
||||
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
|
||||
}
|
||||
return ex;
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
@@ -2357,8 +2373,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
co_return coroutine::exception(make_disabled_exception(t));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
|
||||
@@ -115,6 +115,8 @@ private:
|
||||
uint32_t _disabled_state_count = 0;
|
||||
|
||||
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
|
||||
// precondition: is_disabled() is true.
|
||||
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
|
||||
|
||||
std::optional<future<>> _stop_future;
|
||||
|
||||
@@ -170,6 +172,7 @@ private:
|
||||
shared_tombstone_gc_state _shared_tombstone_gc_state;
|
||||
|
||||
utils::disk_space_monitor::subscription _out_of_space_subscription;
|
||||
bool _in_critical_disk_utilization_mode = false;
|
||||
private:
|
||||
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
|
||||
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
|
||||
|
||||
@@ -397,6 +397,17 @@ commitlog_total_space_in_mb: -1
|
||||
# you can cache more hot rows
|
||||
# column_index_size_in_kb: 64
|
||||
|
||||
# sstable format version for newly written sstables.
|
||||
# Currently allowed values are `me` and `ms`.
|
||||
# If not specified in the config, this defaults to `me`.
|
||||
#
|
||||
# The difference between `me` and `ms` are the data structures used
|
||||
# in the primary index.
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
sstable_format: ms
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
# vs. too small column_index_size_in_kb). When the serialized representation
|
||||
|
||||
@@ -896,6 +896,9 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'replica/multishard_query.cc',
|
||||
'replica/mutation_dump.cc',
|
||||
'replica/querier.cc',
|
||||
'replica/logstor/segment_manager.cc',
|
||||
'replica/logstor/logstor.cc',
|
||||
'replica/logstor/write_buffer.cc',
|
||||
'mutation/atomic_cell.cc',
|
||||
'mutation/canonical_mutation.cc',
|
||||
'mutation/frozen_mutation.cc',
|
||||
@@ -1467,6 +1470,7 @@ idls = ['idl/gossip_digest.idl.hh',
|
||||
'idl/query.idl.hh',
|
||||
'idl/idl_test.idl.hh',
|
||||
'idl/commitlog.idl.hh',
|
||||
'idl/logstor.idl.hh',
|
||||
'idl/tracing.idl.hh',
|
||||
'idl/consistency_level.idl.hh',
|
||||
'idl/cache_temperature.idl.hh',
|
||||
|
||||
@@ -265,7 +265,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
|
||||
exceptions::invalid_request_exception(
|
||||
format("Consistency level {} is not allowed for write operations", cl)));
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl)));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < _statements.size(); ++i) {
|
||||
@@ -277,7 +280,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
_stats.statements_in_cas_batches += _statements.size();
|
||||
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
return result;
|
||||
});
|
||||
@@ -297,7 +301,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
}
|
||||
auto result = make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
|
||||
});
|
||||
|
||||
@@ -59,6 +59,8 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
|
||||
|
||||
const sstring cf_prop_defs::KW_TABLETS = "tablets";
|
||||
|
||||
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
|
||||
|
||||
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
|
||||
schema::extensions_map er;
|
||||
for (auto& p : exts.schema_extensions()) {
|
||||
@@ -106,6 +108,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
|
||||
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
|
||||
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
|
||||
KW_STORAGE_ENGINE,
|
||||
});
|
||||
static std::set<sstring> obsolete_keywords({
|
||||
sstring("index_interval"),
|
||||
@@ -196,6 +199,20 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
}
|
||||
db::tablet_options::validate(*tablet_options_map);
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
if (!db.features().logstor) {
|
||||
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
if (!db.get_config().enable_logstor()) {
|
||||
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
|
||||
}
|
||||
} else {
|
||||
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
|
||||
@@ -396,6 +413,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
|
||||
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
|
||||
builder.set_tablet_options(std::move(*tablet_options_opt));
|
||||
}
|
||||
|
||||
if (has_property(KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const
|
||||
|
||||
@@ -64,6 +64,8 @@ public:
|
||||
|
||||
static const sstring KW_TABLETS;
|
||||
|
||||
static const sstring KW_STORAGE_ENGINE;
|
||||
|
||||
// FIXME: In origin the following consts are in CFMetaData.
|
||||
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
|
||||
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
|
||||
#include "cql3/statements/cf_prop_defs.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include <inttypes.h>
|
||||
#include <boost/regex.hpp>
|
||||
@@ -266,6 +267,13 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
|
||||
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
|
||||
}
|
||||
|
||||
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
|
||||
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
|
||||
if (storage_engine == "logstor" && !_column_aliases.empty()) {
|
||||
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
|
||||
}
|
||||
}
|
||||
|
||||
auto& key_aliases = _key_aliases[0];
|
||||
std::vector<data_type> key_types;
|
||||
for (auto&& alias : key_aliases) {
|
||||
|
||||
@@ -273,7 +273,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
|
||||
co_return coroutine::exception(
|
||||
std::make_exception_ptr(exceptions::invalid_request_exception(
|
||||
format("Consistency level {} is not allowed for write operations", cl))));
|
||||
format("Write consistency level {} is forbidden by the current configuration "
|
||||
"setting of write_consistency_levels_disallowed. Please use a different "
|
||||
"consistency level, or remove {} from write_consistency_levels_disallowed "
|
||||
"set in the configuration.", cl, cl))));
|
||||
}
|
||||
|
||||
_restrictions->validate_primary_key(options);
|
||||
@@ -281,7 +284,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
if (has_conditions()) {
|
||||
auto result = co_await execute_with_condition(qp, qs, options);
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
@@ -303,7 +307,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
|
||||
|
||||
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
|
||||
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
|
||||
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
|
||||
result->add_warning(format("Using write consistency level {} listed on the "
|
||||
"write_consistency_levels_warned is not recommended.", cl));
|
||||
}
|
||||
if (keys_size_one) {
|
||||
auto&& table = s->table();
|
||||
|
||||
15
db/config.cc
15
db/config.cc
@@ -679,6 +679,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"The directory where hints files are stored if hinted handoff is enabled.")
|
||||
, view_hints_directory(this, "view_hints_directory", value_status::Used, "",
|
||||
"The directory where materialized-view updates are stored while a view replica is unreachable.")
|
||||
, logstor_directory(this, "logstor_directory", value_status::Used, "",
|
||||
"The directory where data files for logstor storage are stored.")
|
||||
, saved_caches_directory(this, "saved_caches_directory", value_status::Unused, "",
|
||||
"The directory location where table key and row caches are stored.")
|
||||
/**
|
||||
@@ -862,6 +864,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"* offheap_objects Native memory, eliminating NIO buffer heap overhead.")
|
||||
, memtable_cleanup_threshold(this, "memtable_cleanup_threshold", value_status::Invalid, .11,
|
||||
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load.")
|
||||
, logstor_disk_size_in_mb(this, "logstor_disk_size_in_mb", value_status::Used, 2048,
|
||||
"Total size in megabytes allocated for logstor storage on disk.")
|
||||
, logstor_file_size_in_mb(this, "logstor_file_size_in_mb", value_status::Used, 32,
|
||||
"Total size in megabytes allocated for each logstor data file on disk.")
|
||||
, logstor_separator_delay_limit_ms(this, "logstor_separator_delay_limit_ms", value_status::Used, 100,
|
||||
"Maximum delay in milliseconds for logstor separator debt control.")
|
||||
, logstor_separator_max_memory_in_mb(this, "logstor_separator_max_memory_in_mb", value_status::Used, 256,
|
||||
"Maximum memory in megabytes for logstor separator memory buffers.")
|
||||
, file_cache_size_in_mb(this, "file_cache_size_in_mb", value_status::Unused, 512,
|
||||
"Total memory to use for SSTable-reading buffers.")
|
||||
, memtable_flush_queue_size(this, "memtable_flush_queue_size", value_status::Unused, 4,
|
||||
@@ -1281,6 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted).")
|
||||
, enable_cache(this, "enable_cache", value_status::Used, true, "Enable cache.")
|
||||
, enable_commitlog(this, "enable_commitlog", value_status::Used, true, "Enable commitlog.")
|
||||
, enable_logstor(this, "enable_logstor", value_status::Used, false, "Enable the logstor storage engine.")
|
||||
, volatile_system_keyspace_for_testing(this, "volatile_system_keyspace_for_testing", value_status::Used, false, "Don't persist system keyspace - testing only!")
|
||||
, api_port(this, "api_port", value_status::Used, 10000, "Http Rest API port.")
|
||||
, api_address(this, "api_address", value_status::Used, "", "Http Rest API address.")
|
||||
@@ -1692,6 +1703,7 @@ void db::config::setup_directories() {
|
||||
maybe_in_workdir(data_file_directories, "data");
|
||||
maybe_in_workdir(hints_directory, "hints");
|
||||
maybe_in_workdir(view_hints_directory, "view_hints");
|
||||
maybe_in_workdir(logstor_directory, "logstor");
|
||||
maybe_in_workdir(saved_caches_directory, "saved_caches");
|
||||
}
|
||||
|
||||
@@ -1861,7 +1873,8 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
|
||||
{"tablets", feature::UNUSED},
|
||||
{"views-with-tablets", feature::UNUSED},
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES}
|
||||
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES},
|
||||
{"logstor", feature::LOGSTOR}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -117,7 +117,8 @@ struct experimental_features_t {
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
LOGSTOR,
|
||||
};
|
||||
static std::map<sstring, feature> map(); // See enum_option.
|
||||
static std::vector<enum_option<experimental_features_t>> all();
|
||||
@@ -201,6 +202,7 @@ public:
|
||||
named_value<uint64_t> data_file_capacity;
|
||||
named_value<sstring> hints_directory;
|
||||
named_value<sstring> view_hints_directory;
|
||||
named_value<sstring> logstor_directory;
|
||||
named_value<sstring> saved_caches_directory;
|
||||
named_value<sstring> commit_failure_policy;
|
||||
named_value<sstring> disk_failure_policy;
|
||||
@@ -244,6 +246,10 @@ public:
|
||||
named_value<bool> defragment_memory_on_idle;
|
||||
named_value<sstring> memtable_allocation_type;
|
||||
named_value<double> memtable_cleanup_threshold;
|
||||
named_value<uint32_t> logstor_disk_size_in_mb;
|
||||
named_value<uint32_t> logstor_file_size_in_mb;
|
||||
named_value<uint32_t> logstor_separator_delay_limit_ms;
|
||||
named_value<uint32_t> logstor_separator_max_memory_in_mb;
|
||||
named_value<uint32_t> file_cache_size_in_mb;
|
||||
named_value<uint32_t> memtable_flush_queue_size;
|
||||
named_value<uint32_t> memtable_flush_writers;
|
||||
@@ -364,6 +370,7 @@ public:
|
||||
named_value<bool> enable_in_memory_data_store;
|
||||
named_value<bool> enable_cache;
|
||||
named_value<bool> enable_commitlog;
|
||||
named_value<bool> enable_logstor;
|
||||
named_value<bool> volatile_system_keyspace_for_testing;
|
||||
named_value<uint16_t> api_port;
|
||||
named_value<sstring> api_address;
|
||||
|
||||
@@ -336,6 +336,8 @@ schema_ptr scylla_tables(schema_features features) {
|
||||
// since it is written to only after the cluster feature is enabled.
|
||||
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
|
||||
|
||||
sb.with_column("storage_engine", utf8_type);
|
||||
|
||||
sb.with_hash_version();
|
||||
s = sb.build();
|
||||
}
|
||||
@@ -1676,6 +1678,9 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
|
||||
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
|
||||
}
|
||||
}
|
||||
if (table->logstor_enabled()) {
|
||||
m.set_clustered_cell(ckey, "storage_engine", "logstor", timestamp);
|
||||
}
|
||||
// In-memory tables are deprecated since scylla-2024.1.0
|
||||
// FIXME: delete the column when there's no live version supporting it anymore.
|
||||
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
|
||||
@@ -2161,6 +2166,13 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
|
||||
auto tablet_options = db::tablet_options(*opt_map);
|
||||
builder.set_tablet_options(tablet_options.to_map());
|
||||
}
|
||||
if (auto storage_engine = table_row.get<sstring>("storage_engine")) {
|
||||
if (*storage_engine == "logstor") {
|
||||
builder.set_logstor();
|
||||
} else {
|
||||
throw std::invalid_argument(format("Invalid value for storage_engine: {}", *storage_engine));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)
|
||||
|
||||
@@ -3052,7 +3052,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
|
||||
const bool tablet_balancing_not_supported = _db.features().strongly_consistent_tables || _db.features().logstor;
|
||||
|
||||
for (auto& row : *rs) {
|
||||
if (!row.has("host_id")) {
|
||||
@@ -3289,7 +3289,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
|
||||
}
|
||||
|
||||
if (strongly_consistent_tables) {
|
||||
if (tablet_balancing_not_supported) {
|
||||
ret.tablet_balancing_enabled = false;
|
||||
} else if (some_row.has("tablet_balancing_enabled")) {
|
||||
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
|
||||
|
||||
@@ -2647,7 +2647,7 @@ future<> view_builder::add_new_view(view_ptr view, build_step& step) {
|
||||
}
|
||||
|
||||
if (this_shard_id() == smp::count - 1) {
|
||||
co_await utils::get_local_injector().inject("add_new_view_pause_last_shard", utils::wait_for_message(5min));
|
||||
inject_failure("add_new_view_fail_last_shard");
|
||||
}
|
||||
|
||||
co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -52,6 +77,10 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -223,6 +252,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
2
dist/common/sysconfig/scylla-node-exporter
vendored
2
dist/common/sysconfig/scylla-node-exporter
vendored
@@ -1 +1 @@
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --collector.systemd --collector.systemd.unit-include='^(scylla-server|systemd-coredump.*)\.service$' --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
|
||||
|
||||
@@ -139,7 +139,7 @@ The ``WHERE`` clause
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``WHERE`` clause specifies which rows must be queried. It is composed of relations on the columns that are part of
|
||||
the ``PRIMARY KEY``.
|
||||
the ``PRIMARY KEY``, and relations can be joined only with ``AND`` (``OR`` and other logical operators are not supported).
|
||||
|
||||
Not all relations are allowed in a query. For instance, non-equal relations (where ``IN`` is considered as an equal
|
||||
relation) on a partition key are not supported (see the use of the ``TOKEN`` method below to do non-equal queries on
|
||||
@@ -200,6 +200,23 @@ The tuple notation may also be used for ``IN`` clauses on clustering columns::
|
||||
WHERE userid = 'john doe'
|
||||
AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
|
||||
|
||||
This tuple notation is different from boolean grouping. For example, the following query is not supported::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
|
||||
because parentheses are only allowed around a single relation, so this works: ``(country = 'BR') AND (state = 'SP')``, but this does not: ``(country = 'BR' AND state = 'SP')``.
|
||||
Similarly, an extended query of the form of::
|
||||
|
||||
SELECT * FROM users
|
||||
WHERE (country = 'BR' AND state = 'SP')
|
||||
OR (country = 'BR' AND state = 'RJ')
|
||||
|
||||
won't work due to both: grouping boolean expressions and not supporting ``OR``, so when possible,
|
||||
rewrite such queries with ``IN`` on the varying column, for example
|
||||
``country = 'BR' AND state IN ('SP', 'RJ')``, or run multiple queries and merge
|
||||
the results client-side.
|
||||
|
||||
The ``CONTAINS`` operator may only be used on collection columns (lists, sets, and maps). In the case of maps,
|
||||
``CONTAINS`` applies to the map values. The ``CONTAINS KEY`` operator may only be used on map columns and applies to the
|
||||
map keys.
|
||||
|
||||
236
docs/cql/guardrails.rst
Normal file
236
docs/cql/guardrails.rst
Normal file
@@ -0,0 +1,236 @@
|
||||
.. highlight:: cql
|
||||
|
||||
.. _cql-guardrails:
|
||||
|
||||
CQL Guardrails
|
||||
==============
|
||||
|
||||
ScyllaDB provides a set of configurable guardrail parameters that help operators
|
||||
enforce best practices and prevent misconfigurations that could degrade cluster
|
||||
health, availability, or performance. Guardrails operate at two severity levels:
|
||||
|
||||
* **Warn**: The request succeeds, but the server includes a warning in the CQL
|
||||
response. Depending on the specific guardrail, the warning may also be logged on the server side.
|
||||
* **Fail**: The request is rejected with an error/exception (the specific type
|
||||
depends on the guardrail). The user must correct the request or adjust the
|
||||
guardrail configuration to proceed.
|
||||
|
||||
.. note::
|
||||
|
||||
Guardrails are checked only when a statement is
|
||||
executed. They do not retroactively validate existing keyspaces, tables, or
|
||||
previously completed writes.
|
||||
|
||||
For the full list of configuration properties, including types, defaults, and
|
||||
liveness information, see :doc:`Configuration Parameters </reference/configuration-parameters>`.
|
||||
|
||||
.. _guardrails-replication-factor:
|
||||
|
||||
Replication Factor Guardrails
|
||||
-----------------------------
|
||||
|
||||
These four parameters control the minimum and maximum allowed replication factor
|
||||
(RF) values. They are evaluated whenever a ``CREATE KEYSPACE`` or
|
||||
``ALTER KEYSPACE`` statement is executed. Each data center's RF is checked
|
||||
individually.
|
||||
|
||||
An RF of ``0`` — which means "do not replicate to this data center" — is
|
||||
always allowed and never triggers a guardrail.
|
||||
|
||||
A threshold value of ``-1`` disables the corresponding check.
|
||||
|
||||
``minimum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** The default of ``3`` is the standard recommendation for
|
||||
production clusters. An RF below ``3`` means that the cluster cannot tolerate
|
||||
even a single node failure without data loss or read unavailability (assuming
|
||||
``QUORUM`` consistency). Keep this at ``3`` unless your deployment has specific
|
||||
constraints (e.g., a development or test cluster with fewer than 3 nodes).
|
||||
|
||||
``minimum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF is set to a value greater than ``0`` and lower than
|
||||
this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter (e.g., set to ``3``) in production
|
||||
environments where allowing a low RF would be operationally dangerous. Unlike
|
||||
the warn threshold, this provides a hard guarantee that no keyspace can be
|
||||
created or altered to have an RF below the limit.
|
||||
|
||||
``maximum_replication_factor_warn_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the server attaches a warning to the CQL response identifying
|
||||
the offending data center and RF value.
|
||||
|
||||
**When to use.** An excessively high RF increases write amplification and
|
||||
storage costs proportionally. For example, an RF of ``5`` means every write
|
||||
is replicated to five nodes. Set this threshold to alert operators who
|
||||
may unintentionally set an RF that is too high.
|
||||
|
||||
``maximum_replication_factor_fail_threshold``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If any data center's RF exceeds this threshold, the request is rejected with a ``ConfigurationException``
|
||||
identifying the offending data center and RF value.
|
||||
|
||||
**When to use.** Enable this parameter to prevent accidental creation of
|
||||
keyspaces with an unreasonably high RF. An extremely high RF wastes storage and
|
||||
network bandwidth and can lead to write latency spikes. This is a hard limit —
|
||||
the keyspace creation or alteration will not proceed until the RF is lowered.
|
||||
|
||||
**Metrics.** ScyllaDB exposes per-shard metrics that track the number of
|
||||
times each replication factor guardrail has been triggered:
|
||||
|
||||
* ``scylla_cql_minimum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_minimum_replication_factor_fail_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_warn_violations``
|
||||
* ``scylla_cql_maximum_replication_factor_fail_violations``
|
||||
|
||||
A sustained increase in any of these metrics indicates that
|
||||
``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` requests are hitting the configured
|
||||
thresholds.
|
||||
|
||||
.. _guardrails-replication-strategy:
|
||||
|
||||
Replication Strategy Guardrails
|
||||
-------------------------------
|
||||
|
||||
These two parameters control which replication strategies trigger warnings or
|
||||
are rejected when a keyspace is created or altered.
|
||||
|
||||
``replication_strategy_warn_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the server attaches a warning to the CQL response
|
||||
identifying the discouraged strategy and the affected keyspace.
|
||||
|
||||
**When to use.** ``SimpleStrategy`` is not recommended for production use.
|
||||
It places replicas without awareness of data center or rack topology, which
|
||||
can undermine fault tolerance in multi-DC deployments. Even in single-DC
|
||||
deployments, ``NetworkTopologyStrategy`` is recommended because it keeps the
|
||||
schema ready for future topology changes.
|
||||
|
||||
The default configuration warns on ``SimpleStrategy``, which is appropriate
|
||||
for most deployments. If you have existing keyspaces that use
|
||||
``SimpleStrategy``, see :doc:`Update Topology Strategy From Simple to Network
|
||||
</operating-scylla/procedures/cluster-management/update-topology-strategy-from-simple-to-network>`
|
||||
for the migration procedure.
|
||||
|
||||
``replication_strategy_fail_list``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
|
||||
statement is on this list, the request is rejected with a
|
||||
``ConfigurationException`` identifying the forbidden strategy and the affected
|
||||
keyspace.
|
||||
|
||||
**When to use.** In production environments, add ``SimpleStrategy`` to this
|
||||
list to enforce ``NetworkTopologyStrategy`` across all keyspaces. This helps
|
||||
prevent new production keyspaces from being created with a topology-unaware
|
||||
strategy.
|
||||
|
||||
**Metrics.** The following per-shard metrics track replication strategy
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_replication_strategy_warn_list_violations``
|
||||
* ``scylla_cql_replication_strategy_fail_list_violations``
|
||||
|
||||
.. _guardrails-write-consistency-level:
|
||||
|
||||
Write Consistency Level Guardrails
|
||||
----------------------------------
|
||||
|
||||
These two parameters control which consistency levels (CL) are allowed for
|
||||
write operations (``INSERT``, ``UPDATE``, ``DELETE``, and ``BATCH``
|
||||
statements).
|
||||
|
||||
Be aware that adding warnings to CQL responses can significantly increase
|
||||
network traffic and reduce overall throughput.
|
||||
|
||||
``write_consistency_levels_warned``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the server attaches
|
||||
a warning to the CQL response identifying the discouraged consistency level.
|
||||
|
||||
**When to use.** Use this parameter to alert application developers when they
|
||||
use a consistency level that, while technically functional, is not recommended
|
||||
for the workload. Common examples:
|
||||
|
||||
* **Warn on** ``ANY``: writes at ``ANY`` are acknowledged as soon as at least
|
||||
one node (including a coordinator acting as a hinted handoff store) receives
|
||||
the mutation. This means data may not be persisted on any replica node at
|
||||
the time of acknowledgement, risking data loss if the coordinator fails
|
||||
before hinted handoff completes.
|
||||
* **Warn on** ``ALL``: writes at ``ALL`` require every replica to acknowledge
|
||||
the write. If any single replica is down, the write fails. This significantly
|
||||
reduces write availability.
|
||||
|
||||
``write_consistency_levels_disallowed``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If a write operation uses a consistency level on this list, the request is
|
||||
rejected with an ``InvalidRequestException`` identifying the forbidden
|
||||
consistency level.
|
||||
|
||||
**When to use.** Use this parameter to hard-block consistency levels that are
|
||||
considered unsafe for your deployment:
|
||||
|
||||
* **Disallow** ``ANY``: in production environments, ``ANY`` is almost never
|
||||
appropriate. It provides the weakest durability guarantee and is a common
|
||||
source of data-loss incidents when operators or application developers use it
|
||||
unintentionally.
|
||||
* **Disallow** ``ALL``: in clusters where high write availability is critical,
|
||||
blocking ``ALL`` prevents a single node failure from causing write
|
||||
unavailability.
|
||||
|
||||
**Metrics.** The following per-shard metrics track write consistency level
|
||||
guardrail violations:
|
||||
|
||||
* ``scylla_cql_write_consistency_levels_warned_violations``
|
||||
* ``scylla_cql_write_consistency_levels_disallowed_violations``
|
||||
|
||||
Additionally, ScyllaDB exposes the
|
||||
``scylla_cql_writes_per_consistency_level`` metric, labeled by consistency
|
||||
level, which tracks the total number of write requests per CL. This metric is
|
||||
useful for understanding the current write-CL distribution across the cluster
|
||||
*before* deciding which levels to warn on or disallow. For example, querying
|
||||
this metric can reveal whether any application is inadvertently using ``ANY``
|
||||
or ``ALL`` for writes.
|
||||
|
||||
.. _guardrails-compact-storage:
|
||||
|
||||
Compact Storage Guardrail
|
||||
-------------------------
|
||||
|
||||
``enable_create_table_with_compact_storage``
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This boolean parameter controls whether ``CREATE TABLE`` statements with the
|
||||
deprecated ``COMPACT STORAGE`` option are allowed. Unlike the other guardrails,
|
||||
it acts as a simple on/off switch rather than using separate warn and fail
|
||||
thresholds.
|
||||
|
||||
**When to use.** Leave this at the default (``false``) for all new
|
||||
deployments. ``COMPACT STORAGE`` is a legacy feature that will be permanently
|
||||
removed in a future version of ScyllaDB. Set to ``true`` only if you have a specific,
|
||||
temporary need to create compact storage tables (e.g., compatibility with legacy
|
||||
applications during a migration). For details on the ``COMPACT STORAGE`` option, see
|
||||
:ref:`Compact Tables <compact-tables>` in the Data Definition documentation.
|
||||
|
||||
Additional References
|
||||
---------------------
|
||||
|
||||
* :doc:`Consistency Level </cql/consistency>`
|
||||
* :doc:`Data Definition (CREATE/ALTER KEYSPACE) </cql/ddl>`
|
||||
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
|
||||
* :doc:`Metrics Reference </reference/metrics>`
|
||||
@@ -17,6 +17,7 @@ CQL Reference
|
||||
secondary-indexes
|
||||
time-to-live
|
||||
functions
|
||||
guardrails
|
||||
wasm
|
||||
json
|
||||
mv
|
||||
@@ -46,6 +47,7 @@ It allows you to create keyspaces and tables, insert and query tables, and more.
|
||||
* :doc:`Data Types </cql/types>`
|
||||
* :doc:`Definitions </cql/definitions>`
|
||||
* :doc:`Global Secondary Indexes </cql/secondary-indexes>`
|
||||
* :doc:`CQL Guardrails </cql/guardrails>`
|
||||
* :doc:`Expiring Data with Time to Live (TTL) </cql/time-to-live>`
|
||||
* :doc:`Functions </cql/functions>`
|
||||
* :doc:`JSON Support </cql/json>`
|
||||
|
||||
124
docs/dev/logstor.md
Normal file
124
docs/dev/logstor.md
Normal file
@@ -0,0 +1,124 @@
|
||||
# Logstor
|
||||
|
||||
## Introduction
|
||||
|
||||
Logstor is a log-structured storage engine for ScyllaDB optimized for key-value workloads. It provides an alternative storage backend for key-value tables - tables with a partition key only, with no clustering columns.
|
||||
|
||||
Unlike the traditional LSM-tree based storage, logstor uses a log-structured approach with in-memory indexing, making it particularly suitable for workloads with frequent overwrites and point lookups.
|
||||
|
||||
## Architecture
|
||||
|
||||
Logstor consists of several key components:
|
||||
|
||||
### Components
|
||||
|
||||
#### Primary Index
|
||||
|
||||
The primary index is entirely in memory and it maps a partition key to its location in the log segments. It consists of one B-tree per table, ordered by token.
|
||||
|
||||
#### Segment Manager
|
||||
|
||||
The `segment_manager` handles the allocation and management of fixed-size segments (default 128KB). Segments are grouped into large files (default 32MB). Key responsibilities include:
|
||||
|
||||
- **Segment allocation**: Provides segments for writing new data
|
||||
- **Space reclamation**: Tracks free space in each segment
|
||||
- **Compaction**: Copies live data from sparse segments to reclaim space
|
||||
- **Recovery**: Scans segments on startup to rebuild the index
|
||||
- **Separator**: Rewrites segments that have records from different compaction groups into new segments that are separated by compaction group.
|
||||
|
||||
The data in the segments consists of records of type `log_record`. Each record contains the value for some key as a `canonical_mutation` and additional metadata.
|
||||
|
||||
The `segment_manager` receives new writes via a `write_buffer` and writes them sequentially to the active segment with 4k-block alignment.
|
||||
|
||||
#### Write Buffer
|
||||
|
||||
The `write_buffer` manages a buffer of log records and handles the serialization of the records including headers and alignment. It can be used to write multiple records to the buffer and then write the buffer to the segment manager.
|
||||
|
||||
The `buffered_writer` manages multiple write buffers for user writes, an active buffer and multiple flushing ones, to batch writes and manage backpressure.
|
||||
|
||||
### Data Flow
|
||||
|
||||
**Write Path:**
|
||||
1. Application writes mutation to logstor
|
||||
2. Mutation is converted to a log record
|
||||
3. Record is written to write buffer
|
||||
4. The buffer is switched and written to the active segment.
|
||||
5. Index is updated with new record locations
|
||||
6. Old record locations (for overwrites) are marked as free
|
||||
|
||||
**Read Path:**
|
||||
1. Application requests data for a partition key
|
||||
2. Index lookup returns record location
|
||||
3. Segment manager reads record from disk
|
||||
4. Record is deserialized into a mutation and returned
|
||||
|
||||
**Separator:**
|
||||
1. When a record is written to the active segment, it is also written to its compaction group's separator buffer. The separator buffer holds a reference to the original segment.
|
||||
2. The separator buffer is flushed when it's full, or when it is requested to flush for other reasons. It is written into a new segment in the compaction group, and it updates the location of the records from the original mixed segments to the new segments in the compaction group.
|
||||
3. After the separator buffer is flushed and all records from the original segment are moved, it releases the reference of the segment. When there are no more references to the segment, it is freed.
|
||||
|
||||
**Compaction:**
|
||||
1. The amount of live data is tracked for each segment in its segment_descriptor. The segment descriptors are stored in a histogram by live data.
|
||||
2. A segment set from a single compaction group is submitted for compaction.
|
||||
3. Compaction picks segments for compaction from the segment set. It chooses segments with the lowest utilization such that compacting them results in a net gain of free segments.
|
||||
4. It reads the segments, finding all live records and writing them into a write buffer. When the buffer is full, it is flushed into a new segment, and for each record the index location is updated to the new location.
|
||||
5. After all live records are rewritten the old segments are freed.
|
||||
|
||||
## Usage
|
||||
|
||||
### Enabling Logstor
|
||||
|
||||
To use logstor, enable it in the configuration:
|
||||
|
||||
```yaml
|
||||
enable_logstor: true
|
||||
|
||||
experimental_features:
|
||||
- logstor
|
||||
```
|
||||
|
||||
### Creating Tables
|
||||
|
||||
Tables using logstor must have no clustering columns, and must be created with the `storage_engine` property equal to 'logstor':
|
||||
|
||||
```cql
|
||||
CREATE TABLE keyspace.user_profiles (
|
||||
user_id uuid PRIMARY KEY,
|
||||
name text,
|
||||
email text,
|
||||
metadata frozen<map<text, text>>
|
||||
) WITH storage_engine = 'logstor';
|
||||
```
|
||||
|
||||
### Basic Operations
|
||||
|
||||
**Insert/Update:**
|
||||
|
||||
```cql
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'value1');
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (2, 'value2');
|
||||
|
||||
-- Overwrite with new value
|
||||
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'updated_value');
|
||||
```
|
||||
|
||||
Currently, updates must write the full row. Updating individual columns is not yet supported. Each write replaces the entire partition.
|
||||
|
||||
**Select:**
|
||||
|
||||
```cql
|
||||
SELECT * FROM keyspace.table_name WHERE pk = 1;
|
||||
-- Returns: (1, 'updated_value')
|
||||
|
||||
SELECT pk, v FROM keyspace.table_name WHERE pk = 2;
|
||||
-- Returns: (2, 'value2')
|
||||
|
||||
SELECT * FROM keyspace.table_name;
|
||||
-- Returns: (1, 'updated_value'), (2, 'value2')
|
||||
```
|
||||
|
||||
**Delete:**
|
||||
|
||||
```cql
|
||||
DELETE FROM keyspace.table_name WHERE pk = 1;
|
||||
```
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can to install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
sudo yum install scylla-2025.3.1
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,11 +36,8 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -50,20 +47,4 @@ you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Enterprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -57,12 +57,11 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
.. note::
|
||||
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -172,6 +172,7 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature logstor { *this, "LOGSTOR"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
|
||||
|
||||
@@ -48,6 +48,7 @@ set(idl_headers
|
||||
messaging_service.idl.hh
|
||||
paxos.idl.hh
|
||||
raft.idl.hh
|
||||
raft_util.idl.hh
|
||||
raft_storage.idl.hh
|
||||
group0.idl.hh
|
||||
hinted_handoff.idl.hh
|
||||
@@ -55,6 +56,7 @@ set(idl_headers
|
||||
storage_proxy.idl.hh
|
||||
storage_service.idl.hh
|
||||
strong_consistency/state_machine.idl.hh
|
||||
logstor.idl.hh
|
||||
group0_state_machine.idl.hh
|
||||
mapreduce_request.idl.hh
|
||||
replica_exception.idl.hh
|
||||
|
||||
28
idl/logstor.idl.hh
Normal file
28
idl/logstor.idl.hh
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "idl/frozen_schema.idl.hh"
|
||||
#include "idl/token.idl.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
|
||||
namespace replica {
|
||||
namespace logstor {
|
||||
|
||||
// Lookup key for logstor's in-memory primary index: wraps the
// partition's decorated key (token + partition key).
struct primary_index_key {
    dht::decorated_key dk;
};
|
||||
|
||||
// Serialized form of a single logstor write, as stored in log segments.
// Holds the partition's value as a canonical_mutation plus metadata
// identifying the partition and owning table.
class log_record {
    // Partition this record belongs to (decorated key).
    replica::logstor::primary_index_key key;
    // Record generation; presumably orders writes to the same key — confirm
    // against the segment-manager implementation.
    replica::logstor::record_generation generation;
    // Table that owns the partition.
    table_id table;
    // Full partition content carried by this write.
    canonical_mutation mut;
};
|
||||
|
||||
}
|
||||
}
|
||||
3
init.cc
3
init.cc
@@ -96,6 +96,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
|
||||
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::LOGSTOR)) {
|
||||
disabled.insert("LOGSTOR"s);
|
||||
}
|
||||
if (!cfg.table_digest_insensitive_to_expiry()) {
|
||||
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
|
||||
}
|
||||
|
||||
@@ -531,6 +531,11 @@ tablet_id tablet_map::get_tablet_id(token t) const {
|
||||
return tablet_id(dht::compaction_group_of(_log2_tablets, t));
|
||||
}
|
||||
|
||||
tablet_range_side tablet_map::get_tablet_range_side(token t) const {
    // Compute the tablet id at one extra split level (_log2_tablets + 1):
    // its least significant bit selects which half of the current
    // tablet's token range contains t.
    const auto post_split_id = dht::compaction_group_of(_log2_tablets + 1, t);
    return tablet_range_side(post_split_id & 1);
}
|
||||
|
||||
std::pair<tablet_id, tablet_range_side> tablet_map::get_tablet_id_and_range_side(token t) const {
|
||||
auto id_after_split = dht::compaction_group_of(_log2_tablets + 1, t);
|
||||
auto current_id = id_after_split >> 1;
|
||||
|
||||
@@ -611,6 +611,10 @@ public:
|
||||
/// Returns tablet_id of a tablet which owns a given token.
|
||||
tablet_id get_tablet_id(token) const;
|
||||
|
||||
// Returns the side of the tablet's range that a given token belongs to.
|
||||
// Less expensive than get_tablet_id_and_range_side() when tablet_id is already known.
|
||||
tablet_range_side get_tablet_range_side(token) const;
|
||||
|
||||
// Returns tablet_id and also the side of the tablet's range that a given token belongs to.
|
||||
std::pair<tablet_id, tablet_range_side> get_tablet_id_and_range_side(token) const;
|
||||
|
||||
|
||||
9
main.cc
9
main.cc
@@ -19,8 +19,6 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "auth/allow_all_authenticator.hh"
|
||||
#include "auth/allow_all_authorizer.hh"
|
||||
#include "auth/maintenance_socket_authenticator.hh"
|
||||
#include "auth/maintenance_socket_role_manager.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
@@ -1964,6 +1962,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
checkpoint(stop_signal, "loading non-system sstables");
|
||||
replica::distributed_loader::init_non_system_keyspaces(db, proxy, sys_ks).get();
|
||||
|
||||
checkpoint(stop_signal, "recovering logstor");
|
||||
db.invoke_on_all([] (replica::database& db) {
|
||||
return db.recover_logstor();
|
||||
}).get();
|
||||
|
||||
// Depends on all keyspaces being initialized because after this call
|
||||
// we can be reloading schema.
|
||||
mm.local().register_feature_listeners();
|
||||
@@ -2102,7 +2105,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
if (cfg->maintenance_socket() != "ignore") {
|
||||
checkpoint(stop_signal, "starting maintenance auth service");
|
||||
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client),
|
||||
auth::make_authorizer_factory(auth::allow_all_authorizer_name, qp),
|
||||
auth::make_maintenance_socket_authorizer_factory(qp),
|
||||
auth::make_maintenance_socket_authenticator_factory(qp, group0_client, mm, auth_cache),
|
||||
auth::make_maintenance_socket_role_manager_factory(qp, group0_client, mm, auth_cache),
|
||||
maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -8,9 +8,10 @@
|
||||
|
||||
"""exec_cql.py
|
||||
Execute CQL statements from a file where each non-empty, non-comment line is exactly one CQL statement.
|
||||
Connects via a Unix domain socket (maintenance socket), bypassing authentication.
|
||||
Requires python cassandra-driver. Stops at first failure.
|
||||
Usage:
|
||||
./exec_cql.py --file ./conf/auth.cql [--host 127.0.0.1 --port 9042]
|
||||
./exec_cql.py --file ./conf/auth.cql --socket /path/to/cql.m
|
||||
"""
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
@@ -26,18 +27,27 @@ def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms.append((lineno, line))
|
||||
return stms
|
||||
|
||||
def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout: float, username: str, password: str) -> int:
|
||||
def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout: float) -> int:
|
||||
"""Execute CQL statements via a Unix domain socket (maintenance socket).
|
||||
|
||||
The maintenance socket only starts listening after the auth subsystem is
|
||||
fully initialised, so a successful connect means the node is ready.
|
||||
"""
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.connection import UnixSocketEndPoint # type: ignore
|
||||
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
|
||||
|
||||
ep = UnixSocketEndPoint(socket_path)
|
||||
try:
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.auth import PlainTextAuthProvider # type: ignore
|
||||
except Exception:
|
||||
print('ERROR: cassandra-driver not installed. Install with: pip install cassandra-driver', file=sys.stderr)
|
||||
cluster = Cluster(
|
||||
contact_points=[ep],
|
||||
load_balancing_policy=WhiteListRoundRobinPolicy([ep]),
|
||||
)
|
||||
session = cluster.connect()
|
||||
except Exception as e:
|
||||
print(f'ERROR: failed to connect to maintenance socket {socket_path}: {e}', file=sys.stderr)
|
||||
return 2
|
||||
auth_provider = None
|
||||
if username != "":
|
||||
auth_provider = PlainTextAuthProvider(username=username, password=password)
|
||||
cluster = Cluster([host], port=port, auth_provider=auth_provider)
|
||||
session = cluster.connect()
|
||||
|
||||
try:
|
||||
for _, (lineno, s) in enumerate(statements, 1):
|
||||
try:
|
||||
@@ -50,13 +60,11 @@ def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file (driver only)')
|
||||
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file via maintenance socket')
|
||||
ap.add_argument('--file', required=True)
|
||||
ap.add_argument('--host', default='127.0.0.1')
|
||||
ap.add_argument('--port', type=int, default=9042)
|
||||
ap.add_argument('--socket', required=True,
|
||||
help='Path to the Unix domain maintenance socket (<workdir>/cql.m)')
|
||||
ap.add_argument('--timeout', type=float, default=30.0)
|
||||
ap.add_argument('--username', default='cassandra')
|
||||
ap.add_argument('--password', default='cassandra')
|
||||
args = ap.parse_args(argv)
|
||||
if not os.path.isfile(args.file):
|
||||
print(f"File not found: {args.file}", file=sys.stderr)
|
||||
@@ -65,7 +73,7 @@ def main(argv: Sequence[str]) -> int:
|
||||
if not stmts:
|
||||
print('No statements found', file=sys.stderr)
|
||||
return 1
|
||||
rc = exec_driver(stmts, args.host, args.port, args.timeout, args.username, args.password)
|
||||
rc = exec_statements(stmts, args.socket, args.timeout)
|
||||
if rc == 0:
|
||||
print('All statements executed successfully')
|
||||
return rc
|
||||
|
||||
34
pgo/pgo.py
34
pgo/pgo.py
@@ -452,6 +452,28 @@ async def merge_profraw(directory: PathLike) -> None:
|
||||
if glob.glob(f"{directory}/*.profraw"):
|
||||
await bash(fr"llvm-profdata merge {q(directory)}/*.profraw -output {q(directory)}/prof.profdata")
|
||||
|
||||
def maintenance_socket_path(cluster_workdir: PathLike, addr: str) -> str:
|
||||
"""Returns the absolute path of the maintenance socket for a given node.
|
||||
|
||||
With ``maintenance_socket: workdir`` in scylla.yaml the socket lives at
|
||||
``<node-workdir>/cql.m``, i.e. ``<cluster_workdir>/<addr>/cql.m``.
|
||||
"""
|
||||
return os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
|
||||
|
||||
async def setup_cassandra_user(workdir: PathLike, addr: str) -> None:
|
||||
"""Create the ``cassandra`` superuser via the maintenance socket.
|
||||
|
||||
The default cassandra superuser is no longer seeded automatically, but
|
||||
``cassandra-stress`` hardcodes ``user=cassandra password=cassandra``.
|
||||
We create the role over the maintenance socket so that cassandra-stress
|
||||
and other tools that rely on the default credentials keep working.
|
||||
"""
|
||||
socket = maintenance_socket_path(workdir, addr)
|
||||
stmt = "CREATE ROLE cassandra WITH PASSWORD = 'cassandra' AND SUPERUSER = true AND LOGIN = true;"
|
||||
f = q(socket)
|
||||
# Write the statement to a temp file and execute it via exec_cql.py.
|
||||
await bash(fr"""tmpf=$(mktemp); echo {q(stmt)} > "$tmpf"; python3 ./exec_cql.py --file "$tmpf" --socket {f}; rc=$?; rm -f "$tmpf"; exit $rc""")
|
||||
|
||||
async def get_bolt_opts(executable: PathLike) -> list[str]:
|
||||
"""Returns the extra opts which have to be passed to a BOLT-instrumented Scylla
|
||||
to trigger a generation of a BOLT profile file.
|
||||
@@ -557,8 +579,10 @@ def kw(**kwargs):
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def with_cs_populate(executable: PathLike, workdir: PathLike) -> AsyncIterator[str]:
|
||||
"""Provides a Scylla cluster and waits for compactions to end before stopping it."""
|
||||
"""Provides a Scylla cluster, creates the cassandra superuser, and waits
|
||||
for compactions to end before stopping it."""
|
||||
async with with_cluster(executable=executable, workdir=workdir) as (addrs, procs):
|
||||
await setup_cassandra_user(workdir, addrs[0])
|
||||
yield addrs[0]
|
||||
async with asyncio.timeout(3600):
|
||||
# Should it also flush memtables?
|
||||
@@ -667,9 +691,10 @@ populators["decommission_dataset"] = populate_decommission
|
||||
# AUTH CONNECTIONS STRESS ==================================================
|
||||
|
||||
async def populate_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Create roles, table and permissions via CQL script.
|
||||
# Create roles, table and permissions via CQL script over the maintenance socket.
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --host {server}")
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --socket {q(socket)}")
|
||||
|
||||
async def train_auth_conns(executable: PathLike, workdir: PathLike) -> None:
|
||||
# Repeatedly connect as the reader user and perform simple reads to stress
|
||||
@@ -722,7 +747,8 @@ populators["si_dataset"] = populate_si
|
||||
|
||||
async def populate_counters(executable: PathLike, workdir: PathLike) -> None:
|
||||
async with with_cs_populate(executable=executable, workdir=workdir) as server:
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --host {server}")
|
||||
socket = maintenance_socket_path(workdir, server)
|
||||
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --socket {q(socket)}")
|
||||
# Sleeps added in reaction to schema disagreement errors.
|
||||
# FIXME: get rid of this sleep and find a sane way to wait for schema
|
||||
# agreement.
|
||||
|
||||
@@ -68,6 +68,7 @@ public:
|
||||
using resources = reader_resources;
|
||||
|
||||
friend class reader_permit;
|
||||
friend struct reader_concurrency_semaphore_tester;
|
||||
|
||||
enum class evict_reason {
|
||||
permit, // evicted due to permit shortage
|
||||
|
||||
@@ -9,6 +9,9 @@ target_sources(replica
|
||||
memtable.cc
|
||||
exceptions.cc
|
||||
dirty_memory_manager.cc
|
||||
logstor/segment_manager.cc
|
||||
logstor/logstor.cc
|
||||
logstor/write_buffer.cc
|
||||
multishard_query.cc
|
||||
mutation_dump.cc
|
||||
schema_describe_helper.cc
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
// FIXME: un-nest compaction_reenabler, so we can forward declare it and remove this include.
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
@@ -33,6 +34,10 @@ class effective_replication_map;
|
||||
|
||||
namespace replica {
|
||||
|
||||
namespace logstor {
|
||||
class primary_index;
|
||||
}
|
||||
|
||||
using enable_backlog_tracker = bool_class<class enable_backlog_tracker_tag>;
|
||||
|
||||
enum class repair_sstable_classification {
|
||||
@@ -91,6 +96,12 @@ class compaction_group {
|
||||
bool _tombstone_gc_enabled = true;
|
||||
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
|
||||
repair_classifier_func _repair_sstable_classifier;
|
||||
|
||||
lw_shared_ptr<logstor::segment_set> _logstor_segments;
|
||||
std::optional<logstor::separator_buffer> _logstor_separator;
|
||||
std::vector<future<>> _separator_flushes;
|
||||
seastar::semaphore _separator_flush_sem{1};
|
||||
|
||||
private:
|
||||
std::unique_ptr<compaction_group_view> make_compacting_view();
|
||||
std::unique_ptr<compaction_group_view> make_non_compacting_view();
|
||||
@@ -223,6 +234,7 @@ public:
|
||||
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept;
|
||||
// Triggers regular compaction.
|
||||
void trigger_compaction();
|
||||
void trigger_logstor_compaction();
|
||||
bool compaction_disabled() const;
|
||||
future<unsigned> estimate_pending_compactions() const;
|
||||
|
||||
@@ -231,6 +243,7 @@ public:
|
||||
|
||||
size_t live_sstable_count() const noexcept;
|
||||
uint64_t live_disk_space_used() const noexcept;
|
||||
size_t logstor_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats live_disk_space_used_full_stats() const noexcept;
|
||||
uint64_t total_disk_space_used() const noexcept;
|
||||
sstables::file_size_stats total_disk_space_used_full_stats() const noexcept;
|
||||
@@ -262,12 +275,37 @@ public:
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept;
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept;
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept;
|
||||
const logstor::compaction_manager& get_logstor_compaction_manager() const noexcept;
|
||||
|
||||
logstor::primary_index& get_logstor_index() noexcept;
|
||||
|
||||
future<> split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info);
|
||||
|
||||
void set_repair_sstable_classifier(repair_classifier_func repair_sstable_classifier) {
|
||||
_repair_sstable_classifier = std::move(repair_sstable_classifier);
|
||||
}
|
||||
|
||||
void add_logstor_segment(logstor::segment_descriptor& desc) {
|
||||
_logstor_segments->add_segment(desc);
|
||||
}
|
||||
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
logstor::separator_buffer& get_separator_buffer(size_t write_size);
|
||||
|
||||
logstor::segment_set& logstor_segments() noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
const logstor::segment_set& logstor_segments() const noexcept {
|
||||
return *_logstor_segments;
|
||||
}
|
||||
|
||||
friend class storage_group;
|
||||
};
|
||||
|
||||
@@ -312,7 +350,14 @@ public:
|
||||
|
||||
const compaction_group_ptr& main_compaction_group() const noexcept;
|
||||
const std::vector<compaction_group_ptr>& split_ready_compaction_groups() const;
|
||||
compaction_group_ptr& select_compaction_group(locator::tablet_range_side) noexcept;
|
||||
// Selects the compaction group for the given token. Computes the range side
|
||||
// from the token only when in splitting mode. This avoids the cost of computing
|
||||
// range side on the hot path when it's not needed.
|
||||
compaction_group_ptr& select_compaction_group(dht::token, const locator::tablet_map&) noexcept;
|
||||
// Selects the compaction group for an sstable spanning a token range.
|
||||
// If the first and last tokens fall on different sides of the split point,
|
||||
// the sstable belongs to the main compaction group.
|
||||
compaction_group_ptr& select_compaction_group(dht::token first, dht::token last, const locator::tablet_map&) noexcept;
|
||||
|
||||
uint64_t live_disk_space_used() const;
|
||||
|
||||
@@ -432,7 +477,9 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
|
||||
@@ -76,6 +76,7 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "tombstone_gc.hh"
|
||||
#include "logstor/logstor.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
|
||||
#include "replica/data_dictionary_impl.hh"
|
||||
@@ -393,6 +394,13 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
|
||||
, _system_dirty_memory_manager(*this, 10 << 20, cfg.unspooled_dirty_soft_limit(), default_scheduling_group())
|
||||
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.50, cfg.unspooled_dirty_soft_limit(), dbcfg.statement_scheduling_group)
|
||||
, _dirty_memory_threshold_controller([this] {
|
||||
if (_logstor) {
|
||||
size_t logstor_memory_usage = get_logstor_memory_usage();
|
||||
size_t available_memory = _dbcfg.available_memory > logstor_memory_usage ? _dbcfg.available_memory - logstor_memory_usage : 0;
|
||||
_dirty_memory_manager.update_threshold(available_memory * 0.50);
|
||||
}
|
||||
})
|
||||
, _dbcfg(dbcfg)
|
||||
, _memtable_controller(make_flush_controller(_cfg, _dbcfg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
|
||||
auto backlog = (_dirty_memory_manager.unspooled_dirty_memory()) / limit;
|
||||
@@ -906,6 +914,50 @@ database::init_commitlog() {
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
database::init_logstor() {
|
||||
dblog.info("Initializing logstor");
|
||||
|
||||
auto cfg = logstor::logstor_config{
|
||||
.segment_manager_cfg = {
|
||||
.base_dir = std::filesystem::path(_cfg.logstor_directory()),
|
||||
.file_size = _cfg.logstor_file_size_in_mb() * 1024ull * 1024ull,
|
||||
.disk_size = _cfg.logstor_disk_size_in_mb() * 1024ull * 1024ull,
|
||||
.compaction_sg = _dbcfg.compaction_scheduling_group,
|
||||
.compaction_static_shares = _cfg.compaction_static_shares,
|
||||
.separator_sg = _dbcfg.memtable_scheduling_group,
|
||||
.separator_delay_limit_ms = _cfg.logstor_separator_delay_limit_ms(),
|
||||
.max_separator_memory = _cfg.logstor_separator_max_memory_in_mb() * 1024ull * 1024ull,
|
||||
},
|
||||
.flush_sg = _dbcfg.commitlog_scheduling_group,
|
||||
};
|
||||
_logstor = std::make_unique<logstor::logstor>(std::move(cfg));
|
||||
|
||||
_logstor->set_trigger_compaction_hook([this] {
|
||||
trigger_logstor_compaction(false);
|
||||
});
|
||||
|
||||
_logstor->set_trigger_separator_flush_hook([this] (size_t seq_num) {
|
||||
(void)flush_logstor_separator(seq_num);
|
||||
});
|
||||
|
||||
dblog.info("logstor initialized");
|
||||
co_return;
|
||||
}
|
||||
|
||||
future<>
|
||||
database::recover_logstor() {
|
||||
if (!_logstor) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
co_await _logstor->do_recovery(*this);
|
||||
|
||||
co_await _logstor->start();
|
||||
|
||||
_dirty_memory_threshold_controller.arm_periodic(std::chrono::seconds(5));
|
||||
}
|
||||
|
||||
future<> database::modify_keyspace_on_all_shards(sharded<database>& sharded_db, std::function<future<>(replica::database&)> func) {
|
||||
// Run func first on shard 0
|
||||
// to allow "seeding" of the effective_replication_map
|
||||
@@ -1128,6 +1180,17 @@ void database::add_column_family(keyspace& ks, schema_ptr schema, column_family:
|
||||
cf->set_truncation_time(db_clock::time_point::min());
|
||||
}
|
||||
|
||||
if (schema->logstor_enabled()) {
|
||||
if (!_cfg.enable_logstor()) {
|
||||
throw std::runtime_error(fmt::format("The table {}.{} is using logstor storage but logstor is not enabled in the configuration", schema->ks_name(), schema->cf_name()));
|
||||
}
|
||||
if (!_logstor) {
|
||||
on_internal_error(dblog, "The table is using logstor but logstor is not initialized");
|
||||
}
|
||||
cf->init_logstor(_logstor.get());
|
||||
dblog.info0("Table {}.{} is using logstor storage", schema->ks_name(), schema->cf_name());
|
||||
}
|
||||
|
||||
auto uuid = schema->id();
|
||||
if (_tables_metadata.contains(uuid)) {
|
||||
throw std::invalid_argument("UUID " + uuid.to_sstring() + " already mapped");
|
||||
@@ -1699,7 +1762,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
|
||||
if (!range.is_singular()) {
|
||||
continue;
|
||||
}
|
||||
auto token = dht::token::to_int64(ranges.front().start()->value().token());
|
||||
auto token = dht::token::to_int64(range.start()->value().token());
|
||||
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
|
||||
// Don't return immediately - account all ranges first
|
||||
ret = can_proceed::no;
|
||||
@@ -2163,7 +2226,7 @@ static std::exception_ptr wrap_commitlog_add_error(const schema_ptr& s, const fr
|
||||
|
||||
future<> database::apply_with_commitlog(column_family& cf, const mutation& m, db::timeout_clock::time_point timeout) {
|
||||
db::rp_handle h;
|
||||
if (cf.commitlog() != nullptr && cf.durable_writes()) {
|
||||
if (cf.commitlog() != nullptr && cf.durable_writes() && !cf.uses_logstor()) {
|
||||
auto fm = freeze(m);
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
@@ -2212,6 +2275,10 @@ future<> database::do_apply_many(const utils::chunked_vector<frozen_mutation>& m
|
||||
auto s = local_schema_registry().get(muts[i].schema_version());
|
||||
auto&& cf = find_column_family(muts[i].column_family_id());
|
||||
|
||||
if (cf.uses_logstor()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!cl) {
|
||||
cl = cf.commitlog();
|
||||
} else if (cl != cf.commitlog()) {
|
||||
@@ -2248,16 +2315,16 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
|
||||
// assume failure until proven otherwise
|
||||
auto update_writes_failed = defer([&] { ++_stats->total_writes_failed; });
|
||||
|
||||
utils::get_local_injector().inject("database_apply", [&s] () {
|
||||
if (!is_system_keyspace(s->ks_name())) {
|
||||
throw std::runtime_error("injected error");
|
||||
co_await utils::get_local_injector().inject("database_apply", [&s] (auto& handler) -> future<> {
|
||||
if (s->ks_name() != handler.get("ks_name") || s->cf_name() != handler.get("cf_name")) {
|
||||
co_return;
|
||||
}
|
||||
});
|
||||
co_await utils::get_local_injector().inject("database_apply_wait", [&] (auto& handler) -> future<> {
|
||||
if (s->cf_name() == handler.get("cf_name")) {
|
||||
dblog.info("database_apply_wait: wait");
|
||||
if (handler.get("what") == "throw") {
|
||||
throw std::runtime_error(format("injected error for {}.{}", s->ks_name(), s->cf_name()));
|
||||
} else if (handler.get("what") == "wait") {
|
||||
dblog.info("database_apply: wait");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
|
||||
dblog.info("database_apply_wait: done");
|
||||
dblog.info("database_apply: done");
|
||||
}
|
||||
});
|
||||
|
||||
@@ -2309,7 +2376,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
|
||||
// frames.
|
||||
db::rp_handle h;
|
||||
auto cl = cf.commitlog();
|
||||
if (cl != nullptr && cf.durable_writes()) {
|
||||
if (cl != nullptr && cf.durable_writes() && !cf.uses_logstor()) {
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
commitlog_entry_writer cew(s, m, sync);
|
||||
@@ -2633,6 +2700,9 @@ future<> database::start(sharded<qos::service_level_controller>& sl_controller,
|
||||
_compaction_manager.enable();
|
||||
}
|
||||
co_await init_commitlog();
|
||||
if (_cfg.enable_logstor()) {
|
||||
co_await init_logstor();
|
||||
}
|
||||
}
|
||||
|
||||
future<> database::shutdown() {
|
||||
@@ -2673,6 +2743,11 @@ future<> database::stop() {
|
||||
co_await _commitlog->shutdown();
|
||||
dblog.info("Shutting down commitlog complete");
|
||||
}
|
||||
if (_logstor) {
|
||||
dblog.info("Shutting down logstor");
|
||||
co_await _logstor->stop();
|
||||
dblog.info("Shutting down logstor complete");
|
||||
}
|
||||
if (_schema_commitlog) {
|
||||
dblog.info("Shutting down schema commitlog");
|
||||
co_await _schema_commitlog->shutdown();
|
||||
@@ -2807,6 +2882,53 @@ future<> database::drop_cache_for_keyspace_on_all_shards(sharded<database>& shar
|
||||
});
|
||||
}
|
||||
|
||||
future<> database::trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major) {
|
||||
return sharded_db.invoke_on_all([major] (replica::database& db) {
|
||||
return db.trigger_logstor_compaction(major);
|
||||
});
|
||||
}
|
||||
|
||||
void database::trigger_logstor_compaction(bool major) {
|
||||
_tables_metadata.for_each_table([&] (table_id id, const lw_shared_ptr<table> tp) {
|
||||
if (tp->uses_logstor()) {
|
||||
tp->trigger_logstor_compaction();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
future<> database::flush_logstor_separator_on_all_shards(sharded<database>& sharded_db) {
|
||||
return sharded_db.invoke_on_all([] (replica::database& db) {
|
||||
return db.flush_logstor_separator();
|
||||
});
|
||||
}
|
||||
|
||||
future<> database::flush_logstor_separator(std::optional<size_t> seq_num) {
|
||||
return _tables_metadata.parallel_for_each_table([seq_num] (table_id, lw_shared_ptr<table> table) {
|
||||
return table->flush_separator(seq_num);
|
||||
});
|
||||
}
|
||||
|
||||
future<logstor::table_segment_stats> database::get_logstor_table_segment_stats(table_id table) const {
|
||||
return find_column_family(table).get_logstor_segment_stats();
|
||||
}
|
||||
|
||||
size_t database::get_logstor_memory_usage() const {
|
||||
if (!_logstor) {
|
||||
return 0;
|
||||
}
|
||||
size_t m = 0;
|
||||
|
||||
m += _logstor->get_memory_usage();
|
||||
|
||||
get_tables_metadata().for_each_table([&m] (table_id, lw_shared_ptr<replica::table> table) {
|
||||
if (table->uses_logstor()) {
|
||||
m += table->get_logstor_memory_usage();
|
||||
}
|
||||
});
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, table_id uuid, sstring tag, db::snapshot_options opts) {
|
||||
if (!opts.skip_flush) {
|
||||
co_await flush_table_on_all_shards(sharded_db, uuid);
|
||||
@@ -2927,6 +3049,7 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
|
||||
co_await coroutine::parallel_for_each(views, [&] (lw_shared_ptr<replica::table> v) -> future<> {
|
||||
co_await flush_or_clear(*v);
|
||||
});
|
||||
co_await cf.flush_separator();
|
||||
// Since writes could be appended to active memtable between getting low_mark above
|
||||
// and flush, the low_mark has to be adjusted to account for those writes, where
|
||||
// memtable was flushed with a higher replay position than the one obtained above.
|
||||
@@ -2968,6 +3091,8 @@ future<> database::truncate(db::system_keyspace& sys_ks, column_family& cf, std:
|
||||
dblog.debug("Discarding sstable data for truncated CF + indexes");
|
||||
// TODO: notify truncation
|
||||
|
||||
co_await cf.discard_logstor_segments();
|
||||
|
||||
db::replay_position rp = co_await cf.discard_sstables(truncated_at);
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include <seastar/core/when_all.hh>
|
||||
#include "replica/global_table_ptr.hh"
|
||||
#include "replica/logstor/compaction.hh"
|
||||
#include "types/user.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/hash.hh"
|
||||
@@ -35,6 +36,7 @@
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/commitlog/replay_position.hh"
|
||||
#include "db/commitlog/commitlog_types.hh"
|
||||
#include "logstor/logstor.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/snapshot-ctl.hh"
|
||||
@@ -544,6 +546,9 @@ private:
|
||||
utils::phased_barrier _flush_barrier;
|
||||
std::vector<view_ptr> _views;
|
||||
|
||||
logstor::logstor* _logstor = nullptr;
|
||||
std::unique_ptr<logstor::primary_index> _logstor_index;
|
||||
|
||||
std::unique_ptr<cell_locker> _counter_cell_locks; // Memory-intensive; allocate only when needed.
|
||||
|
||||
// Labels used to identify writes and reads for this table in the rate_limiter structure.
|
||||
@@ -611,6 +616,10 @@ public:
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
|
||||
|
||||
bool add_logstor_segment(logstor::segment_descriptor&, dht::token first_token, dht::token last_token);
|
||||
|
||||
logstor::separator_buffer& get_logstor_separator_buffer(dht::token token, size_t write_size);
|
||||
|
||||
// Restricted to new sstables produced by external processes such as repair.
|
||||
// The sstable might undergo split if table is in split mode.
|
||||
// If no need for split, the input sstable will only be attached to the sstable set.
|
||||
@@ -833,6 +842,21 @@ public:
|
||||
// to issue disk operations safely.
|
||||
void mark_ready_for_writes(db::commitlog* cl);
|
||||
|
||||
void init_logstor(logstor::logstor* ls);
|
||||
|
||||
bool uses_logstor() const {
|
||||
return _logstor != nullptr;
|
||||
}
|
||||
|
||||
logstor::primary_index& logstor_index() noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
const logstor::primary_index& logstor_index() const noexcept {
|
||||
return *_logstor_index;
|
||||
}
|
||||
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
// Creates a mutation reader which covers all data sources for this column family.
|
||||
// Caller needs to ensure that column_family remains live (FIXME: relax this).
|
||||
// Note: for data queries use query() instead.
|
||||
@@ -858,6 +882,14 @@ public:
|
||||
return make_mutation_reader(std::move(schema), std::move(permit), range, full_slice);
|
||||
}
|
||||
|
||||
mutation_reader make_logstor_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) const;
|
||||
|
||||
// The streaming mutation reader differs from the regular mutation reader in that:
|
||||
// - Reflects all writes accepted by replica prior to creation of the
|
||||
// reader and a _bounded_ amount of writes which arrive later.
|
||||
@@ -1047,6 +1079,7 @@ public:
|
||||
bool needs_flush() const;
|
||||
future<> clear(); // discards memtable(s) without flushing them to disk.
|
||||
future<db::replay_position> discard_sstables(db_clock::time_point);
|
||||
future<> discard_logstor_segments();
|
||||
|
||||
bool can_flush() const;
|
||||
|
||||
@@ -1098,6 +1131,7 @@ public:
|
||||
void start_compaction();
|
||||
void trigger_compaction();
|
||||
void try_trigger_compaction(compaction_group& cg) noexcept;
|
||||
void trigger_logstor_compaction();
|
||||
// Triggers offstrategy compaction, if needed, in the background.
|
||||
void trigger_offstrategy_compaction();
|
||||
// Performs offstrategy compaction, if needed, returning
|
||||
@@ -1126,6 +1160,22 @@ public:
|
||||
return _compaction_manager;
|
||||
}
|
||||
|
||||
logstor::segment_manager& get_logstor_segment_manager() noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& get_logstor_segment_manager() const noexcept {
|
||||
return _logstor->get_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& get_logstor_compaction_manager() noexcept {
|
||||
return _logstor->get_compaction_manager();
|
||||
}
|
||||
|
||||
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
|
||||
future<logstor::table_segment_stats> get_logstor_segment_stats() const;
|
||||
|
||||
table_stats& get_stats() const {
|
||||
return _stats;
|
||||
}
|
||||
@@ -1613,6 +1663,8 @@ private:
|
||||
dirty_memory_manager _system_dirty_memory_manager;
|
||||
dirty_memory_manager _dirty_memory_manager;
|
||||
|
||||
timer<lowres_clock> _dirty_memory_threshold_controller;
|
||||
|
||||
database_config _dbcfg;
|
||||
flush_controller _memtable_controller;
|
||||
drain_progress _drain_progress {};
|
||||
@@ -1655,6 +1707,8 @@ private:
|
||||
bool _enable_autocompaction_toggle = false;
|
||||
querier_cache _querier_cache;
|
||||
|
||||
std::unique_ptr<logstor::logstor> _logstor;
|
||||
|
||||
std::unique_ptr<db::large_data_handler> _large_data_handler;
|
||||
std::unique_ptr<db::large_data_handler> _nop_large_data_handler;
|
||||
|
||||
@@ -1696,6 +1750,8 @@ public:
|
||||
std::shared_ptr<data_dictionary::user_types_storage> as_user_types_storage() const noexcept;
|
||||
const data_dictionary::user_types_storage& user_types() const noexcept;
|
||||
future<> init_commitlog();
|
||||
future<> init_logstor();
|
||||
future<> recover_logstor();
|
||||
const gms::feature_service& features() const { return _feat; }
|
||||
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
future<> apply_in_memory(const mutation& m, column_family& cf, db::rp_handle&&, db::timeout_clock::time_point timeout);
|
||||
@@ -1996,6 +2052,13 @@ public:
|
||||
// a wrapper around flush_all_tables, allowing the caller to express intent more clearly
|
||||
future<> flush_commitlog() { return flush_all_tables(); }
|
||||
|
||||
static future<> trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major);
|
||||
void trigger_logstor_compaction(bool major);
|
||||
static future<> flush_logstor_separator_on_all_shards(sharded<database>& sharded_db);
|
||||
future<> flush_logstor_separator(std::optional<size_t> seq_num = std::nullopt);
|
||||
future<logstor::table_segment_stats> get_logstor_table_segment_stats(table_id table) const;
|
||||
size_t get_logstor_memory_usage() const;
|
||||
|
||||
static future<db_clock::time_point> get_all_tables_flushed_at(sharded<database>& sharded_db);
|
||||
|
||||
static future<> drop_cache_for_table_on_all_shards(sharded<database>& sharded_db, table_id id);
|
||||
|
||||
@@ -142,6 +142,16 @@ void region_group::notify_unspooled_pressure_relieved() {
|
||||
_relief.signal();
|
||||
}
|
||||
|
||||
void region_group::update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit) {
|
||||
_cfg.unspooled_hard_limit = unspooled_hard_limit;
|
||||
_cfg.unspooled_soft_limit = unspooled_soft_limit;
|
||||
_cfg.real_hard_limit = real_hard_limit;
|
||||
|
||||
// check pressure with the new limits
|
||||
update_real(0);
|
||||
update_unspooled(0);
|
||||
}
|
||||
|
||||
bool region_group::do_update_real_and_check_relief(ssize_t delta) {
|
||||
_real_total_memory += delta;
|
||||
|
||||
@@ -211,9 +221,18 @@ dirty_memory_manager::dirty_memory_manager(replica::database& db, size_t thresho
|
||||
.real_hard_limit = threshold,
|
||||
.start_reclaiming = std::bind_front(&dirty_memory_manager::start_reclaiming, this)
|
||||
}, deferred_work_sg)
|
||||
, _threshold(threshold)
|
||||
, _soft_limit(soft_limit)
|
||||
, _flush_serializer(1)
|
||||
, _waiting_flush(flush_when_needed()) {}
|
||||
|
||||
void dirty_memory_manager::update_threshold(size_t threshold) {
|
||||
if (threshold != _threshold) {
|
||||
_threshold = threshold;
|
||||
_region_group.update_limits(threshold / 2, threshold * _soft_limit / 2, threshold);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dirty_memory_manager::setup_collectd(sstring namestr) {
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
@@ -268,6 +268,8 @@ public:
|
||||
}
|
||||
void update_unspooled(ssize_t delta);
|
||||
|
||||
void update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit);
|
||||
|
||||
void increase_usage(logalloc::region* r) { // Called by memtable's region_listener
|
||||
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
|
||||
// least 1.59.
|
||||
@@ -395,6 +397,9 @@ class dirty_memory_manager {
|
||||
// memory usage minus bytes that were already written to disk.
|
||||
dirty_memory_manager_logalloc::region_group _region_group;
|
||||
|
||||
size_t _threshold;
|
||||
double _soft_limit;
|
||||
|
||||
// We would like to serialize the flushing of memtables. While flushing many memtables
|
||||
// simultaneously can sustain high levels of throughput, the memory is not freed until the
|
||||
// memtable is totally gone. That means that if we have throttled requests, they will stay
|
||||
@@ -483,6 +488,8 @@ public:
|
||||
return _region_group;
|
||||
}
|
||||
|
||||
void update_threshold(size_t threshold);
|
||||
|
||||
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
|
||||
_region_group.update_real(-delta);
|
||||
_region_group.update_unspooled(delta);
|
||||
|
||||
177
replica/logstor/compaction.hh
Normal file
177
replica/logstor/compaction.hh
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "utils/log_heap.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
constexpr log_heap_options segment_descriptor_hist_options(4 * 1024, 3, 128 * 1024);
|
||||
|
||||
struct segment_set;
|
||||
|
||||
// Per-segment bookkeeping, hooked into a log_heap histogram so compaction
// can cheaply pick the segments with the most reclaimable space.
struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options> {
    // Invariant: free_space == segment_size - net_data_size.
    // Starts at segment_size; shrinks as records are written, grows back
    // by a record's net data size when that record is freed.
    size_t free_space{0};
    size_t record_count{0};
    segment_generation seg_gen{1};
    segment_set* owner{nullptr}; // non-owning, set when added to a segment_set

    // Forget all contents: the segment becomes fully free again.
    void reset(size_t segment_size) noexcept {
        record_count = 0;
        free_space = segment_size;
    }

    // Bytes of live data currently stored in the segment.
    size_t net_data_size(size_t segment_size) const noexcept {
        return segment_size - free_space;
    }

    // Bump the generation when the segment is recycled so stale references
    // to the previous incarnation can be detected.
    void on_free_segment() noexcept {
        ++seg_gen;
    }

    // Account for `cnt` records totalling `bytes` of net data written in.
    void on_write(size_t bytes, size_t cnt = 1) noexcept {
        record_count += cnt;
        free_space -= bytes;
    }

    void on_write(log_location loc) noexcept {
        on_write(loc.size);
    }

    // Account for `cnt` records totalling `bytes` of net data released.
    void on_free(size_t bytes, size_t cnt = 1) noexcept {
        record_count -= cnt;
        free_space += bytes;
    }

    void on_free(log_location loc) noexcept {
        on_free(loc.size);
    }
};
|
||||
|
||||
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
|
||||
|
||||
struct segment_set {
|
||||
segment_descriptor_hist _segments;
|
||||
size_t _segment_count{0};
|
||||
|
||||
void add_segment(segment_descriptor& desc) {
|
||||
desc.owner = this;
|
||||
_segments.push(desc);
|
||||
++_segment_count;
|
||||
}
|
||||
|
||||
void update_segment(segment_descriptor& desc) {
|
||||
_segments.adjust_up(desc);
|
||||
}
|
||||
|
||||
void remove_segment(segment_descriptor& desc) {
|
||||
_segments.erase(desc);
|
||||
desc.owner = nullptr;
|
||||
--_segment_count;
|
||||
}
|
||||
|
||||
size_t segment_count() const noexcept {
|
||||
return _segment_count;
|
||||
}
|
||||
};
|
||||
|
||||
// Shared handle to a log segment in flight. When the last copy of a given
// ref is destroyed, exactly one of two callbacks fires: on_last_release on
// the success path, or on_failure if any holder flagged a flush failure.
class segment_ref {
    struct state {
        log_segment_id id;
        std::function<void()> on_last_release;
        std::function<void()> on_failure;
        bool flush_failure{false};
        // Runs once, when the last segment_ref sharing this state goes away.
        ~state() {
            if (!flush_failure) {
                if (on_last_release) on_last_release();
            } else {
                if (on_failure) on_failure();
            }
        }
    };
    lw_shared_ptr<state> _state;
public:
    segment_ref() = default;

    // Copyable: copying increments the shared ref count
    segment_ref(const segment_ref&) = default;
    segment_ref& operator=(const segment_ref&) = default;
    segment_ref(segment_ref&&) noexcept = default;
    segment_ref& operator=(segment_ref&&) noexcept = default;

    // Precondition: !empty() — dereferences _state unchecked.
    log_segment_id id() const noexcept { return _state->id; }
    bool empty() const noexcept { return !_state; }

    // Marks the segment as failed; visible to every holder sharing _state,
    // and switches the destructor to the on_failure path.
    void set_flush_failure() noexcept { if (_state) _state->flush_failure = true; }

private:
    friend class segment_manager_impl;
    // Only the segment manager can mint non-empty refs.
    explicit segment_ref(log_segment_id id, std::function<void()> on_last_release, std::function<void()> on_failure)
        : _state(make_lw_shared<state>(id, std::move(on_last_release), std::move(on_failure)))
    {}
};
|
||||
|
||||
// A write_buffer staged by the separator, together with the bookkeeping
// needed for its eventual flush: futures for pending index updates and
// segment refs pinning the source segments until the data is durable.
struct separator_buffer {
    write_buffer* buf; // non-owning; NOTE(review): lifetime presumably managed by the compaction manager — confirm
    utils::chunked_vector<future<>> pending_updates;
    utils::chunked_vector<segment_ref> held_segments;
    std::optional<size_t> min_seq_num;
    bool flushed{false};

    separator_buffer(write_buffer* wb)
        : buf(wb)
    {}

    // If destroyed with unflushed data still in the buffer, flag every
    // pinned segment as failed so their on_failure callbacks run instead
    // of the normal release path.
    ~separator_buffer() {
        if (!flushed && buf && buf->has_data()) {
            for (auto& seg_ref : held_segments) {
                seg_ref.set_flush_failure();
            }
        }
    }

    // Move-only: the held segment refs and pending futures must not be
    // duplicated.
    separator_buffer(const separator_buffer&) = delete;
    separator_buffer& operator=(const separator_buffer&) = delete;

    separator_buffer(separator_buffer&&) noexcept = default;
    separator_buffer& operator=(separator_buffer&&) noexcept = default;

    // Delegates to the underlying write_buffer.
    future<log_location_with_holder> write(log_record_writer writer) {
        return buf->write(std::move(writer));
    }

    bool can_fit(const log_record_writer& writer) const noexcept {
        return buf->can_fit(writer);
    }

    bool can_fit(size_t write_size) const noexcept {
        return buf->can_fit(write_size);
    }
};
|
||||
|
||||
// Abstract interface through which logstor code drives compaction.
// Implemented by the segment manager's impl (segment_manager.cc).
class compaction_manager {
public:
    virtual ~compaction_manager() = default;

    // Obtain a fresh buffer for the separator to stage records into.
    virtual separator_buffer allocate_separator_buffer() = 0;

    // Durably write a staged buffer on behalf of the given compaction group.
    virtual future<> flush_separator_buffer(separator_buffer, replica::compaction_group&) = 0;

    // Request a compaction run for the group.
    virtual void submit(replica::compaction_group&) = 0;

    // Cancel and wait out any compactions running for the group.
    virtual future<> stop_ongoing_compactions(replica::compaction_group&) = 0;
};
|
||||
|
||||
}
|
||||
167
replica/logstor/index.hh
Normal file
167
replica/logstor/index.hh
Normal file
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "dht/decorated_key.hh"
|
||||
#include "dht/ring_position.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/bptree.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/phased_barrier.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// One key -> index_entry mapping, stored intrusively inside the
// double_decker container of primary_index. Move-only because it is
// hooked into that container.
class primary_index_entry {
    dht::decorated_key _key;
    index_entry _e;
    // Packed container-position flags; head/tail/train are bucket-layout
    // markers used by the double_decker container.
    struct {
        bool _head : 1;
        bool _tail : 1;
        bool _train : 1;
    } _flags{};
public:
    primary_index_entry(dht::decorated_key key, index_entry e)
        : _key(std::move(key))
        , _e(std::move(e))
    { }

    primary_index_entry(primary_index_entry&&) noexcept = default;

    bool is_head() const noexcept { return _flags._head; }
    void set_head(bool v) noexcept { _flags._head = v; }
    bool is_tail() const noexcept { return _flags._tail; }
    void set_tail(bool v) noexcept { _flags._tail = v; }
    bool with_train() const noexcept { return _flags._train; }
    void set_train(bool v) noexcept { _flags._train = v; }

    const dht::decorated_key& key() const noexcept { return _key; }
    const index_entry& entry() const noexcept { return _e; }

    friend class primary_index;

    // Used by double_decker to order entries by ring position.
    friend dht::ring_position_view ring_position_view_to_compare(const primary_index_entry& e) { return e._key; }
};
|
||||
|
||||
// In-memory mapping from decorated key to the log location of the newest
// record for that key, backed by a token-bucketed B+-tree (double_decker).
class primary_index final {
public:
    using partitions_type = double_decker<int64_t, primary_index_entry,
        dht::raw_token_less_comparator, dht::ring_position_comparator,
        16, bplus::key_search::linear>;
private:
    partitions_type _partitions;
    schema_ptr _schema;     // used to construct ring-position comparators
    size_t _key_count = 0;  // number of live entries (stats / memory accounting)

    // Tracks in-flight reads so mutators can wait them out.
    mutable utils::phased_barrier _reads_phaser{"logstor_primary_index"};

public:
    explicit primary_index(schema_ptr schema)
        : _partitions(dht::raw_token_less_comparator{})
        , _schema(std::move(schema))
    {}

    void set_schema(schema_ptr s) {
        _schema = std::move(s);
    }

    // Drop all entries; does not touch on-disk state.
    void clear() {
        _partitions.clear();
        _key_count = 0;
    }

    // Readers hold the returned operation for the duration of their read;
    // see await_pending_reads().
    utils::phased_barrier::operation start_read() const {
        return _reads_phaser.start();
    }

    // Resolves once all reads started before this call have finished.
    future<> await_pending_reads() {
        return _reads_phaser.advance_and_await();
    }

    // Point lookup; copies the small entry (location + generation) out.
    std::optional<index_entry> get(const primary_index_key& key) const {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end()) {
            return it->_e;
        }
        return std::nullopt;
    }

    // Insert-or-replace. Returns the replaced entry (if any) so the caller
    // can free the old record's storage.
    std::optional<index_entry> exchange(const primary_index_key& key, index_entry new_entry) {
        partitions_type::bound_hint hint;
        auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
        if (hint.match) {
            auto old_entry = i->_e;
            i->_e = std::move(new_entry);
            return old_entry;
        } else {
            _partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
            ++_key_count;
            return std::nullopt;
        }
    }

    // Compare-and-swap on the stored location: relocate only if the entry
    // still points at old_location (i.e. was not overwritten meanwhile).
    bool update_record_location(const primary_index_key& key, log_location old_location, log_location new_location) {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end()) {
            if (it->_e.location == old_location) {
                it->_e.location = new_location;
                return true;
            }
        }
        return false;
    }

    // Insert, or replace only when new_entry carries a strictly newer
    // generation (wraparound-aware; see generation_base). Returns
    // {applied, previous entry if one existed}.
    std::pair<bool, std::optional<index_entry>> insert_if_newer(const primary_index_key& key, index_entry new_entry) {
        partitions_type::bound_hint hint;
        auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
        if (hint.match) {
            if (i->_e.generation < new_entry.generation) {
                auto old_entry = i->_e;
                i->_e = std::move(new_entry);
                return {true, std::make_optional(old_entry)};
            } else {
                return {false, std::make_optional(i->_e)};
            }
        } else {
            _partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
            ++_key_count;
            return {true, std::nullopt};
        }
    }

    // Erase only if the entry still points at loc (guards against racing
    // overwrites). Returns whether an entry was removed.
    bool erase(const primary_index_key& key, log_location loc) {
        auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
        if (it != _partitions.end() && it->_e.location == loc) {
            it.erase(dht::raw_token_less_comparator{});
            --_key_count;
            return true;
        }
        return false;
    }

    auto begin() const noexcept { return _partitions.begin(); }
    auto end() const noexcept { return _partitions.end(); }

    bool empty() const noexcept { return _partitions.empty(); }

    size_t get_key_count() const noexcept { return _key_count; }

    // NOTE(review): counts only the index_entry payloads; excludes keys and
    // tree-node overhead — confirm this is the intended accounting.
    size_t get_memory_usage() const noexcept { return _key_count * sizeof(index_entry); }

    // First entry with key >= pos (for positioning at range start)
    partitions_type::const_iterator lower_bound(const dht::ring_position_view& pos) const {
        return _partitions.lower_bound(pos, dht::ring_position_comparator(*_schema));
    }

    // First entry with key strictly > key (for advancing past a key after a yield)
    partitions_type::const_iterator upper_bound(const dht::decorated_key& key) const {
        return _partitions.upper_bound(key, dht::ring_position_comparator(*_schema));
    }

};
|
||||
|
||||
}
|
||||
297
replica/logstor/logstor.cc
Normal file
297
replica/logstor/logstor.cc
Normal file
@@ -0,0 +1,297 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "replica/logstor/logstor.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "readers/from_mutations.hh"
|
||||
#include "keys/keys.hh"
|
||||
#include "replica/logstor/segment_manager.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include "utils/managed_bytes.hh"
|
||||
#include <openssl/ripemd.h>
|
||||
#include <openssl/evp.h>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
seastar::logger logstor_logger("logstor");
|
||||
|
||||
// Wire the write buffer to the segment manager; nothing is started yet
// (see start()).
logstor::logstor(logstor_config config)
    : _segment_manager(config.segment_manager_cfg)
    , _write_buffer(_segment_manager, config.flush_sg) {
}

// Rebuild on-disk segment/index state after a restart; call before start().
future<> logstor::do_recovery(replica::database& db) {
    co_await _segment_manager.do_recovery(db);
}

// Start order matters: segments must be available before the write buffer
// begins accepting data.
future<> logstor::start() {
    logstor_logger.info("Starting logstor");

    co_await _segment_manager.start();
    co_await _write_buffer.start();

    logstor_logger.info("logstor started");
}

// Reverse of start(): drain the write buffer before tearing down segments.
future<> logstor::stop() {
    logstor_logger.info("Stopping logstor");

    co_await _write_buffer.stop();
    co_await _segment_manager.stop();

    logstor_logger.info("logstor stopped");
}

// Bytes of memory attributed to logstor (segment-manager accounting only).
size_t logstor::get_memory_usage() const {
    return _segment_manager.get_memory_usage();
}
|
||||
|
||||
// Append a mutation for compaction group `cg` and publish it in the group's
// primary index. `cg_holder` keeps the group alive across the async write.
future<> logstor::write(const mutation& m, compaction_group& cg, seastar::gate::holder cg_holder) {
    primary_index_key key(m.decorated_key());
    table_id table = m.schema()->id();
    auto& index = cg.get_logstor_index();

    // TODO ?
    // Next generation for this key: one past the currently indexed record,
    // or 1 for a brand-new key.
    record_generation gen = index.get(key)
        .transform([](const index_entry& entry) {
            return entry.generation + 1;
        }).value_or(record_generation(1));

    log_record record {
        .key = key,
        .generation = gen,
        .table = table,
        .mut = canonical_mutation(m)
    };

    // NOTE(review): the continuation captures `index` by reference; the gate
    // holder `op` returned by the write buffer presumably keeps cg (and thus
    // the index) alive until the continuation runs — confirm.
    return _write_buffer.write(std::move(record), &cg, std::move(cg_holder)).then_unpack([this, &index, gen, key = std::move(key)]
            (log_location location, seastar::gate::holder op) {
        index_entry new_entry {
            .location = location,
            .generation = gen,
        };

        auto old_entry = index.exchange(key, std::move(new_entry));

        // If overwriting, free old record
        if (old_entry) {
            _segment_manager.free_record(old_entry->location);
        }
    }).handle_exception([] (std::exception_ptr ep) {
        // Log, then rethrow to the caller.
        logstor_logger.error("Error writing mutation: {}", ep);
        return make_exception_future<>(ep);
    });
}
|
||||
|
||||
// Look up `key` in `index` and read its record from the log.
// Returns nullopt when the key is not indexed. The phased-barrier
// operation is held until the disk read completes so the index is not
// torn down underneath the read.
future<std::optional<log_record>> logstor::read(const primary_index& index, primary_index_key key) {
    auto op = index.start_read();

    auto entry_opt = index.get(key);
    if (!entry_opt.has_value()) {
        return make_ready_future<std::optional<log_record>>(std::nullopt);
    }

    const auto& entry = *entry_opt;

    // `op` is moved into the continuation to extend the read phase.
    return _segment_manager.read(entry.location).then([key = std::move(key), op = std::move(op)] (log_record record) {
        return std::optional<log_record>(std::move(record));
    }).handle_exception([] (std::exception_ptr ep) {
        // Log, then rethrow to the caller.
        logstor_logger.error("Error reading record: {}", ep);
        return make_exception_future<std::optional<log_record>>(ep);
    });
}
|
||||
|
||||
// Read the canonical mutation stored for `dk`, verifying that the record
// on disk actually carries the requested partition key (corruption guard).
// NOTE(review): captures `dk` by reference — the caller must keep it alive
// until the returned future resolves.
future<std::optional<canonical_mutation>> logstor::read(const schema& s, const primary_index& index, const dht::decorated_key& dk) {
    primary_index_key key(dk);
    return read(index, key).then([&dk] (std::optional<log_record> record_opt) -> std::optional<canonical_mutation> {
        if (!record_opt.has_value()) {
            return std::nullopt;
        }

        auto& record = *record_opt;

        // A mismatch means the index pointed at the wrong record.
        if (record.mut.key() != dk.key()) [[unlikely]] {
            throw std::runtime_error(fmt::format(
                "Key mismatch reading log entry: expected {}, got {}",
                dk.key(), record.mut.key()
            ));
        }

        return std::optional<canonical_mutation>(std::move(record.mut));
    });
}
|
||||
|
||||
// Plain delegating accessors for the owned subcomponents.
segment_manager& logstor::get_segment_manager() noexcept {
    return _segment_manager;
}

const segment_manager& logstor::get_segment_manager() const noexcept {
    return _segment_manager;
}

compaction_manager& logstor::get_compaction_manager() noexcept {
    return _segment_manager.get_compaction_manager();
}

const compaction_manager& logstor::get_compaction_manager() const noexcept {
    return _segment_manager.get_compaction_manager();
}
|
||||
|
||||
// Build a mutation reader that walks the partitions of `index` within `pr`
// in ring order, reading each partition's mutation from the log on demand.
mutation_reader logstor::make_reader(schema_ptr schema,
        const primary_index& index,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {

    class logstor_range_reader : public mutation_reader::impl {
        logstor* _logstor;
        const primary_index& _index;
        dht::partition_range _pr;
        query::partition_slice _slice;
        tracing::trace_state_ptr _trace_state;
        std::optional<dht::decorated_key> _last_key; // owns the key, safe across yields
        mutation_reader_opt _current_partition_reader;
        dht::ring_position_comparator _cmp;

        // Finds the next iterator to process, safe to call after any co_await
        // (iterators are re-derived from _last_key, never held across yields).
        primary_index::partitions_type::const_iterator find_next() const {
            auto it = _last_key
                ? _index.upper_bound(*_last_key) // strictly after last key
                : position_at_range_start(); // initial positioning
            // If start was exclusive and we haven't yet seen a key
            return it;
        }

        // Iterator at the first key inside _pr, honoring an exclusive start bound.
        primary_index::partitions_type::const_iterator position_at_range_start() const {
            if (!_pr.start()) {
                return _index.begin();
            }
            auto it = _index.lower_bound(_pr.start()->value());
            if (!_pr.start()->is_inclusive() && it != _index.end()) {
                if (_cmp(it->key(), _pr.start()->value()) == 0) {
                    ++it;
                }
            }
            return it;
        }

        // True when `e` lies past the end bound of _pr.
        bool exceeds_range_end(const primary_index_entry& e) const {
            if (!_pr.end()) return false;
            auto c = _cmp(e.key(), _pr.end()->value());
            return _pr.end()->is_inclusive() ? c > 0 : c >= 0;
        }

    public:
        logstor_range_reader(schema_ptr s, const primary_index& idx, reader_permit p,
                logstor* ls, dht::partition_range pr,
                query::partition_slice slice, tracing::trace_state_ptr ts)
            : impl(std::move(s), std::move(p))
            , _logstor(ls), _index(idx), _pr(std::move(pr))
            , _slice(std::move(slice)), _trace_state(std::move(ts))
            , _cmp(*_schema)
        {}

        virtual future<> fill_buffer() override {
            while (!is_buffer_full() && !_end_of_stream) {
                // Drain current partition's reader first
                if (_current_partition_reader) {
                    co_await _current_partition_reader->fill_buffer();
                    _current_partition_reader->move_buffer_content_to(*this);
                    if (!_current_partition_reader->is_end_of_stream()) {
                        continue;
                    }
                    co_await _current_partition_reader->close();
                    _current_partition_reader = std::nullopt;
                    // _last_key was already set when we opened the reader
                }

                // Find next key in range (safe after co_await since we use _last_key)
                auto it = find_next();
                if (it == _index.end() || exceeds_range_end(*it)) {
                    _end_of_stream = true;
                    break;
                }

                // Snapshot the key before yielding
                auto current_key = it->key();

                auto guard = reader_permit::awaits_guard(_permit);
                auto cmut = co_await _logstor->read(*_schema, _index, current_key);

                _last_key = current_key; // mark as visited even if not found (tombstoned)

                if (!cmut) {
                    continue; // key was removed between index lookup and read
                }

                tracing::trace(_trace_state, "logstor_range_reader: fetched key {}", current_key);

                // Wrap the single mutation in a slicing reader; drained on the
                // next loop iteration.
                _current_partition_reader = make_mutation_reader_from_mutations(
                    _schema, _permit, cmut->to_mutation(_schema),
                    _slice, streamed_mutation::forwarding::no
                );
            }
        }

        virtual future<> next_partition() override {
            clear_buffer_to_next_partition();
            if (!is_buffer_empty()) return make_ready_future<>();
            _end_of_stream = false;
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(const dht::partition_range& pr) override {
            clear_buffer();
            _end_of_stream = false;
            _pr = pr;
            _last_key = std::nullopt; // re-position from new range start
            if (_current_partition_reader) {
                auto fut = _current_partition_reader->close();
                _current_partition_reader = std::nullopt;
                return fut;
            }
            return make_ready_future<>();
        }

        virtual future<> fast_forward_to(position_range pr) override {
            if (_current_partition_reader) {
                clear_buffer();
                return _current_partition_reader->fast_forward_to(std::move(pr));
            }
            return make_ready_future<>();
        }

        virtual future<> close() noexcept override {
            if (_current_partition_reader) {
                return _current_partition_reader->close();
            }
            return make_ready_future<>();
        }
    };

    return make_mutation_reader<logstor_range_reader>(
        std::move(schema), index, std::move(permit), this, pr, slice, std::move(trace_state)
    );
}
|
||||
|
||||
// Forward compaction/separator trigger hooks to the segment manager.
void logstor::set_trigger_compaction_hook(std::function<void()> fn) {
    _segment_manager.set_trigger_compaction_hook(std::move(fn));
}

void logstor::set_trigger_separator_flush_hook(std::function<void(size_t)> fn) {
    _segment_manager.set_trigger_separator_flush_hook(std::move(fn));
}
|
||||
|
||||
}
|
||||
81
replica/logstor/logstor.hh
Normal file
81
replica/logstor/logstor.hh
Normal file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <optional>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include "readers/mutation_reader.hh"
|
||||
#include "replica/compaction_group.hh"
|
||||
#include "types.hh"
|
||||
#include "index.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "write_buffer.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
extern seastar::logger logstor_logger;
|
||||
|
||||
// Construction-time settings for a logstor instance.
struct logstor_config {
    segment_manager_config segment_manager_cfg;
    seastar::scheduling_group flush_sg; // scheduling group for write-buffer flushes
};
|
||||
|
||||
// Log-structured store: appends mutations through a write buffer into
// segment files and serves reads back via per-group primary indexes.
class logstor {

    segment_manager _segment_manager;
    buffered_writer _write_buffer;

public:

    explicit logstor(logstor_config);

    logstor(const logstor&) = delete;
    logstor& operator=(const logstor&) = delete;

    // Rebuild state from disk after a restart; call before start().
    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    size_t get_memory_usage() const;

    segment_manager& get_segment_manager() noexcept;
    const segment_manager& get_segment_manager() const noexcept;

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    // Append a mutation for the given compaction group; cg_holder keeps the
    // group alive for the duration of the write.
    future<> write(const mutation&, compaction_group&, seastar::gate::holder cg_holder);

    // Point read: raw log record for a key, or nullopt if not indexed.
    future<std::optional<log_record>> read(const primary_index&, primary_index_key);

    // Point read returning the canonical mutation, with key verification.
    future<std::optional<canonical_mutation>> read(const schema&, const primary_index&, const dht::decorated_key&);

    /// Create a mutation reader for a specific key
    mutation_reader make_reader(schema_ptr schema,
            const primary_index& index,
            reader_permit permit,
            const dht::partition_range& pr,
            const query::partition_slice& slice,
            tracing::trace_state_ptr trace_state = nullptr);

    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
};
|
||||
|
||||
} // namespace logstor
|
||||
} // namespace replica
|
||||
1940
replica/logstor/segment_manager.cc
Normal file
1940
replica/logstor/segment_manager.cc
Normal file
File diff suppressed because it is too large
Load Diff
128
replica/logstor/segment_manager.hh
Normal file
128
replica/logstor/segment_manager.hh
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/core/rwlock.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include "bytes_fwd.hh"
|
||||
#include "replica/logstor/write_buffer.hh"
|
||||
#include "types.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class database;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class compaction_manager;
|
||||
class segment_set;
|
||||
class primary_index;
|
||||
|
||||
static constexpr size_t default_segment_size = 128 * 1024;
|
||||
static constexpr size_t default_file_size = 32 * 1024 * 1024;
|
||||
|
||||
/// Configuration for the segment manager
|
||||
/// Configuration for the segment manager
struct segment_manager_config {
    std::filesystem::path base_dir;       // directory holding segment files
    size_t segment_size = default_segment_size;
    size_t file_size = default_file_size; // on-disk file size (holds multiple segments)
    size_t disk_size;                     // NOTE(review): no default — caller must set it
    bool compaction_enabled = true;
    size_t max_segments_per_compaction = 8;
    seastar::scheduling_group compaction_sg;
    utils::updateable_value<float> compaction_static_shares;
    seastar::scheduling_group separator_sg;
    uint32_t separator_delay_limit_ms;    // NOTE(review): no default — caller must set it
    size_t max_separator_memory = 1 * 1024 * 1024;
};
|
||||
|
||||
// One bucket of a per-table segment histogram: how many segments fall into
// the bucket, and the largest net data size observed among them.
struct table_segment_histogram_bucket {
    // Zero-initialized so a default-constructed bucket is a valid identity
    // element for operator+= (previously members were left indeterminate).
    size_t count{0};
    size_t max_data_size{0};

    // Merge another bucket into this one: counts add, max sizes take max.
    // Takes `other` by const reference (it is never modified), so buckets
    // from a const histogram can be accumulated.
    table_segment_histogram_bucket& operator+=(const table_segment_histogram_bucket& other) {
        count += other.count;
        max_data_size = std::max(max_data_size, other.max_data_size);
        return *this;
    }
};
|
||||
|
||||
struct table_segment_stats {
|
||||
size_t compaction_group_count{0};
|
||||
size_t segment_count{0};
|
||||
std::vector<table_segment_histogram_bucket> histogram;
|
||||
|
||||
table_segment_stats& operator+=(table_segment_stats& other) {
|
||||
compaction_group_count += other.compaction_group_count;
|
||||
segment_count += other.segment_count;
|
||||
histogram.resize(std::max(histogram.size(), other.histogram.size()));
|
||||
for (size_t i = 0; i < other.histogram.size(); i++) {
|
||||
histogram[i] += other.histogram[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
class segment_manager_impl;
|
||||
class log_index;
|
||||
|
||||
// Facade over segment_manager_impl (pimpl): owns the on-disk segment files
// and the compaction machinery behind them.
class segment_manager {
    std::unique_ptr<segment_manager_impl> _impl;
private:
    segment_manager_impl& get_impl() noexcept;
    const segment_manager_impl& get_impl() const noexcept;
public:
    // Alignment used for segment file I/O.
    static constexpr size_t block_alignment = 4096;

    explicit segment_manager(segment_manager_config config);
    ~segment_manager();

    segment_manager(const segment_manager&) = delete;
    segment_manager& operator=(const segment_manager&) = delete;

    // Rebuild segment state from disk; call before start().
    future<> do_recovery(replica::database&);

    future<> start();
    future<> stop();

    // Flush the given write buffer into a segment; resolves with the
    // location the data landed at.
    future<log_location> write(write_buffer& wb);

    future<log_record> read(log_location location);

    // Mark the record's bytes as dead space in its segment.
    void free_record(log_location location);

    // Invoke `callback` for every live record in the listed segments
    // (used by compaction to relocate data).
    future<> for_each_record(const std::vector<log_segment_id>& segments,
            std::function<future<>(log_location, log_record)> callback);

    compaction_manager& get_compaction_manager() noexcept;
    const compaction_manager& get_compaction_manager() const noexcept;

    void set_trigger_compaction_hook(std::function<void()> fn);
    void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);

    size_t get_segment_size() const noexcept;

    // Return the segments in the set to the free pool.
    future<> discard_segments(segment_set&);

    size_t get_memory_usage() const;

    future<> await_pending_writes();

    friend class segment_manager_impl;

};
|
||||
|
||||
}
|
||||
}
|
||||
80
replica/logstor/types.hh
Normal file
80
replica/logstor/types.hh
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <fmt/format.h>
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "replica/logstor/utils.hh"
|
||||
#include "dht/decorated_key.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// Identifier of one fixed-size log segment.
struct log_segment_id {
    uint32_t value;

    bool operator==(const log_segment_id& other) const noexcept = default;
    auto operator<=>(const log_segment_id& other) const noexcept = default;
};

// Physical address of a record: segment + byte offset + byte length.
struct log_location {
    log_segment_id segment;
    uint32_t offset;
    uint32_t size;

    bool operator==(const log_location& other) const noexcept = default;
};

// Key type of the primary index (a decorated partition key).
struct primary_index_key {
    dht::decorated_key dk;
};

// Wraparound counters; see generation_base in utils.hh.
using record_generation = generation_base<uint16_t>;
using segment_generation = generation_base<uint16_t>;

// What the primary index stores per key: where the newest record lives
// and how new it is.
struct index_entry {
    log_location location;
    record_generation generation;

    bool operator==(const index_entry& other) const noexcept = default;
};

// One logical record as written to / read from the log.
struct log_record {
    primary_index_key key;
    record_generation generation;
    table_id table;
    canonical_mutation mut;
};
|
||||
|
||||
}
|
||||
|
||||
// Format specialization declarations and implementations
// (fmt::formatter specializations so logstor types print in log messages).
template <>
struct fmt::formatter<replica::logstor::log_segment_id> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::log_segment_id& id, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "segment({})", id.value);
    }
};

template <>
struct fmt::formatter<replica::logstor::log_location> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::log_location& loc, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{{segment:{}, offset:{}, size:{}}}",
                loc.segment, loc.offset, loc.size);
    }
};

// A primary_index_key prints as its decorated key.
template <>
struct fmt::formatter<replica::logstor::primary_index_key> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const replica::logstor::primary_index_key& key, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{}", key.dk);
    }
};
||||
104
replica/logstor/utils.hh
Normal file
104
replica/logstor/utils.hh
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <concepts>
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// an unsigned integer that can be incremented and compared with wraparound semantics
|
||||
template <std::unsigned_integral T>
|
||||
class generation_base {
|
||||
T _value;
|
||||
|
||||
public:
|
||||
|
||||
using underlying = T;
|
||||
|
||||
constexpr generation_base() noexcept : _value(0) {}
|
||||
constexpr explicit generation_base(T value) noexcept : _value(value) {}
|
||||
|
||||
constexpr T value() const noexcept { return _value; }
|
||||
|
||||
constexpr generation_base& operator++() noexcept {
|
||||
++_value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator++(int) noexcept {
|
||||
auto old = *this;
|
||||
++_value;
|
||||
return old;
|
||||
}
|
||||
|
||||
constexpr generation_base& operator+=(T delta) noexcept {
|
||||
_value += delta;
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr generation_base operator+(T delta) const noexcept {
|
||||
return generation_base(_value + delta);
|
||||
}
|
||||
|
||||
constexpr bool operator==(const generation_base& other) const noexcept = default;
|
||||
|
||||
/// Comparison using wraparound semantics.
|
||||
/// Returns true if this generation is less than other, accounting for wraparound.
|
||||
/// Assumes generations are within half the value space of each other.
|
||||
constexpr bool operator<(const generation_base& other) const noexcept {
|
||||
// Use signed comparison after converting difference to signed type
|
||||
// This handles wraparound: if diff > max/2, it's treated as negative
|
||||
using signed_type = std::make_signed_t<T>;
|
||||
auto diff = static_cast<signed_type>(_value - other._value);
|
||||
return diff < 0;
|
||||
}
|
||||
|
||||
constexpr bool operator<=(const generation_base& other) const noexcept {
|
||||
return *this == other || *this < other;
|
||||
}
|
||||
|
||||
constexpr bool operator>(const generation_base& other) const noexcept {
|
||||
return other < *this;
|
||||
}
|
||||
|
||||
constexpr bool operator>=(const generation_base& other) const noexcept {
|
||||
return other <= *this;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Formats a generation as its bare underlying integer value, inheriting any
// integer format spec from fmt::formatter<T>.
template <std::unsigned_integral T>
struct fmt::formatter<replica::logstor::generation_base<T>> : fmt::formatter<T> {
    template <typename FormatContext>
    auto format(const replica::logstor::generation_base<T>& gen, FormatContext& ctx) const {
        return fmt::formatter<T>::format(gen.value(), ctx);
    }
};
|
||||
|
||||
namespace ser {
|
||||
|
||||
template <std::unsigned_integral T>
|
||||
struct serializer<replica::logstor::generation_base<T>> {
|
||||
template <typename Output>
|
||||
static void write(Output& out, const replica::logstor::generation_base<T>& g) {
|
||||
serializer<typename replica::logstor::generation_base<T>::underlying>::write(out, g.value());
|
||||
}
|
||||
template <typename Input>
|
||||
static replica::logstor::generation_base<T> read(Input& in) {
|
||||
auto val = serializer<typename replica::logstor::generation_base<T>::underlying>::read(in);
|
||||
return replica::logstor::generation_base<T>(val);
|
||||
}
|
||||
template <typename Input>
|
||||
static void skip(Input& in) {
|
||||
serializer<typename replica::logstor::generation_base<T>::underlying>::skip(in);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
278
replica/logstor/write_buffer.cc
Normal file
278
replica/logstor/write_buffer.cc
Normal file
@@ -0,0 +1,278 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#include "write_buffer.hh"
|
||||
#include "segment_manager.hh"
|
||||
#include "bytes_fwd.hh"
|
||||
#include "logstor.hh"
|
||||
#include "replica/logstor/types.hh"
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/with_scheduling_group.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "serializer_impl.hh"
|
||||
#include "idl/logstor.dist.hh"
|
||||
#include "idl/logstor.dist.impl.hh"
|
||||
#include <seastar/core/align.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
// Determine the record's serialized size by serializing into a measuring
// stream (which counts bytes without writing them); caches it in _size.
void log_record_writer::compute_size() const {
    seastar::measuring_output_stream ms;
    ser::serialize(ms, _record);
    _size = ms.size();
}
|
||||
|
||||
// Serialize the record into the given in-memory output stream.
// The stream must have at least size() bytes available.
void log_record_writer::write(ostream& out) const {
    ser::serialize(out, _record);
}
|
||||
|
||||
// write_buffer
|
||||
|
||||
// Allocate a single aligned buffer of buffer_size bytes.
// 4096-byte alignment — presumably to satisfy direct-I/O alignment
// requirements; confirm against the segment manager's I/O path.
write_buffer::write_buffer(size_t buffer_size, bool with_record_copy)
    : _buffer_size(buffer_size)
    , _buffer(seastar::allocate_aligned_buffer<char>(buffer_size, 4096))
    , _with_record_copy(with_record_copy)
{
    if (_with_record_copy) {
        // Pre-size the copy vector; /100 looks like an assumed ~100-byte
        // average record size — heuristic only.
        _records_copy.reserve(_buffer_size / 100);
    }
    reset();
}
|
||||
|
||||
// Re-arm the buffer for reuse: rewind the stream over the raw buffer,
// reserve the header region up front, and clear all per-flush bookkeeping.
void write_buffer::reset() {
    _stream = seastar::simple_memory_output_stream(_buffer.get(), _buffer_size);
    // Carve out space for the buffer header now; it is filled in later by write_header().
    _header_stream = _stream.write_substream(buffer_header_size);
    _buffer_header = {};
    _net_data_size = 0;
    _record_count = 0;
    // Fresh promise and gate for the next flush cycle.
    _written = {};
    _records_copy.clear();
    _write_gate = {};
}
|
||||
|
||||
// Close the write gate, waiting for all outstanding holders to be released.
// Safe to call when the gate is already closed.
future<> write_buffer::close() {
    if (_write_gate.is_closed()) {
        co_return;
    }
    co_await _write_gate.close();
}
|
||||
|
||||
// Largest record payload that can ever fit in this buffer: total capacity
// minus the buffer header and one record header.
size_t write_buffer::get_max_write_size() const noexcept {
    return _buffer_size - (buffer_header_size + record_header_size);
}
|
||||
|
||||
// Whether a record with data_size bytes of payload still fits in the
// remaining stream space, counting its header and alignment padding.
bool write_buffer::can_fit(size_t data_size) const noexcept {
    const auto needed = align_up(record_header_size + data_size, record_alignment);
    return needed <= _stream.size();
}
|
||||
|
||||
// True once at least one record has been appended past the reserved
// buffer-header area.
bool write_buffer::has_data() const noexcept {
    return offset_in_buffer() > buffer_header_size;
}
|
||||
|
||||
// Append a serialized record to the buffer.
// Returns a future resolving, once the buffer is flushed, to the record's
// final on-disk location plus a gate holder that keeps the buffer open for
// follow-up operations (e.g. index updates).
// Throws std::runtime_error if the record does not fit in the remaining space.
future<log_location_with_holder> write_buffer::write(log_record_writer writer, compaction_group* cg, seastar::gate::holder cg_holder) {
    const auto data_size = writer.size();

    if (!can_fit(data_size)) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", data_size, _stream.size()));
    }

    auto rh = record_header {
        .data_size = data_size
    };
    ser::serialize(_stream, rh);

    // Write actual data
    size_t data_offset_in_buffer = offset_in_buffer();
    auto data_out = _stream.write_substream(data_size);
    writer.write(data_out);

    _net_data_size += data_size;
    _record_count++;

    // Add padding to align record
    pad_to_alignment(record_alignment);

    // Maps the buffer's eventual base location (known only after flush) to
    // this record's absolute location.
    auto record_location = [data_offset_in_buffer, data_size] (log_location base_location) {
        return log_location {
            .segment = base_location.segment,
            .offset = base_location.offset + data_offset_in_buffer,
            .size = data_size
        };
    };

    if (_with_record_copy) {
        // Keep a copy of the record and its placement so it can be re-read
        // (e.g. by compaction) before/while the buffer is flushed.
        _records_copy.push_back(record_in_buffer {
            .writer = std::move(writer),
            .offset_in_buffer = data_offset_in_buffer,
            .data_size = data_size,
            .loc = _written.get_shared_future().then(record_location),
            .cg = cg,
            .cg_holder = std::move(cg_holder)
        });
    }

    // hold the write buffer until the write is complete, and pass the holder to the
    // caller for follow-up operations that should continue holding the buffer, such
    // as index updates.
    auto op = _write_gate.hold();

    return _written.get_shared_future().then([record_location, op = std::move(op)] (log_location base_location) mutable {
        return std::make_tuple(record_location(base_location), std::move(op));
    });
}
|
||||
|
||||
// Write a record, dropping the gate holder as soon as the write resolves.
future<log_location> write_buffer::write_no_holder(log_record_writer writer) {
    // write and leave the gate immediately after the write.
    // use carefully when the gate is not needed.
    return write(std::move(writer)).then_unpack([] (log_location loc, seastar::gate::holder op) {
        return loc;
    });
}
|
||||
|
||||
// Zero-fill the stream up to the next multiple of `alignment`.
// No-op when the current position is already aligned.
void write_buffer::pad_to_alignment(size_t alignment) {
    const auto pos = offset_in_buffer();
    if (const auto padding = align_up(pos, alignment) - pos; padding > 0) {
        _stream.fill('\0', padding);
    }
}
|
||||
|
||||
// Record the total payload size (everything after the header) in the
// buffer header, then pad the buffer tail to `alignment`.
void write_buffer::finalize(size_t alignment) {
    _buffer_header.data_size = static_cast<uint32_t>(offset_in_buffer() - buffer_header_size);
    pad_to_alignment(alignment);
}
|
||||
|
||||
// Fill in the header fields and serialize them into the header region
// reserved at the start of the buffer by reset().
void write_buffer::write_header(segment_generation seg_gen) {
    _buffer_header.magic = buffer_header_magic;
    _buffer_header.seg_gen = seg_gen;
    ser::serialize<buffer_header>(_header_stream, _buffer_header);
}
|
||||
|
||||
// Resolve all pending per-record location futures against the buffer's
// flushed base location, then close the write gate.
future<> write_buffer::complete_writes(log_location base_location) {
    _written.set_value(base_location);
    co_await close();
}
|
||||
|
||||
// Fail all pending per-record location futures with `ex` (unless already
// resolved) and close the write gate.
future<> write_buffer::abort_writes(std::exception_ptr ex) {
    if (!_written.available()) {
        _written.set_exception(std::move(ex));
    }
    co_await close();
}
|
||||
|
||||
// Access the tracked copies of all records written to this buffer.
// Only valid when constructed with with_record_copy == true; otherwise
// this is an internal error.
std::vector<write_buffer::record_in_buffer>& write_buffer::records() {
    if (!_with_record_copy) {
        on_internal_error(logstor_logger, "requesting records but the write buffer has no record copy enabled");
    }
    return _records_copy;
}
|
||||
|
||||
// Estimate how many segments of `segment_size` bytes are needed to hold
// `record_count` records totalling `net_data_size` bytes of payload.
// Upper-bound heuristic, not exact.
size_t write_buffer::estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size) {
    // Calculate total size needed including headers and alignment padding
    size_t total_size = record_header_size * record_count + net_data_size;

    // not perfect so let's multiply by some overhead constant
    // (alignment padding and per-buffer headers are not accounted exactly)
    total_size = static_cast<size_t>(total_size * 1.1);

    return align_up(total_size, segment_size) / segment_size;
}
|
||||
|
||||
// buffered_writer
|
||||
|
||||
// Create one active buffer plus num_flushing_buffers spares, each sized to
// a full segment and with record copies enabled.
buffered_writer::buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg)
    : _sm(sm)
    , _available_buffers(num_flushing_buffers)
    , _flush_sg(flush_sg) {

    _buffers.reserve(num_flushing_buffers + 1);
    for (size_t i = 0; i < num_flushing_buffers + 1; ++i) {
        _buffers.emplace_back(_sm.get_segment_size(), true);
    }

    // Buffer 0 starts as the active buffer; the rest go to the spare queue.
    _active_buffer = active_buffer {
        .buf = &_buffers[0],
    };

    for (size_t i = 1; i < num_flushing_buffers + 1; ++i) {
        _available_buffers.push(&_buffers[i]);
    }
}
|
||||
|
||||
// Start the writer. Currently only logs; async for interface symmetry with stop().
future<> buffered_writer::start() {
    logstor_logger.info("Starting write buffer");
    co_return;
}
|
||||
|
||||
// Stop the writer: close the gate and wait for in-flight writes and
// background flushes. Idempotent — returns immediately if already stopped.
future<> buffered_writer::stop() {
    if (_async_gate.is_closed()) {
        co_return;
    }
    logstor_logger.info("Stopping write buffer");

    co_await _async_gate.close();
    logstor_logger.info("Write buffer stopped");
}
|
||||
|
||||
// Append a record through the active buffer.
// Waits (on buffer switches) while the record does not fit in the current
// active buffer, and kicks off a background switch+flush after the first
// write into a buffer. Returns the record's eventual on-disk location and
// a gate holder keeping the buffer open for follow-up work.
// Throws std::runtime_error for records larger than any buffer can hold.
future<log_location_with_holder> buffered_writer::write(log_record record, compaction_group* cg, seastar::gate::holder cg_holder) {
    auto holder = _async_gate.hold();

    log_record_writer writer(std::move(record));

    // A record larger than the maximum writable size can never fit - fail fast.
    if (writer.size() > _active_buffer.buf->get_max_write_size()) {
        throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", writer.size(), _active_buffer.buf->get_max_write_size()));
    }

    // Check if write fits in current buffer
    while (!_active_buffer.buf->can_fit(writer)) {
        co_await _buffer_switched.wait();
    }

    // Write to buffer at current position
    auto fut = _active_buffer.buf->write(std::move(writer), cg, std::move(cg_holder));

    // Trigger flush for the active buffer if not in progress
    if (!std::exchange(_active_buffer.flush_requested, true)) {
        // Background fiber (tracked by _async_gate): install a fresh active
        // buffer, then flush the old one under the flush scheduling group.
        (void)with_gate(_async_gate, [this] {
            return switch_buffer().then([this] (write_buffer* old_buf) mutable {
                return with_scheduling_group(_flush_sg, [this, old_buf] mutable {
                    return flush(old_buf);
                });
            });
        });
    }

    co_return co_await std::move(fut);
}
|
||||
|
||||
// Swap in the next available buffer as the active one and return the old
// active buffer (for the caller to flush). Wakes any writers waiting for
// buffer space.
future<write_buffer*> buffered_writer::switch_buffer() {
    // Wait for and get the next available buffer
    auto new_buf = co_await _available_buffers.pop_eventually();

    // Plain copies: `buf` is a raw pointer, so std::move would be a
    // misleading no-op here.
    auto next_active_buffer = active_buffer {
        .buf = new_buf,
    };

    auto old_active_buffer = std::exchange(_active_buffer, next_active_buffer);
    _buffer_switched.broadcast();

    co_return old_active_buffer.buf;
}
|
||||
|
||||
// Flush a full buffer through the segment manager, then recycle it.
future<> buffered_writer::flush(write_buffer* buf) {
    co_await _sm.write(*buf);

    // Return the flushed buffer to the available queue.
    // (buf is a raw pointer; pushing it directly - std::move was a no-op.)
    buf->reset();
    _available_buffers.push(buf);
}
|
||||
|
||||
}
|
||||
294
replica/logstor/write_buffer.hh
Normal file
294
replica/logstor/write_buffer.hh
Normal file
@@ -0,0 +1,294 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/temporary_buffer.hh>
|
||||
#include <seastar/core/aligned_buffer.hh>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/scheduling.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
#include <seastar/core/simple-stream.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include "types.hh"
|
||||
#include "serializer.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
class compaction_group;
|
||||
|
||||
namespace logstor {
|
||||
|
||||
class segment_manager;
|
||||
|
||||
// Writer for log records that handles serialization and size computation
class log_record_writer {

    using ostream = seastar::simple_memory_output_stream;

    log_record _record;
    // Cached serialized size; computed lazily by compute_size().
    mutable std::optional<size_t> _size;

    // Computes and caches the serialized size of _record.
    void compute_size() const;

public:
    explicit log_record_writer(log_record record)
        : _record(std::move(record))
    {}

    // Get serialized size (computed lazily)
    size_t size() const {
        if (!_size) {
            compute_size();
        }
        return *_size;
    }

    // Write the record to an output stream
    void write(ostream& out) const;

    // Read-only access to the wrapped record.
    const log_record& record() const {
        return _record;
    }
};
|
||||
|
||||
using log_location_with_holder = std::tuple<log_location, seastar::gate::holder>;
|
||||
|
||||
// Manages a single aligned buffer for accumulating records and writing
|
||||
// them to the segment manager.
|
||||
//
|
||||
// usage:
|
||||
//
|
||||
// create write buffer with specified size:
|
||||
// write_buffer wb(buffer_size);
|
||||
// write data to the buffer if fits and get a future for the log location when flushed:
|
||||
// log_record_writer writer(record);
|
||||
// auto loc_fut = wb.write(writer);
|
||||
// flush the buffer to the segment manager:
|
||||
// co_await sm.write(wb);
|
||||
// await individual write locations:
|
||||
// auto record_loc = co_await std::move(loc_fut);
|
||||
class write_buffer {
public:

    using ostream = seastar::simple_memory_output_stream;

    // buffer: buffer_header | record_1 | ... | record_n | 0-padding
    // record: record_header | record_data | 0-padding
    //
    // buffer_header and record are aligned by record_alignment
    // buffer_header and record_header have explicit sizes and serialization below

    static constexpr uint32_t buffer_header_magic = 0x4c475342;
    static constexpr size_t record_alignment = 8;

    struct buffer_header {
        uint32_t magic;
        uint32_t data_size; // size of all records data following the buffer_header
        segment_generation seg_gen;
        uint16_t reserved1;
        uint32_t reserved2;
    };
    // Serialized (on-disk) size of buffer_header; must track the serializer below.
    static constexpr size_t buffer_header_size = 3 * sizeof(uint32_t) + sizeof(uint16_t) + sizeof(segment_generation::underlying);

    static_assert(buffer_header_size % record_alignment == 0, "Buffer header size must be aligned by record_alignment");

    struct record_header {
        uint32_t data_size; // size of the record data following the record_header
    };
    static constexpr size_t record_header_size = sizeof(uint32_t);

private:

    using aligned_buffer_type = std::unique_ptr<char[], free_deleter>;

    size_t _buffer_size;
    aligned_buffer_type _buffer;
    // Output stream over _buffer; its remaining size() tracks free space.
    seastar::simple_memory_output_stream _stream;
    buffer_header _buffer_header;
    // Substream over the reserved header region at the start of the buffer.
    seastar::simple_memory_output_stream _header_stream;

    size_t _net_data_size{0};
    size_t _record_count{0};

    // Resolved with the buffer's base location once flushed (or failed).
    shared_promise<log_location> _written;

    // Held by writers until their follow-up work (e.g. index updates) completes.
    seastar::gate _write_gate;

    // Per-record bookkeeping kept when _with_record_copy is enabled.
    struct record_in_buffer {
        log_record_writer writer;
        size_t offset_in_buffer;
        size_t data_size;
        future<log_location> loc;
        compaction_group* cg;
        seastar::gate::holder cg_holder;
    };

    bool _with_record_copy;
    std::vector<record_in_buffer> _records_copy;

public:

    write_buffer(size_t buffer_size, bool with_record_copy);

    void reset();

    write_buffer(const write_buffer&) = delete;
    write_buffer& operator=(const write_buffer&) = delete;

    write_buffer(write_buffer&&) noexcept = default;
    write_buffer& operator=(write_buffer&&) noexcept = default;

    future<> close();

    size_t get_buffer_size() const noexcept { return _buffer_size; }
    // Current write position, measured from the start of the buffer.
    size_t offset_in_buffer() const noexcept { return _buffer_size - _stream.size(); }

    bool can_fit(size_t data_size) const noexcept;

    bool can_fit(const log_record_writer& writer) const noexcept {
        return can_fit(writer.size());
    }

    bool has_data() const noexcept;

    size_t get_max_write_size() const noexcept;

    size_t get_net_data_size() const noexcept { return _net_data_size; }
    size_t get_record_count() const noexcept { return _record_count; }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed and a gate holder
    // that keeps the write buffer open. The gate should be held for index updates after the write
    // is done.
    future<log_location_with_holder> write(log_record_writer, compaction_group*, seastar::gate::holder cg_holder);

    future<log_location_with_holder> write(log_record_writer writer) {
        return write(std::move(writer), nullptr, {});
    }

    // Write a record to the buffer.
    // Returns a future that will be resolved with the log location once flushed.
    // If there are follow-up operations to the write such as index updates then consider
    // using the holder-returning write() overload instead to keep the write buffer open
    // until those operations are complete.
    future<log_location> write_no_holder(log_record_writer);

    static size_t estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size);

private:

    const char* data() const noexcept { return _buffer.get(); }

    void write_header(segment_generation);

    // get all write records in the buffer.
    // with_record_copy must be set to true when creating the write_buffer.
    std::vector<record_in_buffer>& records();

    /// Complete all tracked writes with their locations when the buffer is flushed to base_location
    future<> complete_writes(log_location base_location);
    future<> abort_writes(std::exception_ptr);

    void pad_to_alignment(size_t alignment);
    void finalize(size_t alignment);

    friend class segment_manager_impl;
    friend class compaction_manager_impl;
};
|
||||
|
||||
// Manages multiple buffers, a single active buffer and multiple flushing buffers.
// When a switch is requested for the active buffer, it waits for a flushing buffer
// to become available, continuing to accumulate writes until then.
class buffered_writer {
    static constexpr size_t num_flushing_buffers = 4;

    segment_manager& _sm;

    struct active_buffer {
        write_buffer* buf;
        bool flush_requested{false}; // a background switch+flush fiber was already started
    } _active_buffer;

    // All buffers (active + spares); pointers into this vector circulate below.
    std::vector<write_buffer> _buffers;
    // Flushed buffers ready for reuse.
    seastar::queue<write_buffer*> _available_buffers;
    seastar::gate _async_gate;
    // Signalled whenever a new active buffer is installed.
    seastar::condition_variable _buffer_switched;
    // Scheduling group under which background flushes run.
    seastar::scheduling_group _flush_sg;

public:
    explicit buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg);

    buffered_writer(const buffered_writer&) = delete;
    buffered_writer& operator=(const buffered_writer&) = delete;

    future<> start();
    future<> stop();

    future<log_location_with_holder> write(log_record, compaction_group* cg = nullptr, seastar::gate::holder cg_holder = {});

private:
    future<write_buffer*> switch_buffer();
    future<> flush(write_buffer*);

};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
namespace ser {
|
||||
|
||||
template <>
|
||||
struct serializer<replica::logstor::write_buffer::buffer_header> {
|
||||
template <typename Output>
|
||||
static void write(Output& out, const replica::logstor::write_buffer::buffer_header& h) {
|
||||
serializer<uint32_t>::write(out, h.magic);
|
||||
serializer<uint32_t>::write(out, h.data_size);
|
||||
serializer<replica::logstor::segment_generation>::write(out, h.seg_gen);
|
||||
serializer<uint16_t>::write(out, h.reserved1);
|
||||
serializer<uint32_t>::write(out, h.reserved2);
|
||||
}
|
||||
template <typename Input>
|
||||
static replica::logstor::write_buffer::buffer_header read(Input& in) {
|
||||
replica::logstor::write_buffer::buffer_header h;
|
||||
h.magic = serializer<uint32_t>::read(in);
|
||||
h.data_size = serializer<uint32_t>::read(in);
|
||||
h.seg_gen = serializer<replica::logstor::segment_generation>::read(in);
|
||||
h.reserved1 = serializer<uint16_t>::read(in);
|
||||
h.reserved2 = serializer<uint32_t>::read(in);
|
||||
return h;
|
||||
}
|
||||
template <typename Input>
|
||||
static void skip(Input& in) {
|
||||
serializer<uint32_t>::skip(in);
|
||||
serializer<uint32_t>::skip(in);
|
||||
serializer<replica::logstor::segment_generation>::skip(in);
|
||||
serializer<uint16_t>::skip(in);
|
||||
serializer<uint32_t>::skip(in);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct serializer<replica::logstor::write_buffer::record_header> {
|
||||
template <typename Output>
|
||||
static void write(Output& out, const replica::logstor::write_buffer::record_header& h) {
|
||||
serializer<uint32_t>::write(out, h.data_size);
|
||||
}
|
||||
template <typename Input>
|
||||
static replica::logstor::write_buffer::record_header read(Input& in) {
|
||||
replica::logstor::write_buffer::record_header h;
|
||||
h.data_size = serializer<uint32_t>::read(in);
|
||||
return h;
|
||||
}
|
||||
template <typename Input>
|
||||
static void skip(Input& in) {
|
||||
serializer<uint32_t>::skip(in);
|
||||
}
|
||||
};
|
||||
} // namespace ser
|
||||
404
replica/table.cc
404
replica/table.cc
@@ -217,6 +217,17 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
|
||||
}
|
||||
}
|
||||
|
||||
// Build a reader over the logstor backend for this table.
// NOTE(review): fwd and fwd_mr are accepted but not forwarded to
// _logstor->make_reader() — confirm forwarding semantics are handled
// (or intentionally unsupported) by the logstor reader.
mutation_reader
table::make_logstor_mutation_reader(schema_ptr s,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr) const {
    return _logstor->make_reader(std::move(s), logstor_index(), std::move(permit), pr, slice, std::move(trace_state));
}
|
||||
|
||||
mutation_reader
|
||||
table::make_mutation_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -229,6 +240,10 @@ table::make_mutation_reader(schema_ptr s,
|
||||
return (*_virtual_reader).make_mutation_reader(s, std::move(permit), range, slice, trace_state, fwd, fwd_mr);
|
||||
}
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return make_logstor_mutation_reader(s, std::move(permit), range, slice, std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
std::vector<mutation_reader> readers;
|
||||
|
||||
// We're assuming that cache and memtables are both read atomically
|
||||
@@ -716,7 +731,9 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -762,6 +779,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -782,7 +804,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -806,7 +828,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -822,9 +845,8 @@ private:
|
||||
return tablet_map().get_tablet_id(t).value();
|
||||
}
|
||||
|
||||
std::pair<size_t, locator::tablet_range_side> storage_group_of(dht::token t) const {
|
||||
auto [id, side] = tablet_map().get_tablet_id_and_range_side(t);
|
||||
auto idx = id.value();
|
||||
size_t storage_group_of(dht::token t) const {
|
||||
auto idx = tablet_id_for_token(t);
|
||||
#ifndef SCYLLA_BUILD_MODE_RELEASE
|
||||
if (idx >= tablet_count()) {
|
||||
on_fatal_internal_error(tlogger, format("storage_group_of: index out of range: idx={} size_log2={} size={} token={}",
|
||||
@@ -836,7 +858,7 @@ private:
|
||||
idx, sg.token_range(), t));
|
||||
}
|
||||
#endif
|
||||
return { idx, side };
|
||||
return idx;
|
||||
}
|
||||
|
||||
repair_classifier_func make_repair_sstable_classifier_func() const {
|
||||
@@ -900,7 +922,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -911,7 +935,7 @@ public:
|
||||
return log2ceil(tablet_map().tablet_count());
|
||||
}
|
||||
storage_group& storage_group_for_token(dht::token token) const override {
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
return storage_group_for_id(storage_group_of(token));
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
@@ -959,9 +983,20 @@ size_t storage_group::to_idx(locator::tablet_range_side side) const {
|
||||
return size_t(side);
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(locator::tablet_range_side side) noexcept {
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token token, const locator::tablet_map& tmap) noexcept {
|
||||
if (splitting_mode()) {
|
||||
return _split_ready_groups[to_idx(side)];
|
||||
return _split_ready_groups[to_idx(tmap.get_tablet_range_side(token))];
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
|
||||
compaction_group_ptr& storage_group::select_compaction_group(dht::token first, dht::token last, const locator::tablet_map& tmap) noexcept {
|
||||
if (splitting_mode()) {
|
||||
auto first_side = tmap.get_tablet_range_side(first);
|
||||
auto last_side = tmap.get_tablet_range_side(last);
|
||||
if (first_side == last_side) {
|
||||
return _split_ready_groups[to_idx(first_side)];
|
||||
}
|
||||
}
|
||||
return _main_cg;
|
||||
}
|
||||
@@ -1056,6 +1091,38 @@ future<> compaction_group::split(compaction::compaction_type_options::split opt,
|
||||
}
|
||||
}
|
||||
|
||||
// Discard all logstor segments tracked by this compaction group via the
// segment manager.
future<> compaction_group::discard_logstor_segments() {
    auto& sm = get_logstor_segment_manager();
    co_await sm.discard_segments(*_logstor_segments);
}
|
||||
|
||||
// Flush the pending separator buffer (if any) and wait for all previously
// started separator flushes. When seq_num is given, only flush the buffer
// if it contains data older than seq_num.
future<> compaction_group::flush_separator(std::optional<size_t> seq_num) {
    // Serialize concurrent flush_separator calls.
    auto units = co_await get_units(_separator_flush_sem, 1);
    auto pending = std::exchange(_separator_flushes, {});
    if (_logstor_separator && (!seq_num || _logstor_separator->min_seq_num < *seq_num)) {
        auto& cm = get_logstor_compaction_manager();
        auto b = std::move(*_logstor_separator);
        _logstor_separator.reset();
        pending.push_back(cm.flush_separator_buffer(std::move(b), *this));
    }
    co_await when_all(pending.begin(), pending.end());
}
|
||||
|
||||
// Return a separator buffer with room for write_size bytes, rotating the
// current buffer (scheduling its flush in the background) when it is full
// or absent.
logstor::separator_buffer& compaction_group::get_separator_buffer(size_t write_size) {
    if (!_logstor_separator || !_logstor_separator->can_fit(write_size)) {
        auto& cm = get_logstor_compaction_manager();
        if (_logstor_separator) {
            auto b = std::move(*_logstor_separator);
            _logstor_separator.reset();

            // Drop completed flushes before tracking a new one, so the list
            // does not grow unboundedly.
            std::erase_if(_separator_flushes, [](future<>& f) { return f.available(); });
            _separator_flushes.push_back(cm.flush_separator_buffer(std::move(b), *this));
        }
        _logstor_separator.emplace(cm.allocate_separator_buffer());
    }
    return *_logstor_separator;
}
|
||||
|
||||
future<> storage_group::split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info) {
|
||||
if (set_split_mode()) {
|
||||
co_return;
|
||||
@@ -1222,9 +1289,9 @@ storage_group& table::storage_group_for_id(size_t i) const {
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const {
|
||||
auto [idx, range_side] = storage_group_of(token);
|
||||
auto idx = storage_group_of(token);
|
||||
auto& sg = storage_group_for_id(idx);
|
||||
return *sg.select_compaction_group(range_side);
|
||||
return *sg.select_compaction_group(token, tablet_map());
|
||||
}
|
||||
|
||||
compaction_group& table::compaction_group_for_token(dht::token token) const {
|
||||
@@ -1265,8 +1332,8 @@ compaction_group& table::compaction_group_for_key(partition_key_view key, const
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
|
||||
auto [first_id, first_range_side] = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto [last_id, last_range_side] = storage_group_of(sst->get_last_decorated_key().token());
|
||||
auto first_id = storage_group_of(sst->get_first_decorated_key().token());
|
||||
auto last_id = storage_group_of(sst->get_last_decorated_key().token());
|
||||
|
||||
auto sstable_desc = [] (const sstables::shared_sstable& sst) {
|
||||
auto& identifier_opt = sst->sstable_identifier();
|
||||
@@ -1289,12 +1356,10 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
|
||||
|
||||
try {
|
||||
auto& sg = storage_group_for_id(first_id);
|
||||
|
||||
if (first_range_side != last_range_side) {
|
||||
return *sg.main_compaction_group();
|
||||
}
|
||||
|
||||
return *sg.select_compaction_group(first_range_side);
|
||||
return *sg.select_compaction_group(
|
||||
sst->get_first_decorated_key().token(),
|
||||
sst->get_last_decorated_key().token(),
|
||||
tablet_map());
|
||||
} catch (std::out_of_range& e) {
|
||||
on_internal_error(tlogger, format("Unable to load SSTable {} of tablet {}, due to {}",
|
||||
sstable_desc(sst),
|
||||
@@ -1465,6 +1530,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
log_level failure_log_level = log_level::error;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
@@ -1486,6 +1552,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
@@ -1493,13 +1562,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1513,6 +1582,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
log_level failure_log_level = log_level::error;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
@@ -1522,14 +1592,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1568,6 +1641,19 @@ table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector
|
||||
}
|
||||
}
|
||||
|
||||
bool table::add_logstor_segment(logstor::segment_descriptor& seg_desc, dht::token first_token, dht::token last_token) {
|
||||
auto& cg = compaction_group_for_token(first_token);
|
||||
if (&cg != &compaction_group_for_token(last_token)) {
|
||||
return false;
|
||||
}
|
||||
cg.add_logstor_segment(seg_desc);
|
||||
return true;
|
||||
}
|
||||
|
||||
logstor::separator_buffer& table::get_logstor_separator_buffer(dht::token token, size_t write_size) {
|
||||
return compaction_group_for_token(token).get_separator_buffer(write_size);
|
||||
}
|
||||
|
||||
// Handles permit management only, used for situations where we don't want to inform
|
||||
// the compaction manager about backlogs (i.e., tests)
|
||||
class permit_monitor : public sstables::write_monitor {
|
||||
@@ -1765,7 +1851,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
|
||||
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
|
||||
});
|
||||
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
// signal a memtable was sealed
|
||||
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
|
||||
});
|
||||
|
||||
undo_stats.reset();
|
||||
@@ -2021,8 +2109,15 @@ size_t compaction_group::live_sstable_count() const noexcept {
|
||||
return _main_sstables->size() + _maintenance_sstables->size();
|
||||
}
|
||||
|
||||
size_t compaction_group::logstor_disk_space_used() const noexcept {
|
||||
if (!_logstor_segments || !_t.uses_logstor()) {
|
||||
return 0;
|
||||
}
|
||||
return _logstor_segments->segment_count() * _t.get_logstor_segment_manager().get_segment_size();
|
||||
}
|
||||
|
||||
uint64_t compaction_group::live_disk_space_used() const noexcept {
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk();
|
||||
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk() + logstor_disk_space_used();
|
||||
}
|
||||
|
||||
sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() const noexcept {
|
||||
@@ -2372,6 +2467,12 @@ void table::trigger_compaction() {
|
||||
});
|
||||
}
|
||||
|
||||
void table::trigger_logstor_compaction() {
|
||||
for_each_compaction_group([] (compaction_group& cg) {
|
||||
cg.trigger_logstor_compaction();
|
||||
});
|
||||
}
|
||||
|
||||
void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
try {
|
||||
cg.trigger_compaction();
|
||||
@@ -2380,6 +2481,51 @@ void table::try_trigger_compaction(compaction_group& cg) noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
future<> table::flush_separator(std::optional<size_t> seq_num) {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
// wait for all previous writes to be written to a separator buffer
|
||||
co_await get_logstor_segment_manager().await_pending_writes();
|
||||
|
||||
// flush separator buffers
|
||||
co_await parallel_foreach_compaction_group([seq_num] (compaction_group& cg) {
|
||||
return cg.flush_separator(seq_num);
|
||||
});
|
||||
}
|
||||
|
||||
future<logstor::table_segment_stats> table::get_logstor_segment_stats() const {
|
||||
logstor::table_segment_stats result;
|
||||
if (!uses_logstor()) {
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
const auto segment_size = get_logstor_segment_manager().get_segment_size();
|
||||
const auto bucket_count = 32;
|
||||
const auto bucket_size = segment_size / bucket_count;
|
||||
|
||||
result.histogram.resize(bucket_count);
|
||||
|
||||
co_await const_cast<table*>(this)->parallel_foreach_compaction_group([&] (const compaction_group& cg) -> future<> {
|
||||
const auto& cg_segments = cg.logstor_segments();
|
||||
|
||||
result.compaction_group_count++;
|
||||
result.segment_count += cg_segments.segment_count();
|
||||
|
||||
for (const auto& desc : cg_segments._segments) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto data_size = desc.net_data_size(segment_size);
|
||||
auto bucket_index = std::min<size_t>(data_size / bucket_size, bucket_count - 1);
|
||||
auto& bucket = result.histogram[bucket_index];
|
||||
bucket.count++;
|
||||
bucket.max_data_size = std::max(bucket.max_data_size, data_size);
|
||||
}
|
||||
});
|
||||
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void compaction_group::trigger_compaction() {
|
||||
// But not if we're locked out or stopping
|
||||
if (!_async_gate.is_closed()) {
|
||||
@@ -2390,6 +2536,14 @@ void compaction_group::trigger_compaction() {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_group::trigger_logstor_compaction() {
|
||||
if (!_async_gate.is_closed() && !_t.is_auto_compaction_disabled_by_user()) {
|
||||
if (_logstor_segments) {
|
||||
get_logstor_compaction_manager().submit(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void table::trigger_offstrategy_compaction() {
|
||||
// Run in background.
|
||||
// This is safe since the the compaction task is tracked
|
||||
@@ -2846,6 +3000,7 @@ compaction_group::compaction_group(table& t, size_t group_id, dht::token_range t
|
||||
, _async_gate(format("[compaction_group {}.{} {}]", t.schema()->ks_name(), t.schema()->cf_name(), group_id))
|
||||
, _backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
|
||||
, _repair_sstable_classifier(std::move(repair_classifier))
|
||||
, _logstor_segments(make_lw_shared<logstor::segment_set>())
|
||||
{
|
||||
}
|
||||
|
||||
@@ -2879,9 +3034,13 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
for (auto view : all_views()) {
|
||||
co_await _t._compaction_manager.stop_ongoing_compactions(reason, view);
|
||||
}
|
||||
if (_t.uses_logstor()) {
|
||||
co_await get_logstor_compaction_manager().stop_ongoing_compactions(*this);
|
||||
}
|
||||
co_await _async_gate.close();
|
||||
auto flush_future = co_await seastar::coroutine::as_future(flush());
|
||||
|
||||
co_await flush_separator();
|
||||
co_await _flush_gate.close();
|
||||
co_await _sstable_add_gate.close();
|
||||
// FIXME: indentation
|
||||
@@ -3198,7 +3357,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3222,7 +3383,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3255,7 +3416,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3271,7 +3436,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3357,7 +3522,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -4002,6 +4167,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -4009,6 +4175,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -4028,53 +4197,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
@@ -4261,6 +4443,18 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
co_return rp;
|
||||
}
|
||||
|
||||
future<> table::discard_logstor_segments() {
|
||||
if (!uses_logstor()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
_logstor_index->clear();
|
||||
|
||||
co_await parallel_foreach_compaction_group([] (compaction_group& cg) {
|
||||
return cg.discard_logstor_segments();
|
||||
});
|
||||
}
|
||||
|
||||
void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
if (!_readonly) {
|
||||
on_internal_error(dblog, ::format("table {}.{} is already writable", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4271,6 +4465,19 @@ void table::mark_ready_for_writes(db::commitlog* cl) {
|
||||
_readonly = false;
|
||||
}
|
||||
|
||||
void table::init_logstor(logstor::logstor* ls) {
|
||||
_logstor = ls;
|
||||
_logstor_index = std::make_unique<logstor::primary_index>(_schema);
|
||||
}
|
||||
|
||||
size_t table::get_logstor_memory_usage() const {
|
||||
size_t m = 0;
|
||||
if (_logstor_index) {
|
||||
m += _logstor_index->get_memory_usage();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
db::commitlog* table::commitlog() const {
|
||||
if (_readonly) [[unlikely]] {
|
||||
on_internal_error(dblog, ::format("table {}.{} is readonly", _schema->ks_name(), _schema->cf_name()));
|
||||
@@ -4295,6 +4502,9 @@ void table::set_schema(schema_ptr s) {
|
||||
if (_counter_cell_locks) {
|
||||
_counter_cell_locks->set_schema(s);
|
||||
}
|
||||
if (_logstor_index) {
|
||||
_logstor_index->set_schema(s);
|
||||
}
|
||||
_schema = std::move(s);
|
||||
|
||||
for (auto&& v : _views) {
|
||||
@@ -4522,6 +4732,11 @@ future<> table::apply(const mutation& m, db::rp_handle&& h, db::timeout_clock::t
|
||||
|
||||
auto& cg = compaction_group_for_token(m.token());
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m, cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, h = std::move(h), &cg, holder = std::move(holder)] () mutable {
|
||||
do_apply(cg, std::move(h), m);
|
||||
}, timeout);
|
||||
@@ -4537,6 +4752,10 @@ future<> table::apply(const frozen_mutation& m, schema_ptr m_schema, db::rp_hand
|
||||
auto& cg = compaction_group_for_key(m.key(), m_schema);
|
||||
auto holder = cg.async_gate().hold();
|
||||
|
||||
if (_logstor) [[unlikely]] {
|
||||
return _logstor->write(m.unfreeze(m_schema), cg, std::move(holder));
|
||||
}
|
||||
|
||||
return dirty_memory_region_group().run_when_memory_available([this, &m, m_schema = std::move(m_schema), h = std::move(h), &cg, holder = std::move(holder)]() mutable {
|
||||
do_apply(cg, std::move(h), m, m_schema);
|
||||
}, timeout);
|
||||
@@ -4737,6 +4956,10 @@ table::enable_auto_compaction() {
|
||||
// see table::disable_auto_compaction() notes.
|
||||
_compaction_disabled_by_user = false;
|
||||
trigger_compaction();
|
||||
|
||||
if (uses_logstor()) {
|
||||
trigger_logstor_compaction();
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -4768,11 +4991,18 @@ table::disable_auto_compaction() {
|
||||
// - it will break computation of major compaction descriptor
|
||||
// for new submissions
|
||||
_compaction_disabled_by_user = true;
|
||||
return with_gate(_async_gate, [this] {
|
||||
return parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
co_await parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
|
||||
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
|
||||
});
|
||||
|
||||
if (uses_logstor()) {
|
||||
co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
|
||||
return get_logstor_compaction_manager().stop_ongoing_compactions(cg);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void table::set_tombstone_gc_enabled(bool tombstone_gc_enabled) noexcept {
|
||||
@@ -4985,6 +5215,26 @@ const compaction::compaction_manager& compaction_group::get_compaction_manager()
|
||||
return _t.get_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::segment_manager& compaction_group::get_logstor_segment_manager() noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
const logstor::segment_manager& compaction_group::get_logstor_segment_manager() const noexcept {
|
||||
return _t.get_logstor_segment_manager();
|
||||
}
|
||||
|
||||
logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
const logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() const noexcept {
|
||||
return _t.get_logstor_compaction_manager();
|
||||
}
|
||||
|
||||
logstor::primary_index& compaction_group::get_logstor_index() noexcept {
|
||||
return _t.logstor_index();
|
||||
}
|
||||
|
||||
compaction::compaction_group_view& compaction_group::as_view_for_static_sharding() const {
|
||||
return view_for_unrepaired_data();
|
||||
}
|
||||
|
||||
@@ -592,6 +592,7 @@ bool operator==(const schema::user_properties& lhs, const schema::user_propertie
|
||||
&& lhs.compaction_strategy == rhs.compaction_strategy
|
||||
&& lhs.compaction_strategy_options == rhs.compaction_strategy_options
|
||||
&& lhs.compaction_enabled == rhs.compaction_enabled
|
||||
&& lhs.storage_engine == rhs.storage_engine
|
||||
&& lhs.caching_options == rhs.caching_options
|
||||
&& lhs.tablet_options == rhs.tablet_options
|
||||
&& lhs.get_paxos_grace_seconds() == rhs.get_paxos_grace_seconds()
|
||||
@@ -698,6 +699,7 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) {
|
||||
feed_hash(h, r._view_info);
|
||||
feed_hash(h, r._indices_by_name);
|
||||
feed_hash(h, r._is_counter);
|
||||
feed_hash(h, r._props.storage_engine);
|
||||
|
||||
for (auto&& [name, ext] : r._props.extensions) {
|
||||
feed_hash(h, name);
|
||||
@@ -874,6 +876,9 @@ auto fmt::formatter<schema>::format(const schema& s, fmt::format_context& ctx) c
|
||||
out = fmt::format_to(out, ",minIndexInterval={}", s._raw._props.min_index_interval);
|
||||
out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._props.max_index_interval);
|
||||
out = fmt::format_to(out, ",speculativeRetry={}", s._raw._props.speculative_retry.to_sstring());
|
||||
if (s.storage_engine() != storage_engine_type::normal) {
|
||||
out = fmt::format_to(out, ",storage_engine={}", storage_engine_type_to_sstring(s.storage_engine()));
|
||||
}
|
||||
out = fmt::format_to(out, ",tablets={{");
|
||||
if (s._raw._props.tablet_options) {
|
||||
n = 0;
|
||||
@@ -1210,6 +1215,9 @@ fragmented_ostringstream& schema::schema_properties(const schema_describe_helper
|
||||
os << "\n AND memtable_flush_period_in_ms = " << fmt::to_string(memtable_flush_period());
|
||||
os << "\n AND min_index_interval = " << fmt::to_string(min_index_interval());
|
||||
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
|
||||
if (storage_engine() != storage_engine_type::normal) {
|
||||
os << "\n AND storage_engine = '" << storage_engine_type_to_sstring(storage_engine()) << "'";
|
||||
}
|
||||
|
||||
if (has_tablet_options()) {
|
||||
os << "\n AND tablets = {";
|
||||
|
||||
@@ -175,6 +175,21 @@ public:
|
||||
bool operator==(const speculative_retry& other) const = default;
|
||||
};
|
||||
|
||||
enum class storage_engine_type {
|
||||
normal,
|
||||
logstor,
|
||||
};
|
||||
|
||||
inline sstring storage_engine_type_to_sstring(storage_engine_type t) {
|
||||
switch (t) {
|
||||
case storage_engine_type::normal:
|
||||
return "normal";
|
||||
case storage_engine_type::logstor:
|
||||
return "logstor";
|
||||
}
|
||||
throw std::invalid_argument(format("unknown storage engine type: {:d}\n", uint8_t(t)));
|
||||
}
|
||||
|
||||
using index_options_map = std::unordered_map<sstring, sstring>;
|
||||
|
||||
enum class index_metadata_kind {
|
||||
@@ -561,6 +576,7 @@ public:
|
||||
compaction::compaction_strategy_type compaction_strategy = compaction::compaction_strategy_type::incremental;
|
||||
std::map<sstring, sstring> compaction_strategy_options;
|
||||
bool compaction_enabled = true;
|
||||
storage_engine_type storage_engine = storage_engine_type::normal;
|
||||
::caching_options caching_options;
|
||||
std::optional<std::map<sstring, sstring>> tablet_options;
|
||||
|
||||
@@ -776,6 +792,14 @@ public:
|
||||
return _raw._props.compaction_enabled;
|
||||
}
|
||||
|
||||
storage_engine_type storage_engine() const {
|
||||
return _raw._props.storage_engine;
|
||||
}
|
||||
|
||||
bool logstor_enabled() const {
|
||||
return _raw._props.storage_engine == storage_engine_type::logstor;
|
||||
}
|
||||
|
||||
const cdc::options& cdc_options() const {
|
||||
return _raw._props.get_cdc_options();
|
||||
}
|
||||
|
||||
@@ -269,6 +269,11 @@ public:
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
schema_builder& set_logstor() {
|
||||
_raw._props.storage_engine = storage_engine_type::logstor;
|
||||
return *this;
|
||||
}
|
||||
|
||||
class default_names {
|
||||
public:
|
||||
default_names(const schema_builder&);
|
||||
|
||||
@@ -227,8 +227,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
for (const auto& m : modules.entries) {
|
||||
if (m.table == db::system_keyspace::service_levels_v2()->id()) {
|
||||
update_service_levels_cache = true;
|
||||
} else if (m.table == db::system_keyspace::role_members()->id() || m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
} else if (m.table == db::system_keyspace::dicts()->id()) {
|
||||
auto pk_type = db::system_keyspace::dicts()->partition_key_type();
|
||||
auto name_value = pk_type->deserialize_value(m.pk.representation());
|
||||
@@ -247,6 +245,11 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
auto cdc_log_table_id = table_id(value_cast<utils::UUID>(uuid_type->deserialize_value(elements.front())));
|
||||
update_cdc_streams.insert(cdc_log_table_id);
|
||||
} else if (auth::cache::includes_table(m.table)) {
|
||||
if (m.table == db::system_keyspace::role_members()->id() ||
|
||||
m.table == db::system_keyspace::role_attributes()->id()) {
|
||||
update_service_levels_effective_cache = true;
|
||||
}
|
||||
|
||||
auto schema = _ss.get_database().find_schema(m.table);
|
||||
const auto elements = m.pk.explode(*schema);
|
||||
auto role = value_cast<sstring>(schema->partition_key_type()->
|
||||
@@ -255,6 +258,9 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
}
|
||||
}
|
||||
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
if (update_service_levels_cache || update_service_levels_effective_cache) { // this also updates SL effective cache
|
||||
co_await _ss.update_service_levels_cache(qos::update_both_cache_levels(update_service_levels_cache), qos::query_context::group0);
|
||||
}
|
||||
@@ -264,9 +270,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
|
||||
if (update_cdc_streams.size()) {
|
||||
co_await _ss.load_cdc_streams(std::move(update_cdc_streams));
|
||||
}
|
||||
if (update_auth_cache_roles.size()) {
|
||||
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
|
||||
}
|
||||
}
|
||||
|
||||
future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merger) {
|
||||
|
||||
@@ -4653,6 +4653,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
auto& stats = handler_ptr->stats();
|
||||
auto& handler = *handler_ptr;
|
||||
auto& global_stats = handler._proxy->_global_stats;
|
||||
auto schema = handler_ptr->get_schema();
|
||||
|
||||
if (handler.get_targets().size() == 0) {
|
||||
// Usually we remove the response handler when receiving responses from all targets.
|
||||
@@ -4748,7 +4749,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
}
|
||||
|
||||
// Waited on indirectly.
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats] (std::exception_ptr eptr) {
|
||||
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats, schema] (std::exception_ptr eptr) {
|
||||
++stats.writes_errors.get_ep_stat(handler_ptr->_effective_replication_map_ptr->get_topology(), coordinator);
|
||||
error err = error::FAILURE;
|
||||
std::optional<sstring> msg;
|
||||
@@ -4762,8 +4763,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
// ignore, disconnect will be logged by gossiper
|
||||
} else if (const auto* e = try_catch_nested<seastar::gate_closed_exception>(eptr)) {
|
||||
// may happen during shutdown, log and ignore it
|
||||
slogger.warn("gate_closed_exception during mutation write to {}: {}",
|
||||
coordinator, e->what());
|
||||
slogger.warn("gate_closed_exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, e->what());
|
||||
} else if (try_catch<timed_out_error>(eptr)) {
|
||||
// from lmutate(). Ignore so that logs are not flooded
|
||||
// database total_writes_timedout counter was incremented.
|
||||
@@ -4774,7 +4775,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
} else if (auto* e = try_catch<replica::critical_disk_utilization_exception>(eptr)) {
|
||||
msg = e->what();
|
||||
} else {
|
||||
slogger.error("exception during mutation write to {}: {}", coordinator, eptr);
|
||||
slogger.error("exception during mutation write to {}.{} on {}: {}",
|
||||
schema->ks_name(), schema->cf_name(), coordinator, eptr);
|
||||
}
|
||||
p->got_failure_response(response_id, coordinator, forward_size + 1, std::nullopt, err, std::move(msg));
|
||||
});
|
||||
|
||||
@@ -3026,6 +3026,8 @@ future<> storage_service::drain() {
|
||||
}
|
||||
|
||||
future<> storage_service::do_drain() {
|
||||
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
|
||||
|
||||
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
|
||||
co_await stop_transport();
|
||||
|
||||
@@ -4016,6 +4018,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
} catch (raft::request_aborted& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (seastar::gate_closed_exception& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (...) {
|
||||
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
|
||||
table, std::current_exception(), split_retry.sleep_time());
|
||||
@@ -4082,6 +4087,58 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::local_topology_barrier() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -4109,12 +4166,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -4153,44 +4204,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
co_await local_topology_barrier();
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
|
||||
@@ -813,6 +813,9 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to go die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -195,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -312,7 +312,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
@@ -326,7 +326,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2229,6 +2229,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
_tablet_allocator.set_load_stats(reconciled_stats);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the background storage group merge to finish before releasing the state machine.
|
||||
// Background merge holds the old erm, so a successful barrier joins with it.
|
||||
// This guarantees that the background merge doesn't run concurrently with the next merge.
|
||||
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
|
||||
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
|
||||
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
|
||||
// by the background merge fiber.
|
||||
tm = nullptr;
|
||||
if (!guard) {
|
||||
guard = co_await start_operation();
|
||||
}
|
||||
co_await global_tablet_token_metadata_barrier(std::move(guard));
|
||||
}
|
||||
|
||||
using get_table_ids_func = std::function<std::unordered_set<table_id>(const db::system_keyspace::topology_requests_entry&)>;
|
||||
|
||||
@@ -201,95 +201,49 @@ public:
|
||||
virtual future<std::optional<entry_info>> next_entry() = 0;
|
||||
};
|
||||
|
||||
// Allocated inside LSA.
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint64_t _promoted_index_start;
|
||||
uint32_t _promoted_index_size;
|
||||
uint32_t _num_blocks;
|
||||
public:
|
||||
promoted_index(const schema& s,
|
||||
deletion_time del_time,
|
||||
uint64_t promoted_index_start,
|
||||
uint32_t promoted_index_size,
|
||||
uint32_t num_blocks)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_start(promoted_index_start)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _num_blocks(num_blocks)
|
||||
{ }
|
||||
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
|
||||
// Call under allocating_section.
|
||||
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
|
||||
reader_permit,
|
||||
tracing::trace_state_ptr,
|
||||
file_input_stream_options,
|
||||
use_caching);
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
using promoted_index = parsed_promoted_index_entry;
|
||||
|
||||
// A partition index element.
|
||||
// Allocated inside LSA.
|
||||
class index_entry {
|
||||
private:
|
||||
managed_bytes _key;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
managed_ref<promoted_index> _index;
|
||||
struct [[gnu::packed]] index_entry {
|
||||
mutable int64_t raw_token;
|
||||
uint64_t data_file_offset;
|
||||
uint32_t key_offset;
|
||||
|
||||
public:
|
||||
|
||||
key_view get_key() const {
|
||||
return key_view{_key};
|
||||
}
|
||||
|
||||
// May allocate so must be called under allocating_section.
|
||||
decorated_key_view get_decorated_key(const schema& s) const {
|
||||
if (!_token) {
|
||||
_token.emplace(s.get_partitioner().get_token(get_key()));
|
||||
}
|
||||
return decorated_key_view(*_token, get_key());
|
||||
}
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
|
||||
// Can be nullptr
|
||||
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
|
||||
managed_ref<promoted_index>& get_promoted_index() { return _index; }
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
return _key.external_memory_usage() + _index.external_memory_usage();
|
||||
}
|
||||
uint64_t position() const { return data_file_offset; }
|
||||
dht::raw_token token() const { return dht::raw_token(raw_token); }
|
||||
};
|
||||
|
||||
// Required for optimized LSA migration of storage of managed_vector.
|
||||
static_assert(std::is_trivially_move_assignable_v<index_entry>);
|
||||
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
|
||||
|
||||
// A partition index page.
|
||||
//
|
||||
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
|
||||
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
|
||||
class partition_index_page {
|
||||
public:
|
||||
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
|
||||
lsa::chunked_managed_vector<index_entry> _entries;
|
||||
managed_bytes _key_storage;
|
||||
|
||||
// Stores promoted index information of index entries.
|
||||
// The i-th element corresponds to the i-th entry in _entries.
|
||||
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
|
||||
// that entry doesn't have a promoted index.
|
||||
// It's not chunked, because promoted index is present only when there are large partitions in the page,
|
||||
// which also means the page will have typically only 1 entry due to summary:data_file size ratio.
|
||||
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
|
||||
// which is typical in workloads with small partitions.
|
||||
managed_vector<promoted_index> _promoted_indexes;
|
||||
public:
|
||||
partition_index_page() = default;
|
||||
partition_index_page(partition_index_page&&) noexcept = default;
|
||||
@@ -298,15 +252,68 @@ public:
|
||||
bool empty() const { return _entries.empty(); }
|
||||
size_t size() const { return _entries.size(); }
|
||||
|
||||
stop_iteration clear_gently() {
|
||||
// Vectors have trivial storage, so are fast to destroy.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void clear_one_entry() {
|
||||
_entries.pop_back();
|
||||
}
|
||||
|
||||
bool has_promoted_index(size_t i) const {
|
||||
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
const promoted_index& get_promoted_index(size_t i) const {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
promoted_index& get_promoted_index(size_t i) {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index size for the i-th entry.
|
||||
uint32_t get_promoted_index_size(size_t i) const {
|
||||
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
|
||||
}
|
||||
|
||||
/// Get deletion_time for partition represented by the i-th entry.
|
||||
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
|
||||
/// It has to be read from the data file.
|
||||
std::optional<deletion_time> get_deletion_time(size_t i) const {
|
||||
if (has_promoted_index(i)) {
|
||||
return get_promoted_index(i).del_time;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
key_view get_key(size_t i) const {
|
||||
auto start = _entries[i].key_offset;
|
||||
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
|
||||
auto v = managed_bytes_view(_key_storage).prefix(end);
|
||||
v.remove_prefix(start);
|
||||
return key_view(v);
|
||||
}
|
||||
|
||||
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
|
||||
auto key = get_key(i);
|
||||
auto t = _entries[i].token();
|
||||
if (!t) {
|
||||
t = dht::raw_token(s.get_partitioner().get_token(key));
|
||||
_entries[i].raw_token = t.value;
|
||||
}
|
||||
return decorated_key_view(dht::token(t), key);
|
||||
}
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
size_t size = _entries.external_memory_usage();
|
||||
for (auto&& e : _entries) {
|
||||
size += sizeof(index_entry) + e->external_memory_usage();
|
||||
}
|
||||
size += _promoted_indexes.external_memory_usage();
|
||||
size += _key_storage.external_memory_usage();
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,14 +25,6 @@ namespace sstables {
|
||||
extern seastar::logger sstlog;
|
||||
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Partition index entry information produced by the parser.
|
||||
struct parsed_partition_index_entry {
|
||||
temporary_buffer<char> key;
|
||||
@@ -53,9 +45,10 @@ class index_consumer {
|
||||
schema_ptr _s;
|
||||
logalloc::allocating_section _alloc_section;
|
||||
logalloc::region& _region;
|
||||
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
|
||||
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
|
||||
size_t _key_storage_size = 0;
|
||||
public:
|
||||
index_list indexes;
|
||||
|
||||
index_consumer(logalloc::region& r, schema_ptr s)
|
||||
: _s(s)
|
||||
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
|
||||
@@ -64,36 +57,63 @@ public:
|
||||
, _region(r)
|
||||
{ }
|
||||
|
||||
~index_consumer() {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.clear_and_release();
|
||||
});
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_key_storage_size += e.key.size();
|
||||
_parsed_entries.emplace_back(std::move(e));
|
||||
if (e.promoted_index) {
|
||||
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
|
||||
}
|
||||
}
|
||||
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_alloc_section(_region, [&] {
|
||||
future<index_list> finalize() {
|
||||
index_list result;
|
||||
// In case of exception, need to deallocate under region allocator.
|
||||
auto delete_result = seastar::defer([&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
managed_ref<promoted_index> pi;
|
||||
if (e.promoted_index) {
|
||||
pi = make_managed<promoted_index>(*_s,
|
||||
e.promoted_index->del_time,
|
||||
e.promoted_index->promoted_index_start,
|
||||
e.promoted_index->promoted_index_size,
|
||||
e.promoted_index->num_blocks);
|
||||
}
|
||||
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
|
||||
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
|
||||
result._entries = {};
|
||||
result._promoted_indexes = {};
|
||||
result._key_storage = {};
|
||||
});
|
||||
});
|
||||
auto i = _parsed_entries.begin();
|
||||
size_t key_offset = 0;
|
||||
while (i != _parsed_entries.end()) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries.reserve(_parsed_entries.size());
|
||||
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
|
||||
if (result._key_storage.empty()) {
|
||||
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
|
||||
}
|
||||
managed_bytes_mutable_view key_out(result._key_storage);
|
||||
key_out.remove_prefix(key_offset);
|
||||
while (i != _parsed_entries.end()) {
|
||||
parsed_partition_index_entry& e = *i;
|
||||
if (e.promoted_index) {
|
||||
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
|
||||
}
|
||||
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
|
||||
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
|
||||
++i;
|
||||
key_offset += e.key.size();
|
||||
if (need_preempt()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
delete_result.cancel();
|
||||
_parsed_entries.clear();
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void prepare(uint64_t size) {
|
||||
_alloc_section = logalloc::allocating_section();
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.reserve(size);
|
||||
});
|
||||
});
|
||||
_max_promoted_index_entry_plus_one = 0;
|
||||
_key_storage_size = 0;
|
||||
_parsed_entries.clear();
|
||||
_parsed_entries.reserve(size);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -198,10 +218,14 @@ public:
|
||||
|
||||
switch (_state) {
|
||||
// START comes first, to make the handling of the 0-quantity case simpler
|
||||
state_START:
|
||||
case state::START:
|
||||
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
|
||||
_state = state::KEY_SIZE;
|
||||
break;
|
||||
if (data.size() == 0) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case state::KEY_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
|
||||
_entry_offset = current_pos();
|
||||
@@ -227,7 +251,16 @@ public:
|
||||
case state::PROMOTED_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
|
||||
_position = this->_u64;
|
||||
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
|
||||
data.trim_front(1);
|
||||
_consumer.consume_entry(parsed_partition_index_entry{
|
||||
.key = std::move(_key),
|
||||
.data_file_offset = _position,
|
||||
.index_offset = _entry_offset,
|
||||
.promoted_index = std::nullopt
|
||||
});
|
||||
goto state_START;
|
||||
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PARTITION_HEADER_LENGTH_1;
|
||||
break;
|
||||
}
|
||||
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
|
||||
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
|
||||
}
|
||||
|
||||
inline
|
||||
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
_promoted_index_start, _promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
class index_comparator {
|
||||
dht::ring_position_comparator_for_sstables _tri_cmp;
|
||||
@@ -376,27 +382,17 @@ public:
|
||||
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
|
||||
return operator()(*e, rp);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
|
||||
return operator()(rp, *e);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
|
||||
dht::ring_position_comparator_for_sstables tri_cmp(s);
|
||||
return tri_cmp(page.get_decorated_key(s, idx), rp);
|
||||
}
|
||||
|
||||
// Contains information about index_reader position in the index file
|
||||
struct index_bound {
|
||||
index_bound() = default;
|
||||
@@ -537,7 +533,7 @@ private:
|
||||
if (ex) {
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
}
|
||||
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
|
||||
return bound.consumer->finalize();
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -550,17 +546,18 @@ private:
|
||||
if (bound.current_list->empty()) {
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
|
||||
if (sstlog.is_enabled(seastar::log_level::trace)) {
|
||||
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
|
||||
logalloc::reclaim_lock rl(_region);
|
||||
for (auto&& e : bound.current_list->_entries) {
|
||||
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
|
||||
auto& e = bound.current_list->_entries[i];
|
||||
auto dk = dht::decorate_key(*_sstable->_schema,
|
||||
e->get_key().to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e->position());
|
||||
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e.position());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,7 +601,13 @@ private:
|
||||
// Valid if partition_data_ready(bound)
|
||||
index_entry& current_partition_entry(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list->_entries[bound.current_index_idx];
|
||||
return bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
// Valid if partition_data_ready(bound)
|
||||
partition_index_page& current_page(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list;
|
||||
}
|
||||
|
||||
future<> advance_to_next_partition(index_bound& bound) {
|
||||
@@ -617,7 +620,7 @@ private:
|
||||
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
||||
++bound.current_index_idx;
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
return reset_clustered_cursor(bound);
|
||||
@@ -680,9 +683,13 @@ private:
|
||||
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
|
||||
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
|
||||
auto i = _alloc_section(_region, [&] {
|
||||
auto& entries = bound.current_list->_entries;
|
||||
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
|
||||
index_comparator(*_sstable->_schema));
|
||||
auto& page = *bound.current_list;
|
||||
auto& s = *_sstable->_schema;
|
||||
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
|
||||
auto it = std::ranges::partition_point(r, [&] (int idx) {
|
||||
return index_entry_tri_cmp(s, page, idx, pos) < 0;
|
||||
});
|
||||
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
|
||||
});
|
||||
// i is valid until next allocation point
|
||||
auto& entries = bound.current_list->_entries;
|
||||
@@ -697,7 +704,7 @@ private:
|
||||
}
|
||||
bound.current_index_idx = std::distance(std::begin(entries), i);
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = (*i)->position();
|
||||
bound.data_file_position = (*i).position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
|
||||
@@ -800,6 +807,34 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
|
||||
shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
pi.promoted_index_start, pi.promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() override {
|
||||
@@ -835,10 +870,10 @@ public:
|
||||
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
|
||||
if (!bound.clustered_cursor) {
|
||||
_alloc_section(_region, [&] {
|
||||
index_entry& e = current_partition_entry(bound);
|
||||
promoted_index* pi = e.get_promoted_index().get();
|
||||
if (pi) {
|
||||
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
|
||||
partition_index_page& page = current_page(bound);
|
||||
if (page.has_promoted_index(bound.current_index_idx)) {
|
||||
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
|
||||
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
|
||||
get_file_input_stream_options(), _use_caching);
|
||||
}
|
||||
});
|
||||
@@ -861,15 +896,15 @@ public:
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() override {
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<partition_key> get_partition_key() override {
|
||||
return _alloc_section(_region, [this] {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key().to_partition_key(*_sstable->_schema);
|
||||
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
|
||||
.to_partition_key(*_sstable->_schema);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -883,8 +918,8 @@ public:
|
||||
// Returns the number of promoted index entries for the current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
uint64_t get_promoted_index_size() {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_promoted_index_size();
|
||||
partition_index_page& page = current_page(_lower_bound);
|
||||
return page.get_promoted_index_size(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
bool partition_data_ready() const override {
|
||||
@@ -975,9 +1010,9 @@ public:
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_partition_data().then([this, key] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
bool found = _alloc_section(_region, [&] {
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
auto& page = current_page(_lower_bound);
|
||||
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
|
||||
});
|
||||
return make_ready_future<bool>(found);
|
||||
});
|
||||
|
||||
@@ -189,10 +189,11 @@ public:
|
||||
{}
|
||||
future<std::optional<directory_entry>> get() override {
|
||||
std::filesystem::path dir(_prefix);
|
||||
do {
|
||||
while (true) {
|
||||
if (_pos == _info.size()) {
|
||||
_info.clear();
|
||||
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
|
||||
_pos = 0;
|
||||
}
|
||||
if (_info.empty()) {
|
||||
break;
|
||||
@@ -203,7 +204,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
co_return ent;
|
||||
} while (false);
|
||||
}
|
||||
|
||||
co_return std::nullopt;
|
||||
}
|
||||
@@ -276,7 +277,7 @@ public:
|
||||
co_await f.close();
|
||||
|
||||
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
|
||||
co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
|
||||
co_await _client->merge_objects(bucket, object, names, {}, as);
|
||||
|
||||
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
|
||||
co_await _client->delete_object(bucket, name);
|
||||
|
||||
@@ -257,14 +257,11 @@ public:
|
||||
while (partial_page || i != _cache.end()) {
|
||||
if (partial_page) {
|
||||
auto preempted = with_allocator(_region.allocator(), [&] {
|
||||
while (!partial_page->empty()) {
|
||||
partial_page->clear_one_entry();
|
||||
if (need_preempt()) {
|
||||
return true;
|
||||
}
|
||||
while (partial_page->clear_gently() != stop_iteration::yes) {
|
||||
return true;
|
||||
}
|
||||
partial_page.reset();
|
||||
return false;
|
||||
return need_preempt();
|
||||
});
|
||||
if (preempted) {
|
||||
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
||||
|
||||
@@ -1132,7 +1132,6 @@ public:
|
||||
|
||||
friend class mc::writer;
|
||||
friend class index_reader;
|
||||
friend class promoted_index;
|
||||
friend class sstables_manager;
|
||||
template <typename DataConsumeRowsContext>
|
||||
friend future<std::unique_ptr<DataConsumeRowsContext>>
|
||||
|
||||
@@ -180,18 +180,11 @@ storage_manager::config_updater::config_updater(const db::config& cfg, storage_m
|
||||
{}
|
||||
|
||||
sstables::sstable::version_types sstables_manager::get_highest_supported_format() const noexcept {
|
||||
// FIXME: start announcing `ms` here after it becomes the default.
|
||||
// (There are several tests which expect that new sstables are written with
|
||||
// the format reported by this API).
|
||||
//
|
||||
// After `ms` becomes the default, this function look like this:
|
||||
//
|
||||
// if (_features.ms_sstable) {
|
||||
// return sstable_version_types::ms;
|
||||
// } else {
|
||||
// return sstable_version_types::me;
|
||||
// }
|
||||
return sstable_version_types::me;
|
||||
if (_features.ms_sstable) {
|
||||
return sstable_version_types::ms;
|
||||
} else {
|
||||
return sstable_version_types::me;
|
||||
}
|
||||
}
|
||||
|
||||
sstables::sstable::version_types sstables_manager::get_preferred_sstable_version() const {
|
||||
|
||||
@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
|
||||
: _module(std::move(module))
|
||||
{}
|
||||
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
|
||||
auto ms = module->get_task_manager()._messaging;
|
||||
if (!ms) {
|
||||
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
|
||||
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
|
||||
tmlogger.info("tasks_vt_get_children: waiting");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
|
||||
});
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
if (is_host_alive(host_id)) {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<utils::chunked_vector<task_identity>>();
|
||||
}
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
|
||||
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
|
||||
return utils::chunked_vector<task_identity>{};
|
||||
});
|
||||
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
|
||||
std::move(b.begin(), b.end(), std::back_inserter(a));
|
||||
return a;
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "db_clock.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
@@ -282,7 +283,7 @@ public:
|
||||
impl& operator=(impl&&) = delete;
|
||||
virtual ~impl() = default;
|
||||
protected:
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
|
||||
public:
|
||||
virtual task_group get_group() const noexcept = 0;
|
||||
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.
|
||||
|
||||
@@ -469,18 +469,6 @@ def test_get_records_nonexistent_iterator(dynamodbstreams):
|
||||
# not allowed (see test_streams_change_type), and while removing and re-adding
|
||||
# a stream is possible, it is very slow. So we create four different fixtures
|
||||
# with the four different StreamViewType settings for these four fixtures.
|
||||
#
|
||||
# It turns out that DynamoDB makes reusing the same table in different tests
|
||||
# very difficult, because when we request a "LATEST" iterator we sometimes
|
||||
# miss the immediately following write (this issue doesn't happen in
|
||||
# ALternator, just in DynamoDB - presumably LATEST adds some time slack?)
|
||||
# So all the fixtures we create below have scope="function", meaning that a
|
||||
# separate table is created for each of the tests using these fixtures. This
|
||||
# slows the tests down a bit, but not by much (about 0.05 seconds per test).
|
||||
# It is still worthwhile to use a fixture rather than to create a table
|
||||
# explicitly - it is convenient, safe (the table gets deleted automatically)
|
||||
# and if in the future we can work around the DynamoDB problem, we can return
|
||||
# these fixtures to module scope.
|
||||
|
||||
@contextmanager
|
||||
def create_table_ss(dynamodb, dynamodbstreams, type):
|
||||
@@ -524,43 +512,43 @@ def create_table_s_no_ck(dynamodb, dynamodbstreams, type):
|
||||
yield table, arn
|
||||
table.delete()
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_sss_new_and_old_images_lsi(dynamodb, dynamodbstreams):
|
||||
yield from create_table_sss_lsi(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES')
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ss_keys_only(dynamodb, dynamodbstreams):
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as stream:
|
||||
yield stream
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ss_new_image(dynamodb, dynamodbstreams):
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_IMAGE') as stream:
|
||||
yield stream
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ss_old_image(dynamodb, dynamodbstreams):
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'OLD_IMAGE') as stream:
|
||||
yield stream
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ss_new_and_old_images(dynamodb, dynamodbstreams):
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES') as stream:
|
||||
yield stream
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_s_no_ck_keys_only(dynamodb, dynamodbstreams):
|
||||
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'KEYS_ONLY')
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_s_no_ck_new_image(dynamodb, dynamodbstreams):
|
||||
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'NEW_IMAGE')
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_s_no_ck_old_image(dynamodb, dynamodbstreams):
|
||||
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'OLD_IMAGE')
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_s_no_ck_new_and_old_images(dynamodb, dynamodbstreams):
|
||||
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES')
|
||||
|
||||
@@ -626,13 +614,30 @@ def list_shards(dynamodbstreams, arn):
|
||||
|
||||
# Utility function for getting shard iterators starting at "LATEST" for
|
||||
# all the shards of the given stream arn.
|
||||
# On DynamoDB (but not Alternator), LATEST has a time slack: it may point to
|
||||
# a position slightly before the true end of the stream, so writes from a
|
||||
# previous test that reused the same table can appear to be "in the future"
|
||||
# relative to the returned iterators and therefore show up unexpectedly in
|
||||
# the current test's reads. To work around this we drain any already-pending
|
||||
# records from the iterators before returning them, so the caller is
|
||||
# guaranteed to see only events written *after* this call returns.
|
||||
def latest_iterators(dynamodbstreams, arn):
|
||||
iterators = []
|
||||
for shard_id in list_shards(dynamodbstreams, arn):
|
||||
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator'])
|
||||
assert len(set(iterators)) == len(iterators)
|
||||
return iterators
|
||||
# Drain any records that are already visible at the LATEST position.
|
||||
# We keep fetching until no more records are returned, which means that
|
||||
# the stream is caught up. This drain loop is not necessary on Alternator,
|
||||
# and needlessly slows the test down.
|
||||
if not dynamodbstreams._endpoint.host.endswith('.amazonaws.com'):
|
||||
return iterators
|
||||
while True:
|
||||
events = []
|
||||
iterators = fetch_more(dynamodbstreams, iterators, events)
|
||||
if events == []:
|
||||
return iterators
|
||||
|
||||
# Similar to latest_iterators(), just also returns the shard id which produced
|
||||
# each iterator.
|
||||
@@ -641,7 +646,16 @@ def shards_and_latest_iterators(dynamodbstreams, arn):
|
||||
for shard_id in list_shards(dynamodbstreams, arn):
|
||||
shards_and_iterators.append((shard_id, dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator']))
|
||||
return shards_and_iterators
|
||||
# Drain pre-existing records from the iterators, for the same reason as
|
||||
# explained in latest_iterators() above.
|
||||
if not dynamodbstreams._endpoint.host.endswith('.amazonaws.com'):
|
||||
return shards_and_iterators
|
||||
while True:
|
||||
events = []
|
||||
new_iters = fetch_more(dynamodbstreams, [it for _, it in shards_and_iterators], events)
|
||||
shards_and_iterators = list(zip([sh for sh, _ in shards_and_iterators], new_iters))
|
||||
if events == []:
|
||||
return shards_and_iterators
|
||||
|
||||
# Utility function for fetching more content from the stream (given its
|
||||
# array of iterators) into an "output" array. Call repeatedly to get more
|
||||
@@ -806,9 +820,11 @@ def fetch_and_compare_events(dynamodb, dynamodbstreams, iterators, expected_even
|
||||
# function "updatefunc" which is supposed to do some updates to the table
|
||||
# and also return an expected_events list. do_test() then fetches the streams
|
||||
# data and compares it to the expected_events using compare_events().
|
||||
def do_test(test_table_ss_stream, dynamodb, dynamodbstreams, updatefunc, mode, p = random_string(), c = random_string()):
|
||||
def do_test(test_table_ss_stream, dynamodb, dynamodbstreams, updatefunc, mode):
|
||||
table, arn = test_table_ss_stream
|
||||
iterators = latest_iterators(dynamodbstreams, arn)
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
expected_events = updatefunc(table, p, c)
|
||||
fetch_and_compare_events(dynamodb, dynamodbstreams, iterators, expected_events, mode)
|
||||
|
||||
@@ -956,7 +972,7 @@ def test_streams_updateitem_old_image_empty_item(test_table_ss_old_image, dynamo
|
||||
# columns they are only included in the preimage if they change.
|
||||
# Currently fails in Alternator because the item's key is missing in
|
||||
# OldImage (#6935) and the LSI key is also missing (#7030).
|
||||
@pytest.fixture(scope="function")
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ss_old_image_and_lsi(dynamodb, dynamodbstreams):
|
||||
table = create_test_table(dynamodb,
|
||||
Tags=TAGS,
|
||||
@@ -1357,49 +1373,48 @@ def test_streams_after_sequence_number(test_table_ss_keys_only, dynamodbstreams)
|
||||
|
||||
# Test the "TRIM_HORIZON" iterator, which can be used to re-read *all* the
|
||||
# previously-read events of the stream shard again.
|
||||
# NOTE: This test relies on the test_table_ss_keys_only fixture giving us a
|
||||
# brand new stream, with no old events saved from other tests. If we ever
|
||||
# change this, we should change this test to use a different fixture.
|
||||
def test_streams_trim_horizon(test_table_ss_keys_only, dynamodbstreams):
|
||||
table, arn = test_table_ss_keys_only
|
||||
shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
|
||||
# Do two UpdateItem operations to the same key, that are expected to leave
|
||||
# two events in the stream.
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
# Eventually, *one* of the stream shards will return the two events:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for (shard_id, iter) in shards_and_iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and len(response['Records']) == 2:
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
sequence_number_1 = response['Records'][0]['dynamodb']['SequenceNumber']
|
||||
sequence_number_2 = response['Records'][1]['dynamodb']['SequenceNumber']
|
||||
# If we use the TRIM_HORIZON iterator, we should receive the
|
||||
# same two events again, in the same order.
|
||||
# Note that we assume that the fixture gave us a brand new
|
||||
# stream, with no old events saved from other tests. If we
|
||||
# couldn't assume this, this test would need to become much
|
||||
# more complex, and would need to read from this shard until
|
||||
# we find the two events we are looking for.
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
|
||||
def test_streams_trim_horizon(dynamodb, dynamodbstreams):
|
||||
# This test needs a brand-new stream, without old data from other
|
||||
# tests, so we can't reuse the test_table_ss_keys_only fixture.
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
|
||||
shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
|
||||
# Do two UpdateItem operations to the same key, that are expected to leave
|
||||
# two events in the stream.
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
# Eventually, *one* of the stream shards will return the two events:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for (shard_id, iter) in shards_and_iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
assert 'Records' in response
|
||||
assert len(response['Records']) == 2
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][0]['dynamodb']['SequenceNumber'] == sequence_number_1
|
||||
assert response['Records'][1]['dynamodb']['SequenceNumber'] == sequence_number_2
|
||||
return
|
||||
time.sleep(0.5)
|
||||
pytest.fail("timed out")
|
||||
if 'Records' in response and len(response['Records']) == 2:
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
sequence_number_1 = response['Records'][0]['dynamodb']['SequenceNumber']
|
||||
sequence_number_2 = response['Records'][1]['dynamodb']['SequenceNumber']
|
||||
# If we use the TRIM_HORIZON iterator, we should receive the
|
||||
# same two events again, in the same order.
|
||||
# Note that we assume that the fixture gave us a brand new
|
||||
# stream, with no old events saved from other tests. If we
|
||||
# couldn't assume this, this test would need to become much
|
||||
# more complex, and would need to read from this shard until
|
||||
# we find the two events we are looking for.
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
assert 'Records' in response
|
||||
assert len(response['Records']) == 2
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][0]['dynamodb']['SequenceNumber'] == sequence_number_1
|
||||
assert response['Records'][1]['dynamodb']['SequenceNumber'] == sequence_number_2
|
||||
return
|
||||
time.sleep(0.5)
|
||||
pytest.fail("timed out")
|
||||
|
||||
# Test the StartingSequenceNumber information returned by DescribeStream.
|
||||
# The DynamoDB documentation explains that StartingSequenceNumber is
|
||||
@@ -1414,45 +1429,47 @@ def test_streams_trim_horizon(test_table_ss_keys_only, dynamodbstreams):
|
||||
# that the important thing is that reading a shard starting at
|
||||
# StartingSequenceNumber will result in reading all the available items -
|
||||
# similar to how TRIM_HORIZON works. This is what the following test verifies.
|
||||
def test_streams_starting_sequence_number(test_table_ss_keys_only, dynamodbstreams):
|
||||
table, arn = test_table_ss_keys_only
|
||||
# Do two UpdateItem operations to the same key, that are expected to leave
|
||||
# two events in the stream.
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
# Get for all the stream shards the iterator starting at the shard's
|
||||
# StartingSequenceNumber:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn)
|
||||
shards = response['StreamDescription']['Shards']
|
||||
while 'LastEvaluatedShardId' in response['StreamDescription']:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn,
|
||||
ExclusiveStartShardId=response['StreamDescription']['LastEvaluatedShardId'])
|
||||
shards.extend(response['StreamDescription']['Shards'])
|
||||
iterators = []
|
||||
for shard in shards:
|
||||
shard_id = shard['ShardId']
|
||||
start = shard['SequenceNumberRange']['StartingSequenceNumber']
|
||||
assert start.isdecimal()
|
||||
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='AT_SEQUENCE_NUMBER',
|
||||
SequenceNumber=start)['ShardIterator'])
|
||||
def test_streams_starting_sequence_number(dynamodb, dynamodbstreams):
|
||||
# This test needs a brand-new stream, without old data from other
|
||||
# tests, so we can't reuse the test_table_ss_keys_only fixture.
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
|
||||
# Do two UpdateItem operations to the same key, that are expected to leave
|
||||
# two events in the stream.
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
|
||||
table.update_item(Key={'p': p, 'c': c},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
# Get for all the stream shards the iterator starting at the shard's
|
||||
# StartingSequenceNumber:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn)
|
||||
shards = response['StreamDescription']['Shards']
|
||||
while 'LastEvaluatedShardId' in response['StreamDescription']:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn,
|
||||
ExclusiveStartShardId=response['StreamDescription']['LastEvaluatedShardId'])
|
||||
shards.extend(response['StreamDescription']['Shards'])
|
||||
iterators = []
|
||||
for shard in shards:
|
||||
shard_id = shard['ShardId']
|
||||
start = shard['SequenceNumberRange']['StartingSequenceNumber']
|
||||
assert start.isdecimal()
|
||||
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='AT_SEQUENCE_NUMBER',
|
||||
SequenceNumber=start)['ShardIterator'])
|
||||
|
||||
# Eventually, *one* of the stream shards will return the two events:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and len(response['Records']) == 2:
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
return
|
||||
time.sleep(0.5)
|
||||
# Eventually, *one* of the stream shards will return the two events:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and len(response['Records']) == 2:
|
||||
assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
|
||||
return
|
||||
time.sleep(0.5)
|
||||
|
||||
pytest.fail("timed out")
|
||||
pytest.fail("timed out")
|
||||
|
||||
# Above we tested some specific operations in small tests aimed to reproduce
|
||||
# a specific bug, in the following tests we do a all the different operations,
|
||||
@@ -1746,50 +1763,49 @@ def test_stream_specification(test_table_stream_with_result, dynamodbstreams):
|
||||
# that the right answer is that NextShardIterator should be *missing*
|
||||
# (reproduces issue #7237).
|
||||
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
|
||||
def test_streams_closed_read(test_table_ss_keys_only, dynamodbstreams):
|
||||
table, arn = test_table_ss_keys_only
|
||||
shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
|
||||
# Do an UpdateItem operation that is expected to leave one event in the
|
||||
# stream.
|
||||
table.update_item(Key={'p': random_string(), 'c': random_string()},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
# Disable streaming for this table. Note that the test_table_ss_keys_only
|
||||
# fixture has "function" scope so it is fine to ruin table, it will not
|
||||
# be used in other tests.
|
||||
disable_stream(dynamodbstreams, table)
|
||||
def test_streams_closed_read(dynamodb, dynamodbstreams):
|
||||
# This test can't use the shared table test_table_ss_keys_only,
|
||||
# because it wants to disable streaming, so let's create a new table:
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
|
||||
shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
|
||||
# Do an UpdateItem operation that is expected to leave one event in the
|
||||
# stream.
|
||||
table.update_item(Key={'p': random_string(), 'c': random_string()},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
disable_stream(dynamodbstreams, table)
|
||||
|
||||
# Even after streaming is disabled for the table, we can still read
|
||||
# from the earlier stream (it is guaranteed to work for 24 hours).
|
||||
# The iterators we got earlier should still be fully usable, and
|
||||
# eventually *one* of the stream shards will return one event:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for (shard_id, iter) in shards_and_iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and response['Records'] != []:
|
||||
# Found the shard with the data! Test that it only has
|
||||
# one event. NextShardIterator should either be missing now,
|
||||
# indicating that it is a closed shard (DynamoDB does this),
|
||||
# or, it may (and currently does in Alternator) return another
|
||||
# and reading from *that* iterator should then tell us that
|
||||
# we reached the end of the shard (i.e., zero results and
|
||||
# missing NextShardIterator).
|
||||
assert len(response['Records']) == 1
|
||||
if 'NextShardIterator' in response:
|
||||
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
|
||||
assert len(response['Records']) == 0
|
||||
assert not 'NextShardIterator' in response
|
||||
# Until now we verified that we can read the closed shard
|
||||
# using an old iterator. Let's test now that the closed
|
||||
# shard id is also still valid, and a new iterator can be
|
||||
# created for it, and the old data can be read from it:
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
|
||||
# Even after streaming is disabled for the table, we can still read
|
||||
# from the earlier stream (it is guaranteed to work for 24 hours).
|
||||
# The iterators we got earlier should still be fully usable, and
|
||||
# eventually *one* of the stream shards will return one event:
|
||||
timeout = time.time() + 15
|
||||
while time.time() < timeout:
|
||||
for (shard_id, iter) in shards_and_iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
assert len(response['Records']) == 1
|
||||
return
|
||||
time.sleep(0.5)
|
||||
pytest.fail("timed out")
|
||||
if 'Records' in response and response['Records'] != []:
|
||||
# Found the shard with the data! Test that it only has
|
||||
# one event. NextShardIterator should either be missing now,
|
||||
# indicating that it is a closed shard (DynamoDB does this),
|
||||
# or, it may (and currently does in Alternator) return another
|
||||
# and reading from *that* iterator should then tell us that
|
||||
# we reached the end of the shard (i.e., zero results and
|
||||
# missing NextShardIterator).
|
||||
assert len(response['Records']) == 1
|
||||
if 'NextShardIterator' in response:
|
||||
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
|
||||
assert len(response['Records']) == 0
|
||||
assert not 'NextShardIterator' in response
|
||||
# Until now we verified that we can read the closed shard
|
||||
# using an old iterator. Let's test now that the closed
|
||||
# shard id is also still valid, and a new iterator can be
|
||||
# created for it, and the old data can be read from it:
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
assert len(response['Records']) == 1
|
||||
return
|
||||
time.sleep(0.5)
|
||||
pytest.fail("timed out")
|
||||
|
||||
# In the above test (test_streams_closed_read) we used a disabled stream as
|
||||
# a means to generate a closed shard, and tested the behavior of that closed
|
||||
@@ -1800,84 +1816,83 @@ def test_streams_closed_read(test_table_ss_keys_only, dynamodbstreams):
|
||||
# stream's shards should give an indication that they are all closed - but
|
||||
# all these shards should still be readable.
|
||||
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
|
||||
def test_streams_disabled_stream(test_table_ss_keys_only, dynamodbstreams):
|
||||
table, arn = test_table_ss_keys_only
|
||||
iterators = latest_iterators(dynamodbstreams, arn)
|
||||
# Do an UpdateItem operation that is expected to leave one event in the
|
||||
# stream.
|
||||
table.update_item(Key={'p': random_string(), 'c': random_string()},
|
||||
UpdateExpression='SET x = :x', ExpressionAttributeValues={':x': 5})
|
||||
def test_streams_disabled_stream(dynamodb, dynamodbstreams):
|
||||
# This test can't use the shared table test_table_ss_keys_only,
|
||||
# because it wants to disable streaming, so let's create a new table:
|
||||
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
|
||||
iterators = latest_iterators(dynamodbstreams, arn)
|
||||
# Do an UpdateItem operation that is expected to leave one event in the
|
||||
# stream.
|
||||
table.update_item(Key={'p': random_string(), 'c': random_string()},
|
||||
UpdateExpression='SET x = :x', ExpressionAttributeValues={':x': 5})
|
||||
|
||||
# Wait for this one update to become available in the stream before we
|
||||
# disable the stream. Otherwise, theoretically (although unlikely in
|
||||
# practice) we may disable the stream before the update was saved to it.
|
||||
timeout = time.time() + 15
|
||||
found = False
|
||||
while time.time() < timeout and not found:
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and len(response['Records']) > 0:
|
||||
found = True
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert found
|
||||
# Wait for this one update to become available in the stream before we
|
||||
# disable the stream. Otherwise, theoretically (although unlikely in
|
||||
# practice) we may disable the stream before the update was saved to it.
|
||||
timeout = time.time() + 15
|
||||
found = False
|
||||
while time.time() < timeout and not found:
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response and len(response['Records']) > 0:
|
||||
found = True
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert found
|
||||
|
||||
# Disable streaming for this table. Note that the test_table_ss_keys_only
|
||||
# fixture has "function" scope so it is fine to ruin table, it will not
|
||||
# be used in other tests.
|
||||
disable_stream(dynamodbstreams, table)
|
||||
disable_stream(dynamodbstreams, table)
|
||||
|
||||
# Check that the stream ARN which we previously got for the disabled
|
||||
# stream is still listed by ListStreams
|
||||
arns = [stream['StreamArn'] for stream in dynamodbstreams.list_streams(TableName=table.name)['Streams']]
|
||||
assert arn in arns
|
||||
# Check that the stream ARN which we previously got for the disabled
|
||||
# stream is still listed by ListStreams
|
||||
arns = [stream['StreamArn'] for stream in dynamodbstreams.list_streams(TableName=table.name)['Streams']]
|
||||
assert arn in arns
|
||||
|
||||
# DescribeStream on the disabled stream still works and lists its shards.
|
||||
# All these shards are listed as being closed (i.e., should have
|
||||
# EndingSequenceNumber). The basic details of the stream (e.g., the view
|
||||
# type) are available and the status of the stream is DISABLED.
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn)['StreamDescription']
|
||||
assert response['StreamStatus'] == 'DISABLED'
|
||||
assert response['StreamViewType'] == 'KEYS_ONLY'
|
||||
assert response['TableName'] == table.name
|
||||
shards_info = response['Shards']
|
||||
while 'LastEvaluatedShardId' in response:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn, ExclusiveStartShardId=response['LastEvaluatedShardId'])['StreamDescription']
|
||||
# DescribeStream on the disabled stream still works and lists its shards.
|
||||
# All these shards are listed as being closed (i.e., should have
|
||||
# EndingSequenceNumber). The basic details of the stream (e.g., the view
|
||||
# type) are available and the status of the stream is DISABLED.
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn)['StreamDescription']
|
||||
assert response['StreamStatus'] == 'DISABLED'
|
||||
assert response['StreamViewType'] == 'KEYS_ONLY'
|
||||
assert response['TableName'] == table.name
|
||||
shards_info.extend(response['Shards'])
|
||||
print('Number of shards in stream: {}'.format(len(shards_info)))
|
||||
for shard in shards_info:
|
||||
assert 'EndingSequenceNumber' in shard['SequenceNumberRange']
|
||||
assert shard['SequenceNumberRange']['EndingSequenceNumber'].isdecimal()
|
||||
shards_info = response['Shards']
|
||||
while 'LastEvaluatedShardId' in response:
|
||||
response = dynamodbstreams.describe_stream(StreamArn=arn, ExclusiveStartShardId=response['LastEvaluatedShardId'])['StreamDescription']
|
||||
assert response['StreamStatus'] == 'DISABLED'
|
||||
assert response['StreamViewType'] == 'KEYS_ONLY'
|
||||
assert response['TableName'] == table.name
|
||||
shards_info.extend(response['Shards'])
|
||||
print('Number of shards in stream: {}'.format(len(shards_info)))
|
||||
for shard in shards_info:
|
||||
assert 'EndingSequenceNumber' in shard['SequenceNumberRange']
|
||||
assert shard['SequenceNumberRange']['EndingSequenceNumber'].isdecimal()
|
||||
|
||||
# We can get TRIM_HORIZON iterators for all these shards, to read all
|
||||
# the old data they still have (this data should be saved for 24 hours
|
||||
# after the stream was disabled)
|
||||
iterators = []
|
||||
for shard in shards_info:
|
||||
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator'])
|
||||
# We can get TRIM_HORIZON iterators for all these shards, to read all
|
||||
# the old data they still have (this data should be saved for 24 hours
|
||||
# after the stream was disabled)
|
||||
iterators = []
|
||||
for shard in shards_info:
|
||||
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator'])
|
||||
|
||||
# We can read the one change we did in one of these iterators. The data
|
||||
# should be available immediately - no need for retries with timeout.
|
||||
nrecords = 0
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response:
|
||||
nrecords += len(response['Records'])
|
||||
# The shard is closed, so NextShardIterator should either be missing
|
||||
# now, indicating that it is a closed shard (DynamoDB does this),
|
||||
# or, it may (and currently does in Alternator) return an iterator
|
||||
# and reading from *that* iterator should then tell us that
|
||||
# we reached the end of the shard (i.e., zero results and
|
||||
# missing NextShardIterator).
|
||||
if 'NextShardIterator' in response:
|
||||
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
|
||||
assert len(response['Records']) == 0
|
||||
assert not 'NextShardIterator' in response
|
||||
assert nrecords == 1
|
||||
# We can read the one change we did in one of these iterators. The data
|
||||
# should be available immediately - no need for retries with timeout.
|
||||
nrecords = 0
|
||||
for iter in iterators:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter)
|
||||
if 'Records' in response:
|
||||
nrecords += len(response['Records'])
|
||||
# The shard is closed, so NextShardIterator should either be missing
|
||||
# now, indicating that it is a closed shard (DynamoDB does this),
|
||||
# or, it may (and currently does in Alternator) return an iterator
|
||||
# and reading from *that* iterator should then tell us that
|
||||
# we reached the end of the shard (i.e., zero results and
|
||||
# missing NextShardIterator).
|
||||
if 'NextShardIterator' in response:
|
||||
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
|
||||
assert len(response['Records']) == 0
|
||||
assert not 'NextShardIterator' in response
|
||||
assert nrecords == 1
|
||||
|
||||
# When streams are enabled for a table, we get a unique ARN which should be
|
||||
# unique but not change unless streams are eventually disabled for this table.
|
||||
|
||||
@@ -1058,6 +1058,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
|
||||
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
|
||||
return make_ready_future();
|
||||
#endif
|
||||
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
|
||||
sharded<db::snapshot_ctl> sc;
|
||||
sc.start(std::ref(e.db()), std::ref(e.get_storage_proxy()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
|
||||
auto stop_sc = deferred_stop(sc);
|
||||
|
||||
auto& cf = e.local_db().find_column_family("ks", "cf");
|
||||
take_snapshot(e).get();
|
||||
|
||||
utils::get_local_injector().enable("get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
auto details = cf.get_snapshot_details().get();
|
||||
BOOST_REQUIRE_EQUAL(details.size(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
|
||||
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
|
||||
@@ -1004,7 +1004,20 @@ SEASTAR_TEST_CASE(memtable_flush_compresses_mutations) {
|
||||
}, db_config);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
static auto check_has_error_injection() {
|
||||
return boost::unit_test::precondition([](auto){
|
||||
return
|
||||
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
;
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(memtable_flush_period, *check_has_error_injection()) {
|
||||
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
auto db_config = make_shared<db::config>();
|
||||
db_config->enable_cache.set(false);
|
||||
return do_with_cql_env_thread([](cql_test_env& env) {
|
||||
@@ -1028,6 +1041,9 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
t.apply(m);
|
||||
BOOST_REQUIRE_EQUAL(t.sstables_count(), 0); // add mutation and check there are no sstables for this table
|
||||
|
||||
auto& errj = utils::get_local_injector();
|
||||
errj.enable("table_seal_post_flush_waiters", true);
|
||||
|
||||
// change schema to set memtable flush period
|
||||
// we use small value in this test but it is impossible to set the period less than 60000ms using ALTER TABLE construction
|
||||
schema_builder b(t.schema());
|
||||
@@ -1035,8 +1051,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
schema_ptr s2 = b.build();
|
||||
t.set_schema(s2);
|
||||
|
||||
sleep(500ms).get(); // wait until memtable flush starts at least once
|
||||
BOOST_REQUIRE(t.sstables_count() == 1 || t.get_stats().pending_flushes > 0); // flush started
|
||||
BOOST_TEST_MESSAGE("Wait for flush");
|
||||
errj.inject("table_seal_post_flush_waiters", utils::wait_for_message(std::chrono::minutes(2))).get();
|
||||
BOOST_TEST_MESSAGE("Flush received");
|
||||
|
||||
BOOST_REQUIRE(eventually_true([&] { // wait until memtable will be flushed at least once
|
||||
return t.sstables_count() == 1;
|
||||
}));
|
||||
@@ -1047,6 +1065,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
.produces(m)
|
||||
.produces_end_of_stream();
|
||||
}, db_config);
|
||||
#else
|
||||
BOOST_TEST_MESSAGE("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev)");
|
||||
return make_ready_future<>();
|
||||
#endif
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(sstable_compaction_does_not_resurrect_data) {
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "test/lib/cql_test_env.hh"
|
||||
#include "test/lib/random_utils.hh"
|
||||
#include "test/lib/exception_utils.hh"
|
||||
#include "test/lib/eventually.hh"
|
||||
#include "db/config.hh"
|
||||
|
||||
#include <fmt/ranges.h>
|
||||
@@ -200,6 +201,10 @@ public:
|
||||
return _sem;
|
||||
}
|
||||
|
||||
const replica::querier_cache::stats& get_stats() const {
|
||||
return _cache.get_stats();
|
||||
}
|
||||
|
||||
dht::partition_range make_partition_range(bound begin, bound end) const {
|
||||
return dht::partition_range::make({_mutations.at(begin.value()).decorated_key(), begin.is_inclusive()},
|
||||
{_mutations.at(end.value()).decorated_key(), end.is_inclusive()});
|
||||
@@ -562,24 +567,21 @@ SEASTAR_THREAD_TEST_CASE(test_time_based_cache_eviction) {
|
||||
|
||||
const auto entry1 = t.produce_first_page_and_save_data_querier(1);
|
||||
|
||||
seastar::sleep(500ms).get();
|
||||
BOOST_REQUIRE_EQUAL(t.get_stats().time_based_evictions, 0);
|
||||
|
||||
const auto entry2 = t.produce_first_page_and_save_data_querier(2);
|
||||
// Don't waste time retrying before the TTL is up
|
||||
sleep(1s).get();
|
||||
|
||||
seastar::sleep(700ms).get();
|
||||
eventually_true([&t] {
|
||||
auto stats = t.get_stats();
|
||||
return stats.time_based_evictions == 1;
|
||||
});
|
||||
|
||||
t.assert_cache_lookup_data_querier(entry1.key, *t.get_schema(), entry1.expected_range, entry1.expected_slice)
|
||||
.misses()
|
||||
.no_drops()
|
||||
.time_based_evictions();
|
||||
|
||||
seastar::sleep(700ms).get();
|
||||
|
||||
t.assert_cache_lookup_data_querier(entry2.key, *t.get_schema(), entry2.expected_range, entry2.expected_slice)
|
||||
.misses()
|
||||
.no_drops()
|
||||
.time_based_evictions();
|
||||
|
||||
// There should be no inactive reads, the querier_cache should unregister
|
||||
// the expired queriers.
|
||||
BOOST_REQUIRE_EQUAL(t.get_semaphore().get_stats().inactive_reads, 0);
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/testing/on_internal_error.hh>
|
||||
#undef SEASTAR_TESTING_MAIN
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include <seastar/testing/thread_test_case.hh>
|
||||
@@ -35,6 +36,13 @@
|
||||
#include "replica/database.hh" // new_reader_base_cost is there :(
|
||||
#include "db/config.hh"
|
||||
|
||||
// Provides access to private members of reader_concurrency_semaphore for testing.
|
||||
struct reader_concurrency_semaphore_tester {
|
||||
static void signal(reader_concurrency_semaphore& sem, reader_resources r) {
|
||||
sem.signal(r);
|
||||
}
|
||||
};
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(reader_concurrency_semaphore_test)
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
|
||||
@@ -2595,4 +2603,35 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_preemptive_abort_requ
|
||||
permit2 = {};
|
||||
}
|
||||
|
||||
// Verify that signal() detects and corrects a negative resource leak.
|
||||
// When a bug causes available resources to exceed initial resources
|
||||
// after signal(), the semaphore should report the negative leak via
|
||||
// on_internal_error_noexcept and clamp _resources back to _initial_resources
|
||||
// so that consumed_resources() never goes negative.
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_signal_detects_negative_resource_leak) {
|
||||
const auto initial = reader_resources{2, 2048};
|
||||
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial.count, initial.memory);
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources{});
|
||||
|
||||
// Simulate a negative leak: signal more resources than were ever consumed.
|
||||
// This would happen if a bug double-returned resources or inflated
|
||||
// the amount returned to signal().
|
||||
// signal() calls on_internal_error_noexcept which would abort in
|
||||
// test mode, so temporarily disable that.
|
||||
const auto leaked = reader_resources{1, 512};
|
||||
{
|
||||
seastar::testing::scoped_no_abort_on_internal_error no_abort;
|
||||
reader_concurrency_semaphore_tester::signal(semaphore, leaked);
|
||||
}
|
||||
|
||||
// signal() should have detected the over-return and clamped
|
||||
// available resources back to initial.
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources{});
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
@@ -982,21 +982,29 @@ BOOST_AUTO_TEST_CASE(s3_fqn_manipulation) {
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
|
||||
{
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 5_MiB), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with("too many parts: 100352 > 10000");
|
||||
});
|
||||
}
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, s3::minimum_part_size), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with(format("too many parts: 100352 > {}", s3::maximum_parts_in_piece));
|
||||
});
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with(format("part_size too small: 4194304 is smaller than minimum part size: {}", s3::minimum_part_size));
|
||||
});
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(s3::maximum_object_size + 1, 0), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with(
|
||||
format("object size too large: {} is larger than maximum S3 object size: {}", s3::maximum_object_size + 1, s3::maximum_object_size));
|
||||
});
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, s3::maximum_part_size + 1), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with(
|
||||
format("part_size too large: {} is larger than maximum part size: {}", s3::maximum_part_size + 1, s3::maximum_part_size));
|
||||
});
|
||||
size_t total_size = s3::minimum_part_size * (s3::maximum_parts_in_piece + 1); // 10001 parts at 5 MiB
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(total_size, s3::minimum_part_size), std::runtime_error, [](auto& e) {
|
||||
return std::string(e.what()).starts_with(format("too many parts: 10001 > {}", s3::maximum_parts_in_piece));
|
||||
});
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(490_GiB, 100_MiB);
|
||||
BOOST_REQUIRE_EQUAL(size, 100_MiB);
|
||||
BOOST_REQUIRE(parts == 5018);
|
||||
}
|
||||
{
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with("part_size too small: 4194304 is smaller than minimum part size: 5242880");
|
||||
});
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(50_MiB, 0);
|
||||
BOOST_REQUIRE_EQUAL(size, 50_MiB);
|
||||
@@ -1013,24 +1021,14 @@ BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
|
||||
BOOST_REQUIRE(parts == 9839);
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(50_MiB * 10000, 0);
|
||||
auto [parts, size] = s3::calc_part_size(50_MiB * s3::maximum_parts_in_piece, 0);
|
||||
BOOST_REQUIRE_EQUAL(size, 50_MiB);
|
||||
BOOST_REQUIRE_EQUAL(parts, 10000);
|
||||
BOOST_REQUIRE_EQUAL(parts, s3::maximum_parts_in_piece);
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(50_MiB * 10000 + 1, 0);
|
||||
auto [parts, size] = s3::calc_part_size(50_MiB * s3::maximum_parts_in_piece + 1, 0);
|
||||
BOOST_REQUIRE(size > 50_MiB);
|
||||
BOOST_REQUIRE(parts <= 10000);
|
||||
}
|
||||
{
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(50_TiB, 0), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with("object size too large: 54975581388800 is larger than maximum S3 object size: 53687091200000");
|
||||
});
|
||||
}
|
||||
{
|
||||
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, 5_GiB + 1), std::runtime_error, [](const std::runtime_error& e) {
|
||||
return std::string(e.what()).starts_with("part_size too large: 5368709121 is larger than maximum part size: 5368709120");
|
||||
});
|
||||
BOOST_REQUIRE(parts <= s3::maximum_parts_in_piece);
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(5_TiB, 0);
|
||||
@@ -1038,21 +1036,16 @@ BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
|
||||
BOOST_REQUIRE_EQUAL(size, 525_MiB);
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(5_MiB * 10000, 5_MiB);
|
||||
BOOST_REQUIRE_EQUAL(size, 5_MiB);
|
||||
BOOST_REQUIRE_EQUAL(parts, 10000);
|
||||
}
|
||||
{
|
||||
size_t total = 5_MiB * 10001; // 10001 parts at 5 MiB
|
||||
BOOST_REQUIRE_EXCEPTION(
|
||||
s3::calc_part_size(total, 5_MiB), std::runtime_error, [](auto& e) { return std::string(e.what()).starts_with("too many parts: 10001 > 10000"); });
|
||||
auto [parts, size] = s3::calc_part_size(s3::minimum_part_size * s3::maximum_parts_in_piece, s3::minimum_part_size);
|
||||
BOOST_REQUIRE_EQUAL(size, s3::minimum_part_size);
|
||||
BOOST_REQUIRE_EQUAL(parts, s3::maximum_parts_in_piece);
|
||||
}
|
||||
{
|
||||
size_t total = 500_GiB + 123; // odd size to force non-MiB alignment
|
||||
auto [parts, size] = s3::calc_part_size(total, 0);
|
||||
|
||||
BOOST_REQUIRE(size % 1_MiB == 0); // aligned
|
||||
BOOST_REQUIRE(parts <= 10000);
|
||||
BOOST_REQUIRE(parts <= s3::maximum_parts_in_piece);
|
||||
}
|
||||
{
|
||||
auto [parts, size] = s3::calc_part_size(6_MiB, 0);
|
||||
|
||||
@@ -676,7 +676,7 @@ SEASTAR_TEST_CASE(test_system_schema_version_is_stable) {
|
||||
|
||||
// If you changed the schema of system.batchlog then this is expected to fail.
|
||||
// Just replace expected version with the new version.
|
||||
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("1f504ac7-350f-37aa-8a9e-105b1325d8e3")));
|
||||
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("c3f984e4-f886-3616-bb80-f8c68ed93595")));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -20,16 +20,24 @@ static void add_entry(logalloc::region& r,
|
||||
const schema& s,
|
||||
partition_index_page& page,
|
||||
const partition_key& key,
|
||||
uint64_t position)
|
||||
uint64_t position,
|
||||
std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
|
||||
{
|
||||
logalloc::allocating_section as;
|
||||
as(r, [&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
sstables::key sst_key = sstables::key::from_partition_key(s, key);
|
||||
page._entries.push_back(make_managed<index_entry>(
|
||||
managed_bytes(sst_key.get_bytes()),
|
||||
position,
|
||||
managed_ref<promoted_index>()));
|
||||
auto key_offset = page._key_storage.size();
|
||||
auto old_storage = std::move(page._key_storage);
|
||||
page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
|
||||
auto out = managed_bytes_mutable_view(page._key_storage);
|
||||
write_fragmented(out, managed_bytes_view(old_storage));
|
||||
write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
|
||||
page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
|
||||
if (promoted_index) {
|
||||
page._promoted_indexes.resize(page._entries.size());
|
||||
page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -54,10 +62,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
|
||||
static void has_page0(partition_index_cache::entry_ptr ptr) {
|
||||
BOOST_REQUIRE(!ptr->empty());
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
|
||||
};
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
@@ -139,6 +147,59 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
|
||||
::lru lru;
|
||||
simple_schema s;
|
||||
logalloc::region r;
|
||||
partition_index_cache_stats stats;
|
||||
partition_index_cache cache(lru, r, stats);
|
||||
|
||||
auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
|
||||
partition_index_page page;
|
||||
auto destroy_page = defer([&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
auto p = std::move(page);
|
||||
});
|
||||
});
|
||||
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 1,
|
||||
.promoted_index_size = 10,
|
||||
.num_blocks = 3
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 2,
|
||||
.promoted_index_size = 13,
|
||||
.num_blocks = 1
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
|
||||
destroy_page.cancel();
|
||||
co_return std::move(page);
|
||||
};
|
||||
|
||||
auto page = cache.get_or_load(0, page0_loader).get();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
|
||||
|
||||
with_allocator(r.allocator(), [&] {
|
||||
lru.evict_all();
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static future<> ignore_result(future<T>&& f) {
|
||||
return f.then_wrapped([] (auto&& f) {
|
||||
|
||||
@@ -1607,6 +1607,29 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
|
||||
auto& stm = e.local_db().get_shared_token_metadata();
|
||||
auto tm = stm.get();
|
||||
|
||||
e.get_topology_state_machine().local()._topology.version = tm->get_version();
|
||||
|
||||
co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
|
||||
utils::chunked_vector<frozen_mutation> muts;
|
||||
muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_version(tm->get_version())
|
||||
.build().to_mutation(db::system_keyspace::topology())));
|
||||
co_await e.local_db().apply(muts, db::no_timeout);
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
// Also, so that the table layer processes the changes we persisted, which is important for splits.
|
||||
// Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
co_return co_await e.get_raft_group0_client().start_operation(as);
|
||||
}
|
||||
|
||||
static
|
||||
future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
@@ -1626,19 +1649,14 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
|
||||
changed = true;
|
||||
tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
|
||||
tm.set_version(tm.get_version() + 1);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
// Need to reload on each resize because table object expects tablet count to change by a factor of 2.
|
||||
co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
guard = co_await e.get_raft_group0_client().start_operation(as);
|
||||
guard = co_await save_token_metadata(e, std::move(guard));
|
||||
|
||||
if (load_stats) {
|
||||
auto new_tm = stm.get();
|
||||
@@ -1647,6 +1665,11 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
load_stats->stats = *reconciled_stats;
|
||||
}
|
||||
}
|
||||
|
||||
testlog.debug("Calling local_topology_barrier()");
|
||||
old_tm = nullptr;
|
||||
co_await e.get_storage_service().local().local_topology_barrier();
|
||||
testlog.debug("Finished local_topology_barrier()");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1750,13 +1773,22 @@ void do_rebalance_tablets(cql_test_env& e,
|
||||
}).get();
|
||||
|
||||
if (auto_split && load_stats) {
|
||||
bool reload = false;
|
||||
auto& tm = *stm.get();
|
||||
for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
|
||||
if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
reload = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Need to order split-ack before split finalization, storage_group assumes that.
|
||||
if (reload) {
|
||||
guard = save_token_metadata(e, std::move(guard)).get();
|
||||
}
|
||||
}
|
||||
|
||||
handle_resize_finalize(e, guard, plan, load_stats).get();
|
||||
|
||||
@@ -331,4 +331,28 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
|
||||
std::cerr.rdbuf(oldCerr);
|
||||
|
||||
BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_raw_token) {
|
||||
const auto t1 = dht::token::from_int64(1);
|
||||
const auto t2 = dht::token::from_int64(2);
|
||||
|
||||
dht::raw_token_opt rt_opt;
|
||||
BOOST_REQUIRE(!rt_opt);
|
||||
rt_opt = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(*rt_opt == t1);
|
||||
|
||||
BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::first_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());
|
||||
|
||||
auto rt1 = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(bool(rt1));
|
||||
BOOST_REQUIRE(rt1 > dht::raw_token());
|
||||
BOOST_REQUIRE(rt1 > dht::minimum_token());
|
||||
BOOST_REQUIRE_EQUAL(rt1, t1);
|
||||
BOOST_REQUIRE(rt1 == t1);
|
||||
BOOST_REQUIRE(rt1 < t2);
|
||||
BOOST_REQUIRE(rt1 < dht::maximum_token());
|
||||
}
|
||||
|
||||
@@ -16,10 +16,21 @@ from test.pylib.util import wait_for
|
||||
import logging
|
||||
import pytest
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CqlClusters = list[Cluster]
|
||||
|
||||
@pytest.fixture
|
||||
def cql_clusters() -> Generator[CqlClusters, None, None]:
|
||||
"""Tracks CQL driver Cluster objects for automatic shutdown after test completion."""
|
||||
clusters: CqlClusters = []
|
||||
yield clusters
|
||||
for c in reversed(clusters):
|
||||
c.shutdown()
|
||||
|
||||
|
||||
async def get_ready_maintenance_session(socket_path: str, timeout: int = 60):
|
||||
"""Connect to maintenance socket, retrying until the role manager is ready.
|
||||
@@ -84,7 +95,7 @@ async def connect_with_credentials(ip: str, username: str, password: str, timeou
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_maintenance_socket(manager: ManagerClient):
|
||||
async def test_maintenance_socket(manager: ManagerClient, cql_clusters: CqlClusters):
|
||||
"""
|
||||
Test that when connecting to the maintenance socket, the user has superuser permissions,
|
||||
even if the authentication is enabled on the regular port.
|
||||
@@ -95,28 +106,29 @@ async def test_maintenance_socket(manager: ManagerClient):
|
||||
|
||||
logger.info("Verifying unauthenticated connection is rejected")
|
||||
cluster = Cluster([server.ip_addr])
|
||||
cql_clusters.append(cluster)
|
||||
try:
|
||||
cluster.connect()
|
||||
pytest.fail("Client should not be able to connect if auth provider is not specified")
|
||||
except NoHostAvailable:
|
||||
pass
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
|
||||
logger.info("Connecting as superuser to set up roles and keyspaces")
|
||||
superuser_cluster = cluster_con([server.ip_addr],
|
||||
auth_provider=PlainTextAuthProvider(username="cassandra", password="cassandra"))
|
||||
cql_clusters.append(superuser_cluster)
|
||||
session = superuser_cluster.connect()
|
||||
|
||||
session.execute("CREATE ROLE john WITH PASSWORD = 'password' AND LOGIN = true;")
|
||||
session.execute("CREATE KEYSPACE ks1 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};")
|
||||
session.execute("CREATE KEYSPACE ks2 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};")
|
||||
session.execute("CREATE KEYSPACE ks1 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};")
|
||||
session.execute("CREATE KEYSPACE ks2 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};")
|
||||
session.execute("CREATE TABLE ks1.t1 (pk int PRIMARY KEY, val int);")
|
||||
session.execute("CREATE TABLE ks2.t1 (pk int PRIMARY KEY, val int);")
|
||||
session.execute("GRANT SELECT ON ks1.t1 TO john;")
|
||||
|
||||
logger.info("Verifying user 'john' cannot access ks2.t1")
|
||||
john_cluster = cluster_con([server.ip_addr], auth_provider=PlainTextAuthProvider(username="john", password="password"))
|
||||
cql_clusters.append(john_cluster)
|
||||
john_session = john_cluster.connect()
|
||||
try:
|
||||
john_session.execute("SELECT * FROM ks2.t1")
|
||||
@@ -127,22 +139,19 @@ async def test_maintenance_socket(manager: ManagerClient):
|
||||
|
||||
logger.info("Connecting via maintenance socket")
|
||||
maintenance_cluster = cluster_con([UnixSocketEndPoint(socket)], load_balancing_policy=WhiteListRoundRobinPolicy([UnixSocketEndPoint(socket)]))
|
||||
cql_clusters.append(maintenance_cluster)
|
||||
maintenance_session = maintenance_cluster.connect()
|
||||
|
||||
logger.info("Verifying maintenance session has superuser permissions")
|
||||
maintenance_session.execute("SELECT * FROM ks1.t1")
|
||||
maintenance_session.execute("SELECT * FROM ks2.t1")
|
||||
maintenance_session.execute("INSERT INTO ks1.t1 (pk, val) VALUES (1, 1);")
|
||||
maintenance_session.execute("CREATE KEYSPACE ks3 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};")
|
||||
maintenance_session.execute("CREATE KEYSPACE ks3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};")
|
||||
maintenance_session.execute("CREATE TABLE ks1.t2 (pk int PRIMARY KEY, val int);")
|
||||
|
||||
maintenance_cluster.shutdown()
|
||||
john_cluster.shutdown()
|
||||
superuser_cluster.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_default_superuser_exists_by_default(manager: ManagerClient):
|
||||
async def test_no_default_superuser_exists_by_default(manager: ManagerClient, cql_clusters: CqlClusters):
|
||||
"""
|
||||
Test that no 'cassandra' user exists when no default superuser is configured.
|
||||
"""
|
||||
@@ -157,17 +166,16 @@ async def test_no_default_superuser_exists_by_default(manager: ManagerClient):
|
||||
|
||||
logger.info("Verifying default credentials are rejected")
|
||||
cluster = Cluster([server.ip_addr], auth_provider=PlainTextAuthProvider(username="cassandra", password="cassandra"))
|
||||
cql_clusters.append(cluster)
|
||||
try:
|
||||
cluster.connect()
|
||||
pytest.fail("Should not be able to connect with default credentials when they are not seeded")
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClient):
|
||||
async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClient, cql_clusters: CqlClusters):
|
||||
"""
|
||||
Test that we can manage user roles via the maintenance socket.
|
||||
"""
|
||||
@@ -183,6 +191,7 @@ async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClien
|
||||
logger.info("Connecting via maintenance socket")
|
||||
socket_path = await manager.server_get_maintenance_socket_path(server.server_id)
|
||||
session = await get_ready_maintenance_session(socket_path)
|
||||
cql_clusters.append(session.cluster)
|
||||
|
||||
logger.info("Verifying system.roles is empty before operations")
|
||||
rows = list(session.execute("SELECT role, is_superuser FROM system.roles"))
|
||||
@@ -205,9 +214,10 @@ async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClien
|
||||
|
||||
logger.info("Verifying the new role can log in via the normal CQL port")
|
||||
admin_session = await connect_with_credentials(server.ip_addr, new_role, new_role_password)
|
||||
cql_clusters.append(admin_session.cluster)
|
||||
|
||||
logger.info("Verifying superuser can create a keyspace")
|
||||
admin_session.execute("CREATE KEYSPACE ks1 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}")
|
||||
admin_session.execute("CREATE KEYSPACE ks1 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
|
||||
|
||||
logger.info("Altering role to remove superuser via maintenance socket")
|
||||
session.execute(f"ALTER ROLE {new_role} WITH SUPERUSER = false")
|
||||
@@ -222,14 +232,14 @@ async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClien
|
||||
async def check_superuser_revoked():
|
||||
c = cluster_con([server.ip_addr],
|
||||
auth_provider=PlainTextAuthProvider(username=new_role, password=new_role_password))
|
||||
s = c.connect()
|
||||
try:
|
||||
s.execute("CREATE KEYSPACE ks2 WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}")
|
||||
c.shutdown()
|
||||
s = c.connect()
|
||||
s.execute("CREATE KEYSPACE ks2 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
|
||||
return None # Still cached as superuser, retry
|
||||
except Unauthorized:
|
||||
c.shutdown()
|
||||
return True
|
||||
finally:
|
||||
c.shutdown()
|
||||
|
||||
await wait_for(check_superuser_revoked, time.time() + 60)
|
||||
|
||||
@@ -254,6 +264,62 @@ async def test_no_default_superuser_maintenance_socket_ops(manager: ManagerClien
|
||||
|
||||
await wait_for(check_role_dropped, time.time() + 60)
|
||||
|
||||
admin_session.cluster.shutdown()
|
||||
session.cluster.shutdown()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_maintenance_socket_grant_revoke(manager: ManagerClient, cql_clusters: CqlClusters):
|
||||
"""
|
||||
Test that GRANT, REVOKE, and REVOKE ALL via the maintenance socket work correctly.
|
||||
|
||||
The maintenance socket uses maintenance_socket_authorizer, which extends
|
||||
CassandraAuthorizer so that authorization-altering statements (GRANT, REVOKE)
|
||||
are persisted, while the maintenance socket user itself always has full access.
|
||||
"""
|
||||
config = {
|
||||
**auth_config,
|
||||
"auth_superuser_name": "",
|
||||
"auth_superuser_salted_password": "",
|
||||
}
|
||||
|
||||
logger.info("Starting server without default superuser")
|
||||
server = await manager.server_add(config=config, connect_driver=False)
|
||||
|
||||
logger.info("Connecting via maintenance socket")
|
||||
socket_path = await manager.server_get_maintenance_socket_path(server.server_id)
|
||||
session = await get_ready_maintenance_session(socket_path)
|
||||
cql_clusters.append(session.cluster)
|
||||
|
||||
session.execute("CREATE KEYSPACE ks WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
|
||||
session.execute("CREATE TABLE ks.t (pk int PRIMARY KEY, v int)")
|
||||
session.execute("CREATE ROLE role1 WITH PASSWORD = 'pass' AND LOGIN = true")
|
||||
|
||||
# GRANT SELECT via maintenance socket, verify it is persisted
|
||||
logger.info("Testing GRANT via maintenance socket")
|
||||
session.execute("GRANT SELECT ON ks.t TO role1")
|
||||
|
||||
rows = list(session.execute("LIST ALL PERMISSIONS OF role1"))
|
||||
assert len(rows) == 1
|
||||
assert rows[0].permission == "SELECT"
|
||||
|
||||
role1_session = await connect_with_credentials(server.ip_addr, "role1", "pass")
|
||||
cql_clusters.append(role1_session.cluster)
|
||||
role1_session.execute("SELECT * FROM ks.t")
|
||||
|
||||
# REVOKE SELECT via maintenance socket
|
||||
logger.info("Testing REVOKE via maintenance socket")
|
||||
session.execute("REVOKE SELECT ON ks.t FROM role1")
|
||||
|
||||
rows = list(session.execute("LIST ALL PERMISSIONS OF role1"))
|
||||
assert len(rows) == 0
|
||||
|
||||
# GRANT multiple permissions, then REVOKE ALL
|
||||
logger.info("Testing REVOKE ALL via maintenance socket")
|
||||
session.execute("GRANT SELECT ON ks.t TO role1")
|
||||
session.execute("GRANT MODIFY ON ks.t TO role1")
|
||||
|
||||
rows = list(session.execute("LIST ALL PERMISSIONS OF role1"))
|
||||
assert len(rows) == 2
|
||||
|
||||
session.execute("REVOKE ALL ON ks.t FROM role1")
|
||||
|
||||
rows = list(session.execute("LIST ALL PERMISSIONS OF role1"))
|
||||
assert len(rows) == 0
|
||||
|
||||
@@ -12,7 +12,7 @@ from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from dtest_class import Tester, create_cf, create_ks, get_ip_from_node, highest_supported_sstable_format
|
||||
from dtest_class import Tester, create_cf, create_ks, get_ip_from_node, chosen_sstable_format
|
||||
from tools.data import create_c1c2_table, insert_c1c2
|
||||
from tools.metrics import get_node_metrics
|
||||
|
||||
@@ -69,7 +69,7 @@ class TestBypassCache(Tester):
|
||||
create_c1c2_table(session)
|
||||
insert_c1c2(session, n=NUM_OF_QUERY_EXECUTIONS, ks=keyspace_name)
|
||||
|
||||
self.sstable_format = highest_supported_sstable_format(node1)
|
||||
self.sstable_format = chosen_sstable_format(node1)
|
||||
|
||||
return session
|
||||
|
||||
@@ -103,7 +103,7 @@ class TestBypassCache(Tester):
|
||||
return metric_errors
|
||||
|
||||
def cache_thresh(self):
|
||||
return 800 if not self.tablets else 150
|
||||
return 300 if not self.tablets else 150
|
||||
|
||||
def metric_name_for_index_cache_hits(self):
|
||||
"""
|
||||
|
||||
@@ -376,6 +376,7 @@ class ScyllaNode:
|
||||
"--commitlog-use-o-dsync": ["0"],
|
||||
"--max-networking-io-control-blocks": ["1000"],
|
||||
"--unsafe-bypass-fsync": ["1"],
|
||||
"--num-tokens": ["16"],
|
||||
}
|
||||
|
||||
if self.scylla_mode() == "debug":
|
||||
|
||||
@@ -249,9 +249,9 @@ def is_autocompaction_enabled(node, ks_name, table_name):
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def highest_supported_sstable_format(node):
|
||||
def chosen_sstable_format(node):
|
||||
node_ip = get_ip_from_node(node=node)
|
||||
response = requests.get(f"http://{node_ip}:10000/system/highest_supported_sstable_version")
|
||||
response = requests.get(f"http://{node_ip}:10000/system/chosen_sstable_version")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
@@ -507,6 +507,7 @@ class DTestSetup:
|
||||
"cas_contention_timeout_in_ms": timeout,
|
||||
"request_timeout_in_ms": timeout,
|
||||
"num_tokens": None,
|
||||
"sstable_format": "ms",
|
||||
}
|
||||
|
||||
if self.setup_overrides is not None and self.setup_overrides.cluster_options:
|
||||
|
||||
@@ -119,8 +119,7 @@ async def test_view_building_during_drop_index(manager: ManagerClient):
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_interrupt_view_build_shard_registration(manager: ManagerClient):
|
||||
cmdline = ['--smp=4']
|
||||
cfg = {"commitlog_sync_period_in_ms": 1000}
|
||||
servers = await manager.servers_add(1, cmdline=cmdline, config=cfg)
|
||||
servers = await manager.servers_add(1, cmdline=cmdline)
|
||||
server = servers[0]
|
||||
|
||||
logger.info("Populate table")
|
||||
@@ -132,7 +131,7 @@ async def test_interrupt_view_build_shard_registration(manager: ManagerClient):
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (p, c) VALUES ({k}, {k+1});") for k in range(n_partitions)])
|
||||
|
||||
# pause the last shard so it won't be registered
|
||||
await manager.api.enable_injection(server.ip_addr, "add_new_view_pause_last_shard", one_shot=True)
|
||||
await manager.api.enable_injection(server.ip_addr, "add_new_view_fail_last_shard", one_shot=False)
|
||||
|
||||
await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv AS SELECT p, c FROM {ks}.test WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p)")
|
||||
|
||||
@@ -142,10 +141,9 @@ async def test_interrupt_view_build_shard_registration(manager: ManagerClient):
|
||||
if len(rows) > 0:
|
||||
return True
|
||||
await wait_for(some_registered, time.time() + 60)
|
||||
await asyncio.sleep(2) # ensure commitlog sync
|
||||
|
||||
# restart while some shards registered but the last shard didn't
|
||||
await manager.server_stop(server.server_id)
|
||||
await manager.server_stop_gracefully(server.server_id)
|
||||
|
||||
await manager.server_start(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
@@ -189,18 +190,16 @@ async def do_test_backup_helper(manager: ManagerClient, object_storage,
|
||||
server = (await manager.servers_add(num_servers, config=cfg, cmdline=cmd))[0]
|
||||
ks, cf = create_ks_and_cf(manager.get_cql())
|
||||
snap_name, files = await take_snapshot_on_one_server(ks, server, manager, logger)
|
||||
workdir = await manager.server_get_workdir(server.server_id)
|
||||
cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]
|
||||
|
||||
await manager.api.enable_injection(server.ip_addr, breakpoint_name, one_shot=True)
|
||||
log = await manager.server_open_log(server.server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
print('Backup snapshot')
|
||||
# use a unique(ish) path, because we're running more than one test using the same minio and ks/cf name.
|
||||
# use a unique path, because we're running more than one test using the same minio and ks/cf name.
|
||||
# If we just use {cf}/backup, files like "schema.cql" and "manifest.json" will remain after previous test
|
||||
# case, and we will count these erroneously.
|
||||
prefix = f'{cf_dir}/backup'
|
||||
prefix = unique_name('backup_')
|
||||
tid = await manager.api.backup(server.ip_addr, ks, cf, snap_name, object_storage.address, object_storage.bucket_name, prefix)
|
||||
|
||||
print(f'Started task {tid}, aborting it early')
|
||||
@@ -828,13 +827,14 @@ async def test_backup_broken_streaming(manager: ManagerClient, s3_storage):
|
||||
for file in files:
|
||||
local_path = os.path.join(root, file)
|
||||
print("Processing file:", local_path)
|
||||
sst = subprocess.check_output(
|
||||
sst_generation = subprocess.check_output(
|
||||
[scylla_path, "sstable", "write", "--schema-file", schema_file, "--input-format", "json",
|
||||
"--output-dir", tmp_dir, "--input-file", local_path])
|
||||
"--output-dir", tmp_dir, "--input-file", local_path]).decode().strip()
|
||||
sst_path = glob.glob(f"{tmp_dir}/??-{sst_generation}-???-TOC.txt")[0]
|
||||
expected_rows += json.loads(subprocess.check_output(
|
||||
[scylla_path, "sstable", "query", "-q", f"SELECT COUNT(*) FROM scylla_sstable.{table}",
|
||||
"--output-format", "json", "--sstables",
|
||||
os.path.join(tmp_dir, f"me-{sst.decode().strip()}-big-TOC.txt")]).decode())[0]['count']
|
||||
sst_path]).decode())[0]['count']
|
||||
|
||||
prefix = unique_name('/test/streaming_')
|
||||
s3_resource = s3_storage.get_resource()
|
||||
@@ -874,15 +874,34 @@ async def test_backup_broken_streaming(manager: ManagerClient, s3_storage):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("domain", ['rack', 'dc'])
|
||||
async def test_restore_primary_replica_same_domain(manager: ManagerClient, object_storage, domain):
|
||||
'''Check that restoring with primary_replica_only and domain scope streams only to primary replica in the same domain.
|
||||
The test checks that each mutation exists exactly 2 times within the cluster, once in each domain
|
||||
(each restoring node streams to one primary replica in its domain. Without primary_replica_only we'd see 4 replicas, 2 in each domain).
|
||||
The test also checks that the logs of each restoring node shows streaming to a single node, which is the primary replica within the same domain.'''
|
||||
@pytest.mark.parametrize("scope_is_same", [True, False])
|
||||
async def test_restore_primary_replica(manager: ManagerClient, object_storage, domain, scope_is_same):
|
||||
'''Check that restoring with primary_replica_only streams to the correct primary replica(s) depending on scope.
|
||||
|
||||
When scope matches the node's own domain (scope_is_same=True):
|
||||
- scope equals the domain itself, so streaming is confined within the same rack/DC.
|
||||
- Each mutation exists exactly 2 times in the cluster, once per domain.
|
||||
- Each streaming operation targets exactly one node, which must be within the same domain.
|
||||
|
||||
When scope is wider than the node's own domain (scope_is_same=False):
|
||||
- scope is set to "dc" (for rack domain) or "all" (for dc domain), allowing cross-domain streaming.
|
||||
- Each mutation exists exactly 1 time in the cluster.
|
||||
- Each restoring node streams to exactly 2 distinct nodes, as the primary replica may fall in either domain.'''
|
||||
|
||||
dcs = 1 if domain == 'rack' else 2
|
||||
topology = topo(rf = 4, nodes = 8, racks = 2, dcs = dcs)
|
||||
scope = domain
|
||||
if scope_is_same:
|
||||
topology = topo(rf = 4, nodes = 8, racks = 2, dcs = dcs)
|
||||
scope = domain
|
||||
expected_replicas = 2
|
||||
else:
|
||||
if domain == 'rack':
|
||||
topology = topo(rf = 2, nodes = 2, racks = 2, dcs = dcs)
|
||||
scope = "dc"
|
||||
else:
|
||||
topology = topo(rf = 1, nodes = 2, racks = 1, dcs = dcs)
|
||||
scope = "all"
|
||||
expected_replicas = 1
|
||||
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
@@ -908,7 +927,7 @@ async def test_restore_primary_replica_same_domain(manager: ManagerClient, objec
|
||||
|
||||
await asyncio.gather(*(do_restore_server(manager, logger, ks, cf, s, sstables[s], scope, True, prefix, object_storage) for s in servers))
|
||||
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, expected_replicas=2)
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, expected_replicas=expected_replicas)
|
||||
|
||||
logger.info(f'Validate streaming directions')
|
||||
for i, s in enumerate(servers):
|
||||
@@ -924,61 +943,17 @@ async def test_restore_primary_replica_same_domain(manager: ManagerClient, objec
|
||||
else:
|
||||
return s1.datacenter == s2.datacenter
|
||||
|
||||
scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if same_domain(s, servers[i]) ])
|
||||
for op, nodes in nodes_by_operation.items():
|
||||
logger.info(f'Operation {op} streamed to nodes {nodes}')
|
||||
assert len(nodes) == 1, "Each streaming operation should stream to exactly one primary replica"
|
||||
assert nodes[0] in scope_nodes, f"Primary replica should be within the scope {scope}"
|
||||
if not scope_is_same:
|
||||
streamed_to = set(node for nodes in nodes_by_operation.values() for node in nodes)
|
||||
logger.info(f'{s.ip_addr} {host_ids[s.server_id]} streamed to {streamed_to}')
|
||||
assert len(streamed_to) == 2
|
||||
else:
|
||||
scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if same_domain(s, servers[i]) ])
|
||||
for op, nodes in nodes_by_operation.items():
|
||||
logger.info(f'Operation {op} streamed to nodes {nodes}')
|
||||
assert len(nodes) == 1, "Each streaming operation should stream to exactly one primary replica"
|
||||
assert nodes[0] in scope_nodes, f"Primary replica should be within the scope {scope}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("domain", ['rack', 'dc'])
|
||||
async def test_restore_primary_replica_different_domain(manager: ManagerClient, object_storage, domain):
|
||||
'''Check that restoring with primary_replica_only and wider scope permits cross-domain streaming.
|
||||
The test checks that each mutation exists exactly 1 time within the cluster, in one of the domains.
|
||||
(each restoring node would pick the same primary replica, one would pick it within its own domain(itself), one would pick it from the other domain.
|
||||
Without primary_replica_only we'd see 2 replicas, 1 in each domain).
|
||||
The test also checks that the logs of each restoring node shows streaming to two nodes because cross-domain streaming is allowed
|
||||
and eventually one node, depending on tablet_id of mutations, will end up choosing either of the two nodes as primary replica.'''
|
||||
|
||||
dcs = 1 if domain == 'rack' else 2
|
||||
racks = 2 if domain == 'rack' else 1
|
||||
rf = 2 if domain == 'rack' else 1
|
||||
topology = topo(rf = rf, nodes = 2, racks = racks, dcs = dcs)
|
||||
scope = "dc" if domain == 'rack' else "all"
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
servers, host_ids = await create_cluster(topology, manager, logger, object_storage)
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
cql = manager.get_cql()
|
||||
|
||||
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger)
|
||||
|
||||
# validate replicas assertions hold on fresh dataset
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf)
|
||||
|
||||
snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
|
||||
prefix = f'{cf}/{snap_name}'
|
||||
|
||||
await asyncio.gather(*(do_backup(s, snap_name, prefix, ks, cf, object_storage, manager, logger) for s in servers))
|
||||
|
||||
logger.info(f'Re-initialize keyspace')
|
||||
cql.execute(f'DROP KEYSPACE {ks}')
|
||||
cql.execute((f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};"))
|
||||
cql.execute(schema)
|
||||
|
||||
await asyncio.gather(*(do_restore_server(manager, logger, ks, cf, s, sstables[s], scope, True, prefix, object_storage) for s in servers))
|
||||
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, expected_replicas=1)
|
||||
|
||||
logger.info(f'Validate streaming directions')
|
||||
for i, s in enumerate(servers):
|
||||
log = await manager.server_open_log(s.server_id)
|
||||
res = await log.grep(r'INFO.*sstables_loader - load_and_stream:.*target_node=([0-9a-z-]+),.*num_bytes_sent=([0-9]+)')
|
||||
streamed_to = set([ r[1].group(1) for r in res ])
|
||||
logger.info(f'{s.ip_addr} {host_ids[s.server_id]} streamed to {streamed_to}')
|
||||
assert len(streamed_to) == 2
|
||||
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_decommision_waits_for_backup(manager: ManagerClient, object_storage):
|
||||
|
||||
@@ -514,7 +514,7 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
|
||||
insert_stmt = cql.prepare(f"INSERT INTO {cf} (pk, t) VALUES (?, ?)")
|
||||
insert_stmt.consistency_level = ConsistencyLevel.ONE
|
||||
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "database_apply", one_shot=False)
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "database_apply", one_shot=False, parameters={"ks_name": ks, "cf_name": table, "what": "throw"})
|
||||
pks = range(256, 512)
|
||||
await asyncio.gather(*[cql.run_async(insert_stmt, (k, f'{k}')) for k in pks])
|
||||
await manager.api.disable_injection(servers[0].ip_addr, "database_apply")
|
||||
@@ -542,7 +542,7 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
|
||||
|
||||
# Expect repair to fail when splitting new sstables
|
||||
await log.wait_for("Repair for tablet migration of .* failed", from_mark=mark)
|
||||
await log.wait_for("Cannot split .* because manager has compaction disabled", from_mark=mark)
|
||||
await log.wait_for("Failed to load SSTable.*\(critical disk utilization\)", from_mark=mark)
|
||||
|
||||
assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []
|
||||
|
||||
|
||||
@@ -254,27 +254,3 @@ async def test_node_ops_task_wait(manager: ManagerClient):
|
||||
|
||||
await decommission_task
|
||||
await waiting_task
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_children(manager: ManagerClient):
|
||||
module_name = "node_ops"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]
|
||||
|
||||
injection = "tasks_vt_get_children"
|
||||
handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
bootstrap_task = [task for task in await tm.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"][0]
|
||||
|
||||
async def _decommission():
|
||||
await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
|
||||
await manager.decommission_node(servers[1].server_id)
|
||||
await handler.message()
|
||||
|
||||
async def _get_status():
|
||||
await tm.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)
|
||||
|
||||
await asyncio.gather(*(_decommission(), _get_status()))
|
||||
|
||||
@@ -78,11 +78,11 @@ async def test_no_cleanup_when_unnecessary(manager: ManagerClient):
|
||||
async def test_cleanup_waits_for_stale_writes(manager: ManagerClient):
|
||||
"""Scenario:
|
||||
* Start two nodes, a vnodes-based table with an rf=2
|
||||
* Run insert while bootstrapping another node, suspend this insert in database_apply_wait injection
|
||||
* Run insert while bootstrapping another node, suspend this insert in database_apply injection
|
||||
* Bootstrap succeeds, capture the final topology version
|
||||
* Start decommission -> triggers global barrier, which we fail on another injection
|
||||
* This failure is not fatal, the cleanup procedure continues and blocks on waiting for the stale write
|
||||
* We release the database_apply_wait injection, cleanup succeeds, write fails with 'stale topology exception'
|
||||
* We release the database_apply injection, cleanup succeeds, write fails with 'stale topology exception'
|
||||
"""
|
||||
|
||||
config = {'tablets_mode_for_new_keyspaces': 'disabled'}
|
||||
@@ -118,15 +118,15 @@ async def test_cleanup_waits_for_stale_writes(manager: ManagerClient):
|
||||
# Have a write request with write_both_read_new version stuck on both nodes:
|
||||
# - On the first node, this exercises the coordinator fencing code path.
|
||||
# - On the second node, this exercises the replica code path.
|
||||
logger.info("Enable 'database_apply_wait' injection")
|
||||
logger.info("Enable 'database_apply' injection")
|
||||
for s in servers[:-1]:
|
||||
await manager.api.enable_injection(s.ip_addr, 'database_apply_wait',
|
||||
False, parameters={'cf_name': 'my_test_table'})
|
||||
await manager.api.enable_injection(s.ip_addr, 'database_apply',
|
||||
False, parameters={'ks_name': ks, 'cf_name': 'my_test_table', 'what': 'wait'})
|
||||
logger.info("Start write")
|
||||
write_task = cql.run_async(f"INSERT INTO {ks}.my_test_table (pk, c) VALUES (1, 1)", host=hosts[0])
|
||||
logger.info("Waiting for database_apply_wait")
|
||||
await log0.wait_for("database_apply_wait: wait")
|
||||
await log1.wait_for("database_apply_wait: wait")
|
||||
logger.info("Waiting for database_apply")
|
||||
await log0.wait_for("database_apply: wait")
|
||||
await log1.wait_for("database_apply: wait")
|
||||
|
||||
# Finish bootstrapping the node
|
||||
logger.info("Trigger topology_coordinator/write_both_read_new/after_barrier")
|
||||
@@ -155,9 +155,9 @@ async def test_cleanup_waits_for_stale_writes(manager: ManagerClient):
|
||||
assert len(flush_matches) == 0
|
||||
|
||||
# Release the write -- the cleanup process should resume and the decommission succeed
|
||||
await manager.api.message_injection(servers[0].ip_addr, "database_apply_wait")
|
||||
await manager.api.message_injection(servers[0].ip_addr, "database_apply")
|
||||
await log0.wait_for("vnodes_cleanup: flush_all_tables", timeout=15)
|
||||
await manager.api.message_injection(servers[1].ip_addr, "database_apply_wait")
|
||||
await manager.api.message_injection(servers[1].ip_addr, "database_apply")
|
||||
await log1.wait_for("vnodes_cleanup: flush_all_tables", timeout=15)
|
||||
|
||||
await decommission_task
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user