Compare commits

..

230 Commits

Author SHA1 Message Date
Avi Kivity
b708e5d7c9 Merge 'test: fix race condition in test_crashed_node_substitution' from Sergey Zolotukhin
`test_crashed_node_substitution` intermittently failed:
```python
   assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake (`finished do_send_ack2_msg`), assuming the node state was visible to all peers. However, since gossip is eventually consistent, the update may not have propagated yet, so some nodes did not see the failed node.

This change: Wait until the gossiper state is visible on peers before continuing the test and asserting.
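The wait can be sketched as a polling helper; `peers`, `failed_host`, and `is_visible` below are hypothetical stand-ins for the test framework's server handles and its gossiper-state check:

```python
import asyncio
import time

async def wait_for_gossip_visibility(peers, failed_host, is_visible, timeout=60.0):
    """Poll every live peer until is_visible(peer, failed_host) confirms
    that the peer's gossiper has observed the crashed node's state."""
    deadline = time.monotonic() + timeout
    pending = set(peers)
    while pending:
        for peer in list(pending):
            if await is_visible(peer, failed_host):
                pending.discard(peer)
        if pending:
            if time.monotonic() > deadline:
                raise TimeoutError(f"gossip state still missing on: {sorted(pending)}")
            await asyncio.sleep(0.1)
```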

Fixes: [SCYLLADB-1256](https://scylladb.atlassian.net/browse/SCYLLADB-1256).

backport: this issue may affect CI for all branches, so it should be backported to all versions.

[SCYLLADB-1256]: https://scylladb.atlassian.net/browse/SCYLLADB-1256?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29254

* github.com:scylladb/scylladb:
  test: test_crashed_node_substitution: add docstring and fix whitespace
  test: fix race condition in test_crashed_node_substitution
2026-03-26 21:40:33 +02:00
Petr Gusev
c38e312321 test_lwt_fencing_upgrade: fix quorum failure due to gossip lag
If lwt_workload() sends an update immediately after a
rolling restart, the coordinator might still see a replica as
down due to gossip lagging behind. Concurrently restarting another
node then leaves only one available replica, failing the
LOCAL_QUORUM requirement of the learn phase or of the eventually
consistent sp::query() in sp::cas(), and resulting in
a mutation_write_failure_exception.

We fix this problem by waiting for the restarted server
to see 2 other peers. The server_change_version
doesn't do that by default -- it passes
wait_others=0 to server_start().

Fixes SCYLLADB-1136

Closes scylladb/scylladb#29234
2026-03-26 21:25:53 +02:00
bitpathfinder
627a8294ed test: test_crashed_node_substitution: add docstring and fix whitespace
Add a description of the test's intent and scenario; remove extra blanks.
2026-03-26 18:40:17 +01:00
bitpathfinder
5a086ae9b7 test: fix race condition in test_crashed_node_substitution
`test_crashed_node_substitution` intermittently failed:
```
    assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake
("finished do_send_ack2_msg"), assuming the node state was
visible to all peers. However, since gossip is eventually
consistent, the update may not have propagated yet, so some
nodes did not see the failed node.

This change: Wait until the gossiper state is visible on
peers before continuing the test and asserting.

Fixes: SCYLLADB-1256.
2026-03-26 18:25:05 +01:00
Robert Bindar
c575bbf1e8 test_refresh_deletes_uploaded_sstables should wait for sstables to get deleted
SSTable unlinking is async, so in some cases it may happen that
the upload dir is not empty immediately after refresh is done.
This patch adjusts test_refresh_deletes_uploaded_sstables so
it waits with a timeout until the upload dir becomes empty
instead of just assuming the API will sync on the sstables
being gone.
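A minimal sketch of such a wait, assuming a plain directory path rather than the test framework's server handles:

```python
import time
from pathlib import Path

def wait_upload_dir_empty(upload_dir: str, timeout: float = 60.0,
                          period: float = 0.5) -> None:
    """SSTable unlinking is async, so poll with a timeout until the upload
    directory has no entries left instead of asserting right after refresh."""
    deadline = time.monotonic() + timeout
    while True:
        leftovers = [p.name for p in Path(upload_dir).iterdir()]
        if not leftovers:
            return
        if time.monotonic() > deadline:
            raise TimeoutError(f"upload dir still contains: {leftovers}")
        time.sleep(period)
```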

Fixes SCYLLADB-1190

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29215
2026-03-26 08:43:14 +03:00
Marcin Maliszkiewicz
7fdd650009 Merge 'test: audit: clean up test helper class naming' from Dario Mirovic
Remove unused `pytest.mark.single_node` marker from `TestCQLAudit`.

Rename `TestCQLAudit` to `CQLAuditTester` to reflect that it is a test helper, not a test class. This avoids accidental pytest collection and subsequent warning about `__init__`.

Logs before the fixes:
```
test/cluster/test_audit.py:514: 14 warnings
  /home/dario/dev/scylladb/test/cluster/test_audit.py:514: PytestCollectionWarning: cannot collect test class 'TestCQLAudit' because it has a __init__ constructor (from: cluster/test_audit.py)
    @pytest.mark.single_node
```

Fixes SCYLLADB-1237

This is an addition to the latest master code. No backport needed.

Closes scylladb/scylladb#29237

* github.com:scylladb/scylladb:
  test: audit: rename TestCQLAudit to CQLAuditTester
  test: audit: remove unused pytest.mark.single_node
2026-03-25 15:30:16 +01:00
Dario Mirovic
552a2d0995 test: audit: rename TestCQLAudit to CQLAuditTester
pytest tries to collect tests for execution in several ways.
One is to pick all classes whose names start with 'Test'. Those
classes must not have a custom '__init__' constructor. TestCQLAudit does.

TestCQLAudit after migration from test/cluster/dtest is not a test
class anymore, but rather a helper class. There are two ways to fix
this:
1. Add __test__ = False to the TestCQLAudit class
2. Rename it to not start with 'Test'

Option 2 feels better because the new name itself does not convey
the wrong message about its role.
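The two options can be contrasted like this (class bodies are simplified stand-ins for the real helper):

```python
# Option 1: keep the Test* name but opt out of collection explicitly.
class TestCQLAuditOptOut:
    __test__ = False  # pytest's documented flag to skip collecting a class

    def __init__(self, cluster):
        self.cluster = cluster

# Option 2 (chosen in this commit): rename so pytest's default
# collection rule ("classes starting with 'Test'") never matches.
class CQLAuditTester:
    def __init__(self, cluster):
        self.cluster = cluster
```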

Fixes SCYLLADB-1237
2026-03-25 13:21:08 +01:00
Dario Mirovic
73de865ca3 test: audit: remove unused pytest.mark.single_node
Remove unused pytest.mark.single_node in TestCQLAudit class.
This is a leftover from audit tests migration from
test/cluster/dtest to test/cluster.

Refs SCYLLADB-1237
2026-03-25 13:18:37 +01:00
Marcin Maliszkiewicz
f988ec18cb test/lib: fix port in-use detection in start_docker_service
Previously, the result of when_all was discarded. when_all stores
exceptions in the returned futures rather than throwing, so the outer
catch (in_use&) could never trigger. Now we capture the when_all result
and inspect each future individually to properly detect in_use from
either stream.
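The fix itself is Seastar C++, but Python's asyncio has the same pitfall: `asyncio.gather(return_exceptions=True)` stores exceptions in the result list instead of raising them, so discarding the results silently swallows failures. A sketch of the analogous inspect-each-result pattern:

```python
import asyncio

class InUse(Exception):
    """Analogue of the in_use exception signalling an occupied port."""

async def probe(fail: bool) -> str:
    if fail:
        raise InUse("port already bound")
    return "ok"

async def port_in_use() -> bool:
    # Like Seastar's when_all, gather(return_exceptions=True) stores
    # exceptions in the result list instead of raising them, so the
    # results must be inspected -- discarding them means no except
    # clause downstream can ever trigger.
    results = await asyncio.gather(probe(False), probe(True),
                                   return_exceptions=True)
    return any(isinstance(r, InUse) for r in results)
```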

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1216

Closes scylladb/scylladb#29219
2026-03-25 11:45:53 +02:00
Artsiom Mishuta
cd1679934c test/pylib: use exponential backoff in wait_for()
Change wait_for() defaults from period=1s/no backoff to period=0.1s
with 1.5x backoff capped at 1.0s. This catches fast conditions in
100ms instead of 1000ms, benefiting ~100 call sites automatically.

Add completion logging with elapsed time and iteration count.

Tested locally with test/cluster/test_fencing.py::test_fence_hints (dev mode),
log output:

  wait_for(at_least_one_hint_failed) completed in 0.83s (4 iterations)
  wait_for(exactly_one_hint_sent) completed in 1.34s (5 iterations)
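The new schedule can be illustrated with a small generator (illustrative only; the real `wait_for()` lives in test/pylib):

```python
def backoff_periods(initial=0.1, factor=1.5, cap=1.0, total=5.0):
    """Yield the sleep periods of an exponential backoff that starts at
    `initial`, grows by `factor`, is capped at `cap`, and stops once the
    summed waiting time reaches `total`."""
    period, elapsed = initial, 0.0
    while elapsed < total:
        yield period
        elapsed += period
        period = min(period * factor, cap)
```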

Fixes SCYLLADB-738

Closes scylladb/scylladb#29173
2026-03-24 23:49:49 +02:00
Botond Dénes
d52fbf7ada Merge 'test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces' from Dawid Mędrek
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up was done before that, it won't include the keyspace. This will
lead to a test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137

Backport: The test is present on all supported branches, and so we
          should backport these changes to them.

Closes scylladb/scylladb#29218

* github.com:scylladb/scylladb:
  test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
  test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
2026-03-24 21:09:19 +02:00
Patryk Jędrzejczak
141aa2d696 Merge 'test/cluster/test_incremental_repair.py: fix typo + enable compaction DEBUG logs' from Botond Dénes
This PR contains two small improvements to `test_incremental_repair.py`
motivated by the sporadic failure of
`test_tablet_incremental_repair_and_scrubsstables_abort`.

The test fails with `assert 3 == 2` on `len(sst_add)` in the second
repair round. The extra SSTable has `repaired_at=0`, meaning scrub
unexpectedly produced more unrepaired SSTables than anticipated. Since
scrub (and compaction in general) logs at DEBUG level and the test did
not enable debug logging, the existing logs do not contain enough
information to determine the root cause.

**Commit 1** fixes a long-standing typo in the helper function name
(`preapre` -> `prepare`).

**Commit 2** enables `compaction=debug` for the Scylla nodes started by
`do_tablet_incremental_repair_and_ops`, which covers all
`test_tablet_incremental_repair_and_*` variants. This will capture full
compaction/scrub activity on the next reproduction, making the failure
diagnosable.

Refs: SCYLLADB-1086

Backport: test improvement, no backport

Closes scylladb/scylladb#29175

* https://github.com/scylladb/scylladb:
  test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
  test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
2026-03-24 16:27:01 +01:00
Ernest Zaslavsky
c670183be8 cmake: fix precompiled header (PCH) creation
Two issues prevented the precompiled header from compiling
successfully when using CMake directly (rather than the
configure.py + ninja build system):

a) Propagate build flags to Rust binding targets reusing the
   PCH. The wasmtime_bindings and inc targets reuse the PCH
   from scylla-precompiled-header, which is compiled with
   Seastar's flags (including sanitizer flags in
   Debug/Sanitize modes). Without matching compile options,
   the compiler rejects the PCH due to flag mismatch (e.g.,
   -fsanitize=address). Link these targets against
   Seastar::seastar to inherit the required compile options.

Closes scylladb/scylladb#28941
2026-03-24 15:53:40 +02:00
Dawid Mędrek
e639dcda0b test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up runs before that, it won't include the keyspace, and the test
will fail.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137
2026-03-24 14:27:36 +01:00
Patryk Jędrzejczak
503a6e2d7e locator: everywhere_replication_strategy: fix sanity_check_read_replicas when read_new is true
ERMs created in `calculate_vnode_effective_replication_map` have RF computed based
on the old token metadata during a topology change. The reading replicas, however,
are computed based on the new token metadata (`target_token_metadata`) when
`read_new` is true. That can create a mismatch for EverywhereStrategy during some
topology changes - RF can be equal to the number of reading replicas +-1. During
bootstrap, this can cause the
`everywhere_replication_strategy::sanity_check_read_replicas` check to fail in
debug mode.

We fix the check in this commit by allowing one more reading replica when
`read_new` is true.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1147

Closes scylladb/scylladb#29150
2026-03-24 13:43:39 +01:00
Jenkins Promoter
0f02c0d6fa Update pgo profiles - x86_64 2026-03-24 14:11:38 +02:00
Dawid Mędrek
4fead4baae test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
One of the tests,
test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces,
didn't have the marker. Let's add it now.
2026-03-24 12:52:00 +01:00
Botond Dénes
ffd58ca1f0 Merge 'test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints' from Dawid Mędrek
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

As a bonus, we improve the test in general too. We explicitly express
the preconditions we rely on, as well as bump the log level. If the
test fails in the future, it might be very difficult to debug it
without this additional information.
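The polling check can be sketched like so; `get_metric` is a hypothetical async accessor for the hints-written metric:

```python
import asyncio
import time

async def wait_for_hints_written(get_metric, expected: int,
                                 timeout: float = 60.0,
                                 period: float = 0.5) -> int:
    """Re-query the hints-written metric until it reaches `expected` or
    the timeout expires, since hints are stored asynchronously."""
    deadline = time.monotonic() + timeout
    while True:
        written = await get_metric()
        if written >= expected:
            return written
        if time.monotonic() > deadline:
            raise TimeoutError(f"only {written}/{expected} hints written")
        await asyncio.sleep(period)
```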

Fixes SCYLLADB-1133

Backport: The test is present on all supported branches. To avoid
          running into more failures, we should backport these changes
          to them.

Closes scylladb/scylladb#29191

* github.com:scylladb/scylladb:
  test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Introduce auxiliary function keyspace_has_tablets
  test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
2026-03-24 13:39:56 +02:00
Andrei Chekun
f6fd3bbea0 test.py: reduce timeout for one test
Reduce the timeout for one test to 60 minutes. The longest test we had
so far was ~10-15 minutes. So reducing this timeout is pretty safe and
should help with hanging tests.

Closes scylladb/scylladb#29212
2026-03-24 12:50:10 +02:00
Marcin Maliszkiewicz
66be0f4577 Merge 'test: cluster: audit test suite optimization' from Dario Mirovic
Migrate audit tests from test/cluster/dtest to test/cluster. Optimize their execution time through cluster reuse.

The audit test suite is heavy. There are more than 70 test executions. Environment preparation is a significant part of each test case execution time.

This PR:
1. Copies audit tests from test/cluster/dtest to test/cluster, refactoring and enabling them
2. Groups tests functions by non-live cluster configuration variations to enable cluster reuse between them
    - Execution time reduced from 4m 29s to 2m 47s, a ~38% decrease in execution time
3. Removes the old audit tests from test/cluster/dtest

Includes two supporting changes:
- Allow specifying `AuthProvider` in `ManagerClient.get_cql_exclusive`
- Fix server log file handling for clean clusters

Refs [SCYLLADB-573](https://scylladb.atlassian.net/browse/SCYLLADB-573)

This PR is an improvement and does not require a backport.

[SCYLLADB-573]: https://scylladb.atlassian.net/browse/SCYLLADB-573?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#28650

* github.com:scylladb/scylladb:
  test: cluster: fix log clear race condition in test_audit.py
  test: pylib: shut down exclusive cql connections in ManagerClient
  test: cluster: fix multinode audit entry comparison in test_audit.py
  test: cluster: dtest: remove old audit tests
  test: cluster: group migrated audit tests for cluster reuse
  test: cluster: enable migrated audit tests and make them work
  test: pylib: manager_client: specify AuthProvider in get_cql_exclusive
  test: pylib: scylla cluster after_test log fix
  test: audit: copy audit test from dtest
2026-03-24 09:29:52 +01:00
Dario Mirovic
120f381a9d pgo: fix maintenance socket path too long
Maintenance socket path used for PGO is in the node workdir.
When the node workdir path is too long, the maintenance socket path
(workdir/cql.m) can exceed the Unix domain socket sun_path limit
and fail the PGO training pipeline.

To prevent this:
- pass an explicit --maintenance-socket override
  pointing to a short deterministic path in /tmp derived from the MD5
  hash of the workdir maintenance socket path
- update maintenance_socket_path to return the matching short path
  so that exec_cql.py connects to the right socket

The short path socket files are cleaned up after the cluster stops.

The path uses the MD5 hash of the workdir path, so it is deterministic.
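The derivation can be sketched as follows; the helper name and `/tmp` prefix are illustrative, not the actual code:

```python
import hashlib

def short_socket_path(workdir_socket_path: str) -> str:
    """Map the (possibly too long) workdir socket path to a short,
    deterministic /tmp path, well under the ~108-byte sun_path limit."""
    digest = hashlib.md5(workdir_socket_path.encode()).hexdigest()
    return f"/tmp/pgo-{digest[:16]}.m"
```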

Fixes SCYLLADB-1070

Closes scylladb/scylladb#29149
2026-03-24 09:17:10 +01:00
Pavel Emelyanov
f112e42ddd raft: Fix split mutations freeze
Commit faa0ee9844 accidentally broke the way split snapshot mutation was
frozen -- instead of appending the sub-mutation `m`, the commit kept the
old variable name `mut`, which in the new code corresponds to the "old"
non-split mutation.

Fixes #29051

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29052
2026-03-24 08:53:50 +02:00
Botond Dénes
56c375b1f3 Merge 'table: don't close a disengaged querier in query()' from Pavel Emelyanov
There's a flaw in table::query() -- calling querier_opt->close() can dereference a disengaged std::optional. The fix is pretty simple. Once fixed, there are two if-s checking whether querier_opt is engaged that are worth being merged.

The problem doesn't really show itself because table::query() is not called with a null saved_querier, so the de-facto behavior is always correct. However, it's better to be on the safe side.

Since the problem doesn't show itself for real, it's not worth backporting.

Closes scylladb/scylladb#29142

* github.com:scylladb/scylladb:
  table: merge adjacent querier_opt checks in query()
  table: don't close a disengaged querier in query()
2026-03-24 08:47:35 +02:00
Yaniv Kaul
e59a21752d .github/workflows/trigger_jenkins.yaml: add workflow permissions
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/147.

To fix the problem, add an explicit `permissions:` block to the workflow
(either at the top level or inside the `trigger-jenkins` job) that
constrains the `GITHUB_TOKEN` to the minimal necessary privileges. This
codifies least-privilege in the workflow itself instead of relying on
repository or organization defaults.

The best minimal, non‑breaking change is to define a root‑level
`permissions:` block with read‑only contents access because the job does
not perform any write operations to the repository, nor does it interact
with issues, pull requests, or other GitHub resources. A conservative,
widely accepted baseline is `contents: read`. If later steps require more
permissions, they can be added explicitly, but for this snippet, no such
need is visible.

Concretely, in `.github/workflows/trigger_jenkins.yaml`, insert:

```yaml
permissions:
  contents: read
```

between the `name:` block and the `on:` block (e.g., after line 2).
No additional methods, imports, or definitions are needed since this is
a pure YAML configuration change and does not alter runtime behavior of
the existing shell steps.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27815
2026-03-24 08:40:30 +02:00
Yaniv Kaul
85a531819b .github/workflows/trigger-scylla-ci.yaml: add permissions to workflow
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/169.

In general, the fix is to add an explicit `permissions:` block to the
workflow (at the root level or per job) so that the `GITHUB_TOKEN` has
only the minimal scopes needed. Since this job only reads event data and
uses secrets to talk to Jenkins, we can restrict `GITHUB_TOKEN` to
read‑only repository contents.

The single best fix here is to add a top‑level `permissions:` block
right under the `name:` (and before `on:`) in
`.github/workflows/trigger-scylla-ci.yaml`, setting `contents: read`.
This applies to all jobs in the workflow, including `trigger-jenkins`,
and does not alter any existing steps or logic. No additional imports or
methods are needed, as this is purely a YAML configuration change for
GitHub Actions.

Concretely, edit `.github/workflows/trigger-scylla-ci.yaml` to insert:

```yaml
permissions:
  contents: read
```

after line 1. No other lines in the file need to change.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27812
2026-03-24 08:37:49 +02:00
Dawid Mędrek
148217bed6 test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
We increase the log level of `hints_manager` to TRACE in the test.
If it fails, it may be incredibly difficult to debug it without any
additional information.
2026-03-23 19:19:17 +01:00
Dawid Mędrek
2b472fe7fd test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints 2026-03-23 19:19:17 +01:00
Dawid Mędrek
ae12c712ce test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
The test relies on the assumption that mutations will be distributed
more or less uniformly over the nodes. Although in practice this should
not happen, theoretically it's possible that there's only one tablet
allocated for the table.

To clearly indicate this precondition, we explicitly set the property
`min_tablet_count` when creating the table. This way, we have a guarantee
that the table has multiple tablets. The load balancer should now take
care of distributing them over the nodes equally. Thanks to that,
`servers[1]` will have some tablets, and so it'll be the target for some
of the mutations we perform.
2026-03-23 19:19:14 +01:00
Dawid Mędrek
dd446aa442 test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
The context manager is the de-facto standard in the test suite. It will
also allow us for a prettier way to conditionally enable per-table
tablet options in the following commit.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
dea79b09a9 test: cluster: Introduce auxiliary function keyspace_has_tablets
The function is adapted from its counterpart in the cqlpy test suite:
cqlpy/util.py::keyspace_has_tablets. We will use it in a commit in this
series to conditionally set tablet properties when creating a table.
It might also be useful in general.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
3d04fd1d13 test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

Fixes SCYLLADB-1133
2026-03-23 19:06:57 +01:00
Botond Dénes
772b32d9f7 test/scylla_gdb: fix flakiness by preparing objects at test time
Fixtures previously ran GDB once (module scope) to find live objects
(sstables, tasks, schemas) and stored their addresses. Tests then
reused those addresses in separate GDB invocations. Sometimes these
addresses would become stale and the test would step on use-after-free
(e.g. sstables compacted away between invocations).

Fix by dropping the fixtures. The helper functions used by the fixtures
to obtain the required objects are converted to gdb convenience
functions, which can be used in the same expression as the test command
invocation. Thus, the object is acquired on-demand at the moment it is
used, so it is guaranteed to be fresh and relevant.

Fixes: SCYLLADB-1020

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#28999
2026-03-23 16:54:03 +02:00
Piotr Dulikowski
60fb5270a9 logstor: fix fmt::format use with std::filesystem::path
The version of fmt installed on my machine refuses to work with
`std::filesystem::path` directly. Add `.string()` calls in places that
attempt to print paths directly in order to make them work.

Closes scylladb/scylladb#29148
2026-03-23 15:15:52 +01:00
Pavel Emelyanov
3b9398dfc8 Merge 'encryption: fix deadlock in encrypted_data_source::get()' from Ernest Zaslavsky
When encrypted_data_source::get() caches a trailing block in _next, the next call takes it directly — bypassing input_stream::read(), which checks _eof. It then calls input_stream::read_exactly() on the already-drained stream. Unlike read(), read_up_to(), and consume(), read_exactly() does not check _eof when the buffer is empty, so it calls _fd.get() on a source that already returned EOS.

In production this manifested as stuck encrypted SSTable component downloads during tablet restore: the underlying chunked_download_source hung forever on the post-EOS get(), causing 4 tablets to never complete. The stuck files were always block-aligned sizes (8k, 12k) where _next gets populated and the source is fully consumed in the same call.

Fix by checking _input.eof() before calling read_exactly(). When the stream has already reached EOF, buf2 is known to be empty, so the call is skipped entirely.

A comprehensive test is added that uses a strict_memory_source which fails on post-EOS get(), reproducing the exact code path that caused the production deadlock.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1128

Backport to 2025.3/4 and 2026.1 is needed since it fixes a bug that may bite us in production, to be on the safe side

Closes scylladb/scylladb#29110

* github.com:scylladb/scylladb:
  encryption: fix deadlock in encrypted_data_source::get()
  test_lib: mark `limiting_data_source_impl` as not `final`
  Fix formatting after previous patch
  Fix indentation after previous patch
  test_lib: make limiting_data_source_impl available to tests
2026-03-23 17:12:44 +03:00
Botond Dénes
f5438e0587 test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
The test sporadically fails because scrub produces an unexpected number
of SSTables. Compaction logs are needed to diagnose why, but were not
captured since scrub runs at DEBUG level. Enable compaction=debug for
the servers started by do_tablet_incremental_repair_and_ops so the next
reproduction provides enough information to root-cause the issue.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:26 +02:00
Botond Dénes
f6ab576ed9 test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:12 +02:00
Piotr Dulikowski
df68d0c0f7 directories: add missing seastar/util/closeable.hh include
Without this include the file would not compile on its own. The issue
was most likely masked by the use of precompiled headers in our CI.

Closes scylladb/scylladb#29170
2026-03-23 15:46:56 +03:00
Yaniv Michael Kaul
051107f5bc scylla-gdb: fix sstable-summary crash on ms-format sstables
The 'scylla sstable-summary' GDB command crashes with
'ValueError: Argument "count" should be greater than zero' when
inspecting ms-format (trie-based) sstables. This happens because
ms-format sstables don't populate the traditional summary structure,
leaving all fields zeroed out, which causes gdb.read_memory() to be
called with a zero count.

Fix by:
- Adding zero-length guards to sstring.to_hex() and sstring.as_bytes()
  to return early when the data length is zero, consistent with the
  existing guard in managed_bytes.get().
- Adding the same guard to scylla_sstable_summary.to_hex().
- Detecting ms-format sstables (version == 5) early in
  scylla_sstable_summary.invoke() and printing an informative message
  instead of attempting to read the unpopulated summary.
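The guard pattern can be shown standalone; `read_memory` below is a stand-in for `gdb.read_memory`, which raises when `count` is zero:

```python
def read_hex(read_memory, address: int, length: int) -> str:
    """Zero-length guard pattern: the reader raises ValueError when
    count == 0 (as gdb.read_memory does), so short-circuit before
    calling it, consistent with the guard in managed_bytes.get()."""
    if length == 0:
        return ""
    return bytes(read_memory(address, length)).hex()
```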

Fixes: SCYLLADB-1180

Closes scylladb/scylladb#29162
2026-03-23 12:44:47 +02:00
Piotr Szymaniak
c8e7e20c5c test/cluster: retry create_table on transient schema agreement timeout
In test_index_requires_rf_rack_valid_keyspace, the create_table call
for a plain tablet-based table can fail with 'Unable to reach schema
agreement' after the server's 10s timeout is exceeded. This happens
when schema gossip propagation across the 4-node cluster takes longer
than expected after a sequence of rapid schema changes earlier in the
test.

Add a retry (up to 2 attempts) on schema agreement errors for this
specific create_table call rather than increasing the server-side
timeout.
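A sketch of the retry wrapper, assuming `create` is a hypothetical zero-argument coroutine function wrapping the actual create_table call:

```python
import asyncio

async def create_table_with_retry(create, attempts: int = 2,
                                  delay: float = 1.0):
    """Retry only on transient 'Unable to reach schema agreement'
    failures; any other error propagates immediately."""
    for attempt in range(1, attempts + 1):
        try:
            return await create()
        except Exception as e:
            if "schema agreement" not in str(e) or attempt == attempts:
                raise
            await asyncio.sleep(delay)
```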

Fixes: SCYLLADB-1135

Closes scylladb/scylladb#29132
2026-03-23 10:45:30 +02:00
Yaniv Kaul
fb1f995d6b .github/workflows/backport-pr-fixes-validation.yaml: workflow does not contain permissions (Potential fix for code scanning alert no. 139)
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/139,

To fix the problem, explicitly restrict the `GITHUB_TOKEN` permissions
for this workflow/job so it has only what is needed. The script reads PR
data and repository info (which is covered by `contents: read`/default
read scopes) and posts a comment via `github.rest.issues.createComment`,
which requires `issues: write`. No other write scopes (e.g., `contents:
write`, `pull-requests: write`) are necessary.

The best fix without changing functionality is to add a `permissions`
block scoped to this job (or at the workflow root). Since we only see a
single job here, we’ll add it under `check-fixes-prefix`. Concretely, in
`.github/workflows/backport-pr-fixes-validation.yaml`, between the
`runs-on: ubuntu-latest` line (line 10) and `steps:` (line 11), add:

```yaml
    permissions:
      contents: read
      issues: write
```

This keeps the token minimally privileged while still allowing the script
to create issue/PR comments.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27810
2026-03-23 10:30:01 +02:00
Piotr Smaron
32225797cd dtest: fix flaky test_writes_schema_recreated_while_node_down
`read_barrier(session2)` was supposed to ensure `node2` has caught up on schema
before a CL=ALL write. But `patient_cql_connection(node2)` creates a
cluster-aware driver session `(TokenAwarePolicy(DCAwareRoundRobinPolicy()))`
that can route the barrier CQL statement to any node — not necessarily `node2`.
If the barrier runs on `node1` or `node3` (which already have the new schema),
it's a no-op, and `node2` remains stale, thus the observed `WriteFailure`.
The fix is to switch to `patient_exclusive_cql_connection(node2)`,
which uses `WhiteListRoundRobinPolicy([node2_ip])` to pin all CQL to `node2`.
This is already the established pattern used by other tests in the same file.

Fixes: SCYLLADB-1139

No need to backport yet, appeared only on master.

Closes scylladb/scylladb#29151
2026-03-23 10:25:54 +02:00
Michał Chojnowski
f29525f3a6 test/boost/cache_algorithm_test: disable sstable compression to avoid giant index pages
The test intentionally creates huge index pages.
But since 5e7fb08bf3,
the index reader allocates a block of memory for a whole index page,
instead of incrementally allocating small pieces during index parsing.
This giant allocation causes the test to fail spuriously in CI sometimes.

Fix this by disabling sstable compression on the test table,
which puts a hard cap of 2000 keys per index page.

Fixes: SCYLLADB-1152

Closes scylladb/scylladb#29152
2026-03-23 09:57:11 +02:00
Raphael S. Carvalho
05b11a3b82 sstables_loader: use new sstable add path
Use add_new_sstable_and_update_cache() when attaching SSTables
downloaded by the node-scoped local loader.

This is the correct variant for new SSTables: it can unlink the
SSTable on failure to add it, and it can split the SSTable if a
tablet split is in progress. The older
add_sstable_and_update_cache() helper is intended for preexisting
SSTables that are already stable on disk.

Additionally, downloaded SSTables are now left unsealed (TemporaryTOC)
until they are successfully added to the table's SSTable set. The
download path (download_fully_contained_sstables) passes
leave_unsealed=true to create_stream_sink, and attach_sstable opens
the SSTable with unsealed_sstable=true and seals it only inside the
on_add callback — matching the pattern used by stream_blob.cc and
storage_service.cc for tablet streaming.

This prevents a data-resurrection hazard: previously, if the process
crashed between download and attach_sstable, or if attach_sstable
failed mid-loop, sealed (TOC) SSTables would remain in the table
directory and be reloaded by distributed_loader on restart. With
TemporaryTOC, sstable_directory automatically cleans them up on
restart instead.

Fixes  https://scylladb.atlassian.net/browse/SCYLLADB-1085.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#29072
2026-03-23 10:33:04 +03:00
Piotr Szymaniak
f511264831 alternator/test: fix test_ttl_with_load_and_decommission flaky Connection refused error
The native Scylla nodetool reports ECONNREFUSED as 'Connection refused',
not as 'ConnectException' (which is the Java nodetool format). Add
'Connection refused' to the valid_errors list so that transient
connection failures during concurrent decommission/bootstrap topology
changes are properly tolerated.

Fixes SCYLLADB-1167

Closes scylladb/scylladb#29156
2026-03-22 11:01:45 +02:00
Pavel Emelyanov
7dce43363e table: merge adjacent querier_opt checks in query()
After the previous fix both guarding if-s start with 'if (querier_opt &&'.
Merge them into a single outer 'if (querier_opt)' block to avoid the
redundant check and make the structure easier to follow.

No functional change.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 14:48:08 +03:00
Piotr Dulikowski
cc695bc3f7 Merge 'vector_search: fix race condition on connection timeout' from Karol Nowacki
When a `with_connect` operation timed out, the underlying connection
attempt continued to run in the reactor. This could lead to a crash
if the connection was established/rejected after the client object had
already been destroyed. This issue was observed during the teardown
phase of a upcoming high-availability test case.

This commit fixes the race condition by ensuring the connection attempt
is properly canceled on timeout.

Additionally, the explicit TLS handshake previously forced during the
connection is now deferred to the first I/O operation, which is the
default and preferred behavior.

Fixes: SCYLLADB-832

Backports to 2026.1 and 2025.4 are required, as this issue also exists on those branches and is causing CI flakiness.

Closes scylladb/scylladb#29031

* github.com:scylladb/scylladb:
  vector_search: test: fix flaky test
  vector_search: fix race condition on connection timeout
2026-03-20 11:12:04 +01:00
Petr Gusev
4bfcd035ae test_fencing: add missing await-s
Fixes SCYLLADB-1099

Closes scylladb/scylladb#29133
2026-03-20 10:55:35 +01:00
Pavel Emelyanov
9c1c41df03 table: don't close a disengaged querier in query()
The condition guarding querier_opt->close() checked saved_querier before checking querier_opt.

When saved_querier is null the short-circuit makes the whole condition true
regardless of whether querier_opt is engaged.  If partition_ranges is empty,
query_state::done() is true before the while-loop body ever runs, so querier_opt
is never created.  Calling querier_opt->close() then dereferences a disengaged
std::optional — undefined behaviour.

Fix by checking querier_opt first.

This preserves all existing semantics (close when not saving, or when saving
wouldn't be useful) while making the no-querier path safe.

Why this doesn't surface today: the sole production call site, database::query(),
always passes a non-null saved_querier in practice.  The API header documents
nullptr as valid ("Pass nullptr when
queriers are not saved"), so the bug is real but latent.
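The short-circuit hazard can be sketched in Python, with None standing in for a disengaged std::optional (names are illustrative, not Scylla's actual code):

```python
class Querier:
    def __init__(self):
        self.closed = False

    def close(self):
        self.closed = True

def close_if_needed_buggy(querier_opt, saved_querier):
    # Original order: when saved_querier is None the short-circuit fires
    # before querier_opt is ever examined.
    if saved_querier is None:
        querier_opt.close()  # AttributeError when querier_opt is None

def close_if_needed_fixed(querier_opt, saved_querier):
    # Fixed order: make sure the querier exists before anything else.
    if querier_opt is not None and saved_querier is None:
        querier_opt.close()
```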

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 12:25:13 +03:00
Pavel Emelyanov
c4a0f6f2e6 object_store: Don't leave dangling objects by iterating moved-from names vector
The code in upload_file std::move()-s the vector of names into the
merge_objects() method, then iterates over this vector to delete
objects. That iteration is a no-op on the moved-from vector.

The fix is to make the merge_objects() helper take the vector of names
by const reference -- the method doesn't modify the names collection,
and the caller keeps it in stable storage.

Fixes #29060

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29061
2026-03-20 10:09:30 +02:00
Pavel Emelyanov
712ba5a31f utils: Use yielding directory_lister in owner verification
Switch directories::do_verify_owner_and_mode() from lister::scan_dir() to
utils::directory_lister while preserving the previous hidden-entry
behavior.

Make do_verify_subpath use lister::filter_type directly so the
verification helper can pass it straight into directory_lister, and keep
a single yielding iteration loop for directory traversal.

One scan_dir user fewer, a step towards removing scan_dir from the code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29064
2026-03-20 10:08:38 +02:00
Pavel Emelyanov
961fc9e041 s3: Don't rearm credential timers when credentials are not refreshed
update_credentials_and_rearm() may get "empty" credentials from
_creds_provider_chain.get_aws_credentials() -- it doesn't throw, but
returns a default-initialized value. In that case expires_at will be
set to time_point::min, and it's not a good idea to arm the
refresh timer, and an even worse idea to subtract 1h from it.
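A Python analogue of the hazard, with datetime.min standing in for time_point::min (a sketch of the guard, not the actual S3 client code):

```python
from datetime import datetime, timedelta

def refresh_arm_time(expires_at):
    """Return when to rearm the refresh timer, or None for empty credentials."""
    if expires_at == datetime.min:  # default-initialized ("empty") credentials
        return None                 # don't arm the timer at all
    return expires_at - timedelta(hours=1)
```

Subtracting from the minimum value is not merely pointless; in Python it even raises OverflowError.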

Fixes #29056

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29057
2026-03-20 10:07:01 +02:00
Pavel Emelyanov
0a8dc4532b s3: Fix missing upload ID in copy_part trace log
The format string had two {} placeholders but three arguments, so the
_upload_id argument was silently skipped from formatting.
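Python's str.format shows the same silent behavior as the fmt-style call here: extra positional arguments beyond the available placeholders are ignored rather than reported.

```python
# Two placeholders, three arguments: the third is silently dropped,
# just like the _upload_id argument in the trace log described above.
msg = "copy part {} of object {}".format(1, "obj", "upload-123")
```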

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29053
2026-03-20 10:05:44 +02:00
Botond Dénes
bb5c328a16 Merge 'Squash two primary-replica restoration tests together' from Pavel Emelyanov
The test_restore_primary_replica_same_domain and test_restore_primary_replica_different_domain tests have a lot in common. Previously each was also split into two, so we had four tests; now we have two that can also be squashed, and the lines-of-code savings are still worth it.

This is the continuation of #28569

Tests improvement, not backporting

Closes scylladb/scylladb#28994

* github.com:scylladb/scylladb:
  test: Replace a bunch of ternary operators with an if-else block
  test: Squash test_restore_primary_replica_same|different_domain tests
  test: Use the same regexp in test_restore_primary_replica_different|same_domain-s
2026-03-20 10:05:16 +02:00
Pavel Emelyanov
ea2a214959 test/backup: Use unique_name() for backup prefix instead of cf_dir
The do_test_backup_abort() fetched the node's workdir and resolved cf_dir
solely to construct a unique-ish backup prefix:

    prefix = f'{cf_dir}/backup'

The comment already acknowledged this was only "unique(ish)" — relying
on the UUID-derived cf_dir name as a uniqueness source is roundabout.
unique_name() is already imported and used for exactly this purpose
elsewhere in the file.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29030
2026-03-20 10:04:22 +02:00
Pavel Emelyanov
65032877d4 api: Move /storage_service/toppartitions from storage_service.cc to column_family.cc
The endpoint URL remains intact. Having it next to another toppartitions
endpoint (the /column_family/toppartitions one) is natural.

This endpoint only needs sharded<replica::database>&, grabs it from
http_context and doesn't use any other service. In column_family.cc the
database reference is already available as a parameter. Once more user
of http_context.db is gone.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#28996
2026-03-20 09:52:33 +02:00
Botond Dénes
de0bdf1a65 Merge 'Decouple test_refresh_deletes_uploaded_sstables from backup test-suite' from Pavel Emelyanov
The test in question uses several helpers from the backup suite, but it doesn't really need them -- the operations it wants to perform can be done with standard pylib methods. While at it, also remove some dangling, effectively unused local variables from this test (apparently left over from the backup tests this one was copied and reworked from).

Enhancing tests, not backporting

Closes scylladb/scylladb#29130

* github.com:scylladb/scylladb:
  test/refresh: Simplify refresh invocation
  test/refresh: Remove r_servers alias for servers
  test/refresh: Replace check_mutation_replicas with a plain CQL SELECT
  test/refresh: Inline keyspace/table/data setup in test_refresh_deletes_uploaded_sstables
  test/refresh: Prepare indentation for new_test_keyspace in test_refresh_deletes_uploaded_sstables
  test/refresh: Decouple test_refresh_deletes_uploaded_sstables from backup tests
  test/refresh: Remove unused wait_for_cql_and_get_hosts import
2026-03-20 09:29:15 +02:00
Botond Dénes
97430e2df5 Merge 'Fix object storage lister entries walking loop' from Pavel Emelyanov
Two issues were found in the lister returned by gs_client_wrapper::make_object_lister():
it can report EOF too early when a filter is active, and there is a potential vector out-of-bounds access.
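A minimal sketch of the corrected walking loop (hypothetical Python, not the actual C++ lister): the position index is reset for every newly fetched batch, and EOF is reported only when the source itself is exhausted, not when a filtered batch comes out empty.

```python
class BatchLister:
    """Iterate entries fetched in batches, applying an optional filter."""
    def __init__(self, fetch_batch, predicate=lambda e: True):
        self._fetch = fetch_batch  # returns the next batch, [] at EOF
        self._pred = predicate
        self._batch, self._pos, self._eof = [], 0, False

    def get(self):
        while True:
            while self._pos < len(self._batch):
                entry = self._batch[self._pos]
                self._pos += 1
                if self._pred(entry):
                    return entry
            if self._eof:
                return None
            self._batch = self._fetch()
            self._pos = 0                 # reset position for the new batch
            self._eof = not self._batch   # EOF only when the source is empty,
                                          # not when the filtered view is
```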

Fixes #29058

The code appeared in 2026.1, worth fixing it there as well

Closes scylladb/scylladb#29059

* github.com:scylladb/scylladb:
  sstables: Fix object storage lister not resetting position in batch vector
  sstables: Fix object storage lister skipping entries when filter is active
2026-03-20 09:12:42 +02:00
Botond Dénes
5573c3b18e Merge 'tablets: Fix deadlock in background storage group merge fiber' from Tomasz Grabiec
When it deadlocks, groups stop merging and the compaction group merge
backlog will run away.

Also, graceful shutdown will be blocked on it.

Found by flaky unit test
test_merge_chooses_best_replica_with_odd_count, which timed out in 1
in 100 runs.

Reason for deadlock:

When storage groups are merged, the main compaction group of the new
storage group takes a compaction lock, which is appended to
_compaction_reenablers_for_merging, and released when the merge
completion fiber is done with the whole batch.

If we accumulate more than 1 merge cycle for the fiber, deadlock
occurs. The lock order will be as follows:

Initial state:

 cg0: main
 cg1: main
 cg2: main
 cg3: main

After 1st merge:

 cg0': main [locked], merging_groups=[cg0.main, cg1.main]
 cg1': main [locked], merging_groups=[cg2.main, cg3.main]

After 2nd merge:

 cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]

merge completion fiber will try to stop cg0'.main, which will be
blocked on compaction lock. which is held by the reenabler in
_compaction_reenablers_for_merging, hence deadlock.

The fix is to wait for background merge to finish before we start the
next merge. It's achieved by holding old erm in the background merge,
and doing a topology barrier from the merge finalizing transition.

Background merge is supposed to be a relatively quick operation: it
stops compaction groups, so it may wait for active requests, but it
shouldn't prolong the barrier indefinitely.

Tablet tests which trigger merge need to be adjusted to call the
barrier, otherwise they will be vulnerable to the deadlock.
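The essence of the fix -- never start a new merge cycle while the previous background finalization is still running -- can be sketched with asyncio (illustrative only; the real code holds the old erm and uses a topology barrier):

```python
import asyncio

async def merge_cycles(cycles, finalize):
    """Run merge cycles, never letting two background finalizers overlap."""
    background = None
    for cycle in cycles:
        if background is not None:
            await background  # barrier: previous background merge must finish
        background = asyncio.ensure_future(finalize(cycle))
    if background is not None:
        await background
```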

Fixes SCYLLADB-928

Backport to >= 2025.4 because it's the earliest vulnerable due to f9021777d8.

Closes scylladb/scylladb#29007

* github.com:scylladb/scylladb:
  tablets: Fix deadlock in background storage group merge fiber
  replica: table: Propagate old erm to storage group merge
  test: boost: tablets_test: Save tablet metadata when ACKing split resize decision
  storage_service: Extract local_topology_barrier()
2026-03-20 09:05:52 +02:00
Botond Dénes
34473302b0 Merge 'docs: document existing guardrails' from Andrzej Jackowski
This patch series introduces new documentation for existing guardrails.

Moreover:
 - Warning / failure messages of recently added write CL guardrails (SCYLLADB-259) are rephrased, so all guardrails have similar messages.
 - Some new tests are added, to help verify the correctness of the documentation and avoid situations where the documentation and implementation diverge.

Fixes: [SCYLLADB-257](https://scylladb.atlassian.net/browse/SCYLLADB-257)

No backport, just new docs and tests.

[SCYLLADB-257]: https://scylladb.atlassian.net/browse/SCYLLADB-257?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29011

* github.com:scylladb/scylladb:
  test: add new guardrail tests matching documentation scenarios
  test: add metric assertions to guardrail replication strategy tests
  test: use regex matching in guardrail replication strategy tests
  test: extract ks_opts helper in test_guardrail_replication_strategy
  docs: document CQL guardrails
  cql: improve write consistency level guardrail messages
2026-03-20 08:56:00 +02:00
artem.penner
9898e5700b scylla-node-exporter: Add systemd collector to node exporter
This PR enables the node_exporter systemd collector and configures the unit whitelist to include scylla-server.service and systemd-coredump services.

**Motivation**: We currently lack visibility into system-level service states, which is critical for diagnosing stability issues.

This configuration enables two specific use cases:
- Detecting Coredump Loops: We encounter scenarios where ScyllaDB enters a restart loop. To pinpoint SIGSEGV (coredumps) as the root cause, we need to track when the systemd-coredump service becomes active, indicating a dump is being processed.
- Identifying Startup Failures: We need to detect when the scylla-server unit enters a failed state. This is essential for catching unrecoverable errors (e.g., corrupted commitlogs or configuration bugs) that prevent the server from starting.

example of promql queries:
- `node_systemd_unit_state{name=~"systemd-coredump@.*", state="active"} == 1`
- `node_systemd_unit_state{name="scylla-server.service", state="failed"} == 1`

Closes #28402
2026-03-20 08:39:56 +02:00
Andrzej Jackowski
10c4b9b5b0 test: verify signal() detects resource negative leak in rcs
reader_concurrency_semaphore::signal() guards against available
resources exceeding the initial limit after a signal, which would
indicate a bug such as double-returning resources. It reports the
issue via on_internal_error_noexcept and clamps resources back to
the initial values. However, before this commit there were no tests
that verified this behavior, so bugs like SCYLLADB-1014 went
undetected.

Add a test that artificially signals resources that were never
consumed and verifies that signal() detects the negative leak and
clamps available resources back to the initial limit.
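The guard being exercised can be sketched with plain integers for the semaphore's resource counters (illustrative, not the real reader_concurrency_semaphore code):

```python
def signal(available, returned, initial):
    """Return resources; detect and clamp a negative leak."""
    available += returned
    leaked = available > initial
    if leaked:                 # more was returned than was ever consumed
        available = initial    # clamp back to the initial limit
    return available, leaked
```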

Refs: SCYLLADB-1014
Fixes: SCYLLADB-1031

Closes scylladb/scylladb#28993
2026-03-20 09:21:20 +03:00
Botond Dénes
f9adbc7548 test/cqlpy/test_tombstone_limit.py: disable tombstone-gc for test table
Since 7564a56dc8, all tables default to
repair-mode tombstone-gc, which is identical to immediate-mode for RF=1
tables. Consequently the tombstones written by the tests in this test
file are immediately collectible and with some unlucky timing, some of
them can be collected before the end of the test, failing the empty-page
prefix check because the empty pages prefix will be smaller than
expected based on the number of tombstones written.
Disable tombstone-gc to remove this source of flakiness.

Fixes: SCYLLADB-1062

Closes scylladb/scylladb#29077
2026-03-20 09:14:29 +03:00
Michał Chojnowski
6b18d95dec test: add a missing reconnect_driver in test_sstable_compression_dictionaries_upgrade.py
Need to work around https://github.com/scylladb/python-driver/issues/295,
lest a CQL query fail spuriously after the cluster restart.

Fixes: SCYLLADB-1114

Closes scylladb/scylladb#29118
2026-03-20 09:05:14 +03:00
Botond Dénes
89388510a0 test/cluster/test_data_resurrection_in_memtable.py: use explicit CL
The test has expectations w.r.t. which writes make it to which nodes:
* inserts make it to all nodes
* delete makes it to all-1 (QUORUM) node

However, this was not expressed with CL, and the default CL=ONE allowed
some nodes to miss the writes, violating the test's expectations
about what data is present on which nodes. This resulted in
the test being flaky and failing the data checks.

Use explicit CL for the ingestion to prevent this.

The improvements to the test introduced in
a8dd13731f were of great help in
investigating this: traces are now available and the check happens after
the data was dumped to logs.

Fixes: SCYLLADB-870
Fixes: SCYLLADB-812
Fixes: SCYLLADB-1102

Closes scylladb/scylladb#29128
2026-03-20 09:02:57 +03:00
Avi Kivity
6b259babeb Merge 'logstor: initial log-structured storage for key-value tables' from Michael Litvak
Introduce an initial and experimental implementation of an alternative log-structured storage engine for key-value tables.

Main flows and components:
* The storage is composed of 32MB files, each file divided into segments of size 128k. We sequentially write records that contain a mutation and additional metadata. Records are written to a buffer first and then written to the active segment sequentially in 4k sized blocks.
* The primary index in memory maps keys to their location on disk. It is a B-tree per-table that is ordered by tokens, similar to a memtable.
* On reads we calculate the key and look it up in the primary index, then read the mutation from disk with a single disk IO.
* On writes we write the record to a buffer, wait for it to be written to disk, then update the index with the new location, and free the previous record.
* We track the used space in each segment. When overwriting a record, we increase the free space counter for the segment of the previous record that becomes dead. We store the segments in a histogram by usage.
* The compaction process takes segments with low utilization, reads them and writes the live records to new segments, and frees the old segments.
* Segments are initially "mixed" - we write to the active segment records from all tables and all tablets. The "separator" process rewrites records from mixed segments into new segments that are organized by compaction groups (tablets), and frees the mixed segments. Each write is written to the active segment and to a separator buffer of the compaction group, which is eventually flushed to a new segment in the compaction group.

Currently this mode is experimental and requires an experimental flag to be enabled.
Some things that are not supported yet are strong consistency, tablet migration, tablet split/merge, big mutations, tombstone gc, ttl.

to use, add to config:
```
enable_logstor: true

experimental_features:
  - logstor
```

create a table:
```
CREATE TABLE ks.t(pk int PRIMARY KEY, a int, v text) WITH storage_engine = 'logstor';
```

INSERT, SELECT, and DELETE work as expected.
UPDATE is not supported yet.

no backport - new feature

Closes scylladb/scylladb#28706

* github.com:scylladb/scylladb:
  logstor: trigger separator flush for buffers that hold old segments
  docs/dev: add logstor documentation
  logstor: recover segments into compaction groups
  logstor: range read
  logstor: change index to btree by token per table
  logstor: move segments to replica::compaction_group
  db: update dirty mem limits dynamically
  logstor: track memory usage
  logstor: logstor stats api
  logstor: compaction buffer pool
  logstor: separator: flush buffer when full
  logstor: hold segment until index updates
  logstor: truncate table
  logstor: enable/disable compaction per table
  logstor: separator buffer pool
  test: logstor: add separator and compaction tests
  logstor: segment and separator barrier
  logstor: separator debt controller
  logstor: compaction controller
  logstor: recovery: recover mixed segments using separator
  logstor: wait for pending reads in compaction
  logstor: separator
  logstor: compaction groups
  logstor: cache files for read
  logstor: recovery: initial
  logstor: add segment generation
  logstor: reserve segments for compaction
  logstor: index: buckets
  logstor: add buffer header
  logstor: add group_id
  logstor: record generation
  logstor: generation utility
  logstor: use RIPEMD-160 for index key
  test: add test_logstor.py
  api: add logstor compaction trigger endpoint
  replica: add logstor to db
  schema: add logstor cf property
  logstor: initial commit
  db: disable tablet balancing with logstor
  db: add logstor experimental feature flag
2026-03-20 00:18:09 +02:00
Avi Kivity
062751fcec Merge 'db/config: enable ms sstable format by default' from Łukasz Paszkowski
Trie-based sstable indexes are supposed to be (hopefully) a better default than the old BIG indexes.
Make the new format the default for new clusters by naming `ms` in the default scylla.yaml.

New functionality. No backport needed.

This PR is basically Michał's one https://github.com/scylladb/scylladb/pull/26377, Jakub's  https://github.com/scylladb/scylladb/pull/27332 fixing `sstables_manager::get_highest_supported_format()` and one test fix.

Closes scylladb/scylladb#28960

* github.com:scylladb/scylladb:
  db/config: announce ms format as highest supported
  db/config: enable `ms` sstable format by default
  cluster/dtest/bypass_cache_test: switch from highest_supported_sstable_format to chosen_sstable_format
  api/system: add /system/chosen_sstable_version
  test/cluster/dtest: reduce num_tokens to 16
2026-03-19 18:19:01 +02:00
Pavel Emelyanov
969dddb630 test/refresh: Simplify refresh invocation
take_snapshot return values were unused so drop them. do_refresh was a
thin wrapper around load_new_sstables that added no logic; inline it
directly into the gather expression.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:57 +03:00
Pavel Emelyanov
de21572b31 test/refresh: Remove r_servers alias for servers
r_servers = servers was a no-op assignment; use servers directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:52 +03:00
Pavel Emelyanov
20b1531e6d test/refresh: Replace check_mutation_replicas with a plain CQL SELECT
The goal of test_refresh_deletes_uploaded_sstables is to verify that
sstables are removed from the upload directory after refresh. The replica
check was just a sanity guard; a simple SELECT of all keys is sufficient
and much lighter.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-19 18:42:48 +03:00
Pavel Emelyanov
c591b9ebe2 test/refresh: Inline keyspace/table/data setup in test_refresh_deletes_uploaded_sstables
Replace create_dataset() with explicit keyspace creation via new_test_keyspace,
inline CREATE TABLE, and direct cql.run_async inserts — matching the pattern
used in do_test_streaming_scopes. This removes the last dependency on backup
helpers for dataset setup and makes the test self-contained.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:44 +03:00
Pavel Emelyanov
06006a6328 test/refresh: Prepare indentation for new_test_keyspace in test_refresh_deletes_uploaded_sstables
Wrap the test body under if True: to pre-indent it, making the subsequent
patch that introduces new_test_keyspace a pure content change with no
whitespace noise.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:40 +03:00
Pavel Emelyanov
67d8cde42d test/refresh: Decouple test_refresh_deletes_uploaded_sstables from backup tests
Replace create_cluster() from object_store/test_backup.py with a plain
manager.servers_add(2) call. The test does not use object storage, so
there is no need to pull in the backup helper along with its config and
logging knobs.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:36 +03:00
Pavel Emelyanov
04f046d2d8 test/refresh: Remove unused wait_for_cql_and_get_hosts import
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:32 +03:00
Botond Dénes
e8b37d1a89 Merge 'doc: fix the installation section' from Anna Stuchlik
This PR fixes the Installation page:

- Replaces `http `with `https `in the download command.
- Replaces the Open Source example from the Installation section for CentOS (we overlooked this example before).

Fixes https://github.com/scylladb/scylladb/issues/29087

This update affects all supported versions and should be backported as a bug fix.

Closes scylladb/scylladb#29088

* github.com:scylladb/scylladb:
  doc: remove the Open Source Example from Installation
  doc: replace http with https in the installation instructions
2026-03-19 17:13:53 +02:00
Dario Mirovic
d2c44722e1 test: cluster: fix log clear race condition in test_audit.py
assert_entries_were_added:
- takes a "before" snapshot of the audit log
- yields to execute a statement
- takes an "after" snapshot of the audit log
- computes new rows by diffing "after" minus "before"

If an audit entry generated by prepare() arrives between the snapshot
and the diff, it inflates the new row count and the test fails with
assert 2 <= 1.

Fix by:
- Adding clear_audit_logs() at the end of prepare(), after all setup
- Waiting for the "completed re-reading configuration file" log message
  after server_update_config
- Draining pending syslog lines before clearing the buffer

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
821f8696a7 test: pylib: shut down exclusive cql connections in ManagerClient
get_cql_exclusive() creates a Cluster object per call, but never
records it. driver_close() cannot shut it down. The cluster's
internal scheduler thread then tries to submit work to an already
shut down executor. This causes RuntimeError:

RuntimeError: cannot schedule new futures after shutdown

Fix this by tracking every exclusive Cluster in a list and shutting
them all down in driver_close().
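The fix boils down to bookkeeping that can be sketched like this (`Cluster` here is any object with a shutdown() method, not the real cassandra-driver class):

```python
class DriverManager:
    """Track every exclusive Cluster so driver_close() can shut all down."""
    def __init__(self, cluster_factory):
        self._factory = cluster_factory
        self._exclusive_clusters = []

    def get_cql_exclusive(self, host):
        cluster = self._factory(host)
        self._exclusive_clusters.append(cluster)  # record it for cleanup
        return cluster

    def driver_close(self):
        for cluster in self._exclusive_clusters:
            cluster.shutdown()                    # stops scheduler threads too
        self._exclusive_clusters.clear()
```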

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
d94999f87b test: cluster: fix multinode audit entry comparison in test_audit.py
assert_entries_were_added computes new audit rows by slicing the "after"
list at the length of the "before" list: rows_after[len(rows_before):].
This assumes new rows always appear at the tail of the combined sorted
list. In a multinode setup, each node generates its own event_time
timestamps. A new row from node A can sort before an old row from node
B, breaking the tail assumption. The assertion "new rows are not the
last rows in the audit table" then fires.

Fix this by splitting the before/after lists per node and computing the
new rows tail independently for each node. This guarantees that per node
ordering, which is monotonic, is respected, and the combined new rows
are sorted afterwards.
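The per-node diff can be sketched as follows, with rows reduced to (node, event_time) pairs for illustration:

```python
from collections import defaultdict

def group_by_node(rows):
    grouped = defaultdict(list)
    for row in rows:  # row = (node, event_time)
        grouped[row[0]].append(row)
    return grouped

def new_rows(rows_before, rows_after):
    """Per-node tail diff: event_time is monotonic within a node only."""
    before = group_by_node(rows_before)
    result = []
    for node, rows in group_by_node(rows_after).items():
        result.extend(rows[len(before[node]):])  # tail past that node's old rows
    return sorted(result)
```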

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
249a6cec1b test: cluster: dtest: remove old audit tests
Since audit tests have been migrated to test/cluster/test_audit.py,
old tests in test/cluster/dtest/audit_test.py have to be removed.

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
adc790a8bf test: cluster: group migrated audit tests for cluster reuse
This patch reorganizes the execution flow of the test functions.
They are grouped to enable cluster reuse between specific test
functions. One of the main contributors to the test execution time
is the cluster preparation. This patch significantly reduces the
total test execution time by having way less new cluster preparation
calls and more cluster reuse.

Performance increase on the developer machine is around 38%:
- before: 4m 29s
- after: 2m 47s

Fixes SCYLLADB-573
2026-03-19 16:11:47 +01:00
Dario Mirovic
967b7ff6bf test: cluster: enable migrated audit tests and make them work
Move audit tests from test/cluster/dtest to test/cluster.
The test/cluster environment has less overhead, and audit tests
are heavy, their execution taking a lot of time. This patch
is part of an effort to improve audit test suite performance.

This patch refactors the tests so that they execute correctly,
as well as enables them. A follow up patch will remove the
audit tests in test/cluster/dtest.

All the tests are confirmed to be running after the change.
No dead code present.

Test test_audit_categories_invalid is not parametrized anymore.
It never used the parametrized helper class, so it just ran
the same logic three times. This is why there are now 74,
and not 76, test executions.

Refs SCYLLADB-573
2026-03-19 16:07:28 +01:00
Dario Mirovic
5d51501a0b pgo: use maintenance socket for CQL setup in PGO training
The default 'cassandra' superuser was removed from ScyllaDB, which
broke PGO training. exec_cql.py relied on username/password auth
('cassandra'/'cassandra') to execute setup CQL scripts like auth.cql
and counters.cql.

Switch exec_cql.py to connect via the Unix domain maintenance socket
instead. The maintenance socket bypasses authentication, so no
credentials are needed. Additionally, create the 'cassandra' superuser
via the maintenance socket during the populate phase, so that
cassandra-stress keeps working (it hardcodes user=cassandra
password=cassandra).

Changes:
- exec_cql.py: replace host/port/username/password arguments with a
  single --socket argument; add connect_maintenance_socket() with
  wait ready logic
- pgo.py: add maintenance_socket_path() helper; update
  populate_auth_conns() and populate_counters() to pass the socket
  path to exec_cql.py

Fixes SCYLLADB-1070

Closes scylladb/scylladb#29081
2026-03-19 16:52:36 +02:00
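The "wait ready" step described above can be sketched as a small poll loop. This is an illustrative Python sketch under assumed names and timeouts, not the actual exec_cql.py code:

```python
import os
import socket
import time

def wait_for_maintenance_socket(path, timeout=30.0, poll=0.5):
    """Poll until a Unix domain socket at `path` accepts connections.

    Hypothetical sketch of the "wait ready" logic the commit describes;
    the real helper in exec_cql.py may differ.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.exists(path):
            s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            try:
                s.connect(path)
                return True  # socket exists and is accepting connections
            except OSError:
                pass  # socket file present but server not listening yet
            finally:
                s.close()
        time.sleep(poll)
    return False
```

Polling with a deadline (rather than a single connect attempt) matters here because the socket file may appear before the server actually listens on it.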
Dario Mirovic
8367509b3b test: pylib: manager_client: specify AuthProvider in get_cql_exclusive
This patch allows ManagerClient.get_cql_exclusive to accept an
AuthProvider as a parameter. This will be used in a follow-up patch
which migrates the audit test suite to test/cluster and requires this
functionality for some tests.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
Dario Mirovic
0a7a69345c test: pylib: scylla cluster after_test log fix
Before any test, a pool of ScyllaCluster objects is created.

At the beginning of a test suite, a ScyllaClusterManager is created,
and given a reference to the pool.
At the end of a test suite, the ScyllaClusterManager is destroyed.

Before each test case:
- ManagerClient is constructed and connected to the ScyllaClusterManager
  of that test suite
- A ScyllaCluster object is fetched from the pool
  - If the pool is empty, a new ScyllaCluster object is created
  - If the pool is not empty, a cached ScyllaCluster object is returned

After each test case:
- Return ScyllaCluster object from ManagerClient to the pool
  - If the cluster is dirty, the pool destroys it
  - If the cluster is clean, the pool caches it
- ManagerClient is destroyed

Many actions mark a cluster as dirty. Normal test execution will always
cause the cluster to be destroyed upon returning to the pool.
ManagerClient.mark_clean is not used in the tests; when it is used,
the cluster reuse flow happens.

The bug is that the log file is closed even if the cluster is not
dirty. This causes an error when trying to log to a reused cluster
server.

The solution in this patch is to not close the log file if the cluster
is not dirty. Upon cluster reuse the log file will be open and
functional.

Another approach would be to reopen the log file if it is closed, but
the approach taken here seems cleaner.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
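The dirty/clean return path described above can be sketched in a few lines of Python; class and function names here are illustrative stand-ins, not the real pylib code:

```python
class ServerLogFile:
    """Minimal stand-in for a cluster server's log file handle."""
    def __init__(self):
        self.closed = False

    def write(self, line):
        if self.closed:
            raise ValueError("log file is closed")

    def close(self):
        self.closed = True

def return_to_pool(cluster_dirty, log_file):
    """The fix in a nutshell: only tear down (and close the log) for
    dirty clusters; a clean cluster is cached for reuse with its log
    file still open and functional."""
    if cluster_dirty:
        log_file.close()
        return "destroyed"
    return "cached"
```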
Dario Mirovic
899ae71349 test: audit: copy audit test from dtest
This patch just copies the audit test suite from dtest and
disables it in the test config file. Later patches will
update the code and enable the test suite.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
Andrzej Jackowski
4deeb7ebfc test: add new guardrail tests matching documentation scenarios
Add tests for RF guardrails (min/max warn/fail, RF=0 bypass,
threshold=-1 disable, ALTER KEYSPACE) and write consistency level
guardrails to cover all scenarios described in guardrails.rst.

Test runtime (dev):
test_guardrail_replication_strategy - 6s
test_guardrail_write_consistency_level - 5s

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
Andrzej Jackowski
2a03c634c0 test: add metric assertions to guardrail replication strategy tests
Verify that guardrail violations increment the corresponding metrics.

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
Andrzej Jackowski
81c4e717e2 test: use regex matching in guardrail replication strategy tests
Replace loose substring assertions with regex-based matching against
the exact server message formats. Add regex constants for all
guardrail messages and rewrite create_ks_and_assert_warnings_and_errors()
to verify count and content of warnings and failures.

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
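The substring-to-regex tightening described above can be illustrated like this; the message format and helper name are assumptions for the example, not the exact server wording or test code:

```python
import re

# Illustrative regex for a replication-factor guardrail warning; the
# exact server message format in ScyllaDB may differ.
RF_WARN_RE = re.compile(
    r"^Replication factor (?P<rf>\d+) exceeds the warn threshold (?P<thr>\d+)$"
)

def assert_guardrail_warning(warnings, rf, threshold):
    """Verify both the count and the content of warnings, instead of a
    loose `"threshold" in warning` substring check."""
    matches = [m for w in warnings if (m := RF_WARN_RE.match(w))]
    assert len(matches) == 1, warnings
    m = matches[0]
    assert int(m.group("rf")) == rf and int(m.group("thr")) == threshold
```

Anchored regexes with capture groups catch message drift and wrong counts that a substring check would silently accept.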
Anna Stuchlik
6b1df5202c doc: remove the instructions to install old versions from Web Installer
The Web Installer page includes instructions to install the old pre-2025.1 Enterprise versions,
which are no longer supported (since we released 2026.1).

This commit removes those redundant and misleading instructions.

Fixes https://github.com/scylladb/scylladb/issues/29099

Closes scylladb/scylladb#29103
2026-03-19 15:47:00 +02:00
Piotr Dulikowski
171504c84f Merge 'auth: migrate some standard role manager APIs to use cache' from Marcin Maliszkiewicz
This patchset migrates: query_all_directly_granted, query_all,
get_attribute, query_attribute_for_all functions to use cache
instead of doing CQL queries. It also includes some preparatory
work which fixes cache update order and triggering.

Main motivation behind this is to make sure that all calls
from service_level_controller::auth_integration are cached,
which we achieve here.

Alternative implementation could move the whole auth_integration
data into auth cache but since auth_integration manages also lifetime
and contains service levels specific logic such solution would be
too complex for little (if any) gain.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-159
Backport: no, not a bug

Closes scylladb/scylladb#28791

* github.com:scylladb/scylladb:
  auth: switch query_attribute_for_all to use cache
  auth: switch get_attribute to use cache
  auth: cache: add heterogeneous map lookups
  auth: switch query_all to use cache
  auth: switch query_all_directly_granted to use cache
  auth: cache: add ability to go over all roles
  raft: service: reload auth cache before service levels
  service: raft: move update_service_levels_effective_cache check
2026-03-19 14:37:22 +01:00
Avi Kivity
5e7fb08bf3 Merge 'Fix bad performance for densely populated partition index pages' from Tomasz Grabiec
This applies to small partition workloads where index pages have a high partition count and the index doesn't fit in cache. It was observed that the count can be on the order of hundreds. In such a workload pages undergo constant population, LSA compaction, and LSA eviction, which has a severe impact on CPU utilization.

Refs https://scylladb.atlassian.net/browse/SCYLLADB-620

This PR reduces the impact by several changes:

  - reducing memory footprint in the partition index. Assuming partition key size is 16 bytes, the cost dropped from 96 bytes to 36 bytes per partition.

  - flattening the object graph and amortizing storage. Storing entries directly in the vector. Storing all key values in a single managed_bytes. Making index_entry a trivial struct.

  - index entries and key storage are now trivially moveable, and batched inside vector storage
    so LSA migration can use memcpy(), which amortizes the cost per key. This reduces the cost of LSA segment compaction.

  - LSA eviction is now pretty much constant time for the whole page
    regardless of the number of entries, because elements are trivial and batched inside vectors.
    Page eviction cost dropped from 50 us to 1 us.

Performance evaluated with:

   scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

```
7774.96 tps (166.0 allocs/op, 521.7 logallocs/op,  54.0 tasks/op,  802428 insns/op,  430457 cycles/op,        0 errors)
7511.08 tps (166.1 allocs/op, 527.2 logallocs/op,  54.0 tasks/op,  804185 insns/op,  430752 cycles/op,        0 errors)
7740.44 tps (166.3 allocs/op, 526.2 logallocs/op,  54.2 tasks/op,  805347 insns/op,  432117 cycles/op,        0 errors)
7818.72 tps (165.2 allocs/op, 517.6 logallocs/op,  53.7 tasks/op,  794965 insns/op,  427751 cycles/op,        0 errors)
7865.49 tps (165.1 allocs/op, 513.3 logallocs/op,  53.6 tasks/op,  788898 insns/op,  425171 cycles/op,        0 errors)
```

After (+318%):

```
32492.40 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109236 insns/op,  103203 cycles/op,        0 errors)
32591.99 tps (130.4 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  108947 insns/op,  102889 cycles/op,        0 errors)
32514.52 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109118 insns/op,  103219 cycles/op,        0 errors)
32491.14 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109349 insns/op,  103272 cycles/op,        0 errors)
32582.90 tps (130.5 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109269 insns/op,  102872 cycles/op,        0 errors)
32479.43 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109313 insns/op,  103242 cycles/op,        0 errors)
32418.48 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109201 insns/op,  103301 cycles/op,        0 errors)
31394.14 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109267 insns/op,  103301 cycles/op,        0 errors)
32298.55 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109323 insns/op,  103551 cycles/op,        0 errors)
```

When the workload is miss-only, with both row cache and index cache disabled (no cache maintenance cost):

  perf-simple-query -c1 -m200M --duration 6000 --partitions=100000 --enable-index-cache=0 --enable-cache=0

Before:

```
9124.57 tps (146.2 allocs/op, 789.0 logallocs/op,  45.3 tasks/op,  889320 insns/op,  357937 cycles/op,        0 errors)
9437.23 tps (146.1 allocs/op, 789.3 logallocs/op,  45.3 tasks/op,  889613 insns/op,  357782 cycles/op,        0 errors)
9455.65 tps (146.0 allocs/op, 787.4 logallocs/op,  45.2 tasks/op,  887606 insns/op,  357167 cycles/op,        0 errors)
9451.22 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887627 insns/op,  357357 cycles/op,        0 errors)
9429.50 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887761 insns/op,  358148 cycles/op,        0 errors)
9430.29 tps (146.1 allocs/op, 788.2 logallocs/op,  45.3 tasks/op,  888501 insns/op,  357679 cycles/op,        0 errors)
9454.08 tps (146.0 allocs/op, 787.3 logallocs/op,  45.3 tasks/op,  887545 insns/op,  357132 cycles/op,        0 errors)
```

After (+55%):

```
14484.84 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396164 insns/op,  229490 cycles/op,        0 errors)
14526.21 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396401 insns/op,  228824 cycles/op,        0 errors)
14567.53 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396319 insns/op,  228701 cycles/op,        0 errors)
14545.63 tps (150.6 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395889 insns/op,  228493 cycles/op,        0 errors)
14626.06 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395254 insns/op,  227891 cycles/op,        0 errors)
14593.74 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395480 insns/op,  227993 cycles/op,        0 errors)
14538.10 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  397035 insns/op,  228831 cycles/op,        0 errors)
14527.18 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396992 insns/op,  228839 cycles/op,        0 errors)
```

Same as above, but with summary ratio increased from 0.0005 to 0.005 (smaller pages):

Before:

```
33906.70 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170553 insns/op,   98104 cycles/op,        0 errors)
32696.16 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170369 insns/op,   98405 cycles/op,        0 errors)
33889.05 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170551 insns/op,   98135 cycles/op,        0 errors)
33893.24 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170488 insns/op,   98168 cycles/op,        0 errors)
33836.73 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170528 insns/op,   98226 cycles/op,        0 errors)
33897.61 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170428 insns/op,   98081 cycles/op,        0 errors)
33834.73 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170438 insns/op,   98178 cycles/op,        0 errors)
33776.31 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170958 insns/op,   98418 cycles/op,        0 errors)
33808.08 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170940 insns/op,   98388 cycles/op,        0 errors)
```

After (+18%):

```
40081.51 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121047 insns/op,   82231 cycles/op,        0 errors)
40005.85 tps (148.6 allocs/op,   4.4 logallocs/op,  45.2 tasks/op,  121327 insns/op,   82545 cycles/op,        0 errors)
39816.75 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121067 insns/op,   82419 cycles/op,        0 errors)
39953.11 tps (148.1 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82258 cycles/op,        0 errors)
40073.96 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121006 insns/op,   82313 cycles/op,        0 errors)
39882.25 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  120925 insns/op,   82320 cycles/op,        0 errors)
39916.08 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121054 insns/op,   82393 cycles/op,        0 errors)
39786.30 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82465 cycles/op,        0 errors)
38662.45 tps (148.3 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121108 insns/op,   82312 cycles/op,        0 errors)
39849.42 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121098 insns/op,   82447 cycles/op,        0 errors)
```

Closes scylladb/scylladb#28603

* github.com:scylladb/scylladb:
  sstables: mx: index_reader: Optimize parsing for no promoted index case
  vint: Use std::countl_zero()
  test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement
  sstables: mx: index_reader: Amortize partition key storage
  managed_bytes: Hoist write_fragmented() to common header
  utils: managed_vector: Use std::uninitialized_move() to move objects
  sstables: mx: index_reader: Keep promoted_index info next to index_entry
  sstables: mx: index_reader: Extract partition_index_page::clear_gently()
  sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
  sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
  sstables: mx: index_reader: Keep index_entry directly in the vector
  dht: Introduce raw_token
  test: perf_simple_query: Add 'sstable-format' command-line option
  test: perf_simple_query: Add 'sstable-summary-ratio' command-line option
  test: perf-simple-query: Add option to disable index cache
  test: cql_test_env: Respect enable-index-cache config
2026-03-19 14:42:50 +02:00
Botond Dénes
4981e72607 Merge 'replica: avoid unnecessary computation on token lookup hot path' from Łukasz Paszkowski
`storage_group_of()` sits on the replica-side token lookup hot path, yet it called `tablet_map::get_tablet_id_and_range_side()`, which always computes both the tablet id and the post-split range side — even though most callers only need the storage group id.

The range-side computation is only relevant when a storage group is in tablet splitting mode, but we were paying for it unconditionally on every lookup.

This series fixes that by:

1. Adding `tablet_map::get_tablet_range_side()` so the range side can be computed independently when needed.
2. Adding lazy `select_compaction_group()` overloads that defer the range-side computation until splitting mode is actually active.
3. Switching `storage_group_of()` to use the cheaper `get_tablet_id()` path, only computing the range side on demand.

Improvements. No backport is required.

Closes scylladb/scylladb#28963

* github.com:scylladb/scylladb:
  replica/table: avoid computing token range side in storage_group_of() on hot path
  replica/compaction_group: add lazy select_compaction_group() overloads
  locator/tablets: add tablet_map::get_tablet_range_side()
2026-03-19 14:27:12 +02:00
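The eager-id / lazy-range-side split can be illustrated as follows; the token arithmetic is made up for the example, and only the control flow mirrors the patch:

```python
def storage_group_of(token, tablet_count, splitting):
    """Sketch of the optimization: always compute the cheap tablet id,
    and only compute the post-split range side when splitting mode is
    actually active (the math here is illustrative, not ScyllaDB's
    token layout)."""
    tablet_id = token % tablet_count  # cheap path, needed by all callers
    if not splitting:
        return tablet_id, None
    # Deferred branch: the more expensive range-side computation runs
    # only on demand, matching the lazy select_compaction_group()
    # overloads described above.
    range_side = (token // tablet_count) % 2
    return tablet_id, range_side
```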
Ernest Zaslavsky
aa9da87e97 encryption: fix deadlock in encrypted_data_source::get()
When encrypted_data_source::get() caches a trailing block in
_next, the next call takes it directly — bypassing
input_stream::read(), which checks _eof. It then calls
input_stream::read_exactly() on the already-drained stream.
Unlike read(), read_up_to(), and consume(), read_exactly()
does not check _eof when the buffer is empty, so it calls
_fd.get() on a source that already returned EOS.

In production this manifested as stuck encrypted SSTable
component downloads during tablet restore: the underlying
chunked_download_source hung forever on the post-EOS get(),
causing 4 tablets to never complete. The stuck files were
always block-aligned sizes (8k, 12k) where _next gets
populated and the source is fully consumed in the same call.

Fix by checking _input.eof() before calling read_exactly().
When the stream already reached EOF, buf2 is known to be
empty, so the call is skipped entirely.

A comprehensive test is added that uses a strict_memory_source
which fails on post-EOS get(), reproducing the exact code
path that caused the production deadlock.
2026-03-19 13:54:54 +02:00
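The post-EOS hazard and the eof-check fix can be illustrated with a Python analogue. The classes below are illustrative stand-ins, not the C++ encrypted_data_source or strict_memory_source:

```python
class DrainedSourceError(Exception):
    pass

class StrictSource:
    """Stand-in for a strict source that fails on post-EOS get(),
    mirroring the strict_memory_source idea from the commit."""
    def __init__(self, chunks):
        self._chunks = list(chunks)
        self._eof = False

    def eof(self):
        return self._eof

    def get(self):
        if self._eof:
            raise DrainedSourceError("get() called after EOS")
        if not self._chunks:
            self._eof = True
            return b""
        return self._chunks.pop(0)

def read_trailing_block(stream, n):
    """The fix in miniature: skip the read entirely when the stream
    already hit EOF, since the buffer is then known to be empty."""
    if stream.eof():
        return b""
    return stream.get()[:n]
```

Without the eof() guard, the trailing-block path would call get() on an already-drained source, which is the shape of the hang seen on block-aligned files.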
Ernest Zaslavsky
f74a54f005 test_lib: mark limiting_data_source_impl as not final 2026-03-19 13:54:54 +02:00
Ernest Zaslavsky
151e945d9f Fix formatting after previous patch 2026-03-19 13:54:44 +02:00
Andrzej Jackowski
517bb8655d test: extract ks_opts helper in test_guardrail_replication_strategy
Factor out ks_opts() to build keyspace options with tablets handling
and use it across all existing replication strategy guardrail tests.
No behavioral changes.

This facilitates further modification of the tests later in this
patch series.

Refs: SCYLLADB-257
2026-03-19 12:49:41 +01:00
Andrzej Jackowski
9b24d9ee7d docs: document CQL guardrails
Add docs/cql/guardrails.rst covering replication factor, replication
strategy, write consistency level, and compact storage guardrails.

Fixes: SCYLLADB-257
2026-03-19 12:49:41 +01:00
Ernest Zaslavsky
537747cf5d Fix indentation after previous patch 2026-03-19 13:48:53 +02:00
Ernest Zaslavsky
2535164542 test_lib: make limiting_data_source_impl available to tests
Relocate the `limiting_data_source_impl` declaration to the header file
so that test code can access it directly.
2026-03-19 13:48:53 +02:00
Botond Dénes
86d7c82993 test/cluster/test_repair.py: use tablets in test_repair_timestamp_difference
After repair, the test does a major to compact all sstables into a
single one, so the results can be simply checked by a select from
mutation_fragments() query. Sometimes off-strategy happens parallel to
this major, so after the major there are still 2 sstables, resulting in
the test failing when checking that the query returns just a single row.
To fix, just use tablets for the test table, tablets don't use
off-strategy anymore.

Fixes: SCYLLADB-940

Closes scylladb/scylladb#29071
2026-03-19 12:42:18 +03:00
Michael Litvak
399260a6c0 test: mv: fix flaky wait for commitlog sync
Previously the test test_interrupt_view_build_shard_registration stopped
the node ungracefully and used commitlog periodic mode to persist the
view build progress in an unreliable way.

It can happen that due to timing issues, the view build progress is not
persisted, or some of it is persisted in a different ordering than
expected.

To make the test more reliable we change it to stop the node gracefully,
so the commitlog is persisted in a graceful and consistent way, without
using the periodic mode delay. We need to also change the injection for
the shutdown to not get stuck.

Fixes SCYLLADB-1005

Closes scylladb/scylladb#29008
2026-03-19 10:41:21 +01:00
Pavel Emelyanov
f27dc12b7c Merge 'Fix directory lister leak in table::get_snapshot_details: ' from Benny Halevy
As reported in SCYLLADB-1013, the directory lister must be closed also when an exception is thrown.

For example, see backtrace below:
```
seastar::on_internal_error(seastar::logger&, std::basic_string_view<char, std::char_traits<char>>) at ./build/release/seastar/./seastar/src/core/on_internal_error.cc:57
directory_lister::~directory_lister() at ./utils/lister.cc:77
replica::table::get_snapshot_details(std::filesystem::__cxx11::path, std::filesystem::__cxx11::path) (.resume) at ./replica/table.cc:4081
std::__n4861::coroutine_handle<seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type>::resume() const at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/coroutine:247
 (inlined by) seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type::run_and_dispose() at ././seastar/include/seastar/core/coroutine.hh:129
seastar::reactor::task_queue::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:2695
 (inlined by) seastar::reactor::task_queue_group::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3201
seastar::reactor::task_queue_group::run_some_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3185
 (inlined by) seastar::reactor::do_run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3353
seastar::reactor::run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3245
seastar::app_template::run_deprecated(int, char**, std::function<void ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:266
seastar::app_template::run(int, char**, std::function<seastar::future<int> ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:160
scylla_main(int, char**) at ./main.cc:756
```

Fixes: [SCYLLADB-1013](https://scylladb.atlassian.net/browse/SCYLLADB-1013)

* Requires backport to 2026.1 since the leak exists since 004c08f525

[SCYLLADB-1013]: https://scylladb.atlassian.net/browse/SCYLLADB-1013?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29084

* github.com:scylladb/scylladb:
  test/boost/database_test: add test_snapshot_ctl_details_exception_handling
  table: get_snapshot_details: fix indentation inside try block
  table: per-snapshot get_snapshot_details: fix typo in comment
  table: per-snapshot get_snapshot_details: always close lister using try/catch
  table: get_snapshot_details: always close lister using deferred_close
2026-03-19 12:40:23 +03:00
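The always-close-even-on-exception idea translates to Python roughly as follows; this is illustrative, since the real fix uses deferred_close and try/catch in C++:

```python
import contextlib
import os

def list_snapshot_files(path):
    """Analogue of closing the directory lister on every exit path:
    contextlib.closing guarantees the scandir handle is released even
    if iteration raises partway through."""
    it = os.scandir(path)
    with contextlib.closing(it):  # closed on success *and* on exception
        return sorted(entry.name for entry in it)
```

The bug class is the same in both languages: a resource acquired before a loop leaks when an exception escapes the loop, unless closing is tied to scope exit rather than to the happy path.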
Raphael S. Carvalho
3143134968 test: avoid split/major compaction deadlock in tablet split test
Run keyspace compaction asynchronously in
`test_tombstone_gc_correctness_during_tablet_split` and only await it
after `split_sstable_rewrite` is disabled.

The problem is that `keyspace_compaction()` starts with a flush, and that
flush can take around five seconds. During that window the split
compaction is stopped before major compaction is retried. The stop aborts
the in-flight major compaction attempt, then the split proceeds far enough
to enter the `split_sstable_rewrite` injection point.

At that point the test used to wait synchronously for major compaction to
finish, but major compaction cannot finish yet: when it retries, it needs
the same semaphore that is still effectively tied up behind the blocked
split rewrite. So the test waits for major compaction, while the split
waits for the injection to be released, and the code that would release
that injection never runs.

Starting major compaction as a task breaks that cycle. The test can first
disable `split_sstable_rewrite`, let the split get out of the way, and
only then wait for major compaction to complete.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-827.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#29066
2026-03-19 11:12:21 +02:00
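The reordered flow above - start major compaction as a background task, release the injection, then await - can be sketched with asyncio. The manager API here is a hypothetical stand-in for the test harness:

```python
import asyncio

async def run_split_test(manager):
    """Sketch of the deadlock-free ordering: kick off the major
    compaction as a task instead of awaiting it while the split is
    still parked on the injection point."""
    compaction = asyncio.create_task(manager.keyspace_compaction())
    # Unblock the split first, so the semaphore it holds is released...
    await manager.disable_injection("split_sstable_rewrite")
    # ...and only then wait for major compaction to complete.
    await compaction
```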
Botond Dénes
2e47fd9f56 Merge 'tasks: do not fail the wait request if rpc fails' from Aleksandra Martyniuk
During decommission, we first mark a topology request as done, then shut
down a node and in the following steps we remove node from the topology.
Thus,  finished request does not imply that a node is removed from
the topology.

Due to that, in node_ops_virtual_task::wait, while gathering children
from the whole cluster, we may hit the connection exception - because
a node is still in topology, even though it is down.

Modify the get_children method to ignore the exception and warn
about the failure instead.

Keep token_metadata_ptr in get_children to prevent topology from changing.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-867

Needs backports to all versions

Closes scylladb/scylladb#29035

* github.com:scylladb/scylladb:
  tasks: fix indentation
  tasks: do not fail the wait request if rpc fails
  tasks: pass token_metadata_ptr to task_manager::virtual_task::impl::get_children
2026-03-19 10:03:18 +02:00
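The warn-and-skip behavior can be sketched as follows; the function names are illustrative, not the actual task-manager API:

```python
import logging

def gather_children(nodes, fetch):
    """Collect children from every node, but log and skip nodes whose
    RPC fails (e.g. a decommissioned node still listed in the topology)
    instead of failing the whole wait request."""
    children = []
    for node in nodes:
        try:
            children.extend(fetch(node))
        except ConnectionError as e:
            logging.warning("failed to get children from %s: %s", node, e)
    return children
```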
Piotr Smaron
a2ad57062f docs/cql: clarify WHERE clause boolean limitations
Document that `SELECT ... WHERE` clause currently accepts only conjunctions
of relations joined by `AND` (`OR` is not supported), and that
parentheses cannot be used to group boolean subexpressions.
Add an unsupported query example and point readers to equivalent `IN`
rewrites when applicable.
This problem has been raised by one of our users in
https://forum.scylladb.com/t/error-parsing-query-or-unsupported-statement/5299,
and while one could infer answer to user's question by looking at the
syntax of the `SELECT ... WHERE`, it's not immediately obvious to
non-advanced users, so clarifying these concepts is justified.

Fixes: SCYLLADB-1116

Closes scylladb/scylladb#29100
2026-03-19 09:47:22 +02:00
Michael Litvak
31d339e54a logstor: trigger separator flush for buffers that hold old segments
A compaction group has a separator buffer that holds the mixed segments
alive until the separator buffer is flushed. A mixed segment can be
freed only after all separator buffers that hold writes from the segment
are flushed.

Typically a separator buffer is flushed when it becomes full. However,
it's possible, for example, that one compaction group fills more slowly
than others and holds many segments.

To fix this we trigger a separator flush periodically for separator
buffers that hold old segments. We track the active segment sequence
number and for each separator buffer the oldest sequence number it
holds.
2026-03-18 19:24:28 +01:00
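The age check - comparing the active segment sequence number against the oldest sequence number each separator buffer holds - can be sketched as follows; the names and threshold are illustrative:

```python
def buffers_to_flush(active_seq, buffers, max_age):
    """Given the active segment sequence number and, per separator
    buffer, the oldest sequence number it holds (None when empty),
    return the buffers that should be flushed early so old mixed
    segments can be freed."""
    return [name for name, oldest_seq in buffers.items()
            if oldest_seq is not None and active_seq - oldest_seq >= max_age]
```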
Michael Litvak
ad87eda835 docs/dev: add logstor documentation 2026-03-18 19:24:28 +01:00
Michael Litvak
a0da07e5b7 logstor: recover segments into compaction groups
Fix the logstor recovery to work with compaction groups. When recovering
a segment, find its token range and add it to the appropriate compaction
group. If it doesn't fit in a single compaction group, write each
record to its compaction group's separator buffer.
2026-03-18 19:24:28 +01:00
Michael Litvak
24379acc76 logstor: range read
extend the logstor mutation reader to support range reads
2026-03-18 19:24:28 +01:00
Michael Litvak
a9d0211a64 logstor: change index to btree by token per table
Change the primary index to be a btree that is ordered by token,
similarly to a memtable, and create an index per table instead of a
single global index.
2026-03-18 19:24:28 +01:00
Michael Litvak
e7c3942d43 logstor: move segments to replica::compaction_group
Add a segment_set member to replica::compaction_group that manages the
logstor segments that belong to the compaction group, similarly to how
it manages sstables. Add also a separator buffer in each compaction
group.

When writing a mutation to a compaction group, the mutation is written
to the active segment and to the separator buffer of the compaction
group, and when the separator buffer is flushed the segment is added to
the compaction_group's segment set.
2026-03-18 19:24:28 +01:00
Michael Litvak
d69f7eb0ee db: update dirty mem limits dynamically
when logstor is enabled, update the db dirty memory limits dynamically.

previously the threshold was set to 0.5 of the available memory, so 0.5
goes to memtables and 0.5 to others (cache).

when logstor is enabled, we calculate the available memory excluding
logstor, and divide it evenly between memtables and cache.
2026-03-18 19:24:27 +01:00
Michael Litvak
65cd0b5639 logstor: track memory usage
add logstor::get_memory_usage() that returns an estimate of the memory
usage by logstor. add tracking to how many unique keys are held in the
index.
2026-03-18 19:24:27 +01:00
Michael Litvak
b7bdb1010a logstor: logstor stats api
add api to get logstor statistics about segments for a table
2026-03-18 19:24:27 +01:00
Michael Litvak
8bd3bd7e2a logstor: compaction buffer pool
pre-allocate write buffers for compaction
2026-03-18 19:24:27 +01:00
Michael Litvak
caf5aa47c2 logstor: separator: flush buffer when full
flush separator buffers when they become full and are switched, instead
of aggregating all the buffers and flushing them only when the separator
is switched.
2026-03-18 19:24:27 +01:00
Michael Litvak
6ddb7a4d13 logstor: hold segment until index updates
add a write gate to write_buffer. when writing a record to the write
buffer, the gate is held and passed back to the caller, and the caller
holds the gate until the write operation is complete, including
follow-up operations such as updating the index after the write.

in particular, when writing a mutation in logstor::write, the write
buffer is held open until the write is completed and updated in the
index.

when writing the write buffer to the active segment, we write the buffer
and then wait for the write buffer gate to close, i.e. we wait for all
index updates to complete before proceeding. the segment is held open
until all the write operations and index updates are complete.

this property is useful for correctness: when a segment is closed we
know that all the writes to it are updated in the index. this is needed
in compaction for example, where we take closed segments and check
which records in them are alive by looking them up in the index. if the
index is not updated yet then it will be wrong.
2026-03-18 19:24:27 +01:00
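The hold-until-index-updated pattern can be illustrated with a minimal counting gate in Python - a sketch in the spirit of seastar::gate, not the actual implementation:

```python
class WriteGate:
    """Minimal counting gate: a write holds the gate until its
    follow-up index update completes; the segment is only safe to
    close once the gate has drained (pending == 0)."""
    def __init__(self):
        self.pending = 0

    def enter(self):
        self.pending += 1

    def leave(self):
        self.pending -= 1

    def drained(self):
        return self.pending == 0

def write_record(gate, index, key, location):
    """Hold the gate across both the write and its index update, so a
    closed segment is guaranteed to be fully reflected in the index."""
    gate.enter()
    try:
        index[key] = location  # follow-up index update under the gate
    finally:
        gate.leave()
```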
Michael Litvak
bd66edee5c logstor: truncate table
implement freeing all segments of a table for table truncate.

first do a barrier to flush all active and mixed segments and put all the
table's data in compaction groups, then stop compaction for the table,
then free the table's segments and remove the live entries from the
index.
2026-03-18 19:24:27 +01:00
Michael Litvak
489efca47c logstor: enable/disable compaction per table
add functions to enable or disable compaction for a specific compaction
group or for all compaction groups of a table.
2026-03-18 19:24:27 +01:00
Michael Litvak
21db4f3ed8 logstor: separator buffer pool
pre-allocate write buffers for the separator
2026-03-18 19:24:27 +01:00
Michael Litvak
37c485e3d1 test: logstor: add separator and compaction tests 2026-03-18 19:24:27 +01:00
Michael Litvak
31aefdc07d logstor: segment and separator barrier
add barrier operation that forces switch of the active segment and
separator, and waits for all existing segments to close and all
separators to flush.
2026-03-18 19:24:27 +01:00
Michael Litvak
1231fafb46 logstor: separator debt controller
add tracking of the total separator debt - writes that were written to a
separator and are waiting to be flushed - and add flow control to keep
the debt under control by delaying normal writes.
2026-03-18 19:24:27 +01:00
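A debt-based back-pressure curve of the kind described can be sketched like this; the thresholds and linear shape are assumptions, not logstor's actual policy:

```python
def write_delay_us(debt_bytes, soft_limit, hard_limit, max_delay_us=1000):
    """No delay below the soft limit, then a delay growing linearly up
    to max_delay_us at the hard limit, throttling normal writes until
    separator flushes pay the debt down."""
    if debt_bytes <= soft_limit:
        return 0
    frac = min(1.0, (debt_bytes - soft_limit) / (hard_limit - soft_limit))
    return int(frac * max_delay_us)
```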
Michael Litvak
17cb173e18 logstor: compaction controller
adjust compaction shares by the compaction overhead: how many segments
compaction writes to generate a single free segment for new writes.
2026-03-18 19:24:27 +01:00
Michael Litvak
1da1bb9d99 logstor: recovery: recover mixed segments using separator
on recovery we may find mixed segments. recover them by adding them to a
separator, reading all their records and writing them to the separator,
and flush the separator.
2026-03-18 19:24:27 +01:00
Michael Litvak
b78cc787a6 logstor: wait for pending reads in compaction
we free a segment from compaction after updating all live records in the
segment to point to new locations in the index. we need to ensure there
are no running operations that use the old locations before we free the
segment.
2026-03-18 19:24:27 +01:00
Michael Litvak
600ec82bec logstor: separator
initial implementation of the separator. it replaces "mixed" segments -
segments that have records from different groups - with segments by group.

every write is written to the active segment and to a buffer in the
active separator. the active separator has in-memory buffers by group.
at some threshold number of segments we switch the active segment and
separator atomically, and start flushing the separator.

the separator is flushed by writing the buffers into new non-mixed
segments, adding them to a compaction group, and freeing the mixed
segments.
2026-03-18 19:24:27 +01:00
Michael Litvak
009fc3757a logstor: compaction groups
divide the segments in the compaction manager into compaction groups.
compaction will compact only segments from a single compaction group at
a time.
2026-03-18 19:24:27 +01:00
Michael Litvak
b3293f8579 logstor: cache files for read
keep the files of all segments open for read to improve read performance.
2026-03-18 19:24:26 +01:00
Michael Litvak
5a16980845 logstor: recovery: initial
initial and basic recovery implementation.
* find all files, read their segments and populate the index with the
  newest record for each key.
* find which segments are used and build the usage histogram
2026-03-18 19:24:26 +01:00
Michael Litvak
bc9fc96579 logstor: add segment generation
add segment generation number that is incremented when the segment is
reused, and it's written to every buffer that is written to the segment.
this is useful for recovery.
2026-03-18 19:24:26 +01:00
Michael Litvak
719f7cca57 logstor: reserve segments for compaction
reserve segments for compaction so it always has enough segments to run
and doesn't get stuck.

do the compaction writes into full new segments instead of the active
segment.
2026-03-18 19:24:26 +01:00
Michael Litvak
521fca5c92 logstor: index: buckets
divide the primary index to buckets, each bucket containing a btree. the
bucket is determined by using bits from the key hash.
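
the bucket selection can be sketched like this (hypothetical names, not
the actual index code; assumes bucket_bits is between 1 and 63):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical sketch: derive the bucket from the top bits of the key
// hash, spreading keys evenly across 2^bucket_bits btrees.
inline std::size_t bucket_of(uint64_t key_hash, unsigned bucket_bits) {
    // bucket_bits must be in [1, 63]: shifting by 64 is undefined.
    return static_cast<std::size_t>(key_hash >> (64 - bucket_bits));
}
```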
2026-03-18 19:24:26 +01:00
Michael Litvak
99c3b1998a logstor: add buffer header
add a buffer header in each write buffer we write that contains some
information that can be useful for recovery and reading.
2026-03-18 19:24:26 +01:00
Michael Litvak
ddd72a16b0 logstor: add group_id
add group_id value to each log record that is passed with the mutation
when writing it.

the group_id will be used to group log records in segments, such that a
segment will contain records only from a single group.

this will be useful for tablet migration. we want each tablet to have
its own segments with all its records, so we can migrate them
efficiently by copying these segments.

the group_id value is set to a value equivalent to the tablet id.
2026-03-18 19:24:26 +01:00
Michael Litvak
08bea860ef logstor: record generation
add a record generation number for each record so we can compare
records and find which one is newer.
2026-03-18 19:24:26 +01:00
Michael Litvak
28f820eb1c logstor: generation utility
basic utility for generation numbers that will be useful next. a
generation number is an unsigned integer that can be incremented and
compared even if it wraps around, assuming the values we compare were
written around the same time.
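
the comparison trick can be sketched as follows (hypothetical name, not
the actual utility): interpret the unsigned difference as signed, which
is valid as long as the two values were produced within half the counter
range of each other - matching the "written around the same time"
assumption.

```cpp
#include <cstdint>

// Hypothetical sketch of wraparound-safe generation comparison:
// the unsigned subtraction wraps modulo 2^32, and casting the result
// to signed tells us which value is newer.
inline bool generation_newer(uint32_t a, uint32_t b) {
    return static_cast<int32_t>(a - b) > 0;
}
```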
2026-03-18 19:24:26 +01:00
Michael Litvak
5f649dd39f logstor: use RIPEMD-160 for index key
use a 20-byte hash function for the index key to make hash collisions
very unlikely. we assume there are no hash collisions.
2026-03-18 19:24:26 +01:00
Michael Litvak
a521bcbcee test: add test_logstor.py
add basic tests for key-value tables with logstor storage
2026-03-18 19:24:26 +01:00
Michael Litvak
1ae1f37ec1 api: add logstor compaction trigger endpoint
add a new api endpoint that triggers logstor compaction.
2026-03-18 19:24:26 +01:00
Michael Litvak
2128b1b15c replica: add logstor to db
Add a single logstor instance in the database that is used for writing
and reading to tables with kv storage
2026-03-18 19:24:26 +01:00
Michael Litvak
9172cc172e schema: add logstor cf property
add a schema property for tables with logstor storage
2026-03-18 19:24:26 +01:00
Michael Litvak
0b1343747f logstor: initial commit
initial implementation of the logstor storage engine for key-value
tables that supports writes, reads and basic compaction.

main components:
* logstor: this is the main interface to users that supports writing and
  reading back mutations, and manages the internal components.
* index: the primary index in-memory that maps a key to a location on
  disk.
* write buffer: writes go initially to a write buffer. it accumulates
  multiple records in a buffer and writes them to the segment manager in
  4k sized blocks.
* segment manager: manages the storage - files, segments, compaction. it
  manages file and segment allocation, and writes 4k aligned buffers to
  the active segment sequentially. it tracks the used space in each
  segment. the compaction finds segments with low space usage and writes
  them to new segments, and frees the old segments.
2026-03-18 19:24:26 +01:00
Michael Litvak
27fd0c119f db: disable tablet balancing with logstor
initially logstor tables will not support tablet migrations, so
disable tablet balancing if the experimental feature flag is set.
2026-03-18 19:24:26 +01:00
Michael Litvak
ed852a2af2 db: add logstor experimental feature flag
add a new experimental feature flag for key-value tables with the new
logstor storage engine.
2026-03-18 19:24:26 +01:00
Anna Stuchlik
88b98fac3a doc: update the warning about shared dictionary training
This commit updates the inadequate warning on the Advanced Internode (RPC) Compression page.

The warning is replaced with a note about how training data is encrypted.

Fixes https://github.com/scylladb/scylladb/issues/29109

Closes scylladb/scylladb#29111
2026-03-18 19:35:18 +02:00
Avi Kivity
46a6f8e1d3 Merge 'auth: add maintenance_socket_authorizer' from Dario Mirovic
GRANT/REVOKE fails on the maintenance socket connections, because maintenance_auth_service uses allow_all_authorizer. allow_all_authorizer allows all operations, but not GRANT/REVOKE, because they make no sense in its context.

This has been observed during PGO run failure in operations from ./pgo/conf/auth.cql file.

This patch introduces maintenance_socket_authorizer that supports the capabilities of default_authorizer ('CassandraAuthorizer') without needing authorization.

Refs SCYLLADB-1070

This is an improvement, no need for backport.

Closes scylladb/scylladb#29080

* github.com:scylladb/scylladb:
  test: use NetworkTopologyStrategy in maintenance socket tests
  test: use cleanup fixture in maintenance socket auth tests
  auth: add maintenance_socket_authorizer
2026-03-18 19:29:57 +02:00
Pavel Emelyanov
d6c01be09b s3/client: Don't reconstruct regex on every parse_content_range call
Make the pattern static const so it is compiled once at first call rather
than on every Content-Range header parse.
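
The change can be sketched roughly like this (hypothetical function and
pattern, not the ScyllaDB source): a function-local `static const`
regex is compiled once on first call instead of on every parse.

```cpp
#include <regex>
#include <string>

// Hypothetical sketch: parse the total size out of a Content-Range
// header. The static const pattern is constructed exactly once.
inline long content_range_total(const std::string& header) {
    static const std::regex pattern(R"(bytes (\d+)-(\d+)/(\d+))");
    std::smatch m;
    return std::regex_match(header, m, pattern) ? std::stol(m[3].str()) : -1;
}
```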

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29054
2026-03-18 17:56:33 +02:00
Tomasz Grabiec
4410e9c61a sstables: mx: index_reader: Optimize parsing for no promoted index case
It's a common case with small partition workloads.
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
32f8609b89 vint: Use std::countl_zero()
It handles 0, and could generate better code for that. On Broadwell
architecture, it translates to a single instruction (LZCNT). We're
still on Westmere, so it translates to BSR with a conditional move.

Also, drop unnecessary casts and bit arithmetic, which saves a few
instructions.

Move to header so that it's inlined in parsers.
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
6017688445 test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement 2026-03-18 16:25:21 +01:00
Tomasz Grabiec
f55bb154ec sstables: mx: index_reader: Amortize partition key storage
This change reduces the cost of partition index page construction and
LSA migration. This is achieved by several things working together:

 - index entries don't store keys as separate small objects (managed_bytes)
   They are written into one managed_bytes fragmented storage, entries
   hold offset into it.

   Before, we paid 16 bytes for managed_bytes plus LSA descriptor for
   the storage (1 byte) plus back-reference in the storage (8 bytes),
   so 25 bytes. Now we only pay 4 bytes for the size offset. If keys are 16
   bytes, that's a reduction from 31 bytes to 20 bytes per key.

 - index entries and key storage are now trivially moveable, so LSA
   migration can use memcpy(), which amortizes the cost per key.

   LSA eviction is now trivial and constant time for the whole page
   regardless of the number of entries. Page eviction dropped from
   14 us to 1 us.

This improves throughput in a CPU-bound miss-heavy read workload where
the partition index doesn't fit in memory.

  scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

    15328.25 tps (150.0 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  286769 insns/op,  218134 cycles/op,        0 errors)
    15279.01 tps (149.9 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  287696 insns/op,  218637 cycles/op,        0 errors)
    15347.78 tps (149.7 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  285851 insns/op,  217795 cycles/op,        0 errors)
    15403.68 tps (149.6 allocs/op,  14.1 logallocs/op,  45.2 tasks/op,  285111 insns/op,  216984 cycles/op,        0 errors)
    15189.47 tps (150.0 allocs/op,  14.1 logallocs/op,  45.5 tasks/op,  289509 insns/op,  219602 cycles/op,        0 errors)
    15295.04 tps (149.8 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  288021 insns/op,  218545 cycles/op,        0 errors)
    15162.01 tps (149.8 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  291265 insns/op,  220451 cycles/op,        0 errors)

After:

    21620.18 tps (148.4 allocs/op,  13.4 logallocs/op,  43.7 tasks/op,  176817 insns/op,  153183 cycles/op,        0 errors)
    20644.03 tps (149.8 allocs/op,  13.5 logallocs/op,  44.3 tasks/op,  187941 insns/op,  160409 cycles/op,        0 errors)
    20588.06 tps (150.1 allocs/op,  13.5 logallocs/op,  44.5 tasks/op,  188090 insns/op,  160818 cycles/op,        0 errors)
    20789.29 tps (149.5 allocs/op,  13.5 logallocs/op,  44.2 tasks/op,  186495 insns/op,  159382 cycles/op,        0 errors)
    20977.89 tps (149.5 allocs/op,  13.4 logallocs/op,  44.2 tasks/op,  183969 insns/op,  158140 cycles/op,        0 errors)
    21125.34 tps (149.1 allocs/op,  13.4 logallocs/op,  44.1 tasks/op,  183204 insns/op,  156925 cycles/op,        0 errors)
    21244.42 tps (148.6 allocs/op,  13.4 logallocs/op,  43.8 tasks/op,  181276 insns/op,  155973 cycles/op,        0 errors)

Mostly because the index now fits in memory.

When it doesn't, the benefits are still visible due to lower LSA overhead.
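
The layout change can be sketched as follows (hypothetical names, not
the actual index_reader code): entries hold a 4-byte offset into one
shared byte buffer instead of owning a separate allocation per key, so
the entry type is trivially copyable and a whole page can be relocated
with a single memcpy().

```cpp
#include <cstdint>
#include <type_traits>
#include <vector>

// Hypothetical sketch: all keys live in one shared storage buffer and
// each entry records where its key starts and how long it is.
struct index_entry {
    uint32_t key_offset;  // start of the key in the shared storage
    uint32_t key_size;
};
static_assert(std::is_trivially_copyable_v<index_entry>);

inline index_entry append_key(std::vector<uint8_t>& storage,
                              const uint8_t* key, uint32_t size) {
    index_entry e{static_cast<uint32_t>(storage.size()), size};
    storage.insert(storage.end(), key, key + size);
    return e;
}
```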
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
1452e92567 managed_bytes: Hoist write_fragmented() to common header 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
75e6412b1c utils: managed_vector: Use std::uninitialized_move() to move objects
It's shorter, and is supposed to be optimized for trivially-moveable
types.

Important for managed_vector<index_entry>, which can have lots of
elements.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
50dc7c6dd8 sstables: mx: index_reader: Keep promoted_index info next to index_entry
Densely populated pages have no promoted index (small partitions), so
we can save space in such workloads by keeping promoted index in a
separate vector.

For workloads which do have a promoted index, pages have only one
partition. There aren't many such pages and they are long-lived, so
the extra allocation of the vector is amortized.

promoted_index class is removed, and replaced with equivalent
parsed_promoted_index_entry for simplicity. Because it's removed,
make_cursor() is moved into the index_reader class.

Reducing the size of index_entry is important for performance if pages
are densely populated. It helps to reduce LSA allocator pressure and
improve compaction/eviction speed.

This change, combined with the earlier change "Shave-off 16 bytes from
index_entry by using raw_token", gives significant improvement in
throughput in perf_simple_query run where the index doesn't fit in
memory:

  scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

9714.78 tps (170.9 allocs/op,  16.9 logallocs/op,  55.3 tasks/op,  494788 insns/op,  343920 cycles/op,        0 errors)
9603.13 tps (171.6 allocs/op,  17.0 logallocs/op,  55.6 tasks/op,  502358 insns/op,  348344 cycles/op,        0 errors)
9621.43 tps (171.9 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  500612 insns/op,  347508 cycles/op,        0 errors)
9597.75 tps (171.6 allocs/op,  17.0 logallocs/op,  55.6 tasks/op,  501428 insns/op,  348604 cycles/op,        0 errors)
9615.54 tps (171.6 allocs/op,  16.9 logallocs/op,  55.6 tasks/op,  501313 insns/op,  347935 cycles/op,        0 errors)
9577.03 tps (171.8 allocs/op,  17.0 logallocs/op,  55.7 tasks/op,  503283 insns/op,  349251 cycles/op,        0 errors)

After:

15328.25 tps (150.0 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  286769 insns/op,  218134 cycles/op,        0 errors)
15279.01 tps (149.9 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  287696 insns/op,  218637 cycles/op,        0 errors)
15347.78 tps (149.7 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  285851 insns/op,  217795 cycles/op,        0 errors)
15403.68 tps (149.6 allocs/op,  14.1 logallocs/op,  45.2 tasks/op,  285111 insns/op,  216984 cycles/op,        0 errors)
15189.47 tps (150.0 allocs/op,  14.1 logallocs/op,  45.5 tasks/op,  289509 insns/op,  219602 cycles/op,        0 errors)
15295.04 tps (149.8 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  288021 insns/op,  218545 cycles/op,        0 errors)
15162.01 tps (149.8 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  291265 insns/op,  220451 cycles/op,        0 errors)
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
5e228a8387 sstables: mx: index_reader: Extract partition_index_page::clear_gently()
There will be more elements to clear. And partition_index_page should
know how to clear itself.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
2d77e4fc28 sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
The std::optional<> adds 8 bytes.

And dht::token adds 8 bytes due to _kind, which in this case is always
kind::key.

The size changed from 56 to 48 bytes.
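
A small illustration of the std::optional<> overhead (exact sizes are
implementation dependent, though typical for mainstream ABIs): the
engaged flag plus padding doubles an 8-byte payload.

```cpp
#include <cstdint>
#include <optional>

// Hypothetical sketch: a bare 8-byte token vs. one wrapped in
// std::optional, whose engaged flag pads the struct to 16 bytes on
// common implementations.
struct raw_token { uint64_t value; };
struct opt_token { std::optional<uint64_t> value; };
```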
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
e9c98274b5 sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
If the page has many entries, we continuously enter and leave the
allocating section for every key. This can be avoided by batching LSA
operations for the whole page, after collecting all the entries.

Later optimizations will also build on this, where we will allocate
fragmented storage for keys in LSA using a single managed_bytes
constructor.

This alone brings only a minor improvement, but it does reduce LSA
allocations, probably due to less frequent memory reclamation:

  scylla perf-simple-query -c1 -m200M --duration 6000 --partitions=1000000

Before:

  9560.42 tps (172.2 allocs/op,  19.6 logallocs/op,  57.7 tasks/op,  567741 insns/op,  345158 cycles/op,        0 errors)
  9445.95 tps (173.1 allocs/op,  19.7 logallocs/op,  58.1 tasks/op,  579075 insns/op,  352173 cycles/op,        0 errors)
  9576.75 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  572004 insns/op,  347373 cycles/op,        0 errors)
  9597.16 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  569615 insns/op,  346618 cycles/op,        0 errors)
  9454.07 tps (173.5 allocs/op,  19.8 logallocs/op,  58.3 tasks/op,  579213 insns/op,  351569 cycles/op,        0 errors)

After:

  9562.21 tps (172.0 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  499225 insns/op,  347832 cycles/op,        0 errors)
  9480.20 tps (172.3 allocs/op,  17.0 logallocs/op,  55.9 tasks/op,  507271 insns/op,  350640 cycles/op,        0 errors)
  9512.42 tps (172.1 allocs/op,  17.0 logallocs/op,  55.9 tasks/op,  504247 insns/op,  350392 cycles/op,        0 errors)
  9498.45 tps (172.4 allocs/op,  17.1 logallocs/op,  55.9 tasks/op,  505765 insns/op,  350320 cycles/op,        0 errors)
  9076.30 tps (173.5 allocs/op,  17.1 logallocs/op,  56.5 tasks/op,  512791 insns/op,  354792 cycles/op,        0 errors)
  9542.62 tps (171.9 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  502532 insns/op,  348922 cycles/op,        0 errors)
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
0e0f9f41b3 sstables: mx: index_reader: Keep index_entry directly in the vector
Partition index entries are relatively small, and if the workload has
small partitions, index pages have a lot of elements. Currently, index
entries are indirected via managed_ref, which causes increased cost of
LSA eviction and compaction. This patch amortizes this cost by storing
them directly in the managed_chunked_vector.

This gives about 23% improvement in throughput in perf-simple-query
for a workload where the index doesn't fit in memory:

  scylla perf-simple-query -c1 -m200M --duration 6000 --partitions=1000000

Before:

  7774.96 tps (166.0 allocs/op, 521.7 logallocs/op,  54.0 tasks/op,  802428 insns/op,  430457 cycles/op,        0 errors)
  7511.08 tps (166.1 allocs/op, 527.2 logallocs/op,  54.0 tasks/op,  804185 insns/op,  430752 cycles/op,        0 errors)
  7740.44 tps (166.3 allocs/op, 526.2 logallocs/op,  54.2 tasks/op,  805347 insns/op,  432117 cycles/op,        0 errors)
  7818.72 tps (165.2 allocs/op, 517.6 logallocs/op,  53.7 tasks/op,  794965 insns/op,  427751 cycles/op,        0 errors)
  7865.49 tps (165.1 allocs/op, 513.3 logallocs/op,  53.6 tasks/op,  788898 insns/op,  425171 cycles/op,        0 errors)

After:

  9560.42 tps (172.2 allocs/op,  19.6 logallocs/op,  57.7 tasks/op,  567741 insns/op,  345158 cycles/op,        0 errors)
  9445.95 tps (173.1 allocs/op,  19.7 logallocs/op,  58.1 tasks/op,  579075 insns/op,  352173 cycles/op,        0 errors)
  9576.75 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  572004 insns/op,  347373 cycles/op,        0 errors)
  9597.16 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  569615 insns/op,  346618 cycles/op,        0 errors)
  9454.07 tps (173.5 allocs/op,  19.8 logallocs/op,  58.3 tasks/op,  579213 insns/op,  351569 cycles/op,        0 errors)

Disabling the partition index doesn't improve the throughput beyond
that.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
b6bfdeb111 dht: Introduce raw_token
Most tokens stored in data structures are for key-scoped tokens, and
we don't need to pay for token::kind storage.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
3775593e53 test: perf_simple_query: Add 'sstable-format' command-line option 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
6ee9bc63eb test: perf_simple_query: Add 'sstable-summary-ratio' command-line option 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
38d130d9d0 test: perf-simple-query: Add option to disable index cache 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
5ee61f067d test: cql_test_env: Respect enable-index-cache config
Mirrors the code in main.cc
2026-03-18 16:25:20 +01:00
Aleksandra Martyniuk
2d16083ba6 tasks: fix indentation 2026-03-18 15:37:24 +01:00
Aleksandra Martyniuk
1fbf3a4ba1 tasks: do not fail the wait request if rpc fails
During decommission, we first mark a topology request as done, then shut
down a node, and in the following steps we remove the node from the topology.
Thus, a finished request does not imply that the node has been removed from
the topology.

Due to that, in node_ops_virtual_task::wait, while gathering children
from the whole cluster, we may hit the connection exception - because
a node is still in topology, even though it is down.

Modify the get_children method to ignore the exception and warn
about the failure instead.
2026-03-18 15:37:24 +01:00
Aleksandra Martyniuk
d4fdeb4839 tasks: pass token_metadata_ptr to task_manager::virtual_task::impl::get_children
In get_children we get the vector of alive nodes with get_nodes.
Yet, between this and sending rpc to those nodes there might be
a preemption. Currently, the liveness of a node is checked once
again before the rpcs (only with gossiper not in topology - unlike
get_nodes).

Modify get_children, so that it keeps a token_metadata_ptr,
preventing topology from changing between get_nodes and rpcs.

Remove test_get_children as it checked if the get_children method
won't fail if a node is down after get_nodes - which cannot happen
currently.
2026-03-18 15:37:24 +01:00
Calle Wilund
0013f22374 memtable_test::memtable_flush_period: Change sleep to use injection signal instead
Fixes: SCYLLADB-942

Adds an injection signal _from_ table::seal_active_memtable to allow us to
reliably wait for flushing. And does so.

Closes scylladb/scylladb#29070
2026-03-18 16:23:13 +02:00
Botond Dénes
ae17596c2a Merge 'Demote log level on split failure during shutdown' from Raphael Raph Carvalho
Since commit 509f2af8db, gate_closed_exception can be triggered for ongoing split during shutdown. The commit is correct, but it causes split failure on shutdown to log an error, which causes CI instability. Previously, aborted_exception would be triggered instead which is logged as warning. Let's do the same.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-951.
Fixes https://github.com/scylladb/scylladb/issues/24850.

Only 2026.1 is affected.

Closes scylladb/scylladb#29032

* github.com:scylladb/scylladb:
  replica: Demote log level on split failure during shutdown
  service: Demote log level on split failure during shutdown
2026-03-18 16:21:05 +02:00
Pavel Emelyanov
8b1ca6dcd6 database: Rate limit all tokens from a range
The limiter scans ranges to decide whether or not to rate-limit the
query. However, when considering each range only the front one's token
is accounted for. This looks like a misprint.

The limiter was introduced in cc9a2ad41f

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29050
2026-03-18 13:50:48 +01:00
Pavel Emelyanov
d68c92ec04 test: Replace a bunch of ternary operators with an if-else block
A followup of the merge of two test cases that happened in the previous
patch. Both used `foo = N if domain == bar else M` to evaluate the
parameters for topology. Using if-else block makes it immediately obvious
which topology and scope apply for each domain value without having to
evaluate multiple inline conditionals.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:08:36 +03:00
Pavel Emelyanov
b1d4fc5e6e test: Squash test_restore_primary_replica_same|different_domain tests
The two tests differ only in the way they set up the topology for the
cluster and the post-restore checks against the resulting streams.

The merge happens with the help of a "scope_is_same" boolean parameter
and corresponding updates in the topology setup and post-checks.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:08:36 +03:00
Pavel Emelyanov
21c603a79e test: Use the same regexp in test_restore_primary_replica_different|same_domain-s
The one in "different domain" test is simpler because the test performs
less checks. Next patch will merge both tests and making regexp-s look
identical makes the merge even smother.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:07:09 +03:00
Emil Maskovsky
34f3916e7d .github: update test instructions for unified pytest runner
Update test running instructions to reflect unified pytest-based runner.

The test.py now requires full test paths with file extensions for both
C++ and Python tests.

No backport: The change is only relevant for recent test.py changes in
master.

Closes scylladb/scylladb#29062
2026-03-18 09:28:28 +01:00
Marcin Maliszkiewicz
04bf631d7f auth: switch query_attribute_for_all to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
cf578fd81a auth: switch get_attribute to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
06d16b6ea2 auth: cache: add heterogeneous map lookups
Some callers have only string_view role name,
they shouldn't need to allocate sstring to do the
lookup.
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
7fdb1118f5 auth: switch query_all to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
fca11c5a21 auth: switch query_all_directly_granted to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
6f682f7eb1 auth: cache: add ability to go over all roles
This is needed to implement auth service api where
we list all roles.
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
61952cd985 raft: service: reload auth cache before service levels
Since service levels depend on auth data, and not other
way around, we need to ensure a proper loading order.
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
c4cfb278bc service: raft: move update_service_levels_effective_cache check
The auth::cache::includes_table function also covers role_members and
role_attributes. The existing check was removed because it blocked these
tables from triggering necessary cache updates.

While previously non-critical (due to unused attributes and table coupling),
maintaining a correct cache is essential for upcoming changes.
2026-03-18 09:06:20 +01:00
Benny Halevy
c2a6d1e930 test/boost/database_test: add test_snapshot_ctl_details_exception_handling
Verify that the directory listers opened by get_snapshot_details
are properly closed when handling an (injected) exception.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:37:44 +02:00
Benny Halevy
6dc4ea766b table: get_snapshot_details: fix indentation inside try block
Whitespace-only change: indent the loop body one level inside the
try block added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:28:50 +02:00
Benny Halevy
b09d45b89a table: per-snapshot get_snapshot_details: fix typo in comment
The comment says the snapshot directory may contain a `schema.sql` file,
but the code treats `schema.cql` as the special-case schema file.

Reported-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:27:40 +02:00
Benny Halevy
580cc309d2 table: per-snapshot get_snapshot_details: always close lister using try/catch
Since this is a coroutine, we cannot just use deferred_close,
but rather we need to catch an error, close the lister, and then
return the error, if applicable.

Fixes: SCYLLADB-1013

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:27:23 +02:00
Benny Halevy
78c817f71e table: get_snapshot_details: always close lister using deferred_close
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:26:26 +02:00
Dario Mirovic
71e6918f28 test: use NetworkTopologyStrategy in maintenance socket tests
NetworkTopologyStrategy is the preferred choice. We should not
use SimpleStrategy anymore. This patch changes the topology strategy
for all the maintenance socket tests.

Refs SCYLLADB-1070
2026-03-17 20:20:47 +01:00
Dario Mirovic
278535e4e3 test: use cleanup fixture in maintenance socket auth tests
Add a cql_clusters pytest fixture that tracks CQL driver Cluster
objects and shuts them down automatically after test completion.
This replaces manual shutdown() calls at the end of each test.

Also consolidate shutdown() calls in retry helpers into finally
blocks for consistent cleanup.

Refs SCYLLADB-1070
2026-03-17 20:15:30 +01:00
Dario Mirovic
2e4b72c6b9 auth: add maintenance_socket_authorizer
GRANT/REVOKE fails on the maintenance socket connections,
because maintenance_auth_service uses allow_all_authorizer.
allow_all_authorizer allows all operations, but not GRANT/REVOKE,
because they make no sense in its context.

This has been observed during PGO run failure in operations from
./pgo/conf/auth.cql file.

This patch introduces maintenance_socket_authorizer that supports
the capabilities of default_authorizer ('CassandraAuthorizer')
without needing authorization.

Refs SCYLLADB-1070
2026-03-17 19:19:41 +01:00
Botond Dénes
172c786079 Merge 'perf-alternator: wait for alternator port before running workload' from Marcin Maliszkiewicz
This patch is mostly for the purpose of running pgo CI job.

We may receive a connection error if the asyncio.sleep(5) in
pgo.py is not a sufficient wait.

In pgo.py we do wait for a port, but only the CQL one; in any
case it's better to have a high-level check here than to wait
for the alternator port there.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1071
Backport: 2026.1 - it failed on CI for that build

Closes scylladb/scylladb#29063

* github.com:scylladb/scylladb:
  perf: add abort_source support to wait-for-port loops
  perf-alternator: wait for alternator port before running workload
2026-03-17 18:38:11 +02:00
Botond Dénes
5d868dcc55 Merge 's3_client: fix s3::range max value for object size' from Ernest Zaslavsky
- fix s3::range max value for object size which is 50TiB and not 5.
- refactor constants to make it accessible for all interested parties, also reuse these constants in tests

No need to backport, doubt we will encounter an object larger than 5TiB

Closes scylladb/scylladb#28601

* github.com:scylladb/scylladb:
  s3_client: reorganize tests in part_size_calculation_test
  s3_client: switch using s3 limits constants in tests
  s3_client: fix the s3::range max object size
  s3_client: remove "aws" prefix from object limits constants
  s3_client: make s3 object limits accessible
2026-03-17 16:34:42 +02:00
Anna Stuchlik
f4a6bb1885 doc: remove the Open Source Example from Installation
This commit removes the Open Source example from the Installation section for CentOS.
We updated the example for Ubuntu, but not for CentOS.
We don't want to have any Open Source information in the docs.

Fixes https://github.com/scylladb/scylladb/issues/29087
2026-03-17 14:54:32 +01:00
Anna Stuchlik
95bc8911dd doc: replace http with https in the installation instructions
Fixes https://github.com/scylladb/scylladb/issues/17227
2026-03-17 14:46:16 +01:00
Dawid Mędrek
a8dd13731f Merge 'Improve debuggability of test/cluster/test_data_resurrection_in_memtable.py' from Botond Dénes
This test was observed to fail in CI recently but there is not enough information in the logs to figure out what went wrong. This PR makes a few improvements to make the next investigation easier, should it be needed:
* storage-service: add table name to mutation write failure error messages.
* database: the `database_apply` error injection used to cause trouble, catching writes to bystander tables, making tests flaky. To eliminate this, it gained a filter to apply only to non-system keyspaces. Unfortunately, this still allows it to catch writes to the trace tables. While this should not fail the test, it reduces observability, as some traces disappear. Improve this error injection to only apply to selected table. Also merge it with the `database_apply_wait` error injection, to streamline the code a bit.
* test/test_data_resurrection_in_memtable.py: dump data from the table, before the checks for expected data, so if checks fail, the data in the table is known.

Refs: SCYLLADB-812
Refs: SCYLLADB-870
Fixes: SCYLLADB-1050 (by restricting `database_apply` error injection, so it doesn't affect writes to system traces)

Backport: test related improvement, no backport

Closes scylladb/scylladb#28899

* github.com:scylladb/scylladb:
  test/cluster/test_data_resurrection_in_memtable.py: dump rows before check
  replica/database: consolidate the two database_apply error injections
  service/storage_proxy: add name of table to error message for write errors
2026-03-17 13:35:19 +01:00
Botond Dénes
318aa07158 Merge ' test/alternator: use module-scope fixtures in test_streams.py ' from Nadav Har'El
Previously, all stream-table fixtures in test_streams.py used scope="function",
forcing a fresh table to be created for every test, slowing down the test a bit
(though not much), and discouraging writing small new tests.

This was a workaround for a DynamoDB quirk (that Alternator doesn't have):
LATEST shard iterators have a time slack and may point slightly before the true
stream head, causing leftover events from a previous test to appear in the next
test's reads.

The first two tests in this series fix small problems that turn up once we start
sharing test tables in test_streams.py. The final patch fixes the "LATEST" problem
and enables sharing the test table by using "module" scope fixtures instead of
"function".

After this series, test_streams.py run time went down a bit, from 20.2 seconds to 17.7 seconds.

Closes scylladb/scylladb#28972

* github.com:scylladb/scylladb:
  test/alternator: speed up test_streams.py by using module-scope fixtures
  test/alternator: test_streams.py don't use fixtures in 4 tests
  test/alternator: fix do_test() in test_streams.py
2026-03-17 13:56:16 +02:00
Ernest Zaslavsky
7f597aca67 cmake: fix broken build
Add raft_util.idl.hh to cmake to generate the code properly

Closes scylladb/scylladb#29055
2026-03-17 10:35:34 +01:00
Botond Dénes
dbe70cddca test/boost/querier_cache_test: make test_time_based_cache_eviction less sensitive to timing
This test relies on the cache entry being evicted after 200ms past the
TTL. This may not happen on a busy CI machine. Make the test less
reliant on timing by using eventually_true().
Simplify the test by dropping the second entry; it doesn't add anything
to the test.

Fixes: SCYLLADB-811

Closes scylladb/scylladb#28958
2026-03-17 10:32:23 +01:00
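The eventually_true() approach above replaces a fixed post-TTL sleep with polling. A minimal sketch of the pattern (illustrative Python; the real test is a C++ Boost test using Scylla's own helper, so all names here are assumptions):

```python
import time

def eventually_true(condition, timeout=10.0, interval=0.05):
    """Poll `condition` until it returns True or `timeout` elapses."""
    deadline = time.monotonic() + timeout
    while True:
        if condition():
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(interval)

# Example: wait for a cache entry to be evicted instead of
# asserting immediately after the TTL has passed.
cache = {"entry": object()}
cache.pop("entry", None)          # eviction happens at some point
assert eventually_true(lambda: "entry" not in cache)
```

On a busy CI machine the condition may become true well after the nominal deadline, so polling with a generous timeout is less flaky than a single timed assertion.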
Botond Dénes
0fd51c4adb test/nodetool: rest_api_mock_server: add retry for status code 404
This fixtures starts the mock server and immediately connects to it to
setup the expected requests. The connection attempt might be too early,
so there is a retry loop with a timeout. The loop currently checks for
requests.exception.ConnectionError. We've seen a case where the
connection is successful but the request fails with 404. The mock
started the server but didn't setup the routes yet. Add a retry for http
404 to handle this.

Fixes: SCYLLADB-966

Closes scylladb/scylladb#29003
2026-03-17 10:30:23 +01:00
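The retry loop described above, extended to also retry on HTTP 404, could be sketched like this (illustrative Python; the function names and the NotReady stand-in for requests.exceptions.ConnectionError are assumptions, not the actual fixture code):

```python
import time

class NotReady(Exception):
    """Stands in for requests.exceptions.ConnectionError in this sketch."""

def connect_with_retry(attempt, timeout=5.0, interval=0.05):
    """Call `attempt` until it succeeds; retry on connection errors
    and on HTTP 404 (server is up but routes are not set up yet)."""
    deadline = time.monotonic() + timeout
    while True:
        try:
            status = attempt()
            if status != 404:        # routes are ready
                return status
        except NotReady:
            pass                     # server not listening yet
        if time.monotonic() >= deadline:
            raise TimeoutError("mock server never became ready")
        time.sleep(interval)

# Simulated server: refuses the connection, then 404s, then serves.
responses = iter([NotReady, 404, 200])
def attempt():
    r = next(responses)
    if r is NotReady:
        raise NotReady()
    return r

assert connect_with_retry(attempt) == 200
```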
Pavel Emelyanov
9fe19ec9d9 sstables: Fix object storage lister not resetting position in batch
vector

The lister loop in get() pre-fetches records in batches and keeps them
in an _info vector, iterating over it with the help of the _pos cursor.
When the vector is re-read, the cursor must be reset too.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-17 10:32:42 +03:00
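The bug pattern above is generic: a batched iterator keeps a batch vector and a position cursor, and refilling the batch without resetting the cursor skips (or re-reads past) entries. A minimal sketch of the fixed behavior (illustrative Python mirroring the _info/_pos scheme; not the actual C++ lister):

```python
class BatchedLister:
    def __init__(self, source, batch_size):
        self._source = source      # iterator yielding all records
        self._batch_size = batch_size
        self._info = []            # current batch (like the _info vector)
        self._pos = 0              # cursor into the batch

    def get(self):
        """Return the next record, or None at EOF."""
        if self._pos == len(self._info):
            # Re-read the batch: the cursor MUST be reset too,
            # otherwise entries of the new batch are skipped.
            self._info = [x for _, x in
                          zip(range(self._batch_size), self._source)]
            self._pos = 0
            if not self._info:
                return None        # true EOF
        rec = self._info[self._pos]
        self._pos += 1
        return rec

lister = BatchedLister(iter(range(5)), batch_size=2)
out = []
while (r := lister.get()) is not None:
    out.append(r)
assert out == [0, 1, 2, 3, 4]
```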
Pavel Emelyanov
1a6a7647c6 sstables: Fix object storage lister skipping entries when filter is active
The lister loop in the get() method looks weird. It uses a
do-while(false) loop and calls continue; inside when the filter asks to
skip an entry. In a do-while(false) loop, continue jumps to the false
condition and exits, so skipping aborts the whole listing and reports
EOF, which is not what's supposed to happen.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-17 10:32:40 +03:00
Marcin Maliszkiewicz
9318c80203 perf: add abort_source support to wait-for-port loops
Check abort_source on each retry iteration in
wait_for_alternator and wait_for_cql so the
wait can be interrupted on shutdown.

Didn't use sleep_abortable as the sleep is very short
anyway.
2026-03-16 16:14:10 +01:00
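An abort-aware wait-for-port loop checks the abort flag on every retry iteration, so a shutdown interrupts the wait promptly. A sketch (illustrative Python using a threading.Event in place of Seastar's abort_source; names are assumptions):

```python
import threading
import time

def wait_for_port(is_open, abort: threading.Event,
                  timeout=30.0, interval=0.01):
    """Poll `is_open` until it returns True; bail out promptly
    if `abort` is set (e.g. on shutdown)."""
    deadline = time.monotonic() + timeout
    while not is_open():
        if abort.is_set():
            raise RuntimeError("aborted while waiting for port")
        if time.monotonic() >= deadline:
            raise TimeoutError("port never opened")
        time.sleep(interval)   # very short sleep, flag checked each round

abort = threading.Event()
opened_at = time.monotonic() + 0.05
wait_for_port(lambda: time.monotonic() >= opened_at, abort)  # succeeds

abort.set()
try:
    wait_for_port(lambda: False, abort)
    raise AssertionError("should have aborted")
except RuntimeError:
    pass
```

Because the per-iteration sleep is very short, checking the flag between sleeps is enough; an abortable sleep primitive would add little here, matching the commit's reasoning.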
Marcin Maliszkiewicz
edf0148bee perf-alternator: wait for alternator port before running workload
This patch is mostly for the purpose of running the pgo CI job.

We may receive a connection error if the asyncio.sleep(5) in
pgo.py is not a sufficient waiting time.

pgo.py does wait for a port, but only for CQL; in any case it's better
to have a high-level check here than to wait for the alternator port
there.
2026-03-16 16:07:52 +01:00
Raphael S. Carvalho
ee87b66033 replica: Demote log level on split failure during shutdown
Dtest failed with:

table - Failed to load SSTable .../me-3gyn_0qwi_313gw2n2y90v2j4fcv-big-Data.db
of origin memtable due to std::runtime_error (Cannot split
.../me-3gyn_0qwi_313gw2n2y90v2j4fcv-big-Data.db because manager has compaction
disabled, reason might be out of space prevention), it will be unlinked...

The reason is that the error above is being triggered when the cause is
shutdown, not out of space prevention. Let's distinguish between the two
cases and log the error with warning level on shutdown.

Fixes https://github.com/scylladb/scylladb/issues/24850.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2026-03-16 12:03:17 -03:00
Raphael S. Carvalho
b508f3dd38 service: Demote log level on split failure during shutdown
Since commit 509f2af8db, gate_closed_exception can be triggered
for ongoing split during shutdown. The commit is correct, but it
causes split failure on shutdown to log an error, which causes
CI instability. Previously, aborted_exception would be triggered
instead which is logged as warning. Let's do the same.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-951.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2026-03-16 11:52:00 -03:00
Karol Nowacki
7659a5b878 vector_search: test: fix flaky test
The test assumes that the sleep duration will be at least the value of
the sleep parameter. However, the actual sleep time can be slightly less
than requested (e.g., a 100ms sleep request might result in a 99ms
sleep).

This commit adjusts the test's time comparison to be more lenient,
preventing test flakiness.
2026-03-13 16:28:22 +01:00
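Since timers can fire marginally early, asserting `elapsed >= requested` exactly is flaky; the fix is a tolerant comparison. A sketch (illustrative Python; the slack value is an assumption, not the value used in the actual test):

```python
import time

def assert_slept_at_least(requested_s, elapsed_s, slack_s=0.005):
    """Allow the measured sleep to undershoot by a small slack,
    since e.g. a 100ms sleep request may result in a 99ms sleep."""
    assert elapsed_s >= requested_s - slack_s, (
        f"slept {elapsed_s:.4f}s, expected ~{requested_s}s")

start = time.monotonic()
time.sleep(0.05)
elapsed = time.monotonic() - start
assert_slept_at_least(0.05, elapsed)
```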
Karol Nowacki
5474cc6cc2 vector_search: fix race condition on connection timeout
When a `with_connect` operation timed out, the underlying connection
attempt continued to run in the reactor. This could lead to a crash
if the connection was established/rejected after the client object had
already been destroyed. This issue was observed during the teardown
phase of an upcoming high-availability test case.

This commit fixes the race condition by ensuring the connection attempt
is properly canceled on timeout.

Additionally, the explicit TLS handshake previously forced during the
connection is now deferred to the first I/O operation, which is the
default and preferred behavior.

Fixes: SCYLLADB-832
2026-03-13 16:28:22 +01:00
Andrzej Jackowski
60aaea8547 cql: improve write consistency level guardrail messages
Update warn and fail messages for the write_consistency_levels_warned
and write_consistency_levels_disallowed guardrails to include the
configuration option name and actionable guidance. The main motivation
is to make the messages follow the conventions of other guardrails.

Refs: SCYLLADB-257
2026-03-13 14:40:45 +01:00
Tomasz Grabiec
1256a9faa7 tablets: Fix deadlock in background storage group merge fiber
When it deadlocks, groups stop merging and the compaction group merge
backlog will run away.

Also, graceful shutdown will be blocked on it.

Found by the flaky unit test
test_merge_chooses_best_replica_with_odd_count, which timed out in 1
in 100 runs.

Reason for deadlock:

When storage groups are merged, the main compaction group of the new
storage group takes a compaction lock, which is appended to
_compaction_reenablers_for_merging, and released when the merge
completion fiber is done with the whole batch.

If we accumulate more than 1 merge cycle for the fiber, deadlock
occurs. The lock order will be this:

Initial state:

 cg0: main
 cg1: main
 cg2: main
 cg3: main

After 1st merge:

 cg0': main [locked], merging_groups=[cg0.main, cg1.main]
 cg1': main [locked], merging_groups=[cg2.main, cg3.main]

After 2nd merge:

 cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]

The merge completion fiber will try to stop cg0'.main, which will
block on the compaction lock, which is held by the reenabler in
_compaction_reenablers_for_merging; hence the deadlock.

The fix is to wait for background merge to finish before we start the
next merge. It's achieved by holding old erm in the background merge,
and doing a topology barrier from the merge finalizing transition.

Background merge is supposed to be a relatively quick operation: it's
stopping compaction groups, so it may wait for active requests, but it
shouldn't prolong the barrier indefinitely.

Tablet boost unit tests which trigger merge need to be adjusted to
call the barrier, otherwise they will be vulnerable to the deadlock.

Two cluster tests were removed because they assumed that merge happens
in the background. Now that it happens as part of merge finalization,
and blocks the topology state machine, those tests deadlock because
they are unable to make topology changes (node bootstrap) while the
background merge is blocked.

The test "test_tablets_merge_waits_for_lwt" needed to be adjusted. It
assumed that merge finalization doesn't wait for the erm held by the
LWT operation, and triggered tablet movement afterwards, and assumed
that this migration will issue a barrier which will block on the LWT
operation. After this commit, it's the barrier in merge finalization
which is blocked. The test was adjusted to use an earlier log mark
when waiting for "Got raft_topology_cmd::barrier_and_drain", which
will catch the barrier in merge finalization.

Fixes SCYLLADB-928
2026-03-12 22:45:01 +01:00
Tomasz Grabiec
7706c9e8c4 replica: table: Propagate old erm to storage group merge 2026-03-12 22:45:01 +01:00
Tomasz Grabiec
582a4abeb6 test: boost: tablets_test: Save tablet metadata when ACKing split resize decision
Needs to be ordered before split finalization, because storage_group
must be in split mode already at finalization time. There must be
split-ready compaction groups, otherwise finalization fails with this
error:

  Found 0 split ready compaction groups, but expected 2 instead.

Exposed by increased split activity in tests.
2026-03-12 22:45:01 +01:00
Tomasz Grabiec
279fcdd5ff storage_service: Extract local_topology_barrier()
Will be called in tests. It does the local part of the global topology
barrier.

The comment:

        // We capture the topology version right after the checks
        // above, before any yields. This is crucial since _topology_state_machine._topology
        // might be altered concurrently while this method is running,
        // which can cause the fence command to apply an invalid fence version.

was dropped, because it's no longer true after
fad6c41cee, and it doesn't make sense in
the context of local_topology_barrier(). We'd have to propagate the
version to local_topology_barrier(), but it's pointless. The fence
version is decided before calling the local barrier, and it will be
valid even if local version moves ahead.
2026-03-12 22:44:56 +01:00
Nadav Har'El
92ee959e9b test/alternator: speed up test_streams.py by using module-scope fixtures
Previously, all stream-table fixtures in this test file used
scope="function", forcing a fresh table to be created for every test,
slowing down the test a bit (though not much), and discouraging writing
small new tests.

This was a workaround for a DynamoDB quirk (that Alternator doesn't have):
LATEST shard iterators have a time slack and may point slightly before
the true stream head, causing leftover events from a previous test to
appear in the next test's reads.

We fix this by draining the stream inside latest_iterators() and
shards_and_latest_iterators() after obtaining the LATEST iterators:
fetch records in a loop until two consecutive polling rounds both return
empty, guaranteeing the iterators are positioned past all pre-existing
events before the caller writes anything.  With this guarantee in place,
all stream-table fixtures can safely use scope="module".

After this patch, test_streams.py continues to pass on DynamoDB.
On Alternator, the test file's run time went down a bit, from
20.2 seconds to 17.7 seconds.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-10 17:14:04 +02:00
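The draining rule above is: fetch records in a loop and stop only after two consecutive empty polling rounds, which guarantees the iterators are positioned past all pre-existing events. A sketch (illustrative Python over a generic poll() callable, not actual DynamoDB API calls):

```python
def drain_stream(poll, required_empty_rounds=2):
    """Call poll() repeatedly, discarding records, until
    `required_empty_rounds` consecutive polls return nothing."""
    empty_rounds = 0
    drained = 0
    while empty_rounds < required_empty_rounds:
        records = poll()
        if records:
            drained += len(records)
            empty_rounds = 0     # reset: the stream is not quiet yet
        else:
            empty_rounds += 1
    return drained

# Leftover events can arrive in bursts with an empty poll in between;
# a single empty poll must not end the drain early.
batches = iter([[1, 2], [], [3], [], []])
assert drain_stream(lambda: next(batches, [])) == 3
```

Requiring two consecutive empty rounds (rather than one) is what protects against the burst-then-gap pattern shown in the example.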
Nadav Har'El
6ac1f1333f test/alternator: test_streams.py don't use fixtures in 4 tests
In the next patch, we plan to make the fixtures in test_streams.py
shared between tests. Most tests work well with shared tables, but two
(test_streams_trim_horizon and test_streams_starting_sequence_number)
were written to expect a new table with an empty history, and two
other (test_streams_closed_read and test_streams_disabled_stream) want
to disable streaming and would break a shared table.

So in this patch we modify these four tests to create their own new
table instead of using a fixture.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-10 17:12:33 +02:00
Nadav Har'El
16e7a88a02 test/alternator: fix do_test() in test_streams.py
Many tests in test/alternator/test_streams.py use a do_test() function
which invokes a user-defined function that runs some write requests,
and then verifies that the expected output appears on the stream.

Because DynamoDB drops do-nothing changes from the stream - such as
writing to an item a value that it already has - these tests need to
write to a different item each time, so do_test() invents a random key
and passes it to the user-defined function to use. But... we had a bug,
the random number generation was done only once, instead of every time.
The fix is to do the random number generation on every call.

We never noticed this bug when each test used a brand new table. But the
next patch will make the tests share the test table, and tests start
to fail. It's especially visible if you run the same test twice against
DynamoDB, e.g.,

test/alternator/run --count 2 --aws \
    test_streams.py::test_streams_putitem_keys_only

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-09 19:21:53 +02:00
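In Python, "the random number generation was done only once" is the classic default-argument pitfall: a default like `key=random_key()` is evaluated once at function definition time, not on every call. A minimal sketch of the bug and the fix (illustrative; the actual do_test() signature lives in test_streams.py, and a counter replaces the random key to keep the demo deterministic):

```python
import itertools

_next = itertools.count()

def fresh_key():
    # Stands in for the test's random key generation.
    return f"p{next(_next)}"

def do_test_buggy(test_fn, key=fresh_key()):   # BUG: evaluated once, at def time
    test_fn(key)
    return key

def do_test_fixed(test_fn, key=None):
    if key is None:
        key = fresh_key()                      # fresh key on every call
    test_fn(key)
    return key

noop = lambda k: None
assert do_test_buggy(noop) == do_test_buggy(noop)   # same key reused
assert do_test_fixed(noop) != do_test_fixed(noop)   # distinct keys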
Łukasz Paszkowski
147b355326 replica/table: avoid computing token range side in storage_group_of() on hot path
`storage_group_of()` is on the replica-side token lookup hot path but
used `tablet_map::get_tablet_id_and_range_side()`, which computes both
tablet id and post-split range side.

Most callers only need the storage group id. Switch `storage_group_of()`
to use `get_tablet_id()` via `tablet_id_for_token()`, and select the
compaction group via new overloads that compute the range side only
when splitting mode is active.
2026-03-09 17:59:36 +01:00
Łukasz Paszkowski
419e9aa323 replica/compaction_group: add lazy select_compaction_group() overloads
Change `storage_group::select_compaction_group()` to accept a token
(and tablet_map) and compute the tablet range side only when
splitting_mode() is active.

Add an overload for selecting the compaction group for an sstable
spanning a token range.
2026-03-09 17:59:36 +01:00
Łukasz Paszkowski
3f70611504 locator/tablets: add tablet_map::get_tablet_range_side()
Add `tablet_map::get_tablet_range_side(token)` to compute the
post-split range side without computing the tablet id.

Pure addition, no behavior change.
2026-03-09 17:59:36 +01:00
Jakub Smolar
7cdd979158 db/config: announce ms format as highest supported
Uncomment the feature flag check in get_highest_supported_format()
to return MS format when supported, otherwise fall back to ME.
2026-03-09 17:12:09 +01:00
Michał Chojnowski
949fc85217 db/config: enable ms sstable format by default
Trie-based sstable indexes are supposed to be (hopefully)
a better default than the old BIG indexes.
Make them the new default.

If we change our mind, this change can be reverted later.
2026-03-09 17:12:09 +01:00
Michał Chojnowski
6b413e3959 cluster/dtest/bypass_cache_test: switch from highest_supported_sstable_format to chosen_sstable_format
Trie-based indexes and older indexes have a difference in metrics,
and the test uses the metrics to check for bypass cache.
To choose the right metrics, it uses highest_supported_sstable_format,
which is inappropriate, because the sstable format chosen for writes
by Scylla might be different than highest_supported_sstable_format.

Use chosen_sstable_format instead.
2026-03-09 17:12:09 +01:00
Michał Chojnowski
b89840c4b9 api/system: add /system/chosen_sstable_version
Returns the sstable version currently chosen for use in new sstables.

We are adding it because some tests want to know what format they are
writing (tests using upgradesstable, tests which check stats that only
apply to one of the index types, etc).

(Currently they are using `highest_supported_sstable_format` for this
purpose, which is inappropriate, and will become invalid if a non-latest
format is the default).
2026-03-09 17:12:09 +01:00
Michał Chojnowski
9280a039ee test/cluster/dtest: reduce num_tokens to 16
cluster.dtest_alternator_tests.test_slow_query_logging performs
a bootstrap with 768 token ranges.

It works with `me` sstables, which have 2 open file descriptors
per open sstable, but with `ms` sstables, which have 3 open
file descriptors per open sstable, it fails with EMFILE.

To avoid this problem, let's just decrease the number of vnodes in the
test suite. It's appropriate anyway, because it avoids some unneeded
work without weakening the tests.
(Note: pylib-based tests have been setting `num_tokens` to 16 for a long time too).

This breaks `bypass_cache_test`, which is written in a way that expects
a certain number of token ranges. We adjust the relevant parameter
accordingly.
2026-03-09 17:12:09 +01:00
Botond Dénes
cd13a911cc test/cluster/test_data_resurrection_in_memtable.py: dump rows before check
So that if the check of expected rows fail, we have a dump to look at
and see what is different.
2026-03-05 11:44:02 +02:00
Botond Dénes
f375aae257 replica/database: consolidate the two database_apply error injections
Into a single database_apply one. Add three parameters:
* ks_name and cf_name to filter the tables to be affected
* what - what to do: throw or wait

This leads to smaller footprint in the code and improved filtering for
table names at the cost of some extra error injection params in the
tests.
2026-03-05 11:44:02 +02:00
Botond Dénes
44b8cad3df service/storage_proxy: add name of table to error message for write errors
It is useful to know what table the failed write belongs to.
2026-03-05 10:51:12 +02:00
Ernest Zaslavsky
afac984632 s3_client: reorganize tests in part_size_calculation_test
just group all BOOST_REQUIRE_EXCEPTION tests in one block and
remove artificial scopes
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
1a20877afe s3_client: switch to using s3 limit constants in tests
instead of using magic numbers, switch to using the s3 limit constants
to make it clearer what is tested and why
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
d763bdabc2 s3_client: fix the s3::range max object size
in the s3::Range class, start using the s3 global constant for two reasons:
1) uniformity: no need to introduce a semantically identical constant in each class
2) the value was wrong
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
24e70b30c8 s3_client: remove "aws" prefix from object limits constants
remove "aws" prefix from object limits constants since it is
irrelevant and unnecessary when sitting under s3 namespace
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
329c156600 s3_client: make s3 object limits accessible
make s3 limits constants publicly accessible to reuse it later
2026-02-18 12:12:04 +02:00
169 changed files with 8615 additions and 2233 deletions

View File

@@ -55,22 +55,26 @@ ninja build/<mode>/test/boost/<test_name>
ninja build/<mode>/scylla
# Run all tests in a file
./test.py --mode=<mode> <test_path>
./test.py --mode=<mode> test/<suite>/<test_name>.py
# Run a single test case from a file
./test.py --mode=<mode> <test_path>::<test_function_name>
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
# Run all tests in a directory
./test.py --mode=<mode> test/<suite>/
# Examples
./test.py --mode=dev alternator/
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/alternator/
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/cqlpy/test_json.py
# Optional flags
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
```
**Important:**
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
- To run a single test case, append `::<test_function_name>` to the file path
- Add `-v` for verbose output
- Add `--repeat <num>` to repeat a test multiple times

View File

@@ -8,6 +8,9 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7

View File

@@ -1,4 +1,6 @@
name: Trigger Scylla CI Route
permissions:
contents: read
on:
issue_comment:

View File

@@ -1,5 +1,8 @@
name: Trigger next gating
permissions:
contents: read
on:
push:
branches:

View File

@@ -1295,6 +1295,45 @@
}
]
},
{
"path":"/storage_service/logstor_compaction",
"operations":[
{
"method":"POST",
"summary":"Trigger compaction of the key-value storage",
"type":"void",
"nickname":"logstor_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"major",
"description":"When true, perform a major compaction",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/logstor_flush",
"operations":[
{
"method":"POST",
"summary":"Trigger flush of logstor storage",
"type":"void",
"nickname":"logstor_flush",
"produces":[
"application/json"
],
"parameters":[]
}
]
},
{
"path":"/storage_service/active_repair/",
"operations":[
@@ -3229,6 +3268,38 @@
}
]
},
{
"path":"/storage_service/logstor_info",
"operations":[
{
"method":"GET",
"summary":"Logstor segment information for one table",
"type":"table_logstor_info",
"nickname":"logstor_info",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"table name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/retrain_dict",
"operations":[
@@ -3637,6 +3708,47 @@
}
}
},
"logstor_hist_bucket":{
"id":"logstor_hist_bucket",
"properties":{
"bucket":{
"type":"long"
},
"count":{
"type":"long"
},
"min_data_size":{
"type":"long"
},
"max_data_size":{
"type":"long"
}
}
},
"table_logstor_info":{
"id":"table_logstor_info",
"description":"Per-table logstor segment distribution",
"properties":{
"keyspace":{
"type":"string"
},
"table":{
"type":"string"
},
"compaction_groups":{
"type":"long"
},
"segments":{
"type":"long"
},
"data_size_histogram":{
"type":"array",
"items":{
"$ref":"logstor_hist_bucket"
}
}
}
},
"tablet_repair_result":{
"id":"tablet_repair_result",
"description":"Tablet repair result",

View File

@@ -209,6 +209,21 @@
"parameters":[]
}
]
},
{
"path":"/system/chosen_sstable_version",
"operations":[
{
"method":"GET",
"summary":"Get sstable version currently chosen for use in new sstables",
"type":"string",
"nickname":"get_chosen_sstable_version",
"produces":[
"application/json"
],
"parameters":[]
}
]
}
]
}

View File

@@ -18,7 +18,9 @@
#include "utils/assert.hh"
#include "utils/estimated_histogram.hh"
#include <algorithm>
#include <sstream>
#include "db/data_listeners.hh"
#include "utils/hash.hh"
#include "storage_service.hh"
#include "compaction/compaction_manager.hh"
#include "unimplemented.hh"
@@ -342,6 +344,56 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
return ret;
}
static
future<json::json_return_type>
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// when the query is empty return immediately
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
cf::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
cf::get_column_family_name.set(r, [&db] (const_req req){
std::vector<sstring> res;
@@ -1047,6 +1099,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
});
});
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
return rest_toppartitions_generic(db, std::move(req));
});
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
if (!req->get_query_param("split_output").empty()) {
fail(unimplemented::cause::API);
@@ -1213,6 +1269,7 @@ void unset_column_family(http_context& ctx, routes& r) {
cf::get_sstable_count_per_level.unset(r);
cf::get_sstables_for_key.unset(r);
cf::toppartitions.unset(r);
ss::toppartitions_generic.unset(r);
cf::force_major_compaction.unset(r);
ss::get_load.unset(r);
ss::get_metrics_load.unset(r);

View File

@@ -17,9 +17,7 @@
#include "gms/feature_service.hh"
#include "schema/schema_builder.hh"
#include "sstables/sstables_manager.hh"
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
@@ -612,56 +610,6 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
}
static
future<json::json_return_type>
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// when the query is empty return immediately
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
httpd::column_family_json::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
static
json::json_return_type
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
@@ -833,6 +781,28 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
co_return json_void();
}
static
future<json::json_return_type>
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
bool major = false;
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
major = validate_bool(major_param);
}
apilog.info("logstor_compaction: major={}", major);
auto& db = ctx.db;
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
co_return json_void();
}
static
future<json::json_return_type>
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
apilog.info("logstor_flush");
auto& db = ctx.db;
co_await replica::database::flush_logstor_separator_on_all_shards(db);
co_return json_void();
}
static
future<json::json_return_type>
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
@@ -1553,6 +1523,54 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
});
}
static
future<json::json_return_type>
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
auto table = api::req_param<sstring>(*req, "table", {}).value;
if (table.empty()) {
table = api::req_param<sstring>(*req, "cf", {}).value;
}
if (keyspace.empty()) {
throw bad_param_exception("The query parameter 'keyspace' is required");
}
if (table.empty()) {
throw bad_param_exception("The query parameter 'table' is required");
}
keyspace = validate_keyspace(ctx, keyspace);
auto tid = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(tid);
if (!cf.uses_logstor()) {
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
}
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
merged_stats += shard_stats;
}, [tid](const replica::database& db) {
return db.get_logstor_table_segment_stats(tid);
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
ss::table_logstor_info result;
result.keyspace = keyspace;
result.table = table;
result.compaction_groups = merged_stats.compaction_group_count;
result.segments = merged_stats.segment_count;
for (const auto& bucket : merged_stats.histogram) {
ss::logstor_hist_bucket hist;
hist.count = bucket.count;
hist.max_data_size = bucket.max_data_size;
result.data_size_histogram.push(std::move(hist));
}
return make_ready_future<json::json_return_type>(stream_object(result));
});
});
}
static
future<json::json_return_type>
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
@@ -1784,7 +1802,6 @@ rest_bind(FuncType func, BindArgs&... args) {
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
@@ -1800,6 +1817,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
ss::move.set(r, rest_bind(rest_move, ss));
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
@@ -1848,6 +1867,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
@@ -1864,7 +1884,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
void unset_storage_service(http_context& ctx, routes& r) {
ss::get_token_endpoint.unset(r);
ss::toppartitions_generic.unset(r);
ss::get_release_version.unset(r);
ss::get_scylla_release_version.unset(r);
ss::get_schema_version.unset(r);
@@ -1878,6 +1897,8 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::reset_cleanup_needed.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::logstor_compaction.unset(r);
ss::logstor_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
@@ -1925,6 +1946,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_ownership.unset(r);
ss::get_effective_ownership.unset(r);
ss::sstable_info.unset(r);
ss::logstor_info.unset(r);
ss::reload_raft_topology_state.unset(r);
ss::upgrade_to_raft_topology.unset(r);
ss::raft_topology_upgrade_status.unset(r);

View File

@@ -190,6 +190,13 @@ void set_system(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx] {
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
}
}

View File

@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
_permission_loader = std::move(loader);
}
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
auto it = _roles.find(role);
if (it == _roles.end()) {
return {};
@@ -55,6 +55,16 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
return it->second;
}
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
for (const auto& [name, record] : _roles) {
func(name, *record);
}
}
size_t cache::roles_count() const noexcept {
return _roles.size();
}
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
std::unordered_map<resource, permission_set>* perms_cache;
lw_shared_ptr<role_record> role_ptr;

View File

@@ -9,6 +9,7 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <string_view>
#include <unordered_set>
#include <unordered_map>
@@ -19,7 +20,7 @@
#include <seastar/core/semaphore.hh>
#include <seastar/core/metrics_registration.hh>
#include <absl/container/flat_hash_map.h>
#include "absl-flat_hash_map.hh"
#include "auth/permission.hh"
#include "auth/common.hh"
@@ -42,8 +43,8 @@ public:
std::unordered_set<role_name_t> member_of;
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring> attributes;
std::unordered_map<sstring, permission_set> permissions;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance
@@ -52,7 +53,7 @@ public:
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
void set_permission_loader(permission_loader_func loader);
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
future<> prune(const resource& r);
@@ -61,8 +62,15 @@ public:
future<> load_roles(std::unordered_set<role_name_t> roles);
static bool includes_table(const table_id&) noexcept;
// Returns the number of roles in the cache.
size_t roles_count() const noexcept;
// The callback doesn't suspend (no co_await) so it observes the state
// of the cache atomically.
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
private:
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
roles_map _roles;
// anonymous permissions map exists mainly due to compatibility with
// higher layers which use role_or_anonymous to get permissions.

View File

@@ -0,0 +1,37 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include "auth/default_authorizer.hh"
#include "auth/permission.hh"
namespace auth {
// maintenance_socket_authorizer is used for clients connecting to the
// maintenance socket. It grants all permissions unconditionally (like
// AllowAllAuthorizer) while still supporting grant/revoke operations
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
class maintenance_socket_authorizer : public default_authorizer {
public:
using default_authorizer::default_authorizer;
~maintenance_socket_authorizer() override = default;
future<> start() override {
return make_ready_future<>();
}
future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
return make_ready_future<permission_set>(permissions::ALL);
}
};
} // namespace auth

View File

@@ -30,6 +30,7 @@
#include "auth/default_authorizer.hh"
#include "auth/ldap_role_manager.hh"
#include "auth/maintenance_socket_authenticator.hh"
#include "auth/maintenance_socket_authorizer.hh"
#include "auth/maintenance_socket_role_manager.hh"
#include "auth/password_authenticator.hh"
#include "auth/role_or_anonymous.hh"
@@ -866,6 +867,12 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
};
}
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
return [&qp] {
return std::make_unique<maintenance_socket_authorizer>(qp.local());
};
}
role_manager_factory make_maintenance_socket_role_manager_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,

View File

@@ -434,6 +434,11 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a factory for the maintenance socket authorizer.
/// This authorizer is not config-selectable and is only used for the maintenance socket.
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
/// Creates a factory for the maintenance socket role manager.
/// This role manager is not config-selectable and is only used for the maintenance socket.
role_manager_factory make_maintenance_socket_role_manager_factory(

View File

@@ -44,13 +44,12 @@ namespace auth {
static logging::logger log("standard_role_manager");
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
auto name = sstring(role_name);
auto role = _cache.get(name);
auto role = _cache.get(role_name);
if (!role) {
return make_ready_future<std::optional<record>>(std::nullopt);
}
return make_ready_future<std::optional<record>>(std::make_optional(record{
.name = std::move(name),
.name = sstring(role_name),
.is_superuser = role->is_superuser,
.can_login = role->can_login,
.member_of = role->member_of
@@ -393,51 +392,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
const sstring query = seastar::format("SELECT * FROM {}.{}",
db::system_keyspace::NAME,
ROLE_MEMBERS_CF);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
for (const auto& granted_role : record.member_of) {
roles_map.emplace(name, granted_role);
}
});
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
db::system_keyspace::NAME,
meta::roles_table::name);
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_set roles;
std::transform(
results->begin(),
results->end(),
std::inserter(roles, roles.begin()),
[] (const cql3::untyped_result_set_row& row) {
return row.get_as<sstring>(role_col_name_string);}
);
roles.reserve(_cache.roles_count());
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
roles.insert(name);
});
co_return roles;
}
@@ -460,31 +429,26 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
db::system_keyspace::NAME,
ROLE_ATTRIBUTES_CF);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
auto role = _cache.get(role_name);
if (!role) {
co_return std::nullopt;
}
co_return std::optional<sstring>{};
auto it = role->attributes.find(attribute_name);
if (it != role->attributes.end()) {
co_return it->second;
}
co_return std::nullopt;
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
});
}).then([&role_to_att_val] () {
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
});
});
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
attribute_vals result;
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
auto it = record.attributes.find(attribute_name);
if (it != record.attributes.end()) {
result.emplace(name, it->second);
}
});
co_return result;
}
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {

View File

@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
if (dsm && (this_shard_id() == 0)) {
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
if (threshold_reached) {
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = true;
return cm.drain();
});
}
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = false;
cm.enable();
});
});
}
@@ -2348,6 +2354,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
}
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
std::exception_ptr ex;
if (_in_critical_disk_utilization_mode) {
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
} else {
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
}
return ex;
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2357,8 +2373,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
co_return coroutine::exception(make_disabled_exception(t));
}
std::vector<sstables::shared_sstable> ret;

View File

@@ -115,6 +115,8 @@ private:
uint32_t _disabled_state_count = 0;
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
// precondition: is_disabled() is true.
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
std::optional<future<>> _stop_future;
@@ -170,6 +172,7 @@ private:
shared_tombstone_gc_state _shared_tombstone_gc_state;
utils::disk_space_monitor::subscription _out_of_space_subscription;
bool _in_critical_disk_utilization_mode = false;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);

View File

@@ -397,6 +397,17 @@ commitlog_total_space_in_mb: -1
# you can cache more hot rows
# column_index_size_in_kb: 64
# sstable format version for newly written sstables.
# Currently allowed values are `me` and `ms`.
# If not specified in the config, this defaults to `me`.
#
# The difference between `me` and `ms` is the data structures used
# in the primary index.
# In short, `ms` needs more CPU during sstable writes,
# but should behave better during reads,
# although it might behave worse for very long clustering keys.
sstable_format: ms
# Auto-scaling of the promoted index prevents running out of memory
# when the promoted index grows too large (due to partitions with many rows
# vs. too small column_index_size_in_kb). When the serialized representation

View File

@@ -896,6 +896,9 @@ scylla_core = (['message/messaging_service.cc',
'replica/multishard_query.cc',
'replica/mutation_dump.cc',
'replica/querier.cc',
'replica/logstor/segment_manager.cc',
'replica/logstor/logstor.cc',
'replica/logstor/write_buffer.cc',
'mutation/atomic_cell.cc',
'mutation/canonical_mutation.cc',
'mutation/frozen_mutation.cc',
@@ -1467,6 +1470,7 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/query.idl.hh',
'idl/idl_test.idl.hh',
'idl/commitlog.idl.hh',
'idl/logstor.idl.hh',
'idl/tracing.idl.hh',
'idl/consistency_level.idl.hh',
'idl/cache_temperature.idl.hh',

View File

@@ -265,7 +265,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
exceptions::invalid_request_exception(
format("Consistency level {} is not allowed for write operations", cl)));
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl)));
}
for (size_t i = 0; i < _statements.size(); ++i) {
@@ -277,7 +280,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
_stats.statements_in_cas_batches += _statements.size();
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
return result;
});
@@ -297,7 +301,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
}
auto result = make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
});
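The guardrail check above has three outcomes: `FAIL` rejects the write with an `invalid_request_exception`, `WARN` attaches a warning to the result message, and otherwise the write proceeds silently. A minimal standalone model of that three-way decision (the enum and helper are illustrative, not Scylla's actual API):

```cpp
#include <cassert>
#include <optional>
#include <stdexcept>
#include <string>

enum class guardrail_state { OK, WARN, FAIL };

// Returns std::nullopt to proceed silently, a warning string for WARN,
// and throws for FAIL -- mirroring the three branches above.
std::optional<std::string> check_write_cl(guardrail_state s, const std::string& cl) {
    switch (s) {
    case guardrail_state::FAIL:
        throw std::invalid_argument(
            "Write consistency level " + cl +
            " is forbidden by write_consistency_levels_disallowed");
    case guardrail_state::WARN:
        return "Using write consistency level " + cl +
               " listed in write_consistency_levels_warned is not recommended.";
    default:
        return std::nullopt;
    }
}
```

Note that in the real code the same check is duplicated across the conditional (CAS) and non-conditional paths of both `batch_statement` and `modification_statement`, which is why the message appears several times in the diff.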

View File

@@ -59,6 +59,8 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
const sstring cf_prop_defs::KW_TABLETS = "tablets";
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
schema::extensions_map er;
for (auto& p : exts.schema_extensions()) {
@@ -106,6 +108,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
KW_STORAGE_ENGINE,
});
static std::set<sstring> obsolete_keywords({
sstring("index_interval"),
@@ -196,6 +199,20 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
}
db::tablet_options::validate(*tablet_options_map);
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
if (!db.features().logstor) {
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
}
if (!db.get_config().enable_logstor()) {
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
}
} else {
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
}
}
}
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
@@ -396,6 +413,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
builder.set_tablet_options(std::move(*tablet_options_opt));
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
builder.set_logstor();
}
}
}
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const

View File

@@ -64,6 +64,8 @@ public:
static const sstring KW_TABLETS;
static const sstring KW_STORAGE_ENGINE;
// FIXME: In origin the following consts are in CFMetaData.
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;

View File

@@ -9,6 +9,7 @@
*/
#include "cql3/statements/cf_prop_defs.hh"
#include "utils/assert.hh"
#include <inttypes.h>
#include <boost/regex.hpp>
@@ -266,6 +267,13 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
}
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor" && !_column_aliases.empty()) {
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
}
}
auto& key_aliases = _key_aliases[0];
std::vector<data_type> key_types;
for (auto&& alias : key_aliases) {

View File

@@ -273,7 +273,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
co_return coroutine::exception(
std::make_exception_ptr(exceptions::invalid_request_exception(
format("Consistency level {} is not allowed for write operations", cl))));
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl))));
}
_restrictions->validate_primary_key(options);
@@ -281,7 +284,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (has_conditions()) {
auto result = co_await execute_with_condition(qp, qs, options);
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
co_return result;
}
@@ -303,7 +307,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
if (keys_size_one) {
auto&& table = s->table();

View File

@@ -679,6 +679,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"The directory where hints files are stored if hinted handoff is enabled.")
, view_hints_directory(this, "view_hints_directory", value_status::Used, "",
"The directory where materialized-view updates are stored while a view replica is unreachable.")
, logstor_directory(this, "logstor_directory", value_status::Used, "",
"The directory where data files for logstor storage are stored.")
, saved_caches_directory(this, "saved_caches_directory", value_status::Unused, "",
"The directory location where table key and row caches are stored.")
/**
@@ -862,6 +864,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"* offheap_objects Native memory, eliminating NIO buffer heap overhead.")
, memtable_cleanup_threshold(this, "memtable_cleanup_threshold", value_status::Invalid, .11,
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load.")
, logstor_disk_size_in_mb(this, "logstor_disk_size_in_mb", value_status::Used, 2048,
"Total size in megabytes allocated for logstor storage on disk.")
, logstor_file_size_in_mb(this, "logstor_file_size_in_mb", value_status::Used, 32,
"Total size in megabytes allocated for each logstor data file on disk.")
, logstor_separator_delay_limit_ms(this, "logstor_separator_delay_limit_ms", value_status::Used, 100,
"Maximum delay in milliseconds for logstor separator debt control.")
, logstor_separator_max_memory_in_mb(this, "logstor_separator_max_memory_in_mb", value_status::Used, 256,
"Maximum memory in megabytes for logstor separator memory buffers.")
, file_cache_size_in_mb(this, "file_cache_size_in_mb", value_status::Unused, 512,
"Total memory to use for SSTable-reading buffers.")
, memtable_flush_queue_size(this, "memtable_flush_queue_size", value_status::Unused, 4,
@@ -1281,6 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted).")
, enable_cache(this, "enable_cache", value_status::Used, true, "Enable cache.")
, enable_commitlog(this, "enable_commitlog", value_status::Used, true, "Enable commitlog.")
, enable_logstor(this, "enable_logstor", value_status::Used, false, "Enable the logstor storage engine.")
, volatile_system_keyspace_for_testing(this, "volatile_system_keyspace_for_testing", value_status::Used, false, "Don't persist system keyspace - testing only!")
, api_port(this, "api_port", value_status::Used, 10000, "Http Rest API port.")
, api_address(this, "api_address", value_status::Used, "", "Http Rest API address.")
@@ -1692,6 +1703,7 @@ void db::config::setup_directories() {
maybe_in_workdir(data_file_directories, "data");
maybe_in_workdir(hints_directory, "hints");
maybe_in_workdir(view_hints_directory, "view_hints");
maybe_in_workdir(logstor_directory, "logstor");
maybe_in_workdir(saved_caches_directory, "saved_caches");
}
@@ -1861,7 +1873,8 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
{"tablets", feature::UNUSED},
{"views-with-tablets", feature::UNUSED},
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES}
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES},
{"logstor", feature::LOGSTOR}
};
}

View File

@@ -117,7 +117,8 @@ struct experimental_features_t {
ALTERNATOR_STREAMS,
BROADCAST_TABLES,
KEYSPACE_STORAGE_OPTIONS,
STRONGLY_CONSISTENT_TABLES
STRONGLY_CONSISTENT_TABLES,
LOGSTOR,
};
static std::map<sstring, feature> map(); // See enum_option.
static std::vector<enum_option<experimental_features_t>> all();
@@ -201,6 +202,7 @@ public:
named_value<uint64_t> data_file_capacity;
named_value<sstring> hints_directory;
named_value<sstring> view_hints_directory;
named_value<sstring> logstor_directory;
named_value<sstring> saved_caches_directory;
named_value<sstring> commit_failure_policy;
named_value<sstring> disk_failure_policy;
@@ -244,6 +246,10 @@ public:
named_value<bool> defragment_memory_on_idle;
named_value<sstring> memtable_allocation_type;
named_value<double> memtable_cleanup_threshold;
named_value<uint32_t> logstor_disk_size_in_mb;
named_value<uint32_t> logstor_file_size_in_mb;
named_value<uint32_t> logstor_separator_delay_limit_ms;
named_value<uint32_t> logstor_separator_max_memory_in_mb;
named_value<uint32_t> file_cache_size_in_mb;
named_value<uint32_t> memtable_flush_queue_size;
named_value<uint32_t> memtable_flush_writers;
@@ -364,6 +370,7 @@ public:
named_value<bool> enable_in_memory_data_store;
named_value<bool> enable_cache;
named_value<bool> enable_commitlog;
named_value<bool> enable_logstor;
named_value<bool> volatile_system_keyspace_for_testing;
named_value<uint16_t> api_port;
named_value<sstring> api_address;

View File

@@ -336,6 +336,8 @@ schema_ptr scylla_tables(schema_features features) {
// since it is written to only after the cluster feature is enabled.
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
sb.with_column("storage_engine", utf8_type);
sb.with_hash_version();
s = sb.build();
}
@@ -1676,6 +1678,9 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
}
}
if (table->logstor_enabled()) {
m.set_clustered_cell(ckey, "storage_engine", "logstor", timestamp);
}
// In-memory tables are deprecated since scylla-2024.1.0
// FIXME: delete the column when there's no live version supporting it anymore.
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
@@ -2161,6 +2166,13 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
auto tablet_options = db::tablet_options(*opt_map);
builder.set_tablet_options(tablet_options.to_map());
}
if (auto storage_engine = table_row.get<sstring>("storage_engine")) {
if (*storage_engine == "logstor") {
builder.set_logstor();
} else {
throw std::invalid_argument(format("Invalid value for storage_engine: {}", *storage_engine));
}
}
}
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)

View File

@@ -3052,7 +3052,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
co_return ret;
}
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
const bool tablet_balancing_not_supported = _db.features().strongly_consistent_tables || _db.features().logstor;
for (auto& row : *rs) {
if (!row.has("host_id")) {
@@ -3289,7 +3289,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
}
if (strongly_consistent_tables) {
if (tablet_balancing_not_supported) {
ret.tablet_balancing_enabled = false;
} else if (some_row.has("tablet_balancing_enabled")) {
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");


@@ -2647,7 +2647,7 @@ future<> view_builder::add_new_view(view_ptr view, build_step& step) {
}
if (this_shard_id() == smp::count - 1) {
co_await utils::get_local_injector().inject("add_new_view_pause_last_shard", utils::wait_for_message(5min));
inject_failure("add_new_view_fail_last_shard");
}
co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());


@@ -30,6 +30,31 @@ enum class token_kind {
after_all_keys,
};
// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
struct raw_token {
int64_t value;
/// Constructs a disengaged token.
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
/// Constructs an engaged token.
/// The token must be of token_kind::key kind.
explicit raw_token(const token&);
explicit raw_token(int64_t v) : value(v) {};
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
std::strong_ordering operator<=>(const token& o) const noexcept;
/// Returns true iff engaged.
explicit operator bool() const noexcept {
return value != std::numeric_limits<int64_t>::min();
}
};
using raw_token_opt = seastar::optimized_optional<raw_token>;
class token {
// INT64_MIN is not a legal token, but a special value used to represent
// infinity in token intervals.
@@ -52,6 +77,10 @@ public:
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
token(raw_token raw) noexcept
: token(raw ? kind::key : kind::before_all_keys, raw.value)
{ }
// This constructor seems redundant with the bytes_view constructor, but
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -223,6 +252,29 @@ public:
}
};
inline
raw_token::raw_token(const token& t)
: value(t.raw())
{
#ifdef DEBUG
assert(t._kind == token::kind::key);
#endif
}
inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
switch (o._kind) {
case token::kind::after_all_keys:
return std::strong_ordering::less;
case token::kind::before_all_keys:
// before_all_keys shares its raw value with a disengaged raw_token and sorts before all keys,
// so we can order them by comparing raw values alone.
[[fallthrough]];
case token::kind::key:
return value <=> o._data;
}
}
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
if (l1 == l2) {
return std::strong_ordering::equal;
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
}
};
template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const dht::raw_token& t, FormatContext& ctx) const {
if (!t) {
return fmt::format_to(ctx.out(), "null");
}
return fmt::format_to(ctx.out(), "{}", t.value);
}
};
namespace std {
template<>


@@ -1 +1 @@
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --collector.systemd --collector.systemd.unit-include='^(scylla-server|systemd-coredump.*)\.service$' --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"


@@ -139,7 +139,7 @@ The ``WHERE`` clause
~~~~~~~~~~~~~~~~~~~~
The ``WHERE`` clause specifies which rows must be queried. It is composed of relations on the columns that are part of
the ``PRIMARY KEY``.
the ``PRIMARY KEY``, and relations can be joined only with ``AND`` (``OR`` and other logical operators are not supported).
Not all relations are allowed in a query. For instance, non-equal relations (where ``IN`` is considered as an equal
relation) on a partition key are not supported (see the use of the ``TOKEN`` method below to do non-equal queries on
@@ -200,6 +200,23 @@ The tuple notation may also be used for ``IN`` clauses on clustering columns::
WHERE userid = 'john doe'
AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
This tuple notation is different from boolean grouping. For example, the following query is not supported::
SELECT * FROM users
WHERE (country = 'BR' AND state = 'SP')
because parentheses are only allowed around a single relation, so this works: ``(country = 'BR') AND (state = 'SP')``, but this does not: ``(country = 'BR' AND state = 'SP')``.
Similarly, an extended query of the form::
SELECT * FROM users
WHERE (country = 'BR' AND state = 'SP')
OR (country = 'BR' AND state = 'RJ')
won't work for two reasons: it groups relations with parentheses and it uses ``OR``, neither of which is supported. When possible,
rewrite such queries with ``IN`` on the varying column, for example
``country = 'BR' AND state IN ('SP', 'RJ')``, or run multiple queries and merge
the results client-side.
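
For example, the unsupported ``OR`` query above can be rewritten as::

    SELECT * FROM users
    WHERE country = 'BR' AND state IN ('SP', 'RJ');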
The ``CONTAINS`` operator may only be used on collection columns (lists, sets, and maps). In the case of maps,
``CONTAINS`` applies to the map values. The ``CONTAINS KEY`` operator may only be used on map columns and applies to the
map keys.

docs/cql/guardrails.rst (new file)

@@ -0,0 +1,236 @@
.. highlight:: cql
.. _cql-guardrails:
CQL Guardrails
==============
ScyllaDB provides a set of configurable guardrail parameters that help operators
enforce best practices and prevent misconfigurations that could degrade cluster
health, availability, or performance. Guardrails operate at two severity levels:
* **Warn**: The request succeeds, but the server includes a warning in the CQL
response. Depending on the specific guardrail, the warning may also be logged on the server side.
* **Fail**: The request is rejected with an error/exception (the specific type
depends on the guardrail). The user must correct the request or adjust the
guardrail configuration to proceed.
.. note::
Guardrails are checked only when a statement is
executed. They do not retroactively validate existing keyspaces, tables, or
previously completed writes.
For the full list of configuration properties, including types, defaults, and
liveness information, see :doc:`Configuration Parameters </reference/configuration-parameters>`.
.. _guardrails-replication-factor:
Replication Factor Guardrails
-----------------------------
These four parameters control the minimum and maximum allowed replication factor
(RF) values. They are evaluated whenever a ``CREATE KEYSPACE`` or
``ALTER KEYSPACE`` statement is executed. Each data center's RF is checked
individually.
An RF of ``0`` — which means "do not replicate to this data center" — is
always allowed and never triggers a guardrail.
A threshold value of ``-1`` disables the corresponding check.
``minimum_replication_factor_warn_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF is set to a value greater than ``0`` and lower than
this threshold, the server attaches a warning to the CQL response identifying
the offending data center and RF value.
**When to use.** The default of ``3`` is the standard recommendation for
production clusters. An RF below ``3`` means that the cluster cannot tolerate
even a single node failure without data loss or read unavailability (assuming
``QUORUM`` consistency). Keep this at ``3`` unless your deployment has specific
constraints (e.g., a development or test cluster with fewer than 3 nodes).
``minimum_replication_factor_fail_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF is set to a value greater than ``0`` and lower than
this threshold, the request is rejected with a ``ConfigurationException``
identifying the offending data center and RF value.
**When to use.** Enable this parameter (e.g., set to ``3``) in production
environments where allowing a low RF would be operationally dangerous. Unlike
the warn threshold, this provides a hard guarantee that no keyspace can be
created or altered to have an RF below the limit.
``maximum_replication_factor_warn_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF exceeds this threshold, the server attaches a warning to the CQL response identifying
the offending data center and RF value.
**When to use.** An excessively high RF increases write amplification and
storage costs proportionally. For example, an RF of ``5`` means every write
is replicated to five nodes. Set this threshold to alert operators who
may unintentionally set an RF that is too high.
``maximum_replication_factor_fail_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF exceeds this threshold, the request is rejected with a ``ConfigurationException``
identifying the offending data center and RF value.
**When to use.** Enable this parameter to prevent accidental creation of
keyspaces with an unreasonably high RF. An extremely high RF wastes storage and
network bandwidth and can lead to write latency spikes. This is a hard limit —
the keyspace creation or alteration will not proceed until the RF is lowered.
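
As an illustrative example (the values shown are not the defaults), all four
thresholds can be set together in ``scylla.yaml``::

    minimum_replication_factor_warn_threshold: 3
    minimum_replication_factor_fail_threshold: 2
    maximum_replication_factor_warn_threshold: 5
    maximum_replication_factor_fail_threshold: -1   # disabled

With this configuration, a keyspace created with an RF of ``1`` in some data
center is rejected, an RF of ``2`` only produces a warning, and an RF above
``5`` also only produces a warning.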
**Metrics.** ScyllaDB exposes per-shard metrics that track the number of
times each replication factor guardrail has been triggered:
* ``scylla_cql_minimum_replication_factor_warn_violations``
* ``scylla_cql_minimum_replication_factor_fail_violations``
* ``scylla_cql_maximum_replication_factor_warn_violations``
* ``scylla_cql_maximum_replication_factor_fail_violations``
A sustained increase in any of these metrics indicates that
``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` requests are hitting the configured
thresholds.
.. _guardrails-replication-strategy:
Replication Strategy Guardrails
-------------------------------
These two parameters control which replication strategies trigger warnings or
are rejected when a keyspace is created or altered.
``replication_strategy_warn_list``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
statement is on this list, the server attaches a warning to the CQL response
identifying the discouraged strategy and the affected keyspace.
**When to use.** ``SimpleStrategy`` is not recommended for production use.
It places replicas without awareness of data center or rack topology, which
can undermine fault tolerance in multi-DC deployments. Even in single-DC
deployments, ``NetworkTopologyStrategy`` is recommended because it keeps the
schema ready for future topology changes.
The default configuration warns on ``SimpleStrategy``, which is appropriate
for most deployments. If you have existing keyspaces that use
``SimpleStrategy``, see :doc:`Update Topology Strategy From Simple to Network
</operating-scylla/procedures/cluster-management/update-topology-strategy-from-simple-to-network>`
for the migration procedure.
``replication_strategy_fail_list``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
statement is on this list, the request is rejected with a
``ConfigurationException`` identifying the forbidden strategy and the affected
keyspace.
**When to use.** In production environments, add ``SimpleStrategy`` to this
list to enforce ``NetworkTopologyStrategy`` across all keyspaces. This helps
prevent new production keyspaces from being created with a topology-unaware
strategy.
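
A sketch of such a configuration in ``scylla.yaml`` (assuming the usual YAML
list syntax for list-typed options)::

    replication_strategy_warn_list:
      - SimpleStrategy
    replication_strategy_fail_list:
      - SimpleStrategy

With this in place, any ``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` statement
using ``SimpleStrategy`` is rejected with a ``ConfigurationException``.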
**Metrics.** The following per-shard metrics track replication strategy
guardrail violations:
* ``scylla_cql_replication_strategy_warn_list_violations``
* ``scylla_cql_replication_strategy_fail_list_violations``
.. _guardrails-write-consistency-level:
Write Consistency Level Guardrails
----------------------------------
These two parameters control which consistency levels (CL) are allowed for
write operations (``INSERT``, ``UPDATE``, ``DELETE``, and ``BATCH``
statements).
Be aware that adding warnings to CQL responses can significantly increase
network traffic and reduce overall throughput.
``write_consistency_levels_warned``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If a write operation uses a consistency level on this list, the server attaches
a warning to the CQL response identifying the discouraged consistency level.
**When to use.** Use this parameter to alert application developers when they
use a consistency level that, while technically functional, is not recommended
for the workload. Common examples:
* **Warn on** ``ANY``: writes at ``ANY`` are acknowledged as soon as at least
one node (including a coordinator acting as a hinted handoff store) receives
the mutation. This means data may not be persisted on any replica node at
the time of acknowledgement, risking data loss if the coordinator fails
before hinted handoff completes.
* **Warn on** ``ALL``: writes at ``ALL`` require every replica to acknowledge
the write. If any single replica is down, the write fails. This significantly
reduces write availability.
``write_consistency_levels_disallowed``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If a write operation uses a consistency level on this list, the request is
rejected with an ``InvalidRequestException`` identifying the forbidden
consistency level.
**When to use.** Use this parameter to hard-block consistency levels that are
considered unsafe for your deployment:
* **Disallow** ``ANY``: in production environments, ``ANY`` is almost never
appropriate. It provides the weakest durability guarantee and is a common
source of data-loss incidents when operators or application developers use it
unintentionally.
* **Disallow** ``ALL``: in clusters where high write availability is critical,
blocking ``ALL`` prevents a single node failure from causing write
unavailability.
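
An illustrative configuration combining both options (not the defaults)::

    write_consistency_levels_warned:
      - ALL
    write_consistency_levels_disallowed:
      - ANY

With this in place, a write issued at ``ANY`` fails with an
``InvalidRequestException``, while a write at ``ALL`` succeeds but carries a
warning in the CQL response.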
**Metrics.** The following per-shard metrics track write consistency level
guardrail violations:
* ``scylla_cql_write_consistency_levels_warned_violations``
* ``scylla_cql_write_consistency_levels_disallowed_violations``
Additionally, ScyllaDB exposes the
``scylla_cql_writes_per_consistency_level`` metric, labeled by consistency
level, which tracks the total number of write requests per CL. This metric is
useful for understanding the current write-CL distribution across the cluster
*before* deciding which levels to warn on or disallow. For example, querying
this metric can reveal whether any application is inadvertently using ``ANY``
or ``ALL`` for writes.
.. _guardrails-compact-storage:
Compact Storage Guardrail
-------------------------
``enable_create_table_with_compact_storage``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This boolean parameter controls whether ``CREATE TABLE`` statements with the
deprecated ``COMPACT STORAGE`` option are allowed. Unlike the other guardrails,
it acts as a simple on/off switch rather than using separate warn and fail
thresholds.
**When to use.** Leave this at the default (``false``) for all new
deployments. ``COMPACT STORAGE`` is a legacy feature that will be permanently
removed in a future version of ScyllaDB. Set to ``true`` only if you have a specific,
temporary need to create compact storage tables (e.g., compatibility with legacy
applications during a migration). For details on the ``COMPACT STORAGE`` option, see
:ref:`Compact Tables <compact-tables>` in the Data Definition documentation.
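
For example, with the option at its default of ``false``, a statement such
as::

    CREATE TABLE ks.legacy_kv (pk int PRIMARY KEY, v blob) WITH COMPACT STORAGE;

is rejected; it is accepted only after
``enable_create_table_with_compact_storage`` is set to ``true``.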
Additional References
---------------------
* :doc:`Consistency Level </cql/consistency>`
* :doc:`Data Definition (CREATE/ALTER KEYSPACE) </cql/ddl>`
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
* :doc:`Metrics Reference </reference/metrics>`


@@ -17,6 +17,7 @@ CQL Reference
secondary-indexes
time-to-live
functions
guardrails
wasm
json
mv
@@ -46,6 +47,7 @@ It allows you to create keyspaces and tables, insert and query tables, and more.
* :doc:`Data Types </cql/types>`
* :doc:`Definitions </cql/definitions>`
* :doc:`Global Secondary Indexes </cql/secondary-indexes>`
* :doc:`CQL Guardrails </cql/guardrails>`
* :doc:`Expiring Data with Time to Live (TTL) </cql/time-to-live>`
* :doc:`Functions </cql/functions>`
* :doc:`JSON Support </cql/json>`


@@ -1,347 +1,111 @@
# Prototype design: auditing all keyspaces and per-role auditing
# Introduction
## Summary
Similar to the approach described in CASSANDRA-12151, we add the
concept of an audit specification. An audit has a target (syslog or a
table) and a set of events/actions that it wants recorded. We
introduce new CQL syntax for Scylla users to describe and manipulate
audit specifications.
Extend the existing `scylla.yaml`-driven audit subsystem with two focused capabilities:
Prior art:
- Microsoft SQL Server [audit
description](https://docs.microsoft.com/en-us/sql/relational-databases/security/auditing/sql-server-audit-database-engine?view=sql-server-ver15)
- pgAudit [docs](https://github.com/pgaudit/pgaudit/blob/master/README.md)
- MySQL audit_log docs in
[MySQL](https://dev.mysql.com/doc/refman/8.0/en/audit-log.html) and
[Azure](https://docs.microsoft.com/en-us/azure/mysql/concepts-audit-logs)
- DynamoDB can [use CloudTrail](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/logging-using-cloudtrail.html) to log all events
1. allow auditing **all keyspaces** without enumerating them one by one
2. allow auditing only a configured set of **roles**
# CQL extensions
The prototype should stay close to the current implementation in `audit/`:
## Create an audit
- keep the existing backends (`table`, `syslog`, or both)
- keep the existing category / keyspace / table filters
- preserve live updates for audit configuration
- avoid any schema change to `audit.audit_log`
This is intentionally a small extension of the current auditing model, not a redesign around new CQL statements such as `CREATE AUDIT`.
## Motivation
Today Scylla exposes three main audit selectors:
- `audit_categories`
- `audit_tables`
- `audit_keyspaces`
This leaves two operational gaps:
1. **Auditing all keyspaces is cumbersome.**
Large installations may create keyspaces dynamically, or manage many tenant keyspaces. Requiring operators to keep
`audit_keyspaces` synchronized with the full keyspace list is error-prone and defeats the point of cluster-wide auditing.
2. **Auditing is all-or-nothing with respect to users.**
Once a category/keyspace/table combination matches, any authenticated user generating that traffic is audited.
Operators want to narrow the scope to specific tenants, service accounts, or privileged roles.
These two additions also work well together: "audit all keyspaces, but only for selected roles" is a practical way to reduce
both audit volume and performance impact.
## Goals
- Add a way to express "all keyspaces" in the current configuration model.
- Add a new role filter that limits auditing to selected roles.
- Preserve backwards compatibility for existing configurations.
- Keep the evaluation cheap on the request path.
- Support live configuration updates, consistent with the existing audit options.
## Non-goals
- Introducing `CREATE AUDIT`, `ALTER AUDIT`, or other new CQL syntax.
- Adding per-role audit destinations.
- Adding different categories per role.
- Expanding role matching through the full granted-role graph in the prototype.
- Changing the on-disk audit table schema.
## Current behavior
At the moment, audit logging is controlled by:
- `audit`
- `audit_categories`
- `audit_tables`
- `audit_keyspaces`
The current decision rule in `audit::should_log()` is effectively:
```text
category matches
&& (
keyspace is listed in audit_keyspaces
|| table is listed in audit_tables
|| category in {AUTH, ADMIN, DCL}
)
```

```cql
CREATE AUDIT [IF NOT EXISTS] audit-name WITH TARGET { SYSLOG | table-name }
[ AND TRIGGER KEYSPACE IN (ks1, ks2, ks3) ]
[ AND TRIGGER TABLE IN (tbl1, tbl2, tbl3) ]
[ AND TRIGGER ROLE IN (usr1, usr2, usr3) ]
[ AND TRIGGER CATEGORY IN (cat1, cat2, cat3) ]
;
```
Observations:
From this point on, every database event that matches all present
triggers will be recorded in the target. When the target is a table,
it behaves like the [current
design](https://docs.scylladb.com/operating-scylla/security/auditing/#table-storage).
- `AUTH`, `ADMIN`, and `DCL` are already global once their category is enabled.
- `DDL`, `DML`, and `QUERY` need a matching keyspace or table.
- An empty `audit_keyspaces` means "audit no keyspaces", not "audit every keyspace".
- There is no role-based filter; the authenticated user is recorded in the log but is not part of the decision.
- The exact implementation to preserve is in `audit/audit.cc` (`should_log()`, `inspect()`, and `inspect_login()`).
The audit name must be different from all other audits, unless IF NOT
EXISTS precedes it, in which case the existing audit must be identical
to the new definition. Case sensitivity and length limit are the same
as for table names.
## Proposed configuration
A trigger kind (i.e., `KEYSPACE`, `TABLE`, `ROLE`, or `CATEGORY`) can be
specified at most once.
### 1. Add `audit_all_keyspaces`
## Show an audit
Introduce a new live-update boolean option:
Examples:
```yaml
# Audit all keyspaces for matching categories
audit_all_keyspaces: true
# Audit all keyspaces for selected roles
audit_all_keyspaces: true
audit_roles: "alice,bob"
```

```cql
DESCRIBE AUDIT [audit-name ...];
```
Semantics:
Prints definitions of all audits named herein. If no names are
provided, prints all audits.
- `audit_all_keyspaces: false` keeps the existing behavior.
- `audit_all_keyspaces: true` makes every keyspace match.
- `audit_keyspaces` keeps its existing meaning: an explicit list of keyspaces, or no keyspace-wide auditing when left empty.
- `audit_all_keyspaces: true` and a non-empty `audit_keyspaces` must be rejected as invalid configuration,
because the two options express overlapping scope in different ways.
- A dedicated boolean is preferable to overloading `audit_keyspaces`, because it avoids changing the meaning of existing configurations.
- This also keeps the behavior aligned with today's `audit_tables` handling, where leaving `audit_tables` empty does not introduce a new wildcard syntax.
## Delete an audit
### 2. Add `audit_roles`
Introduce a new live-update configuration option:
```yaml
audit_roles: "alice,bob,service_api"
```

```cql
DROP AUDIT audit-name;
```
Semantics:
Stops logging events specified by this audit. Doesn't impact the
already logged events. If the target is a table, it remains as it is.
- empty `audit_roles` means **no role filtering**, preserving today's behavior
- non-empty `audit_roles` means audit only requests whose effective logged username matches one of the configured roles
- matching is byte-for-byte exact, using the same role name that is already written to the audit record's `username` column / syslog field
- the prototype should compare against the post-authentication role name from the session and audit log,
with no additional case folding or role-graph expansion
## Alter an audit
Examples:
```yaml
# Audit all roles in a single keyspace (current behavior, made explicit)
audit_keyspaces: "ks1"
audit_roles: ""
# Audit two roles across all keyspaces
audit_all_keyspaces: true
audit_roles: "alice,bob"
# Audit a service role, but only for selected tables
audit_tables: "ks1.orders,ks1.payments"
audit_roles: "billing_service"
```

```cql
ALTER AUDIT audit-name WITH {same syntax as CREATE}
```
## Decision rule after the change
Any trigger provided will be updated (or newly created, if previously
absent). To drop a trigger, use `IN *`.
After the prototype, the rule becomes:
## Permissions
```text
category matches
&& role matches
&& (
category in {AUTH, ADMIN, DCL}
|| audit_all_keyspaces
|| keyspace is listed in audit_keyspaces
|| table is listed in audit_tables
)
```
Only superusers can modify audits or turn them on and off.
Where:
Only superusers can read tables that are audit targets; no user can
modify them. Only superusers can drop tables that are audit targets,
after the audit itself is dropped. If a superuser doesn't drop a
target table, it remains in existence indefinitely.
- `role matches` is always true when `audit_roles` is empty
- `audit_all_keyspaces` is true when the new boolean option is enabled
# Implementation
For login auditing, the rule is simply:
```text
AUTH category enabled && role matches(login username)
```
## Implementation details
### Configuration parsing
Add a new config entry:
- `db::config::audit_all_keyspaces`
- `db::config::audit_roles`
It should mirror the existing audit selectors:
- `audit_all_keyspaces`: type `named_value<bool>`, liveness `LiveUpdate`, default `false`
- `audit_roles`: type `named_value<sstring>`, liveness `LiveUpdate`, default empty string
Parsing changes:
- keep `parse_audit_tables()` as-is
- keep `parse_audit_keyspaces()` semantics as-is
- add `parse_audit_roles()` that returns a set of role names
- normalize empty or whitespace-only keyspace lists to an empty configuration rather than treating them as real keyspace names
- add cross-field validation so `audit_all_keyspaces: true` cannot be combined with a non-empty
`audit_keyspaces`, both at startup and during live updates
To avoid re-parsing on every request, the `audit::audit` service should store:
## Efficient trigger evaluation
```c++
bool _audit_all_keyspaces;
std::set<sstring> _audited_keyspaces;
std::set<sstring> _audited_roles;
namespace audit {
/// Stores triggers from an AUDIT statement.
class triggers {
// Use trie structures for speedy string lookup.
optional<trie> _ks_trigger, _tbl_trigger, _usr_trigger;
// A logical-AND filter.
optional<unsigned> _cat_trigger;
public:
/// True iff every non-null trigger matches the corresponding ainf element.
bool should_audit(const audit_info& ainf);
};
} // namespace audit
```
Using a dedicated boolean keeps the hot-path check straightforward and avoids reinterpreting the existing
`_audited_keyspaces` selector.
To prevent modification of target tables, `audit::inspect()` will
check the statement and throw if it is disallowed, similar to what
`check_access()` currently does.
Using `std::set` for the explicit selectors keeps the prototype aligned with the current implementation and minimizes code churn.
If profiling later shows lookup cost matters here, the container choice can be revisited independently of the feature semantics.
## Persisting audit definitions
### Audit object changes
The current `audit_info` already carries:
- category
- keyspace
- table
- query text
The username is available separately from `service::query_state` and is already passed to storage helpers when an entry is written.
For the prototype there is no need to duplicate the username into `audit_info`.
Instead:
- change `should_log()` to take the effective username as an additional input
- change `should_log_login()` to check the username against `audit_roles`
- keep the storage helpers unchanged, because they already persist the username
- update the existing internal call sites in `inspect()` and `inspect_login()` to pass the username through
One possible interface shape is:
```c++
bool should_log(std::string_view username, const audit_info* info) const;
bool should_log_login(std::string_view username) const;
```
### Role semantics
For the prototype, "role" means the role name already associated with the current client session:
- successful authenticated sessions use the session's user name
- failed login events use the login name from the authentication attempt
- failed login events are still subject to `audit_roles`, matched against the attempted login name
This keeps the feature easy to explain and aligns the filter with what users already see in audit output.
The prototype should **not** try to expand inherited roles. If a user logs in as `alice` and inherits permissions from another role,
the audit filter still matches `alice`. This keeps the behavior deterministic and avoids expensive role graph lookups on the request path.
### Keyspace semantics
`audit_all_keyspaces: true` should affect any statement whose `audit_info` carries a keyspace name.
Important consequences:
- it makes `DDL` / `DML` / `QUERY` auditing effectively cluster-wide
- it does not change the existing global handling of `AUTH`, `ADMIN`, and `DCL`
- statements that naturally have no keyspace name continue to depend on their category-specific behavior
No extra schema or metadata scan is required: the request already carries the keyspace information needed for the decision.
## Backwards compatibility
This design keeps existing behavior intact:
- existing clusters that do not set `audit_roles` continue to audit all roles
- existing clusters that leave `audit_keyspaces` empty continue to audit no keyspaces
- existing explicit keyspace/table lists keep their current meaning
The feature is enabled only by a new explicit boolean, so existing `audit_keyspaces` values do not need to be reinterpreted.
The only newly-invalid combination is enabling `audit_all_keyspaces` while also listing explicit keyspaces.
## Operational considerations
### Performance and volume
`audit_all_keyspaces: true` can significantly increase audit volume, especially with `QUERY` and `DML`.
The intended mitigation is to combine it with:
- a narrow `audit_categories`
- a narrow `audit_roles`
That combination gives operators a simple and cheap filter model:
- first by category
- then by role
- then by keyspace/table scope
### Live updates
`audit_roles` should follow the same live-update behavior as the current audit filters.
Changing:
- `audit_roles`
- `audit_all_keyspaces`
- `audit_keyspaces`
- `audit_tables`
- `audit_categories`
should update the in-memory selectors on all shards without restarting the node.
### Prototype limitation
Because matching is done against the authenticated session role name, `audit_roles` cannot express "audit everyone who inherits role X".
Operators must list the concrete login roles they want to audit. This is a deliberate trade-off in the prototype to keep matching cheap
and avoid role graph lookups on every audited request.
Example: if `alice` inherits permissions from `admin_role`, configuring `audit_roles: "admin_role"` would not audit requests from
`alice`; to audit those requests, `alice` itself must be listed.
### Audit table schema
No schema change is needed. The audit table already includes `username`, which is sufficient for both storage and later analysis.
## Testing plan
The prototype should extend existing audit coverage rather than introduce a separate test framework.
### Parser / unit coverage
Add focused tests for:
- empty `audit_roles`
- specific `audit_roles`
- `audit_all_keyspaces: true`
- invalid mixed configuration: `audit_all_keyspaces: true` with non-empty `audit_keyspaces`
- empty or whitespace-only keyspace lists such as `",,,"` or `" "`, which should normalize to an empty configuration and therefore audit no keyspaces
- boolean config parsing for `audit_all_keyspaces`
### Behavioral coverage
Extend the existing audit tests in `test/cluster/dtest/audit_test.py` with scenarios such as:
1. `audit_all_keyspaces: true` audits statements in multiple keyspaces without listing them explicitly
2. `audit_roles: "alice"` logs requests from `alice` but not from `bob`
3. `audit_all_keyspaces: true` + `audit_roles: "alice"` only logs `alice`'s traffic cluster-wide
4. login auditing respects `audit_roles`
5. live-updating `audit_roles` changes behavior without restart
6. setting `audit_all_keyspaces: true` together with explicit `audit_keyspaces` is rejected with a clear error
## Future evolution
This prototype is deliberately small, but it fits a broader audit-spec design if we decide to revisit that later.
In a future CQL-driven design, these two additions map naturally to triggers such as:
- `TRIGGER KEYSPACE IN *`
- `TRIGGER ROLE IN (...)`
That means the prototype is not throwaway work: it improves the current operational model immediately while keeping a clean path
toward richer audit objects in the future.
Obviously, an audit definition must survive a server restart and stay
consistent among all nodes in a cluster. We'll accomplish both by
storing audits in a system table.

docs/dev/logstor.md Normal file

@@ -0,0 +1,124 @@
# Logstor
## Introduction
Logstor is a log-structured storage engine for ScyllaDB optimized for key-value workloads. It provides an alternative storage backend for key-value tables - tables with a partition key only, with no clustering columns.
Unlike the traditional LSM-tree based storage, logstor uses a log-structured approach with in-memory indexing, making it particularly suitable for workloads with frequent overwrites and point lookups.
## Architecture
Logstor consists of several key components:
### Components
#### Primary Index
The primary index resides entirely in memory and maps each partition key to its location in the log segments. It consists of one B-tree per table, ordered by token.
#### Segment Manager
The `segment_manager` handles the allocation and management of fixed-size segments (default 128KB). Segments are grouped into large files (default 32MB). Key responsibilities include:
- **Segment allocation**: Provides segments for writing new data
- **Space reclamation**: Tracks free space in each segment
- **Compaction**: Copies live data from sparse segments to reclaim space
- **Recovery**: Scans segments on startup to rebuild the index
- **Separator**: Rewrites segments containing records from different compaction groups into new segments, each holding records from a single compaction group.
The data in the segments consists of records of type `log_record`. Each record contains the value for some key as a `canonical_mutation` and additional metadata.
The `segment_manager` receives new writes via a `write_buffer` and writes them sequentially to the active segment with 4k-block alignment.
#### Write Buffer
The `write_buffer` manages a buffer of log records and handles their serialization, including headers and alignment. Multiple records can be written into the buffer, which is then handed to the segment manager as a unit.
The `buffered_writer` manages multiple write buffers for user writes, an active buffer and multiple flushing ones, to batch writes and manage backpressure.
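The 4k-block alignment mentioned above is a standard align-up computation. The sketch below is illustrative only; actual record headers and padding in logstor differ:

```python
BLOCK_SIZE = 4096  # 4k block alignment for segment writes

def align_up(n, alignment=BLOCK_SIZE):
    # Round n up to the next multiple of alignment.
    return (n + alignment - 1) // alignment * alignment

assert align_up(0) == 0
assert align_up(1) == 4096
assert align_up(4096) == 4096
assert align_up(4097) == 8192
```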
### Data Flow
**Write Path:**
1. Application writes mutation to logstor
2. Mutation is converted to a log record
3. Record is written to write buffer
4. The buffer is switched and written to the active segment.
5. Index is updated with new record locations
6. Old record locations (for overwrites) are marked as free
**Read Path:**
1. Application requests data for a partition key
2. Index lookup returns record location
3. Segment manager reads record from disk
4. Record is deserialized into a mutation and returned
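The write and read paths above boil down to an in-memory map from key to log location. The toy model below uses a flat dict instead of a per-table B-tree and a Python list standing in for on-disk segments; it is an assumption-laden sketch, not logstor's data layout:

```python
log = []        # stands in for segments on disk (append-only)
index = {}      # stands in for the in-memory primary index
free = set()    # locations of overwritten (dead) records

def write(key, value):
    if key in index:
        free.add(index[key])   # old location becomes reclaimable
    index[key] = len(log)      # index points at the newest record
    log.append((key, value))

def read(key):
    return log[index[key]][1]

write("k1", "v1")
write("k1", "v2")              # overwrite: old record marked free
assert read("k1") == "v2"
assert free == {0}
```

This is why the engine suits workloads with frequent overwrites: a write never touches old data in place, it only appends and marks the previous location free for later compaction.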
**Separator:**
1. When a record is written to the active segment, it is also written to its compaction group's separator buffer. The separator buffer holds a reference to the original segment.
2. The separator buffer is flushed when it is full, or when a flush is requested for another reason. Its contents are written into a new segment in the compaction group, and the index locations of the records are updated from the original mixed segments to the new segments in the compaction group.
3. After the separator buffer is flushed and all records from the original segment have been moved, it releases its reference to the segment. When there are no more references to the segment, it is freed.
**Compaction:**
1. The amount of live data is tracked for each segment in its segment_descriptor. The segment descriptors are stored in a histogram by live data.
2. A segment set from a single compaction group is submitted for compaction.
3. Compaction picks segments for compaction from the segment set. It chooses segments with the lowest utilization such that compacting them results in a net gain of free segments.
4. It reads the segments, finds all live records, and writes them into a write buffer. When the buffer is full, it is flushed into a new segment, and each record's index entry is updated to point to the new location.
5. After all live records are rewritten the old segments are freed.
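Step 3's selection policy can be sketched as a greedy pass over segments sorted by live data, keeping the prefix with the largest net gain of free segments. This is a simplified model under the default 128KB segment size; the real picker works on a histogram of segment descriptors:

```python
SEGMENT_SIZE = 128 * 1024  # default logstor segment size

def pick_for_compaction(live_bytes):
    """live_bytes: live-data byte count per candidate segment.
    Greedily take lowest-utilization segments first and keep the
    prefix whose compaction frees the most segments on net."""
    order = sorted(range(len(live_bytes)), key=lambda i: live_bytes[i])
    best_k, best_gain, live = 0, 0, 0
    for k, i in enumerate(order, start=1):
        live += live_bytes[i]
        needed = -(-live // SEGMENT_SIZE)  # ceil: segments for rewritten data
        gain = k - needed                  # segments freed minus segments used
        if gain > best_gain:
            best_gain, best_k = gain, k
    return order[:best_k]

# Two sparse segments compact into one new segment (net gain 1);
# the nearly-full and full segments are left alone.
assert pick_for_compaction([10 * 1024, 20 * 1024, 120 * 1024, 128 * 1024]) == [0, 1]
```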
## Usage
### Enabling Logstor
To use logstor, enable it in the configuration:
```yaml
enable_logstor: true
experimental_features:
- logstor
```
### Creating Tables
Tables using logstor must have no clustering columns and must be created with the `storage_engine` property set to 'logstor':
```cql
CREATE TABLE keyspace.user_profiles (
user_id uuid PRIMARY KEY,
name text,
email text,
metadata frozen<map<text, text>>
) WITH storage_engine = 'logstor';
```
### Basic Operations
**Insert/Update:**
```cql
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'value1');
INSERT INTO keyspace.table_name (pk, v) VALUES (2, 'value2');
-- Overwrite with new value
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'updated_value');
```
Currently, updates must write the full row. Updating individual columns is not yet supported. Each write replaces the entire partition.
**Select:**
```cql
SELECT * FROM keyspace.table_name WHERE pk = 1;
-- Returns: (1, 'updated_value')
SELECT pk, v FROM keyspace.table_name WHERE pk = 2;
-- Returns: (2, 'value2')
SELECT * FROM keyspace.table_name;
-- Returns: (1, 'updated_value'), (2, 'value2')
```
**Delete:**
```cql
DELETE FROM keyspace.table_name WHERE pk = 1;
```


@@ -52,7 +52,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
#. Install ScyllaDB packages.
@@ -125,7 +125,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
#. Install ScyllaDB packages.
@@ -133,19 +133,19 @@ Install ScyllaDB
sudo yum install scylla
Running the command installs the latest official version of ScyllaDB Open Source.
Alternatively, you can to install a specific patch version:
Running the command installs the latest official version of ScyllaDB.
Alternatively, you can install a specific patch version:
.. code-block:: console
sudo yum install scylla-<your patch version>
Example: The following example shows the command to install ScyllaDB 5.2.3.
Example: The following example shows installing ScyllaDB 2025.3.1.
.. code-block:: console
:class: hide-copy-button
sudo yum install scylla-5.2.3
sudo yum install scylla-2025.3.1
.. include:: /getting-started/_common/setup-after-install.rst


@@ -36,11 +36,8 @@ release versions, run:
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
Versions 2025.1 and Later
==============================
Run the command with the ``--scylla-version`` option to specify the version
you want to install.
To install a non-default version, run the command with the ``--scylla-version``
option to specify the version you want to install.
**Example**
@@ -50,20 +47,4 @@ you want to install.
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
Versions Earlier than 2025.1
================================
To install a supported version of *ScyllaDB Enterprise*, run the command with:
* ``--scylla-product scylla-enterprise`` to specify that you want to install
ScyllaDB Entrprise.
* ``--scylla-version`` to specify the version you want to install.
For example:
.. code:: console
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
.. include:: /getting-started/_common/setup-after-install.rst


@@ -57,12 +57,11 @@ To enable shared dictionaries:
internode_compression_enable_advanced: true
rpc_dict_training_when: when_leader
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
.. note::
Trained dictionaries contain randomly chosen samples of data transferred between
nodes. The data samples are persisted in the Raft log, which is not encrypted.
As a result, some data from otherwise encrypted tables might be stored on disk
unencrypted.
Some dictionary training data may be encrypted using storage-level encryption
(if enabled) instead of database-level encryption, meaning protection is
applied at the storage layer rather than within the database itself.
Reference


@@ -727,7 +727,12 @@ public:
// now we need one page more to be able to save one for next lap
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
auto buf2 = co_await _input.read_exactly(fill_size);
// If the underlying stream is already at EOF (e.g. buf1 came from
// cached _next while the previous read_exactly drained the source),
// skip the read_exactly call — it would return empty anyway.
auto buf2 = _input.eof()
? temporary_buffer<char>()
: co_await _input.read_exactly(fill_size);
temporary_buffer<char> output(buf1.size() + buf2.size());


@@ -172,6 +172,7 @@ public:
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
gms::feature logstor { *this, "LOGSTOR"sv };
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };


@@ -48,6 +48,7 @@ set(idl_headers
messaging_service.idl.hh
paxos.idl.hh
raft.idl.hh
raft_util.idl.hh
raft_storage.idl.hh
group0.idl.hh
hinted_handoff.idl.hh
@@ -55,6 +56,7 @@ set(idl_headers
storage_proxy.idl.hh
storage_service.idl.hh
strong_consistency/state_machine.idl.hh
logstor.idl.hh
group0_state_machine.idl.hh
mapreduce_request.idl.hh
replica_exception.idl.hh

idl/logstor.idl.hh Normal file

@@ -0,0 +1,28 @@
/*
* Copyright 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "idl/frozen_schema.idl.hh"
#include "idl/token.idl.hh"
#include "mutation/canonical_mutation.hh"
namespace replica {
namespace logstor {
struct primary_index_key {
dht::decorated_key dk;
};
class log_record {
replica::logstor::primary_index_key key;
replica::logstor::record_generation generation;
table_id table;
canonical_mutation mut;
};
}
}


@@ -96,6 +96,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
}
if (!cfg.check_experimental(db::experimental_features_t::feature::LOGSTOR)) {
disabled.insert("LOGSTOR"s);
}
if (!cfg.table_digest_insensitive_to_expiry()) {
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
}


@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
const auto replication_factor = erm.get_replication_factor();
if (read_replicas.size() > replication_factor) {
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
if (read_replicas.size() > replication_factor + 1) {
return seastar::format(
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
read_replicas.size(), replication_factor);
}
} else if (read_replicas.size() > replication_factor) {
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
}
return {};


@@ -531,6 +531,11 @@ tablet_id tablet_map::get_tablet_id(token t) const {
return tablet_id(dht::compaction_group_of(_log2_tablets, t));
}
tablet_range_side tablet_map::get_tablet_range_side(token t) const {
auto id_after_split = dht::compaction_group_of(_log2_tablets + 1, t);
return tablet_range_side(id_after_split & 0x1);
}
std::pair<tablet_id, tablet_range_side> tablet_map::get_tablet_id_and_range_side(token t) const {
auto id_after_split = dht::compaction_group_of(_log2_tablets + 1, t);
auto current_id = id_after_split >> 1;


@@ -611,6 +611,10 @@ public:
/// Returns tablet_id of a tablet which owns a given token.
tablet_id get_tablet_id(token) const;
// Returns the side of the tablet's range that a given token belongs to.
// Less expensive than get_tablet_id_and_range_side() when tablet_id is already known.
tablet_range_side get_tablet_range_side(token) const;
// Returns tablet_id and also the side of the tablet's range that a given token belongs to.
std::pair<tablet_id, tablet_range_side> get_tablet_id_and_range_side(token) const;


@@ -19,8 +19,6 @@
#include "gms/inet_address.hh"
#include "auth/allow_all_authenticator.hh"
#include "auth/allow_all_authorizer.hh"
#include "auth/maintenance_socket_authenticator.hh"
#include "auth/maintenance_socket_role_manager.hh"
#include <seastar/core/future.hh>
#include <seastar/core/signal.hh>
#include <seastar/core/timer.hh>
@@ -1964,6 +1962,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
checkpoint(stop_signal, "loading non-system sstables");
replica::distributed_loader::init_non_system_keyspaces(db, proxy, sys_ks).get();
checkpoint(stop_signal, "recovering logstor");
db.invoke_on_all([] (replica::database& db) {
return db.recover_logstor();
}).get();
// Depends on all keyspaces being initialized because after this call
// we can be reloading schema.
mm.local().register_feature_listeners();
@@ -2102,7 +2105,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
if (cfg->maintenance_socket() != "ignore") {
checkpoint(stop_signal, "starting maintenance auth service");
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client),
auth::make_authorizer_factory(auth::allow_all_authorizer_name, qp),
auth::make_maintenance_socket_authorizer_factory(qp),
auth::make_maintenance_socket_authenticator_factory(qp, group0_client, mm, auth_cache),
auth::make_maintenance_socket_role_manager_factory(qp, group0_client, mm, auth_cache),
maintenance_socket_enabled::yes, std::ref(auth_cache)).get();


@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
.entity = stats.entity,
.progress_units = "",
.progress = tasks::task_manager::task::progress{},
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
};
}


@@ -8,9 +8,10 @@
"""exec_cql.py
Execute CQL statements from a file where each non-empty, non-comment line is exactly one CQL statement.
Connects via a Unix domain socket (maintenance socket), bypassing authentication.
Requires python cassandra-driver. Stops at first failure.
Usage:
./exec_cql.py --file ./conf/auth.cql [--host 127.0.0.1 --port 9042]
./exec_cql.py --file ./conf/auth.cql --socket /path/to/cql.m
"""
import argparse, os, sys
from typing import Sequence
@@ -26,18 +27,27 @@ def read_statements(path: str) -> list[tuple[int, str]]:
stms.append((lineno, line))
return stms
def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout: float, username: str, password: str) -> int:
def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout: float) -> int:
"""Execute CQL statements via a Unix domain socket (maintenance socket).
The maintenance socket only starts listening after the auth subsystem is
fully initialised, so a successful connect means the node is ready.
"""
from cassandra.cluster import Cluster
from cassandra.connection import UnixSocketEndPoint # type: ignore
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
ep = UnixSocketEndPoint(socket_path)
try:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider # type: ignore
except Exception:
print('ERROR: cassandra-driver not installed. Install with: pip install cassandra-driver', file=sys.stderr)
cluster = Cluster(
contact_points=[ep],
load_balancing_policy=WhiteListRoundRobinPolicy([ep]),
)
session = cluster.connect()
except Exception as e:
print(f'ERROR: failed to connect to maintenance socket {socket_path}: {e}', file=sys.stderr)
return 2
auth_provider = None
if username != "":
auth_provider = PlainTextAuthProvider(username=username, password=password)
cluster = Cluster([host], port=port, auth_provider=auth_provider)
session = cluster.connect()
try:
for _, (lineno, s) in enumerate(statements, 1):
try:
@@ -50,13 +60,11 @@ def exec_driver(statements: list[tuple[int, str]], host: str, port: int, timeout
return 0
def main(argv: Sequence[str]) -> int:
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file (driver only)')
ap = argparse.ArgumentParser(description='Execute one-line CQL statements from file via maintenance socket')
ap.add_argument('--file', required=True)
ap.add_argument('--host', default='127.0.0.1')
ap.add_argument('--port', type=int, default=9042)
ap.add_argument('--socket', required=True,
help='Path to the Unix domain maintenance socket (<workdir>/cql.m)')
ap.add_argument('--timeout', type=float, default=30.0)
ap.add_argument('--username', default='cassandra')
ap.add_argument('--password', default='cassandra')
args = ap.parse_args(argv)
if not os.path.isfile(args.file):
print(f"File not found: {args.file}", file=sys.stderr)
@@ -65,7 +73,7 @@ def main(argv: Sequence[str]) -> int:
if not stmts:
print('No statements found', file=sys.stderr)
return 1
rc = exec_driver(stmts, args.host, args.port, args.timeout, args.username, args.password)
rc = exec_statements(stmts, args.socket, args.timeout)
if rc == 0:
print('All statements executed successfully')
return rc


@@ -15,6 +15,7 @@ from typing import Any, Optional
import asyncio
import contextlib
import glob
import hashlib
import json
import logging
import os
@@ -364,12 +365,14 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
llvm_profile_file = f"{addr}-%m.profraw"
scylla_workdir = f"{addr}"
logfile = f"{addr}.log"
socket = maintenance_socket_path(cluster_workdir, addr)
command = [
"env",
f"LLVM_PROFILE_FILE={llvm_profile_file}",
f"SCYLLA_HOME={os.path.realpath(os.getcwd())}", # We assume that the script has Scylla's `conf/` as its filesystem neighbour.
os.path.realpath(executable),
f"--workdir={scylla_workdir}",
f"--maintenance-socket={socket}",
"--ring-delay-ms=0",
"--developer-mode=yes",
"--memory=1G",
@@ -391,6 +394,7 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str,
f"--authenticator=PasswordAuthenticator",
f"--authorizer=CassandraAuthorizer",
] + list(extra_opts)
training_logger.info(f"Using maintenance socket {socket}")
return await run(['bash', '-c', fr"""exec {shlex.join(command)} >{q(logfile)} 2>&1"""], cwd=cluster_workdir)
async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optional[list[str]], workdir: PathLike, cluster_name: str, extra_opts: list[str]) -> list[Process]:
@@ -433,16 +437,25 @@ async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optiona
procs.append(proc)
await wait_for_node(proc, addrs[i], timeout)
except:
await stop_cluster(procs, addrs)
await stop_cluster(procs, addrs, cluster_workdir=workdir)
raise
return procs
async def stop_cluster(procs: list[Process], addrs: list[str]) -> None:
async def stop_cluster(procs: list[Process], addrs: list[str], cluster_workdir: PathLike) -> None:
"""Stops a Scylla cluster started with start_cluster().
Doesn't return until all nodes exit, even if stop_cluster() is cancelled.
"""
await clean_gather(*[cancel_process(p, timeout=60) for p in procs])
_cleanup_short_sockets(cluster_workdir, addrs)
def _cleanup_short_sockets(cluster_workdir: PathLike, addrs: list[str]) -> None:
"""Remove short maintenance socket files created in /tmp."""
for addr in addrs:
try:
os.unlink(maintenance_socket_path(cluster_workdir, addr))
except OSError:
pass
async def wait_for_port(addr: str, port: int) -> None:
await bash(fr'until printf "" >>/dev/tcp/{addr}/{port}; do sleep 0.1; done 2>/dev/null')
@@ -452,6 +465,33 @@ async def merge_profraw(directory: PathLike) -> None:
if glob.glob(f"{directory}/*.profraw"):
await bash(fr"llvm-profdata merge {q(directory)}/*.profraw -output {q(directory)}/prof.profdata")
def maintenance_socket_path(cluster_workdir: PathLike, addr: str) -> str:
"""Return the maintenance socket path for a node.
Returns a short deterministic path in /tmp (derived from an MD5 hash of
the natural ``<cluster_workdir>/<addr>/cql.m`` path) to stay within the
Unix domain socket length limit.
The same path is passed to Scylla via ``--maintenance-socket`` in
``start_node()``.
"""
natural = os.path.realpath(f"{cluster_workdir}/{addr}/cql.m")
path_hash = hashlib.md5(natural.encode()).hexdigest()[:12]
return os.path.join(tempfile.gettempdir(), f'pgo-{path_hash}.m')
async def setup_cassandra_user(workdir: PathLike, addr: str) -> None:
"""Create the ``cassandra`` superuser via the maintenance socket.
The default cassandra superuser is no longer seeded automatically, but
``cassandra-stress`` hardcodes ``user=cassandra password=cassandra``.
We create the role over the maintenance socket so that cassandra-stress
and other tools that rely on the default credentials keep working.
"""
socket = maintenance_socket_path(workdir, addr)
stmt = "CREATE ROLE cassandra WITH PASSWORD = 'cassandra' AND SUPERUSER = true AND LOGIN = true;"
f = q(socket)
# Write the statement to a temp file and execute it via exec_cql.py.
await bash(fr"""tmpf=$(mktemp); echo {q(stmt)} > "$tmpf"; python3 ./exec_cql.py --file "$tmpf" --socket {f}; rc=$?; rm -f "$tmpf"; exit $rc""")
async def get_bolt_opts(executable: PathLike) -> list[str]:
"""Returns the extra opts which have to be passed to a BOLT-instrumented Scylla
to trigger a generation of a BOLT profile file.
@@ -503,7 +543,7 @@ async def with_cluster(executable: PathLike, workdir: PathLike, cpusets: Optiona
yield addrs, procs
finally:
training_logger.info(f"Stopping the cluster in {workdir}")
await stop_cluster(procs, addrs)
await stop_cluster(procs, addrs, cluster_workdir=workdir)
training_logger.info(f"Stopped the cluster in {workdir}")
################################################################################
@@ -557,8 +597,10 @@ def kw(**kwargs):
@contextlib.asynccontextmanager
async def with_cs_populate(executable: PathLike, workdir: PathLike) -> AsyncIterator[str]:
"""Provides a Scylla cluster and waits for compactions to end before stopping it."""
"""Provides a Scylla cluster, creates the cassandra superuser, and waits
for compactions to end before stopping it."""
async with with_cluster(executable=executable, workdir=workdir) as (addrs, procs):
await setup_cassandra_user(workdir, addrs[0])
yield addrs[0]
async with asyncio.timeout(3600):
# Should it also flush memtables?
@@ -667,9 +709,10 @@ populators["decommission_dataset"] = populate_decommission
# AUTH CONNECTIONS STRESS ==================================================
async def populate_auth_conns(executable: PathLike, workdir: PathLike) -> None:
# Create roles, table and permissions via CQL script.
# Create roles, table and permissions via CQL script over the maintenance socket.
async with with_cs_populate(executable=executable, workdir=workdir) as server:
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --host {server}")
socket = maintenance_socket_path(workdir, server)
await bash(fr"python3 ./exec_cql.py --file conf/auth.cql --socket {q(socket)}")
async def train_auth_conns(executable: PathLike, workdir: PathLike) -> None:
# Repeatedly connect as the reader user and perform simple reads to stress
@@ -722,7 +765,8 @@ populators["si_dataset"] = populate_si
async def populate_counters(executable: PathLike, workdir: PathLike) -> None:
async with with_cs_populate(executable=executable, workdir=workdir) as server:
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --host {server}")
socket = maintenance_socket_path(workdir, server)
await bash(fr"python3 ./exec_cql.py --file conf/counters.cql --socket {q(socket)}")
# Sleeps added in reaction to schema disagreement errors.
# FIXME: get rid of this sleep and find a sane way to wait for schema
# agreement.


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34a0955d2c5a88e18ddab0f1df085e10a17e14129c3e21de91e4f27ef949b6c4
size 6502668
oid sha256:d424ce6cc7f65338c34dd35881d23f5ad3425651d66e47dc2c3a20dc798848d4
size 6598648


@@ -68,6 +68,7 @@ public:
using resources = reader_resources;
friend class reader_permit;
friend struct reader_concurrency_semaphore_tester;
enum class evict_reason {
permit, // evicted due to permit shortage


@@ -9,6 +9,9 @@ target_sources(replica
memtable.cc
exceptions.cc
dirty_memory_manager.cc
logstor/segment_manager.cc
logstor/logstor.cc
logstor/write_buffer.cc
multishard_query.cc
mutation_dump.cc
schema_describe_helper.cc


@@ -17,6 +17,7 @@
// FIXME: un-nest compaction_reenabler, so we can forward declare it and remove this include.
#include "compaction/compaction_manager.hh"
#include "locator/tablets.hh"
#include "replica/logstor/compaction.hh"
#include "sstables/sstable_set.hh"
#include "utils/chunked_vector.hh"
#include <absl/container/flat_hash_map.h>
@@ -33,6 +34,10 @@ class effective_replication_map;
namespace replica {
namespace logstor {
class primary_index;
}
using enable_backlog_tracker = bool_class<class enable_backlog_tracker_tag>;
enum class repair_sstable_classification {
@@ -91,6 +96,12 @@ class compaction_group {
bool _tombstone_gc_enabled = true;
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
repair_classifier_func _repair_sstable_classifier;
lw_shared_ptr<logstor::segment_set> _logstor_segments;
std::optional<logstor::separator_buffer> _logstor_separator;
std::vector<future<>> _separator_flushes;
seastar::semaphore _separator_flush_sem{1};
private:
std::unique_ptr<compaction_group_view> make_compacting_view();
std::unique_ptr<compaction_group_view> make_non_compacting_view();
@@ -223,6 +234,7 @@ public:
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept;
// Triggers regular compaction.
void trigger_compaction();
void trigger_logstor_compaction();
bool compaction_disabled() const;
future<unsigned> estimate_pending_compactions() const;
@@ -231,6 +243,7 @@ public:
size_t live_sstable_count() const noexcept;
uint64_t live_disk_space_used() const noexcept;
size_t logstor_disk_space_used() const noexcept;
sstables::file_size_stats live_disk_space_used_full_stats() const noexcept;
uint64_t total_disk_space_used() const noexcept;
sstables::file_size_stats total_disk_space_used_full_stats() const noexcept;
@@ -262,12 +275,37 @@ public:
compaction::compaction_manager& get_compaction_manager() noexcept;
const compaction::compaction_manager& get_compaction_manager() const noexcept;
logstor::segment_manager& get_logstor_segment_manager() noexcept;
const logstor::segment_manager& get_logstor_segment_manager() const noexcept;
logstor::compaction_manager& get_logstor_compaction_manager() noexcept;
const logstor::compaction_manager& get_logstor_compaction_manager() const noexcept;
logstor::primary_index& get_logstor_index() noexcept;
future<> split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info);
void set_repair_sstable_classifier(repair_classifier_func repair_sstable_classifier) {
_repair_sstable_classifier = std::move(repair_sstable_classifier);
}
void add_logstor_segment(logstor::segment_descriptor& desc) {
_logstor_segments->add_segment(desc);
}
future<> discard_logstor_segments();
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
logstor::separator_buffer& get_separator_buffer(size_t write_size);
logstor::segment_set& logstor_segments() noexcept {
return *_logstor_segments;
}
const logstor::segment_set& logstor_segments() const noexcept {
return *_logstor_segments;
}
friend class storage_group;
};
@@ -312,7 +350,14 @@ public:
const compaction_group_ptr& main_compaction_group() const noexcept;
const std::vector<compaction_group_ptr>& split_ready_compaction_groups() const;
compaction_group_ptr& select_compaction_group(locator::tablet_range_side) noexcept;
// Selects the compaction group for the given token. Computes the range side
// from the token only when in splitting mode. This avoids the cost of computing
// range side on the hot path when it's not needed.
compaction_group_ptr& select_compaction_group(dht::token, const locator::tablet_map&) noexcept;
// Selects the compaction group for an sstable spanning a token range.
// If the first and last tokens fall on different sides of the split point,
// the sstable belongs to the main compaction group.
compaction_group_ptr& select_compaction_group(dht::token first, dht::token last, const locator::tablet_map&) noexcept;
uint64_t live_disk_space_used() const;
@@ -432,7 +477,9 @@ public:
// refresh_mutation_source must be called when there are changes to data source
// structures but logical state of data is not changed (e.g. when state for a
// new tablet replica is allocated).
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) = 0;
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;


@@ -76,6 +76,7 @@
#include "locator/abstract_replication_strategy.hh"
#include "timeout_config.hh"
#include "tombstone_gc.hh"
#include "logstor/logstor.hh"
#include "service/qos/service_level_controller.hh"
#include "replica/data_dictionary_impl.hh"
@@ -393,6 +394,13 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
, _system_dirty_memory_manager(*this, 10 << 20, cfg.unspooled_dirty_soft_limit(), default_scheduling_group())
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.50, cfg.unspooled_dirty_soft_limit(), dbcfg.statement_scheduling_group)
, _dirty_memory_threshold_controller([this] {
if (_logstor) {
size_t logstor_memory_usage = get_logstor_memory_usage();
size_t available_memory = _dbcfg.available_memory > logstor_memory_usage ? _dbcfg.available_memory - logstor_memory_usage : 0;
_dirty_memory_manager.update_threshold(available_memory * 0.50);
}
})
, _dbcfg(dbcfg)
, _memtable_controller(make_flush_controller(_cfg, _dbcfg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
auto backlog = (_dirty_memory_manager.unspooled_dirty_memory()) / limit;
@@ -906,6 +914,50 @@ database::init_commitlog() {
});
}
future<>
database::init_logstor() {
dblog.info("Initializing logstor");
auto cfg = logstor::logstor_config{
.segment_manager_cfg = {
.base_dir = std::filesystem::path(_cfg.logstor_directory()),
.file_size = _cfg.logstor_file_size_in_mb() * 1024ull * 1024ull,
.disk_size = _cfg.logstor_disk_size_in_mb() * 1024ull * 1024ull,
.compaction_sg = _dbcfg.compaction_scheduling_group,
.compaction_static_shares = _cfg.compaction_static_shares,
.separator_sg = _dbcfg.memtable_scheduling_group,
.separator_delay_limit_ms = _cfg.logstor_separator_delay_limit_ms(),
.max_separator_memory = _cfg.logstor_separator_max_memory_in_mb() * 1024ull * 1024ull,
},
.flush_sg = _dbcfg.commitlog_scheduling_group,
};
_logstor = std::make_unique<logstor::logstor>(std::move(cfg));
_logstor->set_trigger_compaction_hook([this] {
trigger_logstor_compaction(false);
});
_logstor->set_trigger_separator_flush_hook([this] (size_t seq_num) {
(void)flush_logstor_separator(seq_num);
});
dblog.info("logstor initialized");
co_return;
}
future<>
database::recover_logstor() {
if (!_logstor) {
co_return;
}
co_await _logstor->do_recovery(*this);
co_await _logstor->start();
_dirty_memory_threshold_controller.arm_periodic(std::chrono::seconds(5));
}
future<> database::modify_keyspace_on_all_shards(sharded<database>& sharded_db, std::function<future<>(replica::database&)> func) {
// Run func first on shard 0
// to allow "seeding" of the effective_replication_map
@@ -1128,6 +1180,17 @@ void database::add_column_family(keyspace& ks, schema_ptr schema, column_family:
cf->set_truncation_time(db_clock::time_point::min());
}
if (schema->logstor_enabled()) {
if (!_cfg.enable_logstor()) {
throw std::runtime_error(fmt::format("The table {}.{} is using logstor storage but logstor is not enabled in the configuration", schema->ks_name(), schema->cf_name()));
}
if (!_logstor) {
on_internal_error(dblog, "The table is using logstor but logstor is not initialized");
}
cf->init_logstor(_logstor.get());
dblog.info("Table {}.{} is using logstor storage", schema->ks_name(), schema->cf_name());
}
auto uuid = schema->id();
if (_tables_metadata.contains(uuid)) {
throw std::invalid_argument("UUID " + uuid.to_sstring() + " already mapped");
@@ -1699,7 +1762,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
if (!range.is_singular()) {
continue;
}
auto token = dht::token::to_int64(ranges.front().start()->value().token());
auto token = dht::token::to_int64(range.start()->value().token());
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
// Don't return immediately - account all ranges first
ret = can_proceed::no;
@@ -2163,7 +2226,7 @@ static std::exception_ptr wrap_commitlog_add_error(const schema_ptr& s, const fr
future<> database::apply_with_commitlog(column_family& cf, const mutation& m, db::timeout_clock::time_point timeout) {
db::rp_handle h;
if (cf.commitlog() != nullptr && cf.durable_writes()) {
if (cf.commitlog() != nullptr && cf.durable_writes() && !cf.uses_logstor()) {
auto fm = freeze(m);
std::exception_ptr ex;
try {
@@ -2212,6 +2275,10 @@ future<> database::do_apply_many(const utils::chunked_vector<frozen_mutation>& m
auto s = local_schema_registry().get(muts[i].schema_version());
auto&& cf = find_column_family(muts[i].column_family_id());
if (cf.uses_logstor()) {
continue;
}
if (!cl) {
cl = cf.commitlog();
} else if (cl != cf.commitlog()) {
@@ -2248,16 +2315,16 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
// assume failure until proven otherwise
auto update_writes_failed = defer([&] { ++_stats->total_writes_failed; });
utils::get_local_injector().inject("database_apply", [&s] () {
if (!is_system_keyspace(s->ks_name())) {
throw std::runtime_error("injected error");
co_await utils::get_local_injector().inject("database_apply", [&s] (auto& handler) -> future<> {
if (s->ks_name() != handler.get("ks_name") || s->cf_name() != handler.get("cf_name")) {
co_return;
}
});
co_await utils::get_local_injector().inject("database_apply_wait", [&] (auto& handler) -> future<> {
if (s->cf_name() == handler.get("cf_name")) {
dblog.info("database_apply_wait: wait");
if (handler.get("what") == "throw") {
throw std::runtime_error(format("injected error for {}.{}", s->ks_name(), s->cf_name()));
} else if (handler.get("what") == "wait") {
dblog.info("database_apply: wait");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
dblog.info("database_apply_wait: done");
dblog.info("database_apply: done");
}
});
@@ -2309,7 +2376,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
// frames.
db::rp_handle h;
auto cl = cf.commitlog();
if (cl != nullptr && cf.durable_writes()) {
if (cl != nullptr && cf.durable_writes() && !cf.uses_logstor()) {
std::exception_ptr ex;
try {
commitlog_entry_writer cew(s, m, sync);
@@ -2633,6 +2700,9 @@ future<> database::start(sharded<qos::service_level_controller>& sl_controller,
_compaction_manager.enable();
}
co_await init_commitlog();
if (_cfg.enable_logstor()) {
co_await init_logstor();
}
}
future<> database::shutdown() {
@@ -2673,6 +2743,11 @@ future<> database::stop() {
co_await _commitlog->shutdown();
dblog.info("Shutting down commitlog complete");
}
if (_logstor) {
dblog.info("Shutting down logstor");
co_await _logstor->stop();
dblog.info("Shutting down logstor complete");
}
if (_schema_commitlog) {
dblog.info("Shutting down schema commitlog");
co_await _schema_commitlog->shutdown();
@@ -2807,6 +2882,53 @@ future<> database::drop_cache_for_keyspace_on_all_shards(sharded<database>& shar
});
}
future<> database::trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major) {
return sharded_db.invoke_on_all([major] (replica::database& db) {
return db.trigger_logstor_compaction(major);
});
}
void database::trigger_logstor_compaction(bool major) {
_tables_metadata.for_each_table([&] (table_id id, const lw_shared_ptr<table> tp) {
if (tp->uses_logstor()) {
tp->trigger_logstor_compaction();
}
});
}
future<> database::flush_logstor_separator_on_all_shards(sharded<database>& sharded_db) {
return sharded_db.invoke_on_all([] (replica::database& db) {
return db.flush_logstor_separator();
});
}
future<> database::flush_logstor_separator(std::optional<size_t> seq_num) {
return _tables_metadata.parallel_for_each_table([seq_num] (table_id, lw_shared_ptr<table> table) {
return table->flush_separator(seq_num);
});
}
future<logstor::table_segment_stats> database::get_logstor_table_segment_stats(table_id table) const {
return find_column_family(table).get_logstor_segment_stats();
}
size_t database::get_logstor_memory_usage() const {
if (!_logstor) {
return 0;
}
size_t m = 0;
m += _logstor->get_memory_usage();
get_tables_metadata().for_each_table([&m] (table_id, lw_shared_ptr<replica::table> table) {
if (table->uses_logstor()) {
m += table->get_logstor_memory_usage();
}
});
return m;
}
future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, table_id uuid, sstring tag, db::snapshot_options opts) {
if (!opts.skip_flush) {
co_await flush_table_on_all_shards(sharded_db, uuid);
@@ -2927,6 +3049,7 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
co_await coroutine::parallel_for_each(views, [&] (lw_shared_ptr<replica::table> v) -> future<> {
co_await flush_or_clear(*v);
});
co_await cf.flush_separator();
// Since writes could be appended to active memtable between getting low_mark above
// and flush, the low_mark has to be adjusted to account for those writes, where
// memtable was flushed with a higher replay position than the one obtained above.
@@ -2968,6 +3091,8 @@ future<> database::truncate(db::system_keyspace& sys_ks, column_family& cf, std:
dblog.debug("Discarding sstable data for truncated CF + indexes");
// TODO: notify truncation
co_await cf.discard_logstor_segments();
db::replay_position rp = co_await cf.discard_sstables(truncated_at);
// TODO: indexes.
// Note: since discard_sstables was changed to only count tables owned by this shard,


@@ -16,6 +16,7 @@
#include <seastar/core/execution_stage.hh>
#include <seastar/core/when_all.hh>
#include "replica/global_table_ptr.hh"
#include "replica/logstor/compaction.hh"
#include "types/user.hh"
#include "utils/assert.hh"
#include "utils/hash.hh"
@@ -35,6 +36,7 @@
#include <seastar/core/gate.hh>
#include "db/commitlog/replay_position.hh"
#include "db/commitlog/commitlog_types.hh"
#include "logstor/logstor.hh"
#include "schema/schema_fwd.hh"
#include "db/view/view.hh"
#include "db/snapshot-ctl.hh"
@@ -544,6 +546,9 @@ private:
utils::phased_barrier _flush_barrier;
std::vector<view_ptr> _views;
logstor::logstor* _logstor = nullptr;
std::unique_ptr<logstor::primary_index> _logstor_index;
std::unique_ptr<cell_locker> _counter_cell_locks; // Memory-intensive; allocate only when needed.
// Labels used to identify writes and reads for this table in the rate_limiter structure.
@@ -611,6 +616,10 @@ public:
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
bool add_logstor_segment(logstor::segment_descriptor&, dht::token first_token, dht::token last_token);
logstor::separator_buffer& get_logstor_separator_buffer(dht::token token, size_t write_size);
// Restricted to new sstables produced by external processes such as repair.
// The sstable might undergo split if table is in split mode.
// If no need for split, the input sstable will only be attached to the sstable set.
@@ -833,6 +842,21 @@ public:
// to issue disk operations safely.
void mark_ready_for_writes(db::commitlog* cl);
void init_logstor(logstor::logstor* ls);
bool uses_logstor() const {
return _logstor != nullptr;
}
logstor::primary_index& logstor_index() noexcept {
return *_logstor_index;
}
const logstor::primary_index& logstor_index() const noexcept {
return *_logstor_index;
}
size_t get_logstor_memory_usage() const;
// Creates a mutation reader which covers all data sources for this column family.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// Note: for data queries use query() instead.
@@ -858,6 +882,14 @@ public:
return make_mutation_reader(std::move(schema), std::move(permit), range, full_slice);
}
mutation_reader make_logstor_mutation_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& pr,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const;
// The streaming mutation reader differs from the regular mutation reader in that:
// - Reflects all writes accepted by replica prior to creation of the
// reader and a _bounded_ amount of writes which arrive later.
@@ -1047,6 +1079,7 @@ public:
bool needs_flush() const;
future<> clear(); // discards memtable(s) without flushing them to disk.
future<db::replay_position> discard_sstables(db_clock::time_point);
future<> discard_logstor_segments();
bool can_flush() const;
@@ -1098,6 +1131,7 @@ public:
void start_compaction();
void trigger_compaction();
void try_trigger_compaction(compaction_group& cg) noexcept;
void trigger_logstor_compaction();
// Triggers offstrategy compaction, if needed, in the background.
void trigger_offstrategy_compaction();
// Performs offstrategy compaction, if needed, returning
@@ -1126,6 +1160,22 @@ public:
return _compaction_manager;
}
logstor::segment_manager& get_logstor_segment_manager() noexcept {
return _logstor->get_segment_manager();
}
const logstor::segment_manager& get_logstor_segment_manager() const noexcept {
return _logstor->get_segment_manager();
}
logstor::compaction_manager& get_logstor_compaction_manager() noexcept {
return _logstor->get_compaction_manager();
}
future<> flush_separator(std::optional<size_t> seq_num = std::nullopt);
future<logstor::table_segment_stats> get_logstor_segment_stats() const;
table_stats& get_stats() const {
return _stats;
}
@@ -1613,6 +1663,8 @@ private:
dirty_memory_manager _system_dirty_memory_manager;
dirty_memory_manager _dirty_memory_manager;
timer<lowres_clock> _dirty_memory_threshold_controller;
database_config _dbcfg;
flush_controller _memtable_controller;
drain_progress _drain_progress {};
@@ -1655,6 +1707,8 @@ private:
bool _enable_autocompaction_toggle = false;
querier_cache _querier_cache;
std::unique_ptr<logstor::logstor> _logstor;
std::unique_ptr<db::large_data_handler> _large_data_handler;
std::unique_ptr<db::large_data_handler> _nop_large_data_handler;
@@ -1696,6 +1750,8 @@ public:
std::shared_ptr<data_dictionary::user_types_storage> as_user_types_storage() const noexcept;
const data_dictionary::user_types_storage& user_types() const noexcept;
future<> init_commitlog();
future<> init_logstor();
future<> recover_logstor();
const gms::feature_service& features() const { return _feat; }
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::rp_handle&&, db::timeout_clock::time_point timeout);
future<> apply_in_memory(const mutation& m, column_family& cf, db::rp_handle&&, db::timeout_clock::time_point timeout);
@@ -1996,6 +2052,13 @@ public:
// a wrapper around flush_all_tables, allowing the caller to express intent more clearly
future<> flush_commitlog() { return flush_all_tables(); }
static future<> trigger_logstor_compaction_on_all_shards(sharded<database>& sharded_db, bool major);
void trigger_logstor_compaction(bool major);
static future<> flush_logstor_separator_on_all_shards(sharded<database>& sharded_db);
future<> flush_logstor_separator(std::optional<size_t> seq_num = std::nullopt);
future<logstor::table_segment_stats> get_logstor_table_segment_stats(table_id table) const;
size_t get_logstor_memory_usage() const;
static future<db_clock::time_point> get_all_tables_flushed_at(sharded<database>& sharded_db);
static future<> drop_cache_for_table_on_all_shards(sharded<database>& sharded_db, table_id id);


@@ -142,6 +142,16 @@ void region_group::notify_unspooled_pressure_relieved() {
_relief.signal();
}
void region_group::update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit) {
_cfg.unspooled_hard_limit = unspooled_hard_limit;
_cfg.unspooled_soft_limit = unspooled_soft_limit;
_cfg.real_hard_limit = real_hard_limit;
// check pressure with the new limits
update_real(0);
update_unspooled(0);
}
bool region_group::do_update_real_and_check_relief(ssize_t delta) {
_real_total_memory += delta;
@@ -211,9 +221,18 @@ dirty_memory_manager::dirty_memory_manager(replica::database& db, size_t thresho
.real_hard_limit = threshold,
.start_reclaiming = std::bind_front(&dirty_memory_manager::start_reclaiming, this)
}, deferred_work_sg)
, _threshold(threshold)
, _soft_limit(soft_limit)
, _flush_serializer(1)
, _waiting_flush(flush_when_needed()) {}
void dirty_memory_manager::update_threshold(size_t threshold) {
if (threshold != _threshold) {
_threshold = threshold;
_region_group.update_limits(threshold / 2, threshold * _soft_limit / 2, threshold);
}
}
void
dirty_memory_manager::setup_collectd(sstring namestr) {
namespace sm = seastar::metrics;


@@ -268,6 +268,8 @@ public:
}
void update_unspooled(ssize_t delta);
void update_limits(size_t unspooled_hard_limit, size_t unspooled_soft_limit, size_t real_hard_limit);
void increase_usage(logalloc::region* r) { // Called by memtable's region_listener
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
// least 1.59.
@@ -395,6 +397,9 @@ class dirty_memory_manager {
// memory usage minus bytes that were already written to disk.
dirty_memory_manager_logalloc::region_group _region_group;
size_t _threshold;
double _soft_limit;
// We would like to serialize the flushing of memtables. While flushing many memtables
// simultaneously can sustain high levels of throughput, the memory is not freed until the
// memtable is totally gone. That means that if we have throttled requests, they will stay
@@ -483,6 +488,8 @@ public:
return _region_group;
}
void update_threshold(size_t threshold);
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update_real(-delta);
_region_group.update_unspooled(delta);


@@ -0,0 +1,177 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "types.hh"
#include "utils/chunked_vector.hh"
#include "write_buffer.hh"
#include "utils/log_heap.hh"
namespace replica::logstor {
constexpr log_heap_options segment_descriptor_hist_options(4 * 1024, 3, 128 * 1024);
struct segment_set;
struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options> {
// free_space = segment_size - net_data_size
// initially set to segment_size
// when writing records, decrease by total net data size
// when freeing a record, increase by the record's net data size
size_t free_space{0};
size_t record_count{0};
segment_generation seg_gen{1};
segment_set* owner{nullptr}; // non-owning, set when added to a segment_set
void reset(size_t segment_size) noexcept {
free_space = segment_size;
record_count = 0;
}
size_t net_data_size(size_t segment_size) const noexcept {
return segment_size - free_space;
}
void on_free_segment() noexcept {
++seg_gen;
}
void on_write(size_t net_data_size, size_t cnt = 1) noexcept {
free_space -= net_data_size;
record_count += cnt;
}
void on_write(log_location loc) noexcept {
on_write(loc.size);
}
void on_free(size_t net_data_size, size_t cnt = 1) noexcept {
free_space += net_data_size;
record_count -= cnt;
}
void on_free(log_location loc) noexcept {
on_free(loc.size);
}
};
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
struct segment_set {
segment_descriptor_hist _segments;
size_t _segment_count{0};
void add_segment(segment_descriptor& desc) {
desc.owner = this;
_segments.push(desc);
++_segment_count;
}
void update_segment(segment_descriptor& desc) {
_segments.adjust_up(desc);
}
void remove_segment(segment_descriptor& desc) {
_segments.erase(desc);
desc.owner = nullptr;
--_segment_count;
}
size_t segment_count() const noexcept {
return _segment_count;
}
};
class segment_ref {
struct state {
log_segment_id id;
std::function<void()> on_last_release;
std::function<void()> on_failure;
bool flush_failure{false};
~state() {
if (!flush_failure) {
if (on_last_release) on_last_release();
} else {
if (on_failure) on_failure();
}
}
};
lw_shared_ptr<state> _state;
public:
segment_ref() = default;
// Copyable: copying increments the shared ref count
segment_ref(const segment_ref&) = default;
segment_ref& operator=(const segment_ref&) = default;
segment_ref(segment_ref&&) noexcept = default;
segment_ref& operator=(segment_ref&&) noexcept = default;
log_segment_id id() const noexcept { return _state->id; }
bool empty() const noexcept { return !_state; }
void set_flush_failure() noexcept { if (_state) _state->flush_failure = true; }
private:
friend class segment_manager_impl;
explicit segment_ref(log_segment_id id, std::function<void()> on_last_release, std::function<void()> on_failure)
: _state(make_lw_shared<state>(id, std::move(on_last_release), std::move(on_failure)))
{}
};
struct separator_buffer {
write_buffer* buf;
utils::chunked_vector<future<>> pending_updates;
utils::chunked_vector<segment_ref> held_segments;
std::optional<size_t> min_seq_num;
bool flushed{false};
separator_buffer(write_buffer* wb)
: buf(wb)
{}
~separator_buffer() {
if (!flushed && buf && buf->has_data()) {
for (auto& seg_ref : held_segments) {
seg_ref.set_flush_failure();
}
}
}
separator_buffer(const separator_buffer&) = delete;
separator_buffer& operator=(const separator_buffer&) = delete;
separator_buffer(separator_buffer&&) noexcept = default;
separator_buffer& operator=(separator_buffer&&) noexcept = default;
future<log_location_with_holder> write(log_record_writer writer) {
return buf->write(std::move(writer));
}
bool can_fit(const log_record_writer& writer) const noexcept {
return buf->can_fit(writer);
}
bool can_fit(size_t write_size) const noexcept {
return buf->can_fit(write_size);
}
};
class compaction_manager {
public:
virtual ~compaction_manager() = default;
virtual separator_buffer allocate_separator_buffer() = 0;
virtual future<> flush_separator_buffer(separator_buffer, replica::compaction_group&) = 0;
virtual void submit(replica::compaction_group&) = 0;
virtual future<> stop_ongoing_compactions(replica::compaction_group&) = 0;
};
}

replica/logstor/index.hh Normal file

@@ -0,0 +1,167 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "dht/decorated_key.hh"
#include "dht/ring_position.hh"
#include "types.hh"
#include "utils/bptree.hh"
#include "utils/double-decker.hh"
#include "utils/phased_barrier.hh"
namespace replica::logstor {
class primary_index_entry {
dht::decorated_key _key;
index_entry _e;
struct {
bool _head : 1;
bool _tail : 1;
bool _train : 1;
} _flags{};
public:
primary_index_entry(dht::decorated_key key, index_entry e)
: _key(std::move(key))
, _e(std::move(e))
{ }
primary_index_entry(primary_index_entry&&) noexcept = default;
bool is_head() const noexcept { return _flags._head; }
void set_head(bool v) noexcept { _flags._head = v; }
bool is_tail() const noexcept { return _flags._tail; }
void set_tail(bool v) noexcept { _flags._tail = v; }
bool with_train() const noexcept { return _flags._train; }
void set_train(bool v) noexcept { _flags._train = v; }
const dht::decorated_key& key() const noexcept { return _key; }
const index_entry& entry() const noexcept { return _e; }
friend class primary_index;
friend dht::ring_position_view ring_position_view_to_compare(const primary_index_entry& e) { return e._key; }
};
class primary_index final {
public:
using partitions_type = double_decker<int64_t, primary_index_entry,
dht::raw_token_less_comparator, dht::ring_position_comparator,
16, bplus::key_search::linear>;
private:
partitions_type _partitions;
schema_ptr _schema;
size_t _key_count = 0;
mutable utils::phased_barrier _reads_phaser{"logstor_primary_index"};
public:
explicit primary_index(schema_ptr schema)
: _partitions(dht::raw_token_less_comparator{})
, _schema(std::move(schema))
{}
void set_schema(schema_ptr s) {
_schema = std::move(s);
}
void clear() {
_partitions.clear();
_key_count = 0;
}
utils::phased_barrier::operation start_read() const {
return _reads_phaser.start();
}
future<> await_pending_reads() {
return _reads_phaser.advance_and_await();
}
std::optional<index_entry> get(const primary_index_key& key) const {
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
if (it != _partitions.end()) {
return it->_e;
}
return std::nullopt;
}
std::optional<index_entry> exchange(const primary_index_key& key, index_entry new_entry) {
partitions_type::bound_hint hint;
auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
if (hint.match) {
auto old_entry = i->_e;
i->_e = std::move(new_entry);
return old_entry;
} else {
_partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
++_key_count;
return std::nullopt;
}
}
bool update_record_location(const primary_index_key& key, log_location old_location, log_location new_location) {
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
if (it != _partitions.end()) {
if (it->_e.location == old_location) {
it->_e.location = new_location;
return true;
}
}
return false;
}
std::pair<bool, std::optional<index_entry>> insert_if_newer(const primary_index_key& key, index_entry new_entry) {
partitions_type::bound_hint hint;
auto i = _partitions.lower_bound(key.dk, dht::ring_position_comparator(*_schema), hint);
if (hint.match) {
if (i->_e.generation < new_entry.generation) {
auto old_entry = i->_e;
i->_e = std::move(new_entry);
return {true, std::make_optional(old_entry)};
} else {
return {false, std::make_optional(i->_e)};
}
} else {
_partitions.emplace_before(i, key.dk.token().raw(), hint, key.dk, std::move(new_entry));
++_key_count;
return {true, std::nullopt};
}
}
bool erase(const primary_index_key& key, log_location loc) {
auto it = _partitions.find(key.dk, dht::ring_position_comparator(*_schema));
if (it != _partitions.end() && it->_e.location == loc) {
it.erase(dht::raw_token_less_comparator{});
--_key_count;
return true;
}
return false;
}
auto begin() const noexcept { return _partitions.begin(); }
auto end() const noexcept { return _partitions.end(); }
bool empty() const noexcept { return _partitions.empty(); }
size_t get_key_count() const noexcept { return _key_count; }
size_t get_memory_usage() const noexcept { return _key_count * sizeof(index_entry); }
// First entry with key >= pos (for positioning at range start)
partitions_type::const_iterator lower_bound(const dht::ring_position_view& pos) const {
return _partitions.lower_bound(pos, dht::ring_position_comparator(*_schema));
}
// First entry with key strictly > key (for advancing past a key after a yield)
partitions_type::const_iterator upper_bound(const dht::decorated_key& key) const {
return _partitions.upper_bound(key, dht::ring_position_comparator(*_schema));
}
};
}

replica/logstor/logstor.cc Normal file

@@ -0,0 +1,297 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "replica/logstor/logstor.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/util/log.hh>
#include <seastar/core/future.hh>
#include "readers/from_mutations.hh"
#include "keys/keys.hh"
#include "replica/logstor/segment_manager.hh"
#include "replica/logstor/types.hh"
#include "utils/managed_bytes.hh"
#include <openssl/ripemd.h>
#include <openssl/evp.h>
namespace replica::logstor {
seastar::logger logstor_logger("logstor");
logstor::logstor(logstor_config config)
: _segment_manager(config.segment_manager_cfg)
, _write_buffer(_segment_manager, config.flush_sg) {
}
future<> logstor::do_recovery(replica::database& db) {
co_await _segment_manager.do_recovery(db);
}
future<> logstor::start() {
logstor_logger.info("Starting logstor");
co_await _segment_manager.start();
co_await _write_buffer.start();
logstor_logger.info("logstor started");
}
future<> logstor::stop() {
logstor_logger.info("Stopping logstor");
co_await _write_buffer.stop();
co_await _segment_manager.stop();
logstor_logger.info("logstor stopped");
}
size_t logstor::get_memory_usage() const {
return _segment_manager.get_memory_usage();
}
future<> logstor::write(const mutation& m, compaction_group& cg, seastar::gate::holder cg_holder) {
primary_index_key key(m.decorated_key());
table_id table = m.schema()->id();
auto& index = cg.get_logstor_index();
// TODO ?
record_generation gen = index.get(key)
.transform([](const index_entry& entry) {
return entry.generation + 1;
}).value_or(record_generation(1));
log_record record {
.key = key,
.generation = gen,
.table = table,
.mut = canonical_mutation(m)
};
return _write_buffer.write(std::move(record), &cg, std::move(cg_holder)).then_unpack([this, &index, gen, key = std::move(key)]
(log_location location, seastar::gate::holder op) {
index_entry new_entry {
.location = location,
.generation = gen,
};
auto old_entry = index.exchange(key, std::move(new_entry));
// If overwriting, free old record
if (old_entry) {
_segment_manager.free_record(old_entry->location);
}
}).handle_exception([] (std::exception_ptr ep) {
logstor_logger.error("Error writing mutation: {}", ep);
return make_exception_future<>(ep);
});
}
future<std::optional<log_record>> logstor::read(const primary_index& index, primary_index_key key) {
auto op = index.start_read();
auto entry_opt = index.get(key);
if (!entry_opt.has_value()) {
return make_ready_future<std::optional<log_record>>(std::nullopt);
}
const auto& entry = *entry_opt;
return _segment_manager.read(entry.location).then([key = std::move(key), op = std::move(op)] (log_record record) {
return std::optional<log_record>(std::move(record));
}).handle_exception([] (std::exception_ptr ep) {
logstor_logger.error("Error reading record: {}", ep);
return make_exception_future<std::optional<log_record>>(ep);
});
}
future<std::optional<canonical_mutation>> logstor::read(const schema& s, const primary_index& index, const dht::decorated_key& dk) {
primary_index_key key(dk);
return read(index, key).then([&dk] (std::optional<log_record> record_opt) -> std::optional<canonical_mutation> {
if (!record_opt.has_value()) {
return std::nullopt;
}
auto& record = *record_opt;
if (record.mut.key() != dk.key()) [[unlikely]] {
throw std::runtime_error(fmt::format(
"Key mismatch reading log entry: expected {}, got {}",
dk.key(), record.mut.key()
));
}
return std::optional<canonical_mutation>(std::move(record.mut));
});
}
segment_manager& logstor::get_segment_manager() noexcept {
return _segment_manager;
}
const segment_manager& logstor::get_segment_manager() const noexcept {
return _segment_manager;
}
compaction_manager& logstor::get_compaction_manager() noexcept {
return _segment_manager.get_compaction_manager();
}
const compaction_manager& logstor::get_compaction_manager() const noexcept {
return _segment_manager.get_compaction_manager();
}
mutation_reader logstor::make_reader(schema_ptr schema,
const primary_index& index,
reader_permit permit,
const dht::partition_range& pr,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state) {
class logstor_range_reader : public mutation_reader::impl {
logstor* _logstor;
const primary_index& _index;
dht::partition_range _pr;
query::partition_slice _slice;
tracing::trace_state_ptr _trace_state;
std::optional<dht::decorated_key> _last_key; // owns the key, safe across yields
mutation_reader_opt _current_partition_reader;
dht::ring_position_comparator _cmp;
// Finds the next iterator to process, safe to call after any co_await
primary_index::partitions_type::const_iterator find_next() const {
auto it = _last_key
? _index.upper_bound(*_last_key) // strictly after last key
: position_at_range_start(); // initial positioning
// An exclusive start bound is handled in position_at_range_start()
return it;
}
primary_index::partitions_type::const_iterator position_at_range_start() const {
if (!_pr.start()) {
return _index.begin();
}
auto it = _index.lower_bound(_pr.start()->value());
if (!_pr.start()->is_inclusive() && it != _index.end()) {
if (_cmp(it->key(), _pr.start()->value()) == 0) {
++it;
}
}
return it;
}
bool exceeds_range_end(const primary_index_entry& e) const {
if (!_pr.end()) return false;
auto c = _cmp(e.key(), _pr.end()->value());
return _pr.end()->is_inclusive() ? c > 0 : c >= 0;
}
public:
logstor_range_reader(schema_ptr s, const primary_index& idx, reader_permit p,
logstor* ls, dht::partition_range pr,
query::partition_slice slice, tracing::trace_state_ptr ts)
: impl(std::move(s), std::move(p))
, _logstor(ls), _index(idx), _pr(std::move(pr))
, _slice(std::move(slice)), _trace_state(std::move(ts))
, _cmp(*_schema)
{}
virtual future<> fill_buffer() override {
while (!is_buffer_full() && !_end_of_stream) {
// Drain current partition's reader first
if (_current_partition_reader) {
co_await _current_partition_reader->fill_buffer();
_current_partition_reader->move_buffer_content_to(*this);
if (!_current_partition_reader->is_end_of_stream()) {
continue;
}
co_await _current_partition_reader->close();
_current_partition_reader = std::nullopt;
// _last_key was already set when we opened the reader
}
// Find next key in range (safe after co_await since we use _last_key)
auto it = find_next();
if (it == _index.end() || exceeds_range_end(*it)) {
_end_of_stream = true;
break;
}
// Snapshot the key before yielding
auto current_key = it->key();
auto guard = reader_permit::awaits_guard(_permit);
auto cmut = co_await _logstor->read(*_schema, _index, current_key);
_last_key = current_key; // mark as visited even if not found (tombstoned)
if (!cmut) {
continue; // key was removed between index lookup and read
}
tracing::trace(_trace_state, "logstor_range_reader: fetched key {}", current_key);
_current_partition_reader = make_mutation_reader_from_mutations(
_schema, _permit, cmut->to_mutation(_schema),
_slice, streamed_mutation::forwarding::no
);
}
}
virtual future<> next_partition() override {
clear_buffer_to_next_partition();
if (!is_buffer_empty()) return make_ready_future<>();
_end_of_stream = false;
if (_current_partition_reader) {
auto fut = _current_partition_reader->close();
_current_partition_reader = std::nullopt;
return fut;
}
return make_ready_future<>();
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
clear_buffer();
_end_of_stream = false;
_pr = pr;
_last_key = std::nullopt; // re-position from new range start
if (_current_partition_reader) {
auto fut = _current_partition_reader->close();
_current_partition_reader = std::nullopt;
return fut;
}
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr) override {
if (_current_partition_reader) {
clear_buffer();
return _current_partition_reader->fast_forward_to(std::move(pr));
}
return make_ready_future<>();
}
virtual future<> close() noexcept override {
if (_current_partition_reader) {
return _current_partition_reader->close();
}
return make_ready_future<>();
}
};
return make_mutation_reader<logstor_range_reader>(
std::move(schema), index, std::move(permit), this, pr, slice, std::move(trace_state)
);
}
void logstor::set_trigger_compaction_hook(std::function<void()> fn) {
_segment_manager.set_trigger_compaction_hook(std::move(fn));
}
void logstor::set_trigger_separator_flush_hook(std::function<void(size_t)> fn) {
_segment_manager.set_trigger_separator_flush_hook(std::move(fn));
}
}


@@ -0,0 +1,81 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/future.hh>
#include <seastar/core/temporary_buffer.hh>
#include <optional>
#include <seastar/core/scheduling.hh>
#include "readers/mutation_reader.hh"
#include "replica/compaction_group.hh"
#include "types.hh"
#include "index.hh"
#include "segment_manager.hh"
#include "write_buffer.hh"
#include "mutation/mutation.hh"
#include "dht/decorated_key.hh"
namespace replica {
class compaction_group;
class database;
namespace logstor {
extern seastar::logger logstor_logger;
struct logstor_config {
segment_manager_config segment_manager_cfg;
seastar::scheduling_group flush_sg;
};
class logstor {
segment_manager _segment_manager;
buffered_writer _write_buffer;
public:
explicit logstor(logstor_config);
logstor(const logstor&) = delete;
logstor& operator=(const logstor&) = delete;
future<> do_recovery(replica::database&);
future<> start();
future<> stop();
size_t get_memory_usage() const;
segment_manager& get_segment_manager() noexcept;
const segment_manager& get_segment_manager() const noexcept;
compaction_manager& get_compaction_manager() noexcept;
const compaction_manager& get_compaction_manager() const noexcept;
future<> write(const mutation&, compaction_group&, seastar::gate::holder cg_holder);
future<std::optional<log_record>> read(const primary_index&, primary_index_key);
future<std::optional<canonical_mutation>> read(const schema&, const primary_index&, const dht::decorated_key&);
/// Create a mutation reader for a specific key
mutation_reader make_reader(schema_ptr schema,
const primary_index& index,
reader_permit permit,
const dht::partition_range& pr,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state = nullptr);
void set_trigger_compaction_hook(std::function<void()> fn);
void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
};
} // namespace logstor
} // namespace replica

(File diff suppressed because it is too large.)


@@ -0,0 +1,128 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <cstdint>
#include <filesystem>
#include <seastar/core/shared_future.hh>
#include <seastar/core/file.hh>
#include <seastar/core/rwlock.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/shared_ptr.hh>
#include "bytes_fwd.hh"
#include "replica/logstor/write_buffer.hh"
#include "types.hh"
#include "utils/updateable_value.hh"
namespace replica {
class database;
namespace logstor {
class compaction_manager;
class segment_set;
class primary_index;
static constexpr size_t default_segment_size = 128 * 1024;
static constexpr size_t default_file_size = 32 * 1024 * 1024;
/// Configuration for the segment manager
struct segment_manager_config {
std::filesystem::path base_dir;
size_t segment_size = default_segment_size;
size_t file_size = default_file_size;
size_t disk_size;
bool compaction_enabled = true;
size_t max_segments_per_compaction = 8;
seastar::scheduling_group compaction_sg;
utils::updateable_value<float> compaction_static_shares;
seastar::scheduling_group separator_sg;
uint32_t separator_delay_limit_ms;
size_t max_separator_memory = 1 * 1024 * 1024;
};
struct table_segment_histogram_bucket {
size_t count;
size_t max_data_size;
table_segment_histogram_bucket& operator+=(const table_segment_histogram_bucket& other) {
count += other.count;
max_data_size = std::max(max_data_size, other.max_data_size);
return *this;
}
};
struct table_segment_stats {
size_t compaction_group_count{0};
size_t segment_count{0};
std::vector<table_segment_histogram_bucket> histogram;
table_segment_stats& operator+=(const table_segment_stats& other) {
compaction_group_count += other.compaction_group_count;
segment_count += other.segment_count;
histogram.resize(std::max(histogram.size(), other.histogram.size()));
for (size_t i = 0; i < other.histogram.size(); i++) {
histogram[i] += other.histogram[i];
}
return *this;
}
};
class segment_manager_impl;
class log_index;
class segment_manager {
std::unique_ptr<segment_manager_impl> _impl;
private:
segment_manager_impl& get_impl() noexcept;
const segment_manager_impl& get_impl() const noexcept;
public:
static constexpr size_t block_alignment = 4096;
explicit segment_manager(segment_manager_config config);
~segment_manager();
segment_manager(const segment_manager&) = delete;
segment_manager& operator=(const segment_manager&) = delete;
future<> do_recovery(replica::database&);
future<> start();
future<> stop();
future<log_location> write(write_buffer& wb);
future<log_record> read(log_location location);
void free_record(log_location location);
future<> for_each_record(const std::vector<log_segment_id>& segments,
std::function<future<>(log_location, log_record)> callback);
compaction_manager& get_compaction_manager() noexcept;
const compaction_manager& get_compaction_manager() const noexcept;
void set_trigger_compaction_hook(std::function<void()> fn);
void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
size_t get_segment_size() const noexcept;
future<> discard_segments(segment_set&);
size_t get_memory_usage() const;
future<> await_pending_writes();
friend class segment_manager_impl;
};
}
}

replica/logstor/types.hh (new file, 80 lines)

@@ -0,0 +1,80 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <cstdint>
#include <fmt/format.h>
#include "mutation/canonical_mutation.hh"
#include "replica/logstor/utils.hh"
#include "dht/decorated_key.hh"
namespace replica::logstor {
struct log_segment_id {
uint32_t value;
bool operator==(const log_segment_id& other) const noexcept = default;
auto operator<=>(const log_segment_id& other) const noexcept = default;
};
struct log_location {
log_segment_id segment;
uint32_t offset;
uint32_t size;
bool operator==(const log_location& other) const noexcept = default;
};
struct primary_index_key {
dht::decorated_key dk;
};
using record_generation = generation_base<uint16_t>;
using segment_generation = generation_base<uint16_t>;
struct index_entry {
log_location location;
record_generation generation;
bool operator==(const index_entry& other) const noexcept = default;
};
struct log_record {
primary_index_key key;
record_generation generation;
table_id table;
canonical_mutation mut;
};
}
// Format specialization declarations and implementations
template <>
struct fmt::formatter<replica::logstor::log_segment_id> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const replica::logstor::log_segment_id& id, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "segment({})", id.value);
}
};
template <>
struct fmt::formatter<replica::logstor::log_location> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const replica::logstor::log_location& loc, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{{segment:{}, offset:{}, size:{}}}",
loc.segment, loc.offset, loc.size);
}
};
template <>
struct fmt::formatter<replica::logstor::primary_index_key> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const replica::logstor::primary_index_key& key, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "{}", key.dk);
}
};

replica/logstor/utils.hh (new file, 104 lines)

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <concepts>
#include "serializer.hh"
namespace replica::logstor {
// An unsigned integer that can be incremented and compared with wraparound semantics.
template <std::unsigned_integral T>
class generation_base {
T _value;
public:
using underlying = T;
constexpr generation_base() noexcept : _value(0) {}
constexpr explicit generation_base(T value) noexcept : _value(value) {}
constexpr T value() const noexcept { return _value; }
constexpr generation_base& operator++() noexcept {
++_value;
return *this;
}
constexpr generation_base operator++(int) noexcept {
auto old = *this;
++_value;
return old;
}
constexpr generation_base& operator+=(T delta) noexcept {
_value += delta;
return *this;
}
constexpr generation_base operator+(T delta) const noexcept {
return generation_base(_value + delta);
}
constexpr bool operator==(const generation_base& other) const noexcept = default;
/// Comparison using wraparound semantics.
/// Returns true if this generation is less than other, accounting for wraparound.
/// Assumes generations are within half the value space of each other.
constexpr bool operator<(const generation_base& other) const noexcept {
// Use signed comparison after converting difference to signed type
// This handles wraparound: if diff > max/2, it's treated as negative
using signed_type = std::make_signed_t<T>;
auto diff = static_cast<signed_type>(_value - other._value);
return diff < 0;
}
constexpr bool operator<=(const generation_base& other) const noexcept {
return *this == other || *this < other;
}
constexpr bool operator>(const generation_base& other) const noexcept {
return other < *this;
}
constexpr bool operator>=(const generation_base& other) const noexcept {
return other <= *this;
}
};
}
template <std::unsigned_integral T>
struct fmt::formatter<replica::logstor::generation_base<T>> : fmt::formatter<T> {
template <typename FormatContext>
auto format(const replica::logstor::generation_base<T>& gen, FormatContext& ctx) const {
return fmt::formatter<T>::format(gen.value(), ctx);
}
};
namespace ser {
template <std::unsigned_integral T>
struct serializer<replica::logstor::generation_base<T>> {
template <typename Output>
static void write(Output& out, const replica::logstor::generation_base<T>& g) {
serializer<typename replica::logstor::generation_base<T>::underlying>::write(out, g.value());
}
template <typename Input>
static replica::logstor::generation_base<T> read(Input& in) {
auto val = serializer<typename replica::logstor::generation_base<T>::underlying>::read(in);
return replica::logstor::generation_base<T>(val);
}
template <typename Input>
static void skip(Input& in) {
serializer<typename replica::logstor::generation_base<T>::underlying>::skip(in);
}
};
}


@@ -0,0 +1,278 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "write_buffer.hh"
#include "segment_manager.hh"
#include "bytes_fwd.hh"
#include "logstor.hh"
#include "replica/logstor/types.hh"
#include <seastar/core/simple-stream.hh>
#include <seastar/core/with_scheduling_group.hh>
#include <seastar/core/on_internal_error.hh>
#include "serializer_impl.hh"
#include "idl/logstor.dist.hh"
#include "idl/logstor.dist.impl.hh"
#include <seastar/core/align.hh>
#include <seastar/core/aligned_buffer.hh>
namespace replica::logstor {
void log_record_writer::compute_size() const {
seastar::measuring_output_stream ms;
ser::serialize(ms, _record);
_size = ms.size();
}
void log_record_writer::write(ostream& out) const {
ser::serialize(out, _record);
}
// write_buffer
write_buffer::write_buffer(size_t buffer_size, bool with_record_copy)
: _buffer_size(buffer_size)
, _buffer(seastar::allocate_aligned_buffer<char>(buffer_size, 4096))
, _with_record_copy(with_record_copy)
{
if (_with_record_copy) {
_records_copy.reserve(_buffer_size / 100);
}
reset();
}
void write_buffer::reset() {
_stream = seastar::simple_memory_output_stream(_buffer.get(), _buffer_size);
_header_stream = _stream.write_substream(buffer_header_size);
_buffer_header = {};
_net_data_size = 0;
_record_count = 0;
_written = {};
_records_copy.clear();
_write_gate = {};
}
future<> write_buffer::close() {
if (!_write_gate.is_closed()) {
co_await _write_gate.close();
}
}
size_t write_buffer::get_max_write_size() const noexcept {
return _buffer_size - (buffer_header_size + record_header_size);
}
bool write_buffer::can_fit(size_t data_size) const noexcept {
// Calculate total space needed including header, data, and alignment padding
auto total_size = record_header_size + data_size;
auto aligned_size = align_up(total_size, record_alignment);
return aligned_size <= _stream.size();
}
bool write_buffer::has_data() const noexcept {
return offset_in_buffer() > buffer_header_size;
}
future<log_location_with_holder> write_buffer::write(log_record_writer writer, compaction_group* cg, seastar::gate::holder cg_holder) {
const auto data_size = writer.size();
if (!can_fit(data_size)) {
throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", data_size, _stream.size()));
}
auto rh = record_header {
.data_size = data_size
};
ser::serialize(_stream, rh);
// Write actual data
size_t data_offset_in_buffer = offset_in_buffer();
auto data_out = _stream.write_substream(data_size);
writer.write(data_out);
_net_data_size += data_size;
_record_count++;
// Add padding to align record
pad_to_alignment(record_alignment);
auto record_location = [data_offset_in_buffer, data_size] (log_location base_location) {
return log_location {
.segment = base_location.segment,
.offset = base_location.offset + data_offset_in_buffer,
.size = data_size
};
};
if (_with_record_copy) {
_records_copy.push_back(record_in_buffer {
.writer = std::move(writer),
.offset_in_buffer = data_offset_in_buffer,
.data_size = data_size,
.loc = _written.get_shared_future().then(record_location),
.cg = cg,
.cg_holder = std::move(cg_holder)
});
}
// hold the write buffer until the write is complete, and pass the holder to the
// caller for follow-up operations that should continue holding the buffer, such
// as index updates.
auto op = _write_gate.hold();
return _written.get_shared_future().then([record_location, op = std::move(op)] (log_location base_location) mutable {
return std::make_tuple(record_location(base_location), std::move(op));
});
}
future<log_location> write_buffer::write_no_holder(log_record_writer writer) {
// Write and leave the gate immediately after the write.
// Use with care, only when the gate is not needed.
return write(std::move(writer)).then_unpack([] (log_location loc, seastar::gate::holder op) {
return loc;
});
}
void write_buffer::pad_to_alignment(size_t alignment) {
auto current_pos = offset_in_buffer();
auto next_pos = align_up(current_pos, alignment);
auto padding = next_pos - current_pos;
if (padding > 0) {
_stream.fill('\0', padding);
}
}
void write_buffer::finalize(size_t alignment) {
_buffer_header.data_size = static_cast<uint32_t>(offset_in_buffer() - buffer_header_size);
pad_to_alignment(alignment);
}
void write_buffer::write_header(segment_generation seg_gen) {
_buffer_header.magic = buffer_header_magic;
_buffer_header.seg_gen = seg_gen;
ser::serialize<buffer_header>(_header_stream, _buffer_header);
}
future<> write_buffer::complete_writes(log_location base_location) {
_written.set_value(base_location);
co_await close();
}
future<> write_buffer::abort_writes(std::exception_ptr ex) {
if (!_written.available()) {
_written.set_exception(std::move(ex));
}
co_await close();
}
std::vector<write_buffer::record_in_buffer>& write_buffer::records() {
if (!_with_record_copy) {
on_internal_error(logstor_logger, "requesting records but the write buffer has no record copy enabled");
}
return _records_copy;
}
size_t write_buffer::estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size) {
// Calculate total size needed including headers and alignment padding
size_t total_size = record_header_size * record_count + net_data_size;
// The estimate ignores per-record alignment padding, so add a 10% overhead factor.
total_size = static_cast<size_t>(total_size * 1.1);
return align_up(total_size, segment_size) / segment_size;
}
// buffered_writer
buffered_writer::buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg)
: _sm(sm)
, _available_buffers(num_flushing_buffers)
, _flush_sg(flush_sg) {
_buffers.reserve(num_flushing_buffers + 1);
for (size_t i = 0; i < num_flushing_buffers + 1; ++i) {
_buffers.emplace_back(_sm.get_segment_size(), true);
}
_active_buffer = active_buffer {
.buf = &_buffers[0],
};
for (size_t i = 1; i < num_flushing_buffers + 1; ++i) {
_available_buffers.push(&_buffers[i]);
}
}
future<> buffered_writer::start() {
logstor_logger.info("Starting write buffer");
co_return;
}
future<> buffered_writer::stop() {
if (_async_gate.is_closed()) {
co_return;
}
logstor_logger.info("Stopping write buffer");
co_await _async_gate.close();
logstor_logger.info("Write buffer stopped");
}
future<log_location_with_holder> buffered_writer::write(log_record record, compaction_group* cg, seastar::gate::holder cg_holder) {
auto holder = _async_gate.hold();
log_record_writer writer(std::move(record));
if (writer.size() > _active_buffer.buf->get_max_write_size()) {
throw std::runtime_error(fmt::format("Write size {} exceeds buffer size {}", writer.size(), _active_buffer.buf->get_max_write_size()));
}
// Check if write fits in current buffer
while (!_active_buffer.buf->can_fit(writer)) {
co_await _buffer_switched.wait();
}
// Write to buffer at current position
auto fut = _active_buffer.buf->write(std::move(writer), cg, std::move(cg_holder));
// Trigger flush for the active buffer if not in progress
if (!std::exchange(_active_buffer.flush_requested, true)) {
(void)with_gate(_async_gate, [this] {
return switch_buffer().then([this] (write_buffer* old_buf) mutable {
return with_scheduling_group(_flush_sg, [this, old_buf] mutable {
return flush(old_buf);
});
});
});
}
co_return co_await std::move(fut);
}
future<write_buffer*> buffered_writer::switch_buffer() {
// Wait for and get the next available buffer
auto new_buf = co_await _available_buffers.pop_eventually();
auto next_active_buffer = active_buffer {
.buf = std::move(new_buf),
};
auto old_active_buffer = std::exchange(_active_buffer, std::move(next_active_buffer));
_buffer_switched.broadcast();
co_return std::move(old_active_buffer.buf);
}
future<> buffered_writer::flush(write_buffer* buf) {
co_await _sm.write(*buf);
// Return the flushed buffer to the available queue
buf->reset();
_available_buffers.push(std::move(buf));
}
}


@@ -0,0 +1,294 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/future.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/temporary_buffer.hh>
#include <seastar/core/aligned_buffer.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/scheduling.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/simple-stream.hh>
#include <seastar/core/shared_future.hh>
#include "types.hh"
#include "serializer.hh"
namespace replica {
class compaction_group;
namespace logstor {
class segment_manager;
// Writer for log records that handles serialization and size computation
class log_record_writer {
using ostream = seastar::simple_memory_output_stream;
log_record _record;
mutable std::optional<size_t> _size;
void compute_size() const;
public:
explicit log_record_writer(log_record record)
: _record(std::move(record))
{}
// Get serialized size (computed lazily)
size_t size() const {
if (!_size) {
compute_size();
}
return *_size;
}
// Write the record to an output stream
void write(ostream& out) const;
const log_record& record() const {
return _record;
}
};
using log_location_with_holder = std::tuple<log_location, seastar::gate::holder>;
// Manages a single aligned buffer for accumulating records and writing
// them to the segment manager.
//
// usage:
//
// create write buffer with specified size:
// write_buffer wb(buffer_size);
// write data to the buffer if fits and get a future for the log location when flushed:
// log_record_writer writer(record);
// auto loc_fut = wb.write(writer);
// flush the buffer to the segment manager:
// co_await sm.write(wb);
// await individual write locations:
// auto record_loc = co_await std::move(loc_fut);
class write_buffer {
public:
using ostream = seastar::simple_memory_output_stream;
// buffer: buffer_header | record_1 | ... | record_n | 0-padding
// record: record_header | record_data | 0-padding
//
// buffer_header and record are aligned by record_alignment
// buffer_header and record_header have explicit sizes and serialization below
static constexpr uint32_t buffer_header_magic = 0x4c475342;
static constexpr size_t record_alignment = 8;
struct buffer_header {
uint32_t magic;
uint32_t data_size; // size of all records data following the buffer_header
segment_generation seg_gen;
uint16_t reserved1;
uint32_t reserved2;
};
static constexpr size_t buffer_header_size = 3 * sizeof(uint32_t) + sizeof(uint16_t) + sizeof(segment_generation::underlying);
static_assert(buffer_header_size % record_alignment == 0, "Buffer header size must be aligned by record_alignment");
struct record_header {
uint32_t data_size; // size of the record data following the record_header
};
static constexpr size_t record_header_size = sizeof(uint32_t);
private:
using aligned_buffer_type = std::unique_ptr<char[], free_deleter>;
size_t _buffer_size;
aligned_buffer_type _buffer;
seastar::simple_memory_output_stream _stream;
buffer_header _buffer_header;
seastar::simple_memory_output_stream _header_stream;
size_t _net_data_size{0};
size_t _record_count{0};
shared_promise<log_location> _written;
seastar::gate _write_gate;
struct record_in_buffer {
log_record_writer writer;
size_t offset_in_buffer;
size_t data_size;
future<log_location> loc;
compaction_group* cg;
seastar::gate::holder cg_holder;
};
bool _with_record_copy;
std::vector<record_in_buffer> _records_copy;
public:
write_buffer(size_t buffer_size, bool with_record_copy);
void reset();
write_buffer(const write_buffer&) = delete;
write_buffer& operator=(const write_buffer&) = delete;
write_buffer(write_buffer&&) noexcept = default;
write_buffer& operator=(write_buffer&&) noexcept = default;
future<> close();
size_t get_buffer_size() const noexcept { return _buffer_size; }
size_t offset_in_buffer() const noexcept { return _buffer_size - _stream.size(); }
bool can_fit(size_t data_size) const noexcept;
bool can_fit(const log_record_writer& writer) const noexcept {
return can_fit(writer.size());
}
bool has_data() const noexcept;
size_t get_max_write_size() const noexcept;
size_t get_net_data_size() const noexcept { return _net_data_size; }
size_t get_record_count() const noexcept { return _record_count; }
// Write a record to the buffer.
// Returns a future that will be resolved with the log location once flushed and a gate holder
// that keeps the write buffer open. The gate should be held for index updates after the write
// is done.
future<log_location_with_holder> write(log_record_writer, compaction_group*, seastar::gate::holder cg_holder);
future<log_location_with_holder> write(log_record_writer writer) {
return write(std::move(writer), nullptr, {});
}
// Write a record to the buffer.
// Returns a future that will be resolved with the log location once flushed.
// If there are follow-up operations to the write such as index updates then consider
// using write_with_holder instead to keep the write buffer open until those operations are complete.
future<log_location> write_no_holder(log_record_writer);
static size_t estimate_required_segments(size_t net_data_size, size_t record_count, size_t segment_size);
private:
const char* data() const noexcept { return _buffer.get(); }
void write_header(segment_generation);
// Get all records written to the buffer.
// with_record_copy must be set to true when creating the write_buffer.
std::vector<record_in_buffer>& records();
/// Complete all tracked writes with their locations when the buffer is flushed to base_location
future<> complete_writes(log_location base_location);
future<> abort_writes(std::exception_ptr);
void pad_to_alignment(size_t alignment);
void finalize(size_t alignment);
friend class segment_manager_impl;
friend class compaction_manager_impl;
};
// Manages multiple buffers, a single active buffer and multiple flushing buffers.
// When a switch is requested for the active buffer, it waits for a flushing buffer to
// become available, and continues to accumulate writes until then.
class buffered_writer {
static constexpr size_t num_flushing_buffers = 4;
segment_manager& _sm;
struct active_buffer {
write_buffer* buf;
bool flush_requested{false};
} _active_buffer;
std::vector<write_buffer> _buffers;
seastar::queue<write_buffer*> _available_buffers;
seastar::gate _async_gate;
seastar::condition_variable _buffer_switched;
seastar::scheduling_group _flush_sg;
public:
explicit buffered_writer(segment_manager& sm, seastar::scheduling_group flush_sg);
buffered_writer(const buffered_writer&) = delete;
buffered_writer& operator=(const buffered_writer&) = delete;
future<> start();
future<> stop();
future<log_location_with_holder> write(log_record, compaction_group* cg = nullptr, seastar::gate::holder cg_holder = {});
private:
future<write_buffer*> switch_buffer();
future<> flush(write_buffer*);
};
}
}
namespace ser {
template <>
struct serializer<replica::logstor::write_buffer::buffer_header> {
template <typename Output>
static void write(Output& out, const replica::logstor::write_buffer::buffer_header& h) {
serializer<uint32_t>::write(out, h.magic);
serializer<uint32_t>::write(out, h.data_size);
serializer<replica::logstor::segment_generation>::write(out, h.seg_gen);
serializer<uint16_t>::write(out, h.reserved1);
serializer<uint32_t>::write(out, h.reserved2);
}
template <typename Input>
static replica::logstor::write_buffer::buffer_header read(Input& in) {
replica::logstor::write_buffer::buffer_header h;
h.magic = serializer<uint32_t>::read(in);
h.data_size = serializer<uint32_t>::read(in);
h.seg_gen = serializer<replica::logstor::segment_generation>::read(in);
h.reserved1 = serializer<uint16_t>::read(in);
h.reserved2 = serializer<uint32_t>::read(in);
return h;
}
template <typename Input>
static void skip(Input& in) {
serializer<uint32_t>::skip(in);
serializer<uint32_t>::skip(in);
serializer<replica::logstor::segment_generation>::skip(in);
serializer<uint16_t>::skip(in);
serializer<uint32_t>::skip(in);
}
};
template <>
struct serializer<replica::logstor::write_buffer::record_header> {
template <typename Output>
static void write(Output& out, const replica::logstor::write_buffer::record_header& h) {
serializer<uint32_t>::write(out, h.data_size);
}
template <typename Input>
static replica::logstor::write_buffer::record_header read(Input& in) {
replica::logstor::write_buffer::record_header h;
h.data_size = serializer<uint32_t>::read(in);
return h;
}
template <typename Input>
static void skip(Input& in) {
serializer<uint32_t>::skip(in);
}
};
} // namespace ser


@@ -217,6 +217,17 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
}
}
mutation_reader
table::make_logstor_mutation_reader(schema_ptr s,
reader_permit permit,
const dht::partition_range& pr,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
return _logstor->make_reader(std::move(s), logstor_index(), std::move(permit), pr, slice, std::move(trace_state));
}
mutation_reader
table::make_mutation_reader(schema_ptr s,
reader_permit permit,
@@ -229,6 +240,10 @@ table::make_mutation_reader(schema_ptr s,
return (*_virtual_reader).make_mutation_reader(s, std::move(permit), range, slice, trace_state, fwd, fwd_mr);
}
if (_logstor) [[unlikely]] {
return make_logstor_mutation_reader(s, std::move(permit), range, slice, std::move(trace_state), fwd, fwd_mr);
}
std::vector<mutation_reader> readers;
// We're assuming that cache and memtables are both read atomically
@@ -716,7 +731,9 @@ public:
return make_ready_future<>();
}
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override {}
compaction_group& compaction_group_for_token(dht::token token) const override {
return get_compaction_group();
@@ -762,6 +779,11 @@ public:
}
};
struct background_merge_guard {
compaction::compaction_reenabler compaction_guard;
locator::effective_replication_map_ptr erm_guard;
};
class tablet_storage_group_manager final : public storage_group_manager {
replica::table& _t;
locator::host_id _my_host_id;
@@ -782,7 +804,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
utils::phased_barrier _merge_fiber_barrier;
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
// Holds compaction reenabler which disables compaction temporarily during tablet merge
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
private:
const schema_ptr& schema() const {
return _t.schema();
@@ -806,7 +828,8 @@ private:
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
// are merged into a new storage group with id (X >> 1).
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
// When merge completes, compaction groups of sibling tablets are added to same storage
// group, but they're not merged yet into one, since the merge completion handler happens
@@ -822,9 +845,8 @@ private:
return tablet_map().get_tablet_id(t).value();
}
std::pair<size_t, locator::tablet_range_side> storage_group_of(dht::token t) const {
auto [id, side] = tablet_map().get_tablet_id_and_range_side(t);
auto idx = id.value();
size_t storage_group_of(dht::token t) const {
auto idx = tablet_id_for_token(t);
#ifndef SCYLLA_BUILD_MODE_RELEASE
if (idx >= tablet_count()) {
on_fatal_internal_error(tlogger, format("storage_group_of: index out of range: idx={} size_log2={} size={} token={}",
@@ -836,7 +858,7 @@ private:
idx, sg.token_range(), t));
}
#endif
return { idx, side };
return idx;
}
repair_classifier_func make_repair_sstable_classifier_func() const {
@@ -900,7 +922,9 @@ public:
std::exchange(_stop_fut, make_ready_future())).discard_result();
}
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override;
compaction_group& compaction_group_for_token(dht::token token) const override;
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
@@ -911,7 +935,7 @@ public:
return log2ceil(tablet_map().tablet_count());
}
storage_group& storage_group_for_token(dht::token token) const override {
return storage_group_for_id(storage_group_of(token).first);
return storage_group_for_id(storage_group_of(token));
}
locator::combined_load_stats table_load_stats() const override;
@@ -959,9 +983,20 @@ size_t storage_group::to_idx(locator::tablet_range_side side) const {
return size_t(side);
}
compaction_group_ptr& storage_group::select_compaction_group(locator::tablet_range_side side) noexcept {
compaction_group_ptr& storage_group::select_compaction_group(dht::token token, const locator::tablet_map& tmap) noexcept {
if (splitting_mode()) {
return _split_ready_groups[to_idx(side)];
return _split_ready_groups[to_idx(tmap.get_tablet_range_side(token))];
}
return _main_cg;
}
compaction_group_ptr& storage_group::select_compaction_group(dht::token first, dht::token last, const locator::tablet_map& tmap) noexcept {
if (splitting_mode()) {
auto first_side = tmap.get_tablet_range_side(first);
auto last_side = tmap.get_tablet_range_side(last);
if (first_side == last_side) {
return _split_ready_groups[to_idx(first_side)];
}
}
return _main_cg;
}
@@ -1056,6 +1091,38 @@ future<> compaction_group::split(compaction::compaction_type_options::split opt,
}
}
future<> compaction_group::discard_logstor_segments() {
auto& sm = get_logstor_segment_manager();
co_await sm.discard_segments(*_logstor_segments);
}
future<> compaction_group::flush_separator(std::optional<size_t> seq_num) {
auto units = co_await get_units(_separator_flush_sem, 1);
auto pending = std::exchange(_separator_flushes, {});
if (_logstor_separator && (!seq_num || _logstor_separator->min_seq_num < *seq_num)) {
auto& cm = get_logstor_compaction_manager();
auto b = std::move(*_logstor_separator);
_logstor_separator.reset();
pending.push_back(cm.flush_separator_buffer(std::move(b), *this));
}
co_await when_all(pending.begin(), pending.end());
}
logstor::separator_buffer& compaction_group::get_separator_buffer(size_t write_size) {
if (!_logstor_separator || !_logstor_separator->can_fit(write_size)) {
auto& cm = get_logstor_compaction_manager();
if (_logstor_separator) {
auto b = std::move(*_logstor_separator);
_logstor_separator.reset();
std::erase_if(_separator_flushes, [](future<>& f) { return f.available(); });
_separator_flushes.push_back(cm.flush_separator_buffer(std::move(b), *this));
}
_logstor_separator.emplace(cm.allocate_separator_buffer());
}
return *_logstor_separator;
}
future<> storage_group::split(compaction::compaction_type_options::split opt, tasks::task_info tablet_split_task_info) {
if (set_split_mode()) {
co_return;
@@ -1222,9 +1289,9 @@ storage_group& table::storage_group_for_id(size_t i) const {
}
compaction_group& tablet_storage_group_manager::compaction_group_for_token(dht::token token) const {
auto [idx, range_side] = storage_group_of(token);
auto idx = storage_group_of(token);
auto& sg = storage_group_for_id(idx);
return *sg.select_compaction_group(range_side);
return *sg.select_compaction_group(token, tablet_map());
}
compaction_group& table::compaction_group_for_token(dht::token token) const {
@@ -1265,8 +1332,8 @@ compaction_group& table::compaction_group_for_key(partition_key_view key, const
}
compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(const sstables::shared_sstable& sst) const {
auto [first_id, first_range_side] = storage_group_of(sst->get_first_decorated_key().token());
auto [last_id, last_range_side] = storage_group_of(sst->get_last_decorated_key().token());
auto first_id = storage_group_of(sst->get_first_decorated_key().token());
auto last_id = storage_group_of(sst->get_last_decorated_key().token());
auto sstable_desc = [] (const sstables::shared_sstable& sst) {
auto& identifier_opt = sst->sstable_identifier();
@@ -1289,12 +1356,10 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
try {
auto& sg = storage_group_for_id(first_id);
if (first_range_side != last_range_side) {
return *sg.main_compaction_group();
}
return *sg.select_compaction_group(first_range_side);
return *sg.select_compaction_group(
sst->get_first_decorated_key().token(),
sst->get_last_decorated_key().token(),
tablet_map());
} catch (std::out_of_range& e) {
on_internal_error(tlogger, format("Unable to load SSTable {} of tablet {}, due to {}",
sstable_desc(sst),
@@ -1465,6 +1530,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
sstables::offstrategy offstrategy) {
std::vector<sstables::shared_sstable> ret, ssts;
std::exception_ptr ex;
log_level failure_log_level = log_level::error;
try {
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
auto& cg = compaction_group_for_sstable(new_sst);
@@ -1486,6 +1552,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
sst = nullptr;
}
} catch (compaction::compaction_stopped_exception&) {
failure_log_level = log_level::warn;
ex = std::current_exception();
} catch (...) {
ex = std::current_exception();
}
@@ -1493,13 +1562,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
if (ex) {
// on failed split, input sstable is unlinked here.
if (new_sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
co_await new_sst->unlink();
}
// on failure after successful split, sstables not attached yet will be unlinked
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
@@ -1513,6 +1582,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
std::function<future<>(sstables::shared_sstable)> on_add) {
std::exception_ptr ex;
std::vector<sstables::shared_sstable> ret;
log_level failure_log_level = log_level::error;
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
// so the exception handling below will only have to unlink sstables not processed yet.
@@ -1522,14 +1592,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
std::ranges::move(ssts, std::back_inserter(ret));
}
} catch (compaction::compaction_stopped_exception&) {
failure_log_level = log_level::warn;
ex = std::current_exception();
} catch (...) {
ex = std::current_exception();
}
if (ex) {
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
@@ -1568,6 +1641,19 @@ table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector
}
}
bool table::add_logstor_segment(logstor::segment_descriptor& seg_desc, dht::token first_token, dht::token last_token) {
auto& cg = compaction_group_for_token(first_token);
if (&cg != &compaction_group_for_token(last_token)) {
return false;
}
cg.add_logstor_segment(seg_desc);
return true;
}
logstor::separator_buffer& table::get_logstor_separator_buffer(dht::token token, size_t write_size) {
return compaction_group_for_token(token).get_separator_buffer(write_size);
}
// Handles permit management only, used for situations where we don't want to inform
// the compaction manager about backlogs (i.e., tests)
class permit_monitor : public sstables::write_monitor {
@@ -1765,7 +1851,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
});
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
// signal a memtable was sealed
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
});
undo_stats.reset();
@@ -2021,8 +2109,15 @@ size_t compaction_group::live_sstable_count() const noexcept {
return _main_sstables->size() + _maintenance_sstables->size();
}
size_t compaction_group::logstor_disk_space_used() const noexcept {
if (!_logstor_segments || !_t.uses_logstor()) {
return 0;
}
return _logstor_segments->segment_count() * _t.get_logstor_segment_manager().get_segment_size();
}
uint64_t compaction_group::live_disk_space_used() const noexcept {
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk();
return _main_sstables->bytes_on_disk() + _maintenance_sstables->bytes_on_disk() + logstor_disk_space_used();
}
sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() const noexcept {
@@ -2372,6 +2467,12 @@ void table::trigger_compaction() {
});
}
void table::trigger_logstor_compaction() {
for_each_compaction_group([] (compaction_group& cg) {
cg.trigger_logstor_compaction();
});
}
void table::try_trigger_compaction(compaction_group& cg) noexcept {
try {
cg.trigger_compaction();
@@ -2380,6 +2481,51 @@ void table::try_trigger_compaction(compaction_group& cg) noexcept {
}
}
future<> table::flush_separator(std::optional<size_t> seq_num) {
if (!uses_logstor()) {
co_return;
}
// wait for all previous writes to be written to a separator buffer
co_await get_logstor_segment_manager().await_pending_writes();
// flush separator buffers
co_await parallel_foreach_compaction_group([seq_num] (compaction_group& cg) {
return cg.flush_separator(seq_num);
});
}
future<logstor::table_segment_stats> table::get_logstor_segment_stats() const {
logstor::table_segment_stats result;
if (!uses_logstor()) {
co_return std::move(result);
}
const auto segment_size = get_logstor_segment_manager().get_segment_size();
const auto bucket_count = 32;
const auto bucket_size = segment_size / bucket_count;
result.histogram.resize(bucket_count);
co_await const_cast<table*>(this)->parallel_foreach_compaction_group([&] (const compaction_group& cg) -> future<> {
const auto& cg_segments = cg.logstor_segments();
result.compaction_group_count++;
result.segment_count += cg_segments.segment_count();
for (const auto& desc : cg_segments._segments) {
co_await coroutine::maybe_yield();
auto data_size = desc.net_data_size(segment_size);
auto bucket_index = std::min<size_t>(data_size / bucket_size, bucket_count - 1);
auto& bucket = result.histogram[bucket_index];
bucket.count++;
bucket.max_data_size = std::max(bucket.max_data_size, data_size);
}
});
co_return std::move(result);
}
void compaction_group::trigger_compaction() {
// But not if we're locked out or stopping
if (!_async_gate.is_closed()) {
@@ -2390,6 +2536,14 @@ void compaction_group::trigger_compaction() {
}
}
void compaction_group::trigger_logstor_compaction() {
if (!_async_gate.is_closed() && !_t.is_auto_compaction_disabled_by_user()) {
if (_logstor_segments) {
get_logstor_compaction_manager().submit(*this);
}
}
}
void table::trigger_offstrategy_compaction() {
// Run in background.
// This is safe since the compaction task is tracked

@@ -2846,6 +3000,7 @@ compaction_group::compaction_group(table& t, size_t group_id, dht::token_range t
, _async_gate(format("[compaction_group {}.{} {}]", t.schema()->ks_name(), t.schema()->cf_name(), group_id))
, _backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
, _repair_sstable_classifier(std::move(repair_classifier))
, _logstor_segments(make_lw_shared<logstor::segment_set>())
{
}
@@ -2879,9 +3034,13 @@ future<> compaction_group::stop(sstring reason) noexcept {
for (auto view : all_views()) {
co_await _t._compaction_manager.stop_ongoing_compactions(reason, view);
}
if (_t.uses_logstor()) {
co_await get_logstor_compaction_manager().stop_ongoing_compactions(*this);
}
co_await _async_gate.close();
auto flush_future = co_await seastar::coroutine::as_future(flush());
co_await flush_separator();
co_await _flush_gate.close();
co_await _sstable_add_gate.close();
// FIXME: indentation
@@ -3198,7 +3357,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
}
}
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap,
const locator::tablet_map& new_tmap) {
auto table_id = schema()->id();
size_t old_tablet_count = old_tmap.tablet_count();
size_t new_tablet_count = new_tmap.tablet_count();
@@ -3222,7 +3383,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
for (auto& view : new_cg->all_views()) {
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
_compaction_reenablers_for_merging.push_back(std::move(cre));
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
}
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
@@ -3255,7 +3416,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
_merge_completion_event.signal();
}
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
void tablet_storage_group_manager::update_effective_replication_map(
const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source)
{
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
@@ -3271,7 +3436,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
}
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
}
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
@@ -3357,7 +3522,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
};
if (uses_tablets()) {
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
}
if (old_erm) {
old_erm->invalidate();
@@ -4002,6 +4167,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
}
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
auto close_lister = deferred_close(lister);
while (auto de = lister.get().get()) {
auto snapshot_name = de->name;
all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -4009,6 +4175,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
auto& sd = all_snapshots.at(snapshot_name);
sd.total += details.total;
sd.live += details.live;
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in get_snapshot_details");
}).get();
}
}
return all_snapshots;
@@ -4028,53 +4197,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
}
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
std::exception_ptr ex;
try {
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
}).get();
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
}
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
co_return false;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
} catch (...) {
ex = std::current_exception();
}
co_await lister.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return details;
@@ -4261,6 +4443,18 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
co_return rp;
}
future<> table::discard_logstor_segments() {
if (!uses_logstor()) {
co_return;
}
_logstor_index->clear();
co_await parallel_foreach_compaction_group([] (compaction_group& cg) {
return cg.discard_logstor_segments();
});
}
void table::mark_ready_for_writes(db::commitlog* cl) {
if (!_readonly) {
on_internal_error(dblog, ::format("table {}.{} is already writable", _schema->ks_name(), _schema->cf_name()));
@@ -4271,6 +4465,19 @@ void table::mark_ready_for_writes(db::commitlog* cl) {
_readonly = false;
}
void table::init_logstor(logstor::logstor* ls) {
_logstor = ls;
_logstor_index = std::make_unique<logstor::primary_index>(_schema);
}
size_t table::get_logstor_memory_usage() const {
size_t m = 0;
if (_logstor_index) {
m += _logstor_index->get_memory_usage();
}
return m;
}
db::commitlog* table::commitlog() const {
if (_readonly) [[unlikely]] {
on_internal_error(dblog, ::format("table {}.{} is readonly", _schema->ks_name(), _schema->cf_name()));
@@ -4295,6 +4502,9 @@ void table::set_schema(schema_ptr s) {
if (_counter_cell_locks) {
_counter_cell_locks->set_schema(s);
}
if (_logstor_index) {
_logstor_index->set_schema(s);
}
_schema = std::move(s);
for (auto&& v : _views) {
@@ -4522,6 +4732,11 @@ future<> table::apply(const mutation& m, db::rp_handle&& h, db::timeout_clock::t
auto& cg = compaction_group_for_token(m.token());
auto holder = cg.async_gate().hold();
if (_logstor) [[unlikely]] {
return _logstor->write(m, cg, std::move(holder));
}
return dirty_memory_region_group().run_when_memory_available([this, &m, h = std::move(h), &cg, holder = std::move(holder)] () mutable {
do_apply(cg, std::move(h), m);
}, timeout);
@@ -4537,6 +4752,10 @@ future<> table::apply(const frozen_mutation& m, schema_ptr m_schema, db::rp_hand
auto& cg = compaction_group_for_key(m.key(), m_schema);
auto holder = cg.async_gate().hold();
if (_logstor) [[unlikely]] {
return _logstor->write(m.unfreeze(m_schema), cg, std::move(holder));
}
return dirty_memory_region_group().run_when_memory_available([this, &m, m_schema = std::move(m_schema), h = std::move(h), &cg, holder = std::move(holder)]() mutable {
do_apply(cg, std::move(h), m, m_schema);
}, timeout);
@@ -4641,13 +4860,14 @@ table::query(schema_ptr query_schema,
}
std::optional<full_position> last_pos;
if (querier_opt && querier_opt->current_position()) {
last_pos.emplace(*querier_opt->current_position());
}
if (!saved_querier || (querier_opt && !querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
co_await querier_opt->close();
querier_opt = {};
if (querier_opt) {
if (querier_opt->current_position()) {
last_pos.emplace(*querier_opt->current_position());
}
if (!saved_querier || (!querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
co_await querier_opt->close();
querier_opt = {};
}
}
if (saved_querier) {
*saved_querier = std::move(querier_opt);
@@ -4737,6 +4957,10 @@ table::enable_auto_compaction() {
// see table::disable_auto_compaction() notes.
_compaction_disabled_by_user = false;
trigger_compaction();
if (uses_logstor()) {
trigger_logstor_compaction();
}
}
future<>
@@ -4768,11 +4992,18 @@ table::disable_auto_compaction() {
// - it will break computation of major compaction descriptor
// for new submissions
_compaction_disabled_by_user = true;
return with_gate(_async_gate, [this] {
return parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
});
auto holder = _async_gate.hold();
co_await parallel_foreach_compaction_group_view([this] (compaction::compaction_group_view& view) {
return _compaction_manager.stop_ongoing_compactions("disable auto-compaction", &view, compaction::compaction_type::Compaction);
});
if (uses_logstor()) {
co_await parallel_foreach_compaction_group([this] (compaction_group& cg) {
return get_logstor_compaction_manager().stop_ongoing_compactions(cg);
});
}
}
void table::set_tombstone_gc_enabled(bool tombstone_gc_enabled) noexcept {
@@ -4985,6 +5216,26 @@ const compaction::compaction_manager& compaction_group::get_compaction_manager()
return _t.get_compaction_manager();
}
logstor::segment_manager& compaction_group::get_logstor_segment_manager() noexcept {
return _t.get_logstor_segment_manager();
}
const logstor::segment_manager& compaction_group::get_logstor_segment_manager() const noexcept {
return _t.get_logstor_segment_manager();
}
logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() noexcept {
return _t.get_logstor_compaction_manager();
}
const logstor::compaction_manager& compaction_group::get_logstor_compaction_manager() const noexcept {
return _t.get_logstor_compaction_manager();
}
logstor::primary_index& compaction_group::get_logstor_index() noexcept {
return _t.logstor_index();
}
compaction::compaction_group_view& compaction_group::as_view_for_static_sharding() const {
return view_for_unrepaired_data();
}


@@ -87,6 +87,11 @@ target_include_directories(wasmtime_bindings
target_link_libraries(wasmtime_bindings
INTERFACE Rust::rust_combined)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
# The PCH from scylla-precompiled-header is compiled with Seastar's compile
# flags, including sanitizer flags in Debug/Sanitize modes. Any target reusing
# this PCH must have matching compile options, otherwise the compiler rejects
# the PCH due to flag mismatch (e.g., -fsanitize=address).
target_link_libraries(wasmtime_bindings PRIVATE Seastar::seastar)
target_precompile_headers(wasmtime_bindings REUSE_FROM scylla-precompiled-header)
endif()
@@ -108,5 +113,6 @@ target_include_directories(inc
target_link_libraries(inc
INTERFACE Rust::rust_combined)
if (Scylla_USE_PRECOMPILED_HEADER_USE)
target_link_libraries(inc PRIVATE Seastar::seastar)
target_precompile_headers(inc REUSE_FROM scylla-precompiled-header)
endif()


@@ -592,6 +592,7 @@ bool operator==(const schema::user_properties& lhs, const schema::user_propertie
&& lhs.compaction_strategy == rhs.compaction_strategy
&& lhs.compaction_strategy_options == rhs.compaction_strategy_options
&& lhs.compaction_enabled == rhs.compaction_enabled
&& lhs.storage_engine == rhs.storage_engine
&& lhs.caching_options == rhs.caching_options
&& lhs.tablet_options == rhs.tablet_options
&& lhs.get_paxos_grace_seconds() == rhs.get_paxos_grace_seconds()
@@ -698,6 +699,7 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) {
feed_hash(h, r._view_info);
feed_hash(h, r._indices_by_name);
feed_hash(h, r._is_counter);
feed_hash(h, r._props.storage_engine);
for (auto&& [name, ext] : r._props.extensions) {
feed_hash(h, name);
@@ -874,6 +876,9 @@ auto fmt::formatter<schema>::format(const schema& s, fmt::format_context& ctx) c
out = fmt::format_to(out, ",minIndexInterval={}", s._raw._props.min_index_interval);
out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._props.max_index_interval);
out = fmt::format_to(out, ",speculativeRetry={}", s._raw._props.speculative_retry.to_sstring());
if (s.storage_engine() != storage_engine_type::normal) {
out = fmt::format_to(out, ",storage_engine={}", storage_engine_type_to_sstring(s.storage_engine()));
}
out = fmt::format_to(out, ",tablets={{");
if (s._raw._props.tablet_options) {
n = 0;
@@ -1210,6 +1215,9 @@ fragmented_ostringstream& schema::schema_properties(const schema_describe_helper
os << "\n AND memtable_flush_period_in_ms = " << fmt::to_string(memtable_flush_period());
os << "\n AND min_index_interval = " << fmt::to_string(min_index_interval());
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
if (storage_engine() != storage_engine_type::normal) {
os << "\n AND storage_engine = '" << storage_engine_type_to_sstring(storage_engine()) << "'";
}
if (has_tablet_options()) {
os << "\n AND tablets = {";


@@ -175,6 +175,21 @@ public:
bool operator==(const speculative_retry& other) const = default;
};
enum class storage_engine_type {
normal,
logstor,
};
inline sstring storage_engine_type_to_sstring(storage_engine_type t) {
switch (t) {
case storage_engine_type::normal:
return "normal";
case storage_engine_type::logstor:
return "logstor";
}
throw std::invalid_argument(format("unknown storage engine type: {:d}\n", uint8_t(t)));
}
using index_options_map = std::unordered_map<sstring, sstring>;
enum class index_metadata_kind {
@@ -561,6 +576,7 @@ public:
compaction::compaction_strategy_type compaction_strategy = compaction::compaction_strategy_type::incremental;
std::map<sstring, sstring> compaction_strategy_options;
bool compaction_enabled = true;
storage_engine_type storage_engine = storage_engine_type::normal;
::caching_options caching_options;
std::optional<std::map<sstring, sstring>> tablet_options;
@@ -776,6 +792,14 @@ public:
return _raw._props.compaction_enabled;
}
storage_engine_type storage_engine() const {
return _raw._props.storage_engine;
}
bool logstor_enabled() const {
return _raw._props.storage_engine == storage_engine_type::logstor;
}
const cdc::options& cdc_options() const {
return _raw._props.get_cdc_options();
}


@@ -269,6 +269,11 @@ public:
enable_schema_commitlog();
}
schema_builder& set_logstor() {
_raw._props.storage_engine = storage_engine_type::logstor;
return *this;
}
class default_names {
public:
default_names(const schema_builder&);


@@ -952,6 +952,8 @@ class sstring:
@staticmethod
def to_hex(data, size):
if size == 0:
return ''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(data, size)).hex()
@@ -974,6 +976,8 @@ class sstring:
return self.ref['u']['external']['str']
def as_bytes(self):
if len(self) == 0:
return b''
inf = gdb.selected_inferior()
return bytes(inf.read_memory(self.data(), len(self)))
@@ -5636,6 +5640,8 @@ class scylla_sstable_summary(gdb.Command):
self.inf = gdb.selected_inferior()
def to_hex(self, data, size):
if size == 0:
return ''
return bytes(self.inf.read_memory(data, size)).hex()
def invoke(self, arg, for_tty):
@@ -5647,6 +5653,10 @@ class scylla_sstable_summary(gdb.Command):
sst = seastar_lw_shared_ptr(arg).get().dereference()
else:
sst = arg
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
if int(sst['_version']) >= ms_version:
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
return
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
gdb.write("header: {}\n".format(summary['header']))
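The `size == 0` guards added to the gdb helpers above avoid handing a zero-length read to `gdb.Inferior.read_memory()`, which can fail when the data pointer is null, as it is for an empty `sstring`. A minimal sketch of the same early-return pattern outside gdb, with `read_memory` as a stand-in for the inferior call (names here are illustrative):

```python
def read_memory(address, size):
    # Stand-in for gdb.Inferior.read_memory(): reading through a null
    # pointer fails even when the requested size is zero.
    if address == 0:
        raise MemoryError("cannot access memory at address 0x0")
    return bytes(size)

def to_hex(data, size):
    # Mirrors the patched helper: bail out before touching memory when
    # there is nothing to read.
    if size == 0:
        return ''
    return read_memory(data, size).hex()

# An empty sstring has a null data pointer and zero length; the guard
# keeps the helper from dereferencing it.
assert to_hex(0, 0) == ''
assert to_hex(0x1000, 4) == '00000000'
```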

@@ -227,8 +227,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
for (const auto& m : modules.entries) {
if (m.table == db::system_keyspace::service_levels_v2()->id()) {
update_service_levels_cache = true;
} else if (m.table == db::system_keyspace::role_members()->id() || m.table == db::system_keyspace::role_attributes()->id()) {
update_service_levels_effective_cache = true;
} else if (m.table == db::system_keyspace::dicts()->id()) {
auto pk_type = db::system_keyspace::dicts()->partition_key_type();
auto name_value = pk_type->deserialize_value(m.pk.representation());
@@ -247,6 +245,11 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
auto cdc_log_table_id = table_id(value_cast<utils::UUID>(uuid_type->deserialize_value(elements.front())));
update_cdc_streams.insert(cdc_log_table_id);
} else if (auth::cache::includes_table(m.table)) {
if (m.table == db::system_keyspace::role_members()->id() ||
m.table == db::system_keyspace::role_attributes()->id()) {
update_service_levels_effective_cache = true;
}
auto schema = _ss.get_database().find_schema(m.table);
const auto elements = m.pk.explode(*schema);
auto role = value_cast<sstring>(schema->partition_key_type()->
@@ -255,6 +258,9 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
}
}
if (update_auth_cache_roles.size()) {
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
}
if (update_service_levels_cache || update_service_levels_effective_cache) { // this also updates SL effective cache
co_await _ss.update_service_levels_cache(qos::update_both_cache_levels(update_service_levels_cache), qos::query_context::group0);
}
@@ -264,9 +270,6 @@ future<> group0_state_machine::reload_modules(modules_to_reload modules) {
if (update_cdc_streams.size()) {
co_await _ss.load_cdc_streams(std::move(update_cdc_streams));
}
if (update_auth_cache_roles.size()) {
co_await _ss.auth_cache().load_roles(std::move(update_auth_cache_roles));
}
}
future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merger) {

@@ -4653,6 +4653,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
auto& stats = handler_ptr->stats();
auto& handler = *handler_ptr;
auto& global_stats = handler._proxy->_global_stats;
auto schema = handler_ptr->get_schema();
if (handler.get_targets().size() == 0) {
// Usually we remove the response handler when receiving responses from all targets.
@@ -4748,7 +4749,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
}
// Waited on indirectly.
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats] (std::exception_ptr eptr) {
(void)f.handle_exception([response_id, forward_size, coordinator, handler_ptr, p = shared_from_this(), &stats, schema] (std::exception_ptr eptr) {
++stats.writes_errors.get_ep_stat(handler_ptr->_effective_replication_map_ptr->get_topology(), coordinator);
error err = error::FAILURE;
std::optional<sstring> msg;
@@ -4762,8 +4763,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
// ignore, disconnect will be logged by gossiper
} else if (const auto* e = try_catch_nested<seastar::gate_closed_exception>(eptr)) {
// may happen during shutdown, log and ignore it
slogger.warn("gate_closed_exception during mutation write to {}: {}",
coordinator, e->what());
slogger.warn("gate_closed_exception during mutation write to {}.{} on {}: {}",
schema->ks_name(), schema->cf_name(), coordinator, e->what());
} else if (try_catch<timed_out_error>(eptr)) {
// from lmutate(). Ignore so that logs are not flooded
// database total_writes_timedout counter was incremented.
@@ -4774,7 +4775,8 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
} else if (auto* e = try_catch<replica::critical_disk_utilization_exception>(eptr)) {
msg = e->what();
} else {
slogger.error("exception during mutation write to {}: {}", coordinator, eptr);
slogger.error("exception during mutation write to {}.{} on {}: {}",
schema->ks_name(), schema->cf_name(), coordinator, eptr);
}
p->got_failure_response(response_id, coordinator, forward_size + 1, std::nullopt, err, std::move(msg));
});

@@ -910,7 +910,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
} else {
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
});
}
}
@@ -3026,6 +3026,8 @@ future<> storage_service::drain() {
}
future<> storage_service::do_drain() {
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
co_await stop_transport();
@@ -4016,6 +4018,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
} catch (raft::request_aborted& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (seastar::gate_closed_exception& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (...) {
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
table, std::current_exception(), split_retry.sleep_time());
@@ -4082,6 +4087,58 @@ future<> storage_service::snitch_reconfigured() {
}
}
future<> storage_service::local_topology_barrier() {
if (this_shard_id() != 0) {
co_await container().invoke_on(0, [] (storage_service& ss) {
return ss.local_topology_barrier();
});
co_return;
}
auto version = _topology_state_machine._topology.version;
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
"current version {}, stale versions (version: use_count): {}",
version, current_version, ss._shared_token_metadata.describe_stale_versions());
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
}
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
raft_topology_cmd_result result;
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
@@ -4109,12 +4166,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
state.last_index = cmd_index;
}
// We capture the topology version right after the checks
// above, before any yields. This is crucial since _topology_state_machine._topology
// might be altered concurrently while this method is running,
// which can cause the fence command to apply an invalid fence version.
const auto version = _topology_state_machine._topology.version;
switch (cmd.cmd) {
case raft_topology_cmd::command::barrier: {
utils::get_local_injector().inject("raft_topology_barrier_fail",
@@ -4153,44 +4204,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::barrier_and_drain: {
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
"current version {}, stale versions (version: use_count): {}",
version, current_version, ss._shared_token_metadata.describe_stale_versions());
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
co_await local_topology_barrier();
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
auto ks = handler.get("keyspace");

@@ -813,6 +813,9 @@ public:
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
future<> initialize_done_topology_upgrade_state();
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
// In particular, waits for non-latest local erms to go away.
future<> local_topology_barrier();
private:
// State machine that is responsible for topology change
topology_state_machine& _topology_state_machine;

@@ -195,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
} else if (is_resize_task(task_type)) {
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
} else {
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
}
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
co_return res->status;
@@ -312,7 +312,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
}
return make_ready_future();
});
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
} else if (is_migration_task(task_type)) { // Migration task.
auto tablet_id = hint.get_tablet_id();
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
@@ -326,7 +326,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.status.state = tasks::task_manager::task_state::running;
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
co_return res;
}
}

@@ -2229,6 +2229,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
_tablet_allocator.set_load_stats(reconciled_stats);
}
}
// Wait for the background storage group merge to finish before releasing the state machine.
// Background merge holds the old erm, so a successful barrier joins with it.
// This guarantees that the background merge doesn't run concurrently with the next merge.
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
// by the background merge fiber.
tm = nullptr;
if (!guard) {
guard = co_await start_operation();
}
co_await global_tablet_token_metadata_barrier(std::move(guard));
}
using get_table_ids_func = std::function<std::unordered_set<table_id>(const db::system_keyspace::topology_requests_entry&)>;

@@ -201,95 +201,49 @@ public:
virtual future<std::optional<entry_info>> next_entry() = 0;
};
// Allocated inside LSA.
class promoted_index {
deletion_time _del_time;
uint64_t _promoted_index_start;
uint32_t _promoted_index_size;
uint32_t _num_blocks;
public:
promoted_index(const schema& s,
deletion_time del_time,
uint64_t promoted_index_start,
uint32_t promoted_index_size,
uint32_t num_blocks)
: _del_time{del_time}
, _promoted_index_start(promoted_index_start)
, _promoted_index_size(promoted_index_size)
, _num_blocks(num_blocks)
{ }
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
// Call under allocating_section.
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
reader_permit,
tracing::trace_state_ptr,
file_input_stream_options,
use_caching);
// Promoted index information produced by the parser.
struct parsed_promoted_index_entry {
deletion_time del_time;
uint64_t promoted_index_start;
uint32_t promoted_index_size;
uint32_t num_blocks;
};
using promoted_index = parsed_promoted_index_entry;
// A partition index element.
// Allocated inside LSA.
class index_entry {
private:
managed_bytes _key;
mutable std::optional<dht::token> _token;
uint64_t _position;
managed_ref<promoted_index> _index;
struct [[gnu::packed]] index_entry {
mutable int64_t raw_token;
uint64_t data_file_offset;
uint32_t key_offset;
public:
key_view get_key() const {
return key_view{_key};
}
// May allocate so must be called under allocating_section.
decorated_key_view get_decorated_key(const schema& s) const {
if (!_token) {
_token.emplace(s.get_partitioner().get_token(get_key()));
}
return decorated_key_view(*_token, get_key());
}
uint64_t position() const { return _position; };
std::optional<deletion_time> get_deletion_time() const {
if (_index) {
return _index->get_deletion_time();
}
return {};
}
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
: _key(std::move(key))
, _position(position)
, _index(std::move(index))
{}
index_entry(index_entry&&) = default;
index_entry& operator=(index_entry&&) = default;
// Can be nullptr
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
managed_ref<promoted_index>& get_promoted_index() { return _index; }
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
size_t external_memory_usage() const {
return _key.external_memory_usage() + _index.external_memory_usage();
}
uint64_t position() const { return data_file_offset; }
dht::raw_token token() const { return dht::raw_token(raw_token); }
};
// Required for optimized LSA migration of storage of managed_vector.
static_assert(std::is_trivially_move_assignable_v<index_entry>);
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
// A partition index page.
//
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
class partition_index_page {
public:
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
lsa::chunked_managed_vector<index_entry> _entries;
managed_bytes _key_storage;
// Stores promoted index information of index entries.
// The i-th element corresponds to the i-th entry in _entries.
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
// that entry doesn't have a promoted index.
// It's not chunked, because promoted index is present only when there are large partitions in the page,
// which also means the page will typically have only 1 entry due to the summary:data_file size ratio.
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
// which is typical in workloads with small partitions.
managed_vector<promoted_index> _promoted_indexes;
public:
partition_index_page() = default;
partition_index_page(partition_index_page&&) noexcept = default;
@@ -298,15 +252,68 @@ public:
bool empty() const { return _entries.empty(); }
size_t size() const { return _entries.size(); }
stop_iteration clear_gently() {
// Vectors have trivial storage, so are fast to destroy.
return stop_iteration::yes;
}
void clear_one_entry() {
_entries.pop_back();
}
bool has_promoted_index(size_t i) const {
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
const promoted_index& get_promoted_index(size_t i) const {
return _promoted_indexes[i];
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
promoted_index& get_promoted_index(size_t i) {
return _promoted_indexes[i];
}
/// Get promoted index size for the i-th entry.
uint32_t get_promoted_index_size(size_t i) const {
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
}
/// Get deletion_time for partition represented by the i-th entry.
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
/// It has to be read from the data file.
std::optional<deletion_time> get_deletion_time(size_t i) const {
if (has_promoted_index(i)) {
return get_promoted_index(i).del_time;
}
return {};
}
key_view get_key(size_t i) const {
auto start = _entries[i].key_offset;
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
auto v = managed_bytes_view(_key_storage).prefix(end);
v.remove_prefix(start);
return key_view(v);
}
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
auto key = get_key(i);
auto t = _entries[i].token();
if (!t) {
t = dht::raw_token(s.get_partitioner().get_token(key));
_entries[i].raw_token = t.value;
}
return decorated_key_view(dht::token(t), key);
}
size_t external_memory_usage() const {
size_t size = _entries.external_memory_usage();
for (auto&& e : _entries) {
size += sizeof(index_entry) + e->external_memory_usage();
}
size += _promoted_indexes.external_memory_usage();
size += _key_storage.external_memory_usage();
return size;
}
};
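The reworked `partition_index_page` above drops per-entry `managed_bytes` keys in favor of packed, trivially-movable entries whose `key_offset` points into one shared `_key_storage` blob: entry *i*'s key runs from its own offset to entry *i+1*'s offset, or to the end of storage for the last entry. A toy Python model of just the key packing (names are illustrative, not Scylla's; the sparse `_promoted_indexes` side array is omitted):

```python
class PackedIndexPage:
    """Toy model of the reworked partition_index_page: entries are
    small fixed-size records (data-file offset + key offset), and all
    partition keys are concatenated into one shared byte buffer."""

    def __init__(self):
        self.entries = []        # (data_file_offset, key_offset) pairs
        self.key_storage = b''   # all keys, back to back

    def add(self, key: bytes, data_file_offset: int) -> None:
        # key_offset records where this key starts in the shared buffer.
        self.entries.append((data_file_offset, len(self.key_storage)))
        self.key_storage += key

    def get_key(self, i: int) -> bytes:
        # Entry i's key ends where entry i+1's begins, or at the end of
        # the buffer for the last entry -- mirroring get_key() above.
        start = self.entries[i][1]
        end = (self.entries[i + 1][1] if i + 1 < len(self.entries)
               else len(self.key_storage))
        return self.key_storage[start:end]

page = PackedIndexPage()
page.add(b'apple', 0)
page.add(b'fig', 4096)
page.add(b'melon', 8192)
assert page.get_key(0) == b'apple'
assert page.get_key(1) == b'fig'
assert page.get_key(2) == b'melon'
```

Keeping entries trivially movable is what allows the patch's `static_assert(std::is_trivially_move_assignable_v<index_entry>)` and cheap LSA migration of the entry vector.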

@@ -25,14 +25,6 @@ namespace sstables {
extern seastar::logger sstlog;
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
// Promoted index information produced by the parser.
struct parsed_promoted_index_entry {
deletion_time del_time;
uint64_t promoted_index_start;
uint32_t promoted_index_size;
uint32_t num_blocks;
};
// Partition index entry information produced by the parser.
struct parsed_partition_index_entry {
temporary_buffer<char> key;
@@ -53,9 +45,10 @@ class index_consumer {
schema_ptr _s;
logalloc::allocating_section _alloc_section;
logalloc::region& _region;
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
size_t _key_storage_size = 0;
public:
index_list indexes;
index_consumer(logalloc::region& r, schema_ptr s)
: _s(s)
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
@@ -64,36 +57,63 @@ public:
, _region(r)
{ }
~index_consumer() {
with_allocator(_region.allocator(), [&] {
indexes._entries.clear_and_release();
});
void consume_entry(parsed_partition_index_entry&& e) {
_key_storage_size += e.key.size();
_parsed_entries.emplace_back(std::move(e));
if (e.promoted_index) {
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
}
}
void consume_entry(parsed_partition_index_entry&& e) {
_alloc_section(_region, [&] {
future<index_list> finalize() {
index_list result;
// In case of exception, need to deallocate under region allocator.
auto delete_result = seastar::defer([&] {
with_allocator(_region.allocator(), [&] {
managed_ref<promoted_index> pi;
if (e.promoted_index) {
pi = make_managed<promoted_index>(*_s,
e.promoted_index->del_time,
e.promoted_index->promoted_index_start,
e.promoted_index->promoted_index_size,
e.promoted_index->num_blocks);
}
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
result._entries = {};
result._promoted_indexes = {};
result._key_storage = {};
});
});
auto i = _parsed_entries.begin();
size_t key_offset = 0;
while (i != _parsed_entries.end()) {
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
result._entries.reserve(_parsed_entries.size());
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
if (result._key_storage.empty()) {
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
}
managed_bytes_mutable_view key_out(result._key_storage);
key_out.remove_prefix(key_offset);
while (i != _parsed_entries.end()) {
parsed_partition_index_entry& e = *i;
if (e.promoted_index) {
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
}
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
++i;
key_offset += e.key.size();
if (need_preempt()) {
break;
}
}
});
});
co_await coroutine::maybe_yield();
}
delete_result.cancel();
_parsed_entries.clear();
co_return std::move(result);
}
void prepare(uint64_t size) {
_alloc_section = logalloc::allocating_section();
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
indexes._entries.reserve(size);
});
});
_max_promoted_index_entry_plus_one = 0;
_key_storage_size = 0;
_parsed_entries.clear();
_parsed_entries.reserve(size);
}
};
@@ -198,10 +218,14 @@ public:
switch (_state) {
// START comes first, to make the handling of the 0-quantity case simpler
state_START:
case state::START:
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
_state = state::KEY_SIZE;
break;
if (data.size() == 0) {
break;
}
[[fallthrough]];
case state::KEY_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
_entry_offset = current_pos();
@@ -227,7 +251,16 @@ public:
case state::PROMOTED_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
_position = this->_u64;
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
data.trim_front(1);
_consumer.consume_entry(parsed_partition_index_entry{
.key = std::move(_key),
.data_file_offset = _position,
.index_offset = _entry_offset,
.promoted_index = std::nullopt
});
goto state_START;
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
_state = state::PARTITION_HEADER_LENGTH_1;
break;
}
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
}
inline
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
reader_permit permit,
tracing::trace_state_ptr trace_state,
file_input_stream_options options,
use_caching caching)
{
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
_promoted_index_start, _promoted_index_size,
promoted_index_cache_metrics, permit,
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
}
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
}
// Less-comparator for lookups in the partition index.
class index_comparator {
dht::ring_position_comparator_for_sstables _tri_cmp;
@@ -376,27 +382,17 @@ public:
return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
}
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
return operator()(*e, rp);
}
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
return operator()(rp, *e);
}
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
}
};
inline
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
dht::ring_position_comparator_for_sstables tri_cmp(s);
return tri_cmp(page.get_decorated_key(s, idx), rp);
}
// Contains information about index_reader position in the index file
struct index_bound {
index_bound() = default;
@@ -537,7 +533,7 @@ private:
if (ex) {
return make_exception_future<index_list>(std::move(ex));
}
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
return bound.consumer->finalize();
});
});
};
@@ -550,17 +546,18 @@ private:
if (bound.current_list->empty()) {
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
}
bound.data_file_position = bound.current_list->_entries[0]->position();
bound.data_file_position = bound.current_list->_entries[0].position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
if (sstlog.is_enabled(seastar::log_level::trace)) {
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
logalloc::reclaim_lock rl(_region);
for (auto&& e : bound.current_list->_entries) {
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
auto& e = bound.current_list->_entries[i];
auto dk = dht::decorate_key(*_sstable->_schema,
e->get_key().to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e->position());
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e.position());
}
}
@@ -604,7 +601,13 @@ private:
// Valid if partition_data_ready(bound)
index_entry& current_partition_entry(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return *bound.current_list->_entries[bound.current_index_idx];
return bound.current_list->_entries[bound.current_index_idx];
}
// Valid if partition_data_ready(bound)
partition_index_page& current_page(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return *bound.current_list;
}
future<> advance_to_next_partition(index_bound& bound) {
@@ -617,7 +620,7 @@ private:
if (bound.current_index_idx + 1 < bound.current_list->size()) {
++bound.current_index_idx;
bound.current_pi_idx = 0;
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
return reset_clustered_cursor(bound);
@@ -680,9 +683,13 @@ private:
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
auto i = _alloc_section(_region, [&] {
-auto& entries = bound.current_list->_entries;
-return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
-index_comparator(*_sstable->_schema));
+auto& page = *bound.current_list;
+auto& s = *_sstable->_schema;
+auto r = std::views::iota(bound.current_index_idx, page._entries.size());
+auto it = std::ranges::partition_point(r, [&] (int idx) {
+return index_entry_tri_cmp(s, page, idx, pos) < 0;
+});
+return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
});
// i is valid until next allocation point
auto& entries = bound.current_list->_entries;
@@ -697,7 +704,7 @@ private:
}
bound.current_index_idx = std::distance(std::begin(entries), i);
bound.current_pi_idx = 0;
-bound.data_file_position = (*i)->position();
+bound.data_file_position = (*i).position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
@@ -800,6 +807,34 @@ public:
}
}
+static
+std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
+shared_sstable sst,
+reader_permit permit,
+tracing::trace_state_ptr trace_state,
+file_input_stream_options options,
+use_caching caching)
+{
+if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
+seastar::shared_ptr<cached_file> cached_file_ptr = caching
+? sst->_cached_index_file
+: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
+sst->manager().get_cache_tracker().get_index_cached_file_stats(),
+sst->manager().get_cache_tracker().get_lru(),
+sst->manager().get_cache_tracker().region(),
+sst->_index_file_size);
+return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
+pi.promoted_index_start, pi.promoted_index_size,
+promoted_index_cache_metrics, permit,
+sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
+}
+auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
+auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size, options);
+return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
+std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
+}
// Ensures that partition_data_ready() returns true.
// Can be called only when !eof()
future<> read_partition_data() override {
@@ -835,10 +870,10 @@ public:
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
if (!bound.clustered_cursor) {
_alloc_section(_region, [&] {
-index_entry& e = current_partition_entry(bound);
-promoted_index* pi = e.get_promoted_index().get();
-if (pi) {
-bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
+partition_index_page& page = current_page(bound);
+if (page.has_promoted_index(bound.current_index_idx)) {
+promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
+bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
get_file_input_stream_options(), _use_caching);
}
});
@@ -861,15 +896,15 @@ public:
// It may be unavailable for old sstables for which this information was not generated.
// Can be called only when partition_data_ready().
std::optional<sstables::deletion_time> partition_tombstone() override {
-return current_partition_entry(_lower_bound).get_deletion_time();
+return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
}
// Returns the key for current partition.
// Can be called only when partition_data_ready().
std::optional<partition_key> get_partition_key() override {
return _alloc_section(_region, [this] {
-index_entry& e = current_partition_entry(_lower_bound);
-return e.get_key().to_partition_key(*_sstable->_schema);
+return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
+.to_partition_key(*_sstable->_schema);
});
}
@@ -883,8 +918,8 @@ public:
// Returns the number of promoted index entries for the current partition.
// Can be called only when partition_data_ready().
uint64_t get_promoted_index_size() {
-index_entry& e = current_partition_entry(_lower_bound);
-return e.get_promoted_index_size();
+partition_index_page& page = current_page(_lower_bound);
+return page.get_promoted_index_size(_lower_bound.current_index_idx);
}
bool partition_data_ready() const override {
@@ -975,9 +1010,9 @@ public:
return make_ready_future<bool>(false);
}
return read_partition_data().then([this, key] {
-index_comparator cmp(*_sstable->_schema);
bool found = _alloc_section(_region, [&] {
-return cmp(key, current_partition_entry(_lower_bound)) == 0;
+auto& page = current_page(_lower_bound);
+return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
});
return make_ready_future<bool>(found);
});


@@ -189,10 +189,11 @@ public:
{}
future<std::optional<directory_entry>> get() override {
std::filesystem::path dir(_prefix);
-do {
+while (true) {
if (_pos == _info.size()) {
_info.clear();
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
_pos = 0;
}
if (_info.empty()) {
break;
@@ -203,7 +204,7 @@ public:
continue;
}
co_return ent;
-} while (false);
+}
co_return std::nullopt;
}
@@ -276,7 +277,7 @@ public:
co_await f.close();
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
-co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
+co_await _client->merge_objects(bucket, object, names, {}, as);
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
co_await _client->delete_object(bucket, name);


@@ -257,14 +257,11 @@ public:
while (partial_page || i != _cache.end()) {
if (partial_page) {
auto preempted = with_allocator(_region.allocator(), [&] {
-while (!partial_page->empty()) {
-partial_page->clear_one_entry();
-if (need_preempt()) {
-return true;
-}
-}
+while (partial_page->clear_gently() != stop_iteration::yes) {
+return true;
+}
partial_page.reset();
-return false;
+return need_preempt();
});
if (preempted) {
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;


@@ -1132,7 +1132,6 @@ public:
friend class mc::writer;
friend class index_reader;
-friend class promoted_index;
friend class sstables_manager;
template <typename DataConsumeRowsContext>
friend future<std::unique_ptr<DataConsumeRowsContext>>


@@ -180,18 +180,11 @@ storage_manager::config_updater::config_updater(const db::config& cfg, storage_m
{}
sstables::sstable::version_types sstables_manager::get_highest_supported_format() const noexcept {
-// FIXME: start announcing `ms` here after it becomes the default.
-// (There are several tests which expect that new sstables are written with
-// the format reported by this API).
-//
-// After `ms` becomes the default, this function look like this:
-//
-// if (_features.ms_sstable) {
-// return sstable_version_types::ms;
-// } else {
-// return sstable_version_types::me;
-// }
-return sstable_version_types::me;
+if (_features.ms_sstable) {
+return sstable_version_types::ms;
+} else {
+return sstable_version_types::me;
+}
}
sstables::sstable::version_types sstables_manager::get_preferred_sstable_version() const {


@@ -221,10 +221,16 @@ private:
sst->set_sstable_level(0);
auto units = co_await sst_manager.dir_semaphore().get_units(1);
sstables::sstable_open_config cfg {
+.unsealed_sstable = true,
.ignore_component_digest_mismatch = db.get_config().ignore_component_digest_mismatch(),
};
co_await sst->load(table.get_effective_replication_map()->get_sharder(*table.schema()), cfg);
-co_await table.add_sstable_and_update_cache(sst);
+co_await table.add_new_sstable_and_update_cache(sst, [&sst_manager, sst] (sstables::shared_sstable loading_sst) -> future<> {
+if (loading_sst == sst) {
+auto writer_cfg = sst_manager.configure_writer(loading_sst->get_origin());
+co_await loading_sst->seal_sstable(writer_cfg.backup);
+}
+});
}
future<>
@@ -295,7 +301,8 @@ private:
sstables::sstable_state::normal,
sstables::sstable::component_basename(
_table.schema()->ks_name(), _table.schema()->cf_name(), descriptor.version, gen, descriptor.format, it->first),
-sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend()});
+sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend(),
+.leave_unsealed = true});
auto out = co_await sstable_sink->output(foptions, stream_options);
input_stream src(co_await [this, &it, sstable, f = files.at(it->first)]() -> future<input_stream<char>> {


@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
: _module(std::move(module))
{}
-future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
+future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
auto ms = module->get_task_manager()._messaging;
if (!ms) {
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
tmlogger.info("tasks_vt_get_children: waiting");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
});
-co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
-if (is_host_alive(host_id)) {
-return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
-return resp | std::views::transform([host_id] (auto id) {
-return task_identity{
-.host_id = host_id,
-.task_id = id
-};
-}) | std::ranges::to<utils::chunked_vector<task_identity>>();
-});
-} else {
-return make_ready_future<utils::chunked_vector<task_identity>>();
-}
+co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
+return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
+return resp | std::views::transform([host_id] (auto id) {
+return task_identity{
+.host_id = host_id,
+.task_id = id
+};
+}) | std::ranges::to<utils::chunked_vector<task_identity>>();
+}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
+tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
+return utils::chunked_vector<task_identity>{};
+});
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
std::move(b.begin(), b.end(), std::back_inserter(a));
return a;
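The get_children change above fans an RPC out to every node and, instead of pre-filtering hosts by liveness, converts a closed connection into an empty result so the whole map-reduce cannot fail on a node that died mid-query. A minimal asyncio sketch of that pattern (the helper names are invented for illustration, not Scylla's API):

```python
import asyncio

# Fan a per-host query out to all hosts; a host that drops the
# connection contributes no results instead of failing the gather
# (analogous to catching rpc::closed_error per node).
async def gather_children(hosts, get_children):
    async def query(host):
        try:
            return [(host, tid) for tid in await get_children(host)]
        except ConnectionError:
            return []
    chunks = await asyncio.gather(*(query(h) for h in hosts))
    return [ident for chunk in chunks for ident in chunk]
```

Catching the error per node rather than checking liveness up front also closes the race where a node dies between the liveness check and the RPC.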


@@ -19,6 +19,7 @@
#include "db_clock.hh"
#include "utils/log.hh"
#include "locator/host_id.hh"
+#include "locator/token_metadata_fwd.hh"
#include "schema/schema_fwd.hh"
#include "tasks/types.hh"
#include "utils/chunked_vector.hh"
@@ -282,7 +283,7 @@ public:
impl& operator=(impl&&) = delete;
virtual ~impl() = default;
protected:
-static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
+static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
public:
virtual task_group get_group() const noexcept = 0;
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.


@@ -181,7 +181,7 @@ def parse_cmd_line() -> argparse.Namespace:
help="Run only tests for given build mode(s)")
parser.add_argument('--repeat', action="store", default="1", type=int,
help="number of times to repeat test execution")
-parser.add_argument('--timeout', action="store", default="24000", type=int,
+parser.add_argument('--timeout', action="store", default="3600", type=int,
help="timeout value for single test execution")
parser.add_argument('--session-timeout', action="store", default="24000", type=int,
help="timeout value for test.py/pytest session execution")


@@ -469,18 +469,6 @@ def test_get_records_nonexistent_iterator(dynamodbstreams):
# not allowed (see test_streams_change_type), and while removing and re-adding
# a stream is possible, it is very slow. So we create four different fixtures
# with the four different StreamViewType settings for these four fixtures.
-#
-# It turns out that DynamoDB makes reusing the same table in different tests
-# very difficult, because when we request a "LATEST" iterator we sometimes
-# miss the immediately following write (this issue doesn't happen in
-# ALternator, just in DynamoDB - presumably LATEST adds some time slack?)
-# So all the fixtures we create below have scope="function", meaning that a
-# separate table is created for each of the tests using these fixtures. This
-# slows the tests down a bit, but not by much (about 0.05 seconds per test).
-# It is still worthwhile to use a fixture rather than to create a table
-# explicitly - it is convenient, safe (the table gets deleted automatically)
-# and if in the future we can work around the DynamoDB problem, we can return
-# these fixtures to module scope.
@contextmanager
def create_table_ss(dynamodb, dynamodbstreams, type):
@@ -524,43 +512,43 @@ def create_table_s_no_ck(dynamodb, dynamodbstreams, type):
yield table, arn
table.delete()
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_sss_new_and_old_images_lsi(dynamodb, dynamodbstreams):
yield from create_table_sss_lsi(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES')
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_ss_keys_only(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as stream:
yield stream
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_ss_new_image(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_IMAGE') as stream:
yield stream
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_ss_old_image(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'OLD_IMAGE') as stream:
yield stream
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_ss_new_and_old_images(dynamodb, dynamodbstreams):
with create_table_ss(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES') as stream:
yield stream
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_s_no_ck_keys_only(dynamodb, dynamodbstreams):
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'KEYS_ONLY')
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_s_no_ck_new_image(dynamodb, dynamodbstreams):
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'NEW_IMAGE')
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_s_no_ck_old_image(dynamodb, dynamodbstreams):
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'OLD_IMAGE')
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_s_no_ck_new_and_old_images(dynamodb, dynamodbstreams):
yield from create_table_s_no_ck(dynamodb, dynamodbstreams, 'NEW_AND_OLD_IMAGES')
@@ -626,13 +614,30 @@ def list_shards(dynamodbstreams, arn):
# Utility function for getting shard iterators starting at "LATEST" for
# all the shards of the given stream arn.
+# On DynamoDB (but not Alternator), LATEST has a time slack: it may point to
+# a position slightly before the true end of the stream, so writes from a
+# previous test that reused the same table can appear to be "in the future"
+# relative to the returned iterators and therefore show up unexpectedly in
+# the current test's reads. To work around this we drain any already-pending
+# records from the iterators before returning them, so the caller is
+# guaranteed to see only events written *after* this call returns.
def latest_iterators(dynamodbstreams, arn):
iterators = []
for shard_id in list_shards(dynamodbstreams, arn):
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator'])
assert len(set(iterators)) == len(iterators)
-return iterators
+# Drain any records that are already visible at the LATEST position.
+# We keep fetching until no more records are returned, which means that
+# the stream is caught up. This drain loop is not necessary on Alternator,
+# and needlessly slows the test down.
+if not dynamodbstreams._endpoint.host.endswith('.amazonaws.com'):
+return iterators
+while True:
+events = []
+iterators = fetch_more(dynamodbstreams, iterators, events)
+if events == []:
+return iterators
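The drain loop added to latest_iterators() is a general "read until quiescent" pattern: keep polling every iterator until one full pass yields nothing, so later reads only observe events written after the drain finished. A standalone sketch (the `get_records` callable and response shape are stand-ins mimicking the streams API):

```python
# Poll each iterator, advancing it via NextShardIterator, until a full
# pass over all iterators returns no records; then the stream position
# is caught up and the advanced iterators are returned to the caller.
def drain_to_latest(get_records, iterators):
    while True:
        got_any = False
        next_iterators = []
        for it in iterators:
            resp = get_records(it)
            got_any = got_any or bool(resp.get('Records'))
            next_iterators.append(resp.get('NextShardIterator', it))
        iterators = next_iterators
        if not got_any:
            return iterators
```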
# Similar to latest_iterators(), just also returns the shard id which produced
# each iterator.
@@ -641,7 +646,16 @@ def shards_and_latest_iterators(dynamodbstreams, arn):
for shard_id in list_shards(dynamodbstreams, arn):
shards_and_iterators.append((shard_id, dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator']))
-return shards_and_iterators
+# Drain pre-existing records from the iterators, for the same reason as
+# explained in latest_iterators() above.
+if not dynamodbstreams._endpoint.host.endswith('.amazonaws.com'):
+return shards_and_iterators
+while True:
+events = []
+new_iters = fetch_more(dynamodbstreams, [it for _, it in shards_and_iterators], events)
+shards_and_iterators = list(zip([sh for sh, _ in shards_and_iterators], new_iters))
+if events == []:
+return shards_and_iterators
# Utility function for fetching more content from the stream (given its
# array of iterators) into an "output" array. Call repeatedly to get more
@@ -806,9 +820,11 @@ def fetch_and_compare_events(dynamodb, dynamodbstreams, iterators, expected_even
# function "updatefunc" which is supposed to do some updates to the table
# and also return an expected_events list. do_test() then fetches the streams
# data and compares it to the expected_events using compare_events().
-def do_test(test_table_ss_stream, dynamodb, dynamodbstreams, updatefunc, mode, p = random_string(), c = random_string()):
+def do_test(test_table_ss_stream, dynamodb, dynamodbstreams, updatefunc, mode):
table, arn = test_table_ss_stream
iterators = latest_iterators(dynamodbstreams, arn)
+p = random_string()
+c = random_string()
expected_events = updatefunc(table, p, c)
fetch_and_compare_events(dynamodb, dynamodbstreams, iterators, expected_events, mode)
@@ -956,7 +972,7 @@ def test_streams_updateitem_old_image_empty_item(test_table_ss_old_image, dynamo
# columns they are only included in the preimage if they change.
# Currently fails in Alternator because the item's key is missing in
# OldImage (#6935) and the LSI key is also missing (#7030).
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
def test_table_ss_old_image_and_lsi(dynamodb, dynamodbstreams):
table = create_test_table(dynamodb,
Tags=TAGS,
@@ -1357,49 +1373,48 @@ def test_streams_after_sequence_number(test_table_ss_keys_only, dynamodbstreams)
# Test the "TRIM_HORIZON" iterator, which can be used to re-read *all* the
# previously-read events of the stream shard again.
-# NOTE: This test relies on the test_table_ss_keys_only fixture giving us a
-# brand new stream, with no old events saved from other tests. If we ever
-# change this, we should change this test to use a different fixture.
-def test_streams_trim_horizon(test_table_ss_keys_only, dynamodbstreams):
-table, arn = test_table_ss_keys_only
-shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
-# Do two UpdateItem operations to the same key, that are expected to leave
-# two events in the stream.
-p = random_string()
-c = random_string()
-table.update_item(Key={'p': p, 'c': c},
-UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
-table.update_item(Key={'p': p, 'c': c},
-UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
-# Eventually, *one* of the stream shards will return the two events:
-timeout = time.time() + 15
-while time.time() < timeout:
-for (shard_id, iter) in shards_and_iterators:
-response = dynamodbstreams.get_records(ShardIterator=iter)
-if 'Records' in response and len(response['Records']) == 2:
-assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-sequence_number_1 = response['Records'][0]['dynamodb']['SequenceNumber']
-sequence_number_2 = response['Records'][1]['dynamodb']['SequenceNumber']
-# If we use the TRIM_HORIZON iterator, we should receive the
-# same two events again, in the same order.
-# Note that we assume that the fixture gave us a brand new
-# stream, with no old events saved from other tests. If we
-# couldn't assume this, this test would need to become much
-# more complex, and would need to read from this shard until
-# we find the two events we are looking for.
-iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
-ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
+def test_streams_trim_horizon(dynamodb, dynamodbstreams):
+# This test needs a brand-new stream, without old data from other
+# tests, so we can't reuse the test_table_ss_keys_only fixture.
+with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
+shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
+# Do two UpdateItem operations to the same key, that are expected to leave
+# two events in the stream.
+p = random_string()
+c = random_string()
+table.update_item(Key={'p': p, 'c': c},
+UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
+table.update_item(Key={'p': p, 'c': c},
+UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
+# Eventually, *one* of the stream shards will return the two events:
+timeout = time.time() + 15
+while time.time() < timeout:
+for (shard_id, iter) in shards_and_iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
-assert 'Records' in response
-assert len(response['Records']) == 2
-assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-assert response['Records'][0]['dynamodb']['SequenceNumber'] == sequence_number_1
-assert response['Records'][1]['dynamodb']['SequenceNumber'] == sequence_number_2
-return
-time.sleep(0.5)
-pytest.fail("timed out")
+if 'Records' in response and len(response['Records']) == 2:
+assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+sequence_number_1 = response['Records'][0]['dynamodb']['SequenceNumber']
+sequence_number_2 = response['Records'][1]['dynamodb']['SequenceNumber']
+# If we use the TRIM_HORIZON iterator, we should receive the
+# same two events again, in the same order.
+# Note that we assume that the fixture gave us a brand new
+# stream, with no old events saved from other tests. If we
+# couldn't assume this, this test would need to become much
+# more complex, and would need to read from this shard until
+# we find the two events we are looking for.
+iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
+ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
+response = dynamodbstreams.get_records(ShardIterator=iter)
+assert 'Records' in response
+assert len(response['Records']) == 2
+assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+assert response['Records'][0]['dynamodb']['SequenceNumber'] == sequence_number_1
+assert response['Records'][1]['dynamodb']['SequenceNumber'] == sequence_number_2
+return
+time.sleep(0.5)
+pytest.fail("timed out")
# Test the StartingSequenceNumber information returned by DescribeStream.
# The DynamoDB documentation explains that StartingSequenceNumber is
@@ -1414,45 +1429,47 @@ def test_streams_trim_horizon(test_table_ss_keys_only, dynamodbstreams):
# that the important thing is that reading a shard starting at
# StartingSequenceNumber will result in reading all the available items -
# similar to how TRIM_HORIZON works. This is what the following test verifies.
-def test_streams_starting_sequence_number(test_table_ss_keys_only, dynamodbstreams):
-table, arn = test_table_ss_keys_only
-# Do two UpdateItem operations to the same key, that are expected to leave
-# two events in the stream.
-p = random_string()
-c = random_string()
-table.update_item(Key={'p': p, 'c': c},
-UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
-table.update_item(Key={'p': p, 'c': c},
-UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
-# Get for all the stream shards the iterator starting at the shard's
-# StartingSequenceNumber:
-response = dynamodbstreams.describe_stream(StreamArn=arn)
-shards = response['StreamDescription']['Shards']
-while 'LastEvaluatedShardId' in response['StreamDescription']:
-response = dynamodbstreams.describe_stream(StreamArn=arn,
-ExclusiveStartShardId=response['StreamDescription']['LastEvaluatedShardId'])
-shards.extend(response['StreamDescription']['Shards'])
-iterators = []
-for shard in shards:
-shard_id = shard['ShardId']
-start = shard['SequenceNumberRange']['StartingSequenceNumber']
-assert start.isdecimal()
-iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
-ShardId=shard_id, ShardIteratorType='AT_SEQUENCE_NUMBER',
-SequenceNumber=start)['ShardIterator'])
+def test_streams_starting_sequence_number(dynamodb, dynamodbstreams):
+# This test needs a brand-new stream, without old data from other
+# tests, so we can't reuse the test_table_ss_keys_only fixture.
+with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
+# Do two UpdateItem operations to the same key, that are expected to leave
+# two events in the stream.
+p = random_string()
+c = random_string()
+table.update_item(Key={'p': p, 'c': c},
+UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 3})
+table.update_item(Key={'p': p, 'c': c},
+UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
+# Get for all the stream shards the iterator starting at the shard's
+# StartingSequenceNumber:
+response = dynamodbstreams.describe_stream(StreamArn=arn)
+shards = response['StreamDescription']['Shards']
+while 'LastEvaluatedShardId' in response['StreamDescription']:
+response = dynamodbstreams.describe_stream(StreamArn=arn,
+ExclusiveStartShardId=response['StreamDescription']['LastEvaluatedShardId'])
+shards.extend(response['StreamDescription']['Shards'])
+iterators = []
+for shard in shards:
+shard_id = shard['ShardId']
+start = shard['SequenceNumberRange']['StartingSequenceNumber']
+assert start.isdecimal()
+iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
+ShardId=shard_id, ShardIteratorType='AT_SEQUENCE_NUMBER',
+SequenceNumber=start)['ShardIterator'])
-# Eventually, *one* of the stream shards will return the two events:
-timeout = time.time() + 15
-while time.time() < timeout:
-for iter in iterators:
-response = dynamodbstreams.get_records(ShardIterator=iter)
-if 'Records' in response and len(response['Records']) == 2:
-assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
-return
-time.sleep(0.5)
+# Eventually, *one* of the stream shards will return the two events:
+timeout = time.time() + 15
+while time.time() < timeout:
+for iter in iterators:
+response = dynamodbstreams.get_records(ShardIterator=iter)
+if 'Records' in response and len(response['Records']) == 2:
+assert response['Records'][0]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+assert response['Records'][1]['dynamodb']['Keys'] == {'p': {'S': p}, 'c': {'S': c}}
+return
+time.sleep(0.5)
-pytest.fail("timed out")
+pytest.fail("timed out")
# Above we tested some specific operations in small tests aimed to reproduce
# a specific bug, in the following tests we do a all the different operations,
@@ -1746,50 +1763,49 @@ def test_stream_specification(test_table_stream_with_result, dynamodbstreams):
# that the right answer is that NextShardIterator should be *missing*
# (reproduces issue #7237).
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
-def test_streams_closed_read(test_table_ss_keys_only, dynamodbstreams):
-table, arn = test_table_ss_keys_only
-shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
-# Do an UpdateItem operation that is expected to leave one event in the
-# stream.
-table.update_item(Key={'p': random_string(), 'c': random_string()},
-UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
-# Disable streaming for this table. Note that the test_table_ss_keys_only
-# fixture has "function" scope so it is fine to ruin table, it will not
-# be used in other tests.
-disable_stream(dynamodbstreams, table)
+def test_streams_closed_read(dynamodb, dynamodbstreams):
+# This test can't use the shared table test_table_ss_keys_only,
+# because it wants to disable streaming, so let's create a new table:
+with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
+shards_and_iterators = shards_and_latest_iterators(dynamodbstreams, arn)
+# Do an UpdateItem operation that is expected to leave one event in the
+# stream.
+table.update_item(Key={'p': random_string(), 'c': random_string()},
+UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
+disable_stream(dynamodbstreams, table)
-# Even after streaming is disabled for the table, we can still read
-# from the earlier stream (it is guaranteed to work for 24 hours).
-# The iterators we got earlier should still be fully usable, and
-# eventually *one* of the stream shards will return one event:
-timeout = time.time() + 15
-while time.time() < timeout:
-for (shard_id, iter) in shards_and_iterators:
-response = dynamodbstreams.get_records(ShardIterator=iter)
-if 'Records' in response and response['Records'] != []:
-# Found the shard with the data! Test that it only has
-# one event. NextShardIterator should either be missing now,
-# indicating that it is a closed shard (DynamoDB does this),
-# or, it may (and currently does in Alternator) return another
-# and reading from *that* iterator should then tell us that
-# we reached the end of the shard (i.e., zero results and
-# missing NextShardIterator).
-assert len(response['Records']) == 1
-if 'NextShardIterator' in response:
-response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
-assert len(response['Records']) == 0
-assert not 'NextShardIterator' in response
-# Until now we verified that we can read the closed shard
-# using an old iterator. Let's test now that the closed
-# shard id is also still valid, and a new iterator can be
-# created for it, and the old data can be read from it:
-iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
-ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
+# Even after streaming is disabled for the table, we can still read
+# from the earlier stream (it is guaranteed to work for 24 hours).
+# The iterators we got earlier should still be fully usable, and
+# eventually *one* of the stream shards will return one event:
+timeout = time.time() + 15
+while time.time() < timeout:
+for (shard_id, iter) in shards_and_iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
-assert len(response['Records']) == 1
-return
-time.sleep(0.5)
-pytest.fail("timed out")
if 'Records' in response and response['Records'] != []:
# Found the shard with the data! Test that it only has
# one event. NextShardIterator should either be missing now,
# indicating that it is a closed shard (DynamoDB does this),
# or, it may (and currently does in Alternator) return another
# and reading from *that* iterator should then tell us that
# we reached the end of the shard (i.e., zero results and
# missing NextShardIterator).
assert len(response['Records']) == 1
if 'NextShardIterator' in response:
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
assert len(response['Records']) == 0
assert not 'NextShardIterator' in response
# Until now we verified that we can read the closed shard
# using an old iterator. Let's test now that the closed
# shard id is also still valid, and a new iterator can be
# created for it, and the old data can be read from it:
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard_id, ShardIteratorType='TRIM_HORIZON')['ShardIterator']
response = dynamodbstreams.get_records(ShardIterator=iter)
assert len(response['Records']) == 1
return
time.sleep(0.5)
pytest.fail("timed out")
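The comments in the test above spell out the closed-shard contract: a closed shard either omits `NextShardIterator` entirely (as DynamoDB does) or hands back one final iterator that yields zero records and no further iterator. A minimal sketch of a consumer that drains a shard under that contract; `FakeStreams` is a hypothetical stub standing in for the real `dynamodbstreams` client:

```python
class FakeStreams:
    """Hypothetical stub mimicking the closed-shard behavior described above:
    one page of records, then one final empty page with no NextShardIterator."""
    def __init__(self, records):
        self._pages = {
            'it-0': {'Records': records, 'NextShardIterator': 'it-1'},
            'it-1': {'Records': []},  # closed: no NextShardIterator
        }
    def get_records(self, ShardIterator):
        return self._pages[ShardIterator]

def drain_closed_shard(streams, iterator):
    """Read records until the shard signals it is closed."""
    records = []
    while iterator is not None:
        response = streams.get_records(ShardIterator=iterator)
        records.extend(response.get('Records', []))
        # A closed shard eventually stops returning NextShardIterator.
        iterator = response.get('NextShardIterator')
    return records
```

This loop handles both behaviors the comment describes, since a missing `NextShardIterator` key simply terminates the iteration.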
# In the above test (test_streams_closed_read) we used a disabled stream as
# a means to generate a closed shard, and tested the behavior of that closed
@@ -1800,84 +1816,83 @@ def test_streams_closed_read(test_table_ss_keys_only, dynamodbstreams):
# stream's shards should give an indication that they are all closed - but
# all these shards should still be readable.
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
def test_streams_disabled_stream(test_table_ss_keys_only, dynamodbstreams):
table, arn = test_table_ss_keys_only
iterators = latest_iterators(dynamodbstreams, arn)
# Do an UpdateItem operation that is expected to leave one event in the
# stream.
table.update_item(Key={'p': random_string(), 'c': random_string()},
UpdateExpression='SET x = :x', ExpressionAttributeValues={':x': 5})
def test_streams_disabled_stream(dynamodb, dynamodbstreams):
# This test can't use the shared table test_table_ss_keys_only,
# because it wants to disable streaming, so let's create a new table:
with create_table_ss(dynamodb, dynamodbstreams, 'KEYS_ONLY') as (table, arn):
iterators = latest_iterators(dynamodbstreams, arn)
# Do an UpdateItem operation that is expected to leave one event in the
# stream.
table.update_item(Key={'p': random_string(), 'c': random_string()},
UpdateExpression='SET x = :x', ExpressionAttributeValues={':x': 5})
# Wait for this one update to become available in the stream before we
# disable the stream. Otherwise, theoretically (although unlikely in
# practice) we may disable the stream before the update was saved to it.
timeout = time.time() + 15
found = False
while time.time() < timeout and not found:
for iter in iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
if 'Records' in response and len(response['Records']) > 0:
found = True
break
time.sleep(0.5)
assert found
# Wait for this one update to become available in the stream before we
# disable the stream. Otherwise, theoretically (although unlikely in
# practice) we may disable the stream before the update was saved to it.
timeout = time.time() + 15
found = False
while time.time() < timeout and not found:
for iter in iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
if 'Records' in response and len(response['Records']) > 0:
found = True
break
time.sleep(0.5)
assert found
# Disable streaming for this table. Note that the test_table_ss_keys_only
# fixture has "function" scope so it is fine to ruin table, it will not
# be used in other tests.
disable_stream(dynamodbstreams, table)
disable_stream(dynamodbstreams, table)
# Check that the stream ARN which we previously got for the disabled
# stream is still listed by ListStreams
arns = [stream['StreamArn'] for stream in dynamodbstreams.list_streams(TableName=table.name)['Streams']]
assert arn in arns
# Check that the stream ARN which we previously got for the disabled
# stream is still listed by ListStreams
arns = [stream['StreamArn'] for stream in dynamodbstreams.list_streams(TableName=table.name)['Streams']]
assert arn in arns
# DescribeStream on the disabled stream still works and lists its shards.
# All these shards are listed as being closed (i.e., should have
# EndingSequenceNumber). The basic details of the stream (e.g., the view
# type) are available and the status of the stream is DISABLED.
response = dynamodbstreams.describe_stream(StreamArn=arn)['StreamDescription']
assert response['StreamStatus'] == 'DISABLED'
assert response['StreamViewType'] == 'KEYS_ONLY'
assert response['TableName'] == table.name
shards_info = response['Shards']
while 'LastEvaluatedShardId' in response:
response = dynamodbstreams.describe_stream(StreamArn=arn, ExclusiveStartShardId=response['LastEvaluatedShardId'])['StreamDescription']
# DescribeStream on the disabled stream still works and lists its shards.
# All these shards are listed as being closed (i.e., should have
# EndingSequenceNumber). The basic details of the stream (e.g., the view
# type) are available and the status of the stream is DISABLED.
response = dynamodbstreams.describe_stream(StreamArn=arn)['StreamDescription']
assert response['StreamStatus'] == 'DISABLED'
assert response['StreamViewType'] == 'KEYS_ONLY'
assert response['TableName'] == table.name
shards_info.extend(response['Shards'])
print('Number of shards in stream: {}'.format(len(shards_info)))
for shard in shards_info:
assert 'EndingSequenceNumber' in shard['SequenceNumberRange']
assert shard['SequenceNumberRange']['EndingSequenceNumber'].isdecimal()
shards_info = response['Shards']
while 'LastEvaluatedShardId' in response:
response = dynamodbstreams.describe_stream(StreamArn=arn, ExclusiveStartShardId=response['LastEvaluatedShardId'])['StreamDescription']
assert response['StreamStatus'] == 'DISABLED'
assert response['StreamViewType'] == 'KEYS_ONLY'
assert response['TableName'] == table.name
shards_info.extend(response['Shards'])
print('Number of shards in stream: {}'.format(len(shards_info)))
for shard in shards_info:
assert 'EndingSequenceNumber' in shard['SequenceNumberRange']
assert shard['SequenceNumberRange']['EndingSequenceNumber'].isdecimal()
# We can get TRIM_HORIZON iterators for all these shards, to read all
# the old data they still have (this data should be saved for 24 hours
# after the stream was disabled)
iterators = []
for shard in shards_info:
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator'])
# We can get TRIM_HORIZON iterators for all these shards, to read all
# the old data they still have (this data should be saved for 24 hours
# after the stream was disabled)
iterators = []
for shard in shards_info:
iterators.append(dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator'])
# We can read the one change we did in one of these iterators. The data
# should be available immediately - no need for retries with timeout.
nrecords = 0
for iter in iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
if 'Records' in response:
nrecords += len(response['Records'])
# The shard is closed, so NextShardIterator should either be missing
# now, indicating that it is a closed shard (DynamoDB does this),
# or, it may (and currently does in Alternator) return an iterator
# and reading from *that* iterator should then tell us that
# we reached the end of the shard (i.e., zero results and
# missing NextShardIterator).
if 'NextShardIterator' in response:
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
assert len(response['Records']) == 0
assert not 'NextShardIterator' in response
assert nrecords == 1
# We can read the one change we did in one of these iterators. The data
# should be available immediately - no need for retries with timeout.
nrecords = 0
for iter in iterators:
response = dynamodbstreams.get_records(ShardIterator=iter)
if 'Records' in response:
nrecords += len(response['Records'])
# The shard is closed, so NextShardIterator should either be missing
# now, indicating that it is a closed shard (DynamoDB does this),
# or, it may (and currently does in Alternator) return an iterator
# and reading from *that* iterator should then tell us that
# we reached the end of the shard (i.e., zero results and
# missing NextShardIterator).
if 'NextShardIterator' in response:
response = dynamodbstreams.get_records(ShardIterator=response['NextShardIterator'])
assert len(response['Records']) == 0
assert not 'NextShardIterator' in response
assert nrecords == 1
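Both tests above hand-roll the same retry idiom: set a deadline, poll, sleep briefly, fail on timeout. A generic helper capturing that pattern (the name and defaults here are illustrative, not part of the test suite):

```python
import time

def wait_for(condition, timeout_s=15.0, period_s=0.5):
    """Poll `condition` until it returns a truthy value or the deadline passes.

    Returns the truthy result, or raises TimeoutError on expiry.
    """
    deadline = time.monotonic() + timeout_s
    while True:
        result = condition()
        if result:
            return result
        if time.monotonic() >= deadline:
            raise TimeoutError(f"condition not met within {timeout_s:.1f}s")
        time.sleep(period_s)
```

Using `time.monotonic()` rather than `time.time()` avoids spurious timeouts if the wall clock jumps during the test.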
# When streams are enabled for a table, we get a unique ARN which should be
# unique but not change unless streams are eventually disabled for this table.

@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
// cfg.db_config->index_cache_fraction.set(1.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
// cfg.db_config->index_cache_fraction.set(0.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();

@@ -1058,6 +1058,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
});
}
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
return make_ready_future();
#endif
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
sharded<db::snapshot_ctl> sc;
sc.start(std::ref(e.db()), std::ref(e.get_storage_proxy()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
auto stop_sc = deferred_stop(sc);
auto& cf = e.local_db().find_column_family("ks", "cf");
take_snapshot(e).get();
utils::get_local_injector().enable("get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
auto details = cf.get_snapshot_details().get();
BOOST_REQUIRE_EQUAL(details.size(), 1);
});
}
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
return do_with_cql_env_thread([] (cql_test_env& e) {

@@ -23,8 +23,11 @@
#include "test/lib/tmpdir.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/exception_utils.hh"
#include "test/lib/limiting_data_source.hh"
#include "utils/io-wrappers.hh"
#include <seastar/util/memory-data-source.hh>
using namespace encryption;
static tmpdir dir;
@@ -595,6 +598,113 @@ SEASTAR_TEST_CASE(test_encrypted_data_source_simple) {
co_await test_random_data_source(sizes);
}
// Reproduces the production deadlock where encrypted SSTable component downloads
// got stuck during restore. The encrypted_data_source::get() caches a block in
// _next, then on the next call bypasses input_stream::read()'s _eof check and
// calls input_stream::read_exactly() — which does NOT check _eof when _buf is
// empty. This causes a second get() on the underlying source after EOS.
//
// In production the underlying source was chunked_download_source whose get()
// hung forever. Here we simulate it with a strict source that fails the test.
//
// The fix belongs in seastar's input_stream::read_exactly(): check _eof before
// calling _fd.get(), consistent with read(), read_up_to(), and consume().
static future<> test_encrypted_source_copy(size_t plaintext_size) {
testlog.info("test_encrypted_source_copy: plaintext_size={}", plaintext_size);
key_info info{"AES/CBC", 256};
auto k = ::make_shared<symmetric_key>(info);
// Step 1: Encrypt the plaintext into memory buffers
auto plaintext = generate_random<char>(plaintext_size);
std::vector<temporary_buffer<char>> encrypted_bufs;
{
data_sink sink(make_encrypted_sink(create_memory_sink(encrypted_bufs), k));
co_await sink.put(plaintext.clone());
co_await sink.close();
}
// Flatten encrypted buffers into a single contiguous buffer
size_t encrypted_total = 0;
for (const auto& b : encrypted_bufs) {
encrypted_total += b.size();
}
temporary_buffer<char> encrypted(encrypted_total);
size_t pos = 0;
for (const auto& b : encrypted_bufs) {
std::copy(b.begin(), b.end(), encrypted.get_write() + pos);
pos += b.size();
}
// Step 2: Create a data source from the encrypted data that fails on
// post-EOS get() — simulating a source like chunked_download_source
// that would hang forever in this situation.
class strict_memory_source final : public limiting_data_source_impl {
bool _eof = false;
public:
strict_memory_source(temporary_buffer<char> data, size_t chunk_size)
: limiting_data_source_impl(
data_source(std::make_unique<util::temporary_buffer_data_source>(std::move(data))),
chunk_size) {}
future<temporary_buffer<char>> get() override {
BOOST_REQUIRE_MESSAGE(!_eof,
"get() called on source after it already returned EOS — "
"this is the production deadlock: read_exactly() does not "
"check _eof before calling _fd.get()");
auto buf = co_await limiting_data_source_impl::get();
_eof = buf.empty();
co_return buf;
}
};
// Step 3: Wrap in encrypted_data_source and drain via consume() —
// the exact code path used by seastar::copy() which is what
// sstables_loader_helpers::download_sstable() calls.
// Try multiple chunk sizes to hit different alignment scenarios.
for (size_t chunk_size : {1ul, 7ul, 4096ul, 8192ul, encrypted_total, encrypted_total + 1}) {
if (chunk_size == 0) continue;
auto src = data_source(make_encrypted_source(
data_source(std::make_unique<strict_memory_source>(encrypted.clone(), chunk_size)), k));
auto in = input_stream<char>(std::move(src));
// consume() is what seastar::copy() uses internally. It calls
// encrypted_data_source::get() via _fd.get() until EOF.
size_t total_decrypted = 0;
co_await in.consume([&total_decrypted](temporary_buffer<char> buf) {
total_decrypted += buf.size();
return make_ready_future<consumption_result<char>>(continue_consuming{});
});
co_await in.close();
BOOST_REQUIRE_EQUAL(total_decrypted, plaintext_size);
}
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_8k) {
co_await test_encrypted_source_copy(8192);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_4k) {
co_await test_encrypted_source_copy(4096);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_small) {
co_await test_encrypted_source_copy(100);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_12k) {
co_await test_encrypted_source_copy(12288);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_unaligned) {
co_await test_encrypted_source_copy(8193);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_1byte) {
co_await test_encrypted_source_copy(1);
}
SEASTAR_TEST_CASE(test_encrypted_data_source_fuzzy) {
std::mt19937_64 rand_gen(std::random_device{}());

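The comment block above pins the deadlock to `input_stream::read_exactly()` calling the underlying source again after it already returned end-of-stream. A Python analogue (an illustration only, not seastar code) of the fixed control flow, with a strict source that asserts if polled after EOS, mirroring the test's `strict_memory_source`:

```python
class StrictSource:
    """Fails if get() is called again after end-of-stream was returned."""
    def __init__(self, chunks):
        self._chunks = list(chunks)
        self._eof = False
    def get(self):
        assert not self._eof, "get() after EOS -- the production deadlock"
        if not self._chunks:
            self._eof = True
            return b''
        return self._chunks.pop(0)

class InputStream:
    def __init__(self, source):
        self._source = source
        self._buf = b''
        self._eof = False
    def read_exactly(self, n):
        out = b''
        while len(out) < n:
            if not self._buf:
                if self._eof:  # the fix: check EOF before asking for more
                    break
                self._buf = self._source.get()
                self._eof = not self._buf
                if self._eof:
                    break
            take = min(n - len(out), len(self._buf))
            out += self._buf[:take]
            self._buf = self._buf[take:]
        return out
```

Without the `if self._eof: break` guard, a short read at EOS followed by another `read_exactly()` would poll the source again, which is exactly the post-EOS `get()` the strict source forbids.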
@@ -1004,7 +1004,20 @@ SEASTAR_TEST_CASE(memtable_flush_compresses_mutations) {
}, db_config);
}
SEASTAR_TEST_CASE(memtable_flush_period) {
static auto check_has_error_injection() {
return boost::unit_test::precondition([](auto){
return
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
true
#else
false
#endif
;
});
}
SEASTAR_TEST_CASE(memtable_flush_period, *check_has_error_injection()) {
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
auto db_config = make_shared<db::config>();
db_config->enable_cache.set(false);
return do_with_cql_env_thread([](cql_test_env& env) {
@@ -1028,6 +1041,9 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
t.apply(m);
BOOST_REQUIRE_EQUAL(t.sstables_count(), 0); // add mutation and check there are no sstables for this table
auto& errj = utils::get_local_injector();
errj.enable("table_seal_post_flush_waiters", true);
// change schema to set memtable flush period
// we use small value in this test but it is impossible to set the period less than 60000ms using ALTER TABLE construction
schema_builder b(t.schema());
@@ -1035,8 +1051,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
schema_ptr s2 = b.build();
t.set_schema(s2);
sleep(500ms).get(); // wait until memtable flush starts at least once
BOOST_REQUIRE(t.sstables_count() == 1 || t.get_stats().pending_flushes > 0); // flush started
BOOST_TEST_MESSAGE("Wait for flush");
errj.inject("table_seal_post_flush_waiters", utils::wait_for_message(std::chrono::minutes(2))).get();
BOOST_TEST_MESSAGE("Flush received");
BOOST_REQUIRE(eventually_true([&] { // wait until memtable will be flushed at least once
return t.sstables_count() == 1;
}));
@@ -1047,6 +1065,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
.produces(m)
.produces_end_of_stream();
}, db_config);
#else
BOOST_TEST_MESSAGE("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev)");
return make_ready_future<>();
#endif
}
SEASTAR_TEST_CASE(sstable_compaction_does_not_resurrect_data) {

@@ -15,6 +15,7 @@
#include "test/lib/cql_test_env.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/exception_utils.hh"
#include "test/lib/eventually.hh"
#include "db/config.hh"
#include <fmt/ranges.h>
@@ -200,6 +201,10 @@ public:
return _sem;
}
const replica::querier_cache::stats& get_stats() const {
return _cache.get_stats();
}
dht::partition_range make_partition_range(bound begin, bound end) const {
return dht::partition_range::make({_mutations.at(begin.value()).decorated_key(), begin.is_inclusive()},
{_mutations.at(end.value()).decorated_key(), end.is_inclusive()});
@@ -562,24 +567,21 @@ SEASTAR_THREAD_TEST_CASE(test_time_based_cache_eviction) {
const auto entry1 = t.produce_first_page_and_save_data_querier(1);
seastar::sleep(500ms).get();
BOOST_REQUIRE_EQUAL(t.get_stats().time_based_evictions, 0);
const auto entry2 = t.produce_first_page_and_save_data_querier(2);
// Don't waste time retrying before the TTL is up
sleep(1s).get();
seastar::sleep(700ms).get();
eventually_true([&t] {
auto stats = t.get_stats();
return stats.time_based_evictions == 1;
});
t.assert_cache_lookup_data_querier(entry1.key, *t.get_schema(), entry1.expected_range, entry1.expected_slice)
.misses()
.no_drops()
.time_based_evictions();
seastar::sleep(700ms).get();
t.assert_cache_lookup_data_querier(entry2.key, *t.get_schema(), entry2.expected_range, entry2.expected_slice)
.misses()
.no_drops()
.time_based_evictions();
// There should be no inactive reads, the querier_cache should unregister
// the expired queriers.
BOOST_REQUIRE_EQUAL(t.get_semaphore().get_stats().inactive_reads, 0);

@@ -26,6 +26,7 @@
#include <fmt/ranges.h>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/testing/on_internal_error.hh>
#undef SEASTAR_TESTING_MAIN
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
@@ -35,6 +36,13 @@
#include "replica/database.hh" // new_reader_base_cost is there :(
#include "db/config.hh"
// Provides access to private members of reader_concurrency_semaphore for testing.
struct reader_concurrency_semaphore_tester {
static void signal(reader_concurrency_semaphore& sem, reader_resources r) {
sem.signal(r);
}
};
BOOST_AUTO_TEST_SUITE(reader_concurrency_semaphore_test)
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
@@ -2595,4 +2603,35 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_preemptive_abort_requ
permit2 = {};
}
// Verify that signal() detects and corrects a negative resource leak.
// When a bug causes available resources to exceed initial resources
// after signal(), the semaphore should report the negative leak via
// on_internal_error_noexcept and clamp _resources back to _initial_resources
// so that consumed_resources() never goes negative.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_signal_detects_negative_resource_leak) {
const auto initial = reader_resources{2, 2048};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial.count, initial.memory);
auto stop_sem = deferred_stop(semaphore);
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial);
BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources{});
// Simulate a negative leak: signal more resources than were ever consumed.
// This would happen if a bug double-returned resources or inflated
// the amount returned to signal().
// signal() calls on_internal_error_noexcept which would abort in
// test mode, so temporarily disable that.
const auto leaked = reader_resources{1, 512};
{
seastar::testing::scoped_no_abort_on_internal_error no_abort;
reader_concurrency_semaphore_tester::signal(semaphore, leaked);
}
// signal() should have detected the over-return and clamped
// available resources back to initial.
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial);
BOOST_REQUIRE_EQUAL(semaphore.consumed_resources(), reader_resources{});
}
BOOST_AUTO_TEST_SUITE_END()
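A compact Python sketch (names are illustrative, not the semaphore's API) of the clamping behavior the test above verifies: a `signal()` that would push available resources past the initial budget reports the leak and clamps back to the initial resources, so consumed resources never go negative:

```python
class LeakClampSemaphore:
    """Tracks (count, memory) resources; over-returns are reported and clamped."""
    def __init__(self, count, memory):
        self._initial = (count, memory)
        self._available = [count, memory]
        self.leaks_reported = 0
    def consume(self, count, memory):
        self._available[0] -= count
        self._available[1] -= memory
    def signal(self, count, memory):
        self._available[0] += count
        self._available[1] += memory
        # A buggy caller double-returned resources: report the negative
        # leak and clamp back so consumed() can never go negative.
        if (self._available[0] > self._initial[0]
                or self._available[1] > self._initial[1]):
            self.leaks_reported += 1
            self._available = list(self._initial)
    def available(self):
        return tuple(self._available)
```

In the real semaphore the report goes through `on_internal_error_noexcept`, which is why the test wraps the call in `scoped_no_abort_on_internal_error`.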

@@ -982,21 +982,29 @@ BOOST_AUTO_TEST_CASE(s3_fqn_manipulation) {
}
BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
{
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 5_MiB), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("too many parts: 100352 > 10000");
});
}
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, s3::minimum_part_size), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with(format("too many parts: 100352 > {}", s3::maximum_parts_in_piece));
});
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with(format("part_size too small: 4194304 is smaller than minimum part size: {}", s3::minimum_part_size));
});
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(s3::maximum_object_size + 1, 0), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with(
format("object size too large: {} is larger than maximum S3 object size: {}", s3::maximum_object_size + 1, s3::maximum_object_size));
});
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, s3::maximum_part_size + 1), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with(
format("part_size too large: {} is larger than maximum part size: {}", s3::maximum_part_size + 1, s3::maximum_part_size));
});
size_t total_size = s3::minimum_part_size * (s3::maximum_parts_in_piece + 1); // 10001 parts at 5 MiB
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(total_size, s3::minimum_part_size), std::runtime_error, [](auto& e) {
return std::string(e.what()).starts_with(format("too many parts: 10001 > {}", s3::maximum_parts_in_piece));
});
{
auto [parts, size] = s3::calc_part_size(490_GiB, 100_MiB);
BOOST_REQUIRE_EQUAL(size, 100_MiB);
BOOST_REQUIRE(parts == 5018);
}
{
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("part_size too small: 4194304 is smaller than minimum part size: 5242880");
});
}
{
auto [parts, size] = s3::calc_part_size(50_MiB, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
@@ -1013,24 +1021,14 @@ BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
BOOST_REQUIRE(parts == 9839);
}
{
auto [parts, size] = s3::calc_part_size(50_MiB * 10000, 0);
auto [parts, size] = s3::calc_part_size(50_MiB * s3::maximum_parts_in_piece, 0);
BOOST_REQUIRE_EQUAL(size, 50_MiB);
BOOST_REQUIRE_EQUAL(parts, 10000);
BOOST_REQUIRE_EQUAL(parts, s3::maximum_parts_in_piece);
}
{
auto [parts, size] = s3::calc_part_size(50_MiB * 10000 + 1, 0);
auto [parts, size] = s3::calc_part_size(50_MiB * s3::maximum_parts_in_piece + 1, 0);
BOOST_REQUIRE(size > 50_MiB);
BOOST_REQUIRE(parts <= 10000);
}
{
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(50_TiB, 0), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("object size too large: 54975581388800 is larger than maximum S3 object size: 53687091200000");
});
}
{
BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, 5_GiB + 1), std::runtime_error, [](const std::runtime_error& e) {
return std::string(e.what()).starts_with("part_size too large: 5368709121 is larger than maximum part size: 5368709120");
});
BOOST_REQUIRE(parts <= s3::maximum_parts_in_piece);
}
{
auto [parts, size] = s3::calc_part_size(5_TiB, 0);
@@ -1038,21 +1036,16 @@ BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
BOOST_REQUIRE_EQUAL(size, 525_MiB);
}
{
auto [parts, size] = s3::calc_part_size(5_MiB * 10000, 5_MiB);
BOOST_REQUIRE_EQUAL(size, 5_MiB);
BOOST_REQUIRE_EQUAL(parts, 10000);
}
{
size_t total = 5_MiB * 10001; // 10001 parts at 5 MiB
BOOST_REQUIRE_EXCEPTION(
s3::calc_part_size(total, 5_MiB), std::runtime_error, [](auto& e) { return std::string(e.what()).starts_with("too many parts: 10001 > 10000"); });
auto [parts, size] = s3::calc_part_size(s3::minimum_part_size * s3::maximum_parts_in_piece, s3::minimum_part_size);
BOOST_REQUIRE_EQUAL(size, s3::minimum_part_size);
BOOST_REQUIRE_EQUAL(parts, s3::maximum_parts_in_piece);
}
{
size_t total = 500_GiB + 123; // odd size to force non-MiB alignment
auto [parts, size] = s3::calc_part_size(total, 0);
BOOST_REQUIRE(size % 1_MiB == 0); // aligned
BOOST_REQUIRE(parts <= 10000);
BOOST_REQUIRE(parts <= s3::maximum_parts_in_piece);
}
{
auto [parts, size] = s3::calc_part_size(6_MiB, 0);

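The assertions above pin down the limits `s3::calc_part_size` enforces: a 5 MiB minimum part, a 5 GiB maximum part, at most 10000 parts, and a maximum object of 5 GiB x 10000. A hedged Python sketch of just the validation rules for an explicitly chosen part size (the `part_size == 0` auto-sizing branch is deliberately omitted, since its exact rounding policy is not fully visible in this diff):

```python
MiB = 1 << 20
GiB = 1 << 30
MIN_PART = 5 * MiB            # s3::minimum_part_size
MAX_PART = 5 * GiB            # s3::maximum_part_size
MAX_PARTS = 10_000            # s3::maximum_parts_in_piece
MAX_OBJECT = MAX_PART * MAX_PARTS  # 53687091200000 bytes

def calc_part_size(total_size, part_size):
    """Validate an explicit part size against the S3 multipart limits."""
    if total_size > MAX_OBJECT:
        raise RuntimeError(f"object size too large: {total_size} is larger "
                           f"than maximum S3 object size: {MAX_OBJECT}")
    if part_size < MIN_PART:
        raise RuntimeError(f"part_size too small: {part_size} is smaller "
                           f"than minimum part size: {MIN_PART}")
    if part_size > MAX_PART:
        raise RuntimeError(f"part_size too large: {part_size} is larger "
                           f"than maximum part size: {MAX_PART}")
    parts = -(-total_size // part_size)  # ceiling division
    if parts > MAX_PARTS:
        raise RuntimeError(f"too many parts: {parts} > {MAX_PARTS}")
    return parts, part_size
```

For example, 490 GiB at 100 MiB parts gives ceil(501760 / 100) = 5018 parts, matching the expectation in the test, while 490 GiB at the 5 MiB minimum would need 100352 parts and is rejected.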
@@ -676,7 +676,7 @@ SEASTAR_TEST_CASE(test_system_schema_version_is_stable) {
// If you changed the schema of system.batchlog then this is expected to fail.
// Just replace expected version with the new version.
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("1f504ac7-350f-37aa-8a9e-105b1325d8e3")));
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("c3f984e4-f886-3616-bb80-f8c68ed93595")));
});
}

Some files were not shown because too many files have changed in this diff Show More