Compare commits

...

498 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
39b6ff982c topology_coordinator: suppress cancel warning in should_preempt_balancing
Agent-Logs-Url: https://github.com/scylladb/scylladb/sessions/ff8e4ba3-e470-4446-8a15-9f173b22c277

Co-authored-by: tgrabiec <283695+tgrabiec@users.noreply.github.com>
2026-04-10 19:25:21 +00:00
Michał Hudobski
7d648961ed vector_search: forward non-primary key restrictions to Vector Store service
Include non-primary key restrictions (e.g. regular column filters) in
the filter JSON sent to the Vector Store service. Previously only
partition key and clustering column restrictions were forwarded, so
filtering on regular columns was silently ignored.

Add get_nonprimary_key_restrictions() getter to statement_restrictions.

Add unit tests for non-primary key equality, range, and bind marker
restrictions in filter_test.

Fixes: SCYLLADB-970

Closes scylladb/scylladb#29019
2026-04-10 17:16:29 +02:00
Piotr Dulikowski
3bd770d4d9 Merge 'counters: reuse counter IDs by rack' from Michael Litvak
For counter updates, use a counter ID that is constructed from the
node's rack instead of the node's host ID.

A rack can have at most two active tablet replicas at a time: a single
normal tablet replica, and during tablet migration there are two active
replicas, the normal and pending replica. Therefore we can have two
unique counter IDs per rack that are reused by all replicas in the rack.

We construct the counter ID from the rack UUID, which is constructed
from the name "dc:rack". The pending replica uses a deterministic
variation of the rack's counter ID by negating it.

This improves the performance and size of counter cells by having fewer
unique counter IDs and fewer counter shards in a counter cell.

Previously the number of counter shards was the number of distinct
host_ids that updated the counter, which is typically the number of
nodes in the cluster and grows indefinitely as nodes are replaced.
With the rack-based counter ID, the number of counter shards will be
at most twice the number of distinct racks (including removed racks,
which should not be significant).
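
The scheme above can be sketched in isolation (a hypothetical illustration: `counter_id_for_rack` and `pending_variant` are made-up names, and std::hash stands in for the real name-based UUID construction):

```cpp
#include <cstdint>
#include <functional>
#include <string>

// Toy counter ID; the real one is a UUID derived from the "dc:rack" name.
struct counter_id {
    uint64_t msb;
    uint64_t lsb;
    bool operator==(const counter_id& o) const { return msb == o.msb && lsb == o.lsb; }
};

// One deterministic ID per rack, shared by all replicas in the rack.
counter_id counter_id_for_rack(const std::string& dc, const std::string& rack) {
    uint64_t h = std::hash<std::string>{}(dc + ":" + rack);
    return counter_id{h, ~h};
}

// The pending replica uses a deterministic variation of the rack's ID
// (negation), so normal and pending replicas never collide.
counter_id pending_variant(counter_id id) {
    return counter_id{~id.msb, ~id.lsb};
}
```

Since both IDs are pure functions of the rack name, replicas need no coordination to agree on them.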

Fixes SCYLLADB-356

backport not needed - an enhancement

Closes scylladb/scylladb#28901

* github.com:scylladb/scylladb:
  docs/dev: add counters doc
  counters: reuse counter IDs by rack
2026-04-10 12:24:18 +02:00
Wojciech Mitros
163c6f71d6 transport: refactor result_message bounce interface
Replace move_to_shard()/move_to_host() with as_bounce()/target_shard()/
target_host() to clarify the interface after bounce was extended to
support cross-node bouncing.

- Add virtual as_bounce() returning const bounce* to the base class
  (nullptr by default, overridden in bounce to return this), replacing
  the virtual move_to_shard() which conflated bounce detection with
  shard access
- Rename move_to_shard() -> target_shard() (now non-virtual, returns
  unsigned directly) and move_to_host() -> target_host() on bounce
- Replace dynamic_pointer_cast with static_pointer_cast at call sites
  that already checked as_bounce()
- Move forward declarations of message types before the virtual
  methods so as_bounce() can reference bounce
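
A condensed sketch of the resulting interface shape (hypothetical and simplified; the real classes carry more state, and `try_get_target_shard` is an illustrative helper, not actual code):

```cpp
#include <memory>

struct result_message {
    virtual ~result_message() = default;
    struct bounce;
    // Returns nullptr by default; only the bounce subclass overrides it,
    // separating "is this a bounce?" from shard/host access.
    virtual const bounce* as_bounce() const { return nullptr; }
};

struct result_message::bounce : result_message {
    unsigned shard;
    explicit bounce(unsigned s) : shard(s) {}
    const bounce* as_bounce() const override { return this; }
    // Non-virtual: call sites have already established the type.
    unsigned target_shard() const { return shard; }
};

// Call-site shape: after checking as_bounce(), static_pointer_cast is
// safe, so no dynamic_pointer_cast is needed.
inline bool try_get_target_shard(const std::shared_ptr<result_message>& msg, unsigned& out) {
    if (msg->as_bounce()) {
        out = std::static_pointer_cast<result_message::bounce>(msg)->target_shard();
        return true;
    }
    return false;
}
```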

Fixes: SCYLLADB-1066

Closes scylladb/scylladb#29367
2026-04-10 12:17:43 +02:00
Piotr Dulikowski
32e3a01718 Merge 'service: strong_consistency: Allow for aborting operations' from Dawid Mędrek
Motivation
----------

Since strongly consistent tables are based on the concept of Raft
groups, operations on them can get stuck for indefinite amounts of
time. That may be problematic, and so we'd like to implement a way
to cancel those operations at suitable times.

Description of solution
-----------------------

The situations we focus on are the following:

* Timed-out queries
* Leader changes
* Tablet migrations
* Table drops
* Node shutdowns

We handle each of them and provide validation tests.

Implementation strategy
-----------------------

1. Auxiliary commits.
2. Abort operations on timeout.
3. Abort operations on tablet removal.
4. Extend `client_state`.
5. Abort operation on shutdown.
6. Help `state_machine` be aborted as soon as possible.

Tests
-----

We provide tests that validate the correctness of the solution.

The total time spent on `test_strong_consistency.py`
(measured on my local machine, dev mode):

Before:
```
real    0m31.809s
user    1m3.048s
sys     0m21.812s
```

After:
```
real    0m34.523s
user    1m10.307s
sys     0m27.223s
```

The incremental differences in time can be found in the commit messages.

Fixes SCYLLADB-429

Backport: not needed. This is an enhancement to an experimental feature.

Closes scylladb/scylladb#28526

* github.com:scylladb/scylladb:
  service: strong_consistency: Abort state_machine::apply when aborting server
  service: strong_consistency: Abort ongoing operations when shutting down
  service: client_state: Extend with abort_source
  service: strong_consistency: Handle abort when removing Raft group
  service: strong_consistency: Abort Raft operations on timeout
  service: strong_consistency: Use timeout when mutating
  service: strong_consistency: Fix indentation
  service: strong_consistency: Enclose coordinator methods with try-catch
  service: strong_consistency: Crash at unexpected exception
  test: cluster: Extract default config & cmdline in test_strong_consistency.py
2026-04-10 11:11:21 +02:00
Pavel Emelyanov
0b336da89d Revert "cmake: add missing rolling_max_tracker_test and symmetric_key_test"
This reverts commit 8b4a91982b.

Two commits independently added rolling_max_tracker_test to test/boost/CMakeLists.txt:
8b4a919 cmake: add missing rolling_max_tracker_test and symmetric_key_test
f3a91df test/cmake: add missing tests to boost test suite

The second was merged two days after the first. They didn't conflict at the
code level and applied cleanly, resulting in duplicate add_scylla_test()
entries that break the CMake build:

    CMake Error: add_executable cannot create target
    "test_boost_rolling_max_tracker_test" because another target
    with the same name already exists.

Remove the duplicate.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Reported-by: Łukasz Paszkowski <lukasz.paszkowski@scylladb.com>
2026-04-10 11:19:43 +03:00
Patryk Jędrzejczak
751bf31273 Merge 'More gossiper cleanups' from Gleb Natapov
The PR contains more code cleanups, mostly in the gossiper. More gossiper states are dropped, leaving only NORMAL and SHUTDOWN; all other states are now checked against the raft topology state. Those two remain because the SHUTDOWN state is propagated only through the gossiper, and a node that is not in SHUTDOWN should be in some other state.

No need to backport. Cleanups.

Closes scylladb/scylladb#29129

* https://github.com/scylladb/scylladb:
  storage_service: cleanup unused code
  storage_service: simplify get_peer_info_for_update
  gossiper: send shutdown notifications in parallel
  gms: remove unused code
  virtual_tables: no need to call gossiper if we already know that the node is in shutdown
  gossiper: print node state from raft topology in the logs
  gossiper: use is_shutdown instead of code it manually
  gossiper: mark endpoint_state(inet_address ip) constructor as explicit
  gossiper: remove unused code
  gossiper: drop last use of LEFT state and drop the state
  gossiper: drop unused STATUS_BOOTSTRAPPING state
  gossiper: rename is_dead_state to is_left since this is all that the function checks now.
  gossiper: use raft topology state instead of gossiper one when checking node's state
  storage_service: drop check_for_endpoint_collision function
  storage_service: drop is_first_node function
  gossiper: remove unused REMOVED_TOKEN state
  gossiper: remove unused advertise_token_removed function
2026-04-10 09:56:20 +02:00
Nadav Har'El
6674aa29ca Merge 'Add Cassandra SAI (StorageAttachedIndex) compatibility' from Szymon Wasik
Cassandra's native vector index type is StorageAttachedIndex (SAI). Libraries such as CassIO, LangChain, and LlamaIndex generate `CREATE CUSTOM INDEX` statements using the SAI class name. Previously, ScyllaDB rejected these with "Non-supported custom class".

This PR adds compatibility so that SAI-style CQL statements work on ScyllaDB without modification.

1. **test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests**
   Enables the `SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS` Cassandra system property so that `search_beam_width` tests pass against Cassandra 5.0.7.

2. **test: modernize vector index test comments and fix xfail**
   Updates test comments from "Reproduces" to "Validates fix for" for clarity, and converts the `test_ann_query_with_pk_restriction` xfail into a stripped-down CREATE INDEX syntax test (removing unused INSERT/SELECT lines). Removes the redundant `test_ann_query_with_non_pk_restriction` test.

3. **cql: add Cassandra SAI (StorageAttachedIndex) compatibility**
   Core implementation: the SAI class name is detected and translated to ScyllaDB's native `vector_index`. The fully-qualified class name (`org.apache.cassandra.index.sai.StorageAttachedIndex`) requires exact case; short names (`StorageAttachedIndex`, `sai`) are matched case-insensitively — matching Cassandra's behavior. Non-vector and multi-column SAI targets are rejected with clear errors. Adds `skip_on_scylla_vnodes` fixture, SAI compatibility docs, and the Cassandra compatibility table entry (split into "SAI general" vs "SAI for vector search").

4. **cql: accept source_model option for Cassandra SAI compatibility**
   The `source_model` option is a Cassandra SAI property used by Cassandra libraries (e.g., CassIO) to tag vector indexes with the name of the embedding model. ScyllaDB accepts it for compatibility but does not use it — the validator is a no-op lambda. The option is preserved in index metadata and returned in DESCRIBE INDEX output.
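
The class-name matching described in point 3 can be sketched as follows (a hypothetical helper, not the actual ScyllaDB code: the fully-qualified name requires an exact match, while short names are matched case-insensitively):

```cpp
#include <algorithm>
#include <cctype>
#include <string>

static std::string to_lower(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    return s;
}

// True if the CREATE CUSTOM INDEX class name should be treated as SAI.
bool is_sai_class_name(const std::string& name) {
    // The fully-qualified Java class name must match exactly.
    if (name == "org.apache.cassandra.index.sai.StorageAttachedIndex") {
        return true;
    }
    // Short names are accepted in any case, mirroring Cassandra.
    auto lower = to_lower(name);
    return lower == "storageattachedindex" || lower == "sai";
}
```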

- `cql3/statements/create_index_statement.cc`: SAI class detection and rewriting logic
- `index/secondary_index_manager.cc`: case-insensitive class name lookup (lowercasing restored before `classes.find()`)
- `index/vector_index.cc`: `source_model` accepted as a valid option with no-op validator
- `docs/cql/secondary-indexes.rst`: SAI compatibility documentation with `source_model` table row
- `docs/using-scylla/cassandra-compatibility.rst`: SAI entry split into general (not supported) and vector search (supported)
- `test/cqlpy/conftest.py`: `scylla_with_tablets` renamed to `skip_on_scylla_vnodes`
- `test/cqlpy/test_vector_index.py`: SAI tests inlined (no constants), `check_bad_option()` helper for numeric validation, uppercase class name test, merged `source_model` tests with DESCRIBE check

| Backend            | Passed | Skipped | Failed |
|--------------------|--------|---------|--------|
| ScyllaDB (dev)     | 42     | 0       | 0      |
| Cassandra 5.0.7    | 16     | 26      | 0      |

Backport: not needed (new feature).

Fixes: SCYLLADB-239

Closes scylladb/scylladb#28645

* github.com:scylladb/scylladb:
  cql: accept source_model option and show options in DESCRIBE
  cql: add Cassandra SAI (StorageAttachedIndex) compatibility
  test: modernize vector index test comments and fix xfail
  test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests
2026-04-10 10:21:20 +03:00
Avi Kivity
f67d0739d0 test: user_function_test: adjust Lua error message tests
Lua 5.5 changed the error message slightly ("?:-1" -> "?:?"). Relax
the error message tests so they do not depend on this unimportant fragment.

Closes scylladb/scylladb#29414
2026-04-10 01:09:35 +03:00
Piotr Szymaniak
98d6edaa88 alternator: add comment explaining delta_mode::keys in add_stream_options()
Clarify that cdc::delta_mode is ignored by Alternator, so we use the
least expensive mode (keys) to reduce overhead.

Fixes scylladb/scylladb#24812

Closes scylladb/scylladb#29408
2026-04-10 01:07:21 +03:00
Michał Hudobski
c8b9fde828 auth: allow VECTOR_SEARCH_INDEXING permission to access system.tablets
Add system.tablets to the set of system resources that can be
accessed with the VECTOR_SEARCH_INDEXING permission.

Fixes: VECTOR-605

Closes scylladb/scylladb#29397
2026-04-09 21:53:07 +03:00
Szymon Wasik
573def7cd8 cql: accept source_model option and show options in DESCRIBE
Accept the Cassandra SAI 'source_model' option for vector indexes.
This option is used by Cassandra libraries (e.g., CassIO, LangChain)
to tag vector indexes with the name of the embedding model that
produced the vectors.

ScyllaDB does not use the source_model value but stores it and
includes it in the DESCRIBE INDEX output for Cassandra compatibility.

Additionally, extend vector_index::describe() to emit a
WITH OPTIONS = {...} clause containing all user-provided index options
(filtering out system keys: target, class_name, index_version).
This makes options like similarity_function, source_model, etc.
visible in DESCRIBE output.
2026-04-09 17:20:03 +02:00
Szymon Wasik
80a2e4a0ab cql: add Cassandra SAI (StorageAttachedIndex) compatibility
Libraries such as CassIO, LangChain, and LlamaIndex create vector
indexes using Cassandra's StorageAttachedIndex (SAI) class name.
This commit lets ScyllaDB accept these statements without modification.

When a CREATE CUSTOM INDEX statement specifies an SAI class name on a
vector column, ScyllaDB automatically rewrites it to the native
vector_index implementation. Accepted class names (case-insensitive):
  - org.apache.cassandra.index.sai.StorageAttachedIndex
  - StorageAttachedIndex
  - sai

SAI on non-vector columns is rejected with a clear error directing
users to a secondary index instead.

The SAI detection and rewriting logic is extracted into a dedicated
static function (maybe_rewrite_sai_to_vector_index) to keep the
already-long validate_while_executing method manageable.

Multi-column (local index) targets and nonexistent columns are
skipped with continue — the former are treated as filtering columns
by vector_index::check_target(), and the latter are caught later by
vector_index::validate().

Tests that exercise features common to both backends (basic creation,
similarity_function, IF NOT EXISTS, bad options, etc.) now use the
SAI class name with the skip_on_scylla_vnodes fixture so they run
against both ScyllaDB and Cassandra. ScyllaDB-specific tests continue
to use USING 'vector_index' with scylla_only.
2026-04-09 17:20:03 +02:00
Szymon Wasik
fa7edc627c test: modernize vector index test comments and fix xfail
- Change 'Reproduces' to 'Validates fix for' in test comments to
  reflect that the referenced issues are already fixed.
- Condense the VECTOR-179 comment to two lines.
- Replace the xfailed test_ann_query_with_restriction_works_only_on_pk
  with a focused test (test_ann_query_with_pk_restriction) that creates
  a vector index on a table with a PK column restriction, validating
  the VECTOR-374 fix.
2026-04-09 17:20:02 +02:00
Szymon Wasik
4eab050be4 test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests 2026-04-09 17:20:02 +02:00
Andrzej Jackowski
23c386a27f test: perf: add audit-unix-socket-path to perf-simple-query
To allow performance benchmarking with custom syslog sinks.

Example use case:

-- Audit + default syslog: ~100k tps
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30 --audit "syslog" --audit-keyspace "ks" --audit-categories "DCL,DDL,AUTH,DML,QUERY"

```
110263.72 tps ( 66.1 allocs/op,  16.0 logallocs/op,  25.7 tasks/op,  254900 insns/op,  144796 cycles/op,        0 errors)
throughput:
	mean=   107137.48 standard-deviation=3142.98
	median= 106665.00 median-absolute-deviation=1786.03
	maximum=111435.19 minimum=97620.79
instructions_per_op:
	mean=   256311.36 standard-deviation=5037.13
	median= 256288.09 median-absolute-deviation=2223.08
	maximum=274220.89 minimum=248141.40
cpu_cycles_per_op:
	mean=   146443.47 standard-deviation=2844.19
	median= 146001.85 median-absolute-deviation=1514.82
	maximum=157177.54 minimum=142981.03
```

-- Audit + custom syslog: ~400k tps
socat -u UNIX-RECV:/tmp/audit-null.sock,type=2 OPEN:/dev/null
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30 --audit "syslog" --audit-keyspace "ks" --audit-categories "DCL,DDL,AUTH,DML,QUERY" --audit-unix-socket-path /tmp/audit-null.sock

```
404929.62 tps ( 65.9 allocs/op,  16.0 logallocs/op,  25.5 tasks/op,   77406 insns/op,   35559 cycles/op,        0 errors)
throughput:
	mean=   399868.39 standard-deviation=6232.88
	median= 401770.65 median-absolute-deviation=3859.09
	maximum=406126.79 minimum=383434.84
instructions_per_op:
	mean=   77481.26 standard-deviation=168.31
	median= 77405.54 median-absolute-deviation=84.33
	maximum=78081.46 minimum=77332.84
cpu_cycles_per_op:
	mean=   35871.32 standard-deviation=516.83
	median= 35699.70 median-absolute-deviation=251.15
	maximum=37454.86 minimum=35432.60
```

-- No audit: ~800k tps
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30

```
808970.95 tps ( 53.3 allocs/op,  16.0 logallocs/op,  14.9 tasks/op,   49904 insns/op,   20471 cycles/op,        0 errors)
throughput:
	mean=   809065.31 standard-deviation=6222.39
	median= 810507.10 median-absolute-deviation=1827.99
	maximum=815213.41 minimum=782104.84
instructions_per_op:
	mean=   49905.50 standard-deviation=21.81
	median= 49900.12 median-absolute-deviation=7.72
	maximum=50010.97 minimum=49892.57
cpu_cycles_per_op:
	mean=   20429.00 standard-deviation=41.40
	median= 20425.18 median-absolute-deviation=29.11
	maximum=20530.74 minimum=20355.42
```

Closes scylladb/scylladb#29396
2026-04-09 16:00:41 +03:00
Anna Stuchlik
c6587c6a70 doc: Fix malformed markdown link in alternator network docs
Fixes https://github.com/scylladb/scylladb/issues/29400

Closes scylladb/scylladb#29402
2026-04-09 15:54:43 +03:00
Botond Dénes
5886d1841a Merge 'cmake: align CMake build system with configure.py and add comparison script' from Ernest Zaslavsky
Every time someone modifies the build system — adding a source file, changing a compilation flag, or wiring a new test — the change tends to land in only one of our two build systems (configure.py or CMake). Over time this causes three classes of problems:

1. **CMake stops compiling entirely.** Missing defines, wrong sanitizer flags, or misplaced subdirectory ordering cause hard build failures that are only discovered when someone tries to use CMake (e.g. for IDE integration).

2. **Missing build targets.** Tests or binaries present in configure.py are never added to CMake, so `cmake --build` silently skips them. This PR fixes several such cases (e.g. `symmetric_key_test`, `auth_cache_test`, `sstable_tablet_streaming`).

3. **Missing compilation units in targets.** A `.cc` file is added to a test binary in one system but not the other, causing link errors or silently omitted test coverage.

To fix the existing drift and prevent future divergence, this series:

**Adds a build-system comparison script**
(`scripts/compare_build_systems.py`) that configures both systems into a temporary directory, parses their generated `build.ninja` files, and compares per-file compilation flags, link target sets, and per-target libraries. configure.py is treated as the baseline; CMake must match it. The script supports a `--ci` mode suitable for gating PRs that touch
build files.

**Fixes all current mismatches** found by the script:
- Mode flag alignment in `mode.common.cmake` and `mode.Coverage.cmake`
  (sanitizer flags, `-fno-lto`, stack-usage warnings, coverage defines).
- Global define alignment (`SEASTAR_NO_EXCEPTION_HACK`, `XXH_PRIVATE_API`,
  `BOOST_ALL_DYN_LINK`, `SEASTAR_TESTING_MAIN` placement).
- Seastar build configuration (shared vs static per mode, coverage
  sanitizer link options).
- Abseil sanitizer flags (`-fno-sanitize=vptr`).
- Missing test targets in `test/boost/CMakeLists.txt`.
- Redundant per-test flags now covered by global settings.
- Lua library resolution via a custom `cmake/FindLua.cmake` using
  pkg-config, matching configure.py's approach.

**Adds documentation** (`docs/dev/compare-build-systems.md`) describing how to run the script and interpret its output.

No backport needed — this is build infrastructure improvement only.

Closes scylladb/scylladb#29273

* github.com:scylladb/scylladb:
  scripts: remove lua library rename workaround from comparison script
  cmake: add custom FindLua using pkg-config to match configure.py
  test/cmake: add missing tests to boost test suite
  test/cmake: remove per-test LTO disable
  cmake: add BOOST_ALL_DYN_LINK and strip per-component defines
  cmake: move SEASTAR_TESTING_MAIN after seastar and abseil subdirs
  cmake: add -fno-sanitize=vptr for abseil sanitizer flags
  cmake: align Seastar build configuration with configure.py
  cmake: align global compile defines and options with configure.py
  cmake: fix Coverage mode in mode.Coverage.cmake
  cmake: align mode.common.cmake flags with configure.py
  configure.py: add sstable_tablet_streaming to combined_tests
  docs: add compare-build-systems.md
  scripts: add compare_build_systems.py to compare ninja build files
2026-04-09 15:46:09 +03:00
Yaniv Michael Kaul
13879b023f tracing: set_skip_when_empty() for error-path metrics
Add .set_skip_when_empty() to all error-path metrics in the tracing
module. Tracing itself is not a commonly used feature, making all of
these metrics almost always zero:

Tier 1 (very rare - corruption/schema issues):
- tracing_keyspace_helper::bad_column_family_errors: tracing schema
  missing or incompatible, should never happen post-bootstrap
- tracing::trace_errors: internal error building trace parameters

Tier 2 (overload - tracing backend saturated):
- tracing::dropped_sessions: too many pending sessions
- tracing::dropped_records: too many pending records

Tier 3 (general tracing write errors):
- tracing_keyspace_helper::tracing_errors: errors during writes to
  system_traces keyspace

Since tracing is an opt-in feature that most deployments rarely use,
all five metrics are almost always zero and create unnecessary
reporting overhead.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29346
2026-04-09 14:28:16 +03:00
Michael Litvak
3964040008 docs/dev: add counters doc
Add a documentation of the counters feature implementation in
docs/dev/counters.md.

The documentation is taken from the wiki and updated according to the
current state of the code - legacy details are removed, and a section
about the counter id is added.
2026-04-09 13:08:02 +02:00
Michael Litvak
b71762d5da counters: reuse counter IDs by rack
For counter updates, use a counter ID that is constructed from the
node's rack instead of the node's host ID.

A rack can have at most two active tablet replicas at a time: a single
normal tablet replica, and during tablet migration there are two active
replicas, the normal and pending replica. Therefore we can have two
unique counter IDs per rack that are reused by all replicas in the rack.

We construct the counter ID from the rack UUID, which is constructed
from the name "dc:rack". The pending replica uses a deterministic
variation of the rack's counter ID by negating it.

This improves the performance and size of counter cells by having fewer
unique counter IDs and fewer counter shards in a counter cell.

Previously the number of counter shards was the number of distinct
host_ids that updated the counter, which is typically the number of
nodes in the cluster and grows indefinitely as nodes are replaced.
With the rack-based counter ID, the number of counter shards will be
at most twice the number of distinct racks (including removed racks,
which should not be significant).

Fixes SCYLLADB-356
2026-04-09 13:08:02 +02:00
Yaniv Michael Kaul
2c0076d3ef replica: set_skip_when_empty() for rare error-path metrics
Add .set_skip_when_empty() to four metrics in replica/database.cc that
are only incremented on very rare error paths and are almost always zero:

- database::dropped_view_updates: view updates dropped due to overload.
  NOTE: this metric appears to never be incremented in the current
  codebase and may be a candidate for removal.
- database::multishard_query_failed_reader_stops: documented as a 'hard
  badness counter' that should always be zero. NOTE: no increment site
  was found in the current codebase; may be a candidate for removal.
- database::multishard_query_failed_reader_saves: documented as a 'hard
  badness counter' that should always be zero.
- database::total_writes_rejected_due_to_out_of_space_prevention: only
  fires when disk utilization is critical and user table writes are
  disabled, a very rare operational state.

These metrics create unnecessary reporting overhead when they are
perpetually zero. set_skip_when_empty() suppresses them from metrics
output until they become non-zero.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29345
2026-04-09 14:07:28 +03:00
Botond Dénes
86417d49de Merge 'transport: improve memory accounting for big responses and slow network' from Marcin Maliszkiewicz
After obtaining the CQL response, check if its actual size exceeds the initially acquired memory permit. If so, acquire additional semaphore units and adopt them into the permit, ensuring accurate memory accounting for large responses.

Additionally, move the permit into a .then() continuation so that the semaphore units are kept alive until write_message finishes, preventing premature release of the memory permit. This matters especially with slow networks and big responses, when buffers can accumulate and deplete a node's memory.
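
The size-adoption step can be illustrated with a toy semaphore (hypothetical types; the real code uses Seastar semaphore units):

```cpp
#include <cassert>
#include <cstddef>

// Toy memory semaphore: tracks how many bytes are still available.
struct memory_semaphore {
    size_t available;
    size_t take(size_t units) { assert(units <= available); available -= units; return units; }
    void give_back(size_t units) { available += units; }
};

// Toy permit: releases its units back to the semaphore when destroyed,
// standing in for the permit held until the response write completes.
struct memory_permit {
    memory_semaphore* sem;
    size_t units;
    void adopt_extra(size_t extra) { units += sem->take(extra); }
    ~memory_permit() { sem->give_back(units); }
};

// If the actual response size exceeds the initial estimate, acquire the
// difference and fold it into the permit so accounting stays accurate.
void account_actual_size(memory_permit& permit, size_t actual_size) {
    if (actual_size > permit.units) {
        permit.adopt_extra(actual_size - permit.units);
    }
}
```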

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1306
Related https://scylladb.atlassian.net/browse/SCYLLADB-740

Backport: all supported versions

Closes scylladb/scylladb#29288

* github.com:scylladb/scylladb:
  transport: add per-service-level pending response memory metric
  transport: hold memory permit until response write completes
  transport: account for response size exceeding initial memory estimate
2026-04-09 13:36:31 +03:00
Yaniv Michael Kaul
5c8b4a003e db: set_skip_when_empty() for rare error-path metrics
Add .set_skip_when_empty() to four metrics in the db module that are
only incremented on very rare error paths and are almost always zero:

- cache::pinned_dirty_memory_overload: described as 'should sit
  constantly at 0, nonzero is indicative of a bug'
- corrupt_data::entries_reported: only fires on actual data corruption
- hints::corrupted_files: only fires on on-disk hint file corruption
- rate_limiter::failed_allocations: only fires when the rate limiter
  hash table is completely full and gives up allocating, requiring
  extreme cardinality pressure

These metrics create unnecessary reporting overhead when they are
perpetually zero. set_skip_when_empty() suppresses them from metrics
output until they become non-zero.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29344
2026-04-09 13:32:09 +03:00
Gleb Natapov
dbaba7ab8a storage_service: cleanup unused code
Remove unused definition and double includes.
2026-04-09 13:31:41 +03:00
Gleb Natapov
b050b593b3 storage_service: simplify get_peer_info_for_update
It does nothing for fields managed in raft, so drop their processing.
2026-04-09 13:31:41 +03:00
Gleb Natapov
d0576c109f gossiper: send shutdown notifications in parallel 2026-04-09 13:31:40 +03:00
Gleb Natapov
1586fa65af gms: remove unused code
Also move version_string(...) and make_token_string(...) to private: they are internal helpers used only by normal(), not part of the public API.
2026-04-09 13:31:40 +03:00
Gleb Natapov
b2e35c538f virtual_tables: no need to call gossiper if we already know that the node is in shutdown 2026-04-09 13:31:40 +03:00
Gleb Natapov
e17fc180a0 gossiper: print node state from raft topology in the logs
Raft topology has the real node state now. Gossiper state is now set to
NORMAL or SHUTDOWN only.
2026-04-09 13:31:40 +03:00
Gleb Natapov
8439154851 gossiper: use is_shutdown instead of coding it manually 2026-04-09 13:31:39 +03:00
Gleb Natapov
7d700d0377 gossiper: mark endpoint_state(inet_address ip) constructor as explicit
The get_live_members function called is_shutdown with an inet_address
argument, which caused a temporary endpoint_state to be created. Fix
it by prohibiting the implicit conversion and calling the correct
is_shutdown overload instead.
2026-04-09 13:31:39 +03:00
Gleb Natapov
6df4f572d5 gossiper: remove unused code 2026-04-09 13:31:39 +03:00
Gleb Natapov
67102496c8 gossiper: drop last use of LEFT state and drop the state
Decommission sets the LEFT gossiper state only to prevent a shutdown
notification from being issued by the node during shutdown. Since the
notification code now checks the state in raft topology, this is no
longer needed.
2026-04-09 13:31:39 +03:00
Gleb Natapov
54d2c95094 gossiper: drop unused STATUS_BOOTSTRAPPING state 2026-04-09 13:31:38 +03:00
Gleb Natapov
7c895ced19 gossiper: rename is_dead_state to is_left since this is all that the function checks now. 2026-04-09 13:31:38 +03:00
Gleb Natapov
7dfb0577b8 gossiper: use raft topology state instead of gossiper one when checking node's state
Raft topology state is the source of truth for a node's state, so use
it instead of the gossiper's.
2026-04-09 13:31:38 +03:00
Gleb Natapov
c17c4806a1 storage_service: drop check_for_endpoint_collision function
All the checks it performs are also done by the join coordinator, which
uses the more reliable raft state instead of the gossiper's.
2026-04-09 13:31:37 +03:00
Gleb Natapov
1ac8edb22b storage_service: drop is_first_node function
It makes no sense now, since the first node to bootstrap is determined
by the discover_group0 algorithm.
2026-04-09 13:31:37 +03:00
Gleb Natapov
681aa9ebe1 gossiper: remove unused REMOVED_TOKEN state 2026-04-09 13:31:37 +03:00
Gleb Natapov
5af17aa578 gossiper: remove unused advertise_token_removed function 2026-04-09 13:31:36 +03:00
Dawid Mędrek
f0dfe29d88 service: strong_consistency: Abort state_machine::apply when aborting server
The state machine used by strongly consistent tablets may need to
resolve pending mutations that the local schema is not yet sufficient
to interpret [1]. To deal with that, we perform a read barrier, which
may block for a long time.

When a strongly consistent tablet is being removed, we'd like to cancel
all ongoing executions of `state_machine::apply`: the shard is no
longer responsible for the tablet, so it doesn't matter what the outcome
is.

---

In the implementation, we abort the operations by simply throwing
an exception from `state_machine::apply` without doing anything else.
That's a red flag, considering that it may lead to the instance
being killed on the spot [2].

Fortunately for us, strongly consistent tables use the default Raft
server implementation, i.e. `raft::server_impl`, which actually
handles one type of an exception thrown by the method: namely,
`abort_requested_exception`, which is the default exception thrown
by `seastar::abort_source` [3]. We leverage this property.
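
The exception-based cancellation pattern can be illustrated generically (plain C++, not the actual Seastar/Raft code; names are hypothetical):

```cpp
#include <atomic>
#include <stdexcept>

// Stand-in for the exception thrown by seastar::abort_source.
struct abort_requested_exception : std::runtime_error {
    abort_requested_exception() : std::runtime_error("abort requested") {}
};

struct state_machine {
    std::atomic<bool> aborted{false};
    void apply() {
        if (aborted.load()) {
            // The one exception type the server loop tolerates.
            throw abort_requested_exception{};
        }
        // ... resolve and apply pending mutations ...
    }
};

// Server-loop shape: this one exception type means clean cancellation;
// any other exception would propagate and kill the instance.
bool run_applier(state_machine& sm) {
    try {
        sm.apply();
        return true;
    } catch (const abort_requested_exception&) {
        return true;  // cleanly aborted
    }
}
```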

---

Unfortunately, `raft::server_impl::abort` isn't perfectly suited for
us. If we look into its code, we'll see that the relevant portion of
the procedure boils down to three steps:

1. Prevent scheduling adding new entries.
2. Wait for the applier fiber.
3. Abort the state machine.

Since aborting the state machine happens only after the applier fiber
has already finished, there will no longer be anything to abort. Either
all executions of `state_machine::apply` have already finished, or they
are hanging and we cannot do anything.

That's a pre-existing problem that we won't be solving here (even
though it's possible). We hope the problem will be solved, and it seems
likely: the code suggests that the behavior is not intended. For more
details, see e.g. [4].

---

We provide two validation tests. They simulate aborting
`state_machine::apply` in two different scenarios:

* when the table is dropped (which should also cover the case of tablet
  migration),
* when the node is shutting down.

The value of the tests isn't high since they don't ensure that the
state of the group is still valid (though it should be), nor do they
perform any other check. Instead, we rely on the testing framework to
spot any anomalies or errors. That's probably the best we can do at
the moment.

Unfortunately, both tests are marked as skipped because of the current
limitations of `raft::server_impl::abort` described above and in [4].

References:
[1] 4c8dba1
[2] See the description of `raft::state_machine` in `raft/raft.hh`.
[3] See `server_impl::applier_fiber` in `raft/server.cc`.
[4] SCYLLADB-1056
2026-04-09 11:36:51 +02:00
Dawid Mędrek
ad8a263683 service: strong_consistency: Abort ongoing operations when shutting down
These changes are complementary to those from a recent commit where we
handled aborting ongoing operations during tablet events, such as
tablet migration. In this commit, we consider the case of shutting down
a node.

When a node is shutting down, we eventually close the connections. When
the client can no longer get a response from the server, it makes no
sense to continue with the queries. We'd like to cancel them at that
point.

We leverage the abort source passed down via `client_state` down to
the strongly consistent coordinator. This way, the transport layer can
communicate with it and signal that the queries should be canceled.
The abort source is triggered by the CQL server (cf.
`generic_server::server::{stop,shutdown}`).
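The plumbing can be sketched in Python under stated assumptions (this is a hypothetical model of `client_state` carrying an optional abort source, not the actual C++ code):

```python
import threading
from dataclasses import dataclass
from typing import Optional

@dataclass
class ClientState:
    # Hypothetical mirror of client_state holding an optional abort source.
    abort_source: Optional[threading.Event] = None

def run_query(state: ClientState, steps: int) -> str:
    # A long-running operation periodically checks the abort source, the
    # way the strongly consistent coordinator would between Raft calls.
    for _ in range(steps):
        if state.abort_source is not None and state.abort_source.is_set():
            return "aborted"
    return "done"
```

On shutdown, the server triggers the shared abort source once, and every in-flight query observes it at its next check.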

---

Note that this is not an optional change. In fact, if we don't abort
those requests, we might hang for an indefinite amount of time when
executing the following code in `main.cc`:

```
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
    ss.local().drain_on_shutdown().get();
});
```

The problem boils down to the fact that `generic_server::server::stop`
will wait for all connections to be closed, but that won't happen until
all ongoing operations (at least those to strongly consistent tables)
are finished.

It's important to highlight that even though we hang on this, the
client can no longer get any response. Thus, it's crucial that at that
point we simply abort ongoing operations to proceed with the rest of
shutdown.

---

Two tests are added to verify that the implementation is correct:
one focusing on local operations, the other on a forwarded write.

Difference in time spent on the whole test file
`test_strong_consistency.py` on my local machine, in dev mode:

Before:
```
real    0m31.775s
user    1m4.475s
sys     0m22.615s
```

After:
```
real    0m32.024s
user    1m10.751s
sys     0m23.871s
```

Individual runs of the added tests:

test_queries_when_shutting_down:
```
real    0m12.818s
user    0m36.726s
sys     0m4.577s
```

test_abort_forwarded_write_upon_shutdown:
```
real    0m12.930s
user    0m36.622s
sys     0m4.752s
```
2026-04-09 11:36:17 +02:00
Dawid Mędrek
4a87bdc778 service: client_state: Extend with abort_source
We make `client_state` store a pointer to an `abort_source`. This will
be useful in the following commit that will implement aborting ongoing
requests to strongly consistent tables upon connection shutdowns.
It might also be useful in some other places in the code in the future.

We set the abort source for client states in relevant places.
2026-04-09 11:35:35 +02:00
Dawid Mędrek
89c049b889 service: strong_consistency: Handle abort when removing Raft group
When a strongly consistent Raft group is being removed, it means one of
the following cases:

(A) The node is shutting down and it's simply part of the shutdown
    procedure.

(B) The tablet is somehow leaving the replica. For example, due to:
    - Tablet migration
    - Tablet split/merge
    - Tablet removal (e.g. because the table is dropped)

In this commit, we focus on case (B). Case (A) will be handled in the
following one.

---

There are almost no changes in the code, and there's a reason for that.

First, let's note that we've already implemented aborting timed-out
requests. There is a limit to how long a query can run, so sooner or
later it will finish, regardless of what we do.

Second, we need to ask ourselves whether the case we're considering in
this commit (i.e. case (B)) is a situation where we'd like to speed up
the process. The answer is no.

Tablet migrations are effectively internal operations that are invisible
to the users. User requests are, quite obviously, the opposite of that.
Because of that, we want to patiently wait for the queries to finish or
time out, even though it's technically possible to trigger an abort
earlier.

Lastly, the changes in the code that actually appear in this commit are
not completely irrelevant either. We consider the important case of
the `leader_info_updater` fiber and argue that it's safe to not pass
any abort source to the Raft methods used by it.

---

Unfortunately, we don't have tablet migrations implemented yet [1],
so our testing capabilities are limited. Still, we provide a new test
that corresponds to case (B) described above. We simulate a tablet
migration by dropping a table and observe how reads and writes behave
in such a situation. There's no extremely careful validation involved
there, but that's what we can have for the time being.

Difference in time spent on the whole test file
`test_strong_consistency.py` on my local machine, in dev mode:

Before:
```
real  0m30.841s
user  1m3.294s
sys   0m21.091s
```

After:
```
real    0m31.775s
user    1m4.475s
sys     0m22.615s
```

The time spent on the new test only:
```
real    0m5.264s
user    0m34.646s
sys     0m3.374s
```

References:
[1] SCYLLADB-868
2026-04-09 11:35:31 +02:00
Dawid Mędrek
7dcc3e85b9 service: strong_consistency: Abort Raft operations on timeout
If a query, either a write, or a read to a strongly consistent table,
times out, we immediately abort the operation and throw an exception.

Unfortunately, due to the inconsistency in exception types thrown
on timeout by the many methods we use in the code, it results in
pretty messy `try-catch` clauses. Perhaps there's a better alternative
to this, but it's beyond the scope of this work, so we leave it as-is.

We provide a validation test that consists of three cases corresponding
to reads, writes, and waiting for the leader. They verify that the code
works as expected in all affected places.

A comparison of time spent on the whole `test_strong_consistency.py` on
my local machine, in dev mode:

Before:
```
real    0m32.185s
user    0m55.391s
sys     0m15.745s
```

After:
```
real  0m30.841s
user  1m3.294s
sys   0m21.091s
```

The time spent on the new test only:
```
real  0m7.077s
user  0m35.359s
sys   0m3.717s
```
2026-04-09 11:35:04 +02:00
Piotr Szymaniak
65a1bdd368 docs: document Alternator auditing in the operator-facing auditing guide
- Document Alternator (DynamoDB-compatible API) auditing support in
  the operator-facing auditing guide (docs/operating-scylla/security/auditing.rst)
- Cover operation-to-category mapping, operation field format,
  keyspace/table filtering, and audit log examples
- Document the audit_tables=alternator.<table> shorthand format
- Minor wording improvements throughout (Scylla -> ScyllaDB,
  clarify default audit backend)

Closes scylladb/scylladb#29231
2026-04-09 12:26:57 +03:00
Dawid Mędrek
2243e0ffea service: strong_consistency: Use timeout when mutating
We remove the inconsistency between reads and writes to strongly
consistent tables. Before the commit, only reads used a timeout.
Now, writes do as well.

Although the parameter isn't used yet, that will change in the following
commit. This is a prerequisite for it.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
fd9c907be1 service: strong_consistency: Fix indentation 2026-04-09 11:25:57 +02:00
Dawid Mędrek
ca7f24516e service: strong_consistency: Enclose coordinator methods with try-catch
We enclose `coordinator::{mutate,query}` with `try-catch` clauses. They
do nothing at the moment, but we'll use them later. We do this now to
avoid noise in the upcoming commits.

We'll fix the indentation in the following commit.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
e9ea9e7259 service: strong_consistency: Crash at unexpected exception
The loop shouldn't throw any exception other than the ones already
covered by the `catch` clauses. Crash, at least when
`abort_on_internal_error` is set, if we catch any other type since
that may be a sign of a bug.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
f499a629ab test: cluster: Extract default config & cmdline in test_strong_consistency.py
All used configs and cmdlines share the same values. Let's extract them
to avoid repeating them every time a new test is written. Those options
should be enabled for all tests in the file anyway.
2026-04-09 11:25:57 +02:00
Geoff Montee
7d7ec7025e docs: Document system keyspaces for developers / internal usage
Fixes #29043 with the following docs changes:

- docs/dev/system-keyspaces.md: Added a new file that documents all keyspaces created internally

Closes scylladb/scylladb#29044
2026-04-09 11:49:58 +03:00
Guy Shtub
40a861016a docs/faq.rst: Fixing small spelling mistake
Closes scylladb/scylladb#29131
2026-04-09 11:48:46 +03:00
Pavel Emelyanov
78f5bab7cf table: Add formatter for group_id argument in tablet merge exception message
Fixes: SCYLLADB-1432

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29143
2026-04-09 11:45:57 +03:00
Botond Dénes
fbbe2bdce8 Merge 'Introduce repair_service::config and cut dependency from db::config' from Pavel Emelyanov
Spreading db::config around and making all services depend on it is not nice. Most other services that need configuration provide their own config that's populated from db::config in main.cc/cql_test_env.cc and use it, not the global config.

This PR does the same for repair_service.

Enhancing component dependencies, not backporting

Closes scylladb/scylladb#29153

* github.com:scylladb/scylladb:
  repair: Remove db/config.hh from repair/*.cc files
  repair: Move repair_multishard_reader options onto repair_service::config
  repair: Move critical_disk_utilization_level onto repair_service::config
  repair: Move repair_partition_count_estimation_ratio onto repair_service::config
  repair: Move repair_hints_batchlog_flush_cache_time_in_ms onto repair_service::config
  repair: Move enable_small_table_optimization_for_rbno onto repair_service::config
  repair: Introduce repair_service::config
2026-04-09 11:44:25 +03:00
Botond Dénes
76c8794f4f Merge 'Strong consistency: allow taking snapshots (but not transfer) and make them less likely' from Piotr Dulikowski
While working on benchmarks for strong consistency we noticed that the raft logic attempted to take snapshots during the benchmark. Snapshot transfer is not implemented for strong consistency yet and the methods that take or transfer snapshots throw exceptions. This causes the raft groups to stop working completely.

While implementing snapshot transfers is out of scope, we can implement some mitigations now to stop the tests from breaking:

- The first commit adjusts the configuration options. First, it disables periodic snapshotting (i.e. creating a snapshot every X log entries). Second, it increases the memory threshold for the raft log before which a snapshot is created from 2MB to 10MB.
- The second commit relaxes the take snapshot / drop snapshot methods and makes it possible to actually use them - they are no-ops. It is still forbidden to transfer snapshots.

I am including both commits because applying only the first one didn't completely prevent the issue from occurring when testing locally.

Refs: SCYLLADB-1115

Strong consistency is experimental, no need for backport.

Closes scylladb/scylladb#29189

* github.com:scylladb/scylladb:
  strong_consistency: fake taking and dropping snapshots
  strong_consistency: adjust limits for snapshots
2026-04-09 11:44:03 +03:00
Anna Stuchlik
dd34d2afb4 doc: remove references to old versions from Docker Hub docs
This commit removes references to ScyllaDB versions ("Since x.y")
from the ScyllaDB documentation on Docker Hub, as they are
redundant and confusing (some versions are super ancient).

Fixes SCYLLADB-1212

Closes scylladb/scylladb#29204
2026-04-09 11:43:40 +03:00
Botond Dénes
c162277b28 Merge 'Perform full connection set-up for CertificateAuthorization in process_startup()' from Pavel Emelyanov
The code responds early with a READY message, but lacks some necessary set-up, namely:

* update_scheduling_group(): without it, the connection runs under the default scheduling group instead of the one mapped to the user's service level.

* on_connection_ready(): without it, the connection never releases its slot in the uninitialized-connections concurrency semaphore (acquired at connection creation), leaking one unit per cert-authenticated connection for the lifetime of the connection.

* _authenticating = false / _ready = true: without them, system.clients reports connection_stage = AUTHENTICATING forever instead of READY (not critical, but not nice either)

The PR fixes it and adds a regression test that (for sanity) also covers the AllowAll and Password authenticators

Fixes SCYLLADB-1226

Present since 2025.1, probably worth backporting

Closes scylladb/scylladb#29220

* github.com:scylladb/scylladb:
  transport: fix process_startup cert-auth path missing connection-ready setup
  transport: test that connection_stage is READY after auth via all process_startup paths
2026-04-09 11:43:02 +03:00
Raphael S. Carvalho
16e387d5f9 repair/replica: Fix race window where post-repair data is wrongly promoted to repaired
During incremental repair, each tablet replica holds three SSTable views:
UNREPAIRED, REPAIRING, and REPAIRED.  The repair lifecycle is:

  1. Replicas snapshot unrepaired SSTables and mark them REPAIRING.
  2. Row-level repair streams missing rows between replicas.
  3. mark_sstable_as_repaired() runs on all replicas, rewriting the
     SSTables with repaired_at = sstables_repaired_at + 1 (e.g. N+1).
  4. The coordinator atomically commits sstables_repaired_at=N+1 and
     the end_repair stage to Raft, then broadcasts
     repair_update_compaction_ctrl which calls clear_being_repaired().

The bug lives in the window between steps 3 and 4.  After step 3, each
replica has on-disk SSTables with repaired_at=N+1, but sstables_repaired_at
in Raft is still N.  The classifier therefore sees:

  is_repaired(N, sst{repaired_at=N+1}) == false
  sst->being_repaired == null   (lost on restart, or not yet set)

and puts them in the UNREPAIRED view.  If a new write arrives and is
flushed (repaired_at=0), STCS minor compaction can fire immediately and
merge the two SSTables.  The output gets repaired_at = max(N+1, 0) = N+1
because compaction preserves the maximum repaired_at of its inputs.

Once step 4 commits sstables_repaired_at=N+1, the compacted output is
classified REPAIRED on the affected replica even though it contains data
that was never part of the repair scan.  Other replicas, which did not
experience this compaction, classify the same rows as UNREPAIRED.  This
divergence is never healed by future repairs because the repaired set is
considered authoritative.  The result is data resurrection: deleted rows
can reappear after the next compaction that merges unrepaired data with the
wrongly-promoted repaired SSTable.

The fix has two layers:

Layer 1 (in-memory, fast path): mark_sstable_as_repaired() now also calls
mark_as_being_repaired(session) on the new SSTables it writes.  This keeps
them in the REPAIRING view from the moment they are created until
repair_update_compaction_ctrl clears the flag after step 4, covering the
race window in the normal (no-restart) case.

Layer 2 (durable, restart-safe): a new is_being_repaired() helper on
tablet_storage_group_manager detects the race window even after a node
restart, when being_repaired has been lost from memory.  It checks:

  sst.repaired_at == sstables_repaired_at + 1
  AND tablet transition kind == tablet_transition_kind::repair

Both conditions survive restarts: repaired_at is on-disk in SSTable
metadata, and the tablet transition is persisted in Raft.  Once the
coordinator commits sstables_repaired_at=N+1 (step 4), is_repaired()
returns true and the SSTable naturally moves to the REPAIRED view.
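The two-layer classification can be sketched in Python. The `is_repaired` predicate here is inferred from the description (an SSTable is repaired once its on-disk `repaired_at` is covered by the committed `sstables_repaired_at`); all names and shapes are illustrative, not the actual classifier code:

```python
REPAIR = "repair"  # stand-in for tablet_transition_kind::repair

def is_repaired(sstables_repaired_at, sst_repaired_at):
    # Assumed predicate: repaired once the committed watermark covers
    # the SSTable's on-disk repaired_at (0 means never repaired).
    return sst_repaired_at != 0 and sst_repaired_at <= sstables_repaired_at

def is_being_repaired(sstables_repaired_at, sst_repaired_at, transition_kind):
    # Durable race-window check: both inputs survive a restart
    # (repaired_at in SSTable metadata, the transition kind in Raft).
    return (sst_repaired_at == sstables_repaired_at + 1
            and transition_kind == REPAIR)

def classify(sstables_repaired_at, sst_repaired_at, transition_kind):
    if is_repaired(sstables_repaired_at, sst_repaired_at):
        return "REPAIRED"
    if is_being_repaired(sstables_repaired_at, sst_repaired_at, transition_kind):
        return "REPAIRING"
    return "UNREPAIRED"
```

In the race window (watermark still N=1, SSTable already rewritten with repaired_at=2, transition kind `repair`), the SSTable lands in REPAIRING rather than UNREPAIRED, which is the whole point of the fix.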

The classifier in make_repair_sstable_classifier_func() is updated to call
is_being_repaired(sst, sstables_repaired_at) in place of the previous
sst->being_repaired.uuid().is_null() check.

A new test, test_incremental_repair_race_window_promotes_unrepaired_data,
reproduces the bug by:
  - Running repair round 1 to establish sstables_repaired_at=1.
  - Injecting delay_end_repair_update to hold the race window open.
  - Running repair round 2 so all replicas complete mark_sstable_as_repaired
    (repaired_at=2) but the coordinator has not yet committed step 4.
  - Writing post-repair keys to all replicas and flushing servers[1] to
    create an SSTable with repaired_at=0 on disk.
  - Restarting servers[1] so being_repaired is lost from memory.
  - Waiting for autocompaction to merge the two SSTables on servers[1].
  - Asserting that the merged SSTable contains post-repair keys (the bug)
    and that servers[0] and servers[2] do not see those keys as repaired.

NOTE FOR MAINTAINER: Copilot initially only implemented Layer 1 (the
in-memory being_repaired guard), missing the restart scenario entirely.
I pointed out that being_repaired is lost on restart and guided Copilot
to add the durable Layer 2 check.  I also polished the implementation:
moving is_being_repaired into tablet_storage_group_manager so it can
reuse the already-held _tablet_map (avoiding an ERM lookup and try/catch),
passing sstables_repaired_at in from the classifier to avoid re-reading it,
and using compaction_group_for_sstable inside the function rather than
threading a tablet_id parameter through the classifier.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1239.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29244
2026-04-09 11:42:28 +03:00
Dawid Mędrek
a8bc90a375 Merge 'cql3: fix DESCRIBE INDEX WITH INTERNALS name' from Piotr Smaron
This series fixes two related inconsistencies around secondary-index
names.
1. `DESCRIBE INDEX ... WITH INTERNALS` returned the backing
   materialized-view name in the `name` column instead of the logical
   index name.
2. The snapshot REST API accepted backing table names for MV-backed
   secondary indexes, but not the logical index names exposed to users.

The snapshot side now resolves logical secondary-index names to backing
table names where applicable, reports logical index names in snapshot
details, rejects vector index names with HTTP 400, and keeps multi-keyspace
DELETE atomic by resolving all keyspaces before deleting anything.
The tests were also extended accordingly, and the snapshot test helper
was fixed to clean up multi-table snapshots using one DELETE per table.

Fixes: SCYLLADB-1122

Minor bugfix, no need to backport.

Closes scylladb/scylladb#29083

* github.com:scylladb/scylladb:
  cql3: fix DESCRIBE INDEX WITH INTERNALS name
  test: add snapshot REST API tests for logical index names
  test: fix snapshot cleanup helper
  api: clarify snapshot REST parameter descriptions
  api: surface no_such_column_family as HTTP 400
  db: fix clear_snapshot() atomicity and use C++23 lambda form
  db: normalize index names in get_snapshot_details()
  db: add resolve_table_name() to snapshot_ctl
2026-04-09 08:37:51 +03:00
Piotr Dulikowski
ec0231c36c Merge 'db/view/view_building_worker: lock staging sstables mutex for all necessary shards when creating tasks' from Michał Jadwiszczak
To create `process_staging` view building tasks, we first need to collect information about them on shard 0, create the necessary mutations, commit them to group0 and move the staging sstable objects to their original shards.

But there is a possible race after committing the group0 command and before moving the staging sstables to their shards. Between those two events, the coordinator may schedule freshly created tasks and dispatch them to the worker but the worker won't have the sstables objects because they weren't moved yet.

This patch fixes the race by holding `_staging_sstables_mutex` locks from all necessary shards while executing `create_staging_sstable_tasks()`. With this, even if a task is scheduled and dispatched quickly, the worker will wait to execute it until the sstable objects have been moved and the locks are released.

Fixes SCYLLADB-816

This PR should be backported to all versions containing view building coordinator (2025.4 and newer).

Closes scylladb/scylladb#29174

* github.com:scylladb/scylladb:
  db/view/view_building_worker: fix indentation
  db/view/view_building_worker: lock staging sstables mutex for necessary shards when creating tasks
2026-04-09 08:37:51 +03:00
Piotr Smaron
d458ff50b0 cql3: fix DESCRIBE INDEX WITH INTERNALS name
DESCRIBE INDEX ... WITH INTERNALS returned the name of
the backing materialized view in the name column instead
of the logical index name.

Return the logical index name from schema::describe()
for index schemas so all callers observe the
user-facing name consistently.

Fixes: SCYLLADB-1122
2026-04-08 13:38:17 +02:00
Piotr Smaron
04837ba20f test: add snapshot REST API tests for logical index names
Add focused REST coverage for logical secondary-index
names in snapshot creation, deletion, and details
output.

Also cover vector-index rejection and verify
multi-keyspace delete resolves all keyspaces before
deleting anything so mixed index kinds cannot cause
partial removal.
2026-04-08 13:38:17 +02:00
Piotr Smaron
6b85da3ce3 test: fix snapshot cleanup helper
The snapshot REST helper cleaned up multi-table
snapshots with a single DELETE request that passed a
comma-separated cf filter, but the API accepts only one
table name there.

Delete each table snapshot separately so existing tests
that snapshot multiple tables use the API as
documented.
2026-04-08 13:36:27 +02:00
Piotr Smaron
3090684dad api: clarify snapshot REST parameter descriptions
Document the current /storage_service/snapshots behavior
more accurately.

For DELETE, cf is a table filter applied independently
in each keyspace listed in kn. If cf is omitted or
empty, snapshots for all tables are eligible, and
secondary indexes can be addressed by their logical
index name.
2026-04-08 13:36:27 +02:00
Piotr Smaron
6ee75c74bd api: surface no_such_column_family as HTTP 400
Snapshot requests that name a non-existent table or a
non-snapshotable logical index currently surface an
internal server error.

Translate no_such_column_family into a bad request so
callers get a client-facing error that matches the
invalid input.
2026-04-08 13:36:27 +02:00
Piotr Smaron
7d83a264ac db: fix clear_snapshot() atomicity and use C++23 lambda form
clear_snapshot() applies a table filter independently in
each keyspace, so logical index names must be resolved
per keyspace on the delete path as well.

Resolve all keyspaces before deleting anything so a later
failure cannot partially remove a snapshot, and use the
explicit-object-parameter coroutine lambda form for the
asynchronous implementation.
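The resolve-before-delete pattern can be sketched as follows (a minimal Python sketch; `resolve` and `delete` are hypothetical stand-ins for the per-keyspace name resolution and the actual snapshot removal):

```python
def clear_snapshots(names, resolve, delete):
    # Resolve every name up front; any failure raises before a single
    # delete happens, so a bad name cannot cause partial removal.
    resolved = [resolve(name) for name in names]
    for backing_table in resolved:
        delete(backing_table)
```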
2026-04-08 13:36:27 +02:00
Piotr Smaron
39baa1870e db: normalize index names in get_snapshot_details()
Snapshot details exposed backing secondary-index view
names instead of logical index names.

Normalize index entries in get_snapshot_details() so the
REST API reports the user-facing name, and update the
existing REST test to assert that behavior directly.
2026-04-08 13:36:27 +02:00
Piotr Smaron
9c37f1def2 db: add resolve_table_name() to snapshot_ctl
The snapshot REST API accepted backing secondary-index
table names, but not logical index names.

Introduce resolve_table_name() so snapshot creation can
translate a logical index name to the backing table when
the index is materialized as a view.
2026-04-08 13:36:27 +02:00
Petr Gusev
7750d5737c strong consistency: replace local consistency with global
Currently we don't support 'local' consistency, which would
imply maintaining separate raft group for each dc. What we
support is actually 'global' consistency -- one raft group
per tablet replica set. We don't plan to support local
consistency for the first GA.

Closes scylladb/scylladb#29221
2026-04-08 12:52:32 +02:00
Patryk Jędrzejczak
850db950f8 Merge 'raft: include demoted voters in read barrier during joint config' from Qian Cheng
Hi, thanks for Scylla!

We found a small issue in tracker::set_configuration() during joint consensus and put together a fix.

When a server is demoted from voter to non-voter, set_configuration processes the current config first (can_vote=false), then the previous config. But when it finds the server already in the progress map (tracker.cc:118), it hits `continue` without updating can_vote. So the server's follower_progress::can_vote stays false even though it's still a voter in the previous config.

This causes broadcast_read_quorum (fsm.cc:1055) to skip the demoted server, reducing the pool of responders. Since committed() correctly includes the server in _previous_voters for quorum calculation, read barriers can stall if other servers are slow.

The fix is to use configuration::can_vote() in tracker::set_configuration.
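A toy Python model of the progress-map construction makes the mismatch visible (shapes and names are illustrative, not the raft C++ code; `fixed=True` models the `configuration::can_vote()` behavior):

```python
def set_configuration(current, previous, fixed):
    # current/previous map server id -> is_voter in that config; the
    # result maps server id -> follower_progress.can_vote.
    progress = {}
    for cfg in (current, previous):
        for server, is_voter in cfg.items():
            if server in progress:
                if fixed:
                    # configuration::can_vote(): voter in either config.
                    progress[server] = progress[server] or is_voter
                continue  # buggy path: stale can_vote is left untouched
            progress[server] = is_voter
    return progress
```

For a server demoted from voter to non-voter, the buggy path leaves `can_vote=False` from the current config even though the previous config still counts it toward the read quorum.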

We included a reproduction unit test (test_tracker_voter_demotion_joint_config) that extracts the set_configuration algorithm and demonstrates the mismatch. We weren't able to build the full Scylla test suite to add an in-tree test, so we kept it as a standalone file for reference.

No backport: the bug is non-critical and the change needs some soak time in master.

Closes scylladb/scylladb#29226

* https://github.com/scylladb/scylladb:
  fix: use is_voter::yes instead of true in test assertions
  test: add tracker voter demotion test to fsm_test.cc
  fix: use configuration::can_vote() in tracker::set_configuration
2026-04-08 12:37:27 +02:00
Qian-Cheng-nju
a416238155 test: add tracker voter demotion test to fsm_test.cc 2026-04-08 12:37:19 +02:00
Qian-Cheng-nju
f72528c759 raft: use configuration::can_vote() in tracker::set_configuration 2026-04-08 12:37:16 +02:00
Michał Jadwiszczak
568f20396a test: fix flaky test_create_index_synchronous_updates trace event race
The test_create_index_synchronous_updates test in test_secondary_index_properties.py
was intermittently failing with 'assert found_wanted_trace' because the expected
trace event 'Forcing ... view update to be synchronous' was missing from the
trace events returned by get_query_trace().

Root cause: trace events are written asynchronously to system_traces.events.
The Python driver's populate() method considers a trace complete once the
session row in system_traces.sessions has duration IS NOT NULL, then reads
events exactly once. Since the session row and event rows are written as
separate mutations with no transactional guarantee, the driver can read an
incomplete set of events.

Evidence from the failed CI run logs:
- The entire test (CREATE TABLE through DROP TABLE) completed in ~300ms
  (01:38:54,859 - 01:38:55,157)
- The INSERT with tracing happened in a ~50ms window between the second
  CREATE INDEX completing (01:38:55,108) and DROP TABLE starting
  (01:38:55,157)
- The 'Forcing ... synchronous' trace message is generated during the
  INSERT write path (db/view/view.cc:2061), so it was produced, but
  not yet flushed to system_traces.events when the driver read them
- This matches the known limitation documented in test/alternator/
  test_tracing.py: 'we have no way to know whether the tracing events
  returned is the entire trace'

Fix: replace the single-shot trace.events read with a retry loop that
directly queries system_traces.events until the expected event appears
(with a 30s timeout). Use ConsistencyLevel.ONE since system_traces has
RF=2 and cqlpy tests run on a single-node cluster.
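The retry loop can be sketched like this (a hedged sketch; `query_events` abstracts the repeated CQL read of `system_traces.events`, and the parameter names are illustrative):

```python
import time

def wait_for_trace_event(query_events, wanted_substring, timeout_s=30.0):
    # Trace events are written asynchronously, so poll until the expected
    # event shows up or the timeout expires.
    deadline = time.monotonic() + timeout_s
    while True:
        events = query_events()
        if any(wanted_substring in event for event in events):
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(0.1)
```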

The same race condition pattern exists in test_mv_synchronous_updates in
test_materialized_view.py (which this test was modeled after), so the
same fix is proactively applied there as well.

Fixes SCYLLADB-1314

Closes scylladb/scylladb#29374
2026-04-08 12:35:10 +02:00
Raphael S. Carvalho
f941a77867 scripts/base36-uuid: dump date in UTC
Previously, the timestamp decoded from a timeuuid was printed using the
local timezone via datetime.fromtimestamp(), which produces different
output depending on the machine's locale settings.

ScyllaDB logs are emitted in UTC by default. Printing the decoded date
in UTC makes it straightforward to correlate SSTable identifiers with
log entries without having to mentally convert timezones.

Also fix the embedded pytest assertion, which was accidentally correct
only on machines in UTC+8 — it now uses an explicit UTC-aware datetime.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29253
2026-04-08 12:19:55 +03:00
Yaniv Michael Kaul
c385c0bdf9 .github/workflows/call_validate_pr_author_email.yml: add missing workflow permissions
Add explicit permissions block (contents: read, pull-requests: write,
statuses: write) matching the requirements of the called reusable
workflow which checks out code, posts PR comments, and sets commit
statuses. Fixes code scanning alert #172.

Closes scylladb/scylladb#29183
2026-04-08 12:19:55 +03:00
Pavel Emelyanov
788ecaa682 api: Fix enable_injection to accept case-insensitive bool parameter
Replace strict case-sensitive '== "True"' check with strcasecmp(..., "true")
so that Python's str(True) -> "True" is properly recognized. Accepts any
case variation of "true" ("True", "TRUE", etc.), with empty string
defaulting to false.
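A minimal Python equivalent of the new rule (the server side uses `strcasecmp`; this sketch mirrors its accept set: any case variation of "true" is true, everything else, including the empty string, is false):

```python
def parse_bool_param(value: str) -> bool:
    # Mirrors strcasecmp(value, "true") == 0: case-insensitive match
    # against "true"; the empty string naturally defaults to False.
    return value.casefold() == "true"
```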

Maintains backward compatibility with out-of-tree tests that rely on
Python's bool stringification.

The goal is to reduce the number of distinct ways API handlers use to
convert string http query parameters into bool variables. This place is the
only one that simply compares param to "True".

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29236
2026-04-08 12:19:55 +03:00
Avi Kivity
0fd9ea9701 abseil: update to lts_2026_01_07
Switch to branch lts_2026_01_07, which is exactly equal to
upstream now.

There were no notable changes in the release notes, but the
new versions are more friendly to newer compilers (specifically,
in include hygiene).

configure.py needs a few library updates; cmake works without
change.

scylla-gdb.py updated for new hash table layout (by Claude Opus 4.6).

* abseil d7aaad83...255c84da (1179):
  > Abseil LTS branch, Jan 2026, Patch 1 (#2007)
  > Cherry-picks for LTS 20260107 (#1990)
  > Apply LTS transformations for 20260107 LTS branch (#1989)
  > Mark legacy Mutex methods and MutexLock pointer constructors as deprecated
  > `cleanup`: specify that it's safe to use the class in a signal handler.
  > Suppress bugprone-use-after-move in benign cases
  > StrFormat: format scientific notation without heap allocation
  > Introduce a legacy copy of GetDebugStackTraceHook API.
  > Report 1ns instead of 0ns for probe_benchmarks. Some tools incorrectly assume that benchmark was not run if 0ns reported.
  > Add absl::chunked_queue
  > `CRC32` version of `CombineContiguous` for length <= 32.
  > Add `absl::down_cast`
  > Fix FixedArray iterator constructor, which should require input_iterator, not forward_iterator
  > Add a latency benchmark for hashing a pair of integers.
  > Delete absl::strings_internal::STLStringReserveAmortized()
  > As IsAtLeastInputIterator helper
  > Use StringAppendAndOverwrite() in CEscapeAndAppendInternal()
  > Add support for absl::(u)int128 in FastIntToBuffer()
  > absl/strings: Prepare helper for printing objects to string representations.
  > Use SimpleAtob() for parsing bool flags
  > No-op changes to relative timeout support code.
  > Adjust visibility of heterogeneous_lookup_testing.h
  > Remove -DUNORDERED_SET_CXX17 since the macro no longer exists
  > [log] Prepare helper for streaming container contents to strings.
  > Restrict the visibility of some internal testing utilities
  > Add absl::linked_hash_set and absl::linked_hash_map
  > [meta] Add constexpr testing helper.
  > BUILD file reformatting.
  > `absl/meta`: Add C++17 port of C++20 `requires` expression for internal use
  > Remove the implementation of `absl::string_view`, which was only needed prior to C++17. `absl::string_view` is now an alias for `std::string_view`. It is recommended that clients simply use `std::string_view`.
  > No public description
  > absl/flags: Stop echoing file content in flagfile parsing errors. Modified ArgsList::ReadFromFlagfile to redact the content of unexpected lines from error messages.
  > Refactor the declaration of `raw_hash_set`/`btree` to omit default template parameters from the subclasses.
  > Import of CCTZ from GitHub.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to Flag help generator
  > Correct `Mix4x16Vectors` comment.
  > Special implementation for string hash with sizes greater than 64.
  > Reorder function parameters so that hash state is the first argument.
  > Search more aggressively for open slots in absl::internal_stacktrace::BorrowedFixupBuffer
  > Implement SpinLockHolder in terms of std::lock_guard.
  > No public description
  > Avoid discarding test matchers.
  > Import of CCTZ from GitHub.
  > Automated rollback of commit 9f40d6d6f3cfc1fb0325dd8637eb65f8299a4b00.
  > Enable clang-specific warnings on the clang-cl build instead of just trying to be MSVC
  > Enable clang-specific warnings on the clang-cl build instead of just trying to be MSVC
  > Make AnyInvocable remember more information
  > Add further diagnostics under clang for string_view(nullptr)
  > Import of CCTZ from GitHub.
  > Document the differing trimming behavior of absl::Span::subspan() and std::span::subspan()
  > Special implementation for string hash with sizes in range [33, 64].
  > Add the deleted string_view(std::nullptr_t) constructor from C++23
  > CI: Use a cached copy of GoogleTest in CMake builds if possible to minimize the possibility of errors downloading from GitHub
  > CI: Enable libc++ hardening in the ASAN build for even more checks https://libcxx.llvm.org/Hardening.html
  > Call the common case of AllocateBackingArray directly instead of through the function pointer.
  > Change AlignedType to have a void* array member so that swisstable backing arrays end up in the pointer-containing partition for heap partitioning.
  > base: Discourage use of ABSL_ATTRIBUTE_PACKED
  > Revert: Add an attribute to HashtablezInfo which performs a bitwise XOR on all hashes. The purposes of this attribute is to identify if identical hash tables are being created. If we see a large number of identical tables, it's likely the code can be improved by using a common table as opposed to keep rebuilding the same one.
  > Import of CCTZ from GitHub.
  > Record insert misses in hashtable profiling.
  > Add absl::StatusCodeToStringView.
  > Add a missing dependency on str_format that was being pulled in transitively
  > Pico-optimize `SkipWhitespace` to use `StripLeadingAsciiWhitespace`.
  > absl::string_view: Upgrade the debug assert on the single argument char* constructor to ABSL_HARDENING_ASSERT
  > Use non-stack storage for stack trace buffers
  > Fixed incorrect include for ABSL_NAMESPACE_BEGIN
  > Add ABSL_REFACTOR_INLINE to separate the inliner directive from the deprecated directive so that we can give users a custom deprecation message.
  > Reduce stack usage when unwinding without fixups
  > Reduce stack usage when unwinding from 170 to 128 on x64
  > Rename RecordInsert -> RecordInsertMiss.
  > PR #1968: Use std::move_backward within InlinedVector's Storage::Insert
  > Use the new absl::StringResizeAndOverwrite() in CUnescape()
  > Explicitly instantiate common `raw_hash_set` backing array functions.
  > Rollback reduction of maximum load factor. Now it is back to 28/32.
  > Export Mutex::Dtor from shared libraries in NDEBUG mode
  > Allow `IsOkAndHolds` to rely on duck typing for matching `StatusOr` like types instead of uniquely `absl::StatusOr`, e.g. `google::cloud::StatusOr`.
  > Fix typo in macro and add missing static_cast for WASM builds.
  > windows(cmake): add abseil_test_dll to target link libraries when required
  > Handle empty strings in `SimpleAtof` after stripping whitespace
  > Avoid using a thread_local in an inline function since this causes issues on some platforms.
  > (Roll forward) Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Fixes for String{Resize|Append}AndOverwrite: StringAppendAndOverwrite() should always call StringResizeAndOverwrite() with at least capacity() in case the standard library decides to shrink the buffer (Fixes #1965); small refactor to make the minimum growth an addition for clarity and to make it easier to test 1.5x growth in the future; turn an ABSL_HARDENING_ASSERT into a ThrowStdLengthError; add a missing std::move
  > Correct the supported features of Status Matchers
  > absl/time: Use "memory order acquire" for loads, which would allow for the safe removal of the data memory barrier.
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Add an internal-only helper StringAppendAndOverwrite() similar to StringResizeAndOverwrite() but optimized for repeated appends, using exponential growth to ensure amortized complexity of increasing a string size by a small amount is O(1).
  > Release `ABSL_EXPECT_OK` and `ABSL_ASSERT_OK`.
  > Fix the CHECK_XX family of macros to not print `char*` arguments as C-strings if the comparison happened as pointers. Printing as pointers is more relevant to the result of the comparison.
  > Rollback StringAppendAndOverwrite() - the problem is that StringResizeAndOverwrite has MSAN testing of the entire string. This causes quadratic MSAN verification on small appends.
  > Add an internal-only helper StringAppendAndOverwrite() similar to StringResizeAndOverwrite() but optimized for repeated appends, using exponential growth to ensure amortized complexity of increasing a string size by a small amount is O(1).
  > PR #1961: Fix Clang warnings on powerpc
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > macOS CI: Move the Bazel vendor_dir to ${HOME} to workaround a Bazel issue where it does not work when it is in ${TMP} and also fix the quoting which was causing it to incorrectly receive the argument
  > Use __msan_check_mem_is_initialized for detailed MSan report
  > Optimize stack unwinding by reducing `AddressIsReadable` calls.
  > Add internal API to allow bypassing stack trace fixups when needed
  > absl::StrFormat: improve test coverage with scientific exponent test cases
  > Add throughput and latency benchmarks for `absl::ToDoubleXYZ` functions.
  > CordzInfo: Use absl::NoDestructor to remove a global destructor. Chromium requires no global destructors.
  > string_view: Enable std::view and std::borrowed_range
  > cleanup: s/logging_internal/log_internal/ig for consistency
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in absl::AsciiStrTo{Lower|Upper}
  > Use the new absl::StringResizeAndOverwrite() in absl::StrJoin()
  > Use the new absl::StringResizeAndOverwrite() in absl::StrCat()
  > string_view: Fix include order
  > Don't pass nullptr as the 1st arg of `from_chars`
  > absl/types: format code with clang-format.
  > Validate absl::StringResizeAndOverwrite op has written bytes as expected.
  > Skip the ShortStringCollision test on WASM.
  > Rollback `absl/types`: format code with clang-format.
  > Remove usage of the WasmOffsetConverter for Wasm / Emscripten stack-traces.
  > Use the new absl::StringResizeAndOverwrite() in absl::CordCopyToString()
  > Remove an undocumented behavior of --vmodule and absl::SetVLogLevel that could set a module_pattern to defer to the global vlog threshold.
  > Update to rules_cc 0.2.9
  > Avoid redefine warnings with ntstatus constants
  > PR #1944: Use same element-width for non-temporal loads and stores on Arm
  > absl::StringResizeAndOverwrite(): Add the requirement that the only value that can be written to buf[size] is the terminator character.
  > absl/types: format code with clang-format.
  > Minor formatting changes.
  > Remove `IntIdentity` and `PtrIdentity` from `raw_hash_set_probe_benchmark`.
  > Automated rollback of commit cad60580dba861d36ed813564026d9774d9e4e2b.
  > FlagStateInterface implementors need only support being restored once.
  > Clarify the post-condition of `reserve()` in Abseil hash containers.
  > Clarify the post-condition of `reserve()` in Abseil hash containers.
  > Represent dropped samples in hashtable profile.
  > Add lifetimebound to absl::implicit_cast and make it work for rvalue references as it already does with lvalue references
  > Clean up a doc example where we had `absl_nonnull` and `= nullptr;`
  > Change Cordz to synchronize tracked cords with Snapshots / DeleteQueue
  > Minor refactor to `num_threads` in deadlock test
  > Rename VLOG macro parameter to match other uses of this pseudo type.
  > `time`: Fix indentation
  > Automated Code Change
  > Adds `absl::StringResizeAndOverwrite` as a polyfill for C++23's `std::basic_string<CharT,Traits,Allocator>::resize_and_overwrite`
  > Internal-only change
  > absl/time: format code with clang-format.
  > No public description
  > Expose typed releasers of externally appended memory.
  > Fix __declspec support for ABSL_DECLARE_FLAG()
  > Annotate absl::AnyInvocable as an owner type via [[gsl::Owner]] and absl_internal_is_view = std::false_type
  > Annotate absl::FunctionRef as a view type via [[gsl::Pointer]] and absl_internal_is_view
  > Remove unnecessary dep on `core_headers` from the `nullability` cc_library
  > type_traits: Add type_identity and type_traits_t backfills
  > Refactor raw_hash_set range insertion to call private insert_range function.
  > Fix bug in absl::FunctionRef conversions from non-const to const
  > PR #1937: Simplify ConvertSpecialToEmptyAndFullToDeleted
  > Improve absl::FunctionRef compatibility with C++26
  > Add a workaround for unused variable warnings inside of not-taken if-constexpr codepaths in older versions of GCC
  > Annotate ABSL_DIE_IF_NULL's return type with `absl_nonnull`
  > Move insert index computation into `PrepareInsertLarge` in order to reduce inlined part of insert/emplace operations.
  > Automated Code Change
  > PR #1939: Add missing rules_cc loads
  > Expose (internally) a LogMessage constructor taking file as a string_view for (internal, upcoming) FFI integration.
  > Fixed up some #includes in mutex.h
  > Make absl::FunctionRef support non-const callables, aligning it with std::function_ref from C++26
  > Move capacity update in `Grow1To3AndPrepareInsert` after accessing `common.infoz()` to prevent assertion failure in `control()`.
  > Fix check_op(s) compilation failures on gcc 8 which eagerly tries to instantiate std::underlying_type for non-num types.
  > Use `ABSL_ATTRIBUTE_ALWAYS_INLINE`for lambda in `find_or_prepare_insert_large`.
  > Mark the implicit floating operators as constexpr for `absl::int128` and `absl::uint128`
  > PR #1931: raw_hash_set: fix instantiation for recursive types on MSVC with /Zc:__cplusplus
  > Add std::pair specializations for IsOwner and IsView
  > Cast ABSL_MIN_LOG_LEVEL to absl::LogSeverityAtLeast instead of absl::LogSeverity.
  > Fix a corner case in the aarch64 unwinder
  > Fix inconsistent nullability annotation in ReleasableMutexLock
  > Remove support for Native Client
  > Rollback f040e96b93dba46e8ed3ca59c0444cbd6c0a0955
  > When printing CHECK_XX failures and both types are unprintable, don't bother printing " (UNPRINTABLE vs. UNPRINTABLE)".
  > PR #1929: Fix shorten-64-to-32 warning in stacktrace_riscv-inl.inc
  > Refactor `find_or_prepare_insert_large` to use a single return statement using a lambda.
  > Use possible CPUs to identify NumCPUs() on Linux.
  > Fix incorrect nullability annotation of `absl::Cord::InlineRep::set_data()`.
  > Move SetCtrl* family of functions to cc file.
  > Change absl::InlinedVector::clear() so that it does not deallocate any allocated space. This allows allocations to be reused and matches the behavior specification of std::vector::clear().
  > Mark Abseil container algorithms as `constexpr` for C++20.
  > Fix `CHECK_<OP>` ambiguous overload for `operator<<` in older versions of GCC when C-style strings are compared
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > Optimize `CRC32AcceleratedX86ARMCombinedMultipleStreams::Extend` by interleaving the `CRC32_u64` calls at a lower level.
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > std::multimap::find() is not guaranteed to return the first entry with the requested key. Any may be returned if many exist.
  > Mark `/`, `%`, and `*` operators as constexpr when intrinsics are available.
  > Add the C++20 string_view constructor that uses iterators
  > Implement absl::erase_if for absl::InlinedVector
  > Adjust software prefetch to fetch 5 cachelines ahead, as benchmarking suggests this should perform better.
  > Reduce maximum load factor to 27/32 (from 28/32).
  > Remove unused include
  > Remove unused include statement
  > PR #1921: Fix ABSL_BUILD_DLL mode (absl_make_dll) with mingw
  > PR #1922: Enable mmap for WASI if it supports the mman header
  > Rollback C++20 string_view constructor that uses iterators due to broken builds
  > Add the C++20 string_view constructor that uses iterators
  > Bump versions of dependencies in MODULE.bazel
  > Automated Code Change
  > PR #1918: base: add musl + ppc64le fallback for UnscaledCycleClock::Frequency
  > Optimize crc32 Extend by removing obsolete length alignment.
  > Fix typo in comment of `ABSL_ATTRIBUTE_UNUSED`.
  > Mark AnyInvocable as being nullability compatible.
  > Ensure stack usage remains low when unwinding the stack, to prevent stack overflows
  > Shrink #if ABSL_HAVE_ATTRIBUTE_WEAK region sizes in stacktrace_test.cc
  > <filesystem> is not supported for XTENSA. Disable it in //absl/hash/internal/hash.h.
  > Use signal-safe dynamic memory allocation for stack traces when necessary
  > PR #1915: Fix SYCL Build Compatibility with Intel LLVM Compiler on Windows for abseil
  > Import of CCTZ from GitHub.
  > Tag tests that currently fail on ios_sim_arm64 with "no_test_ios_sim_arm64"
  > Automated Code Change
  > Automated Code Change
  > Import of CCTZ from GitHub.
  > Move comment specific to pointer-taking MutexLock variant to its definition.
  > Add lifetime annotations to MutexLock, SpinLockHolder, etc.
  > Add lifetimebound annotations to absl::MakeSpan and absl::MakeConstSpan to detect dangling references
  > Remove comment mentioning dereferenceability.
  > Add referenceful MutexLock with Condition overload.
  > Mark SpinLock camel-cased methods as ready for inlining.
  > Whitespace change
  > In logging tests that write expectations against `ScopedMockLog::Send`, suppress the default behavior that forwards to `ScopedMockLog::Log` so that unexpected logs are printed with full metadata.  Many of these tests are poking at those metadata, and a failure message that doesn't include them is unhelpful.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::ClippedSubstr
  > Inline internal usages of Mutex::Lock, etc. in favor of lock.
  > Inline internal usages of pointerful SpinLockHolder/MutexLock.
  > Remove wrong comment in Cord::Unref
  > Update the crc32 dynamic dispatch table with newer platforms.
  > PR #1914: absl/base/internal/poison.cc: Minor build fix
  > Accept references on SpinLockHolder/MutexLock
  > Import of CCTZ from GitHub.
  > Fix typos in comments.
  > Inline SpinLock Lock->lock, Unlock->unlock internal to Abseil.
  > Rename Mutex methods to use the typical C++ lower case names.
  > Rename SpinLock methods to use the typical C++ lower case names.
  > Add an assert that absl::StrSplit is not called with a null char* argument.
  > Fix sign conversion warning
  > PR #1911: Fix absl_demangle_test on ppc64
  > Disallow using a hash function whose return type is smaller than size_t.
  > Optimize CRC-32C extension by zeroes
  > Deduplicate stack trace implementations in stacktrace.cc
  > Align types of location_table_ and mapping_table_ keys (-Wshorten-64-to-32).
  > Move SigSafeArena() out to absl/base/internal/low_level_alloc.h
  > Allow CHECK_<OP> variants to be used with unprintable types.
  > Import of CCTZ from GitHub.
  > Adds required load statements for C++ rules to BUILD and bzl files.
  > Disable sanitizer bounds checking in ComputeZeroConstant.
  > Roll back NDK weak symbol mode for backtrace() due to internal test breakage
  > Add converter for extracting SwissMap profile information into a https://github.com/google/pprof suitable format for inspection.
  > Allocate memory for frames and sizes during stack trace fix-up when no memory is provided
  > Support NDK weak symbol mode for backtrace() on Android.
  > Change skip_empty_or_deleted to not use groups.
  > Fix bug of dereferencing invalidated iterator in test case.
  > Refactor: split erase_meta_only into large and small versions.
  > Fix a TODO to use std::is_nothrow_swappable when it became available.
  > Clean up the testing of alternate options that were removed in previous changes
  > Only use generic stacktrace when ABSL_HAVE_THREAD_LOCAL.
  > Automated Code Change
  > Add triviality tests for absl::Span
  > Loosen the PointerAlignment test to allow up to 5 stuck bits to avoid flakiness.
  > Prevent conversion constructions from absl::Span to itself
  > Skip flaky expectations in waiter_test for MSVC.
  > Refactor: call AssertIsFull from iterator::assert_is_full to avoid passing the same arguments repeatedly.
  > In AssertSameContainer, remove the logic checking for whether the iterators are from SOO tables or not since we don't use it to generate a more informative debug message.
  > Remove unused NonIterableBitMask::HighestBitSet function.
  > Refactor: move iterator unchecked_* members before data members to comply with Google C++ style guide.
  > Mix pointers once instead of twice now that we've improved mixing on 32-bit platforms and improved the kMul constant.
  > Remove unused utility functions/constants.
  > Revert a change for breaking downstream third party libs
  > Remove unneeded include from cord_rep_btree_navigator.h
  > Refactor: move find_first_non_full into raw_hash_set.cc.
  > Perform stronger mixing on 32-bit platforms and enable the LowEntropyStrings test.
  > Include deallocated caller-provided size in delete hooks.
  > Roll back one more time: In debug mode, assert that the probe sequence isn't excessively long.
  > Allow a `std::move` of `delimiter_` to happen in `ByString::ByString(ByString&&)`. Right now the move ctor is making a copy because the source object is `const`.
  > Assume that control bytes don't alias CommonFields.
  > Consistently use [[maybe_unused]] in raw_hash_set.h for better compiler warning compatibility.
  > Roll forward: In debug mode, assert that the probe sequence isn't excessively long.
  > Add a new test for hash collisions for short strings when PrecombineLengthMix has low quality.
  > Refactor: define CombineRawImpl for repeated `Mix(state ^ value, kMul)` operations.
  > Automated Code Change
  > Mark hash_test as large so that the timeout is increased.
  > Change the value of kMul to have higher entropy and prevent collisions when keys are aligned integers or pointers.
  > Fix LIFETIME annotations for op*/op->/value operators for reference types.
  > Update StatusOr to support lvalue reference value types.
  > Rollback debug assertion that the probe sequence isn't excessively long.
  > AnyInvocable: Fix operator==/!= comments
  > In debug mode, assert that the probe sequence isn't excessively long.
  > Improve NaN handling in absl::Duration arithmetic.
  > Change PrecombineLengthMix to sample data from kStaticRandomData.
  > Fix includes and fuse constructors of SpinLock.
  > Enable `operator==` for `StatusOr` only if the contained type is equality-comparable
  > Enable SIMD memcpy-crc on ARM cores.
  > Improve mixing on 32-bit platforms.
  > Change DurationFromDouble to return -InfiniteDuration() for all NaNs.
  > Change return type of hash internal `Seed` to `size_t` from `uint64_t`
  > CMake: Add a fatal error when the compiler defaults to or is set to a C++ language standard prior to C++17.
  > Make bool true hash be ~size_t{} instead of 1 so that all bits are different between true/false instead of only one.
  > Automated Code Change
  > Pass swisstable seed as seed to absl::Hash so we can save an XOR in H1.
  > Add support for scoped enumerations in CHECK_XX().
  > Revert no-inline on Voidify::operator&&() -- caused unexpected binary size growth
  > Mark Voidify::operator&&() as no-inline. This improves stack trace for `LOG(FATAL)` with optimization on.
  > Refactor long strings hash computations and move `len <= PiecewiseChunkSize()` out of the line to keep only one function call in the inlined hash code.
  > rotr/rotl: Fix undefined behavior when passing INT_MIN as the number of positions to rotate by
  > Reorder members of MixingHashState to comply with Google C++ style guide ordering of type declarations, static constants, ctors, non-ctor functions.
  > Delete unused function ShouldSampleHashtablezInfoOnResize.
  > Remove redundant comments that just name the following symbol without providing additional information.
  > Remove unnecessary modification of growth info in small table case.
  > Suppress CFI violation on VDSO call.
  > Replace WeakMix usage with Mix and change H2 to use the most significant 7 bits - saving 1 cycle in H1.
  > Fix -Wundef warning
  > Fix conditional constexpr in ToInt64{Nano|Micro|Milli}seconds under GCC7 and GCC8 using an else clause as a workaround
  > Enable CompressedTupleTest.NestedEbo test case.
  > Lift restriction on using EBCO[1] for nested CompressedTuples. The current implementation of CompressedTuple explicitly disallows EBCO for cases where CompressedTuples are nested. This is because the implementation for a tuple with EBCO-compatible element T inherits from Storage<T, I>, where I is the index of T in the tuple, and
  > absl::string_view: assert against (data() == nullptr && size() != 0)
  > Fix a false nullability warning in [Q]CHECK_OK by replacing nullptr with an empty char*
  > Make `combine_contiguous` to mix length in a weak way by adding `size << 24`, so that we can avoid a separate mixing of size later. The empty range is mixing 0x57 byte.
  > Add a test case that -1.0 and 1.0 have different hashes.
  > Update CI to a more recent Clang on Linux x86-64
  > `absl::string_view`: Add a debug assert to the single-argument constructor that the argument is not `nullptr`.
  > Fix CI on macOS Sequoia
  > Use Xcode 16.3 for testing
  > Use a proper fix instead of a workaround for a parameter annotated absl_nonnull since the latest Clang can see through the workaround
  > Assert that SetCtrl isn't called on small tables - there are no control bytes in such cases.
  > Use `MaskFullOrSentinel` in `skip_empty_or_deleted`.
  > Reduce flakiness in MockDistributions.Examples test case.
  > Rename PrepareInsertNonSoo to PrepareInsertLarge now that it's no longer used in all non-SOO cases.
  > PR #1895: use c++17 in podspec
  > Avoid hashing the key in prefetch() for small tables.
  > Remove template alias nullability annotations.
  > Add `Group::MaskFullOrSentinel` implementation without usage.
  > Move `hashtable_control_bytes` tests into their own file.
  > Simplify calls to `EqualElement` by introducing `equal_to` helper function.
  > Do `common.increment_size()` directly in SmallNonSooPrepareInsert if inserting to reserved 1 element table.
  > Import of CCTZ from GitHub.
  > Small cleanup of `infoz` processing to get the logic out of the line or removed.
  > Extract the entire PrepareInsert to Small non SOO table out of the line.
  > Take `get_hash` implementation out of the SwissTable class to minimize number of instantiations.
  > Change kEmptyGroup to kDefaultIterControl now that it's only used for default-constructed iterators.
  > [bits] Add tests for return types
  > Avoid allocating control bytes in capacity==1 swisstables.
  > PR #1888: Adjust Table.GrowExtremelyLargeTable to avoid OOM on i386
  > Avoid mixing after `Hash64` calls for long strings by passing `state` instead of `Seed` to low level hash.
  > Indent absl container examples consistently
  > Revert- Doesn't actually work because SWIG doesn't use the full preprocessor
  > Add tags to skip some tests under UBSAN.
  > Avoid subtracting `it.control()` and `table.control()` in single element table during erase.
  > Remove the `salt` parameter from low level hash and use a global constant. That may potentially remove some loads.
  > In SwissTable, don't hash the key when capacity<=1 on insertions.
  > Remove the "small" size designation for thread_identity_test, which causes the test to timeout after 60s.
  > Add comment explaining math behind expressions.
  > Exclude SWIG from ABSL_DEPRECATED and ABSL_DEPRECATE_AND_INLINE
  > stacktrace_x86: Handle nested signals on altstack
  > Import of CCTZ from GitHub.
  > Simplify MixingHashState::Read9To16 to not depend on endianness.
  > Delete deprecated `absl::Cord::Get` and its remaining call sites.
  > PR #1884: Remove duplicate dependency
  > Remove relocatability test that is no longer useful
  > Import of CCTZ from GitHub.
  > Fix a bug of casting sizeof(slot_type) to uint16_t instead of uint32_t.
  > Rewrite `WideToUtf8` for improved readability.
  > Avoid requiring default-constructability of iterator type in algorithms that use ContainerIterPairType
  > Added test cases for invalid surrogates sequences.
  > Use __builtin_is_cpp_trivially_relocatable to implement absl::is_trivially_relocatable in a way that is compatible with PR2786 in the upcoming C++26.
  > Remove dependency on `wcsnlen` for string length calculation.
  > Stop being strict about validating the "clone" part of mangled names
  > Add support for logging wide strings in `absl::log`.
  > Deprecate `ABSL_HAVE_STD_STRING_VIEW`.
  > Change some nullability annotations in absl::Span to absl_nullability_unknown to workaround a bug that makes nullability checks trigger in foreach loops, while still fixing the -Wnullability-completeness warnings.
  > Linux CI update
  > Fix new -Wnullability-completeness warnings found after upgrading the Clang version used in the Linux ARM CI to Clang 19.
  > Add __restrict for uses of PolicyFunctions.
  > Use Bazel vendor mode to cache external dependencies on Windows and macOS
  > Move PrepareInsertCommon from header file to cc file.
  > Remove the explicit from the constructor to a test allocator in hash_policy_testing.h. This is rejected by Clang when using the libstdc++ that ships with GCC15
  > Extract `WideToUtf8` helper to `utf8.h`.
  > Updates the documentation for `CHECK` to make it more explicit that it is used to require that a condition is true.
  > Add PolicyFunctions::soo_capacity() so that the compiler knows that soo_capacity() is always 0 or 1.
  > Expect different representations of pointers from the Windows toolchain.
  > Add set_no_seed_for_testing for use in GrowExtremelyLargeTable test.
  > Update GoogleTest dependency to 1.17.0 to support GCC15
  > Assume that frame pointers inside known stack bounds are readable.
  > Remove fallback code in absl/algorithm/container.h
  > Fix GCC15 warning that <ciso646> is deprecated in C++17
  > Fix misplaced closing brace
  > Remove unused include.
  > Automated Code Change
  > Type erase copy constructor.
  > Refactor to use hash_of(key) instead of hash_ref()(key).
  > Create Table.Prefetch test to make sure that it works.
  > Remove NOINLINE on the constructor with buckets.
  > In SwissTable, don't hash the key in find when capacity<=1.
  > Use 0x57 instead of Seed() for weakly mixing of size.
  > Use absl::InsecureBitGen in place of std::random_device in Abseil tests.
  > Remove unused include.
  > Use large 64 bits kMul for 32 bits platforms as well.
  > Import of CCTZ from GitHub.
  > Define `combine_weakly_mixed_integer` in HashSelect::State in order to allow `friend auto AbslHashValue` instead of `friend H AbslHashValue`.
  > PR #1878: Fix typos in comments
  > Update Abseil dependencies in preparation for release
  > Use weaker mixing for absl::Hash for types that mix their sizes.
  > Update comments on UnscaledCycleClock::Now.
  > Use alignas instead of the manual alignment for the Randen entropy pool.
  > Document nullability annotation syntax for array declarations (not many people may know the syntax).
  > Import of CCTZ from GitHub.
  > Release tests for ABSL_RAW_DCHECK and ABSL_RAW_DLOG.
  > Adjust threshold for stuck bits to avoid flaky failures.
  > Deprecate template type alias nullability annotations.
  > Add more probe benchmarks
  > PR #1874: Simplify detection of the powerpc64 ELFv1 ABI
  > Make `absl::FunctionRef` copy-assignable. This brings it more in line with `std::function_ref`.
  > Remove unused #includes from absl/base/internal/nullability_impl.h
  > PR #1870: Retry SymInitialize on STATUS_INFO_LENGTH_MISMATCH
  > Prefetch from slots in parallel with reading from control.
  > Migrate template alias nullability annotations to macros.
  > Improve dependency graph in `TryFindNewIndexWithoutProbing` hot path evaluation.
  > Add latency benchmarks for Hash for strings with size 3, 5 and 17.
  > Exclude UnwindImpl etc. from thread sanitizer due to false positives.
  > Use `GroupFullEmptyOrDeleted` inside of `transfer_unprobed_elements_to_next_capacity_fn`.
  > PR #1863: [minor] Avoid variable shadowing for absl btree
  > Extend stack-frame walking functionality to allow dynamic fixup
  > Fix "unsafe narrowing" in absl for Emscripten
  > Roll back change to address breakage
  > Extend stack-frame walking functionality to allow dynamic fixup
  > Introduce `absl::Cord::Distance()`
  > Avoid aliasing issues in growth information initialization.
  > Make `GrowSooTableToNextCapacityAndPrepareInsert` in order to initialize control bytes all at once and avoid two function calls on growth right after SOO.
  > Simplify `SingleGroupTableH1` since we do not need to mix all bits anymore. Per table seed has a good last bit distribution.
  > Use `NextSeed` instead of `NextSeedBaseNumber` and make the result type be `uint16_t`. That avoids unnecessary bit twiddling and simplifies the code.
  > Optimize `GrowthToLowerBoundCapacity` in order to avoid division.
  > [base] Make :endian internal to absl
  > Fully qualify absl names in check macros to avoid invalid name resolution when the user scope has those names defined.
  > Fix memory sanitization in `GrowToNextCapacityAndPrepareInsert`.
  > Define and use `ABSL_SWISSTABLE_ASSERT` in cc file since a lot of logic moved there.
  > Remove `ShouldInsertBackwards` functionality. It was used for additional order randomness in debug mode. It is not necessary anymore with introduction of separate per table `seed`.
  > Fast growing to the next capacity based on carbon hash table ideas.
  > Automated Code Change
  > Refactor CombinePiecewiseBuffer test case to (a) call PiecewiseChunkSize() to get the chunk size and (b) use ASSERT for expectation in a loop.
  > PR #1867: Remove global static in stacktrace_win32-inl.inc
  > Mark Abseil hardening assert in AssertIsValidForComparison as slow.
  > Roll back a problematic change.
  > Add absl::FastTypeId<T>()
  > Automated Code Change
  > Update TestIntrinsicInt128 test to print the indices with the conflicting hashes.
  > Code simplification: we don't need XOR and kMul when mixing large string hashes into hash state.
  > Refactor absl::CUnescape() to use direct string output instead of pointer/size.
  > Rename `policy.transfer` to `policy.transfer_n`.
  > Optimize `ResetCtrl` for small tables with `capacity < Group::kWidth * 2` (<32 if SSE enabled and <16 if not).
  > Use 16 bits of per-table-seed so that we can save an `and` instruction in H1.
  > Fully annotate nullability in headers where it is partially annotated.
  > Add note about sparse containers to (flat|node)_hash_(set|map).
  > Make low_level_alloc compatible with -Wthread-safety-pointer
  > Add missing direct includes to enable the removal of unused includes from absl/base/internal/nullability_impl.h.
  > Add tests for macro nullability annotations analogous to existing tests for type alias annotations.
  > Adds functionality to return stack frame pointers during stack walking, in addition to code addresses
  > Use even faster reduction algorithm in FinalizePclmulStream()
  > Add nullability annotations to some very-commonly-used APIs.
  > PR #1860: Add `unsigned` to character buffers to ensure they can provide storage (https://eel.is/c++draft/intro.object#3)
  > Release benchmarks for absl::Status and absl::StatusOr
  > Use more efficient reduction algorithm in FinalizePclmulStream()
  > Add a test case to make it clear that `--vmodule=foo/*=1` does match any children and grandchildren and so on under `foo/`.
  > Gate use of clang nullability qualifiers through absl nullability macros on `nullability_on_classes`.
  > Mark `absl::StatusOr::status()` as ABSL_MUST_USE_RESULT
  > Cleanups related to benchmarks   * Fix many benchmarks to be cc_binary instead of cc_test   * Add a few benchmarks for StrFormat   * Add benchmarks for Substitute   * Add benchmarks for Damerau-Levenshtein distance used in flags
  > Add a log severity alias `DO_NOT_$UBMIT` intended for logging during development
  > Avoid relying on true and false tokens in the preprocessor macros used in any_invocable.h
  > Avoid relying on true and false tokens in the preprocessor macros used in absl/container
  > Refactor to make it clear that H2 computation is not repeated in each iteration of the probe loop.
  > Turn on C++23 testing for GCC and Clang on Linux
  > Fix overflow of kSeedMask on 32 bits platform in `generate_new_seed`.
  > Add a workaround for std::pair not being trivially copyable in C++23 in some standard library versions
  > Refactor WeakMix to include the XOR of the state with the input value.
  > Migrate ClearPacBits() to a more generic implementation and location
  > Annotate more Abseil container methods with [[clang::lifetime_capture_by(...)]] and make them all forward to the non-captured overload
  > Make PolicyFunctions always be the second argument (after CommonFields) for type-erased functions.
  > Move GrowFullSooTableToNextCapacity implementation with some dependencies to cc file.
  > Optimize btree_iterator increment/decrement to avoid aliasing issues by using local variables instead of repeatedly writing to `this`.
  > Add constexpr conversions from absl::Duration to int64_t
  > PR #1853: Add support for QCC compiler
  > Fix documentation for key requirements of flat_hash_set
  > Use `extern template` for `GrowFullSooTableToNextCapacity` since we know the most common set of parameters.
  > C++23: Fix log_format_test to match the stream format for volatile pointers
  > C++23: Fix compressed_tuple_test.
  > Implement `btree::iterator::+=` and `-=`.
  > Stop calling `ABSL_ANNOTATE_MEMORY_IS_INITIALIZED` for threadlocal counter.
  > Automated Code Change
  > Introduce seed stored in the hash table inside of the size.
  > Replace ABSL_ATTRIBUTE_UNUSED with [[maybe_unused]]
  > Minor consistency cleanups to absl::BitGen mocking.
  > Restore the empty CMake targets for bad_any_cast, bad_optional_access, and bad_variant_access to allow clients to migrate.
  > bits.h: Add absl::endian and absl::byteswap polyfills
  > Use absl::NoDestructor for an absl::Mutex instance in the flags library to prevent some exit-time destructor warnings
  > Add thread GetEntropyFromRandenPool test
  > Update nullability annotation documentation to focus on macro annotations.
  > Simplify some random/internal types; expose one function to acquire entropy.
  > Remove pre-C++17 workarounds for lack of std::launder
  > UBSAN: Use -fno-sanitize-recover
  > int128_test: Avoid testing signed integer overflow
  > Remove leading commas in `Describe*` methods of `StatusIs` matcher.
  > absl::StrFormat: Avoid passing null to memcpy
  > str_cat_test: Avoid using invalid enum values
  > hash_generator_testing: Avoid using invalid enum values
  > absl::Cord: Avoid passing null to memcpy and memset
  > graphcycles_test: Avoid applying a non-zero offset to a null pointer
  > Make warning about wrapping empty std::function in AnyInvocable stronger.
  > absl/random: Convert absl::BitGen / absl::InsecureBitGen to classes from aliases.
  > Fix buffer overflow in the internal demangling function
  > Avoid calling `ShouldRehashForBugDetection` on the first two inserts to the table.
  > Remove the polyfill implementations for many type traits and alias them to their std equivalents. It is recommended that clients now simply use the std equivalents.
  > ROLLBACK: Limit slot_size to 2^16-1 and maximum table size to 2^43-1.
  > Limit `slot_size` to `2^16-1` and maximum table size to `2^43-1`.
  > Use C++17 [[nodiscard]] instead of the deprecated ABSL_MUST_USE_RESULT
  > Remove the polyfills for absl::apply and absl::make_from_tuple, which were only needed prior to C++17. It is recommended that clients simply use std::apply and std::make_from_tuple.
  > PR #1846: Fix build on big endian
  > Bazel: Move environment variables to --action_env
  > Remove the implementation of `absl::variant`, which was only needed prior to C++17. `absl::variant` is now an alias for `std::variant`. It is recommended that clients simply use `std::variant`.
  > MSVC: Fix warnings c4244 and c4267 in the main library code
  > Update LowLevelHashLenGt16 to be LowLevelHashLenGt32 now that the input is guaranteed to be >32 in length.
  > Xtensa does not support thread_local. Disable it in absl/base/config.h.
  > Add support for 8-bit and 16-bit integers to absl::SimpleAtoi
  > CI: Update Linux ARM latest container
  > Add time hash tests
  > `any_invocable`: Update comment that refer to C++17 and C++11
  > `check_test_impl.inc`: Use C++17 features unconditionally
  > Remove the implementation of `absl::optional`, which was only needed prior to C++17. `absl::optional` is now an alias for `std::optional`. It is recommended that clients simply use `std::optional`.
  > Move hashtable control bytes manipulation to a separate file.
  > Fix a use-after-free bug in which the string passed to `AtLocation` may be referenced after it is destroyed. While the string does live until the end of the full statement, logging previously occurred in the destructor of the `LogMessage`, which may be constructed before the temporary string (and thus destroyed after it).
  > `internal/layout`: Delete pre-C++17 out of line definition of constexpr class member
  > Extract slow path for PrepareInsertNonSoo to a separate function `PrepareInsertNonSooSlow`.
  > Minor code cleanups
  > `internal/log_message`: Use `if constexpr` instead of SFINAE for `operator<<`
  > [absl] Use `std::min` in `constexpr` contexts in `absl::string_view`
  > Remove the implementation of `absl::any`, which was only needed prior to C++17. `absl::any` is now an alias for `std::any`. It is recommended that clients simply use `std::any`.
  > Remove ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL, which is no longer needed with the C++17 floor
  > Make `OptimalMemcpySizeForSooSlotTransfer` ready to work with MaxSooSlotSize up to `3*sizeof(size_t)`.
  > `internal/layout`: Replace SFINAE with `if constexpr`
  > PR #1830: C++17 improvement: use if constexpr in internal/hash.h
  > `absl`: Deprecate `ABSL_HAVE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION`
  > Add verification for access to a table that is being destroyed. Also enable the access-after-destroy check in ASan optimized mode.
  > Store `CharAlloc` in SwissTable in order to simplify type erasure of functions accepting allocator as `void*`.
  > Introduce and use `SetCtrlInLargeTable` when we know that the table is at least one group. Similarly to `SetCtrlInSingleGroupTable`, we can save some operations.
  > Make raw_hash_set::slot_type private.
  > Delete absl/utility/internal/if_constexpr.h
  > `internal/any_invocable`: Use `if constexpr` instead of SFINAE when initializing storage accessor
  > Depend on string_view directly
  > Optimize and slightly simplify `PrepareInsertNonSoo`.
  > PR #1833: Make ABSL_INTERNAL_STEP_n macros consistent in crc code
  > `internal/any_invocable`: Use alias `RawT` consistently in `InitializeStorage`
  > Move the implementation of absl::ComputeCrc32c to the header file, to facilitate inlining.
  > Delete absl/base/internal/inline_variable.h
  > Add lifetimebound to absl::StripAsciiWhitespace
  > Revert: Random: Use target attribute instead of -march
  > Add return for opt mode in AssertNotDebugCapacity to make sure that code is not evaluated in opt mode.
  > `internal/any_invocable`: Delete TODO, improve comment and simplify pragma in constructor
  > Split resizing routines and type erase similar instructions.
  > Random: Use target attribute instead of -march
  > `internal/any_invocable`: Use `std::launder` unconditionally
  > `internal/any_invocable`: Remove suppression of false positive -Wmaybe-uninitialized on GCC 12
  > Fix feature test for ABSL_HAVE_STD_OPTIONAL
  > Support C++20 iterators in raw_hash_map's random-access iterator detection
  > Fix mis-located test dependency
  > Disable the DestroyedCallsFail test on GCC due to flakiness.
  > `internal/any_invocable`: Implement invocation using `if constexpr` instead of SFINAE
  > PR #1835: Bump deployment_target version and add visionos to podspec
  > PR #1828: Fix spelling of pseudorandom in README.md
  > Make raw_hash_map::key_arg private.
  > `overload`: Delete obsolete macros for undefining `absl::Overload` when C++ < 17
  > `absl/base`: Delete `internal/invoke.h` and `invoke_test.cc`
  > Remove `WORKSPACE.bazel`
  > `absl`: Replace `base_internal::{invoke,invoke_result_t,is_invocable_r}` with `std` equivalents
  > Allow C++20 forward iterators to use fast paths
  > Factor out some iterator traits detection code
  > Type erase IterateOverFullSlots to decrease code size.
  > `any_invocable`: Delete pre-C++17 workarounds for `noexcept` and guaranteed copy elision
  > Make raw_hash_set::key_arg private.
  > Rename nullability macros to use new lowercase spelling.
  > Fix bug where ABSL_REQUIRE_EXPLICIT_INIT did not actually result in a linker error
  > Make Randen benchmark program use runtime CPU detection.
  > Add CI for the C++20/Clang/libstdc++ combination
  > Move Abseil to GoogleTest 1.16.0
  > `internal/any_invocable`: Use `if constexpr` instead of SFINAE in `InitializeStorage`
  > More type-erasing of InitializeSlots by removing the Alloc and AlignOfSlot template parameters.
  > Actually use the hint space instruction to strip PAC bits for return addresses in stack traces as the comment says
  > `log/internal`: Replace `..._ATTRIBUTE_UNUSED_IF_STRIP_LOG` with C++17 `[[maybe_unused]]`
  > `attributes`: Document `ABSL_ATTRIBUTE_UNUSED` as deprecated
  > `internal/any_invocable`: Initialize using `if constexpr` instead of ternary operator, enum, and templates
  > Fix flaky tests due to sampling by introducing utility to refresh sampling counters for the current thread.
  > Minor reformatting in raw_hash_set: - Add a clear_backing_array member to declutter calls to ClearBackingArray. - Remove some unnecessary `inline` keywords on functions. - Make PoisonSingleGroupEmptySlots static.
  > Update CI for linux_gcc-floor to use GCC9, Bazel 7.5, and CMake 3.31.5.
  > `internal/any_invocable`: Rewrite `IsStoredLocally` type trait into a simpler constexpr function
  > Add ABSL_REQUIRE_EXPLICIT_INIT to Abseil to enable enforcing explicit field initializations
  > Require C++17
  > Minimize number of `InitializeSlots` with respect to SizeOfSlot.
  > Leave the call to `SampleSlow` only in type erased InitializeSlots.
  > Update comments for Read4To8 and Read1To3.
  > PR #1819: fix compilation with AppleClang
  > Move SOO processing inside of InitializeSlots and move it once.
  > PR #1816: Random: use getauxval() via <sys/auxv.h>
  > Optimize `InitControlBytesAfterSoo` to have less writes and make them with compile time known size.
  > Remove stray plus operator in cleanup_internal::Storage
  > Include <cerrno> to fix compilation error in chromium build.
  > Adjust internal logging namespacing for consistency s/ABSL_LOGGING_INTERNAL_/ABSL_LOG_INTERNAL_/
  > Rewrite LOG_EVERY_N (et al) docs to clarify that the first instance is logged.  Also, deliberately avoid giving exact numbers or examples since IRL behavior is not so exact.
  > ABSL_ASSUME: Use a ternary operator instead of do-while in the implementations that use a branch marked unreachable so that it is usable in more contexts.
  > Simplify the comment for raw_hash_set::erase.
  > Remove preprocessors for now unsupported compilers.
  > `absl::ScopedMockLog`: Explicitly document that it captures logs emitted by all threads
  > Fix potential integer overflow in hash container create/resize
  > Add lifetimebound to StripPrefix/StripSuffix.
  > Random: Rollforward support runtime dispatch on AArch64 macOS
  > Crc: Only test non_temporal_store_memcpy_avx on AVX targets
  > Provide information about types of all flags.
  > Deprecate the precomputed hash find() API in swisstable.
  > Import of CCTZ from GitHub.
  > Adjust whitespace
  > Expand documentation for absl::raw_hash_set::erase to include idiom example of iterator post-increment.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Crc: Remove the __builtin_cpu_supports path for SupportsArmCRC32PMULL
  > Use absl::NoDestructor for some absl::Mutex instances in the flags library to prevent some exit-time destructor warnings
  > Update the WORKSPACE dependency of rules_cc to 0.1.0
  > Rollback support runtime dispatch on AArch64 macOS for breaking some builds
  > Downgrade to rules_cc 0.0.17 because 0.1.0 was yanked
  > Use unused set in testing.
  > Random: Support runtime dispatch on AArch64 macOS
  > crc: Use absl::nullopt when returning absl::optional
  > Annotate absl::FixedArray to warn when unused.
  > PR #1806: Fix undefined symbol: __android_log_write
  > Move ABSL_HAVE_PTHREAD_CPU_NUMBER_NP to the file where it is needed
  > Use rbit instruction on ARM rather than rev.
  > Debugging: Report the CPU we are running on under Darwin
  > Add a microbenchmark for very long int/string tuples.
  > Crc: Detect support for pmull and crc instructions on Apple AArch64 With a newer clang, we can use __builtin_cpu_supports which caches all the feature bits.
  > Add special handling for hashing integral types so that we can optimize Read1To3 and Read4To8 for the strings case.
  > Use unused FixedArray instances.
  > Minor reformatting
  > Avoid flaky expectation in WaitDurationWoken test case in MSVC.
  > Use Bazel rules_cc for many compiler-specific rules instead of our custom ones from before the Bazel rules existed.
  > Mix pointers twice in absl::Hash.
  > New internal-use-only classes `AsStructuredLiteralImpl` and `AsStructuredValueImpl`
  > Annotate some Abseil container methods with [[clang::lifetime_capture_by(...)]]
  > Faster copy from inline Cords to inline Strings
  > Add new benchmark cases for hashing string lengths 1,2,4,8.
  > Move the Arm implementation of UnscaledCycleClock::Now() into the header file, like the x86 implementation, so it can be more easily inlined.
  > Minor include cleanup in absl/random/internal
  > Import of CCTZ from GitHub.
  > Use Bazel Platforms to support AES-NI compile options for Randen
  > In HashState::Create, require that T is a subclass of HashStateBase in order to discourage users from defining their own HashState types.
  > PR #1801: Remove unncessary <iostream> includes
  > New class StructuredProtoField
  > Mix pointers twice in TSan and MSVC to avoid flakes in the PointerAlignment test.
  > Add a test case that type-erased absl::HashState is consistent with absl::HashOf.
  > Mix pointers twice in build modes in which the PointerAlignment test is flaky if we mix once.
  > Increase threshold for stuck bits in PointerAlignment test on android.
  > Use hashing ideas from Carbon's hashtable in absl hashing: - Use byte swap instead of mixing pointers twice. - Change order of branches to check for len<=8 first. - In len<=16 case, do one multiply to mix the data instead of using the logic from go/absl-hash-rl (reinforcement learning was used to optimize the instruction sequence). - Add special handling for len<=32 cases in 64-bit architectures.
  > Test that using a table that was moved-to from a moved-from table fails in sanitizer mode.
  > Remove a trailing comma causing an issue for an OSS user
  > Add missing includes in hash.h.
  > Use the public implementation rule for "@bazel_tools//tools/cpp:clang-cl"
  > Import of CCTZ from GitHub.
  > Change the definition of is_trivially_relocatable to be a bit less conservative.
  > Updates to CI to support newer versions of tools
  > Check if ABSL_HAVE_INTRINSIC_INT128 is defined
  > Print hash expansions in the hash_testing error messages.
  > Avoid flakiness in notification_test on MSVC.
  > Roll back: Add more debug capacity validation checks on moves.
  > Add more debug capacity validation checks on moves.
  > Add macro versions of nullability annotations.
  > Improve fork-safety by opening files with `O_CLOEXEC`.
  > Move ABSL_HARDENING_ASSERTs in constexpr methods to their own lines.
  > Add test cases for absl::Hash: - That hashes are consistent for the same int value across different int types. - That hashes of vectors of strings are unequal even when their concatenations are equal. - That FragmentedCord hashes works as intended for small Cords.
  > Skip the IterationOrderChangesOnRehash test case in ASan mode because it's flaky.
  > Add missing includes in absl hash.
  > Try to use file descriptors in the 2000+ range to avoid mis-behaving client interference.
  > Add weak implementation of the __lsan_is_turned_off in Leak Checker
  > Fix a bug where EOF resulted in infinite loop.
  > static_assert that absl::Time and absl::Duration are trivially destructible.
  > Move Duration ToInt64<unit> functions to be inline.
  > string_view: Add defaulted copy constructor and assignment
  > Use `#ifdef` to avoid errors when `-Wundef` is used.
  > Strip PAC bits for return addresses in stack traces
  > PR #1794: Update cpu_detect.cc fix hw crc32 and AES capability check, fix undefined
  > PR #1790: Respect the allocator's .destroy method in ~InlinedVector
  > Cast away nullability in the guts of CHECK_EQ (et al) where Clang doesn't see that the nullable string returned by Check_EQImpl is statically nonnull inside the loop.
  > string_view: Correct string_view(const char*, size_type) docs
  > Add support for std::string_view in StrCat even when absl::string_view != std::string_view.
  > Misc. adjustments to unit tests for logging.
  > Use local_config_cc from rules_cc and make it a dev dependency
  > Add additional iteration order tests with reservation. Reserved tables have a different way of iteration randomization compared to gradually resized tables (at least for small tables).
  > Use all the bits (`popcount`) in `FindFirstNonFullAfterResize` and `PrepareInsertAfterSoo`.
  > Mark ConsumePrefix, ConsumeSuffix, StripPrefix, and StripSuffix as constexpr since they are all pure functions.
  > PR #1789: Add missing #ifdef pp directive to the TypeName() function in the layout.h
  > PR #1788: Fix warning for sign-conversion on riscv
  > Make StartsWith and EndsWith constexpr.
  > Simplify logic for growing single group table.
  > Document that absl::Time and absl::Duration are trivially destructible.
  > Change some C-arrays to std::array as this enables bounds checking in some hardened standard library builds
  > Replace outdated select() on --cpu with platform API equivalent.
  > Take failure_message as const char* instead of string_view in LogMessageFatal and friends.
  > Mention `c_any_of` in the function comment of `absl::c_linear_search`.
  > Import of CCTZ from GitHub.
  > Rewrite some string_view methods to avoid a -Wunreachable-code warning
  > IWYU: Update includes and fix minor spelling mistakes.
  > Add comment on how to get next element after using erase.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND and a doc note about absl::LogAsLiteral to clarify its intended use.
  > Import of CCTZ from GitHub.
  > Reduce memory consumption of structured logging proto encoding by passing tag value
  > Remove usage of _LIBCPP_HAS_NO_FILESYSTEM_LIBRARY.
  > Make Span's relational operators constexpr since C++20.
  > distributions: support a zero max value in Zipf.
  > PR #1786: Fix typo in test case.
  > absl/random: run clang-format.
  > Add some nullability annotations in logging and tidy up some NOLINTs and comments.
  > CMake: Change the default for ABSL_PROPAGATE_CXX_STD to ON
  > Delete UnvalidatedMockingBitGen
  > PR #1783: [riscv][debugging] Fix a few warnings in RISC-V inlines
  > Add conversion operator to std::array for StrSplit.
  > Add a comment explaining the extra comparison in raw_hash_set::operator==. Also add a small optimization to avoid the extra comparison in sets that use hash_default_eq as the key_equal functor.
  > Add benchmark for absl::HexStringToBytes
  > Avoid installing options.h with the other headers
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::Span constructors.
  > Annotate absl::InlinedVector to warn when unused.
  > Make `c_find_first_of`'s `options` parameter a const reference to allow temporaries.
  > Disable Elf symbols for Xtensa
  > PR #1775: Support symbolize only on WINAPI_PARTITION_DESKTOP
  > Require through an internal presubmit that .h|.cc|.inc files contain either the string ABSL_NAMESPACE_BEGIN or SKIP_ABSL_INLINE_NAMESPACE_CHECK
  > Xtensa supports mmap, enable it in absl/base/config.h
  > PR #1777: Avoid std::ldexp in `operator double(int128)`.
  > Marks absl::Span as view and borrowed_range, like std::span.
  > Mark inline functions with only a simple comparison in strings/ascii.h as constexpr.
  > Add missing Abseil inline namespace and fix includes
  > Fix bug where the high bits of `__int128_t`/`__uint128_t` might go unused in the hash function. This fix increases the hash quality of these types.
  > Add a test to verify bit casting between signed and unsigned int128 works as expected
  > Add suggestions to enable sanitizers for asserts when doing so may be helpful.
  > Add nullability attributes to nullability type aliases.
  > Refactor swisstable moves.
  > Improve ABSL_ASSERT performance by guaranteeing it is optimized away under NDEBUG in C++20
  > Mark Abseil hardening assert in AssertSameContainer as slow.
  > Add workaround for q++ 8.3.0 (QNX 7.1) compiler by making sure MaskedPointer is trivially copyable and copy constructible.
  > Small Mutex::Unlock optimization
  > Optimize `CEscape` and `CEscapeAndAppend` by up to 40%.
  > Fix the conditional compilation of non_temporal_store_memcpy_avx to verify that AVX can be forced via `gnu::target`.
  > Delete TODOs to move functors when moving hashtables and add a test that fails when we do so.
  > Fix benchmarks in `escaping_benchmark.cc` by properly calling `benchmark::DoNotOptimize` on both inputs and outputs and by removing the unnecessary and wrong `ABSL_RAW_CHECK` condition (`check != 0`) of `BM_ByteStringFromAscii_Fail` benchmark.
  > It seems like commit abc9b916a94ebbf251f0934048295a07ecdbf32a did not work as intended.
  > Fix a bug in `absl::SetVLogLevel` where a less generic pattern incorrectly removed a more generic one.
  > Remove the side effects between tests in vlog_is_on_test.cc
  > Attempt to fix flaky Abseil waiter/sleep tests
  > Add an explicit tag for non-SOO CommonFields (removing default ctor) and add a small optimization for early return in AssertNotDebugCapacity.
  > Make moved-from swisstables behave the same as empty tables. Note that we may change this in the future.
  > Tag tests that currently fail on darwin_arm64 with "no_test_darwin_arm64"
  > add gmock to cmake defs for no_destructor_test
  > Optimize raw_hash_set moves by allowing some members of CommonFields to be uninitialized when moved-from.
  > Add more debug capacity validation checks on iteration/size.
  > Add more debug capacity validation checks on copies.
  > constinit -> constexpr for DisplayUnits
  > LSC: Fix null safety issues diagnosed by Clang’s `-Wnonnull` and `-Wnullability`.
  > Remove the extraneous variable creation in Match().
  > Import of CCTZ from GitHub.
  > Add more debug capacity validation checks on merge/swap.
  > Add `absl::` namespace to c_linear_search implementation in order to avoid ADL
  > Distinguish the debug message for the case of self-move-assigned swiss tables.
  > Update LowLevelHash comment regarding number of hash state variables.
  > Add an example for the `--vmodule` flag.
  > Remove first prefetch.
  > Add moved-from validation for the case of self-move-assignment.
  > Allow slow and fast abseil hardening checks to be enabled independently.
  > Update `ABSL_RETIRED_FLAG` comment to reflect `default_value` is no longer used.
  > Add validation against use of moved-from hash tables.
  > Provide file-scoped pragma behind macro ABSL_POINTERS_DEFAULT_NONNULL to indicate the default nullability. This is a no-op for now (not understood by checkers), but does communicate intention to human readers.
  > Add stacktrace config for android using the generic implementation
  > Fix nullability annotations in ABSL code.
  > Replace CHECKs with ASSERTs and EXPECTs -- no reason to crash on failure.
  > Remove ABSL_INTERNAL_ATTRIBUTE_OWNER and ABSL_INTERNAL_ATTRIBUTE_VIEW
  > Migrate ABSL_INTERNAL_ATTRIBUTE_OWNER and ABSL_INTERNAL_ATTRIBUTE_VIEW to ABSL_ATTRIBUTE_OWNER and ABSL_ATTRIBUTE_VIEW
  > Disable ABSL_ATTRIBUTE_OWNER and ABSL_ATTRIBUTE_VIEW prior to Clang-13 due to false positives.
  > Make ABSL_ATTRIBUTE_VIEW and ABSL_ATTRIBUTE_OWNER public
  > Optimize raw_hash_set::AssertHashEqConsistent a bit to avoid having as much runtime overhead.
  > PR #1728: Workaround broken compilation against NDK r25
  > Add validation against use of destroyed hash tables.
  > Do not truncate `ABSL_RAW_LOG` output at null bytes
  > Use several unused cord instances in tests and benchmarks.
  > Add comments about ThreadIdentity struct allocation behavior.
  > Refactoring followup for reentrancy validation in swisstable.
  > Add debug mode checks that element constructors/destructors don't make reentrant calls to raw_hash_set member functions.
  > Add tagging for cc_tests that are incompatible with Fuchsia
  > Add GetTID() implementation for Fuchsia
  > PR #1738: Fix shell option group handling in pkgconfig files
  > Disable weak attribute when absl is compiled as a Windows DLL
  > Remove `CharIterator::operator->`.
  > Mark non-modifying container algorithms as constexpr for C++20.
  > PR #1739: container/internal: Explicitly include <cstdint>
  > Don't match -Wnon-virtual-dtor in the "flags are needed to suppress warnings in headers". It should fall through to the "don't impose our warnings on others" case. Do this by matching on "-Wno-*" instead of "-Wno*".
  > PR #1732: Fix build on NVIDIA Jetson board. Fix #1665
  > Update GoogleTest dependency to 1.15.2
  > Enable AsciiStrToLower and AsciiStrToUpper overloads for rvalue references.
  > PR #1735: Avoid `int` to `bool` conversion warning
  > Add `absl::swap` functions for `*_hash_*` to avoid calling `std::swap`
  > Change internal visibility
  > Remove resolved issue.
  > Increase test timeouts to support running on Fuchsia emulators
  > Add tracing annotations to absl::Notification
  > Suppress compiler optimizations which may break container poisoning.
  > Disable ABSL_INTERNAL_HAVE_DEBUGGING_STACK_CONSUMPTION for Fuchsia
  > Add tracing annotations to absl::BlockingCounter
  > Add absl_vlog_is_on and vlog_is_on to ABSL_INTERNAL_DLL_TARGETS
  > Update swisstable swap API comments to no longer guarantee that we don't move/swap individual elements.
  > PR #1726: cmake: Fix RUNPATH when using BUILD_WITH_INSTALL_RPATH=True
  > Avoid unnecessary copying when upper-casing or lower-casing ASCII string_view
  > Add weak internal tracing API
  > Fix LINT.IfChange syntax
  > PR #1720: Fix spelling mistake: occurrance -> occurrence
  > Add missing include for Windows ASAN configuration in poison.cc
  > Delete absl/strings/internal/has_absl_stringify.h now that the GoogleTest version we depend on uses the public file
  > Update versions of dependencies in preparation for release
  > PR #1699: Add option to build with MSVC static runtime
  > Remove unneeded 'be' from comment.
  > PR #1715: Generate options.h using CMake only once
  > Small typo fix in absl/log/internal/log_impl.h
  > PR #1709: Handle RPATH CMake configuration
  > PR #1710: fixup! PR #1707: Fixup absl_random compile breakage in Apple ARM64 targets
  > PR #1695: Fix time library build for Apple platforms
  > Remove cyclic cmake dependency that breaks in cmake 3.30.0
  > Roll forward poisoned pointer API and fix portability issues.
  > Use GetStatus in IsOkAndHoldsMatcher
  > PR #1707: Fixup absl_random compile breakage in Apple ARM64 targets
  > PR #1706: Require CMake version 3.16
  > Add an MSVC implementation of ABSL_ATTRIBUTE_LIFETIME_BOUND
  > Mark c_min_element, c_max_element, and c_minmax_element as constexpr in C++17.
  > Optimize the absl::GetFlag cost for most non built-in flag types (including string).
  > Encode some additional metadata when writing protobuf-encoded logs.
  > Replace signed integer overflow, since that's undefined behavior, with unsigned integer overflow.
  > Make mutable CompressedTuple::get() constexpr.
  > vdso_support: support DT_GNU_HASH
  > Make c_begin, c_end, and c_distance conditionally constexpr.
  > Add operator<=> comparison to absl::Time and absl::Duration.
  > Deprecate `ABSL_ATTRIBUTE_NORETURN` in favor of the `[[noreturn]]` standardized in C++11
  > Rollback new poisoned pointer API
  > Static cast instead of reinterpret cast for raw hash set slots, as casting from void* to T* is well defined
  > Fix absl::NoDestructor documentation about its use as a global
  > Declare Rust demangling feature-complete.
  > Split demangle_internal into a tree of smaller libraries.
  > Decode Rust Punycode when it's not too long.
  > Add assertions to detect reentrance in `IterateOverFullSlots` and `absl::erase_if`.
  > Decoder for Rust-style Punycode encodings of bounded length.
  > Add `c_contains()` and `c_contains_subrange()` to `absl/algorithm/container.h`.
  > Three-way comparison spaceship <=> operators for Cord.
  > internal-only change
  > Remove erroneous preprocessor branch on SGX_SIM.
  > Add an internal API to get a poisoned pointer.
  > optimization.h: Add missing <utility> header for C++
  > Add a compile test for headers that require C compatibility
  > Fix comment typo
  > Expand documentation for SetGlobalVLogLevel and SetVLogLevel.
  > Roll back 6f972e239f668fa29cab43d7968692cd285997a9
  > PR #1692: Add missing `<utility>` include
  > Remove NOLINT for `#include <new>` for __cpp_lib_launder
  > Remove not used after all kAllowRemoveReentrance parameter from IterateOverFullSlots.
  > Create `absl::container_internal::c_for_each_fast` for SwissTable.
  > Disable flaky test cases in kernel_timeout_internal_test.
  > Document that swisstable and b-tree containers are not exception-safe.
  > Add `ABSL_NULLABILITY_COMPATIBLE` attribute.
  > LSC: Move expensive variables on their last use to avoid copies.
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to more types in Abseil
  > Drop std:: qualification from integer types like uint64_t.
  > Increase slop time on MSVC in PerThreadSemTest.Timeouts again due to continued flakiness.
  > Turn on validation for out of bounds MockUniform in MockingBitGen
  > Use ABSL_UNREACHABLE() instead of equivalent
  > If so configured, report which part of a C++ mangled name didn't parse.
  > Sequence of 1-to-4 values with prefix sum to support Punycode decoding.
  > Add the missing inline namespace to the nullability files
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to types in Abseil
  > Disallow reentrance removal in `absl::erase_if`.
  > Fix implicit conversion of temporary bitgen to BitGenRef
  > Use `IterateOverFullSlots` in `absl::erase_if` for hash table.
  > UTF-8 encoding library to support Rust Punycode decoding.
  > Disable negative NaN float ostream format checking on RISC-V
  > PR #1689: Minor: Add missing quotes in CMake string view library definition
  > Demangle template parameter object names, TA <template-arg>.
  > Demangle sr St <simple-id> <simple-id>, a dubious encoding found in the wild.
  > Try not to lose easy type combinators in S::operator const int*() and the like.
  > Demangle fixed-width floating-point types, DF....
  > Demangle _BitInt types DB..., DU....
  > Demangle complex floating-point literals.
  > Demangle <extended-qualifier> in types, e.g., U5AS128 for address_space(128).
  > Demangle operator co_await (aw).
  > Demangle fully general vendor extended types (any <template-args>).
  > Demangle transaction-safety notations GTt and Dx.
  > Demangle C++11 user-defined literal operator functions.
  > Demangle C++20 constrained friend names, F (<source-name> | <operator-name>).
  > Demangle dependent GNU vector extension types, Dv <expression> _ <type>.
  > Demangle elaborated type names, (Ts | Tu | Te) <name>.
  > Add validation that hash/eq functors are consistent, meaning that `eq(k1, k2) -> hash(k1) == hash(k2)`.
  > Demangle delete-expressions with the global-scope operator, gs (dl | da) ....
  > Demangle new-expressions with braced-init-lists.
  > Demangle array new-expressions, [gs] na ....
  > Demangle object new-expressions, [gs] nw ....
  > Demangle preincrement and predecrement, pp_... and mm_....
  > Demangle throw and rethrow (tw... and tr).
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Demangle ti... and te... expressions (typeid).
  > Demangle nx... syntax for noexcept(e) as an expression in a dependent signature.
  > Demangle alignof expressions, at... and az....
  > Demangle C++17 structured bindings, DC...E.
  > Demangle modern _ZGR..._ symbols.
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Demangle sizeof...(pack captured from an alias template), sP ... E.
  > Demangle types nested under vendor extended types.
  > Demangle il ... E syntax (braced list other than direct-list-initialization).
  > Avoid signed overflow for Ed <number> _ manglings with large <number>s.
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Remove obsolete TODO
  > Clarify function comment for `erase` by stating that this idiom only works for "some" standard containers.
  > Move SOVERSION to global CMakeLists, apply SOVERSION to DLL
  > Set ABSL_HAVE_THREAD_LOCAL to 1 on all platforms
  > Demangle constrained auto types (Dk <type-constraint>).
  > Parse <discriminator> more accurately.
  > Demangle lambdas in class member functions' default arguments.
  > Demangle unofficial <unresolved-qualifier-level> encodings like S0_IT_E.
  > Do not make std::filesystem::path hash available for macOS <10.15
  > Include flags in DLL build (non-Windows only)
  > Enable building monolithic shared library on macOS and Linux.
  > Demangle Clang's last-resort notation _SUBSTPACK_.
  > Demangle C++ requires-expressions with parameters (rQ ... E).
  > Demangle Clang's encoding of __attribute__((enable_if(condition, "message"))).
  > Demangle static_cast and friends.
  > Demangle decltype(expr)::nested_type (NDT...E).
  > Optimize GrowIntoSingleGroupShuffleControlBytes.
  > Demangle C++17 fold-expressions.
  > Demangle thread_local helper functions.
  > Demangle lambdas with explicit template arguments (UlTy and similar forms).
  > Demangle &-qualified function types.
  > Demangle valueless literals LDnE (nullptr) and LA<number>_<type>E ("foo").
  > Correctly demangle the <unresolved-name> at the end of dt and pt (x.y, x->y).
  > Add missing targets to ABSL_INTERNAL_DLL_TARGETS
  > Build abseil_test_dll with ABSL_BUILD_TESTING
  > Demangle C++ requires-expressions without parameters (rq ... E).
  > overload: make the constructor constexpr
  > Update Abseil CI Docker image to use Clang 19, GCC 14, and CMake 3.29.3
  > Workaround symbol resolution bug in Clang 19
  > Workaround bogus GCC14 -Wmaybe-uninitialized warning
  > Silence a bogus GCC14 -Warray-bounds warning
  > Forbid absl::Uniform<absl::int128>(gen)
  > Use IN_LIST to replace list(FIND) + > -1
  > Recognize C++ vendor extended expressions (e.g., u9__is_same...E).
  > `overload_test`: Remove a few unnecessary trailing return types
  > Demangle the C++ this pointer (fpT).
  > Stop eating an extra E in ParseTemplateArg for some L<type><value>E literals.
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to Abseil.
  > Demangle C++ direct-list-initialization (T{1, 2, 3}, tl ... E).
  > Demangle the C++ spaceship operator (ss, operator<=>).
  > Demangle C++ sZ encodings (sizeof...(pack)).
  > Demangle C++ so ... E encodings (typically array-to-pointer decay).
  > Recognize dyn-trait-type in Rust demangling.
  > Rework casting in raw_hash_set's IsFull().
  > Remove test references to absl::SharedBitGen, which was never part of the open source release. This was only used in tests that never ran as part of the open source release.
  > Recognize fn-type and lifetimes in Rust demangling.
  > Support int128/uint128 in validated MockingBitGen
  > Recognize inherent-impl and trait-impl in Rust demangling.
  > Recognize const and array-type in Rust mangled names.
  > Remove Asylo from absl.
  > Recognize generic arguments containing only types in Rust mangled names.
  > Fix missing #include <random> for std::uniform_int_distribution
  > Move `prepare_insert` out of the line as type erased `PrepareInsertNonSoo`.
  > Revert: Add -Wdead-code-aggressive to ABSL_LLVM_FLAGS
  > Add (unused) validation to absl::MockingBitGen
  > Support `AbslStringify` with `DCHECK_EQ`.
  > PR #1672: Optimize StrJoin with tuple without user defined formatter
  > Give ReturnAddresses and N<uppercase> namespaces separate stacks for clarity.
  > Demangle Rust backrefs.
  > Use Nt for struct and trait names in Rust demangler test inputs.
  > Allow __cxa_demangle on MIPS
  > Add a `string_view` overload to `absl::StrJoin`
  > Demangle Rust's Y<type><path> production for passably simple <type>s.
  > `convert_test`: Delete obsolete condition around ASSERT_EQ in TestWithMultipleFormatsHelper
  > `any_invocable`: Clean up #includes
  > Resynchronize absl/functional/CMakeLists.txt with BUILD.bazel
  > `any_invocable`: Add public documentation for undefined behavior when invoking an empty AnyInvocable
  > `any_invocable`: Delete obsolete reference to proposed standard type
  > PR #1662: Replace shift with addition in crc multiply
  > Doc fix.
  > `convert_test`: Extract loop over tested floats from helper function
  > Recognize some simple Rust mangled names in Demangle.
  > Use __builtin_ctzg and __builtin_clzg in the implementations of CountTrailingZeroesNonzero16 and CountLeadingZeroes16 when they are available.
  > Remove the forked absl::Status matchers implementation in statusor_test
  > Add comment hack to fix copybara reversibility
  > Add GoogleTest matchers for absl::Status
  > [random] LogUniform: Document as a discrete distribution
  > Enable Cord tests with Crc.
  > Fix order of qualifiers in `absl::AnyInvocable` documentation.
  > Guard against null pointer dereference in DumpNode.
  > Apply ABSL_MUST_USE_RESULT to try lock functions.
  > Add public aliases for default hash/eq types in hash-based containers
  > Import of CCTZ from GitHub.
  > Remove the hand-rolled CordLeaker and replace with absl::NoDestructor to test the after-exit behavior
  > `convert_test`: Delete obsolete `skip_verify` parameter in test helper
  > overload: allow using the underlying type with CTAD directly.
  > PR #1653: Remove unnecessary casts when calling CRC32_u64
  > PR #1652: Avoid C++23 deprecation warnings from float_denorm_style
  > Minor cleanup for `absl::Cord`
  > PR #1651: Implement ABSL_INTERNAL_DISABLE_DEPRECATED_DECLARATION_WARNING for MSVC compiler
  > Add `operator<=>` support to `absl::int128` and `absl::uint128`
  > [absl] Re-use the existing `std::type_identity` backfill instead of redefining it again
  > Add `absl::AppendCordToString`
  > `str_format/convert_test`: Delete workaround for [glibc bug](https://sourceware.org/bugzilla/show_bug.cgi?id=22142)
  > `absl/log/internal`: Document conditional ABSL_ATTRIBUTE_UNUSED, add C++17 TODO
  > `log/internal/check_op`: Add ABSL_ATTRIBUTE_UNUSED to CHECK macros when STRIP_LOG is enabled
  > log_benchmark: Add VLOG_IS_ON benchmark
  > Restore string_view detection check
  > Remove an unnecessary ABSL_ATTRIBUTE_UNUSED from a logging macro
  < Abseil LTS Branch, Jan 2024, Patch 2 (#1650)
  > In example code, add missing template parameter.
  > Optimize crc32 V128_From2x64 on Arm
  > Annotate that Mutex should warn when unused.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to Cord::Flatten/TryFlat
  > Deprecate `absl::exchange`, `absl::forward` and `absl::move`, which were only useful before C++14.
  > Temporarily revert dangling std::string_view detection until dependent is fixed
  > Use _decimal_ literals for the CivilDay example.
  > Fix bug in BM_EraseIf.
  > Add internal traits to absl::string_view for lifetimebound detection
  > Add internal traits to absl::StatusOr for lifetimebound detection
  > Add internal traits to absl::Span for lifetimebound detection
  > Add missing dependency for log test build target
  > Add internal traits for lifetimebound detection
  > Use local decoding buffer in HexStringToBytes
  > Only check if the frame pointer is inside a signal stack with known bounds
  > Roll forward: enable small object optimization in swisstable.
  > Optimize LowLevelHash by breaking dependency between final loads and previous len/ptr updates.
  > Fix the wrong link.
  > Optimize InsertMiss for tables without kDeleted slots.
  > Use GrowthInfo without applying any optimizations based on it.
  > Disable small object optimization while debugging some failing tests.
  > Adjust conditional compilation in non_temporal_memcpy.h
  > Reformat log/internal/BUILD
  > Remove deprecated errno constants from the absl::Status mapping
  > Introduce GrowthInfo with tests, but without usage.
  > Enable small object optimization in swisstable.
  > Refactor the GCC uninitialized memory warning suppression in raw_hash_set.h.
  > Respect `NDEBUG_SANITIZER`
  > Revert integer-to-string conversion optimizations pending more thorough analysis
  > Fix a bug in `Cord::{Append,Prepend}(CordBuffer)`: call `MaybeRemoveEmptyCrcNode()`. Otherwise appending a `CordBuffer` to an empty Cord with a CRC node crashes (`RemoveCrcNode()` increases the refcount of a nullptr child).
  > Add `BM_EraseIf` benchmark.
  > Record sizeof(key_type), sizeof(value_type) in hashtable profiles.
  > Fix ClangTidy warnings in btree.h.
  > LSC: Move expensive variables on their last use to avoid copies.
  > PR #1644: unscaledcycleclock: remove RISC-V support
  > Reland: Make DLOG(FATAL) not understood as [[noreturn]]
  > Separate out absl::StatusOr constraints into statusor_internal.h
  > Use Layout::WithStaticSizes in btree.
  > `layout`: Delete outdated comments about ElementType alias not being used because of MSVC
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > `layout_benchmark`: Replace leftover comment with intended call to MyAlign
  > Remove absl::aligned_storage_t
  > Delete ABSL_ANNOTATE_MEMORY_IS_INITIALIZED under Thread Sanitizer
  > Remove vestigial variables in the DumpNode() helper in absl::Cord
  > Do hashtablez sampling on the first insertion into an empty SOO hashtable.
  > Add explicit #include directives for <tuple>, "absl/base/config.h", and "absl/strings/string_view.h".
  > Add a note about the cost of `VLOG` in non-debug builds.
  > Fix flaky test failures on MSVC.
  > Add template keyword to example comment for Layout::WithStaticSizes.
  > PR #1643: add xcprivacy to all subspecs
  > Record sampling stride in cord profiling to facilitate unsampling.
  > Fix a typo in a comment.
  > [log] Correct SetVLOGLevel to SetVLogLevel in comments
  > Add a feature to container_internal::Layout that lets you specify some array sizes at compile-time as template parameters. This can make offset and size calculations faster.
  > `layout`: Mark parameter of Slices with ABSL_ATTRIBUTE_UNUSED, remove old workaround
  > `layout`: Use auto return type for functions that explicitly instantiate std::tuple in return statements
  > Remove redundant semicolons introduced by macros
  > [log] Make :vlog_is_on/:absl_vlog_is_on public in BUILD.bazel
  > Add additional checks for size_t overflows
  > Replace //visibility:private with :__pkg__ for certain targets
  > PR #1603: Disable -Wnon-virtual-dtor warning for CommandLineFlag implementations
  > Add several missing includes in crc/internal
  > Roll back extern template instantiations in swisstable due to binary size increases in shared libraries.
  > Add nodiscard to SpinLockHolder.
  > Test that rehash(0) reduces capacity to minimum.
  > Add extern templates for common swisstable types.
  > Disable ubsan for benign unaligned access in crc_memcpy
  > Make swisstable SOO support GDB pretty printing and still compile in OSS.
  > Fix OSX support with CocoaPods and Xcode 15
  > Fix GCC7 C++17 build
  > Use UnixEpoch and ZeroDuration
  > Make flaky failures much less likely in BasicMocking.MocksNotTriggeredForIncorrectTypes test.
  > Delete a stray comment
  > Move GCC uninitialized memory warning suppression into MaybeInitializedPtr.
  > Replace usages of absl::move, absl::forward, and absl::exchange with their std:: equivalents
  > Fix the move to itself
  > Work around an implicit conversion signedness compiler warning
  > Avoid MSan: use-of-uninitialized-value error in find_non_soo.
  > Fix flaky MSVC test failures by using longer slop time.
  > Add ABSL_ATTRIBUTE_UNUSED to variables used in an ABSL_ASSUME.
  > Implement small object optimization in swisstable - disabled for now.
  > Document and test ability to use absl::Overload with generic lambdas.
  > Extract `InsertPosition` function to be able to reuse it.
  > Increase GraphCycles::PointerMap size
  > PR #1632: inlined_vector: Use trivial relocation for `erase`
  > Create `BM_GroupPortable_Match`.
  > [absl] Mark `absl::NoDestructor` methods with `absl::Nonnull` as appropriate
  > Automated Code Change
  > Rework casting in raw_hash_set's `IsFull()`.
  > Adds ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::BitGenRef
  > Workaround for NVIDIA C++ compiler being unable to parse variadic expansions in range of range-based for loop
  > Rollback: Make DLOG(FATAL) not understood as [[noreturn]]
  > Make DLOG(FATAL) not understood as [[noreturn]]
  > Optimize `absl::Duration` division and modulo: Avoid repeated redundant comparisons in `IDivFastPath`.
  > Optimize `absl::Duration` division and modulo: Allow the compiler to inline `time_internal::IDivDuration`, by splitting the slow path to a separate function.
  > Fix typo in example code snippet.
  > Automated Code Change
  > Add braces for conditional statements in raw_hash_map functions.
  > Optimize `prepare_insert`, when resize happens. It removes single unnecessary probing before resize that is beneficial for small tables the most.
  > Add noexcept to move assignment operator and swap function
  > Import of CCTZ from GitHub.
  > Minor documentation updates.
  > Change find_or_prepare_insert to return std::pair<iterator, bool> to match return type of insert.
  > PR #1618: inlined_vector: Use trivial relocation for `SwapInlinedElements`
  > Improve raw_hash_set tests.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Use const_cast to avoid duplicating the implementation of raw_hash_set::find(key).
  > Import of CCTZ from GitHub.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Annotate that SpinLock should warn when unused.
  > PR #1625: absl::is_trivially_relocatable now respects assignment operators
  > Introduce `Group::MaskNonFull` without usage.
  > `demangle`: Parse template template and C++20 lambda template param substitutions
  > PR #1617: fix MSVC 32-bit build with -arch:AVX
  > Minor documentation fix for `absl::StrSplit()`
  > Prevent overflow in `absl::CEscape()`
  > `demangle`: Parse optional single template argument for built-in types
  > PR #1412: Filter out `-Xarch_` flags from pkg-config files
  > `demangle`: Add complexity guard to `ParseQRequiresExpr`
  < Prepare 20240116.1 patch for Apple Privacy Manifest (#1623)
  > Remove deprecated symbol absl::kuint128max
  > Add ABSL_ATTRIBUTE_WARN_UNUSED.
  > `demangle`: Parse `requires` clauses on template params, before function return type
  > On Apple, implement absl::is_trivially_relocatable with the fallback.
  > `demangle`: Parse `requires` clauses on functions
  > Make `begin()` to return `end()` on empty tables.
  > `demangle`: Parse C++20-compatible template param declarations, except those with `requires` expressions
  > Add the ABSL_DEPRECATE_AND_INLINE() macro
  > Span: Fixed comment referencing std::span as_writable_bytes() as as_mutable_bytes().
  > Switch rank structs to be consistent with written guidance in go/ranked-overloads
  > Avoid hash computation and `Group::Match` in small tables copy and use `IterateOverFullSlots` for iterating for all tables.
  > Optimize `absl::Hash` by making `LowLevelHash` faster.
  > Add -Wdead-code-aggressive to ABSL_LLVM_FLAGS
  < Backport Apple Privacy Manifest (#1613)
  > Stop using `std::basic_string<uint8_t>` which relies on a non-standard generic `char_traits<>` implementation, recently removed from `libc++`.
  > Add absl_container_hash-based HashEq specialization
  > `demangle`: Implement parsing for simplest constrained template arguments
  > Roll forward 9d8588bfc4566531c4053b5001e2952308255f44 (which was rolled back in 146169f9ad357635b9cd988f976b38bcf83476e3) with fix.
  > Add a version of absl::HexStringToBytes() that returns a bool to validate that the input was actually valid hexadecimal data.
  > Enable StringLikeTest in hash_function_defaults_test
  > Fix a typo.
  > Minor changes to the BUILD file for absl/synchronization
  > Avoid static initializers in case of ABSL_FLAGS_STRIP_NAMES=1
  > Rollback 9d8588bfc4566531c4053b5001e2952308255f44 for breaking the build
  > No public description
  > Decrease the precision of absl::Now in x86-64 debug builds
  > Optimize raw_hash_set destructor.
  > Add ABSL_ATTRIBUTE_UNINITIALIZED macros for use with clang and GCC's `uninitialized`
  > Optimize `Cord::Swap()` for missed compiler optimization in clang.
  > Type erased hash_slot_fn that depends only on key types (and hash function).
  > Replace `testonly = 1` with `testonly = True` in abseil BUILD files.
  > Avoid extra `& msbs` on every iteration over the mask for GroupPortableImpl.
  > Missing parenthesis.
  > Early return from destroy_slots for trivially destructible types in flat_hash_{*}.
  > Avoid export of testonly target absl::test_allocator in CMake builds
  > Use absl::NoDestructor for cordz global queue.
  > Add empty WORKSPACE.bzlmod
  > Introduce `RawHashSetLayout` helper class.
  > Fix a corner case in SpyHashState for exact boundaries.
  > Add nullability annotations
  > Use absl::NoDestructor for global HashtablezSampler.
  > Always check if the new frame pointer is readable.
  > PR #1604: Add privacy manifest
  < Disable ABSL_ATTRIBUTE_TRIVIAL_ABI in open-source builds  (#1606)
  > Remove code pieces for no longer supported GCC versions.
  > Disable ABSL_ATTRIBUTE_TRIVIAL_ABI in open-source builds
  > Prevent brace initialization of AlphaNum
  > Remove code pieces for no longer supported MSVC versions.
  > Added benchmarks for smaller size copy constructors.
  > Migrate empty CrcCordState to absl::NoDestructor.
  > Add protected copy ctor+assign to absl::LogSink, and clarify thread-safety requirements to apply to the interface methods.
  < Apply LTS transformations for 20240116 LTS branch (#1599)

Closes scylladb/scylladb#28756
2026-04-08 12:19:54 +03:00
Liapkovich
4f17cc6d83 docs: add missing rack value for internode_compression parameter
The rack option was fully implemented in the code but omitted from
both docs/operating-scylla/admin.rst and conf/scylla.yaml comments.

Closes scylladb/scylladb#29239
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
0ea76a468f schema: Avoid copies in column_mapping::operator==
In a multi-declarator declaration, the & ref-qualifier is part of each
individual declarator, not the shared type specifier. So:

    const auto& a = x(), b = y();

declares 'a' as a reference but 'b' as a value, silently copying y().
The same applies to:

    const T& a = v[i], b = v[j];

Both operator== lines had this pattern, causing an unnecessary copy of
the column vector and an unnecessary copy of each entry on every call.

Fix by repeating & on the second declarator in both lines.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29213
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
b7c14c6d29 token_metadata: Clear _topology_change_info gently
clear_gently() (introduced in 322aa2f8b5) clears all token_metadata_impl
members using co_await to avoid reactor stalls on large data structures.

_topology_change_info (introduced in 10bf8c7901) was added later and not
included in clear_gently().

update_topology_change_info() already uses utils::clear_gently() when
replacing the value, so it looks reasonable to apply the same pattern
in clear_gently().

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29210
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
54fbbf0410 locator/tablets: Fix missing selector value in error messages
Some on_internal_error() calls pass the selector argument to a format
string that has no placeholder for it.

"While at it", disambiguate selector type in the message text.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29208
2026-04-08 12:19:54 +03:00
Botond Dénes
418141ec08 Merge 'Drop create_dataset() helper from object_store tests' from Pavel Emelyanov
There's only one test left that uses it, and it can be patched to use standard ks/cf creation helpers from pylib. This patch does so and drops the lengthy create_dataset() helper

Tests improvements, no need to backport

Closes scylladb/scylladb#29176

* github.com:scylladb/scylladb:
  test/backup: drop create_dataset helper
  test/backup: use new_test_keyspace in test_restore_primary_replica
2026-04-08 12:19:54 +03:00
Petr Gusev
1e3c8c5a87 test_mutation_schema_change: use tablets
enable_tablets(false) was added when LWT wasn't supported for tablets. It is now, so this attribute is no longer needed.

The test covers behavior that should work the same way for both vnodes and tablets, so it doesn't seem it would benefit much from running in both enable_tablets(true) and enable_tablets(false) modes.

Closes scylladb/scylladb#29167
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
7f854c0255 hints: Use shorter fault-injection overload
To apply a fault-injected delay, there's the inject(duration)
overload, which results in shorter code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29168
2026-04-08 10:51:37 +03:00
Botond Dénes
aeefbda304 Merge 'Simplify and improve API describe_ring code flow' from Pavel Emelyanov
The endpoint in question has some places worth fixing, in particular

- the keyspace parameter is not validated
- the validated table name is resolved into table_id, but the id is unused
- two ugly static helpers to stream obtained token ranges into json

Improving the API code flow, not backporting

Closes scylladb/scylladb#29154

* github.com:scylladb/scylladb:
  api: Inline describe_ring JSON handling
  storage_service: Make describe_ring_for_table() take table_id
2026-04-08 10:50:07 +03:00
Artsiom Mishuta
b1e9c0b867 test/pylib: add typed skip markers plugin
Add skip_reason_plugin.py — a framework-agnostic pytest plugin that
provides typed skip markers (skip_bug, skip_not_implemented, skip_slow,
skip_env) so that the reason a test is skipped is machine-readable in
JUnit XML and Allure reports.  Bare untyped pytest.mark.skip now
triggers a warning (to become an error after full migration).  Runtime
skips via skip() are also enriched by parsing the [type] prefix from
the skip message.

The plugin is a class (SkipReasonPlugin) that receives the concrete
SkipType enum and an optional report_callback from conftest.py, keeping
it decoupled from allure and project-specific types.

Extract SkipType enum and convenience runtime skip wrappers (skip_bug,
skip_env, etc.) into test/pylib/skip_types.py so callers only need a
single import instead of importing both SkipType and skip() separately.
conftest.py imports SkipType from the new module and registers the
plugin instance unconditionally (for all test runners).

New files:
- test/pylib/skip_reason_plugin.py: core plugin — typed marker
  processing, bare-skip warnings, JUnit/Allure report enrichment
  (including runtime skip() parsing via _parse_skip_type helper)
- test/pylib/skip_types.py: SkipType enum and convenience wrappers
  (skip_bug, skip_not_implemented, skip_slow, skip_env)
- test/pylib_test/test_skip_reason_plugin.py: 17 pytester-based
  test functions (51 cases across 3 build modes) covering markers,
  warnings, reports, callbacks, and skip_mode interaction

Infrastructure changes:
- test/conftest.py: import SkipType from skip_types, register
  SkipReasonPlugin with allure report callback
- test/pylib/runner.py: set SKIP_TYPE_KEY/SKIP_REASON_KEY stash keys
  for skip_mode so the report hook can enrich JUnit/Allure with
  skip_type=mode without longrepr parsing
- test/pytest.ini: register typed marker definitions (required for
  --strict-markers even when plugin is not loaded)

Migrated test files (representative samples):
- test/cluster/test_tablet_repair_scheduler.py:
  skip -> skip_bug (#26844), skip -> skip_not_implemented
- test/cqlpy/.../timestamp_test.py: skip -> skip_slow
- test/cluster/dtest/schema_management_test.py: skip -> skip_not_implemented
- test/cluster/test_change_replication_factor_1_to_0.py: skip -> skip_bug (#20282)
- test/alternator/conftest.py: skip -> skip_env
- test/alternator/test_https.py: use skip_env() wrapper

Fixes SCYLLADB-79

Closes scylladb/scylladb#29235
2026-04-08 10:38:56 +03:00
Pavel Emelyanov
e0fa9ee332 Merge 'storage: implement sstable clone for object storage' from Ernest Zaslavsky
This patch series implements `object_storage_base::clone`, which was previously a stub that aborted at runtime. Clone creates a copy of an sstable under a new generation and is used during compaction.

The implementation uses server-side object copies (S3 CopyObject / GCS Objects: rewrite) and mirrors the filesystem clone semantics: TemporaryTOC is written first to mark the operation as in-progress, component objects are copied, and TemporaryTOC is removed to commit (unless the caller requested the destination be left unsealed).

The first two patches fix pre-existing bugs in the underlying storage clients that were exposed by the new clone code path:
- GCS `copy_object` used the wrong HTTP method (PUT instead of POST) and sent an invalid empty request body.
- S3 `copy_object` silently ignored the abort_source parameter.

1. **gcp_client: fix copy_object request method and body** — Fix two bugs in the GCS rewrite API call.
2. **s3_client: pass through abort_source in copy_object** — Stop ignoring the abort_source parameter.
3. **object_storage: add copy_object to object_storage_client** — New interface method with S3 and GCS implementations.
4. **storage: add make_object_name overload with generation** — Helper for building destination object names with a different generation.
5. **storage: make delete_object const** — Needed by the const clone method.
6. **storage: implement object_storage_base::clone** — The actual clone implementation plus a copy_object wrapper.
7. **test/boost: enable sstable clone tests for S3 and GCS** — Re-enable the previously skipped tests.

A test similar to `sstable_clone_leaving_unsealed_dest_sstable` was added to properly test the sealed/unsealed states for object storage. Works for both S3 and GCS.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1045
Prerequisite: https://github.com/scylladb/scylladb/pull/28790
No need to backport since this code targets future feature

Closes scylladb/scylladb#29166

* github.com:scylladb/scylladb:
  compaction_test: enable sstable clone tests for S3 and GCS
  storage: implement object_storage_base::clone
  storage: make delete_object const in object_storage_base
  storage: add make_object_name overload with generation
  sstables: add get_format() accessor to sstable
  object_storage: add copy_object to object_storage_client
  s3_client: pass through abort_source in copy_object
  gcp_client: fix copy_object request method and body
2026-04-08 09:35:10 +03:00
Nadav Har'El
4eeb9f4120 lwt, vector: write to CDC when vector index is enabled.
The vector-search feature introduced the somewhat confusing feature of
enabling CDC without explicitly enabling CDC: When a vector index is
enabled on a table, CDC is "enabled" for it even if the user didn't
ask to enable CDC.

For this, write-path code began to use a new cdc_enabled() function
instead of checking schema.cdc_options.enabled() directly.
cdc_enabled() returns true if either cdc_options.enabled() or
has_vector_index() is true.

Unfortunately, LWT writes continued to use cdc_options.enabled() instead
of the new cdc_enabled(). This means that if a vector index is used and
a vector is written using an LWT write, the new value is not indexed.

This patch fixes this bug. It also adds a regression test that fails
before this patch and passes afterwards - the new test verifies that
when a table has a vector index (but no explicit CDC enabled), the CDC
log is updated both after regular writes and after successful LWT writes.

This patch was also tested in the context of the upcoming vector-search-
for-Alternator pull request, which has a test reproducing this bug
(Alternator uses LWT frequently, so this is very important there).
It will also be tested by the vector-store test suite ("validator").

Fixes SCYLLADB-1342

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#29300
2026-04-08 07:55:05 +03:00
Marcin Maliszkiewicz
1bf3110adb Merge 'test: add test_upgrade_preserves_ddl_audit_for_tables' from Andrzej Jackowski
Verify that upgrading from 2025.1 to master does not silently drop DDL
auditing for table-scoped audit configurations ([SCYLLADB-1155](https://scylladb.atlassian.net/browse/SCYLLADB-1155)).

Test time in dev: 4s

Refs: SCYLLADB-1155
Fixes: SCYLLADB-1305
No backport, test for bug on master

[SCYLLADB-1155]: https://scylladb.atlassian.net/browse/SCYLLADB-1155?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29223

* github.com:scylladb/scylladb:
  test: add test_upgrade_preserves_ddl_audit_for_tables
  test: audit: split validate helper so callers need not pass audit_settings
  test: audit: declare manager attribute in AuditTester base class
2026-04-07 17:29:11 +02:00
Marcin Maliszkiewicz
895fdb6d29 Merge 'ldap: fix double-free of LDAPMessage in poll_results()' from Andrzej Jackowski
In the unregistered-ID branch, ldap_msgfree() was called on a result
already owned by an RAII ldap_msg_ptr, causing a double-free on scope
exit. Remove the redundant manual free.
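The ownership bug can be modeled outside C++ with a toy allocator (all names here are illustrative stand-ins, not the real libldap API):

```python
from contextlib import contextmanager

class MsgPool:
    """Toy allocator that detects double frees, standing in for the C
    library's ldap_msgfree()."""
    def __init__(self):
        self._live = set()

    def alloc(self):
        msg = object()
        self._live.add(msg)
        return msg

    def free(self, msg):
        if msg not in self._live:
            raise RuntimeError("double free of LDAPMessage")
        self._live.discard(msg)

@contextmanager
def owned_msg(pool):
    # Models the RAII ldap_msg_ptr: frees the message once, at scope exit.
    msg = pool.alloc()
    try:
        yield msg
    finally:
        pool.free(msg)

pool = MsgPool()
with owned_msg(pool) as msg:
    # The bug: an extra manual pool.free(msg) here would turn the
    # scope-exit free into a double free. The fix removes the manual
    # free, leaving ownership solely with the RAII wrapper.
    pass
assert not pool._live  # exactly one free happened, at scope exit
```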

Fixes: SCYLLADB-1344

Backport: 2026.1, 2025.4, 2025.1 - it's a memory corruption, with a one-line fix, so better backport it everywhere.

Closes scylladb/scylladb#29302

* github.com:scylladb/scylladb:
  test: ldap: add regression test for double-free on unregistered message ID
  ldap: fix double-free of LDAPMessage in poll_results()
2026-04-07 17:27:43 +02:00
Ernest Zaslavsky
422f107122 compaction_test: enable sstable clone tests for S3 and GCS
Now that object_storage_base::clone is implemented,
remove the early-return skips and re-enable the
sstable_clone_leaving_unsealed_dest_sstable tests for
both S3 and GCS storage backends.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
7cd9bbb010 storage: implement object_storage_base::clone
Implement the clone method for object_storage_base, which creates
a copy of an sstable with a new generation using server-side object
copies. Also add a const copy_object convenience wrapper, similar
to the existing put_object and delete_object wrappers.

A dedicated test for the new object storage clone path will be
added in the following commit. The preexisting local-filesystem
clone is already covered by the sstable_clone_leaving_unsealed_dest_sstable
test.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
8fa82e6b6f storage: make delete_object const in object_storage_base
The method doesn't modify any member state. Making it
const is needed for calling it from the const clone
method.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
47387341bb storage: add make_object_name overload with generation
Add a make_object_name overload that accepts a target
generation parameter for constructing object names with
a generation different from the source sstable's own.

Refactor the original make_object_name to delegate to
the new overload, eliminating code duplication.

This is needed by clone to build destination object
names for the new generation.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
8bd891c6ed sstables: add get_format() accessor to sstable
Add a public get_format() accessor for the _format member, following
the same pattern as the existing get_version(). This allows storage
implementations to access the sstable format without reaching into
private members, and is needed by the upcoming object_storage_base::clone
to construct entry_descriptor for the sstables registry.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
3d23490615 object_storage: add copy_object to object_storage_client
Add a copy_object method to the object_storage_client
interface for server-side object copies, with
implementations for both S3 and GCS wrappers.

The S3 wrapper delegates to s3::client::copy_object.
The GCS wrapper delegates to gcp::storage::client's
cross-bucket copy_object overload.

This is a prerequisite for implementing sstable clone
on object storage.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
1702d6e6d4 s3_client: pass through abort_source in copy_object
The abort_source parameter in s3::client::copy_object
was ignored — the function accepted it but always passed
nullptr to the underlying copy_s3_object. Forward it
properly so callers can cancel in-progress copies.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
bfdc1e5267 gcp_client: fix copy_object request method and body
The GCP copy_object (rewrite API) had two bugs:

1. The request body was an empty string, but the GCP
   rewrite endpoint always parses it as JSON metadata.
   An empty string is not valid JSON, resulting in
   400 "Metadata in the request couldn't decode".
   Fix: send "{}" (empty JSON object) as the body.

2. The HTTP method was PUT, but the GCP Objects: rewrite
   API requires POST per the documentation.
   Fix: use POST.
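The first bug can be demonstrated with any strict JSON parser (Python's here, standing in for the GCS endpoint's metadata decoder; the helper name is illustrative):

```python
import json

def parse_rewrite_metadata(body: str) -> dict:
    # The GCS rewrite endpoint parses the request body as JSON metadata.
    # An empty body is not valid JSON and is rejected (HTTP 400); an
    # empty JSON object "{}" parses to "no metadata overrides".
    return json.loads(body)

# "{}" is accepted and yields an empty metadata dict.
assert parse_rewrite_metadata("{}") == {}

# An empty string is rejected by the JSON parser.
rejected = False
try:
    parse_rewrite_metadata("")
except json.JSONDecodeError:
    rejected = True
assert rejected
```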

Test coverage in a follow-up patch
2026-04-07 18:16:52 +03:00
Nadav Har'El
a0e79f391f Merge 'alternator: fix batch write item squashing cdc entries' from Radosław Cybulski
When `BatchWriteItem` operates on multiple items sharing the same partition key in `always_use_lwt` write isolation mode, all CDC log entries are emitted under a single timestamp. The previous `get_records` parsing algorithm in `alternator/streams.cc` assumed that all CDC log entries sharing the same timestamp correspond to a single DynamoDB item change. As a result, it would incorrectly squash multiple distinct item changes into a single Streams record — producing wrong event data (e.g., one INSERT instead of four, with mismatched key/attribute values).

Note: the bug is specific to `always_use_lwt` mode because only in LWT mode does the entire batch share a single timestamp. In non-LWT modes, each item in the batch receives a separate timestamp, so the entries naturally stay separate.

**Commit 1: alternator: add BatchWriteItem Streams test**

- Adds new tests `test_streams_batchwrite_no_clustering_deletes_non_existing_items` and `test_streams_batchwrite_no_clustering_deletes_existing_items` that cover the corner cases of batch-deleting an existing and a non-existing item in a table without a clustering key. CDC tables without clustering keys are handled differently, and this path was previously untested for delete operations.
- Adds a new test `test_streams_batchwrite_into_the_same_partition_will_report_wrong_stream_data`, which is a simple way to trigger the bug.
- Adds a new test `test_streams_batchwrite_into_the_same_partition_deletes_existing_items`, that validates various combinations of puts and deletes in a single BatchWrite against the same partition.
- Adds a new `test_table_ss_new_and_old_images_write_isolation_always` fixture and extends `create_table_ss` to accept `additional_tags`, enabling tests with a specific write isolation mode.

**Commit 2: alternator: fix BatchWriteItem squashed Streams entries**

The core fix rewrites the CDC log entry parsing in `get_records` to distinguish items by their clustering key:

- Introduces `managed_bytes_ptr_hash` and `managed_bytes_ptr_equal` helper structs for pointer-based hash map lookups on `managed_bytes`.
- Replaces the single `record`/`dynamodb` pair with a `std::unordered_map<const managed_bytes*, Record, ...>` (`records_map`) keyed by the base table's clustering key value from each CDC log row. For tables without a clustering key, all entries map to a single sentinel key.
- Adds a validation that Alternator tables have at most one clustering key column (as required by the DynamoDB data model).
- On end-of-record (`eor`), flushes all accumulated per-clustering-key records into the output, each with a unique `eventID` (the `event_id` format now includes an index suffix).
- Adjusts the limit check: since a single CDC timestamp bucket can now produce multiple output records, the limit may be slightly exceeded to avoid breaking mid-batch.
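A rough Python model of the keying scheme (row shapes and names are illustrative, not the C++ code):

```python
NO_CLUSTERING_KEY = object()  # sentinel for tables without a clustering key

def group_cdc_rows(rows, timestamp):
    """Group CDC log rows that share one timestamp into per-item records.

    Rows for the same item carry the same clustering key ('ck'). The old
    logic merged everything under one timestamp into a single record;
    keying by clustering key keeps distinct items apart.
    """
    records = {}
    for row in rows:
        key = row.get("ck", NO_CLUSTERING_KEY)
        records.setdefault(key, []).append(row)
    # Flush with unique event IDs: the timestamp plus an index suffix.
    return [
        {"eventID": f"{timestamp}-{i}", "rows": grouped}
        for i, grouped in enumerate(records.values())
    ]

rows = [{"ck": "a", "op": "PUT"}, {"ck": "b", "op": "PUT"},
        {"ck": "a", "op": "DELETE"}]
out = group_cdc_rows(rows, "ts1")
assert len(out) == 2                      # two distinct items, not one
assert out[0]["eventID"] != out[1]["eventID"]
```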

Fixes #28439
Fixes: SCYLLADB-540

Closes scylladb/scylladb#28452

* github.com:scylladb/scylladb:
  alternator/test: explain why 'always' write isolation mode is used in tests
  alternator/test: add scylla_only to always write isolation fixture
  alternator: fix BatchWriteItem squashed Streams entries
  alternator: add BatchWriteItem test (failing)
2026-04-07 17:49:23 +03:00
Nadav Har'El
22e7ef46a7 Merge 'vector_search: fix SELECT on local vector index' from Karol Nowacki
Queries against local vector indexes were failing with the error:
```ANN ordering by vector requires the column to be indexed using 'vector_index'```

This was a regression introduced by 15788c3734, which incorrectly
assumed the first column in the targets list is always the vector column.
For local vector indexes, the first column is the partition key, causing
the failure.

Previously, serialization logic for the target index option was shared
between vector and secondary indexes. This is no longer viable due to
the introduction of local vector indexes and vector indexes with filtering
columns, which have a different target format.

This commit introduces a dedicated JSON-based serialization format for
vector index targets, identifying the target column (tc), filtering
columns (fc), and partition key columns (pk). This ensures unambiguous
serialization and deserialization for all vector index types.

This change is backward compatible for regular vector indexes. However,
it breaks compatibility for local vector indexes and vector indexes with
filtering columns created in version 2026.1.0. To mitigate this, usage
of these specific index types will be blocked in the 2026.1.0 release
by failing ANN queries against them in vector-store service.

Fixes: SCYLLADB-895

Backport to 2026.1 is required as this issue occurs also on this branch.

Closes scylladb/scylladb#28862

* github.com:scylladb/scylladb:
  index: fix DESC INDEX for vector index
  vector_search: test: refactor boilerplate setup
  vector_search: fix SELECT on local vector index
  index: test: vector index target option serialization test
  index: test: secondary index target option serialization test
2026-04-07 17:43:35 +03:00
Michał Jadwiszczak
9cf94116c2 db/view/view_building_worker: fix indentation 2026-04-07 16:12:04 +02:00
Michał Jadwiszczak
c9aa5bb09c db/view/view_building_worker: lock staging sstables mutex for necessary shards when creating tasks
To create `process_staging` view building tasks, we first need to
collect information about them on shard 0, create the necessary mutations,
commit them to group0 and move the staging sstable objects to their
original shards.

But there is a possible race after committing the group0 command
and before moving the staging sstables to their shards.
Between those two events, the coordinator may schedule freshly created
tasks and dispatch them to the worker, but the worker won't have the
sstable objects because they weren't moved yet.

This patch fixes the race by holding `_staging_sstables_mutex` locks
from the necessary shards when executing `create_staging_sstable_tasks()`.
With this, even if the task is scheduled and dispatched quickly,
the worker will wait to execute it until the sstable objects are
moved and the locks are released.

Fixes SCYLLADB-816
2026-04-07 16:11:45 +02:00
Pavel Emelyanov
58e59e8c0d Merge 'test: add test_sstable_clone_preserves_staging_state' from Benny Halevy
Add a test that verifies filesystem_storage::clone preserves the sstable
state: an sstable in staging is cloned to a new generation, the clone is
re-loaded from the staging directory, and its state is asserted to still
be staging.

The change proves that https://scylladb.atlassian.net/browse/SCYLLADB-1205
is invalid, and can be closed.

* No functional change and no backport needed

Closes scylladb/scylladb#29209

* github.com:scylladb/scylladb:
  test: add test_sstable_clone_preserves_staging_state
  test: derive sstable state from directory in test_env::make_sstable
  sstables: log debug message in filesystem_storage::clone
2026-04-07 17:02:04 +03:00
Botond Dénes
816f2bf163 Merge 'cql3: fix null handling in data_value formatting' from Dario Mirovic
`data_value::to_parsable_string()` crashes with a null pointer dereference when called on a `null` data_value. Return `"null"` instead.

Added tests after the fix. Manually checked that tests fail without the fix.

Fixes SCYLLADB-1350

This fix prevents a crash during formatting. There is no known occurrence in production, but a backport is desirable.
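The shape of the fix can be modeled in Python (the real code is C++; the function and quoting rules here are illustrative):

```python
def to_parsable_string(value):
    """Stand-in for data_value::to_parsable_string(): before the fix, a
    null value led to a null pointer dereference; the fix handles null
    explicitly and renders it as the CQL literal "null"."""
    if value is None:
        return "null"
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"  # CQL string quoting
    return str(value)

assert to_parsable_string(None) == "null"
assert to_parsable_string(42) == "42"
assert to_parsable_string("it's") == "'it''s'"
```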

Closes scylladb/scylladb#29262

* github.com:scylladb/scylladb:
  test: boost: test null data value to_parsable_string
  cql3: fix null handling in data_value formatting
2026-04-07 16:35:31 +03:00
Dimitrios Symonidis
701808d7aa test/object_store: parametrize test_basic over replication factor
Extend test_basic to run with both RF=1 and RF=3 to verify that
object storage works correctly with multiple replicas. The test now
starts one server per replica (each on its own rack), flushes all
nodes, validates tablet replica counts for RF>1, and restarts all
servers before verifying data is still readable.

Fixes: SCYLLADB-546

Closes scylladb/scylladb#28583
2026-04-07 16:27:44 +03:00
Nadav Har'El
f642db0693 test/alternator: tests for missing support of ReturnConsumedCapacity
As noted in issue #5027 and issue #29138, Alternator's support for
ReturnConsumedCapacity is lacking in two areas:

1. While ReturnConsumedCapacity is supported for most relevant
   operations, it's not supported in two operations: Query and Scan.

2. While ReturnConsumedCapacity=TOTAL is supported, INDEXES is not
   supported at all.

This patch adds extensive tests for all these cases. All these tests
pass on DynamoDB but fail on Alternator, so are marked with "xfail".

The tests for ReturnConsumedCapacity=INDEXES are deliberately split
into two: First, we test the case where the table has no indexes, so
INDEXES is almost the same as TOTAL and should be very easy to
implement. A second test checks the cases where there are indexes,
and different operations increment the capacity of the base table
and/or indexes differently - it will require significantly more work
to make the second test pass.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#29188
2026-04-07 16:07:41 +03:00
Nadav Har'El
f590ee2b7e cdc, vector: fix CDC result tracker for vector indexes
When a table has a vector index, cdc::cdc_enabled() returns true because
vector index writes are implemented via the CDC augmentation path. However,
register_cdc_operation_result_tracker() was checking only
cdc_options().enabled(), which is false for tables that have a vector index
but not traditional CDC.

As a result, the operation_result_tracker was never attached to write
response handlers for vector-indexed tables. This tracker was added in
commit 1b92cbe, and its job is to update CDC operation metrics.
Since vector search really does use CDC under the hood, these
metrics can be useful when diagnosing problems.

Fix by using cdc::cdc_enabled() instead of cdc_options().enabled(), which
covers both traditional CDC and vector-indexed tables.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#29343
2026-04-07 15:54:51 +03:00
Avi Kivity
8c629d55b0 test: vector_search: check [[nodiscard]] return values of expected<> types
Clang 22 verifies [[nodiscard]] for co_await,
causing compilation failures where return values of expected<> were
silently discarded.

These call sites were discarding the return value of client::request()
and vector_store_client::ann(), both of which return expected<> types
marked [[nodiscard]]. Rather than suppressing the warning with (void)
casts, properly check the return values using the established test
patterns: BOOST_CHECK(result) where the call is expected to succeed,
and BOOST_CHECK(!result) where the call is expected to fail.

Closes scylladb/scylladb#29297
2026-04-07 15:25:08 +03:00
Anna Stuchlik
176f6fb59e doc: add the 2026.x patch release upgrade guide-from-2025
This patch adds the upgrade guide for all patch releases within the 2026.x major release.

In addition, it fixes the link to Upgrade Policy in the 2025.x-to-2026.1 upgrade guide.

Fixes SCYLLADB-1247

Closes scylladb/scylladb#29307
2026-04-07 13:52:16 +02:00
Anna Stuchlik
d329c91f9e doc: remove About Upgrade and redirect to Upgrade Policy
While fixing https://github.com/scylladb/scylladb/issues/28997, we added a new page about upgrade policy:
https://docs.scylladb.com/stable/versioning/upgrade-policy.html

This commit removes the old page and adds redirections to the new Upgrade Policy page
in the unversioned documentation set.

Closes scylladb/scylladb#29251
2026-04-07 13:44:10 +02:00
Andrei Chekun
93583bf193 test.py: use safe_drive_shutdown in the tests
These methods for closing the driver were missed during the original fix.

Fixes: SCYLLADB-900

Closes scylladb/scylladb#29093
2026-04-07 14:35:18 +03:00
Avi Kivity
00409b61f1 Merge 'Add Vnodes to Tablets Migration Procedure' from Nikos Dragazis
This PR introduces the vnodes-to-tablets migration procedure, which enables converting an existing vnode-based keyspace to tablets.

The migration is implemented as a manual, operator-driven process executed in several stages. The core idea is to first create tablet maps with the same token boundaries and replica hosts as the vnodes, and then incrementally convert the storage of each node to the tablets layout. At a high level, the procedure is the following:
1. Create tablet maps for all tables in the keyspace.
2. Sequentially upgrade all nodes from vnodes to tablets:
    1. Mark a node for upgrade in the topology state.
    2. Restart the node. During startup, while the node is offline, it reshards the SSTables on vnode boundaries and switches to a tablet ERM.
    3. Wait for the node to return online before proceeding to the next node.
3. Finalize the migration:
    1. Update the keyspace schema to mark it as tablet-based.
    2. Clear the group0 state related to the migration.

From the client's perspective, the migration is online; the cluster can still serve requests on that keyspace, although performance may be temporarily degraded.

During the migration, some nodes use vnode ERMs while others use tablet ERMs. Cluster-level algorithms such as load balancing will treat the keyspace's tables as vnode-based. Once migration is finalized, the keyspace is permanently switched to tablets and cannot be reverted back to vnodes. However, a rollback procedure is available before finalization.

The patch series consists of:
* Load balancer adjustments to ignore tablets belonging to a migrating keyspace.
* A new vnode-based resharding mode, where SSTables are segregated on vnode boundaries rather than with the static sharder.
* A new per-node `intended_storage_mode` column in `system.topology`, representing the migration intent (whether migration should occur on restart) and direction.
* Four new REST endpoints for driving the migration (start, node upgrade/downgrade, finalize, status), along with `nodetool` wrappers. The finalization is implemented as a global topology request.
* Wiring of the migration process into the startup logic: the `distributed_loader` determines a migrating table's ERM flavor from the `intended_storage_mode` and the ERM flavor determines the `table_populator`'s resharding mode. Token metadata changes have been adjusted to preserve the ERM flavor.
* Cluster tests for the migration process.

Fixes SCYLLADB-722.
Fixes SCYLLADB-723.
Fixes SCYLLADB-725.
Fixes SCYLLADB-779.
Fixes SCYLLADB-948.

New feature, no backport is needed.

Closes scylladb/scylladb#29065

* github.com:scylladb/scylladb:
  docs: Add ops guide for vnodes-to-tablets migration
  test: cluster: Add test for migration of multiple keyspaces
  test: cluster: Add test for error conditions
  test: cluster: Add vnodes->tablets migration test (rollback)
  test: cluster: Add vnodes->tablets migration test (1 table, 3 nodes)
  test: cluster: Add vnodes->tablets migration test (1 table, 1 node)
  scylla-nodetool: Add migrate-to-tablets subcommand
  api: Add REST endpoint for vnode-to-tablet migration status
  api: Add REST endpoint for migration finalization
  topology_coordinator: Add `finalize_migration` request
  database: Construct migrating tables with tablet ERMs
  api: Add REST endpoint for upgrading nodes to tablets
  api: Add REST endpoint for starting vnodes-to-tablets migration
  topology_state_machine: Add intended_storage_mode to system.topology
  distributed_loader: Wire vnode-based resharding into table populator
  replica: Pick any compaction group for resharding
  compaction: resharding_compaction: add vnodes_resharding option
  storage_service: Preserve ERM flavor of migrating tables
  tablet_allocator: Exclude migrating tables from load balancing
  feature_service: Add vnodes_to_tablets_migrations feature
2026-04-07 14:32:22 +03:00
Łukasz Paszkowski
6f364fd3b7 db: fix system.size_estimates to aggregate sstable estimates across all shards
The estimate() function in the size_estimates virtual reader only
considered sstables local to the shard that happened to own the
keyspace's partition key token. Since sstables are distributed across
shards, this caused partition count estimates to be approximately
1/smp_count of the actual value.

This bug has been present since the virtual reader was introduced in
225648780d.

Use db.container().map_reduce0() to aggregate sstable estimates
across all shards. Each shard contributes its local count and
estimated_histogram, which are then merged to produce the correct
total.
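The aggregation can be sketched in Python (a model of the map_reduce0 merge, not the actual C++ code; the shard data is made up):

```python
def merge_estimates(per_shard):
    """Merge per-shard (partition_count, histogram) pairs.

    Models the cross-shard aggregation: before the fix, only one shard's
    local estimate was reported, roughly 1/smp_count of the real value.
    """
    total_count = 0
    total_hist = []
    for count, hist in per_shard:
        total_count += count
        # Grow the merged histogram if this shard has more buckets.
        if len(hist) > len(total_hist):
            total_hist.extend([0] * (len(hist) - len(total_hist)))
        for i, bucket in enumerate(hist):
            total_hist[i] += bucket
    return total_count, total_hist

# Two shards, each holding part of the keyspace's sstables.
shards = [(100, [5, 3]), (140, [2, 4, 1])]
count, hist = merge_estimates(shards)
assert count == 240          # not just one shard's 100 or 140
assert hist == [7, 7, 1]
```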

Also fix the `test_partitions_estimate_full_overlap` test which becomes
flaky (xpassing ~1% of runs) because autocompaction could merge the
two overlapping sstables before the size estimate was read. Wrap the
test body in nodetool.no_autocompaction_context to prevent this race.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1179
Refs https://github.com/scylladb/scylladb/issues/9083

Closes scylladb/scylladb#29286
2026-04-07 14:13:26 +03:00
Piotr Smaron
7d449a307c docs: remove old audit design doc
As discussed with @ScyllaPiotr in
https://github.com/scylladb/scylladb/pull/29232, the doc about to be
removed is just:
> Looking at history, I think this audit.md is a design doc: scylladb/scylla-enterprise@87a5c19, for which the feature has been implemented differently, eventually, and was created around the time when design docs, apparently, were stored within the repository itself. So for me it's some trash (sorry for strong language) that can be safely removed.

Closes scylladb/scylladb#29316
2026-04-07 14:11:53 +03:00
Avi Kivity
8b4a91982b cmake: add missing rolling_max_tracker_test and symmetric_key_test
Added in 5b2a07b408 and c596ae6eb1 without cmake integration.

Closes scylladb/scylladb#29328
2026-04-07 14:09:00 +03:00
Avi Kivity
d01c9a425f test: test_out_of_storage_prevention: fix invalid escape in regex
Python warns that the sequence "\(" is an invalid escape and
might be rejected in the future. Protect against that by using
a raw string.
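For illustration, the difference between the two spellings in Python:

```python
import re

# In a normal string literal, a backslash before "(" is an invalid
# escape sequence: Python emits a SyntaxWarning and may reject it
# outright in a future release. A raw string sidesteps string-level
# escaping entirely, so the backslash reaches the regex engine
# unchanged, which is what the pattern intends.
pattern = re.compile(r"\(foo\)")        # safe: raw string
assert pattern.match("(foo)")
assert pattern.match("foo") is None

# The two spellings denote the same characters; only the literal differs.
assert r"\(foo\)" == "\\(foo\\)"
```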

Closes scylladb/scylladb#29334
2026-04-07 14:06:32 +03:00
Pavel Emelyanov
0ae781c008 Merge 'test: auth_test: coroutinize' from Avi Kivity
Convert auth_test.cc to coroutines for improved readability. Each test is converted in its own commit. Some
are trivial.

Indentation is left broken in some commits to reduce the diff, then fixed up in the last commit.

Code cleanup, so no backport.

Closes scylladb/scylladb#29336

* github.com:scylladb/scylladb:
  auth_test: fix whitespace
  auth_test: coroutinize test_try_describe_schema_with_internals_and_passwords_as_anonymous_user
  auth_test: coroutinize test_try_login_after_creating_roles_with_hashed_password
  auth_test: coroutinize test_create_roles_with_hashed_password_and_log_in
  auth_test: coroutinize test_try_create_role_with_hashed_password_as_anonymous_user
  auth_test: coroutinize test_try_to_create_role_with_password_and_hashed_password
  auth_test: coroutinize test_try_to_create_role_with_hashed_password_and_password
  auth_test: coroutinize test_alter_with_workload_type
  auth_test: coroutinize test_alter_with_timeouts
  auth_test: coroutinize role_permissions_table_is_protected
  auth_test: coroutinize role_members_table_is_protected
  auth_test: coroutinize roles_table_is_protected
  auth_test: coroutinize test_password_authenticator_operations
  auth_test: coroutinize test_password_authenticator_attributes
  auth_test: coroutinize test_default_authenticator
2026-04-07 14:05:32 +03:00
Botond Dénes
513af59130 encryption: improve error message when KMS host is not configured
When an SSTable was encrypted with a KMS host that is not present in
scylla.yaml, the error thrown was:

  std::invalid_argument (No such host: <host-name>)

This message is very obscure in general, and especially confusing when
encountered while using the scylla-sstable tool: it gives no indication
that the SSTable is encrypted, that a KMS host lookup is involved, or
what the user needs to do to fix the problem.

Replace it with a message that names the missing host and points
directly to the relevant scylla.yaml section:

  Encryption host "<host-name>" is not defined in scylla.yaml.
  Make sure it is listed under the "kmip_hosts" section.

The wording is intentionally kept neutral (not framed as an SSTable tool
problem) because the same code path is exercised by production ScyllaDB
when a node's configuration no longer contains a host referenced by an
existing data file (e.g. after a config rollback or when restoring data
from a different cluster). The production use-case takes precedence, but
the message is equally actionable from the tool.

Closes scylladb/scylladb#29228
2026-04-07 14:00:27 +03:00
Botond Dénes
7344c05494 scylla-gdb.py: fix small_vector.__len__()
start - end will result in negative length, rejected by the python
runtime. Use the correct end - start to calculate length.
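A minimal illustration of why the bug surfaces as a runtime error rather than a wrong number (the class is a toy model of the pretty-printer, not scylla-gdb.py itself):

```python
class SmallVectorView:
    """Toy model of the gdb pretty-printer: __len__ computed the span
    as start - end, which is negative, and Python's len() rejects any
    negative result with "ValueError: __len__() should return >= 0"."""
    def __init__(self, start, end):
        self.start, self.end = start, end

    def __len__(self):
        # Buggy version: return self.start - self.end  -> negative
        return self.end - self.start  # correct: end - start

v = SmallVectorView(start=100, end=104)
assert len(v) == 4
```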

Closes scylladb/scylladb#29249
2026-04-07 13:57:21 +03:00
Botond Dénes
f71d2e78d8 tombstone_gc: don't use real-db for validation and determining default
data_dictionary::database was converted to replica::database in two
places, just to call find_keyspace(), then call
get_replication_strategy() on the returned keyspace. This is not
necessary, data_dictionary::database already has find_keyspace() and the
returned data_dictionary::keyspace also has get_replication_strategy().

This patch removes a small layering violation but more importantly, it
is necessary for the sstable tool to be able to load schemas from disk,
when said schema has tombstone_gc props.

Closes scylladb/scylladb#29279
2026-04-07 13:56:24 +03:00
Pavel Emelyanov
d6df5ef60a Merge 'compaction_test: Make compaction tests backend‑agnostic and add S3/GCS support' from Ernest Zaslavsky
This series updates the storage abstraction and extends the compaction tests to support object‑storage backends (S3 and GCS), while tightening several parts of the test environment.

The changes include:

- New exists/object_exists helpers across storage backends and clock fixes in the S3 client to make signature generation stable under test conditions.

- A new get_storage_for_tests accessor and adjustments to the test environment to avoid premature teardown of the sstable registry.

- Refactoring of compaction tests to remove direct sstable access, ensure proper schema setup, and avoid use of moved‑from objects.

- Extraction of test_env‑based logic into reusable functions and addition of S3/GCS variants of the compaction tests.

Not all tests were converted to be backend‑agnostic yet, and a few require further investigation before they can run cleanly against S3/GCS backends. These will be addressed in follow‑up work.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-704, however a follow-up is needed

No backport needed since this change targets a future feature

Closes scylladb/scylladb#28790

* github.com:scylladb/scylladb:
  compaction_test: fix formatting after previous patches
  compaction_test: add S3/GCS variations to tests
  compaction_test: extract test_env-based tests into functions
  compaction_test: replace file_exists with storage::exists
  compaction_test: initialize tables with schema via make_table_for_tests
  compaction_test: use sstable APIs to manipulate component files
  compaction_test: fix use-after-move issue
  sstable_utils: add `get_storage` and `open_file` helpers
  test_env: delay unplugging sstable registry
  storage: add `exists` method to storage abstraction
  s3_client: use lowres_system_clock for aws_sigv4
  s3_client: add `object_exists` helper
  gcs_client: add `object_exists` helper
2026-04-07 13:53:48 +03:00
Piotr Dulikowski
4161273b4c Merge 'view_building_worker: fix race during draining procedure' from Michał Jadwiszczak
View building worker was breaking semaphores without holding their locks.
This led to races like SCYLLADB-844 and SCYLLADB-543,
where a new batch was started after `view_building_worker::state` was cleared in the `drain()` process.

This patch fixes the race by:
- taking a lock on the mutex before breaking it
- distinguishing between `state::clear()` (can happen multiple times) and `state::drain()` (can be called only once during shutdown)
- asserting that the state is not doing any new work after it was drained

Fixes SCYLLADB-844
Fixes SCYLLADB-543

This PR should be backported to all versions containing view building coordinator (2025.4 and newer).

Closes scylladb/scylladb#29303

* github.com:scylladb/scylladb:
  view_building_worker: extract starting a new batch to state's method
  view_building_worker: distinguish between state's `clear()` and `drain()`
  view_building_worker: lock mutexes before breaking them in `drain()`
  view_building_worker: execute drain() once
2026-04-07 12:13:51 +02:00
Avi Kivity
bc10e1a171 test: fix flaky test_login by not retrying authentication failures
The fix for SCYLLADB-1373 (b4f652b7c1) changed get_session() to use
the default timeout=30 for the retry loop in patient_*_cql_connection
(previously timeout=0.1). This correctly allowed retrying transient
NoHostAvailable errors during node startup, but introduced a new
flakiness in test_login and other auth tests.

The failure chain:

1. test_login connects with bad credentials (e.g. user="doesntexist")
2. get_session() calls patient_exclusive_cql_connection(), which calls
   retry_till_success() with bypassed_exception=NoHostAvailable
3. The first attempt correctly fails: the server rejects the credentials
   with AuthenticationFailed, wrapped in NoHostAvailable
4. retry_till_success() catches NoHostAvailable indiscriminately and
   retries, not distinguishing between transient errors (node not ready)
   and permanent errors (bad credentials)
5. A subsequent retry attempt times out (connect_timeout=5), producing
   OperationTimedOut wrapped in NoHostAvailable
6. After 30 seconds, the last NoHostAvailable is raised -- now wrapping
   OperationTimedOut instead of the original AuthenticationFailed
7. The assertion `isinstance(..., AuthenticationFailed)` fails

With the old timeout=0.1, the deadline was already exceeded after the
first attempt, so the original AuthenticationFailed propagated.

Fix: Add a `should_retry` predicate parameter to retry_till_success()
and use it in patient_cql_connection() and
patient_exclusive_cql_connection() to immediately re-raise
NoHostAvailable when it wraps AuthenticationFailed. Retrying
authentication failures is never useful since the credentials won't
change between attempts.
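A sketch of the predicate approach (simplified stand-ins for the dtest helpers and driver exceptions, not the actual test-infrastructure code):

```python
import time

def retry_till_success(fn, *, timeout=30.0, bypassed_exception=Exception,
                       should_retry=lambda exc: True, interval=0.0):
    """Retry fn until it succeeds or the deadline passes.

    should_retry lets callers re-raise permanent failures (e.g. bad
    credentials) immediately instead of burning the whole timeout.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            return fn()
        except bypassed_exception as exc:
            if not should_retry(exc) or time.monotonic() >= deadline:
                raise
            time.sleep(interval)

class NoHostAvailable(Exception):
    def __init__(self, errors):
        self.errors = errors  # host -> underlying error

class AuthenticationFailed(Exception):
    pass

def is_transient(exc):
    # Retrying bad credentials is never useful: they won't change.
    return not any(isinstance(e, AuthenticationFailed)
                   for e in exc.errors.values())

def bad_login():
    raise NoHostAvailable({"127.0.0.1": AuthenticationFailed("bad user")})

try:
    retry_till_success(bad_login, timeout=30.0,
                       bypassed_exception=NoHostAvailable,
                       should_retry=is_transient)
except NoHostAvailable as exc:
    failure = exc
# The original AuthenticationFailed is preserved, with no 30s retry loop.
assert isinstance(next(iter(failure.errors.values())), AuthenticationFailed)
```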

Fixes: SCYLLADB-1382

Closes scylladb/scylladb#29348
2026-04-07 10:17:31 +03:00
Michał Jadwiszczak
51c164c8d2 view_building_worker: extract starting a new batch to state's method
Following the previous commit, a new batch cannot be started if the
state was already drained.
This commit also adds a check that only one batch is running at a time.
2026-04-07 08:39:05 +02:00
Michał Jadwiszczak
639aa223f3 view_building_worker: distinguish between state's clear() and drain()
While both of these methods do the same thing (abort the current batch, clear
data), we can clear the state multiple times during the view_building_worker
lifetime (for instance when the processed base table changes), but
`view_building_worker::state::drain()` should be called only once, and
after that no other work on the state should be done.
2026-04-07 08:39:05 +02:00
Michał Jadwiszczak
7aea524f52 view_building_worker: lock mutexes before breaking them in drain()
Not doing this may lead to races like SCYLLADB-844.
If some consumer is holding a lock on a mutex and `drain()`
just breaks the mutex without locking it beforehand,
then the consumer may keep running code that should have been aborted.

An example of the race is SCYLLADB-844, where `work_on_tasks()` is
holding `_state._mutex` while it is broken by `drain()`.
This allows a new batch to be started after the `_state` is cleared.
2026-04-07 08:39:00 +02:00
Michał Jadwiszczak
91c7ac1fb2 view_building_worker: execute drain() once
Future changes will require that the view building worker is drained
only once per its lifetime.
2026-04-07 08:35:02 +02:00
Avi Kivity
b4f652b7c1 test: fix flaky test_create_ks_auth by removing bad retry timeout
get_session() was passing timeout=0.1 to patient_exclusive_cql_connection
and patient_cql_connection, leaving only 0.1 seconds for the retry loop
in retry_till_success(). Since each connection attempt can take up to 5
seconds (connect_timeout=5), the retry loop effectively got only one
attempt with no chance to retry on transient NoHostAvailable errors.

Use the default timeout=30 seconds, consistent with all other callers.

Fixes: SCYLLADB-1373

Closes scylladb/scylladb#29332
2026-04-05 19:13:15 +03:00
Avi Kivity
2f0d178510 auth_test: fix whitespace
Fix over-indented lines inside do_with_mc lambda bodies introduced
during coroutinization.
2026-04-05 18:28:23 +03:00
Avi Kivity
7a24da9e88 auth_test: coroutinize test_try_describe_schema_with_internals_and_passwords_as_anonymous_user
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
e1b52cf337 auth_test: coroutinize test_try_login_after_creating_roles_with_hashed_password
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
24d36ad459 auth_test: coroutinize test_create_roles_with_hashed_password_and_log_in
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
6f20129eec auth_test: coroutinize test_try_create_role_with_hashed_password_as_anonymous_user
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
cece181113 auth_test: coroutinize test_try_to_create_role_with_password_and_hashed_password
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
752391f757 auth_test: coroutinize test_try_to_create_role_with_hashed_password_and_password
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
287625b297 auth_test: coroutinize test_alter_with_workload_type
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
4eeb5ef54d auth_test: coroutinize test_alter_with_timeouts
Use co_await instead of return for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
170c71b25d auth_test: coroutinize role_permissions_table_is_protected
Use co_await for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
13eccf519f auth_test: coroutinize role_members_table_is_protected
Use co_await for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
43ff3798ad auth_test: coroutinize roles_table_is_protected
Use co_await for improved readability.
2026-04-05 18:26:30 +03:00
Avi Kivity
c586eeb003 auth_test: coroutinize test_password_authenticator_operations
Flatten continuation chains (.then()) into linear thread-style code
with .get() calls for improved readability. Remove the now-unused
require_throws helper template.
2026-04-05 18:26:25 +03:00
Avi Kivity
fbccfe5c9d auth_test: coroutinize test_password_authenticator_attributes
Use co_await instead of return+do_with_cql_env+make_ready_future
for improved readability.
2026-04-05 17:28:09 +03:00
Avi Kivity
e3dee64003 auth_test: coroutinize test_default_authenticator
Use co_await instead of return+do_with_cql_env+make_ready_future
for improved readability.
2026-04-05 17:27:45 +03:00
Jenkins Promoter
ab4a2cdde2 Update pgo profiles - aarch64 2026-04-05 16:58:02 +03:00
Jenkins Promoter
b97cf0083c Update pgo profiles - x86_64 2026-04-05 16:00:15 +03:00
Nikos Dragazis
6d50e67bd2 scylla_swap_setup: Remove Before=swap.target dependency from swap unit
When a Scylla node starts, the scylla-image-setup.service invokes the
`scylla_swap_setup` script to provision swap. This script allocates a
swap file and creates a swap systemd unit to delegate control to
systemd. By default, systemd injects a Before=swap.target dependency
into every swap unit, allowing other services to use swap.target to wait
for swap to be enabled.

On Azure, this doesn't work so well because we store the swap file on
the ephemeral disk [1] which has network dependencies (`_netdev` mount
option, configured by cloud-init [2]). This makes the swap.target
indirectly depend on the network, leading to dependency cycles such as:

swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target
-> network.target -> systemd-resolved.service -> tmp.mount -> swap.target

This patch breaks the cycle by removing the swap unit from swap.target
using DefaultDependencies=no. The swap unit will still be activated via
WantedBy=multi-user.target, just not during early boot.

Although this problem is specific to Azure, this patch applies the fix
to all clouds to keep the code simple.
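The resulting unit would look roughly like this (a hedged sketch; the actual unit file generated by `scylla_swap_setup` may differ in paths and options):

```ini
# mnt-swapfile.swap (illustrative path)
[Unit]
Description=Swap file provisioned by scylla_swap_setup
# Opt out of the implicit Before=swap.target dependency that systemd
# injects into swap units by default, breaking the boot-time cycle.
DefaultDependencies=no

[Swap]
What=/mnt/swapfile

[Install]
# Still activated at normal boot, just not during early boot via swap.target.
WantedBy=multi-user.target
```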

Fixes #26519.
Fixes SCYLLADB-1257

[1] https://github.com/scylladb/scylla-machine-image/pull/426
[2] https://github.com/canonical/cloud-init/pull/1213#issuecomment-1026065501

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#28504
2026-04-05 15:07:50 +03:00
Tomasz Grabiec
74542be5aa test: pylib: Ignore exceptions in wait_for()
ManagerClient::get_ready_cql() calls server_sees_others(), which waits
for servers to see each other as alive in gossip. If one of the
servers is still early in boot, a RESTful API call to
"gossiper/endpoint/live" may fail. The resulting exception currently
terminates wait_for() and propagates up, failing the test.

Fix this by ignoring errors when polling inside wait_for(). In case of
timeout, we log the last exception. This fixes the problem not
only in this case, but for all uses of wait_for().

Example output:

```
pred = <function ManagerClient.server_sees_others.<locals>._sees_min_others at 0x7f022af9a140>
deadline = 1775218828.9172852, period = 1.0, before_retry = None
backoff_factor = 1.5, max_period = 1.0, label = None

    async def wait_for(
            pred: Callable[[], Awaitable[Optional[T]]],
            deadline: float,
            period: float = 0.1,
            before_retry: Optional[Callable[[], Any]] = None,
            backoff_factor: float = 1.5,
            max_period: float = 1.0,
            label: Optional[str] = None) -> T:
        tag = label or getattr(pred, '__name__', 'unlabeled')
        start = time.time()
        retries = 0
        last_exception: Exception | None = None
        while True:
            elapsed = time.time() - start
            if time.time() >= deadline:
                timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
                if last_exception is not None:
                    timeout_msg += (
                        f"; last exception: {type(last_exception).__name__}: {last_exception}"
                    )
                    raise AssertionError(timeout_msg) from last_exception
                raise AssertionError(timeout_msg)

            try:
>               res = await pred()

test/pylib/util.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    async def _sees_min_others():
>       raise Exception("asd")
E       Exception: asd

test/pylib/manager_client.py:802: Exception

The above exception was the direct cause of the following exception:

manager = <test.pylib.manager_client.ManagerClient object at 0x7f022af7e7b0>

    @pytest.mark.asyncio
    async def test_auth_after_reset(manager: ManagerClient) -> None:
        servers = await manager.servers_add(3, config=auth_config, auto_rack_dc="dc1")
>       cql, _ = await manager.get_ready_cql(servers)

test/cluster/auth_cluster/test_auth_after_reset.py:33:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/pylib/manager_client.py:137: in get_ready_cql
    await self.servers_see_each_other(servers)
test/pylib/manager_client.py:820: in servers_see_each_other
    await asyncio.gather(*others)
test/pylib/manager_client.py:806: in server_sees_others
    await wait_for(_sees_min_others, time() + interval, period=.5)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

pred = <function ManagerClient.server_sees_others.<locals>._sees_min_others at 0x7f022af9a140>
deadline = 1775218828.9172852, period = 1.0, before_retry = None
backoff_factor = 1.5, max_period = 1.0, label = None

    async def wait_for(
            pred: Callable[[], Awaitable[Optional[T]]],
            deadline: float,
            period: float = 0.1,
            before_retry: Optional[Callable[[], Any]] = None,
            backoff_factor: float = 1.5,
            max_period: float = 1.0,
            label: Optional[str] = None) -> T:
        tag = label or getattr(pred, '__name__', 'unlabeled')
        start = time.time()
        retries = 0
        last_exception: Exception | None = None
        while True:
            elapsed = time.time() - start
            if time.time() >= deadline:
                timeout_msg = f"wait_for({tag}) timed out after {elapsed:.2f}s ({retries} retries)"
                if last_exception is not None:
                    timeout_msg += (
                        f"; last exception: {type(last_exception).__name__}: {last_exception}"
                    )
>                   raise AssertionError(timeout_msg) from last_exception
E                   AssertionError: wait_for(_sees_min_others) timed out after 45.30s (46 retries); last exception: Exception: asd

test/pylib/util.py:76: AssertionError
```

Fixes a failure observed in test_auth_after_reset:

```
manager = <test.pylib.manager_client.ManagerClient object at 0x7fb3740e1630>

    @pytest.mark.asyncio
    async def test_auth_after_reset(manager: ManagerClient) -> None:
        servers = await manager.servers_add(3, config=auth_config, auto_rack_dc="dc1")
        cql, _ = await manager.get_ready_cql(servers)
        await cql.run_async("ALTER ROLE cassandra WITH PASSWORD = 'forgotten_pwd'")

        logging.info("Stopping cluster")
        await asyncio.gather(*[manager.server_stop_gracefully(server.server_id) for server in servers])

        logging.info("Deleting sstables")
        for table in ["roles", "role_members", "role_attributes", "role_permissions"]:
            await asyncio.gather(*[manager.server_wipe_sstables(server.server_id, "system", table) for server in servers])

        logging.info("Starting cluster")
        # Don't try connect to the servers yet, with deleted superuser it will be possible only after
        # quorum is reached.
        await asyncio.gather(*[manager.server_start(server.server_id, connect_driver=False) for server in servers])

        logging.info("Waiting for CQL connection")
        await repeat_until_success(lambda: manager.driver_connect(auth_provider=PlainTextAuthProvider(username="cassandra", password="cassandra")))
>       await manager.get_ready_cql(servers)

test/cluster/auth_cluster/test_auth_after_reset.py:50:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/pylib/manager_client.py:137: in get_ready_cql
    await self.servers_see_each_other(servers)
test/pylib/manager_client.py:819: in servers_see_each_other
    await asyncio.gather(*others)
test/pylib/manager_client.py:805: in server_sees_others
    await wait_for(_sees_min_others, time() + interval, period=.5)
test/pylib/util.py:71: in wait_for
    res = await pred()
test/pylib/manager_client.py:802: in _sees_min_others
    alive_nodes = await self.api.get_alive_endpoints(server_ip)
test/pylib/rest_client.py:243: in get_alive_endpoints
    data = await self.client.get_json(f"/gossiper/endpoint/live", host=node_ip)
test/pylib/rest_client.py:99: in get_json
    ret = await self._fetch("GET", resource_uri, response_type = "json", host = host,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <test.pylib.rest_client.TCPRESTClient object at 0x7fb2404a0650>
method = 'GET', resource = '/gossiper/endpoint/live', response_type = 'json'
host = '127.15.252.8', port = 10000, params = None, json = None, timeout = None
allow_failed = False

    async def _fetch(self, method: str, resource: str, response_type: Optional[str] = None,
                     host: Optional[str] = None, port: Optional[int] = None,
                     params: Optional[Mapping[str, str]] = None,
                     json: Optional[Mapping] = None, timeout: Optional[float] = None, allow_failed: bool = False) -> Any:
        # Can raise exception. See https://docs.aiohttp.org/en/latest/web_exceptions.html
        assert method in ["GET", "POST", "PUT", "DELETE"], f"Invalid HTTP request method {method}"
        assert response_type is None or response_type in ["text", "json"], \
                f"Invalid response type requested {response_type} (expected 'text' or 'json')"
        # Build the URI
        port = port if port else self.default_port if hasattr(self, "default_port") else None
        port_str = f":{port}" if port else ""
        assert host is not None or hasattr(self, "default_host"), "_fetch: missing host for " \
                "{method} {resource}"
        host_str = host if host is not None else self.default_host
        uri = self.uri_scheme + "://" + host_str + port_str + resource
        logging.debug(f"RESTClient fetching {method} {uri}")

        client_timeout = ClientTimeout(total = timeout if timeout is not None else 300)
        async with request(method, uri,
                           connector = self.connector if hasattr(self, "connector") else None,
                           params = params, json = json, timeout = client_timeout) as resp:
            if allow_failed:
                return await resp.json()
            if resp.status != 200:
                text = await resp.text()
>               raise HTTPError(uri, resp.status, params, json, text)
E               test.pylib.rest_client.HTTPError: HTTP error 404, uri: http://127.15.252.8:10000/gossiper/endpoint/live, params: None, json: None, body:
E               {"message": "Not found", "code": 404}

test/pylib/rest_client.py:77: HTTPError
```

Fixes: SCYLLADB-1367

Closes scylladb/scylladb#29323
2026-04-05 13:52:26 +03:00
Ernest Zaslavsky
c7a74237b3 compaction_test: fix formatting after previous patches 2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
101b4ad7fa compaction_test: add S3/GCS variations to tests
Add S3 and GCS variants of the compaction tests to expand coverage for
keyspaces configured to use object_storage backends.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
03bd3010bf compaction_test: extract test_env-based tests into functions
Move all test code that relies on test_env into standalone free
functions so they can be reused by upcoming S3 and GCS test suites.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
b18528e97e compaction_test: replace file_exists with storage::exists
Replace direct filesystem checks (file_exists) with the
storage-agnostic exists() method in unsealed_sstable_compaction,
sstable_clone_leaving_unsealed_dest_sstable, and
failure_when_adding_new_sstable tests, making them compatible
with object-storage backends (S3, GCS).
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
98492e4ea8 compaction_test: initialize tables with schema via make_table_for_tests
Start using `table_for_tests::make_default_schema` so test tables are
created with a real schema. This is required for object-storage
backends, which cannot operate correctly without proper schema
initialization.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
5ba79e2ed4 compaction_test: use sstable APIs to manipulate component files
Switch tests to use sstable member functions for file manipulation
instead of opening files directly on the filesystem. This affects the
helpers that emulate sstable corruption: we now overwrite the entire
component file rather than just the first few kilobytes, which is
sufficient for producing a corrupted sstable.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
405c032f48 compaction_test: fix use-after-move issue
We were moving `compaction_type_options` inside a loop, so on the
second iteration the test received an already moved-from instance.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
437a581b04 sstable_utils: add get_storage and open_file helpers
Add a non-const `get_storage` accessor to expose underlying storage,
and an `open_file` helper to access sstable component files directly.
These are needed so compaction tests can read and write sstable
components.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
2ad2dbae03 test_env: delay unplugging sstable registry
Unplugging the mock sstable_registry happened too early in the test
environment. During sstable destruction, components may still need
access to the registry, so the unplugging is moved to a later stage.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
8f6630e9cd storage: add exists method to storage abstraction
Add an `exists` method to the storage abstraction to allow S3, GCS,
and local storage implementations to check whether an sstable
component is present.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
ba785f6cab s3_client: use lowres_system_clock for aws_sigv4
Switch aws_sigv4 to lowres_system_clock since it is not affected by
time offsets often introduced in tests, which can skew db_clock. S3
requests cannot represent time shifts greater than 15 minutes from
server time, so a stable clock is required.
2026-04-05 11:07:17 +03:00
Ernest Zaslavsky
e08d779922 s3_client: add object_exists helper
Introduce `object_exists` to the S3 client to check whether an object
exists. This is primarily useful for test scenarios.
2026-04-05 11:07:16 +03:00
Ernest Zaslavsky
016b344a8a gcs_client: add object_exists helper
Introduce `object_exists` to the GCS client to check whether an object
exists. This is primarily useful for test scenarios.
2026-04-05 11:07:16 +03:00
Andrzej Jackowski
8c0920202b test: protect populate_range in row_cache_test from bad_alloc
When test_exception_safety_of_update_from_memtable was converted from
manual fail_after()/catch to with_allocation_failures() in 74db08165d,
the populate_range() call ended up inside the failure injection scope
without a scoped_critical_alloc_section guard. The other two tests
converted in the same commit (test_exception_safety_of_transitioning...
and test_exception_safety_of_partition_scan) were correctly guarded.

Without the guard, the allocation failure injector can sometimes
target an allocation point inside the cleanup path of populate_range().
In a rare corner case, this triggers a bad_alloc in a noexcept context
(reader_concurrency_semaphore::stop()), causing std::terminate.

Fixes SCYLLADB-1346

Closes scylladb/scylladb#29321
2026-04-04 21:13:26 +03:00
Andrzej Jackowski
ec274cf7b6 test: add test_upgrade_preserves_ddl_audit_for_tables
Verify that upgrading from 2025.1 to master does not silently drop DDL
auditing for table-scoped audit configurations (SCYLLADB-1155).

Test time in dev: 4s

Refs: SCYLLADB-1155
Fixes: SCYLLADB-1305
2026-04-03 13:53:28 +02:00
Andrzej Jackowski
9c7b7ac3e3 test: audit: split validate helper so callers need not pass audit_settings
The old execute_and_validate_audit_entry required every caller to
pass audit_settings so it could decide internally whether to expect
an entry. A test added later in this series needs to simply assert
an entry was produced, without specifying audit_settings at all.

Split into two methods:
- execute_and_validate_new_audit_entry: unconditionally expects an
  audit entry.
- execute_and_validate_if_category_enabled: checks audit_settings
  to decide whether to expect an entry or assert absence.

Local wrapper functions and **kwargs forwarding are removed in favor
of explicit arguments at each call site, and expected-error cases are
handled inline with assert_invalid + assert_entries_were_added.
2026-04-03 13:52:47 +02:00
Andrzej Jackowski
189bff1d5c test: audit: declare manager attribute in AuditTester base class
AuditTester uses self.manager throughout but never declares it.
The attribute is only assigned in the CQLAuditTester subclass
__init__, so the type checker reports 'Attribute "manager" is
unknown' on every self.manager reference in the base class.

Add an __init__ to AuditTester that accepts and stores the manager
instance, and update CQLAuditTester to forward it via super().__init__
instead of assigning self.manager directly.
2026-04-03 13:52:47 +02:00
Botond Dénes
2c22d69793 Merge 'Pytest: fix variable handling in GSServer (mock) and ensure docker service logs go to test log as well' from Calle Wilund
Fixes: SCYLLADB-1106

* Small fix in scylla_cluster - remove debug print
* Fix GSServer::unpublish so it does not raise an exception if publish was not called beforehand
* Improve dockerized_server so mock server logs echo to the test log to help diagnose CI failures (because we don't collect log files from mocks etc, and in any case correlation will be much easier).

No backport needed.

Closes scylladb/scylladb#29112

* github.com:scylladb/scylladb:
  dockerized_service: Convert log reader to pipes and push to test log
  test::cluster::conftest::GSServer: Fix unpublish for when publish was not called
  scylla_cluster: Use thread safe future signalling
  scylla_cluster: Remove left-over debug printout
2026-04-03 06:38:05 +03:00
Raphael S. Carvalho
b6ebbbf036 test/cluster/test_tablets2: Fix test_split_stopped_on_shutdown race with stale log messages
The test was failing because the call to:

    await log.wait_for('Stopping.*ongoing compactions')

was missing the 'from_mark=log_mark' argument. The log mark was updated
(line: log_mark = await log.mark()) immediately after detecting
'splitting_mutation_writer_switch_wait: waiting', and just before
launching the shutdown task. However, the wait_for call on the following
line was scanning from the beginning of the log, not from that mark.

As a result, the search immediately matched old 'Stopping N tasks for N
ongoing compactions for table system.X due to table removal' messages
emitted during initial server bootstrap (for system.large_partitions,
system.large_rows, system.large_cells), rather than waiting for the
shutdown to actually stop the user-table split compaction.

This caused the test to prematurely send the message to the
'splitting_mutation_writer_switch_wait' injection. The split compaction
was unblocked before the shutdown had aborted it, so it completed
successfully. Since the split succeeded, 'Failed to complete splitting
of table' was never logged.

Meanwhile, 'storage_service_drain_wait' was blocking do_drain() waiting
for a message. With the split already done, the test was stuck waiting
for the expected failure log that would never come (600s timeout). At
the same time, after 60s the 'storage_service_drain_wait' injection
timed out internally, triggering on_internal_error() which -- with
--abort-on-internal-error=1 -- crashed the server (exit code -6).

Fix: pass from_mark=log_mark to the wait_for('Stopping.*ongoing
compactions') call so it only matches messages that appear after the
shutdown has started, ensuring the test correctly synchronizes with the
shutdown aborting the user-table split compaction before releasing the
injection.
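The mark/from_mark semantics can be illustrated with a tiny self-contained model (the `Log` class below is a hypothetical stand-in, not the real test-harness API):

```python
import re

class Log:
    # Minimal model of a log follower with mark-based scanning.
    def __init__(self):
        self.lines = []

    def mark(self):
        # Remember the current end of the log.
        return len(self.lines)

    def wait_for(self, pattern, from_mark=0):
        # Scan only lines appended at or after from_mark.
        for line in self.lines[from_mark:]:
            if re.search(pattern, line):
                return line
        return None

log = Log()
log.lines.append("Stopping 3 tasks for 3 ongoing compactions (bootstrap)")
mark = log.mark()          # taken just before launching the shutdown task
log.lines.append("unrelated message")
# Without from_mark, the stale bootstrap-time message matches immediately:
stale = log.wait_for("Stopping.*ongoing compactions")
# With from_mark, only messages after the mark are considered:
fresh = log.wait_for("Stopping.*ongoing compactions", from_mark=mark)
```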

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1319.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29311
2026-04-03 06:28:51 +03:00
Andrei Chekun
6526a78334 test.py: fix nodetool mock server port collision
Replace the random port selection with an OS-assigned port. We open
a temporary TCP socket, bind it to (ip, 0) with SO_REUSEADDR, read back
the port number the OS selected, then close the socket before launching
rest_api_mock.py.
Add reuse_address=True and reuse_port=True to TCPSite in rest_api_mock.py
so the server itself can also reclaim a TIME_WAIT port if needed.
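The OS-assigned-port idiom can be sketched as follows (names are illustrative; the real test.py code may differ):

```python
import socket

def pick_free_port(ip="127.0.0.1"):
    # Ask the OS for a free port: bind to port 0, read the chosen port back,
    # then close the socket before handing the port to the mock server.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind((ip, 0))
    port = s.getsockname()[1]
    s.close()
    return port
```

There is an inherently small window between closing the socket and the mock server rebinding the port, which is why rest_api_mock.py also sets reuse_address/reuse_port on its TCPSite.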

Fixes: SCYLLADB-1275

Closes scylladb/scylladb#29314
2026-04-02 16:24:07 +02:00
Botond Dénes
eb78498e07 test: fix flaky test_timeout_is_applied_on_lookup by using eventually_true
On slow/overloaded CI machines the lowres_clock timer may not have
fired after the fixed 2x sleep, causing the assertion on
get_abort_exception() to fail. Replace the fixed sleep with
sleep(1x) + eventually_true() which retries with exponential backoff,
matching the pattern already used in test_time_based_cache_eviction.

Fixes: SCYLLADB-1311

Closes scylladb/scylladb#29299
2026-04-01 18:20:11 +03:00
Marcin Maliszkiewicz
a74665b300 transport: add per-service-level pending response memory metric
Track the total memory consumed by responses waiting to be
written to the socket, exposed as a per-scheduling-group gauge
(cql_pending_response_memory). This complements the response
memory accounting added in the previous commits by giving
visibility into how much memory each service level is holding
in unsent response buffers.
2026-04-01 17:15:28 +02:00
Robert Bindar
e7527392c4 test: close clients if cluster teardown throws
Make sure the driver is stopped even if cluster teardown throws,
to avoid stale driver connections entering infinite reconnect
loops that exhaust CPU resources.

Fixes: SCYLLADB-1189

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29230
2026-04-01 17:22:19 +03:00
Tomasz Grabiec
2ec47a8a21 tests: address_map_test: Fix flakiness in debug mode due to task reordering
Debug mode shuffles task position in the queue. So the following is possible:
 1) shard 1 calls manual_clock::advance(). This expires timers on shard 1 and queues a background smp call to shard 0 which will expire timers there
 2) the smp::submit_to(0, ...) from shard 1 called by the test submits the call
 3) shard 0 creates tasks for both calls, but (2) is run first, and preempts the reactor
 4) shard 1 sees the completion, completes m_svc.invoke_on(1, ..)
 5) shard 0 inserts the completion from (4) before task from (1)
 6) the check on shard 0: m.find(id1) fails because the timer is not expired yet

To fix that, wait for timer expiration on shard 0, so that the test
doesn't depend on task execution order.

Note: I was not able to reproduce the problem locally using test.py --mode
debug --repeat 1000.

It happens in jenkins very rarely. Which is expected as the scenario which
leads to this is quite unlikely.

Fixes SCYLLADB-1265

Closes scylladb/scylladb#29290
2026-04-01 17:17:35 +03:00
Aleksandra Martyniuk
4d4ce074bb test: node_ops_tasks_tree: reconnect driver after topology changes
The test exercises all five node operations (bootstrap, replace, rebuild,
removenode, decommission) and by the end only one node out of four
remains alive. The CQL driver session, however, still holds stale
references to the dead hosts in its connection pool and load-balancing
policy state.

When the new_test_keyspace context manager exits and attempts
DROP KEYSPACE, the driver routes the query to the dead hosts first,
gets ConnectionShutdown from each, and throws NoHostAvailable before
ever trying the single live node.

Fix by calling driver_connect() after the decommission step, which
closes the old session and creates a fresh one connected only to the
servers the test manager reports as running.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1313.

Closes scylladb/scylladb#29306
2026-04-01 17:13:11 +03:00
Dario Mirovic
85127fded8 test: boost: test null data value to_parsable_string
Add tests for null value in data_type::to_parsable_string().
We now explicitly return "null".

Refs SCYLLADB-1350
2026-04-01 14:15:25 +02:00
Dario Mirovic
fc705dfb4b cql3: fix null handling in data_value formatting
data_value::to_parsable_string() crashed with a null pointer
dereference when called on a null data_value. Return "null" instead.

Fixes SCYLLADB-1350
2026-04-01 14:15:18 +02:00
Andrzej Jackowski
cccb014747 test: ldap: add regression test for double-free on unregistered message ID
Sends a search via the raw LDAP handle (bypassing _msgid_to_promise
registration), then triggers poll_results() through the public API
to exercise the unregistered-ID branch.

Refs: SCYLLADB-1344
2026-04-01 12:57:50 +02:00
Botond Dénes
0351756b15 Merge 'test: fix fuzzy_test timeout in release mode' from Piotr Smaron
The multishard_query_test/fuzzy_test was timing out (SIGKILL after
15 minutes) in release mode CI.

In release mode the test generates up to 64 partitions with up to
1000 clustering rows and 1000 range tombstones each.  With deeply
nested randomly-generated types (e.g. frozen<map<varint,
frozen<map<frozen<tuple<...>>>>>>), this volume of data can exceed
the 15-minute CI timeout.

Reduce the release-mode clustering-row and range-tombstone
distributions from 0-1000 to 0-200.  This caps the worst case at
~12,800 rows -- still 2x the devel-mode maximum (0-100) and
sufficient to exercise multi-partition paged scanning with many
pages.

Fixes: SCYLLADB-1270

No need to backport for now, only appeared on master.

Closes scylladb/scylladb#29293

* github.com:scylladb/scylladb:
  test: clean up fuzzy_test_config and add comments
  test: fix fuzzy_test timeout in release mode
2026-04-01 11:50:15 +03:00
Andrzej Jackowski
f0028c06dc ldap: fix double-free of LDAPMessage in poll_results()
In the unregistered-ID branch, ldap_msgfree() was called on a result
already owned by an RAII ldap_msg_ptr, causing a double-free on scope
exit. Remove the redundant manual free.

Fixes: SCYLLADB-1344
2026-04-01 10:35:13 +02:00
Andrei Chekun
18f41dcd71 test.py: introduce new scheduler for choosing job count
This commit improves how test.py chooses the default number of
parallel jobs.
It keeps the logic of deriving the job count from memory and CPU limits,
but simplifies the heuristic so it is smoother and easier to reason about.
This avoids discontinuities such as neighboring machine sizes producing
unexpectedly different job counts, and behaves more predictably on asymmetric
machines where CPU and RAM do not scale together.

Compared to the current threshold-based version, this approach:
- avoids hard jumps around memory cutoffs
- avoids bucketed debug scaling based on CPU count
- keeps CPU and memory as separate constraints and combines them in one place
- avoids double-penalizing debug mode
- is easier to tune later by adjusting a few constants instead of rewriting branching logic
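A min-of-constraints heuristic along these lines could look like this (all constants and names here are hypothetical; the commit does not specify the actual formula):

```python
import math

def default_job_count(ncpus, ram_gb, cpus_per_job=1.0, ram_gb_per_job=2.0):
    # Treat CPU and memory as independent constraints and combine them in
    # one place by taking the minimum. No thresholds or buckets, so
    # neighboring machine sizes produce smoothly varying job counts.
    by_cpu = ncpus / cpus_per_job
    by_ram = ram_gb / ram_gb_per_job
    return max(1, math.floor(min(by_cpu, by_ram)))
```

Tuning then means adjusting `cpus_per_job` / `ram_gb_per_job` (e.g. with larger values for debug mode) rather than rewriting branching logic.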

Closes scylladb/scylladb#28904
2026-04-01 11:11:15 +03:00
Avi Kivity
d438e35cdd test/cluster: fix race in test_insert_failure_standalone audit log query
get_audit_partitions_for_operation() returns None when no audit log
rows are found. In _test_insert_failure_doesnt_report_success_assign_nodes,
this None is passed to set(), causing TypeError: 'NoneType' object is
not iterable.

The audit log entry may not yet be visible immediately after executing
the INSERT, so use wait_for() from test.pylib.util with exponential
backoff to poll until the entry appears. Import it as wait_for_async
to avoid shadowing the existing wait_for from test.cluster.dtest.dtest_class,
which has a different signature (timeout vs deadline).
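The aliased polling described above can be modeled with a self-contained sketch (the helper below is illustrative; in the real tree the import would be `from test.pylib.util import wait_for as wait_for_async`, and the stand-in predicate mimics get_audit_partitions_for_operation() returning None until rows appear):

```python
import asyncio
import time

async def wait_for_async(pred, deadline, period=0.1):
    # Poll the async predicate until it yields a non-None value or the
    # deadline passes (deadline-based, unlike the timeout-based wait_for).
    while True:
        res = await pred()
        if res is not None:
            return res
        if time.time() >= deadline:
            raise AssertionError("wait_for timed out")
        await asyncio.sleep(period)

async def main():
    rows = []
    async def audit_rows():
        # Stand-in for the audit-log query: None until the entry is visible.
        rows.append(1)
        return rows if len(rows) >= 2 else None
    return await wait_for_async(audit_rows, time.time() + 5, period=0.01)

result = asyncio.run(main())
```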

Fixes SCYLLADB-1330

Closes scylladb/scylladb#29289
2026-04-01 10:59:02 +03:00
Botond Dénes
2d2ff4fbda sstables: use chunked_managed_vector for promoted indexes in partition_index_page
Switch _promoted_indexes storage in partition_index_page from
managed_vector to chunked_managed_vector to avoid large contiguous
allocations.

Avoid allocation failure (or crashes with --abort-on-internal-error)
when large partitions have enough promoted index entries to trigger a
large allocation with managed_vector.

Fixes: SCYLLADB-1315

Closes scylladb/scylladb#29283
2026-03-31 18:43:57 +03:00
Piotr Smaron
2ce409dca0 test: clean up fuzzy_test_config and add comments
Remove the unused timeout field from fuzzy_test_config.  It was
declared, initialized per build mode, and logged, but never actually
enforced anywhere.

Document the intentionally small max_size (1024 bytes) passed to
read_partitions_with_paged_scan in run_fuzzy_test_scan: it forces
many pages per scan to stress the paging and result-merging logic.
2026-03-31 17:13:26 +02:00
Piotr Smaron
df2924b2a3 test: fix fuzzy_test timeout in release mode
The multishard_query_test/fuzzy_test was timing out (SIGKILL after
15 minutes) in release mode CI.

In release mode the test generates up to 64 partitions with up to
1000 clustering rows and 1000 range tombstones each.  With deeply
nested randomly-generated types (e.g. frozen<map<varint,
frozen<map<frozen<tuple<...>>>>>>), this volume of data can exceed
the 15-minute CI timeout.

Reduce the release-mode clustering-row and range-tombstone
distributions from 0-1000 to 0-200.  This caps the worst case at
~12,800 rows -- still 2x the devel-mode maximum (0-100) and
sufficient to exercise multi-partition paged scanning with many
pages.

Fixes: SCYLLADB-1270
2026-03-31 17:13:06 +02:00
Piotr Szymaniak
6d8ec8a0c0 alternator: fix flaky test_update_condition_unused_entries_short_circuit
The test was flaky because it stopped dc2_node immediately after an
LWT write, before cross-DC replication could complete. The LWT commit
uses LOCAL_QUORUM, which only guarantees persistence in the
coordinator's DC. Replication to the remote DC is async background
work, and CAS mutations don't store hints. Stopping dc2_node could
drop in-flight RPCs, leaving DC1 without the mutation.

Fix by polling both live DC1 nodes after the write to confirm
cross-DC replication completed before stopping dc2_node. Both nodes
must have the data so that the later ConsistentRead=True
(LOCAL_QUORUM) read on restarted node1 is guaranteed to succeed.

Fixes SCYLLADB-1267

Closes scylladb/scylladb#29287
2026-03-31 16:50:51 +03:00
Dawid Mędrek
f040f1b703 Merge 'raft: remake the read barrier optimization' from Patryk Jędrzejczak
The approach taken in 1ae2ae50a6 turned
out to be incorrect. The Raft member requesting a read barrier could
incorrectly advance its commit_idx and break linearizability. We revert that
commit in this PR.

We also remake the read barrier optimization with a completely new approach.
We make the leader replicate to the non-voting requester of a read barrier if
its `commit_idx` is behind.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-998

No backport: the issue is present only in master.

Closes scylladb/scylladb#29216

* github.com:scylladb/scylladb:
  raft: speed up read barrier requested by non-voters
  Revert "raft: read_barrier: update local commit_idx to read_idx when it's safe"
2026-03-31 15:11:56 +02:00
Marcin Maliszkiewicz
a26ca0f5f7 transport: hold memory permit until response write completes
Capture the memory permit in the leave lambda's .finally()
continuation so that the semaphore units are kept alive until
write_response finishes, preventing premature release of
memory accounting.

This is especially important with slow networks and big responses,
when buffers can accumulate and deplete the node's memory.
2026-03-31 14:05:00 +02:00
Avi Kivity
216d39883a Merge 'test: audit: fix audit test syslog race' from Dario Mirovic
Fix two independent race conditions in the syslog audit test that cause intermittent `assert 2 <= 1` failures in `assert_entries_were_added`.

**Datagram ordering race:**
`UnixSockerListener` used `ThreadingUnixDatagramServer`, where each datagram spawns a new thread. The notification barrier in `get_lines()` assumes FIFO handling, but the notification thread can win the lock before an audit entry thread, so `clear_audit_logs()` misses entries that arrive moments later. Fix: switch to sequential `UnixDatagramServer`.

**Config reload race:**
The live-update path used `wait_for_config` (REST API poll on shard 0) which can return before `broadcast_to_all_shards()` completes. Fix: wait for `"completed re-reading configuration file"` in the server log after each SIGHUP, which guarantees all shards have the new config.

Fixes SCYLLADB-1277

This is CI improvement for the latest code. No need for backport.

Closes scylladb/scylladb#29282

* github.com:scylladb/scylladb:
  test: cluster: wait for full config reload in audit live-update path
  test: cluster: fix syslog listener datagram ordering race
2026-03-31 13:53:01 +03:00
Tomasz Grabiec
b355bb70c2 dtest/alternator: stop concurrent-requests test when workers hit limit
`test_limit_concurrent_requests` could create far more tables than intended
because worker threads looped indefinitely and only the probe path terminated
the test. In practice, workers often hit `RequestLimitExceeded` first, but the
test kept running and creating tables, increasing memory pressure and causing
flakiness due to bad_alloc errors in logs.

Fix by replacing the old probe-driven termination with worker-driven
termination. Workers now run until any worker sees
`RequestLimitExceeded`.
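The worker-driven termination can be sketched with a shared stop event (a minimal model; `fake_request` and the exception class stand in for the real Alternator client calls):

```python
import threading

class RequestLimitExceeded(Exception):
    """Stand-in for the Alternator error the workers watch for."""

def run_workers(do_request, num_workers=6):
    """Run workers until any one of them observes RequestLimitExceeded."""
    stop = threading.Event()
    def worker():
        while not stop.is_set():
            try:
                do_request()
            except RequestLimitExceeded:
                stop.set()  # any worker hitting the limit stops all of them
    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return stop.is_set()

# Simulated server that rejects every 5th request.
count = 0
lock = threading.Lock()
def fake_request():
    global count
    with lock:
        count += 1
        if count % 5 == 0:
            raise RequestLimitExceeded()

print(run_workers(fake_request))
```

Once any worker sets the event, the remaining workers exit on their next loop check, so no further tables are created after the limit is first observed.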

Fixes SCYLLADB-1181

Closes scylladb/scylladb#29270
2026-03-31 13:35:50 +03:00
Patryk Jędrzejczak
b9f82f6f23 raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start
A joining node hung forever if the topology coordinator added it to the
group 0 configuration before the node reached `post_server_start`. In
that case, `server->get_configuration().contains(my_id)` returned true
and the node broke out of the join loop early, skipping
`post_server_start`. `_join_node_group0_started` was therefore never set,
so the node's `join_node_response` RPC handler blocked indefinitely.
Meanwhile the topology coordinator's `respond_to_joining_node` call
(which has no timeout) hung forever waiting for the reply that never came.

Fix by only taking the early-break path when not starting as a follower
(i.e. when the node is the discovery leader or is restarting). A joining
node must always reach `post_server_start`.

We also provide a regression test. It takes 6s in dev mode.

Fixes SCYLLADB-959

Closes scylladb/scylladb#29266
2026-03-31 12:33:56 +02:00
Marcin Maliszkiewicz
2645b95888 transport: account for response size exceeding initial memory estimate
After obtaining the CQL response, check if its actual size exceeds
the initially acquired memory permit. If so, take semaphore units
and adopt them into the permit (non-blocking).

This doesn't fully prevent allocating too much memory, as the size
is only known once the buffer is already allocated, but it improves
memory accounting for big responses.
2026-03-31 11:57:41 +02:00
Dario Mirovic
0cb63fb669 test: cluster: wait for full config reload in audit live-update path
_apply_config_to_running_servers used wait_for_config (REST API poll)
to confirm live config updates. The REST API reads from shard 0 only,
so it can return before broadcast_to_all_shards() completes — other
shards may still have stale audit config, generating unexpected entries.
Additionally, server_remove_config_option for absent keys sent separate
SIGHUPs before server_update_config, and the single wait_for_config at
the end could match a completion from an earlier SIGHUP.

Wait for "completed re-reading configuration file" in the server log
after each SIGHUP-producing operation. This message is logged only
after both read_config() and broadcast_to_all_shards() finish,
guaranteeing all shards have the new config. Each operation gets its
own mark+wait so no stale completion is matched.
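The per-operation mark+wait idea can be sketched over an append-only log (hypothetical `LogWatcher` helper; the real implementation reads the server log file):

```python
class LogWatcher:
    """Append-only log with a mark+check primitive: each check only
    matches lines written after its own mark, so a completion message
    from an earlier SIGHUP can never satisfy a later wait."""
    def __init__(self):
        self._lines = []
    def append(self, line):
        self._lines.append(line)
    def mark(self):
        return len(self._lines)
    def seen_after(self, mark, needle):
        return any(needle in line for line in self._lines[mark:])

log = LogWatcher()
DONE = "completed re-reading configuration file"

log.append(DONE)                 # completion from an earlier SIGHUP
m = log.mark()                   # mark taken before our own SIGHUP
stale = log.seen_after(m, DONE)  # False: the old message is not matched
log.append(DONE)                 # our SIGHUP's completion arrives
fresh = log.seen_after(m, DONE)  # True: only the new message matches
print(stale, fresh)
```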

Fixes SCYLLADB-1277
2026-03-31 02:27:11 +02:00
Dario Mirovic
1d623196eb test: cluster: fix syslog listener datagram ordering race
UnixSockerListener used ThreadingUnixDatagramServer, which spawns a
new thread per datagram. The notification barrier in get_lines() relies
on all prior datagrams being handled before the notification. With
threading, the notification handler can win the lock before an audit
entry handler, so get_lines() returns before the entry is appended.
clear_audit_logs() then clears an incomplete buffer, and the late
entry leaks into the next test's before/after diff.

Switch to sequential UnixDatagramServer. The server thread now handles
datagrams in kernel FIFO order, so the notification is always processed
after all preceding audit entries.
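The sequential-server fix can be demonstrated with Python's `socketserver` (a self-contained sketch, not the actual test harness):

```python
import os
import socket
import socketserver
import tempfile
import threading

received = []
done = threading.Event()

class Handler(socketserver.BaseRequestHandler):
    def handle(self):
        # For datagram servers, self.request is (data, socket).
        data = self.request[0].decode()
        if data == "NOTIFY":
            done.set()  # barrier: every earlier datagram is already handled
        else:
            received.append(data)

path = os.path.join(tempfile.mkdtemp(), "audit.sock")
# Sequential server: datagrams are handled one at a time in kernel FIFO
# order, so the notification runs only after all preceding audit entries.
server = socketserver.UnixDatagramServer(path, Handler)
threading.Thread(target=server.serve_forever, daemon=True).start()

client = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
for msg in ("entry-1", "entry-2", "NOTIFY"):
    client.sendto(msg.encode(), path)
done.wait(timeout=5)
server.shutdown()
print(received)
```

With `ThreadingUnixDatagramServer` in place of `UnixDatagramServer`, each datagram would be handled on its own thread and the NOTIFY handler could run before the entry handlers finish, which is the race being fixed.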

Refs SCYLLADB-1277
2026-03-31 02:27:11 +02:00
Karol Nowacki
493a4433e7 index: fix DESC INDEX for vector index
The `DESC INDEX` command returned incorrect results for local vector
indexes and for vector indexes that included filtering columns.

This patch corrects the implementation to ensure `DESCRIBE INDEX`
accurately reflects the index configuration.

This was a pre-existing issue, not a regression from recent
serialization schema changes for vector index target options.
2026-03-30 16:46:48 +02:00
Karol Nowacki
a32e4bb9f4 vector_search: test: refactor boilerplate setup
The test boilerplate setup for some vector store client tests
has been extracted to a common function.
2026-03-30 16:46:48 +02:00
Karol Nowacki
6bc88e817f vector_search: fix SELECT on local vector index
Queries against local vector indexes were failing with the error:
"ANN ordering by vector requires the column to be indexed using 'vector_index'"

This was a regression introduced by 15788c3734, which incorrectly
assumed the first column in the targets list is always the vector column.
For local vector indexes, the first column is the partition key, causing
the failure.

Previously, serialization logic for the target index option was shared
between vector and secondary indexes. This is no longer viable due to
the introduction of local vector indexes and vector indexes with filtering
columns, which have a different target format.

This commit introduces a dedicated JSON-based serialization format for
vector index targets, identifying the target column (tc), filtering
columns (fc), and partition key columns (pk). This ensures unambiguous
serialization and deserialization for all vector index types.

This change is backward compatible for regular vector indexes. However,
it breaks compatibility for local vector indexes and vector indexes with
filtering columns created in version 2026.1.0. To mitigate this, usage
of these specific index types will be blocked in the 2026.1.0 release
by failing ANN queries against them in vector-store service.

Fixes: SCYLLADB-895
2026-03-30 16:46:48 +02:00
Karol Nowacki
c0b78477a5 index: test: vector index target option serialization test
This test ensures that the serialization format for vector index target
options remains stable. Maintaining backward compatibility is critical
because the index is restored from this property on startup.
Any unintended changes to the serialization schema could break existing
indexes after an upgrade.

This option is also an interface for the vector-store service,
which uses it to identify the indexed column.
2026-03-30 16:46:48 +02:00
Karol Nowacki
4dc28dfa52 index: test: secondary index target option serialization test
Target option serialization must remain stable for backward compatibility.
The index is restored from this property on startup, so unintentional
changes to the serialization schema can break indexes after upgrade.
2026-03-30 16:46:47 +02:00
Patryk Jędrzejczak
ba54b2272b raft: speed up read barrier requested by non-voters
We achieve this by making the leader replicate to the non-voting requester
of a read barrier if its commit_idx is behind.

There are some corner cases where the new `replicate_to(*opt_progress, true);`
call will be a no-op, while the corresponding call in `tick_leader()` would
result in sending the AppendEntries RPC to the follower. These cases are:
- `progress.state == follower_progress::state::PROBE && progress.probe_sent`,
- `progress.state == follower_progress::state::PIPELINE
  && progress.in_flight == follower_progress::max_in_flight`.
We could try to improve the optimization by including some of the cases above,
but it would only complicate the code without noticeable benefits (at least
for group0).

Note: this is the second attempt for this optimization. The first approach
turned out to be incorrect and was reverted in the previous commit. The
performance improvement is the same as in the previous case.
2026-03-30 15:56:24 +02:00
Patryk Jędrzejczak
4913acd742 Revert "raft: read_barrier: update local commit_idx to read_idx when it's safe"
This reverts commit 1ae2ae50a6.

The reverted change turned out to be incorrect. The Raft member requesting
a read barrier could incorrectly advance its commit_idx and break
linearizability. More details in
https://scylladb.atlassian.net/browse/SCYLLADB-998?focusedCommentId=42935
2026-03-30 15:56:24 +02:00
Andrzej Jackowski
ab43420d30 test: use exclusive driver connection in test_limited_concurrency_of_writes
Use get_cql_exclusive(node1) so the driver only connects to node1 and
never attempts to contact the stopped node2. The test was flaky because
the driver received `Host has been marked down or removed` from node2.

Fixes: SCYLLADB-1227

Closes scylladb/scylladb#29268
2026-03-30 11:50:44 +02:00
Botond Dénes
068a7894aa test/cluster: fix flaky test_cleanup_stop by using asyncio.sleep
The test was using time.sleep(1) (a blocking call) to wait after
scheduling the stop_compaction task, intending to let it register on
the server before releasing the sstable_cleanup_wait injection point.

However, time.sleep() blocks the asyncio event loop entirely, so the
asyncio.create_task(stop_compaction) task never gets to run during the
sleep. After the sleep, the directly-awaited message_injection() runs
first, releasing the injection point before stop_compaction is even
sent. By the time stop_compaction reaches Scylla, the cleanup has
already completed successfully -- no exception is raised and the test
fails.

Fix by replacing time.sleep(1) with await asyncio.sleep(1), which
yields control to the event loop and allows the stop_compaction task
to actually send its HTTP request before message_injection is called.
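The event-loop behavior behind the fix can be shown in isolation (a standalone demonstration, not the test's code):

```python
import asyncio
import time

async def background_task(log):
    log.append("task ran")

async def with_blocking_sleep():
    log = []
    task = asyncio.create_task(background_task(log))
    time.sleep(0.1)          # blocks the event loop: the task cannot run
    snapshot = list(log)     # still empty here
    await task               # yield to the loop so the task finally runs
    return snapshot

async def with_async_sleep():
    log = []
    task = asyncio.create_task(background_task(log))
    await asyncio.sleep(0.1) # yields to the loop: the task runs meanwhile
    snapshot = list(log)
    await task
    return snapshot

blocked = asyncio.run(with_blocking_sleep())
yielded = asyncio.run(with_async_sleep())
print(blocked, yielded)
```

The scheduled task only makes progress when the coroutine yields to the loop, which `time.sleep()` never does.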

Fixes: SCYLLADB-834

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29202
2026-03-30 11:40:47 +03:00
Nikos Dragazis
3b3b02b15a docs: Add ops guide for vnodes-to-tablets migration
The vnodes-to-tablets migration is a manual procedure, so instructions
need to be provided to the users.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-29 22:18:46 +03:00
Ernest Zaslavsky
1d779804a0 scripts: remove lua library rename workaround from comparison script
Now that cmake/FindLua.cmake uses pkg-config (matching configure.py),
both build systems resolve to the same 'lua' library name.  Remove the
lua/lua-5.4 entries from _KNOWN_LIB_ASYMMETRIES and add 'm' (math
library) as a known transitive dependency that configure.py gets via
pkg-config for lua.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
c32851b102 cmake: add custom FindLua using pkg-config to match configure.py
CMake's built-in FindLua resolves to the versioned library file
(e.g. liblua-5.4.so) instead of the unversioned symlink (liblua.so),
causing a library name mismatch between the two build systems.
Add a custom cmake/FindLua.cmake that uses pkg-config — matching
configure.py's approach — and find_library(NAMES lua) to find the
unversioned symlink.  This also mirrors the pattern used by other
Find modules in cmake/ (FindxxHash, Findlz4, etc.).
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
f3a91df0b4 test/cmake: add missing tests to boost test suite
Add symmetric_key_test (standalone, links encryption library) and
auth_cache_test to the combined_tests binary. These tests already
exist in configure.py; this aligns the CMake build.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
de606cc17a test/cmake: remove per-test LTO disable
The per-test -fno-lto link option is now redundant since -fno-lto
was added globally in mode.common.cmake. LTO-enabled targets
(the scylla binary in RelWithDebInfo) override it via enable_lto().
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
38ba58567a cmake: add BOOST_ALL_DYN_LINK and strip per-component defines
Match configure.py's Boost handling:
- Add BOOST_ALL_DYN_LINK when using shared Boost libraries.
- Strip per-component defines (BOOST_UNIT_TEST_FRAMEWORK_DYN_LINK,
  BOOST_REGEX_DYN_LINK, etc.) that CMake's Boost package config
  adds on imported targets. configure.py only uses the umbrella
  BOOST_ALL_DYN_LINK define.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
7e72898150 cmake: move SEASTAR_TESTING_MAIN after seastar and abseil subdirs
Place add_compile_definitions(SEASTAR_TESTING_MAIN) after both
add_subdirectory(seastar) and add_subdirectory(abseil) are processed.
This matches configure.py's global define without leaking into
seastar's subdirectory build (which would cause a duplicate main
symbol in seastar_testing).
Remove the now-redundant per-test SEASTAR_TESTING_MAIN compile
definition from test/CMakeLists.txt.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
b0837ead3e cmake: add -fno-sanitize=vptr for abseil sanitizer flags
Match configure.py line 2192: abseil gets sanitizer flags with
-fno-sanitize=vptr to exclude vptr checks which are incompatible
with abseil's usage of type-punning patterns.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
dd829fa69c cmake: align Seastar build configuration with configure.py
- Set BUILD_SHARED_LIBS based on build type to match configure.py's
  build_seastar_shared_libs: Debug and Dev build Seastar as a shared
  library, all other modes build it static.
- Add sanitizer link options on the seastar target for Coverage
  mode. Seastar's CMake only activates sanitizer targets for
  Debug/Sanitize configs, but Coverage mode needs them too since
  configure.py's seastar_libs_coverage carries -fsanitize flags.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
52e4d44a75 cmake: align global compile defines and options with configure.py
- Disable CMake's automatic -fcolor-diagnostics injection for
  Clang+Ninja (CMake 3.24+), matching configure.py which does not
  add any color diagnostics flags.
- Add SEASTAR_NO_EXCEPTION_HACK and XXH_PRIVATE_API as global
  defines (previously SEASTAR_NO_EXCEPTION_HACK was only on the
  seastar target as PRIVATE; it needs to be project-wide).
- Add -fpch-validate-input-files-content to check precompiled
  header content when timestamps don't match.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
6f2fe3c2fc cmake: fix Coverage mode in mode.Coverage.cmake
Fix multiple deviations from configure.py's coverage mode:
- Remove -fprofile-list from CMAKE_CXX_FLAGS_COVERAGE. That flag
  belongs in COVERAGE_INST_FLAGS applied to other modes, not to
  coverage mode itself.
- Replace incorrect defines (DEBUG, SANITIZE, DEBUG_LSA_SANITIZER,
  SCYLLA_ENABLE_ERROR_INJECTION) with the correct Seastar debug
  defines (SEASTAR_DEBUG, SEASTAR_DEFAULT_ALLOCATOR, etc.) that
  configure.py's pkg-config query produces for coverage mode.
- Add sanitizer and stack-clash-protection compile flags for
  Coverage config, matching the flags that Seastar's pkg-config
  --cflags output includes for debug builds.
- Change CMAKE_STATIC_LINKER_FLAGS_COVERAGE to
  CMAKE_EXE_LINKER_FLAGS_COVERAGE. Coverage flags need to reach
  the executable linker, not the static archiver.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
7d23ba7dc8 cmake: align mode.common.cmake flags with configure.py
Add three flag-alignment changes:
- -Wno-error=stack-usage= alongside the stack-usage threshold flag,
  preventing hard errors from stack-usage warnings (matching
  configure.py behavior).
- -fno-lto global link option. configure.py adds -fno-lto to all
  binaries; LTO-enabled targets override it via enable_lto().
- Sanitizer link flags (-fsanitize=address, -fsanitize=undefined) for
  Debug/Sanitize configs, matching configure.py's cxx_ld_flags.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
38088a8a94 configure.py: add sstable_tablet_streaming to combined_tests 2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
33bca2428a docs: add compare-build-systems.md
Document the purpose, usage, and examples for
scripts/compare_build_systems.py which compares the configure.py
and CMake build systems by parsing their ninja build files.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
d3972369a0 scripts: add compare_build_systems.py to compare ninja build files
Add a script that compares configure.py and CMake build systems by
parsing their generated build.ninja files. The script checks:
  - Per-file compilation flags (defines, warnings, optimization)
  - Link target sets (detect missing/extra targets)
  - Per-target linker flags and libraries

configure.py is treated as the baseline. CMake should match it.
Both systems are always configured into a temporary directory so the
user's build tree is never touched.

Usage:
  scripts/compare_build_systems.py -m dev   # single mode
  scripts/compare_build_systems.py          # all modes
  scripts/compare_build_systems.py --ci     # CI mode (strict)
2026-03-29 16:17:44 +03:00
Nadav Har'El
d32fe72252 Merge 'alternator: check concurrency limit before memory acquisition' from Łukasz Paszkowski
Fix the ordering of the concurrency limit check in the Alternator HTTP server so it happens before memory acquisition, and reduce test pressure to avoid LSA exhaustion on the memory-constrained test node.

The patch moves the concurrency check to right after the content-length early-out, before any memory acquisition or I/O. The check was originally placed before memory acquisition but was inadvertently moved after it during a refactoring. This allowed unlimited requests to pile up consuming memory, reading bodies, verifying signatures, and decompressing — all before being rejected. Restores the original ordering and mirrors the CQL transport (`transport/server.cc`).

Lowers `concurrent_requests_limit` from 5 to 3 and the thread multiplier from 5 to 2 (6 threads instead of 25). This is still sufficient to reliably trigger RequestLimitExceeded, while keeping flush pressure within what 512MB per shard can sustain.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1248
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1181

The test started to fail quite recently. It affects master only. No backport is needed. We might want to consider backporting a commit moving the concurrency check earlier.

Closes scylladb/scylladb#29272

* github.com:scylladb/scylladb:
  test: reduce concurrent-request-limit test pressure to avoid LSA exhaustion
  alternator: check concurrency limit before memory acquisition
2026-03-29 11:08:28 +03:00
Łukasz Paszkowski
b8e3ef0c64 test: reduce concurrent-request-limit test pressure to avoid LSA exhaustion
The test_limit_concurrent_requests dtest uses concurrent CreateTable
requests to verify Alternator's concurrency limiting.  Each admitted
CreateTable triggers Raft consensus, schema mutations, and memtable
flushes—all of which consume LSA memory.  On the 1 GB test node
(2 SMP × 512 MB), the original settings (limit=5, 25 threads) created
enough flush pressure to exhaust the LSA emergency reserve, producing
logalloc::bad_alloc errors in the node log.  The test was always
marginal under these settings and became flaky as new system tables
increased baseline LSA usage over time.

Lower concurrent_requests_limit from 5 to 3 and the thread multiplier
from 5 to 2 (6 threads total).  This is still well above the limit and
sufficient to reliably trigger RequestLimitExceeded, while keeping flush
pressure within what 512 MB per shard can sustain.
2026-03-28 20:40:33 +01:00
Łukasz Paszkowski
a86928caa1 alternator: check concurrency limit before memory acquisition
The concurrency limit check in the Alternator server was positioned after
memory acquisition (get_units), request body reading (read_entire_stream),
signature verification, and decompression. This allowed unlimited requests
to pile up consuming memory before being rejected, exhausting LSA memory
and causing logalloc::bad_alloc errors that cascade into Raft applier
and topology coordinator failures, breaking subsequent operations.

Without this fix, test_limit_concurrent_requests on a 1GB node produces
50 logalloc::bad_alloc errors and cascading failures: reads from
system.scylla_local fail, the Raft applier fiber stops, the topology
coordinator stops, and all subsequent CreateTable operations fail with
InternalServerError (500). With this fix, the cascade is eliminated --
admitted requests may still cause LSA pressure on a memory-constrained
node, but the server remains functional.

Move the concurrency check to right after the content-length early-out,
before any memory acquisition or I/O. This mirrors the CQL transport
which correctly checks concurrency before memory acquisition
(transport/server.cc).

The concurrency check was originally added in 1b8c946ad7 (Sep 2020)
*before* memory acquisition, which at the time lived inside with_gate
(after the concurrency gate). The ordering was inverted by f41dac2a3a
(Mar 2021, "avoid large contiguous allocation for request body"), which
moved get_units() earlier in the function to reserve memory before
reading the newly-introduced content stream -- but inadvertently also
moved it before the concurrency check. c3593462a4 (Mar 2025) further
worsened the situation by adding a 16MB fallback reservation for
requests without Content-Length and ungzip/deflate decompression steps
-- all before the concurrency check -- greatly increasing the memory
consumed by requests that would ultimately be rejected.
2026-03-28 20:40:33 +01:00
Emil Maskovsky
9dad68e58d raft: abort stale snapshot transfers when term changes
**The Bug**

Assertion failure: `SCYLLA_ASSERT(res.second)` in `raft/server.cc`
when creating a snapshot transfer for a destination that already had a
stale in-flight transfer.

**Root Cause**

If a node loses leadership and later becomes leader again before the next
`io_fiber` iteration, the old transfer from the previous term can remain
in `_snapshot_transfers` while `become_leader()` resets progress state.
When the new term emits `install_snapshot(dst)`, `send_snapshot(dst)`
tries to create a new entry for the same destination and can hit the
assertion.

**The Fix**

Abort all in-flight snapshot transfers in `process_fsm_output()` when
`term_and_vote` is persisted. A term/vote change marks existing transfers
as stale, so we clean them up before dispatching messages from that batch
and before any new snapshot transfer is started.

With cross-term cleanup moved to the term-change path, `send_snapshot()`
now asserts the within-term invariant that there is at most one in-flight
transfer per destination.

Fixes: SCYLLADB-862

Backport: The issue is reproducible in master, but is present in all
active branches.

Closes scylladb/scylladb#29092
2026-03-27 10:00:15 +01:00
Andrzej Jackowski
181ad9f476 Revert "audit: disable DDL by default"
This reverts commit c30607d80b.

With the default configuration, enabling DDL has no effect because
no `audit_keyspaces` or `audit_tables` are specified. Including DDL
in the default categories can be misleading for some customers, and
ideally we would like to avoid it.

However, DDL has been one of the default audit categories for years,
and removing it risks silently breaking existing deployments that
depend on it. Therefore, the recent change to disable DDL by default
is reverted.

Fixes: SCYLLADB-1155

Closes scylladb/scylladb#29169
2026-03-27 09:55:11 +01:00
Botond Dénes
854c374ebf test/encryption: wait for topology convergence after abrupt restart
test_reboot uses a custom restart function that SIGKILLs and restarts
nodes sequentially. After all nodes are back up, the test proceeded
directly to reads after wait_for_cql_and_get_hosts(), which only
confirms CQL reachability.

While a node is restarted, other nodes might execute global token
metadata barriers, which advance the topology fence version. The
restarted node has to learn about the new version before it can send
reads/writes to the other nodes. The test issues reads as soon as the
CQL port is opened, which might happen before the last restarted node
learns of the latest topology version. If this node acts as a
coordinator for reads/write before this happens, these will fail as the
other nodes will reject the ops with the outdated topology fence
version.

Fix this by replacing wait_for_cql_and_get_hosts() on the abrupt-restart
path with the more robust get_ready_cql(), which makes sure servers see
each other before refreshing the cql connection. This should ensure that
nodes have exchanged gossip and converged on topology state before any
reads are executed. The rolling_restart() path is unaffected as it
handles this internally.

Fixes: SCYLLADB-557

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29211
2026-03-27 09:52:27 +01:00
Avi Kivity
b708e5d7c9 Merge 'test: fix race condition in test_crashed_node_substitution' from Sergey Zolotukhin
`test_crashed_node_substitution` intermittently failed:
```python
   assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake (`finished do_send_ack2_msg`), assuming the node state was visible to all peers. However, since gossip is eventually consistent, the update may not have propagated yet, so some nodes did not see the failed node.

This change: Wait until the gossiper state is visible on peers before continuing the test and asserting.

Fixes: [SCYLLADB-1256](https://scylladb.atlassian.net/browse/SCYLLADB-1256).

backport: this issue may affect CI for all branches, so should be backported to all versions.

[SCYLLADB-1256]: https://scylladb.atlassian.net/browse/SCYLLADB-1256?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29254

* github.com:scylladb/scylladb:
  test: test_crashed_node_substitution: add docstring and fix whitespace
  test: fix race condition in test_crashed_node_substitution
2026-03-26 21:40:33 +02:00
Petr Gusev
c38e312321 test_lwt_fencing_upgrade: fix quorum failure due to gossip lag
If lwt_workload() sends an update immediately after a
rolling restart, the coordinator might still see a replica as
down due to gossip lagging behind. Concurrently restarting another
node leaves only one available replica, failing the
LOCAL_QUORUM requirement for learn or eventually consistent
sp::query() in sp::cas() and resulting in
a mutation_write_failure_exception.

We fix this problem by waiting for the restarted server
to see 2 other peers. The server_change_version
doesn't do that by default -- it passes
wait_others=0 to server_start().

Fixes SCYLLADB-1136

Closes scylladb/scylladb#29234
2026-03-26 21:25:53 +02:00
bitpathfinder
627a8294ed test: test_crashed_node_substitution: add docstring and fix whitespace
Add a description of the test's intent and scenario; remove extra blanks.
2026-03-26 18:40:17 +01:00
bitpathfinder
5a086ae9b7 test: fix race condition in test_crashed_node_substitution
`test_crashed_node_substitution` intermittently failed:
```
    assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake
("finished do_send_ack2_msg"), assuming the node state was
visible to all peers. However, since gossip is eventually
consistent, the update may not have propagated yet, so some
nodes did not see the failed node.

This change: Wait until the gossiper state is visible on
peers before continuing the test and asserting.

Fixes: SCYLLADB-1256.
2026-03-26 18:25:05 +01:00
Robert Bindar
c575bbf1e8 test_refresh_deletes_uploaded_sstables should wait for sstables to get deleted
SSTable unlinking is async, so in some cases it may happen that
the upload dir is not empty immediately after refresh is done.
This patch adjusts test_refresh_deletes_uploaded_sstables so
it waits, with a timeout, until the upload dir becomes empty
instead of just assuming the API will sync on sstables being
gone.

Fixes SCYLLADB-1190

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29215
2026-03-26 08:43:14 +03:00
Nikos Dragazis
8789c95a85 test: cluster: Add test for migration of multiple keyspaces
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
25af8bdc24 test: cluster: Add test for error conditions
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
01a51817c4 test: cluster: Add vnodes->tablets migration test (rollback)
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
56ec33d3e0 test: cluster: Add vnodes->tablets migration test (1 table, 3 nodes)
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
58e930c490 test: cluster: Add vnodes->tablets migration test (1 table, 1 node)
This test runs the vnodes-to-tablets migration for a single table on a
single-node cluster. The node has multiple shards and multiple
power-of-two aligned vnodes, so resharding is triggered.

More details in the docstring.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
8837dac2f9 scylla-nodetool: Add migrate-to-tablets subcommand
The vnodes-to-tablets migration is a manual procedure, so orchestration
must be done via nodetool.

This patch adds the following new commands:

* nodetool migrate-to-tablets start {ks}
* nodetool migrate-to-tablets upgrade
* nodetool migrate-to-tablets downgrade
* nodetool migrate-to-tablets status {ks}
* nodetool migrate-to-tablets finalize {ks}

The commands are just wrappers over the REST API.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:29 +02:00
Nikos Dragazis
2a5e6b832a api: Add REST endpoint for vnode-to-tablet migration status
If the keyspace is migrating, it reports the intended and actual storage
mode for each node.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-25 19:11:24 +02:00
Marcin Maliszkiewicz
7fdd650009 Merge 'test: audit: clean up test helper class naming' from Dario Mirovic
Remove unused `pytest.mark.single_node` marker from `TestCQLAudit`.

Rename `TestCQLAudit` to `CQLAuditTester` to reflect that it is a test helper, not a test class. This avoids accidental pytest collection and subsequent warning about `__init__`.

Logs before the fixes:
```
test/cluster/test_audit.py:514: 14 warnings
  /home/dario/dev/scylladb/test/cluster/test_audit.py:514: PytestCollectionWarning: cannot collect test class 'TestCQLAudit' because it has a __init__ constructor (from: cluster/test_audit.py)
    @pytest.mark.single_node
```

Fixes SCYLLADB-1237

This is an addition to the latest master code. No backport needed.

Closes scylladb/scylladb#29237

* github.com:scylladb/scylladb:
  test: audit: rename TestCQLAudit to CQLAuditTester
  test: audit: remove unused pytest.mark.single_node
2026-03-25 15:30:16 +01:00
Radosław Cybulski
1dc20cc8f9 alternator/test: explain why 'always' write isolation mode is used in tests
Improve test comments for test_streams_batchwrite_into_the_same_partition_deletes_existing_items
and test_streams_batchwrite_into_the_same_partition_will_report_wrong_stream_data to explain why
'always' write isolation mode is required: in always_use_lwt mode all items in a batch get the
same CDC timestamp, which triggers the squashing bug. In other modes each item gets a separate
timestamp so the bug doesn't manifest.

Also fix the example in the second test comment to use cleaner key values and correct event type
(INSERT, not MODIFY, since items are inserted into an empty table), and fix the issue reference
from #28452 (the PR) to #28439 (the issue).
2026-03-25 15:15:20 +01:00
Dario Mirovic
552a2d0995 test: audit: rename TestCQLAudit to CQLAuditTester
pytest tries to collect tests for execution in several ways.
One is to pick all classes that start with 'Test'. Those classes
must not have a custom '__init__' constructor. TestCQLAudit does.

TestCQLAudit after migration from test/cluster/dtest is not a test
class anymore, but rather a helper class. There are two ways to fix
this:
1. Add __test__ = False to the TestCQLAudit class
2. Rename it to not start with 'Test'

Option 2 feels better because the new name itself does not convey
the wrong message about its role.
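Both approaches can be sketched as follows; pytest honors a class-level `__test__ = False` attribute to opt a class out of collection, and the class/attribute names here are purely illustrative:

```python
# Option 1: keep the Test* prefix but tell pytest not to collect it.
class TestStyleHelper:
    __test__ = False  # pytest skips this class despite the Test* prefix

    def __init__(self, session):
        self.session = session


# Option 2 (chosen in this patch): rename the class so it no longer
# matches pytest's default Test* collection pattern at all.
class CQLAuditTester:
    def __init__(self, session):
        self.session = session
```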

Fixes SCYLLADB-1237
2026-03-25 13:21:08 +01:00
Dario Mirovic
73de865ca3 test: audit: remove unused pytest.mark.single_node
Remove unused pytest.mark.single_node in TestCQLAudit class.
This is a leftover from audit tests migration from
test/cluster/dtest to test/cluster.

Refs SCYLLADB-1237
2026-03-25 13:18:37 +01:00
Radosław Cybulski
ded62b2c5e alternator/test: add scylla_only to always write isolation fixture
Add scylla_only fixture dependency to the
test_table_ss_new_and_old_images_write_isolation_always fixture.
This ensures all tests using the 'always' write isolation mode
are skipped when running against DynamoDB (--aws), since the
system:write_isolation tag is a Scylla-only feature.
2026-03-25 12:38:09 +01:00
Radosław Cybulski
7d404cdd51 alternator: fix BatchWriteItem squashed Streams entries
BatchWriteItem with items for the same partition (and write isolation
set to always) will trigger LWT and run different cdc code path, which
will result in wrong Streams data being returned to the user -
changes will be randomly squashed together.
For example batch write:

  batch.put_item(Item={'p': 'p', 'c': 'c0'})
  batch.put_item(Item={'p': 'p', 'c': 'c1'})
  batch.put_item(Item={'p': 'p', 'c': 'c2'})

instead of producing 3 modify / insert events will produce one:

  type=INSERT, key={'c': {'S': 'c0'}, 'p': {'S': 'p'}},
      old_image=None, new_image={'c': {'S': 'c2'}, 'p': {'S': 'p'}}

with `new_image` having different `c` key from `key` field.

This happens because BatchWriteItem (when using LWT) emits its changes
to cdc under the same timestamp. This results in all log entries
being put in a single cdc "bucket" (under the same cdc$timestamp key).
The previous parsing algorithm would interpret those changes as a change
to a single item and squash them together.

The patch rewrites the algorithm to group records in a
`std::unordered_map` keyed by the value of the clustering key, which is
added to every cdc log entry. This allows rebuilding all item
modifications.
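The grouping idea can be sketched in Python (the entry field names are assumed for illustration; the real implementation uses a C++ `std::unordered_map`):

```python
from collections import defaultdict

def rebuild_records(cdc_entries):
    """Group CDC log entries that share one cdc$timestamp bucket by
    their clustering key, so each item in the batch yields its own
    stream record instead of all items being squashed into one."""
    by_clustering_key = defaultdict(list)
    for entry in cdc_entries:
        by_clustering_key[entry["clustering_key"]].append(entry)
    # Emit one record per distinct item (i.e. per clustering key).
    return [entries[-1] for entries in by_clustering_key.values()]

# The batch from the example above: three puts into the same partition.
entries = [
    {"clustering_key": "c0", "op": "INSERT"},
    {"clustering_key": "c1", "op": "INSERT"},
    {"clustering_key": "c2", "op": "INSERT"},
]
# Grouping by clustering key yields three records instead of one.
```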

Fixes #28439
Fixes: SCYLLADB-540
2026-03-25 11:40:53 +01:00
Radosław Cybulski
85da03c88d alternator: add BatchWriteItem test (failing)
Add additional BatchWriteItem tests (some failing):
- `test_streams_batchwrite_no_clustering_deletes_non_existing_items`
  `test_streams_batchwrite_no_clustering_deletes_existing_items` -
  these tests pass; we add them here for completeness, as non-clustering
  tables trigger different paths.
- `test_streams_batchwrite_into_the_same_partition_deletes_existing_items` -
  a failing test that checks combinations of puts and deletes in a single
  batch write (for example 3 items: 2 puts and 1 delete).
- `test_streams_batchwrite_into_the_same_partition_will_report_wrong_stream_data` -
  a simple failing test.

The tests fail because the current implementation, when writing cdc log
entries, squashes all changes made to the same partition together.
The data is still there, but when GetRecords is called and we parse the
cdc log entries, we don't correctly recover it (see issue #28439 for
more details).
2026-03-25 11:40:53 +01:00
Marcin Maliszkiewicz
f988ec18cb test/lib: fix port in-use detection in start_docker_service
Previously, the result of when_all was discarded. when_all stores
exceptions in the returned futures rather than throwing, so the outer
catch(in_use&) could never trigger. Now we capture the when_all result
and inspect each future individually to properly detect in_use from
either stream.
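The same pitfall exists in Python's asyncio: `gather(..., return_exceptions=True)` stores exceptions in the result list rather than raising them, so a surrounding try/except never fires unless each result is inspected. An illustrative analog of the fix (not the Seastar code):

```python
import asyncio

class InUse(Exception):
    """Stand-in for the in_use exception raised when a port is taken."""

async def read_stream(fail: bool) -> str:
    if fail:
        raise InUse("port already in use")
    return "ok"

async def detect_in_use() -> bool:
    # Like Seastar's when_all, gather(return_exceptions=True) stores
    # exceptions inside the returned results instead of throwing, so a
    # try/except around this await alone would never catch InUse.
    results = await asyncio.gather(
        read_stream(False), read_stream(True), return_exceptions=True
    )
    # Inspect each result individually to detect InUse from either stream.
    return any(isinstance(r, InUse) for r in results)
```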

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1216

Closes scylladb/scylladb#29219
2026-03-25 11:45:53 +02:00
Artsiom Mishuta
cd1679934c test/pylib: use exponential backoff in wait_for()
Change wait_for() defaults from period=1s/no backoff to period=0.1s
with 1.5x backoff capped at 1.0s. This catches fast conditions in
100ms instead of 1000ms, benefiting ~100 call sites automatically.

Add completion logging with elapsed time and iteration count.

Tested locally with test/cluster/test_fencing.py::test_fence_hints (dev mode),
log output:

  wait_for(at_least_one_hint_failed) completed in 0.83s (4 iterations)
  wait_for(exactly_one_hint_sent) completed in 1.34s (5 iterations)
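The new defaults can be sketched as a simple backoff loop (a synchronous sketch with an assumed signature, not the actual pylib helper, which is async):

```python
import time

def wait_for(condition, deadline, period=0.1, backoff=1.5, max_period=1.0):
    """Poll `condition` until it returns a truthy value or `deadline`
    (a time.monotonic() timestamp) passes.

    Starts with a short period and grows it exponentially, capped at
    `max_period`, so fast conditions are caught within ~100ms while
    slow ones don't keep busy-polling at the initial rate.
    """
    start = time.monotonic()
    iterations = 0
    while True:
        iterations += 1
        value = condition()
        if value:
            elapsed = time.monotonic() - start
            print(f"wait_for completed in {elapsed:.2f}s "
                  f"({iterations} iterations)")
            return value
        if time.monotonic() > deadline:
            raise TimeoutError("wait_for: deadline exceeded")
        time.sleep(period)
        period = min(period * backoff, max_period)
```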

Fixes SCYLLADB-738

Closes scylladb/scylladb#29173
2026-03-24 23:49:49 +02:00
Botond Dénes
d52fbf7ada Merge 'test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces' from Dawid Mędrek
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up was done before that, it won't include the keyspace. This will
lead to a test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137

Backport: The test is present on all supported branches, and so we
          should backport these changes to them.

Closes scylladb/scylladb#29218

* github.com:scylladb/scylladb:
  test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
  test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
2026-03-24 21:09:19 +02:00
Patryk Jędrzejczak
141aa2d696 Merge 'test/cluster/test_incremental_repair.py: fix typo + enable compaction DEBUG logs' from Botond Dénes
This PR contains two small improvements to `test_incremental_repair.py`
motivated by the sporadic failure of
`test_tablet_incremental_repair_and_scrubsstables_abort`.

The test fails with `assert 3 == 2` on `len(sst_add)` in the second
repair round. The extra SSTable has `repaired_at=0`, meaning scrub
unexpectedly produced more unrepaired SSTables than anticipated. Since
scrub (and compaction in general) logs at DEBUG level and the test did
not enable debug logging, the existing logs do not contain enough
information to determine the root cause.

**Commit 1** fixes a long-standing typo in the helper function name
(`preapre` -> `prepare`).

**Commit 2** enables `compaction=debug` for the Scylla nodes started by
`do_tablet_incremental_repair_and_ops`, which covers all
`test_tablet_incremental_repair_and_*` variants. This will capture full
compaction/scrub activity on the next reproduction, making the failure
diagnosable.

Refs: SCYLLADB-1086

Backport: test improvement, no backport

Closes scylladb/scylladb#29175

* https://github.com/scylladb/scylladb:
  test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
  test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
2026-03-24 16:27:01 +01:00
Pavel Emelyanov
2d8540f1ee transport: fix process_startup cert-auth path missing connection-ready setup
When authenticate() returns a user directly (certificate-based auth,
introduced in 20e9619bb1), process_startup was missing the same
post-authentication bookkeeping that the no-auth and SASL paths perform:

  - update_scheduling_group(): without it, the connection runs under the
    default scheduling group instead of the one mapped to the user's
    service level.

  - _authenticating = false / _ready = true: without them,
    system.clients reports connection_stage = AUTHENTICATING forever
    instead of READY.

  - on_connection_ready(): without it, the connection never releases its
    slot in the uninitialized-connections concurrency semaphore (acquired
    at connection creation), leaking one unit per cert-authenticated
    connection for the lifetime of the connection.

The omission was introduced when on_connection_ready() was added to the
else and SASL branches in 474e84199c but the cert-auth branch was missed.

Fixes: 20e9619bb1 ("auth: support certificate-based authentication")

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-24 18:02:46 +03:00
Pavel Emelyanov
da6fe14035 transport: test that connection_stage is READY after auth via all process_startup paths
The cert-auth path in process_startup (introduced in 20e9619bb1) was
missing _ready = true, _authenticating = false, update_scheduling_group()
and on_connection_ready(). The result is that connections authenticated
via certificate show connection_stage = AUTHENTICATING in system.clients
forever, run under the wrong service-level scheduling group, and hold
the uninitialized-connections semaphore slot for the lifetime of the
connection.

Add a parametrized cluster test that verifies all three process_startup
branches result in connection_stage = READY:
  - allow_all: AllowAllAuthenticator (no-auth path)
  - password:  PasswordAuthenticator (SASL/process_auth_response path)
  - cert_bypass: CertificateAuthenticator with transport_early_auth_bypass
                 error injection (cert-auth path -- the buggy one)

The injection is added to certificate_authenticator::authenticate() so
tests can bypass actual TLS certificate parsing while still exercising
the cert-auth code path in process_startup.

The cert_bypass case is marked xfail until the bug is fixed.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-24 18:01:28 +03:00
Benny Halevy
1a7b013377 test: add test_sstable_clone_preserves_staging_state 2026-03-24 16:48:01 +02:00
Benny Halevy
22f2010477 test: derive sstable state from directory in test_env::make_sstable
Instead of always passing sstable_state::normal, infer the state from
the last component of the directory path by comparing against the known
state subdirectory constants (staging_dir, upload_dir, quarantine_dir).
Any unrecognized path component (the common case for normal-state
sstables) maps to sstable_state::normal.

When a non-normal state is detected, strip the state subdirectory from
dir so that the base table directory is passed to storage.
2026-03-24 16:48:01 +02:00
Ernest Zaslavsky
c670183be8 cmake: fix precompiled header (PCH) creation
Two issues prevented the precompiled header from compiling
successfully when using CMake directly (rather than the
configure.py + ninja build system):

a) Propagate build flags to Rust binding targets reusing the
   PCH. The wasmtime_bindings and inc targets reuse the PCH
   from scylla-precompiled-header, which is compiled with
   Seastar's flags (including sanitizer flags in
   Debug/Sanitize modes). Without matching compile options,
   the compiler rejects the PCH due to flag mismatch (e.g.,
   -fsanitize=address). Link these targets against
   Seastar::seastar to inherit the required compile options.

Closes scylladb/scylladb#28941
2026-03-24 15:53:40 +02:00
Dawid Mędrek
e639dcda0b test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

It was wrong. We cannot predict when the Raft mutation corresponding to
the newly created keyspace will arrive at the node or when it will be
processed. If the check of the RF-rack-valid keyspaces we perform at
start-up was done before that, it won't include the keyspace. This will
lead to a test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137
2026-03-24 14:27:36 +01:00
Patryk Jędrzejczak
503a6e2d7e locator: everywhere_replication_strategy: fix sanity_check_read_replicas when read_new is true
ERMs created in `calculate_vnode_effective_replication_map` have RF computed based
on the old token metadata during a topology change. The reading replicas, however,
are computed based on the new token metadata (`target_token_metadata`) when
`read_new` is true. That can create a mismatch for EverywhereStrategy during some
topology changes - RF can be equal to the number of reading replicas +-1. During
bootstrap, this can cause the
`everywhere_replication_strategy::sanity_check_read_replicas` check to fail in
debug mode.

We fix the check in this commit by allowing one more reading replica when
`read_new` is true.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1147

Closes scylladb/scylladb#29150
2026-03-24 13:43:39 +01:00
Jenkins Promoter
0f02c0d6fa Update pgo profiles - x86_64 2026-03-24 14:11:38 +02:00
Dawid Mędrek
4fead4baae test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py
One of the tests,
test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces,
didn't have the marker. Let's add it now.
2026-03-24 12:52:00 +01:00
Botond Dénes
ffd58ca1f0 Merge 'test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints' from Dawid Mędrek
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.

As a bonus, we improve the test in general too. We explicitly express
the preconditions we rely on, as well as bump the log level. If the
test fails in the future, it might be very difficult to debug it
without this additional information.

Fixes SCYLLADB-1133

Backport: The test is present on all supported branches. To avoid
          running into more failures, we should backport these changes
          to them.

Closes scylladb/scylladb#29191

* github.com:scylladb/scylladb:
  test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
  test: cluster: Introduce auxiliary function keyspace_has_tablets
  test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
2026-03-24 13:39:56 +02:00
Calle Wilund
f1b3bff4a5 dockerized_service: Convert log reader to pipes and push to test log
Refs: SCYLLADB-1106

Ensures any stderr logs from mock services will echo to the test log
regardless of the log file we write. To help debug failed CI.
2026-03-24 12:35:42 +01:00
Calle Wilund
38aaed1ed4 test::cluster::conftest::GSServer: Fix unpublish for when publish was not called
Use checked dict access to check the set vars.

Fixes: SCYLLADB-1106
2026-03-24 12:33:56 +01:00
Calle Wilund
b382f3593c scylla_cluster: Use thread safe future signalling 2026-03-24 12:33:56 +01:00
Nikos Dragazis
d09196068c api: Add REST endpoint for migration finalization
The endpoint is the following:

    POST /storage_service/vnode_tablet_migrations/keyspaces/{keyspace}/finalization

When called, it issues a `finalize_migration` topology request and waits
for its completion.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 13:21:12 +02:00
Nikos Dragazis
c88ddecfca topology_coordinator: Add finalize_migration request
Vnodes-to-tablets migration needs a finalization step to finish or
rollback the migration. Finishing the migration involves switching the
keyspace schema to tablets and clearing the `intended_storage_mode` from
system.topology. Rolling back the migration involves deleting the tablet
maps and clearing the `intended_storage_mode`.

The finalization needs to be done as a topology request so that it is
mutually exclusive with other operations such as repair and TRUNCATE.

This patch introduces the `finalize_migration` global topology request
for this purpose. The request takes a keyspace name as an argument.
The direction of the finalization (i.e., forward path vs rollback) is
inferred from the `intended_storage_mode` of all nodes (not ideal,
should be made explicit).

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 13:20:39 +02:00
Nikos Dragazis
0e1e6ebdc5 database: Construct migrating tables with tablet ERMs
Extend `database::add_column_family()` with a `storage_mode` argument.
If the table is under vnodes-to-tablets migration and the storage mode
is "tablets", create a tablet ERM.

Make the distributed loader determine the storage mode from topology
(`intended_storage_mode` column in system.topology).

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 13:20:39 +02:00
Nikos Dragazis
2f93ab281b api: Add REST endpoint for upgrading nodes to tablets
The endpoint is the following:

    POST /storage_service/vnode_tablet_migrations/node/storage_mode?intended_mode={tablets,vnodes}

This endpoint is part of the vnodes-to-tablets migration process and
controls a node's intended_storage_mode in system.topology. The storage
mode represents the node-local data distribution model, i.e., how data
are organized across shards. The node will apply the intended storage
mode to migrating tables upon next restart by resharding their SSTables
(either on vnode boundaries if intended_mode=tablets, or with the static
sharder if intended_mode=vnodes).

Note that this endpoint controls the intended_storage_mode of the local
node only. This has the nice benefit that once the API call returns, the
change has not only been committed to group0 but also applied to the
local node's state machine. This guarantees that the change is part of
the node's local copy upon next restart; no additional read barrier is
needed.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 13:20:35 +02:00
Nikos Dragazis
c4c3a95863 api: Add REST endpoint for starting vnodes-to-tablets migration
The endpoint is the following:

    POST /storage_service/vnode_tablet_migrations/keyspaces/{keyspace}

Its purpose is to start the migration of a whole keyspace from vnodes to
tablets.

When called, Scylla will synchronously create a tablet map for each
table in the specified keyspace. The tablet maps of all tables are
identical and they mirror the vnode layout; they contain one tablet per
vnode and each tablet uses the same replica hosts and token boundaries
as the corresponding vnode.

The only difference from vnodes lies in the sharding approach. Tablets
are assigned to a single shard - using a round-robin strategy in this
patch - whereas vnodes are distributed evenly across all shards. If the
tablet count per shard is low and tablet sizes are uneven, or some
shards have more tablets than others, performance may degrade during the
migration process. For example, a cluster with i8g.48xlarge (192 vCPUs),
256 vnodes per node and RF=3 will have 256 * 3 / 192 vCPUs = 4 tablet
replicas per shard during the migration. One additional tablet or a
double-sized tablet would cause 25% overcommit.
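The overcommit arithmetic from the example above, restated as a quick check:

```python
# Example cluster from the commit message: i8g.48xlarge nodes.
vcpus = 192
vnodes_per_node = 256
rf = 3

# One tablet per vnode, RF replicas each, spread round-robin over shards.
tablet_replicas_per_shard = vnodes_per_node * rf // vcpus

# With so few tablet replicas per shard, a single extra tablet (or one
# double-sized tablet) on a shard overcommits it by 1/4 = 25%.
overcommit = 1 / tablet_replicas_per_shard
```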

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 13:19:47 +02:00
Andrei Chekun
f6fd3bbea0 test.py: reduce timeout for one test
Reduce the timeout for one test to 60 minutes. The longest test we had
so far was ~10-15 minutes. So reducing this timeout is pretty safe and
should help with hanging tests.

Closes scylladb/scylladb#29212
2026-03-24 12:50:10 +02:00
Benny Halevy
ca9ff134b8 sstables: log debug message in filesystem_storage::clone 2026-03-24 12:26:03 +02:00
Nikos Dragazis
b7f4ae8218 topology_state_machine: Add intended_storage_mode to system.topology
Part of the vnodes-to-tablets migration is to reshard the SSTables of
each node on vnode boundaries. Resharding is a heavy operation that
runs on startup while the node is offline. Since nodes can restart
for unexpected reasons, we need a flag to do it in a controllable way.

We also need the ability to roll back the migration, which requires
resharding in the opposite direction. This means a node must be aware of
the intended migration direction.

To address both requirements, this patch introduces a new column,
intended_storage_mode, in system.topology. A non-null value indicates
that a node should perform a migration and specifies the migration
direction.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Nikos Dragazis
bc8109f1a4 distributed_loader: Wire vnode-based resharding into table populator
Make the table populator migration-aware. If a table is migrating to
tablets, switch from normal resharding to vnode-based resharding.

Vnode-based resharding requires passing a vector of "owned ranges" upon
which resharding will segregate the SSTables. Compute it from the tablet
map. We could also compute them from the vnodes, since tablets are
identical to vnodes during the migration, but in the future we may
switch to a different model (multiple tablets per vnode).

Let the distributed loader decide if a table is migrating or not and
communicate that to the table populator. A table is migrating if the
keyspace replication strategy uses vnodes but the table replication
strategy uses tablets.

Currently, tables cannot enter this "migrating" state; support for this
will be introduced in the next patches.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Nikos Dragazis
63399951df replica: Pick any compaction group for resharding
In the previous patch, reshard compaction was extended with a special
operation mode where SSTables from vnode-based tables are segregated on
vnode boundaries and not with the static sharder. This will later be
wired into vnodes-to-tablets migration.

The problem is that resharding requires a compaction group. With a
vnode-based table, there is only one compaction group per shard, and
this is what the current code utilizes
(`try_get_compaction_group_view_with_static_sharding()`). But the new
operation mode will apply to migrating tables, which use a
`tablet_storage_group_manager`, which creates one compaction group for
each tablet. Some compaction group needs to be selected.

Pick any compaction group that is available on the current shard.
Reshard compaction is an operation that happens early in the startup
process; compaction groups do not own any SSTables yet, so all
compaction groups are equivalent.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Benny Halevy
d1c6141407 compaction: resharding_compaction: add vnodes_resharding option
In this mode, the output sstables generated by resharding
compaction are segregated by token range, based on the keyspace
vnode-based owned token ranges vector.

A basic unit test was also added to sstable_directory_test.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-24 11:06:38 +02:00
Nikos Dragazis
d153a95943 storage_service: Preserve ERM flavor of migrating tables
When a table is migrating from vnodes to tablets, the cluster is in a
mixed state where some nodes use vnode ERMs and others use tablet ERMs.
The ERM flavor is a node-local property that expresses the node's
storage organization.

Preserve the flavor across token metadata changes. The flavor needs to
be on par with storage, but the storage can change only on startup, as
it requires resharding all SSTables to conform with the flavor.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Nikos Dragazis
4a3e26d5e3 tablet_allocator: Exclude migrating tables from load balancing
The tablet load balancer operates on all tablet-based tables that appear
in the tablet metadata.

With the introduction of the vnodes-to-tablets migration procedure later
in this series, migrating tables will also appear in the tablet
metadata, but they need to be treated as vnode tables until migration is
finished. This patch excludes such tables from load balancing.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Nikos Dragazis
3e2dc078c9 feature_service: Add vnodes_to_tablets_migrations feature
Vnodes-to-tablets migrations require cluster-level support: the REST API
and the group0 state need to be supported by all nodes.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2026-03-24 11:06:38 +02:00
Marcin Maliszkiewicz
66be0f4577 Merge 'test: cluster: audit test suite optimization' from Dario Mirovic
Migrate audit tests from test/cluster/dtest to test/cluster. Optimize their execution time through cluster reuse.

The audit test suite is heavy. There are more than 70 test executions. Environment preparation is a significant part of each test case execution time.

This PR:
1. Copies audit tests from test/cluster/dtest to test/cluster, refactoring and enabling them
2. Groups test functions by non-live cluster configuration variations to enable cluster reuse between them
    - Execution time reduced from 4m 29s to 2m 47s, a ~38% decrease
3. Removes the old audit tests from test/cluster/dtest

Includes two supporting changes:
- Allow specifying `AuthProvider` in `ManagerClient.get_cql_exclusive`
- Fix server log file handling for clean clusters

Refs [SCYLLADB-573](https://scylladb.atlassian.net/browse/SCYLLADB-573)

This PR is an improvement and does not require a backport.

[SCYLLADB-573]: https://scylladb.atlassian.net/browse/SCYLLADB-573?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#28650

* github.com:scylladb/scylladb:
  test: cluster: fix log clear race condition in test_audit.py
  test: pylib: shut down exclusive cql connections in ManagerClient
  test: cluster: fix multinode audit entry comparison in test_audit.py
  test: cluster: dtest: remove old audit tests
  test: cluster: group migrated audit tests for cluster reuse
  test: cluster: enable migrated audit tests and make them work
  test: pylib: manager_client: specify AuthProvider in get_cql_exclusive
  test: pylib: scylla cluster after_test log fix
  test: audit: copy audit test from dtest
2026-03-24 09:29:52 +01:00
Dario Mirovic
120f381a9d pgo: fix maintenance socket path too long
The maintenance socket path used for PGO is in the node workdir.
When the node workdir path is too long, the maintenance socket path
(workdir/cql.m) can exceed the Unix domain socket sun_path limit,
failing the PGO training pipeline.

To prevent this:
- pass an explicit --maintenance-socket override
  pointing to a short deterministic path in /tmp derived from the MD5
  hash of the workdir maintenance socket path
- update maintenance_socket_path to return the matching short path
  so that exec_cql.py connects to the right socket

The short path socket files are cleaned up after the cluster stops.

The path uses the MD5 hash of the workdir path, so it is deterministic.
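As a rough illustration (the helper name and the /tmp prefix here are assumptions, not the actual pylib code), the derivation might look like:

```python
import hashlib

# sun_path for AF_UNIX sockets is limited (about 108 bytes on Linux),
# so the derived path must stay well under that.
SUN_PATH_MAX = 108

def short_socket_path(workdir_socket_path: str) -> str:
    # Hypothetical sketch: hash the long workdir socket path with MD5 so
    # the short /tmp path is deterministic for a given workdir.
    digest = hashlib.md5(workdir_socket_path.encode()).hexdigest()
    return f"/tmp/cql-{digest[:16]}.m"
```

Because the hash is taken over the full workdir path, two clusters with different workdirs get distinct socket paths without any coordination.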

Fixes SCYLLADB-1070

Closes scylladb/scylladb#29149
2026-03-24 09:17:10 +01:00
Pavel Emelyanov
f112e42ddd raft: Fix split mutations freeze
Commit faa0ee9844 accidentally broke the way the split snapshot mutation
was frozen -- instead of appending the sub-mutation `m`, the commit kept
the old variable name `mut`, which in the new code corresponds to the
"old" non-split mutation.

Fixes #29051

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29052
2026-03-24 08:53:50 +02:00
Botond Dénes
56c375b1f3 Merge 'table: don't close a disengaged querier in query()' from Pavel Emelyanov
There's a flaw in table::query() -- calling querier_opt->close() can dereference a disengaged std::optional. The fix is pretty simple. Once fixed, there are two if-s checking whether querier_opt is engaged that are worth merging.

The problem doesn't really show itself because table::query() is not called with a null saved_querier, so the de-facto condition is always correct. However, it's better to be on the safe side.

The problem doesn't show itself for real, so it's not worth backporting

Closes scylladb/scylladb#29142

* github.com:scylladb/scylladb:
  table: merge adjacent querier_opt checks in query()
  table: don't close a disengaged querier in query()
2026-03-24 08:47:35 +02:00
Yaniv Kaul
e59a21752d .github/workflows/trigger_jenkins.yaml: add workflow permissions
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/147.

To fix the problem, add an explicit `permissions:` block to the workflow
(either at the top level or inside the `trigger-jenkins` job) that
constrains the `GITHUB_TOKEN` to the minimal necessary privileges. This
codifies least-privilege in the workflow itself instead of relying on
repository or organization defaults.

The best minimal, non‑breaking change is to define a root‑level
`permissions:` block with read‑only contents access because the job does
not perform any write operations to the repository, nor does it interact
with issues, pull requests, or other GitHub resources. A conservative,
widely accepted baseline is `contents: read`. If later steps require more
permissions, they can be added explicitly, but for this snippet, no such
need is visible.

Concretely, in `.github/workflows/trigger_jenkins.yaml`, insert:

```yaml
permissions:
  contents: read
```

between the `name:` block and the `on:` block (e.g., after line 2).
No additional methods, imports, or definitions are needed since this is
a pure YAML configuration change and does not alter runtime behavior of
the existing shell steps.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27815
2026-03-24 08:40:30 +02:00
Yaniv Kaul
85a531819b .github/workflows/trigger-scylla-ci.yaml: add permissions to workflow
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/169.

In general, the fix is to add an explicit `permissions:` block to the
workflow (at the root level or per job) so that the `GITHUB_TOKEN` has
only the minimal scopes needed. Since this job only reads event data and
uses secrets to talk to Jenkins, we can restrict `GITHUB_TOKEN` to
read‑only repository contents.

The single best fix here is to add a top‑level `permissions:` block
right under the `name:` (and before `on:`) in
`.github/workflows/trigger-scylla-ci.yaml`, setting `contents: read`.
This applies to all jobs in the workflow, including `trigger-jenkins`,
and does not alter any existing steps or logic. No additional imports or
methods are needed, as this is purely a YAML configuration change for
GitHub Actions.

Concretely, edit `.github/workflows/trigger-scylla-ci.yaml` to insert:

```yaml
permissions:
  contents: read
```

after line 1. No other lines in the file need to change.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27812
2026-03-24 08:37:49 +02:00
Dawid Mędrek
148217bed6 test: cluster: Increase log level in test_write_cl_any_to_dead_node_generates_hints
We increase the log level of `hints_manager` to TRACE in the test.
If it fails, it may be incredibly difficult to debug it without any
additional information.
2026-03-23 19:19:17 +01:00
Dawid Mędrek
2b472fe7fd test: cluster: Await all mutations concurrently in test_write_cl_any_to_dead_node_generates_hints 2026-03-23 19:19:17 +01:00
Dawid Mędrek
ae12c712ce test: cluster: Specify min_tablet_count in test_write_cl_any_to_dead_node_generates_hints
The test relies on the assumption that mutations will be distributed
more or less uniformly over the nodes. Although in practice this should
not happen, it's theoretically possible that there's only one tablet
allocated for the table.

To clearly indicate this precondition, we explicitly set the property
`min_tablet_count` when creating the table. This way, we have a guarantee
that the table has multiple tablets. The load balancer should now take
care of distributing them over the nodes equally. Thanks to that,
`servers[1]` will have some tablets, and so it'll be the target for some
of the mutations we perform.
2026-03-23 19:19:14 +01:00
Dawid Mędrek
dd446aa442 test: cluster: Use new_test_table in test_write_cl_any_to_dead_node_generates_hints
The context manager is the de-facto standard in the test suite. It will
also allow a prettier way to conditionally enable per-table tablet
options in the following commit.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
dea79b09a9 test: cluster: Introduce auxiliary function keyspace_has_tablets
The function is adapted from its counterpart in the cqlpy test suite:
cqlpy/util.py::keyspace_has_tablets. We will use it in a commit in this
series to conditionally set tablet properties when creating a table.
It might also be useful in general.
2026-03-23 19:07:01 +01:00
Dawid Mędrek
3d04fd1d13 test: cluster: Deflake test_write_cl_any_to_dead_node_generates_hints
Before these changes, we would send mutations to the node and
immediately query the metrics to see how many hints had been written.
However, that could lead to random failures of the test: even if the
mutations have finished executing, hints are stored asynchronously, so
we don't have a guarantee they have already been processed.

To prevent such failures, we rewrite the check: we will perform multiple
checks against the metrics until we have confirmed that the hints have
indeed been written or we hit the timeout.

We're generous with the timeout: we give the test 60 seconds. That
should be enough time to avoid flakiness even on super slow machines,
and if the test does fail, we will know something is really wrong.
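The check-until-timeout pattern can be sketched as follows (an illustrative helper, not the actual test code):

```python
import time

def wait_for(condition, timeout: float = 60.0, period: float = 0.5) -> bool:
    # Poll condition() until it returns a truthy value or the timeout
    # elapses. Returns True on success, False if the deadline passed.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition():
            return True
        time.sleep(period)
    return False
```

The test would poll the hint metrics with such a loop instead of asserting on a single immediate read, tolerating the asynchronous hint-store path.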

Fixes SCYLLADB-1133
2026-03-23 19:06:57 +01:00
Piotr Dulikowski
63067f594d strong_consistency: fake taking and dropping snapshots
Snapshots are not implemented yet for strong consistency - attempting to
take, transfer or drop a snapshot results in an exception. However, the
logic of our state machine forces a snapshot every
raft::server::configuration::snapshot_threshold log entries, even if
there are no lagging replicas.
where snapshots were being taken even though the cluster was not under
any disruption, and this is one of the possible causes.

It turns out that we can safely allow for taking snapshots right now -
we can just implement it as a no-op and return a random UUID.
Conversely, dropping a snapshot can also be a no-op. This is safe
because snapshot transfer still throws an exception - as long as the
taken/recovered snapshots are never attempted to be transferred.
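A minimal sketch of the idea, with hypothetical class and method names rather than Scylla's actual raft state machine interface:

```python
import uuid

class NoopSnapshotStateMachine:
    # Sketch: taking a snapshot is a no-op that returns a fresh random
    # id, and dropping one is also a no-op. This stays safe because
    # transferring a snapshot still raises, and the taken/recovered
    # snapshots are never actually transferred.
    def take_snapshot(self) -> uuid.UUID:
        return uuid.uuid4()

    def drop_snapshot(self, snapshot_id: uuid.UUID) -> None:
        pass

    def transfer_snapshot(self, snapshot_id: uuid.UUID) -> None:
        raise NotImplementedError("snapshot transfer not implemented")
```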
2026-03-23 17:03:36 +01:00
Piotr Dulikowski
dd1d3dd1ee strong_consistency: adjust limits for snapshots
Raft snapshots are not implemented yet for strong consistency. Adjust
the current raft group config to make them much less likely to occur:

- snapshot_threshold config option decides how many log entries need to
  be applied after the last snapshot before a new one is taken. Set it
  to the maximum value for size_t in order to effectively disable it.
- snapshot_threshold_log_size defines a threshold for the log memory
  usage over which a snapshot is created. Increase it from the default
  2MB to 10MB.
- max_log_size defines the threshold for the log memory usage over which
  requests stop being admitted until the log is shrunk back by a
  snapshot. Set it to 20MB, as this option is recommended to be at least
  twice as much as snapshot_threshold_log_size.

Refs: SCYLLADB-1115
2026-03-23 17:03:36 +01:00
Botond Dénes
772b32d9f7 test/scylla_gdb: fix flakiness by preparing objects at test time
Fixtures previously ran GDB once (module scope) to find live objects
(sstables, tasks, schemas) and stored their addresses. Tests then
reused those addresses in separate GDB invocations. Sometimes these
addresses would become stale and the test would step on use-after-free
(e.g. sstables compacted away between invocations).

Fix by dropping the fixtures. The helper functions used by the fixtures
to obtain the required objects are converted to gdb convenience
functions, which can be used in the same expression as the test command
invocation. Thus, the object is acquired on-demand at the moment it is
used, so it is guaranteed to be fresh and relevant.

Fixes: SCYLLADB-1020

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#28999
2026-03-23 16:54:03 +02:00
Piotr Dulikowski
60fb5270a9 logstor: fix fmt::format use with std::filesystem::path
The version of fmt installed on my machine refuses to work with
`std::filesystem::path` directly. Add `.string()` calls in places that
attempt to print paths directly in order to make them work.

Closes scylladb/scylladb#29148
2026-03-23 15:15:52 +01:00
Pavel Emelyanov
3b9398dfc8 Merge 'encryption: fix deadlock in encrypted_data_source::get()' from Ernest Zaslavsky
When encrypted_data_source::get() caches a trailing block in _next, the next call takes it directly — bypassing input_stream::read(), which checks _eof. It then calls input_stream::read_exactly() on the already-drained stream. Unlike read(), read_up_to(), and consume(), read_exactly() does not check _eof when the buffer is empty, so it calls _fd.get() on a source that already returned EOS.

In production this manifested as stuck encrypted SSTable component downloads during tablet restore: the underlying chunked_download_source hung forever on the post-EOS get(), causing 4 tablets to never complete. The stuck files were always block-aligned sizes (8k, 12k) where _next gets populated and the source is fully consumed in the same call.

Fix by checking _input.eof() before calling read_exactly(). When the stream already reached EOF, buf2 is known to be empty, so the call is skipped entirely.
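The shape of the fix can be illustrated with a toy Python stream (the real code is seastar's C++ input_stream; the names here are stand-ins):

```python
class PostEOSError(Exception):
    """Raised when the underlying source is read after end-of-stream."""

class ToyStream:
    # Minimal stand-in for an input_stream over a strict source that
    # fails on any further read after it has returned end-of-stream.
    def __init__(self, data: bytes):
        self._data = data
        self._eof = False

    def eof(self) -> bool:
        return self._eof

    def read_exactly(self, n: int) -> bytes:
        # Like the real read_exactly(), this does not check eof() when
        # the buffer is empty -- it goes to the source regardless.
        if self._eof:
            raise PostEOSError("source read after end-of-stream")
        buf, self._data = self._data[:n], self._data[n:]
        if not self._data:
            self._eof = True
        return buf

def read_trailing_block(stream: ToyStream, n: int) -> bytes:
    # The fix: check eof() first. Past EOF the result is known to be
    # empty, so read_exactly() is skipped entirely.
    if stream.eof():
        return b""
    return stream.read_exactly(n)
```

Without the eof() guard, the second read on a fully drained stream would hit the source again, which is the post-EOS get() that hung in production.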

A comprehensive test is added that uses a strict_memory_source which fails on post-EOS get(), reproducing the exact code path that caused the production deadlock.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1128

Backport to 2025.3/4 and 2026.1 is needed since it fixes a bug that may bite us in production, to be on the safe side

Closes scylladb/scylladb#29110

* github.com:scylladb/scylladb:
  encryption: fix deadlock in encrypted_data_source::get()
  test_lib: mark `limiting_data_source_impl` as not `final`
  Fix formatting after previous patch
  Fix indentation after previous patch
  test_lib: make limiting_data_source_impl available to tests
2026-03-23 17:12:44 +03:00
Pavel Emelyanov
57ef712243 test/backup: drop create_dataset helper
It has no more callers after the previous patch.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 17:01:20 +03:00
Pavel Emelyanov
2353091cbd test/backup: use new_test_keyspace in test_restore_primary_replica
Replace create_dataset + manual DROP/CREATE KEYSPACE with two sequential
new_test_keyspace context manager blocks, matching the pattern used by
do_test_streaming_scopes. The first block covers backup, the second
covers restore. Keyspace lifecycle is now automatic.

The streaming directions validation loop is moved outside of the second
context block, since it only parses logs and has no dependency on the
keyspace being alive.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 16:59:47 +03:00
Botond Dénes
f5438e0587 test/cluster/test_incremental_repair.py: enable compaction DEBUG logs in do_tablet_incremental_repair_and_ops
The test sporadically fails because scrub produces an unexpected number
of SSTables. Compaction logs are needed to diagnose why, but were not
captured since scrub runs at DEBUG level. Enable compaction=debug for
the servers started by do_tablet_incremental_repair_and_ops so the next
reproduction provides enough information to root-cause the issue.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:26 +02:00
Botond Dénes
f6ab576ed9 test/cluster/test_incremental_repair.py: fix typo preapre -> prepare
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 15:48:12 +02:00
Piotr Dulikowski
df68d0c0f7 directories: add missing seastar/util/closeable.hh include
Without this include the file would not compile on its own. The issue
was most likely masked by the use of precompiled headers in our CI.

Closes scylladb/scylladb#29170
2026-03-23 15:46:56 +03:00
Yaniv Michael Kaul
051107f5bc scylla-gdb: fix sstable-summary crash on ms-format sstables
The 'scylla sstable-summary' GDB command crashes with
'ValueError: Argument "count" should be greater than zero' when
inspecting ms-format (trie-based) sstables. This happens because
ms-format sstables don't populate the traditional summary structure,
leaving all fields zeroed out, which causes gdb.read_memory() to be
called with a zero count.

Fix by:
- Adding zero-length guards to sstring.to_hex() and sstring.as_bytes()
  to return early when the data length is zero, consistent with the
  existing guard in managed_bytes.get().
- Adding the same guard to scylla_sstable_summary.to_hex().
- Detecting ms-format sstables (version == 5) early in
  scylla_sstable_summary.invoke() and printing an informative message
  instead of attempting to read the unpopulated summary.

Fixes: SCYLLADB-1180

Closes scylladb/scylladb#29162
2026-03-23 12:44:47 +02:00
Calle Wilund
b36dc80835 scylla_cluster: Remove left-over debug printout 2026-03-23 11:07:59 +01:00
Piotr Szymaniak
c8e7e20c5c test/cluster: retry create_table on transient schema agreement timeout
In test_index_requires_rf_rack_valid_keyspace, the create_table call
for a plain tablet-based table can fail with 'Unable to reach schema
agreement' after the server's 10s timeout is exceeded. This happens
when schema gossip propagation across the 4-node cluster takes longer
than expected after a sequence of rapid schema changes earlier in the
test.

Add a retry (up to 2 attempts) on schema agreement errors for this
specific create_table call rather than increasing the server-side
timeout.
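A generic sketch of such a retry wrapper (a hypothetical helper; the test uses the suite's own retry logic):

```python
import time

def with_retries(fn, attempts=2, retry_on=("schema agreement",), delay=1.0):
    # Retry fn up to `attempts` times when the error message matches a
    # known-transient condition (e.g. 'Unable to reach schema
    # agreement'); re-raise any other error immediately.
    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except Exception as e:
            transient = any(s in str(e) for s in retry_on)
            if attempt == attempts or not transient:
                raise
            time.sleep(delay)
```

Scoping the retry to the error message keeps genuine failures loud while absorbing the occasional slow schema propagation.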

Fixes: SCYLLADB-1135

Closes scylladb/scylladb#29132
2026-03-23 10:45:30 +02:00
Yaniv Kaul
fb1f995d6b .github/workflows/backport-pr-fixes-validation.yaml: workflow does not contain permissions (Potential fix for code scanning alert no. 139)
Potential fix for https://github.com/scylladb/scylladb/security/code-scanning/139,

To fix the problem, explicitly restrict the `GITHUB_TOKEN` permissions
for this workflow/job so it has only what is needed. The script reads PR
data and repository info (which is covered by `contents: read`/default
read scopes) and posts a comment via `github.rest.issues.createComment`,
which requires `issues: write`. No other write scopes (e.g., `contents:
write`, `pull-requests: write`) are necessary.

The best fix without changing functionality is to add a `permissions`
block scoped to this job (or at the workflow root). Since we only see a
single job here, we’ll add it under `check-fixes-prefix`. Concretely, in
`.github/workflows/backport-pr-fixes-validation.yaml`, between the
`runs-on: ubuntu-latest` line (line 10) and `steps:` (line 11), add:

```yaml
    permissions:
      contents: read
      issues: write
```

This keeps the token minimally privileged while still allowing the script
to create issue/PR comments.

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

Closes scylladb/scylladb#27810
2026-03-23 10:30:01 +02:00
Piotr Smaron
32225797cd dtest: fix flaky test_writes_schema_recreated_while_node_down
`read_barrier(session2)` was supposed to ensure `node2` has caught up on schema
before a CL=ALL write. But `patient_cql_connection(node2)` creates a
cluster-aware driver session `(TokenAwarePolicy(DCAwareRoundRobinPolicy()))`
that can route the barrier CQL statement to any node — not necessarily `node2`.
If the barrier runs on `node1` or `node3` (which already have the new schema),
it's a no-op, and `node2` remains stale, thus the observed `WriteFailure`.
The fix is to switch to `patient_exclusive_cql_connection(node2)`,
which uses `WhiteListRoundRobinPolicy([node2_ip])` to pin all CQL to `node2`.
This is already the established pattern used by other tests in the same file.

Fixes: SCYLLADB-1139

No need to backport yet, appeared only on master.

Closes scylladb/scylladb#29151
2026-03-23 10:25:54 +02:00
Michał Chojnowski
f29525f3a6 test/boost/cache_algorithm_test: disable sstable compression to avoid giant index pages
The test intentionally creates huge index pages.
But since 5e7fb08bf3,
the index reader allocates a block of memory for a whole index page,
instead of incrementally allocating small pieces during index parsing.
This giant allocation causes the test to fail spuriously in CI sometimes.

Fix this by disabling sstable compression on the test table,
which puts a hard cap of 2000 keys per index page.

Fixes: SCYLLADB-1152

Closes scylladb/scylladb#29152
2026-03-23 09:57:11 +02:00
Raphael S. Carvalho
05b11a3b82 sstables_loader: use new sstable add path
Use add_new_sstable_and_update_cache() when attaching SSTables
downloaded by the node-scoped local loader.

This is the correct variant for new SSTables: it can unlink the
SSTable on failure to add it, and it can split the SSTable if a
tablet split is in progress. The older
add_sstable_and_update_cache() helper is intended for preexisting
SSTables that are already stable on disk.

Additionally, downloaded SSTables are now left unsealed (TemporaryTOC)
until they are successfully added to the table's SSTable set. The
download path (download_fully_contained_sstables) passes
leave_unsealed=true to create_stream_sink, and attach_sstable opens
the SSTable with unsealed_sstable=true and seals it only inside the
on_add callback — matching the pattern used by stream_blob.cc and
storage_service.cc for tablet streaming.

This prevents a data-resurrection hazard: previously, if the process
crashed between download and attach_sstable, or if attach_sstable
failed mid-loop, sealed (TOC) SSTables would remain in the table
directory and be reloaded by distributed_loader on restart. With
TemporaryTOC, sstable_directory automatically cleans them up on
restart instead.

Fixes  https://scylladb.atlassian.net/browse/SCYLLADB-1085.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#29072
2026-03-23 10:33:04 +03:00
Piotr Szymaniak
f511264831 alternator/test: fix test_ttl_with_load_and_decommission flaky Connection refused error
The native Scylla nodetool reports ECONNREFUSED as 'Connection refused',
not as 'ConnectException' (which is the Java nodetool format). Add
'Connection refused' to the valid_errors list so that transient
connection failures during concurrent decommission/bootstrap topology
changes are properly tolerated.

Fixes SCYLLADB-1167

Closes scylladb/scylladb#29156
2026-03-22 11:01:45 +02:00
Pavel Emelyanov
c114d1b82c api: Inline describe_ring JSON handling
There are two helpers for describe_ring endpoint. Both can be squashed
together for code brevity.

Also, while at it, add validation for the "keyspace" parameter, which
the endpoint previously did not check properly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:51:32 +03:00
Pavel Emelyanov
9a2e583f29 storage_service: Make describe_ring_for_table() take table_id
All callers already have it. It makes no difference for the method
itself with which table identifier to work, but will help to simplify
the flow in API handler (next patch)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:49:24 +03:00
Pavel Emelyanov
4bc8ec174c repair: Remove db/config.hh from repair/*.cc files
Now all the code uses repair_service::config and no longer needs global
config description.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-20 19:36:50 +03:00
Pavel Emelyanov
35f625e5c7 repair: Move repair_multishard_reader options onto repair_service::config
This actually uses two interconnected options:
repair_multishard_reader_buffer_hint_size and
repair_multishard_reader_enable_read_ahead.

Both are propagated through repair_service::config and pass their
values to repair_reader/make_reader at construction time.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:36:50 +03:00
Pavel Emelyanov
9bc0d27aae repair: Move critical_disk_utilization_level onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
80aa0fcdc2 repair: Move repair_partition_count_estimation_ratio onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
585cb0c718 repair: Move repair_hints_batchlog_flush_cache_time_in_ms onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
d8f7f86e10 repair: Move enable_small_table_optimization_for_rbno onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
38a23ff927 repair: Introduce repair_service::config
Most other services have their own configs; repair still uses the global
db::config.

Add an empty config struct to repair_service to carry db::config options
the repair service needs.

Subsequent patches will populate the struct with options.

The config is created in main.cc as sharded_parameter because all future
options are live-updateable and should capture their source from
db::config on the correct shard.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
7dce43363e table: merge adjacent querier_opt checks in query()
After the previous fix both guarding if-s start with 'if (querier_opt &&'.
Merge them into a single outer 'if (querier_opt)' block to avoid the
redundant check and make the structure easier to follow.

No functional change.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 14:48:08 +03:00
Piotr Dulikowski
cc695bc3f7 Merge 'vector_search: fix race condition on connection timeout' from Karol Nowacki
When a `with_connect` operation timed out, the underlying connection
attempt continued to run in the reactor. This could lead to a crash
if the connection was established/rejected after the client object had
already been destroyed. This issue was observed during the teardown
phase of an upcoming high-availability test case.

This commit fixes the race condition by ensuring the connection attempt
is properly canceled on timeout.

Additionally, the explicit TLS handshake previously forced during the
connection is now deferred to the first I/O operation, which is the
default and preferred behavior.

Fixes: SCYLLADB-832

Backports to 2026.1 and 2025.4 are required, as this issue also exists on those branches and is causing CI flakiness.

Closes scylladb/scylladb#29031

* github.com:scylladb/scylladb:
  vector_search: test: fix flaky test
  vector_search: fix race condition on connection timeout
2026-03-20 11:12:04 +01:00
Petr Gusev
4bfcd035ae test_fencing: add missing await-s
Fixes SCYLLADB-1099

Closes scylladb/scylladb#29133
2026-03-20 10:55:35 +01:00
Pavel Emelyanov
9c1c41df03 table: don't close a disengaged querier in query()
The condition guarding querier_opt->close() checked saved_querier first.
When saved_querier is null, the short-circuit makes the whole condition
true regardless of whether querier_opt is engaged. If partition_ranges
is empty, query_state::done() is true before the while-loop body ever
runs, so querier_opt is never created. Calling querier_opt->close()
then dereferences a disengaged std::optional, which is undefined
behaviour.

Fix by checking querier_opt first. This preserves all existing
semantics (close when not saving, or when saving wouldn't be useful)
while making the no-querier path safe.

Why this doesn't surface today: the sole production call site,
database::query(), never passes a null saved_querier in practice. The
API header documents nullptr as valid ("Pass nullptr when queriers are
not saved"), so the bug is real but latent.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 12:25:13 +03:00
Pavel Emelyanov
c4a0f6f2e6 object_store: Don't leave dangling objects by iterating moved-from names vector
The code in upload_file std::move()-s the vector of names into the
merge_objects() method, then iterates over this vector to delete
objects. The iteration is a no-op on the moved-from vector.

The fix is to make the merge_objects() helper take the vector of names
by const reference -- the method doesn't modify the names collection,
and the caller keeps it in stable storage.

Fixes #29060

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29061
2026-03-20 10:09:30 +02:00
Pavel Emelyanov
712ba5a31f utils: Use yielding directory_lister in owner verification
Switch directories::do_verify_owner_and_mode() from lister::scan_dir() to
utils::directory_lister while preserving the previous hidden-entry
behavior.

Make do_verify_subpath use lister::filter_type directly so the
verification helper can pass it straight into directory_lister, and keep
a single yielding iteration loop for directory traversal.

One scan_dir user fewer, towards removing scan_dir from the code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29064
2026-03-20 10:08:38 +02:00
Pavel Emelyanov
961fc9e041 s3: Don't rearm credential timers when credentials are not refreshed
The update_credentials_and_rearm() may get "empty" credentials from
_creds_provider_chain.get_aws_credentials() -- it doesn't throw, but
returns a default-initialized value. In that case expires_at will be
set to time_point::min, and it's probably not a good idea to arm the
refresh timer, and an even worse idea to subtract 1h from it.

Fixes #29056

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29057
2026-03-20 10:07:01 +02:00
Pavel Emelyanov
0a8dc4532b s3: Fix missing upload ID in copy_part trace log
The format string had two {} placeholders but three arguments, so the
_upload_id argument was skipped from formatting.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29053
2026-03-20 10:05:44 +02:00
Botond Dénes
bb5c328a16 Merge 'Squash two primary-replica restoration tests together' from Pavel Emelyanov
The test_restore_primary_replica_same_domain and test_restore_primary_replica_different_domain tests have very much in common. Previously each was also split into two, so we had four tests; now we have two that can also be squashed, and the lines-of-code savings are still worth it.

This is the continuation of #28569

Tests improvement, not backporting

Closes scylladb/scylladb#28994

* github.com:scylladb/scylladb:
  test: Replace a bunch of ternary operators with an if-else block
  test: Squash test_restore_primary_replica_same|different_domain tests
  test: Use the same regexp in test_restore_primary_replica_different|same_domain-s
2026-03-20 10:05:16 +02:00
Pavel Emelyanov
ea2a214959 test/backup: Use unique_name() for backup prefix instead of cf_dir
The do_test_backup_abort() fetched the node's workdir and resolved cf_dir
solely to construct a unique-ish backup prefix:

    prefix = f'{cf_dir}/backup'

The comment already acknowledged this was only "unique(ish)" — relying
on the UUID-derived cf_dir name as a uniqueness source is roundabout.
unique_name() is already imported and used for exactly this purpose
elsewhere in the file.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29030
2026-03-20 10:04:22 +02:00
Pavel Emelyanov
65032877d4 api: Move /storage_service/toppartitions from storage_service.cc to column_family.cc
The endpoint URL remains intact. Having it next to another toppartitions
endpoint (the /column_family/toppartitions one) is natural.

This endpoint only needs sharded<replica::database>&, grabs it from
http_context and doesn't use any other service. In column_family.cc the
database reference is already available as a parameter. One more user
of http_context.db is gone.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#28996
2026-03-20 09:52:33 +02:00
Botond Dénes
de0bdf1a65 Merge 'Decouple test_refresh_deletes_uploaded_sstables from backup test-suite' from Pavel Emelyanov
The test in question uses several helpers from the backup suite, but it doesn't really need them -- the operations it wants to perform can be done with standard pylib methods. "While at it", also remove some dangling, effectively unused local variables from this test (these were apparently left over from the backup tests this one was copied and reworked from)

Enhancing tests, not backporting

Closes scylladb/scylladb#29130

* github.com:scylladb/scylladb:
  test/refresh: Simplify refresh invocation
  test/refresh: Remove r_servers alias for servers
  test/refresh: Replace check_mutation_replicas with a plain CQL SELECT
  test/refresh: Inline keyspace/table/data setup in test_refresh_deletes_uploaded_sstables
  test/refresh: Prepare indentation for new_test_keyspace in test_refresh_deletes_uploaded_sstables
  test/refresh: Decouple test_refresh_deletes_uploaded_sstables from backup tests
  test/refresh: Remove unused wait_for_cql_and_get_hosts import
2026-03-20 09:29:15 +02:00
Botond Dénes
97430e2df5 Merge 'Fix object storage lister entries walking loop' from Pavel Emelyanov
Two issues were found in the lister returned by gs_client_wrapper::make_object_lister():
the lister can report EOF too early when a filter is active, and there is a potential vector out-of-bounds access.

Fixes #29058

The code appeared in 2026.1, worth fixing it there as well

Closes scylladb/scylladb#29059

* github.com:scylladb/scylladb:
  sstables: Fix object storage lister not resetting position in batch vector
  sstables: Fix object storage lister skipping entries when filter is active
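
The corrected walking loop can be illustrated with a minimal Python sketch (schematic names, not the actual gs_client_wrapper code): EOF must be decided on the unfiltered batch, and the read position must be reset for every new batch.

```python
def list_entries(fetch_batch, predicate):
    """Yield entries batch by batch, applying a client-side filter.

    fetch_batch() returns the next batch (a list), or an empty list at EOF.
    Two pitfalls the fixes address:
    - EOF is decided on the *unfiltered* batch: a batch whose entries are
      all filtered out is not end-of-stream.
    - The read position is reset for every new batch; otherwise we index
      out of bounds or skip entries.
    """
    while True:
        batch = fetch_batch()
        if not batch:          # EOF only when the source returns nothing
            return
        pos = 0                # reset position per batch
        while pos < len(batch):
            entry = batch[pos]
            pos += 1
            if predicate(entry):
                yield entry

# The second batch is fully filtered out, yet listing must continue past it.
batches = [["a1", "b1"], ["b2", "b3"], ["a2"], []]
it = iter(batches)
result = list(list_entries(lambda: next(it), lambda e: e.startswith("a")))
```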
2026-03-20 09:12:42 +02:00
Botond Dénes
5573c3b18e Merge 'tablets: Fix deadlock in background storage group merge fiber' from Tomasz Grabiec
When it deadlocks, groups stop merging and the compaction group merge
backlog runs away.

Also, graceful shutdown will be blocked on it.

Found by flaky unit test
test_merge_chooses_best_replica_with_odd_count, which timed-out in 1
in 100 runs.

Reason for deadlock:

When storage groups are merged, the main compaction group of the new
storage group takes a compaction lock, which is appended to
_compaction_reenablers_for_merging, and released when the merge
completion fiber is done with the whole batch.

If we accumulate more than one merge cycle for the fiber, a deadlock
occurs. The lock order will be as follows.

Initial state:

 cg0: main
 cg1: main
 cg2: main
 cg3: main

After 1st merge:

 cg0': main [locked], merging_groups=[cg0.main, cg1.main]
 cg1': main [locked], merging_groups=[cg2.main, cg3.main]

After 2nd merge:

 cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]

The merge completion fiber will try to stop cg0'.main, which will be
blocked on the compaction lock, which is held by the reenabler in
_compaction_reenablers_for_merging, hence the deadlock.
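
The circular wait can be modeled schematically (names are illustrative stand-ins for the actual C++ objects, not real code from the tree):

```python
# The merge-completion fiber must stop cg0'.main, but cg0'.main's
# compaction lock is only released when that same fiber finishes its
# batch: a classic circular wait.
waits_for = {
    "merge_fiber": "cg0_prime.main.lock",  # fiber waits on the lock
    "cg0_prime.main.lock": "merge_fiber",  # lock released only when fiber completes
}

def has_cycle(graph, start):
    # Follow the single-successor wait chain; revisiting a node means deadlock.
    seen, node = set(), start
    while node in graph:
        if node in seen:
            return True
        seen.add(node)
        node = graph[node]
    return False

deadlocked = has_cycle(waits_for, "merge_fiber")
```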

The fix is to wait for background merge to finish before we start the
next merge. It's achieved by holding old erm in the background merge,
and doing a topology barrier from the merge finalizing transition.

Background merge is supposed to be a relatively quick operation: it
stops compaction groups, so it may wait for active requests, but it
shouldn't prolong the barrier indefinitely.

Tablet tests which trigger merge need to be adjusted to call the
barrier, otherwise they will be vulnerable to the deadlock.

Fixes SCYLLADB-928

Backport to >= 2025.4 because it's the earliest vulnerable due to f9021777d8.

Closes scylladb/scylladb#29007

* github.com:scylladb/scylladb:
  tablets: Fix deadlock in background storage group merge fiber
  replica: table: Propagate old erm to storage group merge
  test: boost: tablets_test: Save tablet metadata when ACKing split resize decision
  storage_service: Extract local_topology_barrier()
2026-03-20 09:05:52 +02:00
Botond Dénes
34473302b0 Merge 'docs: document existing guardrails' from Andrzej Jackowski
This patch series introduces new documentation for existing guardrails.

Moreover:
 - Warning/failure messages of the recently added write CL guardrails (SCYLLADB-259) are rephrased, so all guardrails have similar messages.
 - Some new tests are added to help verify the correctness of the documentation and avoid situations where the documentation and implementation diverge.

Fixes: [SCYLLADB-257](https://scylladb.atlassian.net/browse/SCYLLADB-257)

No backport, just new docs and tests.


Closes scylladb/scylladb#29011

* github.com:scylladb/scylladb:
  test: add new guardrail tests matching documentation scenarios
  test: add metric assertions to guardrail replication strategy tests
  test: use regex matching in guardrail replication strategy tests
  test: extract ks_opts helper in test_guardrail_replication_strategy
  docs: document CQL guardrails
  cql: improve write consistency level guardrail messages
2026-03-20 08:56:00 +02:00
artem.penner
9898e5700b scylla-node-exporter: Add systemd collector to node exporter
This PR enables the node_exporter systemd collector and configures the unit whitelist to include scylla-server.service and systemd-coredump services.

**Motivation**: We currently lack visibility into system-level service states, which is critical for diagnosing stability issues.

This configuration enables two specific use cases:
- Detecting Coredump Loops: We encounter scenarios where ScyllaDB enters a restart loop. To pinpoint SIGSEGV (coredumps) as the root cause, we need to track when the systemd-coredump service becomes active, indicating a dump is being processed.
- Identifying Startup Failures: We need to detect when the scylla-server unit enters a failed state. This is essential for catching unrecoverable errors (e.g., corrupted commitlogs or configuration bugs) that prevent the server from starting.

example of promql queries:
- `node_systemd_unit_state{name=~"systemd-coredump@.*", state="active"} == 1`
- `node_systemd_unit_state{name="scylla-server.service", state="failed"} == 1`

Closes #28402
2026-03-20 08:39:56 +02:00
Andrzej Jackowski
10c4b9b5b0 test: verify signal() detects resource negative leak in rcs
reader_concurrency_semaphore::signal() guards against available
resources exceeding the initial limit after a signal, which would
indicate a bug such as double-returning resources. It reports the
issue via on_internal_error_noexcept and clamps resources back to
the initial values. However, before this commit there were no tests
that verified this behavior, so bugs like SCYLLADB-1014 went
undetected.

Add a test that artificially signals resources that were never
consumed and verifies that signal() detects the negative leak and
clamps available resources back to the initial limit.
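
A minimal Python model of the guard being tested (hypothetical class, not the actual C++ reader_concurrency_semaphore):

```python
# Returning more resources than were ever consumed ("negative leak",
# e.g. a double-return) must be detected and clamped.
class Semaphore:
    def __init__(self, initial):
        self.initial = initial
        self.available = initial
        self.errors = 0          # stands in for on_internal_error_noexcept

    def consume(self, n):
        self.available -= n

    def signal(self, n):
        self.available += n
        if self.available > self.initial:   # negative leak detected
            self.errors += 1
            self.available = self.initial   # clamp back to the initial limit

sem = Semaphore(10)
sem.consume(2)
sem.signal(2)   # legitimate return: no error
sem.signal(2)   # never consumed: detected and clamped
```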

Refs: SCYLLADB-1014
Fixes: SCYLLADB-1031

Closes scylladb/scylladb#28993
2026-03-20 09:21:20 +03:00
Botond Dénes
f9adbc7548 test/cqlpy/test_tombstone_limit.py: disable tombstone-gc for test table
Since 7564a56dc8, all tables default to
repair-mode tombstone-gc, which is identical to immediate-mode for RF=1
tables. Consequently, the tombstones written by the tests in this test
file are immediately collectible, and with some unlucky timing some of
them can be collected before the end of the test, failing the empty-page
prefix check because the empty-page prefix will be smaller than
expected based on the number of tombstones written.
Disable tombstone-gc to remove this source of flakiness.

Fixes: SCYLLADB-1062

Closes scylladb/scylladb#29077
2026-03-20 09:14:29 +03:00
Michał Chojnowski
6b18d95dec test: add a missing reconnect_driver in test_sstable_compression_dictionaries_upgrade.py
Need to work around https://github.com/scylladb/python-driver/issues/295,
lest a CQL query fail spuriously after the cluster restart.

Fixes: SCYLLADB-1114

Closes scylladb/scylladb#29118
2026-03-20 09:05:14 +03:00
Botond Dénes
89388510a0 test/cluster/test_data_resurrection_in_memtable.py: use explicit CL
The test has expectations w.r.t. which writes make it to which nodes:
* inserts make it to all nodes
* the delete makes it to all but one node (QUORUM)

However, this was not expressed with CL, and the default CL=ONE allowed
some nodes to miss the writes, violating the test's expectations on
what data is present on which nodes. This resulted in the test being
flaky and failing the data checks.

Use an explicit CL for the ingestion to prevent this.
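
The quorum arithmetic behind the fix can be sketched in a few lines (schematic, not driver code):

```python
# Minimum replica acknowledgements per consistency level, for RF replicas.
def min_acks(cl, rf):
    return {"ONE": 1, "QUORUM": rf // 2 + 1, "ALL": rf}[cl]

rf = 3
# With CL=ONE, a write acked by 1 replica plus a QUORUM read touching the
# other two can fail to overlap, so per-node data checks are nondeterministic.
assert min_acks("ONE", rf) + min_acks("QUORUM", rf) <= rf
# CL=ALL puts the inserts on every node; CL=QUORUM makes any two quorums
# overlap in at least one replica (2 + 2 > 3), so the delete is always seen.
assert min_acks("ALL", rf) == 3
assert min_acks("QUORUM", rf) * 2 > rf
```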

The improvements to the test introduced in
a8dd13731f were of great help in
investigating this: traces are now available, and the check happens after
the data is dumped to the logs.

Fixes: SCYLLADB-870
Fixes: SCYLLADB-812
Fixes: SCYLLADB-1102

Closes scylladb/scylladb#29128
2026-03-20 09:02:57 +03:00
Avi Kivity
6b259babeb Merge 'logstor: initial log-structured storage for key-value tables' from Michael Litvak
Introduce an initial and experimental implementation of an alternative log-structured storage engine for key-value tables.

Main flows and components:
* The storage is composed of 32MB files, each divided into segments of 128k. We sequentially write records that contain a mutation and additional metadata: records go to a buffer first and are then written to the active segment sequentially in 4k-sized blocks.
* The primary index in memory maps keys to their location on disk. It is a B-tree per-table that is ordered by tokens, similar to a memtable.
* On reads we calculate the key and look it up in the primary index, then read the mutation from disk with a single disk IO.
* On writes we write the record to a buffer, wait for it to be written to disk, then update the index with the new location, and free the previous record.
* We track the used space in each segment. When overwriting a record, we increase the free space counter for the segment of the previous record that becomes dead. We store the segments in a histogram by usage.
* The compaction process takes segments with low utilization, reads them and writes the live records to new segments, and frees the old segments.
* Segments are initially "mixed" - we write to the active segment records from all tables and all tablets. The "separator" process rewrites records from mixed segments into new segments that are organized by compaction groups (tablets), and frees the mixed segments. Each write is written to the active segment and to a separator buffer of the compaction group, which is eventually flushed to a new segment in the compaction group.
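
The per-segment space accounting described above can be sketched in Python (schematic, not the actual C++ code): overwriting a record adds dead space to the old record's segment, and compaction picks the segments with the least live data first.

```python
SEGMENT_SIZE = 128 * 1024

# Live bytes per segment; all three start fully live.
segments = {0: SEGMENT_SIZE, 1: SEGMENT_SIZE, 2: SEGMENT_SIZE}

def overwrite(seg, record_size):
    # The previous record becomes dead: its segment's live counter drops.
    segments[seg] -= record_size

for _ in range(900):
    overwrite(1, 100)   # segment 1 accumulates lots of dead space
overwrite(2, 100)       # segment 2 only a little

# Compaction candidates, lowest utilization first (the histogram's low end).
candidates = sorted(segments, key=segments.get)
```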

Currently this mode is experimental and requires an experimental flag to be enabled.
Some things that are not supported yet are strong consistency, tablet migration, tablet split/merge, big mutations, tombstone gc, ttl.

to use, add to config:
```
enable_logstor: true

experimental_features:
  - logstor
```

create a table:
```
CREATE TABLE ks.t(pk int PRIMARY KEY, a int, v text) WITH storage_engine = 'logstor';
```

INSERT, SELECT, and DELETE work as expected.
UPDATE is not supported yet.

no backport - new feature

Closes scylladb/scylladb#28706

* github.com:scylladb/scylladb:
  logstor: trigger separator flush for buffers that hold old segments
  docs/dev: add logstor documentation
  logstor: recover segments into compaction groups
  logstor: range read
  logstor: change index to btree by token per table
  logstor: move segments to replica::compaction_group
  db: update dirty mem limits dynamically
  logstor: track memory usage
  logstor: logstor stats api
  logstor: compaction buffer pool
  logstor: separator: flush buffer when full
  logstor: hold segment until index updates
  logstor: truncate table
  logstor: enable/disable compaction per table
  logstor: separator buffer pool
  test: logstor: add separator and compaction tests
  logstor: segment and separator barrier
  logstor: separator debt controller
  logstor: compaction controller
  logstor: recovery: recover mixed segments using separator
  logstor: wait for pending reads in compaction
  logstor: separator
  logstor: compaction groups
  logstor: cache files for read
  logstor: recovery: initial
  logstor: add segment generation
  logstor: reserve segments for compaction
  logstor: index: buckets
  logstor: add buffer header
  logstor: add group_id
  logstor: record generation
  logstor: generation utility
  logstor: use RIPEMD-160 for index key
  test: add test_logstor.py
  api: add logstor compaction trigger endpoint
  replica: add logstor to db
  schema: add logstor cf property
  logstor: initial commit
  db: disable tablet balancing with logstor
  db: add logstor experimental feature flag
2026-03-20 00:18:09 +02:00
Avi Kivity
062751fcec Merge 'db/config: enable ms sstable format by default' from Łukasz Paszkowski
Trie-based sstable indexes are supposed to be (hopefully) a better default than the old BIG indexes.
Make the new format the default for new clusters by naming ms in the default scylla.yaml.

New functionality. No backport needed.

This PR is basically Michał's https://github.com/scylladb/scylladb/pull/26377, plus Jakub's https://github.com/scylladb/scylladb/pull/27332 fixing `sstables_manager::get_highest_supported_format()`, and one test fix.

Closes scylladb/scylladb#28960

* github.com:scylladb/scylladb:
  db/config: announce ms format as highest supported
  db/config: enable `ms` sstable format by default
  cluster/dtest/bypass_cache_test: switch from highest_supported_sstable_format to chosen_sstable_format
  api/system: add /system/chosen_sstable_version
  test/cluster/dtest: reduce num_tokens to 16
2026-03-19 18:19:01 +02:00
Pavel Emelyanov
969dddb630 test/refresh: Simplify refresh invocation
take_snapshot return values were unused so drop them. do_refresh was a
thin wrapper around load_new_sstables that added no logic; inline it
directly into the gather expression.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:57 +03:00
Pavel Emelyanov
de21572b31 test/refresh: Remove r_servers alias for servers
r_servers = servers was a no-op assignment; use servers directly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:52 +03:00
Pavel Emelyanov
20b1531e6d test/refresh: Replace check_mutation_replicas with a plain CQL SELECT
The goal of test_refresh_deletes_uploaded_sstables is to verify that
sstables are removed from the upload directory after refresh. The replica
check was just a sanity guard; a simple SELECT of all keys is sufficient
and much lighter.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-19 18:42:48 +03:00
Pavel Emelyanov
c591b9ebe2 test/refresh: Inline keyspace/table/data setup in test_refresh_deletes_uploaded_sstables
Replace create_dataset() with explicit keyspace creation via new_test_keyspace,
inline CREATE TABLE, and direct cql.run_async inserts — matching the pattern
used in do_test_streaming_scopes. This removes the last dependency on backup
helpers for dataset setup and makes the test self-contained.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:44 +03:00
Pavel Emelyanov
06006a6328 test/refresh: Prepare indentation for new_test_keyspace in test_refresh_deletes_uploaded_sstables
Wrap the test body under if True: to pre-indent it, making the subsequent
patch that introduces new_test_keyspace a pure content change with no
whitespace noise.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:40 +03:00
Pavel Emelyanov
67d8cde42d test/refresh: Decouple test_refresh_deletes_uploaded_sstables from backup tests
Replace create_cluster() from object_store/test_backup.py with a plain
manager.servers_add(2) call. The test does not use object storage, so
there is no need to pull in the backup helper along with its config and
logging knobs.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:36 +03:00
Pavel Emelyanov
04f046d2d8 test/refresh: Remove unused wait_for_cql_and_get_hosts import
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-19 18:42:32 +03:00
Botond Dénes
e8b37d1a89 Merge 'doc: fix the installation section' from Anna Stuchlik
This PR fixes the Installation page:

- Replaces `http `with `https `in the download command.
- Replaces the Open Source example from the Installation section for CentOS (we overlooked this example before).

Fixes https://github.com/scylladb/scylladb/issues/29087

This update affects all supported versions and should be backported as a bug fix.

Closes scylladb/scylladb#29088

* github.com:scylladb/scylladb:
  doc: remove the Open Source Example from Installation
  doc: replace http with https in the installation instructions
2026-03-19 17:13:53 +02:00
Dario Mirovic
d2c44722e1 test: cluster: fix log clear race condition in test_audit.py
assert_entries_were_added:
- takes a "before" snapshot of the audit log
- yields to execute a statement
- takes an "after" snapshot of the audit log
- computes new rows by diffing "after" minus "before"

If an audit entry generated by prepare() arrives between the snapshot
and the diff, it inflates the new row count and the test fails with
assert 2 <= 1.

Fix by:
- Adding clear_audit_logs() at the end of prepare(), after all setup
- Waiting for the "completed re-reading configuration file" log message
  after server_update_config
- Draining pending syslog lines before clearing the buffer
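
The drain-then-clear step can be sketched as follows (hypothetical helpers around a plain list/deque, not the actual test code): any lines already in flight must be consumed before the buffer is cleared, or they land after the "before" snapshot and inflate the diff.

```python
from collections import deque

pending = deque(["late-entry-1", "late-entry-2"])  # lines still in flight
buffer = ["old-entry"]                             # the audit log buffer

def drain():
    # Pull every pending line into the buffer before we wipe it.
    while pending:
        buffer.append(pending.popleft())

def clear_audit_logs():
    drain()          # consume in-flight lines first
    buffer.clear()   # now clearing cannot race with late arrivals

clear_audit_logs()
```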

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
821f8696a7 test: pylib: shut down exclusive cql connections in ManagerClient
get_cql_exclusive() creates a Cluster object per call, but never
records it. driver_close() cannot shut it down. The cluster's
internal scheduler thread then tries to submit work to an already
shut down executor. This causes RuntimeError:

RuntimeError: cannot schedule new futures after shutdown

Fix this by tracking every exclusive Cluster in a list and shutting
them all down in driver_close().
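
The shape of the fix can be sketched with stand-in types (FakeCluster/Manager are hypothetical, not the actual pylib classes):

```python
class FakeCluster:
    def __init__(self):
        self.shut_down = False
    def shutdown(self):
        self.shut_down = True

class Manager:
    def __init__(self):
        self._exclusive_clusters = []   # the new tracking list

    def get_cql_exclusive(self):
        c = FakeCluster()
        self._exclusive_clusters.append(c)   # previously: never recorded
        return c

    def driver_close(self):
        # Shut down every exclusively created cluster, so no scheduler
        # thread outlives the executor it submits to.
        for c in self._exclusive_clusters:
            c.shutdown()
        self._exclusive_clusters.clear()

m = Manager()
a, b = m.get_cql_exclusive(), m.get_cql_exclusive()
m.driver_close()
```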

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
d94999f87b test: cluster: fix multinode audit entry comparison in test_audit.py
assert_entries_were_added computes new audit rows by slicing the "after"
list at the length of the "before" list: rows_after[len(rows_before):].
This assumes new rows always appear at the tail of the combined sorted
list. In a multinode setup, each node generates its own event_time
timestamps. A new row from node A can sort before an old row from node
B, breaking the tail assumption. The assertion "new rows are not the
last rows in the audit table" then fires.

Fix this by splitting the before/after lists per node and computing the
new rows tail independently for each node. This guarantees that per node
ordering, which is monotonic, is respected, and the combined new rows
are sorted afterwards.
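
The per-node diff can be sketched with a simplified row shape of (node, event_time) tuples (a sketch of the idea, not the actual test code): the tail-slice invariant only holds within one node's monotonic timestamps, so split first, diff per node, then merge.

```python
def new_rows(before, after):
    nodes = {n for n, _ in after}
    new = []
    for node in nodes:
        b = [r for r in before if r[0] == node]
        a = [r for r in after if r[0] == node]
        new.extend(a[len(b):])          # tail diff is valid per node
    return sorted(new, key=lambda r: r[1])  # combine and re-sort by time

before = [("A", 1), ("B", 2)]
# Node A's new row (t=3) interleaves with node B's rows in the combined
# sort order, so a single global tail slice would miscount.
after = [("A", 1), ("B", 2), ("A", 3), ("B", 4)]
rows = new_rows(before, after)
```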

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
249a6cec1b test: cluster: dtest: remove old audit tests
Since audit tests have been migrated to test/cluster/test_audit.py,
old tests in test/cluster/dtest/audit_test.py have to be removed.

Refs SCYLLADB-573
2026-03-19 16:12:13 +01:00
Dario Mirovic
adc790a8bf test: cluster: group migrated audit tests for cluster reuse
This patch reorganizes the execution flow of the test functions.
They are grouped to enable cluster reuse between specific test
functions. One of the main contributors to the test execution time
is the cluster preparation. This patch significantly reduces the
total test execution time by having way less new cluster preparation
calls and more cluster reuse.

Performance increase on the developer machine is around 38%:
- before: 4m 29s
- after: 2m 47s

Fixes SCYLLADB-573
2026-03-19 16:11:47 +01:00
Dario Mirovic
967b7ff6bf test: cluster: enable migrated audit tests and make them work
Move audit tests from test/cluster/dtest to test/cluster.
The test/cluster environment has less overhead, and audit tests
are heavy, their execution taking a lot of time. This patch
is part of an effort to improve audit test suite performance.

This patch refactors the tests so that they execute correctly,
as well as enables them. A follow up patch will remove the
audit tests in test/cluster/dtest.

All the tests are confirmed to be running after the change.
No dead code present.

Test test_audit_categories_invalid is not parametrized anymore.
It never used the parametrized helper class, so it just ran
the same logic three times. This is why there are now 74,
and not 76, test executions.

Refs SCYLLADB-573
2026-03-19 16:07:28 +01:00
Dario Mirovic
5d51501a0b pgo: use maintenance socket for CQL setup in PGO training
The default 'cassandra' superuser was removed from ScyllaDB, which
broke PGO training. exec_cql.py relied on username/password auth
('cassandra'/'cassandra') to execute setup CQL scripts like auth.cql
and counters.cql.

Switch exec_cql.py to connect via the Unix domain maintenance socket
instead. The maintenance socket bypasses authentication, no credentials
are needed. Additionally, create the 'cassandra' superuser via the
maintenance socket during the populate phase, so that cassandra-stress
keeps working. cassandra-stress hardcodes user=cassandra password=cassandra.

Changes:
- exec_cql.py: replace host/port/username/password arguments with a
  single --socket argument; add connect_maintenance_socket() with
  wait ready logic
- pgo.py: add maintenance_socket_path() helper; update
  populate_auth_conns() and populate_counters() to pass the socket
  path to exec_cql.py

Fixes SCYLLADB-1070

Closes scylladb/scylladb#29081
2026-03-19 16:52:36 +02:00
Dario Mirovic
8367509b3b test: pylib: manager_client: specify AuthProvider in get_cql_exclusive
This patch allows ManagerClient.get_cql_exclusive to accept AuthProvider
as parameter. This will be used in a follow up patch which migrates
audit test suite to test/cluster and requires this functionality for
some tests.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
Dario Mirovic
0a7a69345c test: pylib: scylla cluster after_test log fix
Before any test, a pool of ScyllaCluster objects is created.

At the beginning of a test suite, a ScyllaClusterManager is created,
and given a reference to the pool.
At the end of a test suite, the ScyllaClusterManager is destroyed.

Before each test case:
- ManagerClient is constructed and connected to the ScyllaClusterManager
  of that test suite
- A ScyllaCluster object is fetched from the pool
  - If the pool is empty, a new ScyllaCluster object is created
  - If the pool is not empty, a cached ScyllaCluster object is returned

After each test case:
- Return ScyllaCluster object from ManagerClient to the pool
  - If the cluster is dirty, the pool destroys it
  - If the cluster is clean, the pool caches it
- ManagerClient is destroyed

Many actions mark a cluster as dirty. Normal test execution will always
cause the cluster to be destroyed upon its return to the pool.
ManagerClient.mark_clean is not used in the tests. When it is used,
the flow with cluster reuse happens.

The bug is that the log file is closed even if cluster is not dirty.
This causes an error when trying to log to a reused cluster server.

The solution in this patch is to not close the log file if the cluster
is not dirty. Upon cluster reuse the log file will be open and functional.

Another approach would be to reopen the log file if closed, but the
chosen approach seems cleaner.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
Dario Mirovic
899ae71349 test: audit: copy audit test from dtest
This patch just copies the audit test suite from dtest and
disables it in the test config file. Later patches will
update the code and enable the test suite.

Refs SCYLLADB-573
2026-03-19 15:35:24 +01:00
Andrzej Jackowski
4deeb7ebfc test: add new guardrail tests matching documentation scenarios
Add tests for RF guardrails (min/max warn/fail, RF=0 bypass,
threshold=-1 disable, ALTER KEYSPACE) and write consistency level
guardrails to cover all scenarios described in guardrails.rst.

Test runtime (dev):
test_guardrail_replication_strategy - 6s
test_guardrail_write_consistency_level - 5s

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
Andrzej Jackowski
2a03c634c0 test: add metric assertions to guardrail replication strategy tests
Verify that guardrail violations increment the corresponding metrics.

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
Andrzej Jackowski
81c4e717e2 test: use regex matching in guardrail replication strategy tests
Replace loose substring assertions with regex-based matching against
the exact server message formats. Add regex constants for all
guardrail messages and rewrite create_ks_and_assert_warnings_and_errors()
to verify count and content of warnings and failures.

Refs: SCYLLADB-257
2026-03-19 15:07:03 +01:00
Anna Stuchlik
6b1df5202c doc: remove the instructions to install old versions from Web Installer
The Web Installer page includes instructions to install the old pre-2025.1 Enterprise versions,
which are no longer supported (since we released 2026.1).

This commit removes those redundant and misleading instructions.

Fixes https://github.com/scylladb/scylladb/issues/29099

Closes scylladb/scylladb#29103
2026-03-19 15:47:00 +02:00
Piotr Dulikowski
171504c84f Merge 'auth: migrate some standard role manager APIs to use cache' from Marcin Maliszkiewicz
This patchset migrates: query_all_directly_granted, query_all,
get_attribute, query_attribute_for_all functions to use cache
instead of doing CQL queries. It also includes some preparatory
work which fixes cache update order and triggering.

Main motivation behind this is to make sure that all calls
from service_level_controller::auth_integration are cached,
which we achieve here.

Alternative implementation could move the whole auth_integration
data into auth cache but since auth_integration manages also lifetime
and contains service levels specific logic such solution would be
too complex for little (if any) gain.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-159
Backport: no, not a bug

Closes scylladb/scylladb#28791

* github.com:scylladb/scylladb:
  auth: switch query_attribute_for_all to use cache
  auth: switch get_attribute to use cache
  auth: cache: add heterogeneous map lookups
  auth: switch query_all to use cache
  auth: switch query_all_directly_granted to use cache
  auth: cache: add ability to go over all roles
  raft: service: reload auth cache before service levels
  service: raft: move update_service_levels_effective_cache check
2026-03-19 14:37:22 +01:00
Avi Kivity
5e7fb08bf3 Merge 'Fix bad performance for densely populated partition index pages' from Tomasz Grabiec
This applies to small-partition workloads where index pages have a high partition count and the index doesn't fit in cache. It was observed that the count can be on the order of hundreds. In such a workload, pages undergo constant population, LSA compaction, and LSA eviction, which has a severe impact on CPU utilization.

Refs https://scylladb.atlassian.net/browse/SCYLLADB-620

This PR reduces the impact by several changes:

  - reducing memory footprint in the partition index. Assuming partition key size is 16 bytes, the cost dropped from 96 bytes to 36 bytes per partition.

  - flattening the object graph and amortizing storage. Storing entries directly in the vector. Storing all key values in a single managed_bytes. Making index_entry a trivial struct.

  - index entries and key storage are now trivially moveable, and batched inside vector storage
    so LSA migration can use memcpy(), which amortizes the cost per key. This reduces the cost of LSA segment compaction.

  - LSA eviction is now pretty much constant time for the whole page
    regardless of the number of entries, because elements are trivial and batched inside vectors.
    Page eviction cost dropped from 50 us to 1 us.
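
The flattened layout can be illustrated with a schematic Python analogue (not the actual C++ code): all key bytes live in one contiguous buffer, and each index entry is a trivial (offset, length) pair, so the whole page moves with one memcpy-style copy.

```python
class IndexPage:
    def __init__(self):
        self.key_storage = bytearray()   # single managed_bytes analogue
        self.entries = []                # trivial (offset, length) structs

    def add(self, key: bytes):
        # Append the key bytes to shared storage; record only where they are.
        self.entries.append((len(self.key_storage), len(key)))
        self.key_storage += key

    def key(self, i: int) -> bytes:
        off, length = self.entries[i]
        return bytes(self.key_storage[off:off + length])

page = IndexPage()
for k in [b"apple", b"banana", b"pear"]:
    page.add(k)
```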

Performance evaluated with:

   scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

```
7774.96 tps (166.0 allocs/op, 521.7 logallocs/op,  54.0 tasks/op,  802428 insns/op,  430457 cycles/op,        0 errors)
7511.08 tps (166.1 allocs/op, 527.2 logallocs/op,  54.0 tasks/op,  804185 insns/op,  430752 cycles/op,        0 errors)
7740.44 tps (166.3 allocs/op, 526.2 logallocs/op,  54.2 tasks/op,  805347 insns/op,  432117 cycles/op,        0 errors)
7818.72 tps (165.2 allocs/op, 517.6 logallocs/op,  53.7 tasks/op,  794965 insns/op,  427751 cycles/op,        0 errors)
7865.49 tps (165.1 allocs/op, 513.3 logallocs/op,  53.6 tasks/op,  788898 insns/op,  425171 cycles/op,        0 errors)
```

After (+318%):

```
32492.40 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109236 insns/op,  103203 cycles/op,        0 errors)
32591.99 tps (130.4 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  108947 insns/op,  102889 cycles/op,        0 errors)
32514.52 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109118 insns/op,  103219 cycles/op,        0 errors)
32491.14 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109349 insns/op,  103272 cycles/op,        0 errors)
32582.90 tps (130.5 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109269 insns/op,  102872 cycles/op,        0 errors)
32479.43 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109313 insns/op,  103242 cycles/op,        0 errors)
32418.48 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109201 insns/op,  103301 cycles/op,        0 errors)
31394.14 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109267 insns/op,  103301 cycles/op,        0 errors)
32298.55 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109323 insns/op,  103551 cycles/op,        0 errors)
```

When the workload is miss-only, with both row cache and index cache disabled (no cache maintenance cost):

  perf-simple-query -c1 -m200M --duration 6000 --partitions=100000 --enable-index-cache=0 --enable-cache=0

Before:

```
9124.57 tps (146.2 allocs/op, 789.0 logallocs/op,  45.3 tasks/op,  889320 insns/op,  357937 cycles/op,        0 errors)
9437.23 tps (146.1 allocs/op, 789.3 logallocs/op,  45.3 tasks/op,  889613 insns/op,  357782 cycles/op,        0 errors)
9455.65 tps (146.0 allocs/op, 787.4 logallocs/op,  45.2 tasks/op,  887606 insns/op,  357167 cycles/op,        0 errors)
9451.22 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887627 insns/op,  357357 cycles/op,        0 errors)
9429.50 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887761 insns/op,  358148 cycles/op,        0 errors)
9430.29 tps (146.1 allocs/op, 788.2 logallocs/op,  45.3 tasks/op,  888501 insns/op,  357679 cycles/op,        0 errors)
9454.08 tps (146.0 allocs/op, 787.3 logallocs/op,  45.3 tasks/op,  887545 insns/op,  357132 cycles/op,        0 errors)
```

After (+55%):

```
14484.84 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396164 insns/op,  229490 cycles/op,        0 errors)
14526.21 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396401 insns/op,  228824 cycles/op,        0 errors)
14567.53 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396319 insns/op,  228701 cycles/op,        0 errors)
14545.63 tps (150.6 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395889 insns/op,  228493 cycles/op,        0 errors)
14626.06 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395254 insns/op,  227891 cycles/op,        0 errors)
14593.74 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395480 insns/op,  227993 cycles/op,        0 errors)
14538.10 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  397035 insns/op,  228831 cycles/op,        0 errors)
14527.18 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396992 insns/op,  228839 cycles/op,        0 errors)
```

Same as above, but with summary ratio increased from 0.0005 to 0.005 (smaller pages):

Before:

```
33906.70 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170553 insns/op,   98104 cycles/op,        0 errors)
32696.16 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170369 insns/op,   98405 cycles/op,        0 errors)
33889.05 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170551 insns/op,   98135 cycles/op,        0 errors)
33893.24 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170488 insns/op,   98168 cycles/op,        0 errors)
33836.73 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170528 insns/op,   98226 cycles/op,        0 errors)
33897.61 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170428 insns/op,   98081 cycles/op,        0 errors)
33834.73 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170438 insns/op,   98178 cycles/op,        0 errors)
33776.31 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170958 insns/op,   98418 cycles/op,        0 errors)
33808.08 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170940 insns/op,   98388 cycles/op,        0 errors)
```

After (+18%):

```
40081.51 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121047 insns/op,   82231 cycles/op,        0 errors)
40005.85 tps (148.6 allocs/op,   4.4 logallocs/op,  45.2 tasks/op,  121327 insns/op,   82545 cycles/op,        0 errors)
39816.75 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121067 insns/op,   82419 cycles/op,        0 errors)
39953.11 tps (148.1 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82258 cycles/op,        0 errors)
40073.96 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121006 insns/op,   82313 cycles/op,        0 errors)
39882.25 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  120925 insns/op,   82320 cycles/op,        0 errors)
39916.08 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121054 insns/op,   82393 cycles/op,        0 errors)
39786.30 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82465 cycles/op,        0 errors)
38662.45 tps (148.3 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121108 insns/op,   82312 cycles/op,        0 errors)
39849.42 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121098 insns/op,   82447 cycles/op,        0 errors)
```

Closes scylladb/scylladb#28603

* github.com:scylladb/scylladb:
  sstables: mx: index_reader: Optimize parsing for no promoted index case
  vint: Use std::countl_zero()
  test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement
  sstables: mx: index_reader: Amortize partition key storage
  managed_bytes: Hoist write_fragmented() to common header
  utils: managed_vector: Use std::uninitialized_move() to move objects
  sstables: mx: index_reader: Keep promoted_index info next to index_entry
  sstables: mx: index_reader: Extract partition_index_page::clear_gently()
  sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
  sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
  sstables: mx: index_reader: Keep index_entry directly in the vector
  dht: Introduce raw_token
  test: perf_simple_query: Add 'sstable-format' command-line option
  test: perf_simple_query: Add 'sstable-summary-ratio' command-line option
  test: perf-simple-query: Add option to disable index cache
  test: cql_test_env: Respect enable-index-cache config
2026-03-19 14:42:50 +02:00
Botond Dénes
4981e72607 Merge 'replica: avoid unnecessary computation on token lookup hot path' from Łukasz Paszkowski
`storage_group_of()` sits on the replica-side token lookup hot path, yet it called `tablet_map::get_tablet_id_and_range_side()`, which always computes both the tablet id and the post-split range side — even though most callers only need the storage group id.

The range-side computation is only relevant when a storage group is in tablet splitting mode, but we were paying for it unconditionally on every lookup.

This series fixes that by:

1. Adding `tablet_map::get_tablet_range_side()` so the range side can be computed independently when needed.
2. Adding lazy `select_compaction_group()` overloads that defer the range-side computation until splitting mode is actually active.
3. Switching `storage_group_of()` to use the cheaper `get_tablet_id()` path, only computing the range side on demand.

These are improvements; no backport is required.

Closes scylladb/scylladb#28963

* github.com:scylladb/scylladb:
  replica/table: avoid computing token range side in storage_group_of() on hot path
  replica/compaction_group: add lazy select_compaction_group() overloads
  locator/tablets: add tablet_map::get_tablet_range_side()
2026-03-19 14:27:12 +02:00
Ernest Zaslavsky
aa9da87e97 encryption: fix deadlock in encrypted_data_source::get()
When encrypted_data_source::get() caches a trailing block in
_next, the next call takes it directly — bypassing
input_stream::read(), which checks _eof. It then calls
input_stream::read_exactly() on the already-drained stream.
Unlike read(), read_up_to(), and consume(), read_exactly()
does not check _eof when the buffer is empty, so it calls
_fd.get() on a source that already returned EOS.

In production this manifested as stuck encrypted SSTable
component downloads during tablet restore: the underlying
chunked_download_source hung forever on the post-EOS get(),
causing 4 tablets to never complete. The stuck files were
always block-aligned sizes (8k, 12k) where _next gets
populated and the source is fully consumed in the same call.

Fix by checking _input.eof() before calling read_exactly().
When the stream already reached EOF, buf2 is known to be
empty, so the call is skipped entirely.

A comprehensive test is added that uses a strict_memory_source
which fails on post-EOS get(), reproducing the exact code
path that caused the production deadlock.
2026-03-19 13:54:54 +02:00
Ernest Zaslavsky
f74a54f005 test_lib: mark limiting_data_source_impl as not final 2026-03-19 13:54:54 +02:00
Ernest Zaslavsky
151e945d9f Fix formatting after previous patch 2026-03-19 13:54:44 +02:00
Andrzej Jackowski
517bb8655d test: extract ks_opts helper in test_guardrail_replication_strategy
Factor out ks_opts() to build keyspace options with tablets handling
and use it across all existing replication strategy guardrail tests.
No behavioral changes.

This facilitates further modification of the tests later in this
patch series.

Refs: SCYLLADB-257
2026-03-19 12:49:41 +01:00
Andrzej Jackowski
9b24d9ee7d docs: document CQL guardrails
Add docs/cql/guardrails.rst covering replication factor, replication
strategy, write consistency level, and compact storage guardrails.

Fixes: SCYLLADB-257
2026-03-19 12:49:41 +01:00
Ernest Zaslavsky
537747cf5d Fix indentation after previous patch 2026-03-19 13:48:53 +02:00
Ernest Zaslavsky
2535164542 test_lib: make limiting_data_source_impl available to tests
Relocate the `limiting_data_source_impl` declaration to the header file
so that test code can access it directly.
2026-03-19 13:48:53 +02:00
Botond Dénes
86d7c82993 test/cluster/test_repair.py: use tablets in test_repair_timestamp_difference
After repair, the test does a major to compact all sstables into a
single one, so the results can be simply checked by a select from
mutation_fragments() query. Sometimes off-strategy happens parallel to
this major, so after the major there are still 2 sstables, resulting in
the test failing when checking that the query returns just a single row.
To fix, just use tablets for the test table, tablets don't use
off-strategy anymore.

Fixes: SCYLLADB-940

Closes scylladb/scylladb#29071
2026-03-19 12:42:18 +03:00
Michael Litvak
399260a6c0 test: mv: fix flaky wait for commitlog sync
Previously the test test_interrupt_view_build_shard_registration stopped
the node ungracefully and relied on commitlog periodic mode to persist
the view build progress, which was not reliable.

It can happen that due to timing issues, the view build progress is not
persisted, or some of it is persisted in a different ordering than
expected.

To make the test more reliable we change it to stop the node gracefully,
so the commitlog is persisted in a graceful and consistent way, without
using the periodic mode delay. We need to also change the injection for
the shutdown to not get stuck.

Fixes SCYLLADB-1005

Closes scylladb/scylladb#29008
2026-03-19 10:41:21 +01:00
Pavel Emelyanov
f27dc12b7c Merge 'Fix directory lister leak in table::get_snapshot_details: ' from Benny Halevy
As reported in SCYLLADB-1013, the directory lister must be closed also when an exception is thrown.

For example, see backtrace below:
```
seastar::on_internal_error(seastar::logger&, std::basic_string_view<char, std::char_traits<char>>) at ./build/release/seastar/./seastar/src/core/on_internal_error.cc:57
directory_lister::~directory_lister() at ./utils/lister.cc:77
replica::table::get_snapshot_details(std::filesystem::__cxx11::path, std::filesystem::__cxx11::path) (.resume) at ./replica/table.cc:4081
std::__n4861::coroutine_handle<seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type>::resume() const at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/coroutine:247
 (inlined by) seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type::run_and_dispose() at ././seastar/include/seastar/core/coroutine.hh:129
seastar::reactor::task_queue::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:2695
 (inlined by) seastar::reactor::task_queue_group::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3201
seastar::reactor::task_queue_group::run_some_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3185
 (inlined by) seastar::reactor::do_run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3353
seastar::reactor::run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3245
seastar::app_template::run_deprecated(int, char**, std::function<void ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:266
seastar::app_template::run(int, char**, std::function<seastar::future<int> ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:160
scylla_main(int, char**) at ./main.cc:756
```

Fixes: [SCYLLADB-1013](https://scylladb.atlassian.net/browse/SCYLLADB-1013)

* Requires backport to 2026.1 since the leak exists since 004c08f525

[SCYLLADB-1013]: https://scylladb.atlassian.net/browse/SCYLLADB-1013?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29084

* github.com:scylladb/scylladb:
  test/boost/database_test: add test_snapshot_ctl_details_exception_handling
  table: get_snapshot_details: fix indentation inside try block
  table: per-snapshot get_snapshot_details: fix typo in comment
  table: per-snapshot get_snapshot_details: always close lister using try/catch
  table: get_snapshot_details: always close lister using deferred_close
2026-03-19 12:40:23 +03:00
Raphael S. Carvalho
3143134968 test: avoid split/major compaction deadlock in tablet split test
Run keyspace compaction asynchronously in
`test_tombstone_gc_correctness_during_tablet_split` and only await it
after `split_sstable_rewrite` is disabled.

The problem is that `keyspace_compaction()` starts with a flush, and that
flush can take around five seconds. During that window the split
compaction is stopped before major compaction is retried. The stop aborts
the in-flight major compaction attempt, then the split proceeds far enough
to enter the `split_sstable_rewrite` injection point.

At that point the test used to wait synchronously for major compaction to
finish, but major compaction cannot finish yet: when it retries, it needs
the same semaphore that is still effectively tied up behind the blocked
split rewrite. So the test waits for major compaction, while the split
waits for the injection to be released, and the code that would release
that injection never runs.

Starting major compaction as a task breaks that cycle. The test can first
disable `split_sstable_rewrite`, let the split get out of the way, and
only then wait for major compaction to complete.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-827.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#29066
2026-03-19 11:12:21 +02:00
Botond Dénes
2e47fd9f56 Merge 'tasks: do not fail the wait request if rpc fails' from Aleksandra Martyniuk
During decommission, we first mark a topology request as done, then shut
down the node, and only in the following steps do we remove the node from
the topology. Thus, a finished request does not imply that the node has
been removed from the topology.

Due to that, in node_ops_virtual_task::wait, while gathering children
from the whole cluster, we may hit a connection exception, because the
node is still in the topology even though it is down.

Modify the get_children method to ignore the exception and warn
about the failure instead.

Keep token_metadata_ptr in get_children to prevent topology from changing.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-867

Needs backports to all versions

Closes scylladb/scylladb#29035

* github.com:scylladb/scylladb:
  tasks: fix indentation
  tasks: do not fail the wait request if rpc fails
  tasks: pass token_metadata_ptr to task_manager::virtual_task::impl::get_children
2026-03-19 10:03:18 +02:00
Piotr Smaron
a2ad57062f docs/cql: clarify WHERE clause boolean limitations
Document that `SELECT ... WHERE` clause currently accepts only conjunctions
of relations joined by `AND` (`OR` is not supported), and that
parentheses cannot be used to group boolean subexpressions.
Add an unsupported query example and point readers to equivalent `IN`
rewrites when applicable.
This problem was raised by one of our users in
https://forum.scylladb.com/t/error-parsing-query-or-unsupported-statement/5299.
While one could infer the answer to the user's question by looking at the
grammar of `SELECT ... WHERE`, it is not immediately obvious to
non-advanced users, so clarifying these concepts is justified.

Fixes: SCYLLADB-1116

Closes scylladb/scylladb#29100
2026-03-19 09:47:22 +02:00
Michael Litvak
31d339e54a logstor: trigger separator flush for buffers that hold old segments
A compaction group has a separator buffer that holds the mixed segments
alive until the separator buffer is flushed. A mixed segment can be
freed only after all separator buffers that hold writes from the segment
are flushed.

Typically a separator buffer is flushed when it becomes full. However,
it is possible, for example, that one compaction group fills more slowly
than others and so keeps many segments alive.

To fix this we trigger a separator flush periodically for separator
buffers that hold old segments. We track the active segment sequence
number and for each separator buffer the oldest sequence number it
holds.
2026-03-18 19:24:28 +01:00
Michael Litvak
ad87eda835 docs/dev: add logstor documentation 2026-03-18 19:24:28 +01:00
Michael Litvak
a0da07e5b7 logstor: recover segments into compaction groups
Fix the logstor recovery to work with compaction groups. When recovering
a segment, find its token range and add it to the appropriate compaction
group. If it doesn't fit in a single compaction group, write each
record to its compaction group's separator buffer.
2026-03-18 19:24:28 +01:00
Michael Litvak
24379acc76 logstor: range read
extend the logstor mutation reader to support range read
2026-03-18 19:24:28 +01:00
Michael Litvak
a9d0211a64 logstor: change index to btree by token per table
Change the primary index to be a btree that is ordered by token,
similarly to a memtable, and create an index per table instead of a
single global index.
2026-03-18 19:24:28 +01:00
Michael Litvak
e7c3942d43 logstor: move segments to replica::compaction_group
Add a segment_set member to replica::compaction_group that manages the
logstor segments that belong to the compaction group, similarly to how
it manages sstables. Add also a separator buffer in each compaction
group.

When writing a mutation to a compaction group, the mutation is written
to the active segment and to the separator buffer of the compaction
group, and when the separator buffer is flushed the segment is added to
the compaction_group's segment set.
2026-03-18 19:24:28 +01:00
Michael Litvak
d69f7eb0ee db: update dirty mem limits dynamically
when logstor is enabled, update the db dirty memory limits dynamically.

previously the threshold was set to 0.5 of the available memory, so 0.5
goes to memtables and 0.5 to others (cache).

when logstor is enabled, we calculate the available memory excluding
logstor, and divide it evenly between memtables and cache.
2026-03-18 19:24:27 +01:00
Michael Litvak
65cd0b5639 logstor: track memory usage
add logstor::get_memory_usage() that returns an estimate of logstor's
memory usage. add tracking of how many unique keys are held in the
index.
2026-03-18 19:24:27 +01:00
Michael Litvak
b7bdb1010a logstor: logstor stats api
add api to get logstor statistics about segments for a table
2026-03-18 19:24:27 +01:00
Michael Litvak
8bd3bd7e2a logstor: compaction buffer pool
pre-allocate write buffers for compaction
2026-03-18 19:24:27 +01:00
Michael Litvak
caf5aa47c2 logstor: separator: flush buffer when full
flush separator buffers when they become full and are switched, instead
of aggregating all the buffers and flushing them when the separator is
switched.
2026-03-18 19:24:27 +01:00
Michael Litvak
6ddb7a4d13 logstor: hold segment until index updates
add a write gate to write_buffer. when writing a record to the write
buffer, the gate is held and passed back to the caller, and the caller
holds the gate until the write operation is complete, including
follow-up operations such as updating the index after the write.

in particular, when writing a mutation in logstor::write, the write
buffer is held open until the write is completed and updated in the
index.

when writing the write buffer to the active segment, we write the buffer
and then wait for the write buffer gate to close, i.e. we wait for all
index updates to complete before proceeding. the segment is held open
until all the write operations and index updates are complete.

this property is useful for correctness: when a segment is closed we
know that all the writes to it are updated in the index. this is needed
in compaction for example, where we take closed segments and check
which records in them are alive by looking them up in the index. if the
index is not updated yet then it will be wrong.
2026-03-18 19:24:27 +01:00
Michael Litvak
bd66edee5c logstor: truncate table
implement freeing all segments of a table for table truncate.

first do a barrier to flush all active and mixed segments and put all the
table's data in compaction groups, then stop compaction for the table,
then free the table's segments and remove the live entries from the
index.
2026-03-18 19:24:27 +01:00
Michael Litvak
489efca47c logstor: enable/disable compaction per table
add functions to enable or disable compaction for a specific compaction
group or for all compaction groups of a table.
2026-03-18 19:24:27 +01:00
Michael Litvak
21db4f3ed8 logstor: separator buffer pool
pre-allocate write buffers for the separator
2026-03-18 19:24:27 +01:00
Michael Litvak
37c485e3d1 test: logstor: add separator and compaction tests 2026-03-18 19:24:27 +01:00
Michael Litvak
31aefdc07d logstor: segment and separator barrier
add barrier operation that forces switch of the active segment and
separator, and waits for all existing segments to close and all
separators to flush.
2026-03-18 19:24:27 +01:00
Michael Litvak
1231fafb46 logstor: separator debt controller
add tracking of the total separator debt (writes that were written to a
separator and are waiting to be flushed), and add flow control that
keeps the debt bounded by delaying normal writes.
2026-03-18 19:24:27 +01:00
Michael Litvak
17cb173e18 logstor: compaction controller
adjust compaction shares by the compaction overhead: how many segments
compaction writes to generate a single free segment for new writes.
2026-03-18 19:24:27 +01:00
Michael Litvak
1da1bb9d99 logstor: recovery: recover mixed segments using separator
on recovery we may find mixed segments. recover them by adding them to a
separator, reading all their records, writing them to the separator, and
flushing the separator.
2026-03-18 19:24:27 +01:00
Michael Litvak
b78cc787a6 logstor: wait for pending reads in compaction
we free a segment from compaction after updating all live records in the
segment to point to new locations in the index. we need to ensure there
are no running operations that use the old locations before we free the
segment.
2026-03-18 19:24:27 +01:00
Michael Litvak
600ec82bec logstor: separator
initial implementation of the separator. it replaces "mixed" segments
(segments that have records from different groups) with per-group
segments.

every write is written to the active segment and to a buffer in the
active separator. the active separator has in-memory buffers by group.
at some threshold number of segments we switch the active segment and
separator atomically, and start flushing the separator.

the separator is flushed by writing the buffers into new non-mixed
segments, adding them to a compaction group, and frees the mixed
segments.
2026-03-18 19:24:27 +01:00
Michael Litvak
009fc3757a logstor: compaction groups
divide the segments in the compaction manager into compaction groups.
compaction will compact only segments from a single compaction group at
a time.
2026-03-18 19:24:27 +01:00
Michael Litvak
b3293f8579 logstor: cache files for read
keep the files of all segments open for reading to improve read performance.
2026-03-18 19:24:26 +01:00
Michael Litvak
5a16980845 logstor: recovery: initial
initial and basic recovery implementation.
* find all files, read their segments and populate the index with the
  newest record for each key.
* find which segments are used and build the usage histogram
2026-03-18 19:24:26 +01:00
Michael Litvak
bc9fc96579 logstor: add segment generation
add a segment generation number that is incremented when the segment is
reused, and written to every buffer that goes into the segment.
this is useful for recovery.
2026-03-18 19:24:26 +01:00
Michael Litvak
719f7cca57 logstor: reserve segments for compaction
reserve segments for compaction so it always has enough segments to run
and doesn't get stuck.

do the compaction writes into full new segments instead of the active
segment.
2026-03-18 19:24:26 +01:00
Michael Litvak
521fca5c92 logstor: index: buckets
divide the primary index into buckets, each bucket containing a btree.
the bucket is selected using bits from the key hash.
2026-03-18 19:24:26 +01:00
Michael Litvak
99c3b1998a logstor: add buffer header
add a buffer header in each write buffer we write that contains some
information that can be useful for recovery and reading.
2026-03-18 19:24:26 +01:00
Michael Litvak
ddd72a16b0 logstor: add group_id
add group_id value to each log record that is passed with the mutation
when writing it.

the group_id will be used to group log records in segments, such that a
segment will contain records only from a single group.

this will be useful for tablet migration. we want each tablet to have
its own segments with all its records, so we can migrate a tablet
efficiently by copying its segments.

the group_id value is set to a value equivalent to the tablet id.
2026-03-18 19:24:26 +01:00
Michael Litvak
08bea860ef logstor: record generation
add a record generation number for each record so we can compare
records and find which one is newer.
2026-03-18 19:24:26 +01:00
Michael Litvak
28f820eb1c logstor: generation utility
basic utility for generation numbers that will be useful next. a
generation number is an unsigned integer that can be incremented and
compared correctly even if it wraps around, assuming the values we
compare were written around the same time.
2026-03-18 19:24:26 +01:00
Michael Litvak
5f649dd39f logstor: use RIPEMD-160 for index key
use a 20-byte hash function for the index key to make hash collisions
very unlikely. we assume there are no hash collisions.
2026-03-18 19:24:26 +01:00
Michael Litvak
a521bcbcee test: add test_logstor.py
add basic tests for key-value tables with logstor storage
2026-03-18 19:24:26 +01:00
Michael Litvak
1ae1f37ec1 api: add logstor compaction trigger endpoint
add a new api endpoint that triggers logstor compaction.
2026-03-18 19:24:26 +01:00
Michael Litvak
2128b1b15c replica: add logstor to db
Add a single logstor instance to the database that is used for writing
to and reading from tables with kv storage.
2026-03-18 19:24:26 +01:00
Michael Litvak
9172cc172e schema: add logstor cf property
add a schema property for tables with logstor storage
2026-03-18 19:24:26 +01:00
Michael Litvak
0b1343747f logstor: initial commit
initial implementation of the logstor storage engine for key-value
tables that supports writes, reads and basic compaction.

main components:
* logstor: this is the main interface to users that supports writing and
  reading back mutations, and manages the internal components.
* index: the primary index in-memory that maps a key to a location on
  disk.
* write buffer: writes go initially to a write buffer. it accumulates
  multiple records in a buffer and writes them to the segment manager in
  4k sized blocks.
* segment manager: manages the storage - files, segments, compaction. it
  manages file and segment allocation, and writes 4k aligned buffers to
  the active segment sequentially. it tracks the used space in each
  segment. the compaction finds segments with low space usage, writes
  them to new segments, and frees the old segments.
2026-03-18 19:24:26 +01:00
Michael Litvak
27fd0c119f db: disable tablet balancing with logstor
initially logstor tables will not support tablet migrations, so
disable tablet balancing if the experimental feature flag is set.
2026-03-18 19:24:26 +01:00
Michael Litvak
ed852a2af2 db: add logstor experimental feature flag
add a new experimental feature flag for key-value tables with the new
logstor storage engine.
2026-03-18 19:24:26 +01:00
Anna Stuchlik
88b98fac3a doc: update the warning about shared dictionary training
This commit updates the inadequate warning on the Advanced Internode (RPC) Compression page.

The warning is replaced with a note about how training data is encrypted.

Fixes https://github.com/scylladb/scylladb/issues/29109

Closes scylladb/scylladb#29111
2026-03-18 19:35:18 +02:00
Avi Kivity
46a6f8e1d3 Merge 'auth: add maintenance_socket_authorizer' from Dario Mirovic
GRANT/REVOKE fails on the maintenance socket connections, because maintenance_auth_service uses allow_all_authorizer. allow_all_authorizer allows all operations, but not GRANT/REVOKE, because they make no sense in its context.

This has been observed during PGO run failure in operations from ./pgo/conf/auth.cql file.

This patch introduces maintenance_socket_authorizer that supports the capabilities of default_authorizer ('CassandraAuthorizer') without needing authorization.

Refs SCYLLADB-1070

This is an improvement, no need for backport.

Closes scylladb/scylladb#29080

* github.com:scylladb/scylladb:
  test: use NetworkTopologyStrategy in maintenance socket tests
  test: use cleanup fixture in maintenance socket auth tests
  auth: add maintenance_socket_authorizer
2026-03-18 19:29:57 +02:00
Pavel Emelyanov
d6c01be09b s3/client: Don't reconstruct regex on every parse_content_range call
Make the pattern static const so it is compiled once at first call rather
than on every Content-Range header parse.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29054
2026-03-18 17:56:33 +02:00
Tomasz Grabiec
4410e9c61a sstables: mx: index_reader: Optimize parsing for no promoted index case
It's a common case with small partition workloads.
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
32f8609b89 vint: Use std::countl_zero()
It handles 0 correctly, and can generate better code. On Broadwell
architecture, it translates to a single instruction (LZCNT). We're
still on Westmere, so it translates to BSR with a conditional move.

Also, drop unnecessary casts and bit arithmetic, which saves a few
instructions.

Move to header so that it's inlined in parsers.
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
6017688445 test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement 2026-03-18 16:25:21 +01:00
Tomasz Grabiec
f55bb154ec sstables: mx: index_reader: Amortize partition key storage
This change reduces the cost of partition index page construction and
LSA migration. This is achieved by several things working together:

 - index entries don't store keys as separate small objects (managed_bytes)
   They are written into one managed_bytes fragmented storage, entries
   hold offset into it.

   Before, we paid 16 bytes for managed_bytes plus LSA descriptor for
   the storage (1 byte) plus back-reference in the storage (8 bytes),
   so 25 bytes. Now we only pay 4 bytes for the size offset. If keys are 16
   bytes, that's a reduction from 31 bytes to 20 bytes per key.

 - index entries and key storage are now trivially moveable, so LSA
   migration can use memcpy(), which amortizes the cost per key.

   LSA eviction is now trivial and constant time for the whole page
   regardless of the number of entries. Page eviction dropped from
   14 us to 1 us.

This improves throughput in a CPU-bound miss-heavy read workload where
the partition index doesn't fit in memory.

  scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

    15328.25 tps (150.0 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  286769 insns/op,  218134 cycles/op,        0 errors)
    15279.01 tps (149.9 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  287696 insns/op,  218637 cycles/op,        0 errors)
    15347.78 tps (149.7 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  285851 insns/op,  217795 cycles/op,        0 errors)
    15403.68 tps (149.6 allocs/op,  14.1 logallocs/op,  45.2 tasks/op,  285111 insns/op,  216984 cycles/op,        0 errors)
    15189.47 tps (150.0 allocs/op,  14.1 logallocs/op,  45.5 tasks/op,  289509 insns/op,  219602 cycles/op,        0 errors)
    15295.04 tps (149.8 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  288021 insns/op,  218545 cycles/op,        0 errors)
    15162.01 tps (149.8 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  291265 insns/op,  220451 cycles/op,        0 errors)

After:

    21620.18 tps (148.4 allocs/op,  13.4 logallocs/op,  43.7 tasks/op,  176817 insns/op,  153183 cycles/op,        0 errors)
    20644.03 tps (149.8 allocs/op,  13.5 logallocs/op,  44.3 tasks/op,  187941 insns/op,  160409 cycles/op,        0 errors)
    20588.06 tps (150.1 allocs/op,  13.5 logallocs/op,  44.5 tasks/op,  188090 insns/op,  160818 cycles/op,        0 errors)
    20789.29 tps (149.5 allocs/op,  13.5 logallocs/op,  44.2 tasks/op,  186495 insns/op,  159382 cycles/op,        0 errors)
    20977.89 tps (149.5 allocs/op,  13.4 logallocs/op,  44.2 tasks/op,  183969 insns/op,  158140 cycles/op,        0 errors)
    21125.34 tps (149.1 allocs/op,  13.4 logallocs/op,  44.1 tasks/op,  183204 insns/op,  156925 cycles/op,        0 errors)
    21244.42 tps (148.6 allocs/op,  13.4 logallocs/op,  43.8 tasks/op,  181276 insns/op,  155973 cycles/op,        0 errors)

Mostly because the index now fits in memory.

When it doesn't, the benefits are still visible due to lower LSA overhead.
2026-03-18 16:25:21 +01:00
Tomasz Grabiec
1452e92567 managed_bytes: Hoist write_fragmented() to common header 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
75e6412b1c utils: managed_vector: Use std::uninitialized_move() to move objects
It's shorter, and is supposed to be optimized for trivially-moveable
types.

Important for managed_vector<index_entry>, which can have lots of
elements.
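A minimal sketch of what the call replaces (this is not the managed_vector code; the buffer and demo function are illustrative):

```cpp
#include <cassert>
#include <memory>
#include <string>

// std::uninitialized_move() move-constructs elements into raw,
// uninitialized storage in one call, replacing a hand-written loop.
// Implementations can lower it to a memcpy for trivially-movable types.
bool demo_uninitialized_move() {
    alignas(std::string) unsigned char raw[2 * sizeof(std::string)];
    std::string src[2] = {"hello", "world"};
    auto* dst = reinterpret_cast<std::string*>(raw);
    std::uninitialized_move(src, src + 2, dst);
    bool ok = (dst[0] == "hello") && (dst[1] == "world");
    std::destroy(dst, dst + 2);   // raw storage: destroy elements manually
    return ok;
}
```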
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
50dc7c6dd8 sstables: mx: index_reader: Keep promoted_index info next to index_entry
Densely populated pages have no promoted index (small partitions), so
we can save space in such workloads by keeping the promoted index in a
separate vector.

For workloads which do have a promoted index, pages have only one
partition. There aren't many such pages and they are long-lived, so
the extra allocation of the vector is amortized.

promoted_index class is removed, and replaced with equivalent
parsed_promoted_index_entry for simplicity. Because it's removed,
make_cursor() is moved into the index_reader class.

Reducing the size of index_entry is important for performance if pages
are densely populated. It helps to reduce LSA allocator pressure and
improves compaction/eviction speed.

This change, combined with the earlier change "Shave-off 16 bytes from
index_entry by using raw_token", gives significant improvement in
throughput in perf_simple_query run where the index doesn't fit in
memory:

  scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

9714.78 tps (170.9 allocs/op,  16.9 logallocs/op,  55.3 tasks/op,  494788 insns/op,  343920 cycles/op,        0 errors)
9603.13 tps (171.6 allocs/op,  17.0 logallocs/op,  55.6 tasks/op,  502358 insns/op,  348344 cycles/op,        0 errors)
9621.43 tps (171.9 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  500612 insns/op,  347508 cycles/op,        0 errors)
9597.75 tps (171.6 allocs/op,  17.0 logallocs/op,  55.6 tasks/op,  501428 insns/op,  348604 cycles/op,        0 errors)
9615.54 tps (171.6 allocs/op,  16.9 logallocs/op,  55.6 tasks/op,  501313 insns/op,  347935 cycles/op,        0 errors)
9577.03 tps (171.8 allocs/op,  17.0 logallocs/op,  55.7 tasks/op,  503283 insns/op,  349251 cycles/op,        0 errors)

After:

15328.25 tps (150.0 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  286769 insns/op,  218134 cycles/op,        0 errors)
15279.01 tps (149.9 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  287696 insns/op,  218637 cycles/op,        0 errors)
15347.78 tps (149.7 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  285851 insns/op,  217795 cycles/op,        0 errors)
15403.68 tps (149.6 allocs/op,  14.1 logallocs/op,  45.2 tasks/op,  285111 insns/op,  216984 cycles/op,        0 errors)
15189.47 tps (150.0 allocs/op,  14.1 logallocs/op,  45.5 tasks/op,  289509 insns/op,  219602 cycles/op,        0 errors)
15295.04 tps (149.8 allocs/op,  14.1 logallocs/op,  45.3 tasks/op,  288021 insns/op,  218545 cycles/op,        0 errors)
15162.01 tps (149.8 allocs/op,  14.1 logallocs/op,  45.4 tasks/op,  291265 insns/op,  220451 cycles/op,        0 errors)
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
5e228a8387 sstables: mx: index_reader: Extract partition_index_page::clear_gently()
There will be more elements to clear. And partition_index_page should
know how to clear itself.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
2d77e4fc28 sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
The std::optional<> adds 8 bytes.

And dht::token adds 8 bytes due to _kind, which in this case is always
kind::key.

The size changed from 56 to 48 bytes.
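The arithmetic can be illustrated with hypothetical stand-in types (these are not the actual Scylla definitions, and the sizes assume a typical 64-bit ABI):

```cpp
#include <cstdint>
#include <optional>

// Illustrative layouts only, not the real dht::token/raw_token types.
enum class token_kind : uint8_t { before_all_keys, key, after_all_keys };

struct token_with_kind {      // like dht::token: kind tag + 64-bit value
    token_kind kind;
    uint64_t value;           // padding rounds the struct up to 16 bytes
};

struct raw_token {            // kind is implicitly token_kind::key
    uint64_t value;           // 8 bytes, no padding
};

static_assert(sizeof(raw_token) == 8);
// The kind tag costs 8 bytes via alignment padding...
static_assert(sizeof(token_with_kind) == 2 * sizeof(raw_token));
// ...and wrapping in std::optional<> costs another 8.
static_assert(sizeof(std::optional<raw_token>) == 2 * sizeof(raw_token));
```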
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
e9c98274b5 sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
If the page has many entries, we continuously enter and leave the
allocating section for every key. This can be avoided by batching LSA
operations for the whole page, after collecting all the entries.

Later optimizations will also build on this, where we will allocate
fragmented storage for keys in LSA using a single managed_bytes
constructor.

This alone brings only a minor improvement, but it does reduce LSA
allocations, probably due to less frequent memory reclamation:

  scylla perf-simple-query -c1 -m200M --duration 6000 --partitions=1000000

Before:

  9560.42 tps (172.2 allocs/op,  19.6 logallocs/op,  57.7 tasks/op,  567741 insns/op,  345158 cycles/op,        0 errors)
  9445.95 tps (173.1 allocs/op,  19.7 logallocs/op,  58.1 tasks/op,  579075 insns/op,  352173 cycles/op,        0 errors)
  9576.75 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  572004 insns/op,  347373 cycles/op,        0 errors)
  9597.16 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  569615 insns/op,  346618 cycles/op,        0 errors)
  9454.07 tps (173.5 allocs/op,  19.8 logallocs/op,  58.3 tasks/op,  579213 insns/op,  351569 cycles/op,        0 errors)

After:

  9562.21 tps (172.0 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  499225 insns/op,  347832 cycles/op,        0 errors)
  9480.20 tps (172.3 allocs/op,  17.0 logallocs/op,  55.9 tasks/op,  507271 insns/op,  350640 cycles/op,        0 errors)
  9512.42 tps (172.1 allocs/op,  17.0 logallocs/op,  55.9 tasks/op,  504247 insns/op,  350392 cycles/op,        0 errors)
  9498.45 tps (172.4 allocs/op,  17.1 logallocs/op,  55.9 tasks/op,  505765 insns/op,  350320 cycles/op,        0 errors)
  9076.30 tps (173.5 allocs/op,  17.1 logallocs/op,  56.5 tasks/op,  512791 insns/op,  354792 cycles/op,        0 errors)
  9542.62 tps (171.9 allocs/op,  17.0 logallocs/op,  55.8 tasks/op,  502532 insns/op,  348922 cycles/op,        0 errors)
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
0e0f9f41b3 sstables: mx: index_reader: Keep index_entry directly in the vector
Partition index entries are relatively small, and if the workload has
small partitions, index pages have a lot of elements. Currently, index
entries are indirected via managed_ref, which causes increased cost of
LSA eviction and compaction. This patch amortizes this cost by storing
them directly in the managed_chunked_vector.
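In miniature, the layout change looks like this (hypothetical entry type; the real code uses LSA-managed containers rather than std::vector):

```cpp
#include <memory>
#include <vector>

// Illustrative stand-in for an index entry, not the real index_entry.
struct entry { long token; long position; };

// Entries stored inline are contiguous and move as one block; the
// managed_ref-style indirection the patch removes would add one
// separately tracked allocation (and relocation target) per element:
//   std::vector<std::unique_ptr<entry>> page;   // one allocation each
bool inline_entries_are_contiguous() {
    std::vector<entry> page;          // entries stored directly
    page.push_back({1, 10});
    page.push_back({2, 20});
    return &page[1] == &page[0] + 1;  // contiguous inline storage
}
```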

This gives about 23% improvement in throughput in perf-simple-query
for a workload where the index doesn't fit in memory:

  scylla perf-simple-query -c1 -m200M --duration 6000 --partitions=1000000

Before:

  7774.96 tps (166.0 allocs/op, 521.7 logallocs/op,  54.0 tasks/op,  802428 insns/op,  430457 cycles/op,        0 errors)
  7511.08 tps (166.1 allocs/op, 527.2 logallocs/op,  54.0 tasks/op,  804185 insns/op,  430752 cycles/op,        0 errors)
  7740.44 tps (166.3 allocs/op, 526.2 logallocs/op,  54.2 tasks/op,  805347 insns/op,  432117 cycles/op,        0 errors)
  7818.72 tps (165.2 allocs/op, 517.6 logallocs/op,  53.7 tasks/op,  794965 insns/op,  427751 cycles/op,        0 errors)
  7865.49 tps (165.1 allocs/op, 513.3 logallocs/op,  53.6 tasks/op,  788898 insns/op,  425171 cycles/op,        0 errors)

After:

  9560.42 tps (172.2 allocs/op,  19.6 logallocs/op,  57.7 tasks/op,  567741 insns/op,  345158 cycles/op,        0 errors)
  9445.95 tps (173.1 allocs/op,  19.7 logallocs/op,  58.1 tasks/op,  579075 insns/op,  352173 cycles/op,        0 errors)
  9576.75 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  572004 insns/op,  347373 cycles/op,        0 errors)
  9597.16 tps (172.2 allocs/op,  19.6 logallocs/op,  57.6 tasks/op,  569615 insns/op,  346618 cycles/op,        0 errors)
  9454.07 tps (173.5 allocs/op,  19.8 logallocs/op,  58.3 tasks/op,  579213 insns/op,  351569 cycles/op,        0 errors)

Disabling the partition index doesn't improve the throughput beyond
that.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
b6bfdeb111 dht: Introduce raw_token
Most tokens stored in data structures are for key-scoped tokens, and
we don't need to pay for token::kind storage.
2026-03-18 16:25:20 +01:00
Tomasz Grabiec
3775593e53 test: perf_simple_query: Add 'sstable-format' command-line option 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
6ee9bc63eb test: perf_simple_query: Add 'sstable-summary-ratio' command-line option 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
38d130d9d0 test: perf-simple-query: Add option to disable index cache 2026-03-18 16:25:20 +01:00
Tomasz Grabiec
5ee61f067d test: cql_test_env: Respect enable-index-cache config
Mirrors the code in main.cc
2026-03-18 16:25:20 +01:00
Aleksandra Martyniuk
2d16083ba6 tasks: fix indentation 2026-03-18 15:37:24 +01:00
Aleksandra Martyniuk
1fbf3a4ba1 tasks: do not fail the wait request if rpc fails
During decommission, we first mark a topology request as done, then shut
down the node, and in the following steps we remove the node from the
topology. Thus, a finished request does not imply that the node has been
removed from the topology.

Due to that, in node_ops_virtual_task::wait, while gathering children
from the whole cluster, we may hit a connection exception, because
the node is still in the topology even though it is down.

Modify the get_children method to ignore the exception and warn
about the failure instead.
2026-03-18 15:37:24 +01:00
Aleksandra Martyniuk
d4fdeb4839 tasks: pass token_metadata_ptr to task_manager::virtual_task::impl::get_children
In get_children we get the vector of alive nodes with get_nodes.
Yet, between this and sending RPCs to those nodes there might be
a preemption. Currently, the liveness of a node is checked once
again before the RPCs (only with the gossiper, not the topology,
unlike get_nodes).

Modify get_children, so that it keeps a token_metadata_ptr,
preventing topology from changing between get_nodes and rpcs.

Remove test_get_children, as it checked that the get_children method
won't fail if a node goes down after get_nodes, which cannot happen
currently.
2026-03-18 15:37:24 +01:00
Calle Wilund
0013f22374 memtable_test::memtable_flush_period: Change sleep to use injection signal instead
Fixes: SCYLLADB-942

Adds an injection signal _from_ table::seal_active_memtable to allow us to
reliably wait for flushing. And does so.

Closes scylladb/scylladb#29070
2026-03-18 16:23:13 +02:00
Botond Dénes
ae17596c2a Merge 'Demote log level on split failure during shutdown' from Raphael Raph Carvalho
Since commit 509f2af8db, gate_closed_exception can be triggered for ongoing split during shutdown. The commit is correct, but it causes split failure on shutdown to log an error, which causes CI instability. Previously, aborted_exception would be triggered instead which is logged as warning. Let's do the same.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-951.
Fixes https://github.com/scylladb/scylladb/issues/24850.

Only 2026.1 is affected.

Closes scylladb/scylladb#29032

* github.com:scylladb/scylladb:
  replica: Demote log level on split failure during shutdown
  service: Demote log level on split failure during shutdown
2026-03-18 16:21:05 +02:00
Pavel Emelyanov
8b1ca6dcd6 database: Rate limit all tokens from a range
The limiter scans ranges to decide whether or not to rate-limit the
query. However, when considering each range, only its front token
is accounted for. This looks like a misprint.
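A hedged sketch of the before/after accounting (hypothetical types; the actual limiter operates on dht::token ranges):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-ins, not the real limiter code.
using token_t = uint64_t;
using range_t = std::vector<token_t>;

// Before the fix: only the front token of each range was accounted.
size_t tokens_accounted_before(const std::vector<range_t>& ranges) {
    size_t n = 0;
    for (const auto& r : ranges) {
        if (!r.empty()) {
            n += 1;            // front token only
        }
    }
    return n;
}

// After the fix: every token in every scanned range is accounted.
size_t tokens_accounted_after(const std::vector<range_t>& ranges) {
    size_t n = 0;
    for (const auto& r : ranges) {
        n += r.size();
    }
    return n;
}
```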

The limiter was introduced in cc9a2ad41f

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29050
2026-03-18 13:50:48 +01:00
Pavel Emelyanov
d68c92ec04 test: Replace a bunch of ternary operators with an if-else block
A followup of the merge of two test cases that happened in the previous
patch. Both used `foo = N if domain == bar else M` to evaluate the
parameters for topology. Using if-else block makes it immediately obvious
which topology and scope apply for each domain value without having to
evaluate multiple inline conditionals.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:08:36 +03:00
Pavel Emelyanov
b1d4fc5e6e test: Squash test_restore_primary_replica_same|different_domain tests
The two tests differ only in the way they set up the topology for the
cluster and the post-restore checks against the resulting streams.

The merge happens with the help of a "scope_is_same" boolean parameter
and corresponding updates in the topology setup and post-checks.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:08:36 +03:00
Pavel Emelyanov
21c603a79e test: Use the same regexp in test_restore_primary_replica_different|same_domain-s
The one in the "different domain" test is simpler because that test
performs fewer checks. The next patch will merge both tests, and making
the regexps identical makes the merge even smoother.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-18 13:07:09 +03:00
Emil Maskovsky
34f3916e7d .github: update test instructions for unified pytest runner
Update test running instructions to reflect unified pytest-based runner.

The test.py now requires full test paths with file extensions for both
C++ and Python tests.

No backport: The change is only relevant for recent test.py changes in
master.

Closes scylladb/scylladb#29062
2026-03-18 09:28:28 +01:00
Marcin Maliszkiewicz
04bf631d7f auth: switch query_attribute_for_all to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
cf578fd81a auth: switch get_attribute to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
06d16b6ea2 auth: cache: add heterogeneous map lookups
Some callers have only string_view role name,
they shouldn't need to allocate sstring to do the
lookup.
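The idiom is standard C++14 heterogeneous lookup via a transparent comparator; a sketch (the actual cache type is Scylla-specific):

```cpp
#include <map>
#include <string>
#include <string_view>

// With std::less<> (a transparent comparator), find() accepts a
// std::string_view directly, so callers holding only a view need not
// allocate an sstring/std::string temporary for the lookup.
// Hypothetical map, not the real auth cache.
std::map<std::string, int, std::less<>> roles = {
    {"cassandra", 1},
    {"alice", 2},
};

int lookup(std::string_view name) {
    auto it = roles.find(name);    // no std::string temporary created
    return it == roles.end() ? -1 : it->second;
}
```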
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
7fdb1118f5 auth: switch query_all to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
fca11c5a21 auth: switch query_all_directly_granted to use cache 2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
6f682f7eb1 auth: cache: add ability to go over all roles
This is needed to implement the auth service API where
we list all roles.
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
61952cd985 raft: service: reload auth cache before service levels
Since service levels depend on auth data, and not the other
way around, we need to ensure a proper loading order.
2026-03-18 09:06:20 +01:00
Marcin Maliszkiewicz
c4cfb278bc service: raft: move update_service_levels_effective_cache check
The auth::cache::includes_table function also covers role_members and
role_attributes. The existing check was removed because it blocked these
tables from triggering necessary cache updates.

While previously non-critical (due to unused attributes and table coupling),
maintaining a correct cache is essential for upcoming changes.
2026-03-18 09:06:20 +01:00
Benny Halevy
c2a6d1e930 test/boost/database_test: add test_snapshot_ctl_details_exception_handling
Verify that the directory listers opened by get_snapshot_details
are properly closed when handling an (injected) exception.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:37:44 +02:00
Benny Halevy
6dc4ea766b table: get_snapshot_details: fix indentation inside try block
Whitespace-only change: indent the loop body one level inside the
try block added in the previous commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:28:50 +02:00
Benny Halevy
b09d45b89a table: per-snapshot get_snapshot_details: fix typo in comment
The comment says the snapshot directory may contain a `schema.sql` file,
but the code treats `schema.cql` as the special-case schema file.

Reported-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:27:40 +02:00
Benny Halevy
580cc309d2 table: per-snapshot get_snapshot_details: always close lister using try/catch
Since this is a coroutine, we cannot just use deferred_close;
instead we need to catch the error, close the lister, and then
return the error, if applicable.
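The shape of the pattern, with a stand-in lister type (the real code awaits an asynchronous close inside the coroutine):

```cpp
#include <exception>
#include <stdexcept>

// Illustrative stand-in: close() here represents co_await lister.close().
struct lister {
    bool closed = false;
    void close() { closed = true; }
};

// Catch the error, close the lister unconditionally, then rethrow.
// A deferred_close guard is not usable when close() must be awaited.
bool run(bool fail, lister& l) {
    std::exception_ptr ex;
    try {
        if (fail) {
            throw std::runtime_error("injected");
        }
    } catch (...) {
        ex = std::current_exception();
    }
    l.close();                       // always close, error or not
    if (ex) {
        std::rethrow_exception(ex);
    }
    return true;
}
```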

Fixes: SCYLLADB-1013

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:27:23 +02:00
Benny Halevy
78c817f71e table: get_snapshot_details: always close lister using deferred_close
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2026-03-18 09:26:26 +02:00
Dario Mirovic
71e6918f28 test: use NetworkTopologyStrategy in maintenance socket tests
NetworkTopologyStrategy is the preferred choice. We should not
use SimpleStrategy anymore. This patch changes the topology strategy
for all the maintenance socket tests.

Refs SCYLLADB-1070
2026-03-17 20:20:47 +01:00
Dario Mirovic
278535e4e3 test: use cleanup fixture in maintenance socket auth tests
Add a cql_clusters pytest fixture that tracks CQL driver Cluster
objects and shuts them down automatically after test completion.
This replaces manual shutdown() calls at the end of each test.

Also consolidate shutdown() calls in retry helpers into finally
blocks for consistent cleanup.

Refs SCYLLADB-1070
2026-03-17 20:15:30 +01:00
Dario Mirovic
2e4b72c6b9 auth: add maintenance_socket_authorizer
GRANT/REVOKE fails on the maintenance socket connections,
because maintenance_auth_service uses allow_all_authorizer.
allow_all_authorizer allows all operations, but not GRANT/REVOKE,
because they make no sense in its context.

This has been observed during PGO run failure in operations from
./pgo/conf/auth.cql file.

This patch introduces maintenance_socket_authorizer that supports
the capabilities of default_authorizer ('CassandraAuthorizer')
without needing authorization.

Refs SCYLLADB-1070
2026-03-17 19:19:41 +01:00
Botond Dénes
172c786079 Merge 'perf-alternator: wait for alternator port before running workload' from Marcin Maliszkiewicz
This patch is mostly for the purpose of running pgo CI job.

We may receive a connection error if the asyncio.sleep(5) in
pgo.py is not a sufficient waiting time.

In pgo.py we do wait for a port, but only for CQL; in any case,
it's better to have a high-level check here than to wait for the
alternator port there.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1071
Backport: 2026.1 - it failed on CI for that build

Closes scylladb/scylladb#29063

* github.com:scylladb/scylladb:
  perf: add abort_source support to wait-for-port loops
  perf-alternator: wait for alternator port before running workload
2026-03-17 18:38:11 +02:00
Botond Dénes
5d868dcc55 Merge 's3_client: fix s3::range max value for object size' from Ernest Zaslavsky
- fix s3::range max value for object size, which is 50TiB and not 5TiB.
- refactor constants to make them accessible to all interested parties; also reuse these constants in tests

No need to backport, doubt we will encounter an object larger than 5TiB

Closes scylladb/scylladb#28601

* github.com:scylladb/scylladb:
  s3_client: reorganize tests in part_size_calculation_test
  s3_client: switch using s3 limits constants in tests
  s3_client: fix the s3::range max object size
  s3_client: remove "aws" prefix from object limits constants
  s3_client: make s3 object limits accessible
2026-03-17 16:34:42 +02:00
Anna Stuchlik
f4a6bb1885 doc: remove the Open Source Example from Installation
This commit replaces the Open Source example in the Installation section for CentOS.
We updated the example for Ubuntu, but not for CentOS.
We don't want to have any Open Source information in the docs.

Fixes https://github.com/scylladb/scylladb/issues/29087
2026-03-17 14:54:32 +01:00
Anna Stuchlik
95bc8911dd doc: replace http with https in the installation instructions
Fixes https://github.com/scylladb/scylladb/issues/17227
2026-03-17 14:46:16 +01:00
Dawid Mędrek
a8dd13731f Merge 'Improve debuggability of test/cluster/test_data_resurrection_in_memtable.py' from Botond Dénes
This test was observed to fail in CI recently but there is not enough information in the logs to figure out what went wrong. This PR makes a few improvements to make the next investigation easier, should it be needed:
* storage-service: add table name to mutation write failure error messages.
* database: the `database_apply` error injection used to cause trouble, catching writes to bystander tables, making tests flaky. To eliminate this, it gained a filter to apply only to non-system keyspaces. Unfortunately, this still allows it to catch writes to the trace tables. While this should not fail the test, it reduces observability, as some traces disappear. Improve this error injection to apply only to the selected table. Also merge it with the `database_apply_wait` error injection, to streamline the code a bit.
* test/test_data_resurrection_in_memtable.py: dump data from the table before the checks for expected data, so that if the checks fail, the data in the table is known.

Refs: SCYLLADB-812
Refs: SCYLLADB-870
Fixes: SCYLLADB-1050 (by restricting `database_apply` error injection, so it doesn't affect writes to system traces)

Backport: test related improvement, no backport

Closes scylladb/scylladb#28899

* github.com:scylladb/scylladb:
  test/cluster/test_data_resurrection_in_memtable.py: dump rows before check
  replica/database: consolidate the two database_apply error injections
  service/storage_proxy: add name of table to error message for write errors
2026-03-17 13:35:19 +01:00
Botond Dénes
318aa07158 Merge ' test/alternator: use module-scope fixtures in test_streams.py ' from Nadav Har'El
Previously, all stream-table fixtures in test_streams.py used scope="function",
forcing a fresh table to be created for every test, slowing down the test a bit
(though not much), and discouraging writing small new tests.

 This was a workaround for a DynamoDB quirk (that Alternator doesn't have):
LATEST shard iterators have a time slack and may point slightly before  the true
stream head, causing leftover events from a previous test to appear in the next
test's reads.

The first two tests in this series fix small problems that turn up once we start
sharing test tables in test_streams.py. The final patch fixes the "LATEST" problem
and enables sharing the test table by using "module" scope fixtures instead of
"function".

After this series, test_streams.py run time went down a bit, from 20.2 seconds to 17.7 seconds.

Closes scylladb/scylladb#28972

* github.com:scylladb/scylladb:
  test/alternator: speed up test_streams.py by using module-scope fixtures
  test/alternator: test_streams.py don't use fixtures in 4 tests
  test/alternator: fix do_test() in test_streams.py
2026-03-17 13:56:16 +02:00
Ernest Zaslavsky
7f597aca67 cmake: fix broken build
Add raft_util.idl.hh to cmake to generate the code properly

Closes scylladb/scylladb#29055
2026-03-17 10:35:34 +01:00
Botond Dénes
dbe70cddca test/boost/querier_cache_test: make test_time_based_cache_eviction less sensitive to timing
This test relies on the cache entry being evicted after 200ms past the
TTL. This may not happen on a busy CI machine. Make the test less
reliant on timing by using eventually_true().
Simplify the test by dropping the second entry; it doesn't add anything
to the test.
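A minimal eventually_true() in the spirit described (the real helper lives in Scylla's test framework; this sketch polls with std::thread sleeps):

```cpp
#include <chrono>
#include <functional>
#include <thread>

// Poll the condition until it holds or the deadline passes, instead of
// asserting after one fixed sleep. This makes timing-sensitive tests
// robust on busy machines.
bool eventually_true(const std::function<bool()>& cond,
                     std::chrono::milliseconds timeout = std::chrono::milliseconds(1000),
                     std::chrono::milliseconds step = std::chrono::milliseconds(10)) {
    auto deadline = std::chrono::steady_clock::now() + timeout;
    while (std::chrono::steady_clock::now() < deadline) {
        if (cond()) {
            return true;
        }
        std::this_thread::sleep_for(step);
    }
    return cond();    // one last check at the deadline
}
```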

Fixes: SCYLLADB-811

Closes scylladb/scylladb#28958
2026-03-17 10:32:23 +01:00
Botond Dénes
0fd51c4adb test/nodetool: rest_api_mock_server: add retry for status code 404
This fixture starts the mock server and immediately connects to it to
set up the expected requests. The connection attempt might be too early,
so there is a retry loop with a timeout. The loop currently checks for
requests.exceptions.ConnectionError. We've seen a case where the
connection is successful but the request fails with 404: the mock
started the server but didn't set up the routes yet. Add a retry for
HTTP 404 to handle this.

Fixes: SCYLLADB-966

Closes scylladb/scylladb#29003
2026-03-17 10:30:23 +01:00
Pavel Emelyanov
9fe19ec9d9 sstables: Fix object storage lister not resetting position in batch vector

The lister loop in get() pre-fetches records in batches and keeps them
in an _info vector, iterating over it with the help of the _pos cursor.
When the vector is re-read, the cursor must be reset too.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-17 10:32:42 +03:00
Pavel Emelyanov
1a6a7647c6 sstables: Fix object storage lister skipping entries when filter is active
The lister loop in the get() method looks odd. It uses a do-while(false)
loop and calls continue; inside it when the filter asks to skip an entry.
Skipping thus aborts the whole thing and EOF-s, which is not what's
supposed to happen.
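The pitfall in miniature (a stand-in lister, not the actual code): `continue` inside do { } while (false) jumps to the loop condition, which is false, so the loop exits and falls through to the EOF path:

```cpp
#include <optional>
#include <vector>

// get() is meant to return the next entry passing the filter
// (non-negative values here), or nullopt at EOF.
struct lister {
    std::vector<int> entries;
    std::size_t pos = 0;

    // Buggy: `continue` was meant as "try the next entry", but it jumps
    // to while(false), ending the loop, so every entry after a filtered
    // one is silently dropped as EOF.
    std::optional<int> get_buggy() {
        do {
            if (pos == entries.size()) break;
            int e = entries[pos++];
            if (e < 0) continue;      // exits the do-while(false)!
            return e;
        } while (false);
        return std::nullopt;          // EOF
    }

    // Fixed: a real loop, so skipping advances to the next entry.
    std::optional<int> get_fixed() {
        while (pos != entries.size()) {
            int e = entries[pos++];
            if (e < 0) continue;      // skip filtered entry, keep scanning
            return e;
        }
        return std::nullopt;
    }
};
```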

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-17 10:32:40 +03:00
Marcin Maliszkiewicz
9318c80203 perf: add abort_source support to wait-for-port loops
Check abort_source on each retry iteration in
wait_for_alternator and wait_for_cql so the
wait can be interrupted on shutdown.

Didn't use sleep_abortable as the sleep is very short
anyway.
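The shape of the change (hypothetical wait loop; the real code uses seastar::abort_source rather than a raw atomic flag):

```cpp
#include <atomic>

// Check the abort flag on every retry iteration so a shutdown request
// interrupts the wait instead of letting it spin through all retries.
bool wait_for_port(std::atomic<bool>& aborted, int max_retries,
                   bool (*try_connect)()) {
    for (int i = 0; i < max_retries; ++i) {
        if (aborted.load()) {
            return false;            // interrupted by shutdown
        }
        if (try_connect()) {
            return true;             // port is up
        }
        // short sleep between retries elided
    }
    return false;                    // retries exhausted
}
```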
2026-03-16 16:14:10 +01:00
Marcin Maliszkiewicz
edf0148bee perf-alternator: wait for alternator port before running workload
This patch is mostly for the purpose of running pgo CI job.

We may receive a connection error if the asyncio.sleep(5) in
pgo.py is not a sufficient waiting time.

In pgo.py we do wait for a port, but only for CQL; in any case,
it's better to have a high-level check here than to wait for the
alternator port there.
2026-03-16 16:07:52 +01:00
Raphael S. Carvalho
ee87b66033 replica: Demote log level on split failure during shutdown
Dtest failed with:

table - Failed to load SSTable .../me-3gyn_0qwi_313gw2n2y90v2j4fcv-big-Data.db
of origin memtable due to std::runtime_error (Cannot split
.../me-3gyn_0qwi_313gw2n2y90v2j4fcv-big-Data.db because manager has compaction
disabled, reason might be out of space prevention), it will be unlinked...

The reason is that the error above is being triggered when the cause is
shutdown, not out of space prevention. Let's distinguish between the two
cases and log the error with warning level on shutdown.

Fixes https://github.com/scylladb/scylladb/issues/24850.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2026-03-16 12:03:17 -03:00
Raphael S. Carvalho
b508f3dd38 service: Demote log level on split failure during shutdown
Since commit 509f2af8db, gate_closed_exception can be triggered
for ongoing split during shutdown. The commit is correct, but it
causes split failure on shutdown to log an error, which causes
CI instability. Previously, aborted_exception would be triggered
instead which is logged as warning. Let's do the same.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-951.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2026-03-16 11:52:00 -03:00
Karol Nowacki
7659a5b878 vector_search: test: fix flaky test
The test assumes that the sleep duration will be at least the value of
the sleep parameter. However, the actual sleep time can be slightly less
than requested (e.g., a 100ms sleep request might result in a 99ms
sleep).

This commit adjusts the test's time comparison to be more lenient,
preventing test flakiness.
2026-03-13 16:28:22 +01:00
Karol Nowacki
5474cc6cc2 vector_search: fix race condition on connection timeout
When a `with_connect` operation timed out, the underlying connection
attempt continued to run in the reactor. This could lead to a crash
if the connection was established/rejected after the client object had
already been destroyed. This issue was observed during the teardown
phase of an upcoming high-availability test case.

This commit fixes the race condition by ensuring the connection attempt
is properly canceled on timeout.

Additionally, the explicit TLS handshake previously forced during the
connection is now deferred to the first I/O operation, which is the
default and preferred behavior.

Fixes: SCYLLADB-832
2026-03-13 16:28:22 +01:00
Andrzej Jackowski
60aaea8547 cql: improve write consistency level guardrail messages
Update warn and fail messages for the write_consistency_levels_warned
and write_consistency_levels_disallowed guardrails to include the
configuration option name and actionable guidance. The main motivation
is to make the messages follow the conventions of other guardrails.

Refs: SCYLLADB-257
2026-03-13 14:40:45 +01:00
Tomasz Grabiec
1256a9faa7 tablets: Fix deadlock in background storage group merge fiber
When it deadlocks, groups stop merging and compaction group merge
backlog will run-away.

Also, graceful shutdown will be blocked on it.

Found by flaky unit test
test_merge_chooses_best_replica_with_odd_count, which timed-out in 1
in 100 runs.

Reason for deadlock:

When storage groups are merged, the main compaction group of the new
storage group takes a compaction lock, which is appended to
_compaction_reenablers_for_merging, and released when the merge
completion fiber is done with the whole batch.

If we accumulate more than 1 merge cycle for the fiber, deadlock
occurs. Lock order will be this

Initial state:

 cg0: main
 cg1: main
 cg2: main
 cg3: main

After 1st merge:

 cg0': main [locked], merging_groups=[cg0.main, cg1.main]
 cg1': main [locked], merging_groups=[cg2.main, cg3.main]

After 2nd merge:

 cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]

merge completion fiber will try to stop cg0'.main, which will be
blocked on compaction lock. which is held by the reenabler in
_compaction_reenablers_for_merging, hence deadlock.

The fix is to wait for background merge to finish before we start the
next merge. It's achieved by holding old erm in the background merge,
and doing a topology barrier from the merge finalizing transition.

Background merge is supposed to be a relatively quick operation: it
stops compaction groups, so it may wait for active requests, but it
shouldn't prolong the barrier indefinitely.

Tablet boost unit tests which trigger merge need to be adjusted to
call the barrier, otherwise they will be vulnerable to the deadlock.

Two cluster tests were removed because they assumed that merge happens
in the background. Now that it happens as part of merge finalization,
and blocks topology state machine, those tests deadlock because they
are unable to make topology changes (node bootstrap) while background
merge is blocked.

The test "test_tablets_merge_waits_for_lwt" needed to be adjusted. It
assumed that merge finalization doesn't wait for the erm held by the
LWT operation, and triggered tablet movement afterwards, and assumed
that this migration will issue a barrier which will block on the LWT
operation. After this commit, it's the barrier in merge finalization
which is blocked. The test was adjusted to use an earlier log mark
when waiting for "Got raft_topology_cmd::barrier_and_drain", which
will catch the barrier in merge finalization.

Fixes SCYLLADB-928
2026-03-12 22:45:01 +01:00
Tomasz Grabiec
7706c9e8c4 replica: table: Propagate old erm to storage group merge 2026-03-12 22:45:01 +01:00
Tomasz Grabiec
582a4abeb6 test: boost: tablets_test: Save tablet metadata when ACKing split resize decision
Needs to be ordered before split finalization, because storage_group
must be in split mode already at finalization time. There must be
split-ready compaction groups, otherwise finalization fails with this
error:

  Found 0 split ready compaction groups, but expected 2 instead.

Exposed by increased split activity in tests.
2026-03-12 22:45:01 +01:00
Tomasz Grabiec
279fcdd5ff storage_service: Extract local_topology_barrier()
Will be called in tests. It does the local part of the global topology
barrier.

The comment:

        // We capture the topology version right after the checks
        // above, before any yields. This is crucial since _topology_state_machine._topology
        // might be altered concurrently while this method is running,
        // which can cause the fence command to apply an invalid fence version.

was dropped, because it's no longer true after
fad6c41cee, and it doesn't make sense in
the context of local_topology_barrier(). We'd have to propagate the
version to local_topology_barrier(), but it's pointless. The fence
version is decided before calling the local barrier, and it will be
valid even if local version moves ahead.
2026-03-12 22:44:56 +01:00
Nadav Har'El
92ee959e9b test/alternator: speed up test_streams.py by using module-scope fixtures
Previously, all stream-table fixtures in this test file used
scope="function", forcing a fresh table to be created for every test,
slowing down the test a bit (though not much), and discouraging writing
small new tests.

This was a workaround for a DynamoDB quirk (that Alternator doesn't have):
LATEST shard iterators have a time slack and may point slightly before
the true stream head, causing leftover events from a previous test to
appear in the next test's reads.

We fix this by draining the stream inside latest_iterators() and
shards_and_latest_iterators() after obtaining the LATEST iterators:
fetch records in a loop until two consecutive polling rounds both return
empty, guaranteeing the iterators are positioned past all pre-existing
events before the caller writes anything.  With this guarantee in place,
all stream-table fixtures can safely use scope="module".
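The drain-until-quiet loop described above can be sketched roughly like this (a sketch against the boto3 DynamoDB Streams API; the helper name and the `max_rounds` safety cap are assumptions, not the actual fixture code):

```python
def drain(client, iterator, max_rounds=100):
    """Advance a LATEST shard iterator past any pre-existing events.

    Polls the shard repeatedly and only stops once two consecutive
    rounds both come back empty, so leftover events written by a
    previous test cannot leak into the next test's reads.
    """
    empty_rounds = 0
    for _ in range(max_rounds):
        resp = client.get_records(ShardIterator=iterator)
        iterator = resp['NextShardIterator']
        if resp['Records']:
            empty_rounds = 0          # saw leftovers; keep draining
        else:
            empty_rounds += 1
            if empty_rounds == 2:     # two quiet rounds in a row
                break
    return iterator
```

Requiring two consecutive empty rounds (rather than one) guards against the time slack mentioned above, where a single empty poll can land just before delayed events become visible.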

After this patch, test_streams.py continues to pass on DynamoDB.
On Alternator, the test file's run time went down a bit, from
20.2 seconds to 17.7 seconds.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-10 17:14:04 +02:00
Nadav Har'El
6ac1f1333f test/alternator: test_streams.py don't use fixtures in 4 tests
In the next patch, we plan to make the fixtures in test_streams.py
shared between tests. Most tests work well with shared tables, but two
(test_streams_trim_horizon and test_streams_starting_sequence_number)
were written to expect a new table with an empty history, and two
others (test_streams_closed_read and test_streams_disabled_stream) want
to disable streaming and would break a shared table.

So in this patch we modify these four tests to create their own new
table instead of using a fixture.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-10 17:12:33 +02:00
Nadav Har'El
16e7a88a02 test/alternator: fix do_test() in test_streams.py
Many tests in test/alternator/test_streams.py use a do_test() function
which performs a user-defined function that runs some write requests,
and then verifies that the expected output appears on the stream.

Because DynamoDB drops do-nothing changes from the stream - such as
writing to an item a value that it already has - these tests need to
write to a different item each time, so do_test() invents a random key
and passes it to the user-defined function to use. But... we had a bug:
the random number generation was done only once, instead of every time.
The fix is to do the random number generation on every call.
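The once-vs-every-call distinction can be illustrated like this (whether the original bug came from a default argument or a cached module-level value is an assumption; the pattern is the same either way):

```python
import random

# Buggy pattern: the default argument is evaluated once, at function
# definition time, so every call to do_test() reuses the same key.
def do_test_buggy(test_func, p=random.randint(1, 1000000000)):
    return test_func(p)

# Fixed pattern: generate a fresh random key on every call, so repeated
# tests against a shared table always write to a different item.
def do_test_fixed(test_func, p=None):
    if p is None:
        p = random.randint(1, 1000000000)
    return test_func(p)
```

With a brand-new table per test the stale key was harmless; with a shared table, a repeated key turns the second write into a do-nothing change that DynamoDB drops from the stream.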

We never noticed this bug when each test used a brand new table. But the
next patch will make the tests share the test table, and tests start
to fail. It's especially visible if you run the same test twice against
DynamoDB, e.g.,

test/alternator/run --count 2 --aws \
    test_streams.py::test_streams_putitem_keys_only

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-03-09 19:21:53 +02:00
Łukasz Paszkowski
147b355326 replica/table: avoid computing token range side in storage_group_of() on hot path
`storage_group_of()` is on the replica-side token lookup hot path but
used `tablet_map::get_tablet_id_and_range_side()`, which computes both
tablet id and post-split range side.

Most callers only need the storage group id. Switch `storage_group_of()`
to use `get_tablet_id()` via `tablet_id_for_token()`, and select the
compaction group via new overloads that compute the range side only
when splitting mode is active.
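The lazy pattern can be modeled in a few lines (the bit layout and all names here are purely illustrative assumptions, not the actual `tablet_map` code):

```python
def tablet_id_for_token(token, log2_tablets):
    # Cheap lookup: the tablet id is just the top bits of the 64-bit token.
    return token >> (64 - log2_tablets)

def range_side_for_token(token, log2_tablets):
    # One extra bit decides which post-split half the token falls in.
    return (token >> (64 - log2_tablets - 1)) & 1

def select_compaction_group(token, log2_tablets, splitting):
    tid = tablet_id_for_token(token, log2_tablets)
    if not splitting:
        return (tid, None)  # hot path: the side is never computed
    return (tid, range_side_for_token(token, log2_tablets))
```

The point of the commit is exactly this shape: the common, non-splitting path pays only for the tablet-id shift and skips the range-side computation entirely.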
2026-03-09 17:59:36 +01:00
Łukasz Paszkowski
419e9aa323 replica/compaction_group: add lazy select_compaction_group() overloads
Change `storage_group::select_compaction_group()` to accept a token
(and tablet_map) and compute the tablet range side only when
splitting_mode() is active.

Add an overload for selecting the compaction group for an sstable
spanning a token range.
2026-03-09 17:59:36 +01:00
Łukasz Paszkowski
3f70611504 locator/tablets: add tablet_map::get_tablet_range_side()
Add `tablet_map::get_tablet_range_side(token)` to compute the
post-split range side without computing the tablet id.

Pure addition, no behavior change.
2026-03-09 17:59:36 +01:00
Jakub Smolar
7cdd979158 db/config: announce ms format as highest supported
Uncomment the feature flag check in get_highest_supported_format()
to return MS format when supported, otherwise fall back to ME.
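In Python-flavoured pseudocode, the selection rule amounts to the following (the format names come from the commit message; the function shape is an assumption):

```python
def get_highest_supported_format(ms_feature_enabled):
    # Announce the trie-indexed 'ms' format only once the cluster-wide
    # feature flag says every node supports it; otherwise fall back
    # to the older 'me' format.
    return 'ms' if ms_feature_enabled else 'me'
```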
2026-03-09 17:12:09 +01:00
Michał Chojnowski
949fc85217 db/config: enable ms sstable format by default
Trie-based sstable indexes are supposed to be (hopefully)
a better default than the old BIG indexes.
Make them the new default.

If we change our mind, this change can be reverted later.
2026-03-09 17:12:09 +01:00
Michał Chojnowski
6b413e3959 cluster/dtest/bypass_cache_test: switch from highest_supported_sstable_format to chosen_sstable_format
Trie-based indexes and older indexes have a difference in metrics,
and the test uses the metrics to check for bypass cache.
To choose the right metrics, it uses highest_supported_sstable_format,
which is inappropriate, because the sstable format chosen for writes
by Scylla might be different than highest_supported_sstable_format.

Use chosen_sstable_format instead.
2026-03-09 17:12:09 +01:00
Michał Chojnowski
b89840c4b9 api/system: add /system/chosen_sstable_version
Returns the sstable version currently chosen for new sstables.

We are adding it because some tests want to know what format they are
writing (tests using upgradesstable, tests which check stats that only
apply to one of the index types, etc).

(Currently they are using `highest_supported_sstable_format` for this
purpose, which is inappropriate, and will become invalid if a non-latest
format is the default).
2026-03-09 17:12:09 +01:00
Michał Chojnowski
9280a039ee test/cluster/dtest: reduce num_tokens to 16
cluster.dtest_alternator_tests.test_slow_query_logging performs
a bootstrap with 768 token ranges.

It works with `me` sstables, which have 2 open file descriptors
per open sstable, but with `ms` sstables, which have 3 open
file descriptors per open sstable, it fails with EMFILE.

To avoid this problem, let's just decrease the number of vnodes
in the test suite. It's appropriate anyway, because it avoids some
unneeded work without weakening the tests.
(Note: pylib-based tests have been setting `num_tokens` to 16 for a long time too).

This breaks `bypass_cache_test`, which is written in a way that expects
a certain number of token ranges. We adjust the relevant parameter
accordingly.
2026-03-09 17:12:09 +01:00
Botond Dénes
cd13a911cc test/cluster/test_data_resurrection_in_memtable.py: dump rows before check
So that if the check of expected rows fails, we have a dump to look at
and see what is different.
2026-03-05 11:44:02 +02:00
Botond Dénes
f375aae257 replica/database: consolidate the two database_apply error injections
Into a single database_apply one. Add three parameters:
* ks_name and cf_name to filter the tables to be affected
* what - what to do: throw or wait

This leads to smaller footprint in the code and improved filtering for
table names at the cost of some extra error injection params in the
tests.
2026-03-05 11:44:02 +02:00
Botond Dénes
44b8cad3df service/storage_proxy: add name of table to error message for write errors
It is useful to know what table the failed write belongs to.
2026-03-05 10:51:12 +02:00
Ernest Zaslavsky
afac984632 s3_client: reorganize tests in part_size_calculation_test
just group all BOOST_REQUIRE_EXCEPTION tests in one block and
remove artificial scopes
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
1a20877afe s3_client: switch using s3 limits constants in tests
instead of using magic numbers, switch to using the s3 limit constants
to make it clearer what is tested and why
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
d763bdabc2 s3_client: fix the s3::range max object size
in the s3::Range class, start using the global s3 constant, for two reasons:
1) uniformity, no need to introduce a semantically identical constant in each class
2) the value was wrong
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
24e70b30c8 s3_client: remove "aws" prefix from object limits constants
remove the "aws" prefix from the object limits constants, since it is
unnecessary when sitting under the s3 namespace
2026-02-18 12:12:04 +02:00
Ernest Zaslavsky
329c156600 s3_client: make s3 object limits accessible
make the s3 limits constants publicly accessible to reuse them later
2026-02-18 12:12:04 +02:00
349 changed files with 24158 additions and 8151 deletions


@@ -55,22 +55,26 @@ ninja build/<mode>/test/boost/<test_name>
ninja build/<mode>/scylla
# Run all tests in a file
./test.py --mode=<mode> <test_path>
./test.py --mode=<mode> test/<suite>/<test_name>.py
# Run a single test case from a file
./test.py --mode=<mode> <test_path>::<test_function_name>
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
# Run all tests in a directory
./test.py --mode=<mode> test/<suite>/
# Examples
./test.py --mode=dev alternator/
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/alternator/
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/cqlpy/test_json.py
# Optional flags
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
```
**Important:**
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
- To run a single test case, append `::<test_function_name>` to the file path
- Add `-v` for verbose output
- Add `--repeat <num>` to repeat a test multiple times


@@ -8,6 +8,9 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7


@@ -7,6 +7,11 @@ on:
- synchronize
- reopened
permissions:
contents: read
pull-requests: write
statuses: write
jobs:
validate_pr_author_email:
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main


@@ -1,4 +1,6 @@
name: Trigger Scylla CI Route
permissions:
contents: read
on:
issue_comment:


@@ -1,5 +1,8 @@
name: Trigger next gating
permissions:
contents: read
on:
push:
branches:


@@ -2,6 +2,12 @@ cmake_minimum_required(VERSION 3.27)
project(scylla)
# Disable CMake's automatic -fcolor-diagnostics injection (CMake 3.24+ adds
# it for Clang+Ninja). configure.py does not add any color diagnostics flags,
# so we clear the internal CMake variable to prevent injection.
set(CMAKE_CXX_COMPILE_OPTIONS_COLOR_DIAGNOSTICS "")
set(CMAKE_C_COMPILE_OPTIONS_COLOR_DIAGNOSTICS "")
list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake
${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)
@@ -51,6 +57,16 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE INTERNAL "")
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
# Global defines matching configure.py
# Since gcc 13, libgcc doesn't need the exception workaround
add_compile_definitions(SEASTAR_NO_EXCEPTION_HACK)
# Hacks needed to expose internal APIs for xxhash dependencies
add_compile_definitions(XXH_PRIVATE_API)
# SEASTAR_TESTING_MAIN is added later (after add_subdirectory(seastar) and
# add_subdirectory(abseil)) to avoid leaking into the seastar subdirectory.
# If SEASTAR_TESTING_MAIN is defined globally before seastar, it causes a
# duplicate 'main' symbol in seastar_testing.
if(is_multi_config)
find_package(Seastar)
# this is atypical compared to standard ExternalProject usage:
@@ -98,10 +114,31 @@ else()
set(Seastar_IO_URING ON CACHE BOOL "" FORCE)
set(Seastar_SCHEDULING_GROUPS_COUNT 21 CACHE STRING "" FORCE)
set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
# Match configure.py's build_seastar_shared_libs: Debug and Dev
# build Seastar as a shared library, others build it static.
if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "Dev")
set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
else()
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
endif()
add_subdirectory(seastar)
target_compile_definitions (seastar
PRIVATE
SEASTAR_NO_EXCEPTION_HACK)
# Coverage mode sets cmake_build_type='Debug' for Seastar
# (configure.py:515), so Seastar's pkg-config output includes sanitizer
# link flags in seastar_libs_coverage (configure.py:2514,2649).
# Seastar's own CMake only activates sanitizer targets for Debug/Sanitize
# configs, so we inject link options on the seastar target for Coverage.
# Using PUBLIC ensures they propagate to all targets linking Seastar
# (but not standalone tools like patchelf), matching configure.py's
# behavior. Compile-time flags and defines are handled globally in
# cmake/mode.Coverage.cmake.
if(CMAKE_BUILD_TYPE STREQUAL "Coverage")
target_link_options(seastar
PUBLIC
-fsanitize=address
-fsanitize=undefined
-fsanitize=vptr)
endif()
endif()
set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
@@ -111,8 +148,10 @@ if(Scylla_ENABLE_LTO)
endif()
find_package(Sanitizers QUIET)
# Match configure.py:2192 — abseil gets sanitizer flags with -fno-sanitize=vptr
# to exclude vptr checks which are incompatible with abseil's usage.
list(APPEND absl_cxx_flags
$<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>>)
$<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>;-fno-sanitize=vptr>)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
list(APPEND ABSL_GCC_FLAGS ${absl_cxx_flags})
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -137,9 +176,38 @@ add_library(absl::headers ALIAS absl-headers)
# unfortunately.
set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)
# Now that seastar and abseil subdirectories are fully processed, add
# SEASTAR_TESTING_MAIN globally. This matches configure.py's global define
# without leaking into seastar (which would cause duplicate main symbols).
add_compile_definitions(SEASTAR_TESTING_MAIN)
# System libraries dependencies
find_package(Boost REQUIRED
COMPONENTS filesystem program_options system thread regex unit_test_framework)
# When using shared Boost libraries, define BOOST_ALL_DYN_LINK (matching configure.py)
if(NOT Boost_USE_STATIC_LIBS)
add_compile_definitions(BOOST_ALL_DYN_LINK)
endif()
# CMake's Boost package config adds per-component defines like
# BOOST_UNIT_TEST_FRAMEWORK_DYN_LINK, BOOST_REGEX_DYN_LINK, etc. on the
# imported targets. configure.py only uses BOOST_ALL_DYN_LINK (which covers
# all components), so strip the per-component defines to align the two build
# systems.
foreach(_boost_target
Boost::unit_test_framework
Boost::regex
Boost::filesystem
Boost::program_options
Boost::system
Boost::thread)
if(TARGET ${_boost_target})
# Completely remove all INTERFACE_COMPILE_DEFINITIONS from the Boost target.
# This prevents per-component *_DYN_LINK and *_NO_LIB defines from
# propagating. BOOST_ALL_DYN_LINK (set globally) covers all components.
set_property(TARGET ${_boost_target} PROPERTY INTERFACE_COMPILE_DEFINITIONS)
endif()
endforeach()
target_link_libraries(Boost::regex
INTERFACE
ICU::i18n
@@ -196,6 +264,10 @@ if (Scylla_USE_PRECOMPILED_HEADER)
message(STATUS "Using precompiled header for Scylla - remember to add `sloppiness = pch_defines,time_macros` to ccache.conf, if you're using ccache.")
target_precompile_headers(scylla-precompiled-header PRIVATE "stdafx.hh")
target_compile_definitions(scylla-precompiled-header PRIVATE SCYLLA_USE_PRECOMPILED_HEADER)
# Match configure.py: -fpch-validate-input-files-content tells the compiler
# to check content of stdafx.hh if timestamps don't match (important for
# ccache/git workflows where timestamps may not be preserved).
add_compile_options(-fpch-validate-input-files-content)
endif()
else()
set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)


Submodule abseil updated: d7aaad83b4...255c84dadd


@@ -699,6 +699,17 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
// for such a size.
co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit));
}
// Check the concurrency limit early, before acquiring memory and
// reading the request body, to avoid piling up memory from excess
// requests that will be rejected anyway. This mirrors the CQL
// transport which also checks concurrency before memory acquisition
// (transport/server.cc).
if (_pending_requests.get_count() >= _max_concurrent_requests) {
_executor._stats.requests_shed++;
co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
}
_pending_requests.enter();
auto leave = defer([this] () noexcept { _pending_requests.leave(); });
// JSON parsing can allocate up to roughly 2x the size of the raw
// document, + a couple of bytes for maintenance.
// If the Content-Length of the request is not available, we assume
@@ -760,12 +771,6 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
_executor._stats.unsupported_operations++;
co_return api_error::unknown_operation(fmt::format("Unsupported operation {}", op));
}
if (_pending_requests.get_count() >= _max_concurrent_requests) {
_executor._stats.requests_shed++;
co_return api_error::request_limit_exceeded(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _pending_requests.get_count()));
}
_pending_requests.enter();
auto leave = defer([this] () noexcept { _pending_requests.leave(); });
executor::client_state client_state(service::client_state::external_tag(),
_auth_service, &_sl_controller, _timeout_config.current_values(), req->get_client_address());
if (!username.empty()) {
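The hunk above moves the admission check ahead of memory acquisition and body parsing; the shape of that ordering, as a hedged Python sketch (the counter and stat names are assumptions, not the server's actual members):

```python
class Admission:
    """Reject excess requests before buying memory for them, mirroring
    the check-concurrency-first ordering introduced in the diff above."""
    def __init__(self, max_concurrent):
        self.max_concurrent = max_concurrent
        self.pending = 0
        self.shed = 0

    def handle(self, read_body, process):
        # Check the in-flight count *before* reading/allocating the body,
        # so requests that will be shed never pile up memory.
        if self.pending >= self.max_concurrent:
            self.shed += 1
            return 'request_limit_exceeded'
        self.pending += 1
        try:
            return process(read_body())
        finally:
            self.pending -= 1
```

Checking after the body is read (the old order) admits the allocation cost of every shed request; checking first bounds memory by the concurrency limit.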


@@ -33,7 +33,7 @@
#include "data_dictionary/data_dictionary.hh"
#include "utils/rjson.hh"
static logging::logger elogger("alternator-streams");
static logging::logger slogger("alternator-streams");
/**
* Base template type to implement rapidjson::internal::TypeHelper<...>:s
@@ -437,7 +437,7 @@ const cdc::stream_id& find_parent_shard_in_previous_generation(db_clock::time_po
if (prev_streams.empty()) {
// something is really wrong - streams are empty
// let's try internal_error in hope it will be notified and fixed
on_internal_error(elogger, fmt::format("streams are empty for cdc generation at {} ({})", prev_timestamp, prev_timestamp.time_since_epoch().count()));
on_internal_error(slogger, fmt::format("streams are empty for cdc generation at {} ({})", prev_timestamp, prev_timestamp.time_since_epoch().count()));
}
auto it = std::lower_bound(prev_streams.begin(), prev_streams.end(), child.token(), [](const cdc::stream_id& id, const dht::token& t) {
return id.token() < t;
@@ -787,16 +787,18 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
struct event_id {
cdc::stream_id stream;
utils::UUID timestamp;
size_t index = 0;
static constexpr auto marker = 'E';
event_id(cdc::stream_id s, utils::UUID ts)
event_id(cdc::stream_id s, utils::UUID ts, size_t index)
: stream(s)
, timestamp(ts)
, index(index)
{}
friend std::ostream& operator<<(std::ostream& os, const event_id& id) {
fmt::print(os, "{}{}:{}", marker, id.stream.to_bytes(), id.timestamp);
fmt::print(os, "{}{}:{}:{}", marker, id.stream.to_bytes(), id.timestamp, id.index);
return os;
}
};
@@ -808,7 +810,19 @@ struct rapidjson::internal::TypeHelper<ValueType, alternator::event_id>
{};
namespace alternator {
namespace {
struct managed_bytes_ptr_hash {
size_t operator()(const managed_bytes *k) const noexcept {
return std::hash<managed_bytes>{}(*k);
}
};
struct managed_bytes_ptr_equal {
bool operator()(const managed_bytes *a, const managed_bytes *b) const noexcept {
return *a == *b;
}
};
}
future<executor::request_return_type> executor::get_records(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.get_records++;
auto start_time = std::chrono::steady_clock::now();
@@ -879,6 +893,12 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto pks = schema->partition_key_columns();
auto cks = schema->clustering_key_columns();
auto base_cks = base->clustering_key_columns();
if (base_cks.size() > 1) {
throw api_error::internal(fmt::format("invalid alternator table, clustering key count ({}) is bigger than one", base_cks.size()));
}
const bytes *clustering_key_column_name = !base_cks.empty() ? &base_cks.front().name() : nullptr;
std::transform(pks.begin(), pks.end(), std::back_inserter(columns), [](auto& c) { return &c; });
std::transform(cks.begin(), cks.end(), std::back_inserter(columns), [](auto& c) { return &c; });
@@ -933,42 +953,40 @@ future<executor::request_return_type> executor::get_records(client_state& client
return cdef->name->name() == eor_column_name;
})
);
auto clustering_key_index = clustering_key_column_name ? std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [&](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == *clustering_key_column_name;
})
) : 0;
std::optional<utils::UUID> timestamp;
auto dynamodb = rjson::empty_object();
auto record = rjson::empty_object();
struct Record {
rjson::value record;
rjson::value dynamodb;
};
const managed_bytes empty_managed_bytes;
std::unordered_map<const managed_bytes*, Record, managed_bytes_ptr_hash, managed_bytes_ptr_equal> records_map;
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
using op_utype = std::underlying_type_t<cdc::operation>;
auto maybe_add_record = [&] {
if (!dynamodb.ObjectEmpty()) {
rjson::add(record, "dynamodb", std::move(dynamodb));
dynamodb = rjson::empty_object();
}
if (!record.ObjectEmpty()) {
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
rjson::add(record, "eventSource", "scylladb:alternator");
rjson::add(record, "eventVersion", "1.1");
rjson::push_back(records, std::move(record));
record = rjson::empty_object();
--limit;
}
};
for (auto& row : result_set->rows()) {
auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
const managed_bytes* cs_ptr = clustering_key_column_name ? &*row[clustering_key_index] : &empty_managed_bytes;
auto records_it = records_map.emplace(cs_ptr, Record{});
auto &record = records_it.first->second;
if (!dynamodb.HasMember("Keys")) {
if (records_it.second) {
record.dynamodb = rjson::empty_object();
record.record = rjson::empty_object();
auto keys = rjson::empty_object();
describe_single_item(*selection, row, key_names, keys);
rjson::add(dynamodb, "Keys", std::move(keys));
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
rjson::add(dynamodb, "StreamViewType", type);
rjson::add(record.dynamodb, "Keys", std::move(keys));
rjson::add(record.dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
rjson::add(record.dynamodb, "SequenceNumber", sequence_number(ts));
rjson::add(record.dynamodb, "StreamViewType", type);
// TODO: SizeBytes
}
@@ -992,6 +1010,10 @@ future<executor::request_return_type> executor::get_records(client_state& client
* flags on CDC log, instead we use data to
* drive what is returned. This is (afaict)
* consistent with dynamo streams
*
* Note: BatchWriteItem will generate multiple records with
* the same timestamp, when write isolation is set to always
* (which triggers lwt), so we need to unpack them based on clustering key.
*/
switch (op) {
case cdc::operation::pre_image:
@@ -1000,14 +1022,14 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto item = rjson::empty_object();
describe_single_item(*selection, row, attr_names, item, nullptr, true);
describe_single_item(*selection, row, key_names, item);
rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
rjson::add(record.dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
break;
}
case cdc::operation::update:
rjson::add(record, "eventName", "MODIFY");
rjson::add(record.record, "eventName", "MODIFY");
break;
case cdc::operation::insert:
rjson::add(record, "eventName", "INSERT");
rjson::add(record.record, "eventName", "INSERT");
break;
case cdc::operation::service_row_delete:
case cdc::operation::service_partition_delete:
@@ -1015,28 +1037,41 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto user_identity = rjson::empty_object();
rjson::add(user_identity, "Type", "Service");
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
rjson::add(record, "userIdentity", std::move(user_identity));
rjson::add(record, "eventName", "REMOVE");
rjson::add(record.record, "userIdentity", std::move(user_identity));
rjson::add(record.record, "eventName", "REMOVE");
break;
}
default:
rjson::add(record, "eventName", "REMOVE");
rjson::add(record.record, "eventName", "REMOVE");
break;
}
if (eor) {
maybe_add_record();
size_t index = 0;
for (auto& [_, rec] : records_map) {
rjson::add(rec.record, "awsRegion", rjson::from_string(dc_name));
rjson::add(rec.record, "eventID", event_id(iter.shard.id, *timestamp, index++));
rjson::add(rec.record, "eventSource", "scylladb:alternator");
rjson::add(rec.record, "eventVersion", "1.1");
rjson::add(rec.record, "dynamodb", std::move(rec.dynamodb));
rjson::push_back(records, std::move(rec.record));
}
records_map.clear();
timestamp = ts;
if (limit == 0) {
if (records.Size() >= limit) {
// Note: we might have more than limit rows here - BatchWriteItem will emit multiple items
// with the same timestamp and we have no way of resume iteration midway through those,
// so we return all of them here.
break;
}
}
}
auto ret = rjson::empty_object();
auto nrecords = records.Size();
rjson::add(ret, "Records", std::move(records));
if (nrecords != 0) {
if (timestamp) {
// #9642. Set next iterators threshold to > last
shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
// Note that here we unconditionally return NextShardIterator,
@@ -1087,6 +1122,7 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
cdc::options opts;
opts.enabled(true);
// cdc::delta_mode is ignored by Alternator, so aim for the least overhead.
opts.set_delta_mode(cdc::delta_mode::keys);
opts.ttl(std::chrono::duration_cast<std::chrono::seconds>(dynamodb_streams_max_window).count());


@@ -743,7 +743,7 @@
"parameters":[
{
"name":"tag",
"description":"the tag given to the snapshot",
"description":"The snapshot tag to delete. If omitted, all snapshots are removed.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -751,7 +751,7 @@
},
{
"name":"kn",
"description":"Comma-separated keyspaces name that their snapshot will be deleted",
"description":"Comma-separated list of keyspace names to delete snapshots from. If omitted, snapshots are deleted from all keyspaces.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -759,7 +759,7 @@
},
{
"name":"cf",
"description":"an optional table name that its snapshot will be deleted",
"description":"A table name used to filter which table's snapshots are deleted. If omitted or empty, snapshots for all tables are eligible. When provided together with 'kn', the table is looked up in each listed keyspace independently. For secondary indexes, the logical index name (e.g. 'myindex') can be used and is resolved automatically.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -1295,6 +1295,45 @@
}
]
},
{
"path":"/storage_service/logstor_compaction",
"operations":[
{
"method":"POST",
"summary":"Trigger compaction of the key-value storage",
"type":"void",
"nickname":"logstor_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"major",
"description":"When true, perform a major compaction",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/logstor_flush",
"operations":[
{
"method":"POST",
"summary":"Trigger flush of logstor storage",
"type":"void",
"nickname":"logstor_flush",
"produces":[
"application/json"
],
"parameters":[]
}
]
},
{
"path":"/storage_service/active_repair/",
"operations":[
@@ -3127,6 +3166,83 @@
]
},
{
"path":"/storage_service/vnode_tablet_migrations/keyspaces/{keyspace}",
"operations":[{
"method":"POST",
"summary":"Start vnodes-to-tablets migration for all tables in a keyspace",
"type":"void",
"nickname":"create_vnode_tablet_migration",
"produces":["application/json"],
"parameters":[
{
"name":"keyspace",
"description":"Keyspace name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
},
{
"method":"GET",
"summary":"Get a keyspace's vnodes-to-tablets migration status",
"type":"vnode_tablet_migration_status",
"nickname":"get_vnode_tablet_migration",
"produces":["application/json"],
"parameters":[
{
"name":"keyspace",
"description":"Keyspace name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}]
},
{
"path":"/storage_service/vnode_tablet_migrations/node/storage_mode",
"operations":[{
"method":"PUT",
"summary":"Set the intended storage mode for this node during vnodes-to-tablets migration",
"type":"void",
"nickname":"set_vnode_tablet_migration_node_storage_mode",
"produces":["application/json"],
"parameters":[
{
"name":"intended_mode",
"description":"Intended storage mode (tablets or vnodes)",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}]
},
{
"path":"/storage_service/vnode_tablet_migrations/keyspaces/{keyspace}/finalization",
"operations":[{
"method":"POST",
"summary":"Finalize vnodes-to-tablets migration for all tables in a keyspace",
"type":"void",
"nickname":"finalize_vnode_tablet_migration",
"produces":["application/json"],
"parameters":[
{
"name":"keyspace",
"description":"Keyspace name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}]
},
{
"path":"/storage_service/quiesce_topology",
"operations":[
@@ -3229,6 +3345,38 @@
}
]
},
{
"path":"/storage_service/logstor_info",
"operations":[
{
"method":"GET",
"summary":"Logstor segment information for one table",
"type":"table_logstor_info",
"nickname":"logstor_info",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"table name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/retrain_dict",
"operations":[
@@ -3637,6 +3785,47 @@
}
}
},
"logstor_hist_bucket":{
"id":"logstor_hist_bucket",
"properties":{
"bucket":{
"type":"long"
},
"count":{
"type":"long"
},
"min_data_size":{
"type":"long"
},
"max_data_size":{
"type":"long"
}
}
},
"table_logstor_info":{
"id":"table_logstor_info",
"description":"Per-table logstor segment distribution",
"properties":{
"keyspace":{
"type":"string"
},
"table":{
"type":"string"
},
"compaction_groups":{
"type":"long"
},
"segments":{
"type":"long"
},
"data_size_histogram":{
"type":"array",
"items":{
"$ref":"logstor_hist_bucket"
}
}
}
},
"tablet_repair_result":{
"id":"tablet_repair_result",
"description":"Tablet repair result",
@@ -3671,6 +3860,45 @@
"description":"The resulting compression ratio (estimated on a random sample of files)"
}
}
},
"vnode_tablet_migration_node_status":{
"id":"vnode_tablet_migration_node_status",
"description":"Node storage mode info during vnodes-to-tablets migration",
"properties":{
"host_id":{
"type":"string",
"description":"The host ID"
},
"current_mode":{
"type":"string",
"description":"The current storage mode: `vnodes` or `tablets`"
},
"intended_mode":{
"type":"string",
"description":"The intended storage mode: `vnodes` or `tablets`"
}
}
},
"vnode_tablet_migration_status":{
"id":"vnode_tablet_migration_status",
"description":"Vnodes-to-tablets migration status for a keyspace",
"properties":{
"keyspace":{
"type":"string",
"description":"The keyspace name"
},
"status":{
"type":"string",
"description":"The migration status: `vnodes` (not started), `migrating_to_tablets` (in progress), or `tablets` (complete)"
},
"nodes":{
"type":"array",
"items":{
"$ref":"vnode_tablet_migration_node_status"
},
"description":"Per-node storage mode information. Empty if the keyspace is not being migrated."
}
}
}
}
}


@@ -209,6 +209,21 @@
"parameters":[]
}
]
},
{
"path":"/system/chosen_sstable_version",
"operations":[
{
"method":"GET",
"summary":"Get sstable version currently chosen for use in new sstables",
"type":"string",
"nickname":"get_chosen_sstable_version",
"produces":[
"application/json"
],
"parameters":[]
}
]
}
]
}


@@ -18,7 +18,9 @@
#include "utils/assert.hh"
#include "utils/estimated_histogram.hh"
#include <algorithm>
#include <sstream>
#include "db/data_listeners.hh"
#include "utils/hash.hh"
#include "storage_service.hh"
#include "compaction/compaction_manager.hh"
#include "unimplemented.hh"
@@ -342,6 +344,56 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
return ret;
}
static
future<json::json_return_type>
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// filters were provided but parsed to nothing: return empty results immediately
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
cf::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
cf::get_column_family_name.set(r, [&db] (const_req req){
std::vector<sstring> res;
@@ -1047,6 +1099,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
});
});
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
return rest_toppartitions_generic(db, std::move(req));
});
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
if (!req->get_query_param("split_output").empty()) {
fail(unimplemented::cause::API);
@@ -1213,6 +1269,7 @@ void unset_column_family(http_context& ctx, routes& r) {
cf::get_sstable_count_per_level.unset(r);
cf::get_sstables_for_key.unset(r);
cf::toppartitions.unset(r);
ss::toppartitions_generic.unset(r);
cf::force_major_compaction.unset(r);
ss::get_load.unset(r);
ss::get_metrics_load.unset(r);


@@ -23,7 +23,7 @@ void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
sstring injection = req->get_path_param("injection");
bool one_shot = req->get_query_param("one_shot") == "True";
bool one_shot = strcasecmp(req->get_query_param("one_shot").c_str(), "true") == 0;
auto params = co_await util::read_entire_stream_contiguous(*req->content_stream);
const size_t max_params_size = 1024 * 1024;


@@ -17,9 +17,7 @@
#include "gms/feature_service.hh"
#include "schema/schema_builder.hh"
#include "sstables/sstables_manager.hh"
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
@@ -32,6 +30,7 @@
#include <fmt/ranges.h>
#include "service/raft/raft_group0_client.hh"
#include "service/storage_service.hh"
#include "service/topology_state_machine.hh"
#include "service/load_meter.hh"
#include "gms/feature_service.hh"
#include "gms/gossiper.hh"
@@ -574,14 +573,6 @@ void unset_view_builder(http_context& ctx, routes& r) {
cf::get_built_indexes.unset(r);
}
static future<json::json_return_type> describe_ring_as_json(sharded<service::storage_service>& ss, sstring keyspace) {
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring(keyspace), token_range_endpoints_to_json));
}
static future<json::json_return_type> describe_ring_as_json_for_table(const sharded<service::storage_service>& ss, sstring keyspace, sstring table) {
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
}
namespace {
template <typename Key, typename Value>
storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
@@ -612,56 +603,6 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
}
static
future<json::json_return_type>
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// when the query is empty return immediately
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
httpd::column_family_json::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
static
json::json_return_type
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
@@ -729,13 +670,16 @@ rest_describe_ring(http_context& ctx, sharded<service::storage_service>& ss, std
if (!req->param.exists("keyspace")) {
throw bad_param_exception("The keyspace param is not provided");
}
auto keyspace = req->get_path_param("keyspace");
auto keyspace = validate_keyspace(ctx, req);
auto table = req->get_query_param("table");
utils::chunked_vector<dht::token_range_endpoints> ranges;
if (!table.empty()) {
validate_table(ctx.db.local(), keyspace, table);
return describe_ring_as_json_for_table(ss, keyspace, table);
auto table_id = validate_table(ctx.db.local(), keyspace, table);
ranges = co_await ss.local().describe_ring_for_table(table_id);
} else {
ranges = co_await ss.local().describe_ring(keyspace);
}
return describe_ring_as_json(ss, validate_keyspace(ctx, req));
co_return json::json_return_type(stream_range_as_array(std::move(ranges), token_range_endpoints_to_json));
}
static
@@ -833,6 +777,28 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
co_return json_void();
}
static
future<json::json_return_type>
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
bool major = false;
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
major = validate_bool(major_param);
}
apilog.info("logstor_compaction: major={}", major);
auto& db = ctx.db;
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
co_return json_void();
}
static
future<json::json_return_type>
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
apilog.info("logstor_flush");
auto& db = ctx.db;
co_await replica::database::flush_logstor_separator_on_all_shards(db);
co_return json_void();
}
static
future<json::json_return_type>
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
@@ -1553,6 +1519,54 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
});
}
static
future<json::json_return_type>
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
auto table = api::req_param<sstring>(*req, "table", {}).value;
if (table.empty()) {
table = api::req_param<sstring>(*req, "cf", {}).value;
}
if (keyspace.empty()) {
throw bad_param_exception("The query parameter 'keyspace' is required");
}
if (table.empty()) {
throw bad_param_exception("The query parameter 'table' is required");
}
keyspace = validate_keyspace(ctx, keyspace);
auto tid = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(tid);
if (!cf.uses_logstor()) {
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
}
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
merged_stats += shard_stats;
}, [tid](const replica::database& db) {
return db.get_logstor_table_segment_stats(tid);
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
ss::table_logstor_info result;
result.keyspace = keyspace;
result.table = table;
result.compaction_groups = merged_stats.compaction_group_count;
result.segments = merged_stats.segment_count;
for (const auto& bucket : merged_stats.histogram) {
ss::logstor_hist_bucket hist;
hist.count = bucket.count;
hist.max_data_size = bucket.max_data_size;
result.data_size_histogram.push(std::move(hist));
}
return make_ready_future<json::json_return_type>(stream_object(result));
});
});
}
static
future<json::json_return_type>
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
@@ -1709,6 +1723,69 @@ rest_tablet_balancing_enable(sharded<service::storage_service>& ss, std::unique_
co_return json_void();
}
static
future<json::json_return_type>
rest_create_vnode_tablet_migration(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
if (!ss.local().get_feature_service().vnodes_to_tablets_migrations) {
apilog.warn("create_vnode_tablet_migration: called before the cluster feature was enabled");
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
}
auto keyspace = validate_keyspace(ctx, req);
co_await ss.local().prepare_for_tablets_migration(keyspace);
co_return json_void();
}
static
future<json::json_return_type>
rest_get_vnode_tablet_migration(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
if (!ss.local().get_feature_service().vnodes_to_tablets_migrations) {
apilog.warn("get_vnode_tablet_migration: called before the cluster feature was enabled");
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
}
auto keyspace = validate_keyspace(ctx, req);
auto status = co_await ss.local().get_tablets_migration_status(keyspace);
ss::vnode_tablet_migration_status result;
result.keyspace = status.keyspace;
result.status = status.status;
result.nodes._set = true;
for (const auto& node : status.nodes) {
ss::vnode_tablet_migration_node_status n;
n.host_id = fmt::to_string(node.host_id);
n.current_mode = node.current_mode;
n.intended_mode = node.intended_mode;
result.nodes.push(n);
}
co_return result;
}
static
future<json::json_return_type>
rest_set_vnode_tablet_migration_node_storage_mode(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
if (!ss.local().get_feature_service().vnodes_to_tablets_migrations) {
apilog.warn("set_vnode_tablet_migration_node_storage_mode: called before the cluster feature was enabled");
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
}
auto mode_str = req->get_query_param("intended_mode");
auto mode = service::intended_storage_mode_from_string(mode_str);
co_await ss.local().set_node_intended_storage_mode(mode);
co_return json_void();
}
static
future<json::json_return_type>
rest_finalize_vnode_tablet_migration(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
if (!ss.local().get_feature_service().vnodes_to_tablets_migrations) {
apilog.warn("finalize_vnode_tablet_migration: called before the cluster feature was enabled");
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
}
auto keyspace = validate_keyspace(ctx, req);
validate_keyspace(ctx, keyspace);
co_await ss.local().finalize_tablets_migration(keyspace);
co_return json_void();
}
static
future<json::json_return_type>
rest_quiesce_topology(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1784,7 +1861,6 @@ rest_bind(FuncType func, BindArgs&... args) {
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
@@ -1800,6 +1876,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
ss::move.set(r, rest_bind(rest_move, ss));
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
@@ -1848,6 +1926,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
@@ -1857,6 +1936,10 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
ss::create_vnode_tablet_migration.set(r, rest_bind(rest_create_vnode_tablet_migration, ctx, ss));
ss::get_vnode_tablet_migration.set(r, rest_bind(rest_get_vnode_tablet_migration, ctx, ss));
ss::set_vnode_tablet_migration_node_storage_mode.set(r, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss));
ss::finalize_vnode_tablet_migration.set(r, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss));
ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
@@ -1864,7 +1947,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
void unset_storage_service(http_context& ctx, routes& r) {
ss::get_token_endpoint.unset(r);
ss::toppartitions_generic.unset(r);
ss::get_release_version.unset(r);
ss::get_scylla_release_version.unset(r);
ss::get_schema_version.unset(r);
@@ -1878,6 +1960,8 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::reset_cleanup_needed.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::logstor_compaction.unset(r);
ss::logstor_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
@@ -1925,6 +2009,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_ownership.unset(r);
ss::get_effective_ownership.unset(r);
ss::sstable_info.unset(r);
ss::logstor_info.unset(r);
ss::reload_raft_topology_state.unset(r);
ss::upgrade_to_raft_topology.unset(r);
ss::raft_topology_upgrade_status.unset(r);
@@ -1934,6 +2019,10 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::del_tablet_replica.unset(r);
ss::repair_tablet.unset(r);
ss::tablet_balancing_enable.unset(r);
ss::create_vnode_tablet_migration.unset(r);
ss::get_vnode_tablet_migration.unset(r);
ss::set_vnode_tablet_migration_node_storage_mode.unset(r);
ss::finalize_vnode_tablet_migration.unset(r);
ss::quiesce_topology.unset(r);
sp::get_schema_versions.unset(r);
ss::drop_quarantined_sstables.unset(r);
@@ -2024,6 +2113,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
}
co_return json_void();
} catch (const data_dictionary::no_such_column_family& e) {
throw httpd::bad_param_exception(e.what());
} catch (...) {
apilog.error("take_snapshot failed: {}", std::current_exception());
throw;
@@ -2060,6 +2151,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
try {
co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
co_return json_void();
} catch (const data_dictionary::no_such_column_family& e) {
throw httpd::bad_param_exception(e.what());
} catch (...) {
apilog.error("del_snapshot failed: {}", std::current_exception());
throw;


@@ -190,6 +190,13 @@ void set_system(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx] {
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
}
}


@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
_permission_loader = std::move(loader);
}
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
auto it = _roles.find(role);
if (it == _roles.end()) {
return {};
@@ -55,6 +55,16 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
return it->second;
}
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
for (const auto& [name, record] : _roles) {
func(name, *record);
}
}
size_t cache::roles_count() const noexcept {
return _roles.size();
}
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
std::unordered_map<resource, permission_set>* perms_cache;
lw_shared_ptr<role_record> role_ptr;


@@ -9,6 +9,7 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <string_view>
#include <unordered_set>
#include <unordered_map>
@@ -19,7 +20,7 @@
#include <seastar/core/semaphore.hh>
#include <seastar/core/metrics_registration.hh>
#include <absl/container/flat_hash_map.h>
#include "absl-flat_hash_map.hh"
#include "auth/permission.hh"
#include "auth/common.hh"
@@ -42,8 +43,8 @@ public:
std::unordered_set<role_name_t> member_of;
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring> attributes;
std::unordered_map<sstring, permission_set> permissions;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance
@@ -52,7 +53,7 @@ public:
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
void set_permission_loader(permission_loader_func loader);
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
future<> prune(const resource& r);
@@ -61,8 +62,15 @@ public:
future<> load_roles(std::unordered_set<role_name_t> roles);
static bool includes_table(const table_id&) noexcept;
// Returns the number of roles in the cache.
size_t roles_count() const noexcept;
// The callback doesn't suspend (no co_await) so it observes the state
// of the cache atomically.
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
private:
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
roles_map _roles;
// anonymous permissions map exists mainly due to compatibility with
// higher layers which use role_or_anonymous to get permissions.


@@ -14,6 +14,7 @@
#include <fmt/ranges.h>
#include "utils/to_string.hh"
#include "utils/error_injection.hh"
#include "data_dictionary/data_dictionary.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
@@ -105,6 +106,9 @@ auth::authentication_option_set auth::certificate_authenticator::alterable_optio
}
future<std::optional<auth::authenticated_user>> auth::certificate_authenticator::authenticate(session_dn_func f) const {
if (auto user = utils::get_local_injector().inject_parameter("transport_early_auth_bypass")) {
co_return auth::authenticated_user{sstring(*user)};
}
if (!f) {
co_return std::nullopt;
}


@@ -0,0 +1,37 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include "auth/default_authorizer.hh"
#include "auth/permission.hh"
namespace auth {
// maintenance_socket_authorizer is used for clients connecting to the
// maintenance socket. It grants all permissions unconditionally (like
// AllowAllAuthorizer) while still supporting grant/revoke operations
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
class maintenance_socket_authorizer : public default_authorizer {
public:
using default_authorizer::default_authorizer;
~maintenance_socket_authorizer() override = default;
future<> start() override {
return make_ready_future<>();
}
future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
return make_ready_future<permission_set>(permissions::ALL);
}
};
} // namespace auth


@@ -30,6 +30,7 @@
#include "auth/default_authorizer.hh"
#include "auth/ldap_role_manager.hh"
#include "auth/maintenance_socket_authenticator.hh"
#include "auth/maintenance_socket_authorizer.hh"
#include "auth/maintenance_socket_role_manager.hh"
#include "auth/password_authenticator.hh"
#include "auth/role_or_anonymous.hh"
@@ -866,6 +867,12 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
};
}
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
return [&qp] {
return std::make_unique<maintenance_socket_authorizer>(qp.local());
};
}
role_manager_factory make_maintenance_socket_role_manager_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,


@@ -434,6 +434,11 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a factory for the maintenance socket authorizer.
/// This authorizer is not config-selectable and is only used for the maintenance socket.
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
/// Creates a factory for the maintenance socket role manager.
/// This role manager is not config-selectable and is only used for the maintenance socket.
role_manager_factory make_maintenance_socket_role_manager_factory(


@@ -44,13 +44,12 @@ namespace auth {
static logging::logger log("standard_role_manager");
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
auto name = sstring(role_name);
auto role = _cache.get(name);
auto role = _cache.get(role_name);
if (!role) {
return make_ready_future<std::optional<record>>(std::nullopt);
}
return make_ready_future<std::optional<record>>(std::make_optional(record{
.name = std::move(name),
.name = sstring(role_name),
.is_superuser = role->is_superuser,
.can_login = role->can_login,
.member_of = role->member_of
@@ -393,51 +392,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
const sstring query = seastar::format("SELECT * FROM {}.{}",
db::system_keyspace::NAME,
ROLE_MEMBERS_CF);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
for (const auto& granted_role : record.member_of) {
roles_map.emplace(name, granted_role);
}
});
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
db::system_keyspace::NAME,
meta::roles_table::name);
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_set roles;
std::transform(
results->begin(),
results->end(),
std::inserter(roles, roles.begin()),
[] (const cql3::untyped_result_set_row& row) {
return row.get_as<sstring>(role_col_name_string);}
);
roles.reserve(_cache.roles_count());
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
roles.insert(name);
});
co_return roles;
}
@@ -460,31 +429,26 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
db::system_keyspace::NAME,
ROLE_ATTRIBUTES_CF);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
auto role = _cache.get(role_name);
if (!role) {
co_return std::nullopt;
}
co_return std::optional<sstring>{};
auto it = role->attributes.find(attribute_name);
if (it != role->attributes.end()) {
co_return it->second;
}
co_return std::nullopt;
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
});
}).then([&role_to_att_val] () {
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
});
});
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
attribute_vals result;
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
auto it = record.attributes.find(attribute_name);
if (it != record.attributes.end()) {
result.emplace(name, it->second);
}
});
co_return result;
}
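The hunk above replaces one internal SELECT per role with a single pass over the in-memory role cache. A minimal sketch of that aggregation pattern, using simplified stand-in types (`role_record`, `role_cache` here are hypothetical, not Scylla's actual cache types):

```cpp
#include <map>
#include <string>

// Simplified stand-ins for the role cache types (hypothetical, not Scylla's).
struct role_record {
    std::map<std::string, std::string> attributes;
};
using role_cache = std::map<std::string, role_record>;

// Collect one attribute's value across all cached roles, skipping roles
// that don't define it -- the same shape as query_attribute_for_all.
std::map<std::string, std::string>
attribute_for_all(const role_cache& cache, const std::string& attr) {
    std::map<std::string, std::string> result;
    for (const auto& [name, record] : cache) {
        if (auto it = record.attributes.find(attr); it != record.attributes.end()) {
            result.emplace(name, it->second);
        }
    }
    return result;
}
```

The old implementation fanned out `get_attribute()` calls via `parallel_for_each`; the cache walk makes that fan-out (and the nested `do_with`/continuation plumbing) unnecessary.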
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {

cmake/FindLua.cmake

@@ -0,0 +1,47 @@
#
# Copyright 2025-present ScyllaDB
#
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
# Custom FindLua module that uses pkg-config, matching configure.py's
# approach. CMake's built-in FindLua resolves to the versioned library
# (e.g. liblua-5.4.so) instead of the unversioned symlink (liblua.so),
# causing a name mismatch between the two build systems.
find_package(PkgConfig REQUIRED)
# configure.py: lua53 on Debian-like, lua on others
pkg_search_module(PC_lua QUIET lua53 lua)
find_library(Lua_LIBRARY
NAMES lua lua5.3 lua53
HINTS
${PC_lua_LIBDIR}
${PC_lua_LIBRARY_DIRS})
find_path(Lua_INCLUDE_DIR
NAMES lua.h
HINTS
${PC_lua_INCLUDEDIR}
${PC_lua_INCLUDE_DIRS})
mark_as_advanced(
Lua_LIBRARY
Lua_INCLUDE_DIR)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Lua
REQUIRED_VARS
Lua_LIBRARY
Lua_INCLUDE_DIR
VERSION_VAR PC_lua_VERSION)
if(Lua_FOUND)
set(LUA_LIBRARIES ${Lua_LIBRARY})
set(LUA_INCLUDE_DIR ${Lua_INCLUDE_DIR})
endif()


@@ -1,5 +1,5 @@
set(CMAKE_CXX_FLAGS_COVERAGE
"-fprofile-instr-generate -fcoverage-mapping -fprofile-list=${CMAKE_SOURCE_DIR}/coverage_sources.list"
"-fprofile-instr-generate -fcoverage-mapping"
CACHE
INTERNAL
"")
@@ -8,18 +8,33 @@ update_build_flags(Coverage
OPTIMIZATION_LEVEL "g")
set(scylla_build_mode_Coverage "coverage")
# Coverage mode sets cmake_build_type='Debug' for Seastar
# (configure.py:515), so Seastar's pkg-config --cflags output
# (configure.py:2252-2267, queried at configure.py:3039) includes debug
# defines, sanitizer compile flags, and -fstack-clash-protection.
# Seastar's CMake generator expressions only activate these for
# Debug/Sanitize configs, so we add them explicitly for Coverage.
set(Seastar_DEFINITIONS_COVERAGE
SCYLLA_BUILD_MODE=${scylla_build_mode_Coverage}
DEBUG
SANITIZE
DEBUG_LSA_SANITIZER
SCYLLA_ENABLE_ERROR_INJECTION)
SEASTAR_DEBUG
SEASTAR_DEFAULT_ALLOCATOR
SEASTAR_SHUFFLE_TASK_QUEUE
SEASTAR_DEBUG_SHARED_PTR
SEASTAR_DEBUG_PROMISE
SEASTAR_TYPE_ERASE_MORE)
foreach(definition ${Seastar_DEFINITIONS_COVERAGE})
add_compile_definitions(
$<$<CONFIG:Coverage>:${definition}>)
endforeach()
set(CMAKE_STATIC_LINKER_FLAGS_COVERAGE
add_compile_options(
$<$<CONFIG:Coverage>:-fsanitize=address>
$<$<CONFIG:Coverage>:-fsanitize=undefined>
$<$<CONFIG:Coverage>:-fsanitize=vptr>
$<$<CONFIG:Coverage>:-fstack-clash-protection>)
set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
"-fprofile-instr-generate -fcoverage-mapping")
maybe_limit_stack_usage_in_KB(40 Coverage)


@@ -131,6 +131,7 @@ function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
check_cxx_compiler_flag(${_stack_usage_threshold_flag} _stack_usage_flag_supported)
if(_stack_usage_flag_supported)
add_compile_options($<$<CONFIG:${config}>:${_stack_usage_threshold_flag}>)
add_compile_options($<$<CONFIG:${config}>:-Wno-error=stack-usage=>)
endif()
endfunction()
@@ -260,6 +261,23 @@ endif()
# Force SHA1 build-id generation
add_link_options("LINKER:--build-id=sha1")
# Match configure.py: add -fno-lto globally. configure.py adds -fno-lto to
# all binaries (except standalone cpp_apps like patchelf) via the per-binary
# $libs variable. LTO-enabled targets (scylla binary in RelWithDebInfo) will
# override with -flto=thin -ffat-lto-objects via enable_lto().
add_link_options(-fno-lto)
# Match configure.py:2633-2636 — sanitizer link flags for standalone binaries
# (e.g. patchelf) that don't link Seastar. Seastar-linked targets get these
# via seastar_libs (configure.py:2649).
# Coverage mode gets sanitizer link flags via the seastar target instead
# (see CMakeLists.txt), matching configure.py where only seastar_libs_coverage
# carries -fsanitize (not cxx_ld_flags).
add_link_options(
$<$<CONFIG:Debug,Sanitize>:-fsanitize=address>
$<$<CONFIG:Debug,Sanitize>:-fsanitize=undefined>)
include(CheckLinkerFlag)
set(Scylla_USE_LINKER
""


@@ -44,6 +44,7 @@
#include "dht/partition_filter.hh"
#include "mutation_writer/shard_based_splitting_writer.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "mutation_writer/token_group_based_splitting_writer.hh"
#include "mutation/mutation_source_metadata.hh"
#include "mutation/mutation_fragment_stream_validator.hh"
#include "utils/assert.hh"
@@ -1933,6 +1934,7 @@ class resharding_compaction final : public compaction {
};
std::vector<estimated_values> _estimation_per_shard;
std::vector<sstables::run_id> _run_identifiers;
bool _reshard_vnodes;
private:
// return estimated partitions per sstable for a given shard
uint64_t partitions_per_sstable(shard_id s) const {
@@ -1945,7 +1947,11 @@ public:
: compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
, _estimation_per_shard(smp::count)
, _run_identifiers(smp::count)
, _reshard_vnodes(descriptor.options.as<compaction_type_options::reshard>().vnodes_resharding)
{
if (_reshard_vnodes && !_owned_ranges) {
on_internal_error(clogger, "Resharding vnodes requires owned_ranges");
}
for (auto& sst : _sstables) {
const auto& shards = sst->get_shards_for_this_sstable();
auto size = sst->bytes_on_disk();
@@ -1983,8 +1989,25 @@ public:
}
mutation_reader_consumer make_interposer_consumer(mutation_reader_consumer end_consumer) override {
return [end_consumer = std::move(end_consumer)] (mutation_reader reader) mutable -> future<> {
return mutation_writer::segregate_by_shard(std::move(reader), std::move(end_consumer));
auto owned_ranges = _reshard_vnodes ? _owned_ranges : nullptr;
return [end_consumer = std::move(end_consumer), owned_ranges = std::move(owned_ranges)] (mutation_reader reader) mutable -> future<> {
if (owned_ranges) {
auto classify = [owned_ranges, it = owned_ranges->begin(), idx = mutation_writer::token_group_id(0)] (dht::token t) mutable -> mutation_writer::token_group_id {
dht::token_comparator cmp;
while (it != owned_ranges->end() && it->after(t, cmp)) {
clogger.debug("Token {} is after current range {}: advancing to the next range", t, *it);
++it;
++idx;
}
if (it == owned_ranges->end() || !it->contains(t, cmp)) {
on_internal_error(clogger, fmt::format("Token {} is outside of owned ranges", t));
}
return idx;
};
return mutation_writer::segregate_by_token_group(std::move(reader), std::move(classify), std::move(end_consumer));
} else {
return mutation_writer::segregate_by_shard(std::move(reader), std::move(end_consumer));
}
};
}
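The `classify` lambda above walks a sorted, disjoint list of owned ranges once, assigning each token a monotonically increasing group id and erroring on tokens outside all ranges. That single-forward-scan pattern (valid because the reader emits tokens in non-decreasing order) can be sketched with plain integer intervals; the types here are illustrative stand-ins, not `dht::token` or Scylla's range classes:

```cpp
#include <cstddef>
#include <stdexcept>
#include <vector>

// Stand-in for an owned token range: a closed [start, end] interval.
// Ranges are assumed sorted and disjoint.
struct range { int start, end; };

// Mirrors the classify lambda: advance through the ranges while the token
// lies past the current one, then either return the group id or fail.
struct classifier {
    const std::vector<range>& ranges;
    std::size_t it = 0;
    std::size_t operator()(int t) {
        while (it < ranges.size() && t > ranges[it].end) {
            ++it; // token is past the current range: advance to the next
        }
        if (it == ranges.size() || t < ranges[it].start) {
            throw std::runtime_error("token outside of owned ranges");
        }
        return it;
    }
};
```

Because `it` only moves forward, classification of a whole stream is O(tokens + ranges), which is what lets `segregate_by_token_group` split vnode sstables at range boundaries without per-token binary searches.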


@@ -87,6 +87,8 @@ public:
drop_unfixable_sstables drop_unfixable = drop_unfixable_sstables::no;
};
struct reshard {
// If set, resharding compaction will apply the owned_ranges to segregate sstables in vnode boundaries.
bool vnodes_resharding = false;
};
struct reshape {
};
@@ -115,8 +117,8 @@ public:
return compaction_type_options(reshape{});
}
static compaction_type_options make_reshard() {
return compaction_type_options(reshard{});
static compaction_type_options make_reshard(bool vnodes_resharding = false) {
return compaction_type_options(reshard{.vnodes_resharding = vnodes_resharding});
}
static compaction_type_options make_regular() {


@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
if (dsm && (this_shard_id() == 0)) {
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
if (threshold_reached) {
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = true;
return cm.drain();
});
}
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = false;
cm.enable();
});
});
}
@@ -2348,6 +2354,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
}
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
std::exception_ptr ex;
if (_in_critical_disk_utilization_mode) {
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
} else {
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
}
return ex;
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2357,8 +2373,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
co_return coroutine::exception(make_disabled_exception(t));
}
std::vector<sstables::shared_sstable> ret;


@@ -115,6 +115,8 @@ private:
uint32_t _disabled_state_count = 0;
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
// precondition: is_disabled() is true.
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
std::optional<future<>> _stop_future;
@@ -170,6 +172,7 @@ private:
shared_tombstone_gc_state _shared_tombstone_gc_state;
utils::disk_space_monitor::subscription _out_of_space_subscription;
bool _in_critical_disk_utilization_mode = false;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);


@@ -132,7 +132,7 @@ distribute_reshard_jobs(sstables::sstable_directory::sstable_open_info_vector so
// A creator function must be passed that will create an SSTable object in the correct shard,
// and an I/O priority must be specified.
future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::sstable_open_info_vector shared_info, replica::table& table,
compaction::compaction_sstable_creator_fn creator, compaction::owned_ranges_ptr owned_ranges_ptr, tasks::task_info parent_info)
compaction::compaction_sstable_creator_fn creator, compaction::owned_ranges_ptr owned_ranges_ptr, bool vnodes_resharding, tasks::task_info parent_info)
{
// Resharding doesn't like empty sstable sets, so bail early. There is nothing
// to reshard in this shard.
@@ -160,13 +160,22 @@ future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::
// There is a semaphore inside the compaction manager in run_resharding_jobs. So we
// parallel_for_each so the statistics about pending jobs are updated to reflect all
// jobs. But only one will run in parallel at a time
auto& t = table.try_get_compaction_group_view_with_static_sharding();
//
// The compaction group view is used here only for job registration and gate-holding;
// resharding never reads or writes the group's own SSTables. With static (vnode)
// sharding there is exactly one group per shard; with tablets there may be many.
// In either case, any registered group suffices.
auto* cg = table.get_any_compaction_group();
if (!cg) {
on_internal_error(tasks::tmlogger, format("No compaction group found for table {}.{}", table.schema()->ks_name(), table.schema()->cf_name()));
}
auto& t = cg->view_for_unrepaired_data();
co_await coroutine::parallel_for_each(buckets, [&] (std::vector<sstables::shared_sstable>& sstlist) mutable {
return table.get_compaction_manager().run_custom_job(t, compaction_type::Reshard, "Reshard compaction", [&] (compaction_data& info, compaction_progress_monitor& progress_monitor) -> future<> {
auto erm = table.get_effective_replication_map(); // keep alive around compaction.
compaction_descriptor desc(sstlist);
desc.options = compaction_type_options::make_reshard();
desc.options = compaction_type_options::make_reshard(vnodes_resharding);
desc.creator = creator;
desc.sharder = &erm->get_sharder(*table.schema());
desc.owned_ranges = owned_ranges_ptr;
@@ -906,7 +915,7 @@ future<> table_resharding_compaction_task_impl::run() {
if (_owned_ranges_ptr) {
local_owned_ranges_ptr = make_lw_shared<const dht::token_range_vector>(*_owned_ranges_ptr);
}
auto task = co_await compaction_module.make_and_start_task<shard_resharding_compaction_task_impl>(parent_info, _status.keyspace, _status.table, _status.id, _dir, db, _creator, std::move(local_owned_ranges_ptr), destinations);
auto task = co_await compaction_module.make_and_start_task<shard_resharding_compaction_task_impl>(parent_info, _status.keyspace, _status.table, _status.id, _dir, db, _creator, std::move(local_owned_ranges_ptr), _vnodes_resharding, destinations);
co_await task->done();
}));
@@ -926,12 +935,14 @@ shard_resharding_compaction_task_impl::shard_resharding_compaction_task_impl(tas
replica::database& db,
compaction_sstable_creator_fn creator,
compaction::owned_ranges_ptr local_owned_ranges_ptr,
bool vnodes_resharding,
std::vector<replica::reshard_shard_descriptor>& destinations) noexcept
: resharding_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), std::move(table), "", parent_id)
, _dir(dir)
, _db(db)
, _creator(std::move(creator))
, _local_owned_ranges_ptr(std::move(local_owned_ranges_ptr))
, _vnodes_resharding(vnodes_resharding)
, _destinations(destinations)
{
_expected_workload = _destinations[this_shard_id()].size();
@@ -941,7 +952,7 @@ future<> shard_resharding_compaction_task_impl::run() {
auto& table = _db.find_column_family(_status.keyspace, _status.table);
auto info_vec = std::move(_destinations[this_shard_id()].info_vec);
tasks::task_info info{_status.id, _status.shard};
co_await reshard(_dir.local(), std::move(info_vec), table, _creator, std::move(_local_owned_ranges_ptr), info);
co_await reshard(_dir.local(), std::move(info_vec), table, _creator, std::move(_local_owned_ranges_ptr), _vnodes_resharding, info);
co_await _dir.local().move_foreign_sstables(_dir);
}


@@ -693,6 +693,7 @@ private:
sharded<replica::database>& _db;
compaction_sstable_creator_fn _creator;
compaction::owned_ranges_ptr _owned_ranges_ptr;
bool _vnodes_resharding;
public:
table_resharding_compaction_task_impl(tasks::task_manager::module_ptr module,
std::string keyspace,
@@ -700,12 +701,14 @@ public:
sharded<sstables::sstable_directory>& dir,
sharded<replica::database>& db,
compaction_sstable_creator_fn creator,
compaction::owned_ranges_ptr owned_ranges_ptr) noexcept
compaction::owned_ranges_ptr owned_ranges_ptr,
bool vnodes_resharding) noexcept
: resharding_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "table", std::move(keyspace), std::move(table), "", tasks::task_id::create_null_id())
, _dir(dir)
, _db(db)
, _creator(std::move(creator))
, _owned_ranges_ptr(std::move(owned_ranges_ptr))
, _vnodes_resharding(vnodes_resharding)
{}
protected:
virtual future<> run() override;
@@ -718,6 +721,7 @@ private:
replica::database& _db;
compaction_sstable_creator_fn _creator;
compaction::owned_ranges_ptr _local_owned_ranges_ptr;
bool _vnodes_resharding;
std::vector<replica::reshard_shard_descriptor>& _destinations;
public:
shard_resharding_compaction_task_impl(tasks::task_manager::module_ptr module,
@@ -728,6 +732,7 @@ public:
replica::database& db,
compaction_sstable_creator_fn creator,
compaction::owned_ranges_ptr local_owned_ranges_ptr,
bool vnodes_resharding,
std::vector<replica::reshard_shard_descriptor>& destinations) noexcept;
protected:
virtual future<> run() override;


@@ -397,6 +397,17 @@ commitlog_total_space_in_mb: -1
# you can cache more hot rows
# column_index_size_in_kb: 64
# sstable format version for newly written sstables.
# Currently allowed values are `me` and `ms`.
# If not specified in the config, this defaults to `me`.
#
# The difference between `me` and `ms` are the data structures used
# in the primary index.
# In short, `ms` needs more CPU during sstable writes,
# but should behave better during reads,
# although it might behave worse for very long clustering keys.
sstable_format: ms
# Auto-scaling of the promoted index prevents running out of memory
# when the promoted index grows too large (due to partitions with many rows
# vs. too small column_index_size_in_kb). When the serialized representation
@@ -477,6 +488,7 @@ commitlog_total_space_in_mb: -1
# compressed.
# can be: all - all traffic is compressed
# dc - traffic between different datacenters is compressed
# rack - traffic between different racks is compressed
# none - nothing is compressed.
# internode_compression: none
@@ -572,8 +584,7 @@ commitlog_total_space_in_mb: -1
audit: "table"
#
# List of statement categories that should be audited.
# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
audit_categories: "DCL,AUTH,ADMIN"
audit_categories: "DCL,DDL,AUTH,ADMIN"
#
# List of tables that should be audited.
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"


@@ -896,6 +896,9 @@ scylla_core = (['message/messaging_service.cc',
'replica/multishard_query.cc',
'replica/mutation_dump.cc',
'replica/querier.cc',
'replica/logstor/segment_manager.cc',
'replica/logstor/logstor.cc',
'replica/logstor/write_buffer.cc',
'mutation/atomic_cell.cc',
'mutation/canonical_mutation.cc',
'mutation/frozen_mutation.cc',
@@ -1467,6 +1470,7 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/query.idl.hh',
'idl/idl_test.idl.hh',
'idl/commitlog.idl.hh',
'idl/logstor.idl.hh',
'idl/tracing.idl.hh',
'idl/consistency_level.idl.hh',
'idl/cache_temperature.idl.hh',
@@ -1704,12 +1708,14 @@ deps['test/boost/combined_tests'] += [
'test/boost/sstable_compression_config_test.cc',
'test/boost/sstable_directory_test.cc',
'test/boost/sstable_set_test.cc',
'test/boost/sstable_tablet_streaming.cc',
'test/boost/statement_restrictions_test.cc',
'test/boost/storage_proxy_test.cc',
'test/boost/tablets_test.cc',
'test/boost/tracing_test.cc',
'test/boost/user_function_test.cc',
'test/boost/user_types_test.cc',
'test/boost/vector_index_test.cc',
'test/boost/view_build_test.cc',
'test/boost/view_complex_test.cc',
'test/boost/view_schema_ckey_test.cc',
@@ -2227,16 +2233,20 @@ abseil_libs = ['absl/' + lib for lib in [
'container/libabsl_raw_hash_set.a',
'synchronization/libabsl_synchronization.a',
'synchronization/libabsl_graphcycles_internal.a',
'synchronization/libabsl_kernel_timeout_internal.a',
'debugging/libabsl_stacktrace.a',
'debugging/libabsl_symbolize.a',
'debugging/libabsl_debugging_internal.a',
'debugging/libabsl_demangle_internal.a',
'debugging/libabsl_demangle_rust.a',
'debugging/libabsl_decode_rust_punycode.a',
'debugging/libabsl_utf8_for_code_point.a',
'debugging/libabsl_borrowed_fixup_buffer.a',
'time/libabsl_time.a',
'time/libabsl_time_zone.a',
'numeric/libabsl_int128.a',
'hash/libabsl_hash.a',
'hash/libabsl_city.a',
'hash/libabsl_low_level_hash.a',
'base/libabsl_malloc_internal.a',
'base/libabsl_spinlock_wait.a',
'base/libabsl_base.a',


@@ -201,6 +201,10 @@ public:
return _clustering_columns_restrictions;
}
const expr::expression& get_nonprimary_key_restrictions() const {
return _nonprimary_key_restrictions;
}
// Get a set of columns restricted by the IS NOT NULL restriction.
// IS NOT NULL is a special case that is handled separately from other restrictions.
const std::unordered_set<const column_definition*> get_not_null_columns() const;


@@ -265,7 +265,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
exceptions::invalid_request_exception(
format("Consistency level {} is not allowed for write operations", cl)));
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl)));
}
for (size_t i = 0; i < _statements.size(); ++i) {
@@ -277,7 +280,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
_stats.statements_in_cas_batches += _statements.size();
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
return result;
});
@@ -297,7 +301,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
}
auto result = make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
});


@@ -59,6 +59,8 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
const sstring cf_prop_defs::KW_TABLETS = "tablets";
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
schema::extensions_map er;
for (auto& p : exts.schema_extensions()) {
@@ -106,6 +108,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
KW_STORAGE_ENGINE,
});
static std::set<sstring> obsolete_keywords({
sstring("index_interval"),
@@ -196,6 +199,20 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
}
db::tablet_options::validate(*tablet_options_map);
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
if (!db.features().logstor) {
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
}
if (!db.get_config().enable_logstor()) {
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
}
} else {
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
}
}
}
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
@@ -396,6 +413,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
builder.set_tablet_options(std::move(*tablet_options_opt));
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
builder.set_logstor();
}
}
}
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const


@@ -64,6 +64,8 @@ public:
static const sstring KW_TABLETS;
static const sstring KW_STORAGE_ENGINE;
// FIXME: In origin the following consts are in CFMetaData.
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;


@@ -8,6 +8,7 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include <boost/algorithm/string.hpp>
#include <seastar/core/coroutine.hh>
#include "create_index_statement.hh"
#include "db/config.hh"
@@ -35,8 +36,10 @@
#include "db/schema_tables.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/vector.hh"
#include "db/tags/extension.hh"
#include "tombstone_gc_extension.hh"
#include "index/secondary_index.hh"
#include <stdexcept>
@@ -116,6 +119,58 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
}
}
// Cassandra SAI compatibility: detect the StorageAttachedIndex class name
// used by Cassandra to create vector and metadata indexes.
static bool is_sai_class_name(const sstring& class_name) {
return class_name == "org.apache.cassandra.index.sai.StorageAttachedIndex"
|| boost::iequals(class_name, "storageattachedindex")
|| boost::iequals(class_name, "sai");
}
// Returns true if the custom class name refers to a vector-capable index
// (either ScyllaDB's native vector_index or Cassandra's SAI).
static bool is_vector_capable_class(const sstring& class_name) {
return class_name == "vector_index" || is_sai_class_name(class_name);
}
// When the custom class is SAI, verify that at least one target is a
// vector column and rewrite the class to ScyllaDB's native "vector_index".
// Non-vector single-column targets and multi-column (local-index partition
// key) targets are skipped — they are treated as filtering columns by
// vector_index::check_target().
static void maybe_rewrite_sai_to_vector_index(
const schema& schema,
const std::vector<::shared_ptr<index_target>>& targets,
index_specific_prop_defs& props) {
if (!props.custom_class || !is_sai_class_name(*props.custom_class)) {
return;
}
for (const auto& target : targets) {
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
if (!ident) {
// Multi-column target (local-index partition key) — skip.
continue;
}
auto cd = schema.get_column_definition((*ident)->name());
if (!cd) {
// Nonexistent column — skip; vector_index::validate() will catch it.
continue;
}
if (dynamic_cast<const vector_type_impl*>(cd->type.get())) {
props.custom_class = "vector_index";
return;
}
}
throw exceptions::invalid_request_exception(
"StorageAttachedIndex (SAI) is only supported on vector columns; "
"use a secondary index for non-vector columns");
}
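The class-name matching in `is_sai_class_name` mixes one exact comparison (the fully-qualified Cassandra class) with two case-insensitive alias checks via `boost::iequals`. A self-contained sketch of that predicate, with a hand-rolled ASCII `iequals` standing in for the Boost call:

```cpp
#include <algorithm>
#include <cctype>
#include <string>

// ASCII-only case-insensitive compare, standing in for boost::iequals.
static bool iequals(const std::string& a, const std::string& b) {
    return a.size() == b.size() &&
        std::equal(a.begin(), a.end(), b.begin(), [](char x, char y) {
            return std::tolower(static_cast<unsigned char>(x)) ==
                   std::tolower(static_cast<unsigned char>(y));
        });
}

// Mirrors is_sai_class_name: the fully-qualified Cassandra class name is
// matched exactly, the short aliases case-insensitively.
static bool is_sai_class_name(const std::string& class_name) {
    return class_name == "org.apache.cassandra.index.sai.StorageAttachedIndex"
        || iequals(class_name, "storageattachedindex")
        || iequals(class_name, "sai");
}
```

Matching the aliases loosely keeps `CREATE CUSTOM INDEX ... USING 'SAI'` written for Cassandra working verbatim, while the rewrite to `"vector_index"` later in the hunk routes it to ScyllaDB's native implementation.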
static bool is_vector_index(const index_options_map& options) {
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
return class_it != options.end() && is_vector_capable_class(class_it->second);
}
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
const data_dictionary::database& db) const
{
@@ -265,8 +320,8 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
_idx_properties->validate();
// FIXME: This is ugly and can be improved.
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
const bool uses_view_properties = _view_properties.properties()->count() > 0
|| _view_properties.use_compact_storage()
|| _view_properties.defined_ordering().size() > 0;
@@ -352,6 +407,8 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
targets.emplace_back(raw_target->prepare(*schema));
}
maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties);
if (_idx_properties && _idx_properties->custom_class) {
auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class);
if (!custom_index_factory) {
@@ -697,7 +754,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
const index_options_map& options)
{
index_options_map new_options = options;
auto target_option = secondary_index::target_parser::serialize_targets(targets);
auto target_option = is_vector_index(options)
? secondary_index::vector_index::serialize_targets(targets)
: secondary_index::target_parser::serialize_targets(targets);
new_options.emplace(index_target::target_option_name, target_option);
const auto& first_target = targets.front()->value;


@@ -9,6 +9,7 @@
*/
#include "cql3/statements/cf_prop_defs.hh"
#include "utils/assert.hh"
#include <inttypes.h>
#include <boost/regex.hpp>
@@ -266,6 +267,13 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
}
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor" && !_column_aliases.empty()) {
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
}
}
auto& key_aliases = _key_aliases[0];
std::vector<data_type> key_types;
for (auto&& alias : key_aliases) {


@@ -273,7 +273,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
co_return coroutine::exception(
std::make_exception_ptr(exceptions::invalid_request_exception(
format("Consistency level {} is not allowed for write operations", cl))));
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl))));
}
_restrictions->validate_primary_key(options);
@@ -281,7 +284,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (has_conditions()) {
auto result = co_await execute_with_condition(qp, qs, options);
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
co_return result;
}
@@ -303,7 +307,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
}
if (keys_size_one) {
auto&& table = s->table();


@@ -52,6 +52,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
}
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
const auto mutate_result = co_await coordinator.get().mutate(_statement->s,
keys[0].start()->value().token(),
[&](api::timestamp_type ts) {
@@ -65,7 +66,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
raw_cql_statement, muts.size()));
}
return std::move(*muts.begin());
});
}, timeout, qs.get_client_state().get_abort_source());
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {

View File

@@ -42,7 +42,7 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
const auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
auto query_result = co_await coordinator.get().query(_query_schema, *read_command,
key_ranges, state.get_trace_state(), timeout);
key_ranges, state.get_trace_state(), timeout, state.get_client_state().get_abort_source());
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&query_result)) {
@@ -54,4 +54,4 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
read_command, options, now);
}
}
}

View File

@@ -250,8 +250,8 @@ void keyspace_metadata::validate(const gms::feature_service& fs, const locator::
if (params.consistency && !fs.strongly_consistent_tables) {
throw exceptions::configuration_exception("The strongly_consistent_tables feature must be enabled to use a consistency option");
}
if (params.consistency && *params.consistency == data_dictionary::consistency_config_option::global) {
throw exceptions::configuration_exception("Global consistency is not supported yet");
if (params.consistency && *params.consistency == data_dictionary::consistency_config_option::local) {
throw exceptions::configuration_exception("Local consistency is not supported yet");
}
}

View File

@@ -679,6 +679,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"The directory where hints files are stored if hinted handoff is enabled.")
, view_hints_directory(this, "view_hints_directory", value_status::Used, "",
"The directory where materialized-view updates are stored while a view replica is unreachable.")
, logstor_directory(this, "logstor_directory", value_status::Used, "",
"The directory where data files for logstor storage are stored.")
, saved_caches_directory(this, "saved_caches_directory", value_status::Unused, "",
"The directory location where table key and row caches are stored.")
/**
@@ -862,6 +864,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"* offheap_objects Native memory, eliminating NIO buffer heap overhead.")
, memtable_cleanup_threshold(this, "memtable_cleanup_threshold", value_status::Invalid, .11,
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load.")
, logstor_disk_size_in_mb(this, "logstor_disk_size_in_mb", value_status::Used, 2048,
"Total size in megabytes allocated for logstor storage on disk.")
, logstor_file_size_in_mb(this, "logstor_file_size_in_mb", value_status::Used, 32,
"Total size in megabytes allocated for each logstor data file on disk.")
, logstor_separator_delay_limit_ms(this, "logstor_separator_delay_limit_ms", value_status::Used, 100,
"Maximum delay in milliseconds for logstor separator debt control.")
, logstor_separator_max_memory_in_mb(this, "logstor_separator_max_memory_in_mb", value_status::Used, 256,
"Maximum memory in megabytes for logstor separator memory buffers.")
, file_cache_size_in_mb(this, "file_cache_size_in_mb", value_status::Unused, 512,
"Total memory to use for SSTable-reading buffers.")
, memtable_flush_queue_size(this, "memtable_flush_queue_size", value_status::Unused, 4,
@@ -1281,6 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted).")
, enable_cache(this, "enable_cache", value_status::Used, true, "Enable cache.")
, enable_commitlog(this, "enable_commitlog", value_status::Used, true, "Enable commitlog.")
, enable_logstor(this, "enable_logstor", value_status::Used, false, "Enable the logstor storage engine.")
, volatile_system_keyspace_for_testing(this, "volatile_system_keyspace_for_testing", value_status::Used, false, "Don't persist system keyspace - testing only!")
, api_port(this, "api_port", value_status::Used, 10000, "Http Rest API port.")
, api_address(this, "api_address", value_status::Used, "", "Http Rest API address.")
@@ -1571,7 +1582,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"\tnone : No auditing enabled.\n"
"\tsyslog : Audit messages sent to Syslog.\n"
"\ttable : Audit messages written to column family named audit.audit_log.\n")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
, audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
, audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
, audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")
@@ -1692,6 +1703,7 @@ void db::config::setup_directories() {
maybe_in_workdir(data_file_directories, "data");
maybe_in_workdir(hints_directory, "hints");
maybe_in_workdir(view_hints_directory, "view_hints");
maybe_in_workdir(logstor_directory, "logstor");
maybe_in_workdir(saved_caches_directory, "saved_caches");
}
@@ -1861,7 +1873,8 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},
{"tablets", feature::UNUSED},
{"views-with-tablets", feature::UNUSED},
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES}
{"strongly-consistent-tables", feature::STRONGLY_CONSISTENT_TABLES},
{"logstor", feature::LOGSTOR}
};
}
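Taken together, the new knobs might appear in `scylla.yaml` roughly as follows (illustrative values only, taken from the defaults in the hunks above; `logstor` must also be listed under `experimental_features` per the feature map just added):

```yaml
# Illustrative scylla.yaml fragment for the new logstor options
experimental_features:
    - logstor
enable_logstor: true
logstor_directory: /var/lib/scylla/logstor
logstor_disk_size_in_mb: 2048
logstor_file_size_in_mb: 32
logstor_separator_delay_limit_ms: 100
logstor_separator_max_memory_in_mb: 256
```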

View File

@@ -117,7 +117,8 @@ struct experimental_features_t {
ALTERNATOR_STREAMS,
BROADCAST_TABLES,
KEYSPACE_STORAGE_OPTIONS,
STRONGLY_CONSISTENT_TABLES
STRONGLY_CONSISTENT_TABLES,
LOGSTOR,
};
static std::map<sstring, feature> map(); // See enum_option.
static std::vector<enum_option<experimental_features_t>> all();
@@ -201,6 +202,7 @@ public:
named_value<uint64_t> data_file_capacity;
named_value<sstring> hints_directory;
named_value<sstring> view_hints_directory;
named_value<sstring> logstor_directory;
named_value<sstring> saved_caches_directory;
named_value<sstring> commit_failure_policy;
named_value<sstring> disk_failure_policy;
@@ -244,6 +246,10 @@ public:
named_value<bool> defragment_memory_on_idle;
named_value<sstring> memtable_allocation_type;
named_value<double> memtable_cleanup_threshold;
named_value<uint32_t> logstor_disk_size_in_mb;
named_value<uint32_t> logstor_file_size_in_mb;
named_value<uint32_t> logstor_separator_delay_limit_ms;
named_value<uint32_t> logstor_separator_max_memory_in_mb;
named_value<uint32_t> file_cache_size_in_mb;
named_value<uint32_t> memtable_flush_queue_size;
named_value<uint32_t> memtable_flush_writers;
@@ -364,6 +370,7 @@ public:
named_value<bool> enable_in_memory_data_store;
named_value<bool> enable_cache;
named_value<bool> enable_commitlog;
named_value<bool> enable_logstor;
named_value<bool> volatile_system_keyspace_for_testing;
named_value<uint16_t> api_port;
named_value<sstring> api_address;

View File

@@ -22,7 +22,7 @@ corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
_metrics.add_group("corrupt_data", {
sm::make_counter("entries_reported", _stats.corrupt_data_reported,
sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
"A non-zero value indicates that the database suffered data corruption."))
"A non-zero value indicates that the database suffered data corruption.")).set_skip_when_empty()
});
}
}

View File

@@ -50,9 +50,7 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
size_t mut_size = fm->representation().size();
shard_stats().size_of_hints_in_progress += mut_size;
if (utils::get_local_injector().enter("slow_down_writing_hints")) {
co_await seastar::sleep(std::chrono::seconds(10));
}
co_await utils::get_local_injector().inject("slow_down_writing_hints", std::chrono::seconds(10));
try {
const auto shared_lock = co_await get_shared_lock(file_update_mutex());

View File

@@ -186,7 +186,7 @@ void manager::register_metrics(const sstring& group_name) {
sm::description("Number of unexpected errors during sending, sending will be retried later")),
sm::make_counter("corrupted_files", _stats.corrupted_files,
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")).set_skip_when_empty(),
sm::make_gauge("pending_drains",
sm::description("Number of tasks waiting in the queue for draining hints"),

View File

@@ -206,7 +206,7 @@ void rate_limiter_base::register_metrics() {
sm::description("Number of times a lookup returned an already allocated entry.")),
sm::make_counter("failed_allocations", _metrics.failed_allocations,
sm::description("Number of times the rate limiter gave up trying to allocate.")),
sm::description("Number of times the rate limiter gave up trying to allocate.")).set_skip_when_empty(),
sm::make_counter("probe_count", _metrics.probe_count,
sm::description("Number of probes made during lookups.")),

View File

@@ -174,7 +174,7 @@ cache_tracker::setup_metrics() {
sm::make_counter("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
sm::make_counter("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
sm::make_counter("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload).set_skip_when_empty(),
sm::make_counter("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
sm::make_counter("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,

View File

@@ -336,6 +336,8 @@ schema_ptr scylla_tables(schema_features features) {
// since it is written to only after the cluster feature is enabled.
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
sb.with_column("storage_engine", utf8_type);
sb.with_hash_version();
s = sb.build();
}
@@ -1676,6 +1678,9 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
}
}
if (table->logstor_enabled()) {
m.set_clustered_cell(ckey, "storage_engine", "logstor", timestamp);
}
// In-memory tables are deprecated since scylla-2024.1.0
// FIXME: delete the column when there's no live version supporting it anymore.
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
@@ -2161,6 +2166,13 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
auto tablet_options = db::tablet_options(*opt_map);
builder.set_tablet_options(tablet_options.to_map());
}
if (auto storage_engine = table_row.get<sstring>("storage_engine")) {
if (*storage_engine == "logstor") {
builder.set_logstor();
} else {
throw std::invalid_argument(format("Invalid value for storage_engine: {}", *storage_engine));
}
}
}
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)

View File

@@ -144,7 +144,7 @@ static std::vector<sstring> get_keyspaces(const schema& s, const replica::databa
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
static dht::partition_range as_ring_position_range(const dht::token_range& r) {
std::optional<wrapping_interval<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
@@ -156,11 +156,14 @@ static dht::partition_range as_ring_position_range(dht::token_range& r) {
}
/**
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
* Add a new range_estimates for the specified range, considering the sstables associated
* with the table identified by `cf_id` across all shards.
*/
static future<system_keyspace::range_estimates> estimate(const replica::column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
static future<system_keyspace::range_estimates> estimate(replica::database& db, table_id cf_id, schema_ptr schema, const token_range& r) {
struct shard_estimate {
int64_t count = 0;
utils::estimated_histogram hist{0};
};
auto from_bytes = [] (auto& b) {
return dht::token::from_sstring(utf8_type->to_string(b));
};
@@ -169,14 +172,35 @@ static future<system_keyspace::range_estimates> estimate(const replica::column_f
wrapping_interval<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += co_await sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_partition_size);
// Estimate partition count and size distribution from sstables on a single shard.
auto estimate_on_shard = [cf_id, ranges] (replica::database& local_db) -> future<shard_estimate> {
auto table_ptr = local_db.get_tables_metadata().get_table_if_exists(cf_id);
if (!table_ptr) {
co_return shard_estimate{};
}
}
co_return system_keyspace::range_estimates{cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
auto& cf = *table_ptr;
shard_estimate result;
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
result.count += co_await sstable->estimated_keys_for_range(r);
result.hist.merge(sstable->get_stats_metadata().estimated_partition_size);
}
}
co_return result;
};
// Combine partial results from two shards.
auto reduce = [] (shard_estimate a, const shard_estimate& b) {
a.count += b.count;
a.hist.merge(b.hist);
return a;
};
auto aggregate = co_await db.container().map_reduce0(std::move(estimate_on_shard), shard_estimate{}, std::move(reduce));
int64_t mean_size = aggregate.count > 0 ? aggregate.hist.mean() : 0;
co_return system_keyspace::range_estimates{std::move(schema), r.start, r.end, aggregate.count, mean_size};
}
/**
@@ -321,7 +345,7 @@ size_estimates_mutation_reader::estimates_for_current_keyspace(std::vector<token
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = _db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(co_await estimate(cf, r.tokens));
estimates.push_back(co_await estimate(_db, cf.schema()->id(), cf.schema(), r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
co_return estimates;
}

View File

@@ -18,8 +18,11 @@
#include <seastar/coroutine/parallel_for_each.hh>
#include "db/snapshot-ctl.hh"
#include "db/snapshot/backup_task.hh"
#include "db/schema_tables.hh"
#include "index/secondary_index_manager.hh"
#include "replica/database.hh"
#include "replica/global_table_ptr.hh"
#include "replica/schema_describe_helper.hh"
#include "sstables/sstables_manager.hh"
#include "service/storage_proxy.hh"
@@ -154,14 +157,56 @@ future<> snapshot_ctl::do_take_cluster_column_family_snapshot(std::vector<sstrin
);
}
sstring snapshot_ctl::resolve_table_name(const sstring& ks_name, const sstring& name) const {
try {
_db.local().find_uuid(ks_name, name);
return name;
} catch (const data_dictionary::no_such_column_family&) {
// The name may be a logical index name (e.g. "myindex").
// Only indexes with a backing view have a separate backing table
// that can be snapshotted. Custom indexes such as vector indexes
// do not, so keep rejecting them here rather than mapping them to
// a synthetic name.
auto schema = _db.local().find_indexed_table(ks_name, name);
if (schema) {
const auto& im = schema->all_indices().at(name);
if (db::schema_tables::view_should_exist(im)) {
return secondary_index::index_table_name(name);
}
}
throw;
}
}
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
for (auto& t : tables) {
t = resolve_table_name(ks_name, t);
}
co_await check_snapshot_not_exist(ks_name, tag, tables);
co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), opts);
}
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
return run_snapshot_modify_operation([this, tag = std::move(tag), keyspace_names = std::move(keyspace_names), cf_name = std::move(cf_name)] {
return _db.local().clear_snapshot(tag, keyspace_names, cf_name);
co_return co_await run_snapshot_modify_operation([this, tag = std::move(tag), keyspace_names = std::move(keyspace_names), cf_name = std::move(cf_name)] (this auto) -> future<> {
// clear_snapshot enumerates keyspace_names and uses cf_name as a
// filter in each. When cf_name needs resolution (e.g. logical index
// name -> backing table name), the result may differ per keyspace,
// so resolve and clear individually.
if (!cf_name.empty() && !keyspace_names.empty()) {
std::vector<std::pair<sstring, sstring>> resolved_targets;
resolved_targets.reserve(keyspace_names.size());
// Resolve every keyspace first so a later failure doesn't delete
// snapshots that were already matched in earlier keyspaces.
for (const auto& ks_name : keyspace_names) {
resolved_targets.emplace_back(ks_name, resolve_table_name(ks_name, cf_name));
}
for (auto& [ks_name, resolved_cf_name] : resolved_targets) {
co_await _db.local().clear_snapshot(tag, {ks_name}, std::move(resolved_cf_name));
}
co_return;
}
co_await _db.local().clear_snapshot(std::move(tag), std::move(keyspace_names), cf_name);
});
}
@@ -170,7 +215,26 @@ snapshot_ctl::get_snapshot_details() {
using snapshot_map = std::unordered_map<sstring, db_snapshot_details>;
co_return co_await run_snapshot_list_operation(coroutine::lambda([this] () -> future<snapshot_map> {
return _db.local().get_snapshot_details();
auto details = co_await _db.local().get_snapshot_details();
for (auto& [snapshot_name, snapshot_details] : details) {
for (auto& table : snapshot_details) {
auto schema = _db.local().as_data_dictionary().try_find_table(
table.ks, table.cf);
if (!schema || !schema->schema()->is_view()) {
continue;
}
auto helper = replica::make_schema_describe_helper(
schema->schema(), _db.local().as_data_dictionary());
if (helper.type == schema_describe_helper::type::index) {
table.cf = secondary_index::index_name_from_table_name(
table.cf);
}
}
}
co_return details;
}));
}
@@ -235,4 +299,4 @@ future<int64_t> snapshot_ctl::true_snapshots_size(sstring ks, sstring cf) {
}));
}
}
}

View File

@@ -133,6 +133,12 @@ private:
future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});
// Resolve a user-provided table name that may be a logical index name
// (e.g. "myindex") to its backing column family name (e.g.
// "myindex_index"). Returns the name unchanged if it already
// matches a column family.
sstring resolve_table_name(const sstring& ks_name, const sstring& name) const;
future<> run_snapshot_modify_operation(noncopyable_function<future<>()> &&);
template <typename Func>
@@ -151,4 +157,4 @@ private:
future<> do_take_cluster_column_family_snapshot(std::vector<sstring> ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
};
}
}

View File

@@ -281,6 +281,7 @@ schema_ptr system_keyspace::topology() {
.with_column("cleanup_status", utf8_type)
.with_column("supported_features", set_type_impl::get_instance(utf8_type, true))
.with_column("request_id", timeuuid_type)
.with_column("intended_storage_mode", utf8_type)
.with_column("ignore_nodes", set_type_impl::get_instance(uuid_type, true), column_kind::static_column)
.with_column("new_cdc_generation_data_uuid", timeuuid_type, column_kind::static_column)
.with_column("new_keyspace_rf_change_ks_name", utf8_type, column_kind::static_column) // deprecated
@@ -323,6 +324,7 @@ schema_ptr system_keyspace::topology_requests() {
.with_column("snapshot_tag", utf8_type)
.with_column("snapshot_expiry", timestamp_type)
.with_column("snapshot_skip_flush", boolean_type)
.with_column("finalize_migration_ks_name", utf8_type)
.set_comment("Topology request tracking")
.with_hash_version()
.build();
@@ -3052,7 +3054,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
co_return ret;
}
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
const bool tablet_balancing_not_supported = _db.features().strongly_consistent_tables || _db.features().logstor;
for (auto& row : *rs) {
if (!row.has("host_id")) {
@@ -3169,6 +3171,11 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
}
}
std::optional<service::intended_storage_mode> storage_mode;
if (row.has("intended_storage_mode")) {
storage_mode = service::intended_storage_mode_from_string(row.get_as<sstring>("intended_storage_mode"));
}
std::unordered_map<raft::server_id, service::replica_state>* map = nullptr;
if (nstate == service::node_state::normal) {
map = &ret.normal_nodes;
@@ -3193,7 +3200,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
map->emplace(host_id, service::replica_state{
nstate, std::move(datacenter), std::move(rack), std::move(release_version),
ring_slice, shard_count, ignore_msb, std::move(supported_features),
service::cleanup_status_from_string(cleanup_status), request_id});
service::cleanup_status_from_string(cleanup_status), request_id, storage_mode});
}
}
@@ -3289,7 +3296,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
}
if (strongly_consistent_tables) {
if (tablet_balancing_not_supported) {
ret.tablet_balancing_enabled = false;
} else if (some_row.has("tablet_balancing_enabled")) {
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
@@ -3506,6 +3513,9 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
entry.snapshot_expiry = row.get_as<db_clock::time_point>("snapshot_expiry");
}
}
if (row.has("finalize_migration_ks_name")) {
entry.finalize_migration_ks_name = row.get_as<sstring>("finalize_migration_ks_name");
}
return entry;
}

View File

@@ -427,6 +427,7 @@ public:
std::optional<sstring> snapshot_tag;
std::optional<db_clock::time_point> snapshot_expiry;
bool snapshot_skip_flush;
std::optional<sstring> finalize_migration_ks_name;
};
using topology_requests_entries = std::unordered_map<utils::UUID, system_keyspace::topology_requests_entry>;

View File

@@ -2647,7 +2647,7 @@ future<> view_builder::add_new_view(view_ptr view, build_step& step) {
}
if (this_shard_id() == smp::count - 1) {
co_await utils::get_local_injector().inject("add_new_view_pause_last_shard", utils::wait_for_message(5min));
inject_failure("add_new_view_fail_last_shard");
}
co_await _sys_ks.register_view_for_building(view->ks_name(), view->cf_name(), step.current_token());

View File

@@ -143,10 +143,18 @@ dht::token_range view_building_worker::get_tablet_token_range(table_id table_id,
}
future<> view_building_worker::drain() {
auto drain_started = std::exchange(_drain_started, started_drain::yes);
if (drain_started == started_drain::no) {
_drain_finished = shared_future(do_drain());
}
return _drain_finished.get_future();
}
future<> view_building_worker::do_drain() {
if (!_as.abort_requested()) {
_as.request_abort();
}
_state._mutex.broken();
co_await _staging_sstables_mutex.wait();
_staging_sstables_mutex.broken();
_sstables_to_register_event.broken();
if (this_shard_id() == 0) {
@@ -156,7 +164,9 @@ future<> view_building_worker::drain() {
co_await std::move(state_observer);
co_await _mnotifier.unregister_listener(this);
}
co_await _state.clear();
co_await _state._mutex.wait();
_state._mutex.broken();
co_await _state.drain();
co_await uninit_messaging_service();
}
@@ -200,9 +210,7 @@ future<> view_building_worker::run_staging_sstables_registrator() {
while (!_as.abort_requested()) {
bool sleep = false;
try {
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
co_await create_staging_sstable_tasks();
lock.return_all();
_as.check();
co_await _sstables_to_register_event.when();
} catch (semaphore_aborted&) {
@@ -227,13 +235,45 @@ future<> view_building_worker::run_staging_sstables_registrator() {
}
}
future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
SCYLLA_ASSERT(this_shard_id() == 0);
// Collect `_staging_sstables_mutex` locks from multiple shards,
// so other shards won't interact with their `_staging_sstables` map
// until the caller releases them.
std::vector<foreign_ptr<semaphore_units<>>> locks;
locks.resize(smp::count);
// Locks are acquired from multiple shards in parallel.
// This is the only place where multiple-shard locks are acquired at once
// and the method is called only once at a time (from `create_staging_sstable_tasks()`
// on shard 0), so no deadlock may occur.
co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
auto& vbw = sharded_vbw.local();
auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
co_return make_foreign(std::move(lock));
});
locks[shard_id] = std::move(lock_ptr);
});
co_return std::move(locks);
}
future<> view_building_worker::create_staging_sstable_tasks() {
// Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
if (_sstables_to_register.empty()) {
co_return;
}
utils::chunked_vector<canonical_mutation> cmuts;
auto shards = _sstables_to_register
| std::views::values
| std::views::join
| std::views::transform([] (const auto& sst_info) { return sst_info.shard; })
| std::ranges::to<std::flat_set<shard_id>>();
shards.erase(0); // We're already holding shard0 lock
auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
utils::chunked_vector<canonical_mutation> cmuts;
auto guard = co_await _group0.client().start_operation(_as);
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
for (auto& [table_id, sst_infos]: _sstables_to_register) {
@@ -460,6 +500,16 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
}) | std::ranges::to<std::unordered_set>();
}
void view_building_worker::state::start_batch(std::unique_ptr<batch> batch) {
if (_drained) {
on_internal_error(vbw_logger, "view_building_worker::state was already drained");
} else if (_batch) {
on_internal_error(vbw_logger, fmt::format("view_building_worker::state::start_batch(): some batch (tasks: {}) is already running", _batch->tasks | std::views::keys));
}
_batch = std::move(batch);
_batch->start();
}
// If `state::processing_base_table` is different from `view_building_state::currently_processed_base_table`,
// clear the state, then save and flush the new base table
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
@@ -485,6 +535,10 @@ future<> view_building_worker::state::clean_up_after_batch() {
// Flush base table, set is as currently processing base table and save which views exist at the time of flush
future<> view_building_worker::state::flush_base_table(replica::database& db, table_id base_table_id, abort_source& as) {
if (_drained) {
on_internal_error(vbw_logger, "view_building_worker::state was already drained");
}
auto cf = db.find_column_family(base_table_id).shared_from_this();
co_await when_all(cf->await_pending_writes(), cf->await_pending_streams());
co_await flush_base(cf, as);
@@ -503,6 +557,11 @@ future<> view_building_worker::state::clear() {
flushed_views.clear();
}
future<> view_building_worker::state::drain() {
_drained = true;
co_await clear();
}
view_building_worker::batch::batch(sharded<view_building_worker>& vbw, std::unordered_map<utils::UUID, view_building_task> tasks, table_id base_id, locator::tablet_replica replica)
: base_id(base_id)
, replica(replica)
@@ -667,24 +726,34 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
}
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
if (_staging_sstables[table_id].empty()) {
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
std::vector<sstables::shared_sstable> sstables_to_process;
try {
// Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
// concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
// while we read them.
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
}
lock.return_all();
} catch (semaphore_aborted&) {
vbw_logger.warn("Semaphore was aborted while waiting to remove processed sstables for table {}", table_id);
co_return;
}
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
std::vector<sstables::shared_sstable> sstables_to_process;
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
if (sstables_to_process.empty()) {
co_return;
}
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
try {
@@ -799,8 +868,8 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
}
// Create and start the batch
_state._batch = std::make_unique<batch>(container(), std::move(tasks), *building_state.currently_processed_base_table, my_replica);
_state._batch->start();
auto batch = std::make_unique<view_building_worker::batch>(container(), std::move(tasks), *building_state.currently_processed_base_table, my_replica);
_state.start_batch(std::move(batch));
}
if (std::ranges::all_of(ids, [&] (auto& id) { return !_state._batch->tasks.contains(id); })) {

View File

@@ -14,6 +14,7 @@
#include <seastar/core/shared_future.hh>
#include <unordered_map>
#include <unordered_set>
#include <flat_set>
#include "locator/abstract_replication_strategy.hh"
#include "locator/tablets.hh"
#include "raft/raft.hh"
@@ -98,11 +99,14 @@ class view_building_worker : public seastar::peering_sharded_service<view_buildi
std::unordered_set<table_id> flushed_views;
semaphore _mutex = semaphore(1);
bool _drained = false;
// All of the methods below should be executed while holding `_mutex` units!
void start_batch(std::unique_ptr<batch> batch);
future<> update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as);
future<> flush_base_table(replica::database& db, table_id base_table_id, abort_source& as);
future<> clean_up_after_batch();
future<> clear();
future<> drain();
};
// Wrapper which represents information needed to create
@@ -169,14 +173,24 @@ private:
future<> do_process_staging(table_id base_id, dht::token last_token);
future<> run_staging_sstables_registrator();
// Caller must hold units from `_staging_sstables_mutex`
// Acquires `_staging_sstables_mutex` on all shards internally,
// so callers must not hold `_staging_sstables_mutex` when invoking it.
future<> create_staging_sstable_tasks();
future<> discover_existing_staging_sstables();
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
// Acquire `_staging_sstables_mutex` on multiple shards in parallel.
// Must be called only from shard 0.
// Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);
void init_messaging_service();
future<> uninit_messaging_service();
future<std::vector<utils::UUID>> work_on_tasks(raft::term_t term, std::vector<utils::UUID> ids);
using started_drain = bool_class<struct started_drain_tag>;
started_drain _drain_started = started_drain::no;
shared_future<> _drain_finished;
future<> do_drain();
};
}

View File

@@ -99,7 +99,7 @@ public:
set_cell(cr, "up", gossiper.is_alive(hostid));
if (gossiper.is_shutdown(endpoint)) {
set_cell(cr, "status", gossiper.get_gossip_status(endpoint));
set_cell(cr, "status", "shutdown");
} else {
set_cell(cr, "status", boost::to_upper_copy<std::string>(fmt::format("{}", ss.get_node_state(hostid))));
}
@@ -224,12 +224,12 @@ public:
}
if (_db.find_keyspace(e.name).get_replication_strategy().uses_tablets()) {
co_await _db.get_tables_metadata().for_each_table_gently([&, this] (table_id, lw_shared_ptr<replica::table> table) -> future<> {
co_await _db.get_tables_metadata().for_each_table_gently([&, this] (table_id tid, lw_shared_ptr<replica::table> table) -> future<> {
if (table->schema()->ks_name() != e.name) {
co_return;
}
const auto& table_name = table->schema()->cf_name();
utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(e.name, table_name);
utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(tid);
co_await emit_ring(result, e.key, table_name, std::move(ranges));
});
} else {

View File

@@ -30,6 +30,31 @@ enum class token_kind {
after_all_keys,
};
// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
struct raw_token {
int64_t value;
/// Constructs a disengaged token.
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
/// Constructs an engaged token.
/// The token must be of token_kind::key kind.
explicit raw_token(const token&);
explicit raw_token(int64_t v) : value(v) {};
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
std::strong_ordering operator<=>(const token& o) const noexcept;
/// Returns true iff engaged.
explicit operator bool() const noexcept {
return value != std::numeric_limits<int64_t>::min();
}
};
using raw_token_opt = seastar::optimized_optional<raw_token>;
class token {
// INT64_MIN is not a legal token, but a special value used to represent
// infinity in token intervals.
@@ -52,6 +77,10 @@ public:
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
token(raw_token raw) noexcept
: token(raw ? kind::key : kind::before_all_keys, raw.value)
{ }
// This constructor seems redundant with the bytes_view constructor, but
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -223,6 +252,29 @@ public:
}
};
inline
raw_token::raw_token(const token& t)
: value(t.raw())
{
#ifdef DEBUG
assert(t._kind == token::kind::key);
#endif
}
inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
switch (o._kind) {
case token::kind::after_all_keys:
return std::strong_ordering::less;
case token::kind::before_all_keys:
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
// So we can order them by just comparing raw values.
[[fallthrough]];
case token::kind::key:
return value <=> o._data;
}
}
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
if (l1 == l2) {
return std::strong_ordering::equal;
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
}
};
template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const dht::raw_token& t, FormatContext& ctx) const {
if (!t) {
return fmt::format_to(ctx.out(), "null");
}
return fmt::format_to(ctx.out(), "{}", t.value);
}
};
namespace std {
template<>

View File

@@ -9,6 +9,7 @@
import os
import sys
import shlex
import argparse
import psutil
from pathlib import Path
@@ -103,16 +104,41 @@ if __name__ == '__main__':
run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
swapfile.chmod(0o600)
run('mkswap -f {}'.format(swapfile), shell=True, check=True)
mount_point = find_mount_point(swap_directory)
mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
# Add DefaultDependencies=no to the swap unit to avoid getting the default
# Before=swap.target dependency. We apply this to all clouds, but the
# requirement came from Azure:
#
# On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
# However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
# the network (After=network-online.target). By extension, this means that
# the swap unit depends on the network. If we didn't use DefaultDependencies=no,
# then the swap unit would be part of the swap.target which other services
# assume to be a local boot target, so we would end up with dependency cycles
# such as:
#
# swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
#
# By removing the automatic Before=swap.target, the swap unit is no longer
# part of swap.target, avoiding such cycles. The swap will still be
# activated via WantedBy=multi-user.target.
unit_data = '''
[Unit]
Description=swapfile
DefaultDependencies=no
After={}
Conflicts=umount.target
Before=umount.target
[Swap]
What={}
[Install]
WantedBy=multi-user.target
'''[1:-1].format(swapfile)
'''[1:-1].format(mount_unit, swapfile)
with swapunit.open('w') as f:
f.write(unit_data)
systemd_unit.reload()

View File

@@ -1 +1 @@
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"
SCYLLA_NODE_EXPORTER_ARGS="--collector.interrupts --collector.ethtool.metrics-include='(bw_in_allowance_exceeded|bw_out_allowance_exceeded|conntrack_allowance_exceeded|conntrack_allowance_available|linklocal_allowance_exceeded)' --collector.ethtool --collector.systemd --collector.systemd.unit-include='^(scylla-server|systemd-coredump.*)\.service$' --no-collector.hwmon --no-collector.bcache --no-collector.btrfs --no-collector.fibrechannel --no-collector.infiniband --no-collector.ipvs --no-collector.nfs --no-collector.nfsd --no-collector.powersupplyclass --no-collector.rapl --no-collector.tapestats --no-collector.thermal_zone --no-collector.udp_queues --no-collector.zfs"

View File

@@ -1,6 +1,12 @@
### a dictionary of redirections
#old path: new path
# Move the Upgrade Support (About Upgrade) page
/stable/upgrade/about-upgrade.html: https://docs.scylladb.com/stable/versioning/upgrade-policy.html
/branch-2025.4/upgrade/about-upgrade.html: https://docs.scylladb.com/stable/versioning/upgrade-policy.html
/branch-2026.1/upgrade/about-upgrade.html: https://docs.scylladb.com/stable/versioning/upgrade-policy.html
# Move the OS Support page
/stable/getting-started/os-support.html: https://docs.scylladb.com/stable/versioning/os-support-per-version.html

View File

@@ -31,7 +31,7 @@ was used. Alternator currently supports two compression algorithms, `gzip`
and `deflate`, both standardized in ([RFC 9110](https://www.rfc-editor.org/rfc/rfc9110.html)).
Other standard compression types which are listed in
[IANA's HTTP Content Coding Registry](https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding),
including `zstd` ([RFC 8878][https://www.rfc-editor.org/rfc/rfc8878.html]),
including `zstd` ([RFC 8878](https://www.rfc-editor.org/rfc/rfc8878.html)),
are not yet supported by Alternator.
Note that HTTP's compression only compresses the request's _body_ - not the

View File

@@ -139,7 +139,7 @@ The ``WHERE`` clause
~~~~~~~~~~~~~~~~~~~~
The ``WHERE`` clause specifies which rows must be queried. It is composed of relations on the columns that are part of
the ``PRIMARY KEY``.
the ``PRIMARY KEY``, and relations can be joined only with ``AND`` (``OR`` and other logical operators are not supported).
Not all relations are allowed in a query. For instance, non-equal relations (where ``IN`` is considered as an equal
relation) on a partition key are not supported (see the use of the ``TOKEN`` method below to do non-equal queries on
@@ -200,6 +200,23 @@ The tuple notation may also be used for ``IN`` clauses on clustering columns::
WHERE userid = 'john doe'
AND (blog_title, posted_at) IN (('John''s Blog', '2012-01-01'), ('Extreme Chess', '2014-06-01'))
This tuple notation is different from boolean grouping. For example, the following query is not supported::
SELECT * FROM users
WHERE (country = 'BR' AND state = 'SP')
because parentheses are only allowed around a single relation, so this works: ``(country = 'BR') AND (state = 'SP')``, but this does not: ``(country = 'BR' AND state = 'SP')``.
Similarly, an extended query of the form of::
SELECT * FROM users
WHERE (country = 'BR' AND state = 'SP')
OR (country = 'BR' AND state = 'RJ')
won't work for two reasons: boolean grouping of relations is not supported, and neither is ``OR``. When possible,
rewrite such queries with ``IN`` on the varying column, for example
``country = 'BR' AND state IN ('SP', 'RJ')``, or run multiple queries and merge
the results client-side.
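When such a rewrite applies, the resulting single supported query looks like this (a sketch, assuming the ``users`` table from the examples above)::

   SELECT * FROM users
   WHERE country = 'BR'
   AND state IN ('SP', 'RJ');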
The ``CONTAINS`` operator may only be used on collection columns (lists, sets, and maps). In the case of maps,
``CONTAINS`` applies to the map values. The ``CONTAINS KEY`` operator may only be used on map columns and applies to the
map keys.
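For example (a sketch, assuming a hypothetical ``users`` table with an indexed ``set<text>`` column ``emails`` and an indexed ``map<text, text>`` column ``todo``)::

   SELECT * FROM users WHERE emails CONTAINS 'alice@example.com';
   SELECT * FROM users WHERE todo CONTAINS KEY '2026-01-01';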

236
docs/cql/guardrails.rst Normal file
View File

@@ -0,0 +1,236 @@
.. highlight:: cql
.. _cql-guardrails:
CQL Guardrails
==============
ScyllaDB provides a set of configurable guardrail parameters that help operators
enforce best practices and prevent misconfigurations that could degrade cluster
health, availability, or performance. Guardrails operate at two severity levels:
* **Warn**: The request succeeds, but the server includes a warning in the CQL
response. Depending on the specific guardrail, the warning may also be logged on the server side.
* **Fail**: The request is rejected with an error/exception (the specific type
depends on the guardrail). The user must correct the request or adjust the
guardrail configuration to proceed.
.. note::
Guardrails are checked only when a statement is
executed. They do not retroactively validate existing keyspaces, tables, or
previously completed writes.
For the full list of configuration properties, including types, defaults, and
liveness information, see :doc:`Configuration Parameters </reference/configuration-parameters>`.
.. _guardrails-replication-factor:
Replication Factor Guardrails
-----------------------------
These four parameters control the minimum and maximum allowed replication factor
(RF) values. They are evaluated whenever a ``CREATE KEYSPACE`` or
``ALTER KEYSPACE`` statement is executed. Each data center's RF is checked
individually.
An RF of ``0`` — which means "do not replicate to this data center" — is
always allowed and never triggers a guardrail.
A threshold value of ``-1`` disables the corresponding check.
``minimum_replication_factor_warn_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF is set to a value greater than ``0`` and lower than
this threshold, the server attaches a warning to the CQL response identifying
the offending data center and RF value.
**When to use.** The default of ``3`` is the standard recommendation for
production clusters. An RF below ``3`` means that the cluster cannot tolerate
even a single node failure without data loss or read unavailability (assuming
``QUORUM`` consistency). Keep this at ``3`` unless your deployment has specific
constraints (e.g., a development or test cluster with fewer than 3 nodes).
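For example, with the default warn threshold of ``3``, the following statement succeeds but carries a warning for ``dc1`` (``low_rf`` is a hypothetical keyspace name)::

   CREATE KEYSPACE low_rf
   WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1};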
``minimum_replication_factor_fail_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF is set to a value greater than ``0`` and lower than
this threshold, the request is rejected with a ``ConfigurationException``
identifying the offending data center and RF value.
**When to use.** Enable this parameter (e.g., set to ``3``) in production
environments where allowing a low RF would be operationally dangerous. Unlike
the warn threshold, this provides a hard guarantee that no keyspace can be
created or altered to have an RF below the limit.
``maximum_replication_factor_warn_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF exceeds this threshold, the server attaches a warning to the CQL response identifying
the offending data center and RF value.
**When to use.** An excessively high RF increases write amplification and
storage costs proportionally. For example, an RF of ``5`` means every write
is replicated to five nodes. Set this threshold to alert operators who
may unintentionally set an RF that is too high.
``maximum_replication_factor_fail_threshold``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If any data center's RF exceeds this threshold, the request is rejected with a ``ConfigurationException``
identifying the offending data center and RF value.
**When to use.** Enable this parameter to prevent accidental creation of
keyspaces with an unreasonably high RF. An extremely high RF wastes storage and
network bandwidth and can lead to write latency spikes. This is a hard limit —
the keyspace creation or alteration will not proceed until the RF is lowered.
**Metrics.** ScyllaDB exposes per-shard metrics that track the number of
times each replication factor guardrail has been triggered:
* ``scylla_cql_minimum_replication_factor_warn_violations``
* ``scylla_cql_minimum_replication_factor_fail_violations``
* ``scylla_cql_maximum_replication_factor_warn_violations``
* ``scylla_cql_maximum_replication_factor_fail_violations``
A sustained increase in any of these metrics indicates that
``CREATE KEYSPACE`` or ``ALTER KEYSPACE`` requests are hitting the configured
thresholds.
.. _guardrails-replication-strategy:
Replication Strategy Guardrails
-------------------------------
These two parameters control which replication strategies trigger warnings or
are rejected when a keyspace is created or altered.
``replication_strategy_warn_list``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
statement is on this list, the server attaches a warning to the CQL response
identifying the discouraged strategy and the affected keyspace.
**When to use.** ``SimpleStrategy`` is not recommended for production use.
It places replicas without awareness of data center or rack topology, which
can undermine fault tolerance in multi-DC deployments. Even in single-DC
deployments, ``NetworkTopologyStrategy`` is recommended because it keeps the
schema ready for future topology changes.
The default configuration warns on ``SimpleStrategy``, which is appropriate
for most deployments. If you have existing keyspaces that use
``SimpleStrategy``, see :doc:`Update Topology Strategy From Simple to Network
</operating-scylla/procedures/cluster-management/update-topology-strategy-from-simple-to-network>`
for the migration procedure.
``replication_strategy_fail_list``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the replication strategy used in a ``CREATE KEYSPACE`` or ``ALTER KEYSPACE``
statement is on this list, the request is rejected with a
``ConfigurationException`` identifying the forbidden strategy and the affected
keyspace.
**When to use.** In production environments, add ``SimpleStrategy`` to this
list to enforce ``NetworkTopologyStrategy`` across all keyspaces. This helps
prevent new production keyspaces from being created with a topology-unaware
strategy.
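For example, if ``SimpleStrategy`` is on the fail list, the following statement is rejected with a ``ConfigurationException`` (``legacy_ks`` is a hypothetical keyspace name)::

   CREATE KEYSPACE legacy_ks
   WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};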
**Metrics.** The following per-shard metrics track replication strategy
guardrail violations:
* ``scylla_cql_replication_strategy_warn_list_violations``
* ``scylla_cql_replication_strategy_fail_list_violations``
.. _guardrails-write-consistency-level:
Write Consistency Level Guardrails
----------------------------------
These two parameters control which consistency levels (CL) are allowed for
write operations (``INSERT``, ``UPDATE``, ``DELETE``, and ``BATCH``
statements).
Be aware that adding warnings to CQL responses can significantly increase
network traffic and reduce overall throughput.
``write_consistency_levels_warned``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If a write operation uses a consistency level on this list, the server attaches
a warning to the CQL response identifying the discouraged consistency level.
**When to use.** Use this parameter to alert application developers when they
use a consistency level that, while technically functional, is not recommended
for the workload. Common examples:
* **Warn on** ``ANY``: writes at ``ANY`` are acknowledged as soon as at least
one node (including a coordinator acting as a hinted handoff store) receives
the mutation. This means data may not be persisted on any replica node at
the time of acknowledgement, risking data loss if the coordinator fails
before hinted handoff completes.
* **Warn on** ``ALL``: writes at ``ALL`` require every replica to acknowledge
the write. If any single replica is down, the write fails. This significantly
reduces write availability.
``write_consistency_levels_disallowed``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If a write operation uses a consistency level on this list, the request is
rejected with an ``InvalidRequestException`` identifying the forbidden
consistency level.
**When to use.** Use this parameter to hard-block consistency levels that are
considered unsafe for your deployment:
* **Disallow** ``ANY``: in production environments, ``ANY`` is almost never
appropriate. It provides the weakest durability guarantee and is a common
source of data-loss incidents when operators or application developers use it
unintentionally.
* **Disallow** ``ALL``: in clusters where high write availability is critical,
blocking ``ALL`` prevents a single node failure from causing write
unavailability.
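For example, with ``ANY`` in ``write_consistency_levels_disallowed``, a write issued at that level from ``cqlsh`` is rejected (``ks.events`` is a hypothetical table)::

   CONSISTENCY ANY;
   INSERT INTO ks.events (id, payload) VALUES (1, 'x');
   -- rejected with an InvalidRequestException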
**Metrics.** The following per-shard metrics track write consistency level
guardrail violations:
* ``scylla_cql_write_consistency_levels_warned_violations``
* ``scylla_cql_write_consistency_levels_disallowed_violations``
Additionally, ScyllaDB exposes the
``scylla_cql_writes_per_consistency_level`` metric, labeled by consistency
level, which tracks the total number of write requests per CL. This metric is
useful for understanding the current write-CL distribution across the cluster
*before* deciding which levels to warn on or disallow. For example, querying
this metric can reveal whether any application is inadvertently using ``ANY``
or ``ALL`` for writes.
.. _guardrails-compact-storage:
Compact Storage Guardrail
-------------------------
``enable_create_table_with_compact_storage``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This boolean parameter controls whether ``CREATE TABLE`` statements with the
deprecated ``COMPACT STORAGE`` option are allowed. Unlike the other guardrails,
it acts as a simple on/off switch rather than using separate warn and fail
thresholds.
**When to use.** Leave this at the default (``false``) for all new
deployments. ``COMPACT STORAGE`` is a legacy feature that will be permanently
removed in a future version of ScyllaDB. Set to ``true`` only if you have a specific,
temporary need to create compact storage tables (e.g., compatibility with legacy
applications during a migration). For details on the ``COMPACT STORAGE`` option, see
:ref:`Compact Tables <compact-tables>` in the Data Definition documentation.
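For example, with the default setting of ``false``, the following statement is rejected (``legacy_tbl`` is a hypothetical table name)::

   CREATE TABLE legacy_tbl (pk int PRIMARY KEY, v text)
   WITH COMPACT STORAGE;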
Additional References
---------------------
* :doc:`Consistency Level </cql/consistency>`
* :doc:`Data Definition (CREATE/ALTER KEYSPACE) </cql/ddl>`
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
* :doc:`Metrics Reference </reference/metrics>`

View File

@@ -17,6 +17,7 @@ CQL Reference
secondary-indexes
time-to-live
functions
guardrails
wasm
json
mv
@@ -46,6 +47,7 @@ It allows you to create keyspaces and tables, insert and query tables, and more.
* :doc:`Data Types </cql/types>`
* :doc:`Definitions </cql/definitions>`
* :doc:`Global Secondary Indexes </cql/secondary-indexes>`
* :doc:`CQL Guardrails </cql/guardrails>`
* :doc:`Expiring Data with Time to Live (TTL) </cql/time-to-live>`
* :doc:`Functions </cql/functions>`
* :doc:`JSON Support </cql/json>`

View File

@@ -261,8 +261,51 @@ The following options are supported for vector indexes. All of them are optional
| | * ``true``: Enable rescoring. | |
| | * ``false``: Disable rescoring. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
| ``source_model`` | The name of the embedding model that produced the vectors (e.g., ``"ada002"``). Cassandra client | *(none)* |
| | libraries such as CassIO send this option to tag the index with the model. Cassandra SAI rejects it as | |
| | an unrecognized property; ScyllaDB accepts and preserves it in ``DESCRIBE`` output for compatibility | |
| | with those libraries, but does not act on it. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
.. _cassandra-sai-compatibility:
Cassandra SAI Compatibility for Vector Search
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ScyllaDB accepts the Cassandra ``StorageAttachedIndex`` (SAI) class name in ``CREATE CUSTOM INDEX``
statements **for vector columns**. Cassandra libraries such as
`CassIO <https://cassio.org/>`_ and `LangChain <https://www.langchain.com/>`_ use SAI to create
vector indexes; ScyllaDB recognizes these statements for compatibility.
When ScyllaDB encounters an SAI class name on a **vector column**, the index is automatically
created as a native ``vector_index``. The following class names are recognized:
* ``org.apache.cassandra.index.sai.StorageAttachedIndex`` (exact case required)
* ``StorageAttachedIndex`` (case-insensitive)
* ``SAI`` (case-insensitive)
Example::
-- Cassandra SAI statement accepted by ScyllaDB:
CREATE CUSTOM INDEX ON my_table (embedding)
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'
WITH OPTIONS = {'similarity_function': 'COSINE'};
-- Equivalent to:
CREATE CUSTOM INDEX ON my_table (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE'};
The ``similarity_function`` option is supported by both Cassandra SAI and ScyllaDB.
.. note::
SAI class names are only supported on **vector columns**. Using an SAI class name on a
non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI
indexing of non-vector columns is not supported by ScyllaDB; use a
:doc:`secondary index </cql/secondary-indexes>` instead.
.. _drop-index-statement:
DROP INDEX

View File

@@ -1,111 +0,0 @@
# Introduction
Similar to the approach described in CASSANDRA-12151, we add the
concept of an audit specification. An audit has a target (syslog or a
table) and a set of events/actions that it wants recorded. We
introduce new CQL syntax for Scylla users to describe and manipulate
audit specifications.
Prior art:
- Microsoft SQL Server [audit
description](https://docs.microsoft.com/en-us/sql/relational-databases/security/auditing/sql-server-audit-database-engine?view=sql-server-ver15)
- pgAudit [docs](https://github.com/pgaudit/pgaudit/blob/master/README.md)
- MySQL audit_log docs in
[MySQL](https://dev.mysql.com/doc/refman/8.0/en/audit-log.html) and
[Azure](https://docs.microsoft.com/en-us/azure/mysql/concepts-audit-logs)
- DynamoDB can [use CloudTrail](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/logging-using-cloudtrail.html) to log all events
# CQL extensions
## Create an audit
```cql
CREATE AUDIT [IF NOT EXISTS] audit-name WITH TARGET { SYSLOG | table-name }
[ AND TRIGGER KEYSPACE IN (ks1, ks2, ks3) ]
[ AND TRIGGER TABLE IN (tbl1, tbl2, tbl3) ]
[ AND TRIGGER ROLE IN (usr1, usr2, usr3) ]
[ AND TRIGGER CATEGORY IN (cat1, cat2, cat3) ]
;
```
From this point on, every database event that matches all present
triggers will be recorded in the target. When the target is a table,
it behaves like the [current
design](https://docs.scylladb.com/operating-scylla/security/auditing/#table-storage).
The audit name must be different from all other audits, unless IF NOT
EXISTS precedes it, in which case the existing audit must be identical
to the new definition. Case sensitivity and length limit are the same
as for table names.
A trigger kind (ie, `KEYSPACE`, `TABLE`, `ROLE`, or `CATEGORY`) can be
specified at most once.
## Show an audit
```cql
DESCRIBE AUDIT [audit-name ...];
```
Prints definitions of all audits named herein. If no names are
provided, prints all audits.
## Delete an audit
```cql
DROP AUDIT audit-name;
```
Stops logging events specified by this audit. Doesn't impact the
already logged events. If the target is a table, it remains as it is.
## Alter an audit
```cql
ALTER AUDIT audit-name WITH {same syntax as CREATE}
```
Any trigger provided will be updated (or newly created, if previously
absent). To drop a trigger, use `IN *`.
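For example, to stop triggering on specific roles while keeping the audit's other triggers, the role trigger could be cleared with `IN *` (a sketch of the proposed syntax, assuming a hypothetical audit named `my_audit` with a syslog target):

```cql
ALTER AUDIT my_audit WITH TARGET SYSLOG
    AND TRIGGER ROLE IN *;
```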
## Permissions
Only superusers can modify audits or turn them on and off.
Only superusers can read tables that are audit targets; no user can
modify them. Only superusers can drop tables that are audit targets,
after the audit itself is dropped. If a superuser doesn't drop a
target table, it remains in existence indefinitely.
# Implementation
## Efficient trigger evaluation
```c++
namespace audit {
/// Stores triggers from an AUDIT statement.
class triggers {
// Use trie structures for speedy string lookup.
optional<trie> _ks_trigger, _tbl_trigger, _usr_trigger;
// A logical-AND filter.
optional<unsigned> _cat_trigger;
public:
/// True iff every non-null trigger matches the corresponding ainf element.
bool should_audit(const audit_info& ainf);
};
} // namespace audit
```
To prevent modification of target tables, `audit::inspect()` will
check the statement and throw if it is disallowed, similar to what
`check_access()` currently does.
## Persisting audit definitions
Obviously, an audit definition must survive a server restart and stay
consistent among all nodes in a cluster. We'll accomplish both by
storing audits in a system table.

View File

@@ -0,0 +1,155 @@
# Comparing Build Systems: configure.py vs CMake
ScyllaDB has two build systems: the primary `configure.py` + Ninja pipeline
and an alternative CMake build (used mainly for IDE integration — CLion,
clangd, etc.). Both must produce equivalent compilation and link commands.
`scripts/compare_build_systems.py` verifies this by parsing the `build.ninja`
files generated by each system and comparing:
1. **Per-file compilation flags** — defines, warnings, optimization, language
flags for every Scylla source file.
2. **Link target sets** — are the same executables produced by both systems?
3. **Per-target linker settings** — link flags and libraries for every common
executable.
`configure.py` is treated as the baseline. CMake should match it.
## Quick start
```bash
# Compare a single mode
scripts/compare_build_systems.py -m dev
# Compare all modes
scripts/compare_build_systems.py
# Verbose output — show per-file and per-target differences
scripts/compare_build_systems.py -m debug -v
```
The script automatically configures both build systems into a temporary
directory for every run — the user's existing build tree is never touched.
No manual `configure.py` or `cmake` invocation is required.
## Mode mapping
| configure.py | CMake |
|--------------|------------------|
| `debug` | `Debug` |
| `dev` | `Dev` |
| `release` | `RelWithDebInfo` |
| `sanitize` | `Sanitize` |
| `coverage` | `Coverage` |
## Examples
```bash
# Check dev mode only (fast, most common during development)
scripts/compare_build_systems.py -m dev
# Check all modes
scripts/compare_build_systems.py
# CI mode: quiet, strict (exit 1 on any diff)
scripts/compare_build_systems.py --ci
# Verbose output for debugging a specific mode
scripts/compare_build_systems.py -m sanitize -v
# Quiet mode — only prints summary and errors
scripts/compare_build_systems.py -m dev -q
```
## Exit codes
| Code | Meaning |
|------|--------------------------------------------------------------------------|
| `0` | All checked modes match |
| `1` | Differences found |
| `2` | Configuration failure or some modes could not be compared (e.g. skipped) |
## What it ignores
The script intentionally ignores certain structural differences that are
inherent to how the two build systems work:
- **Include paths** (`-I`, `-isystem`) — directory layout differs between
the two systems.
- **LTO/PGO flags** — these are configuration-dependent options, not
mode-inherent.
- **Internal library targets** — CMake creates intermediate static/shared
libraries (e.g., `scylla-main`, `test-lib`, abseil targets) while
`configure.py` links `.o` files directly.
- **Per-component Boost defines** — CMake adds `BOOST_REGEX_DYN_LINK` etc.
per component; `configure.py` uses a single `BOOST_ALL_DYN_LINK`.
## Typical workflow
After modifying `CMakeLists.txt` or `cmake/mode.*.cmake`:
```bash
# 1. Run the comparison (auto-configures both systems in a temp dir)
scripts/compare_build_systems.py -m dev -v
# 2. Fix any differences, repeat
```
## AI agent workflow
When the script reports mismatches, you can paste its summary output into
an AI coding agent (GitHub Copilot, etc.) and ask it to fix the
discrepancies. The agent has access to both `configure.py` and the
CMake files and can resolve most differences automatically.
### Example interaction
**1. Run the script:**
```bash
scripts/compare_build_systems.py
```
**2. Copy the summary and paste it to the agent:**
> I ran `scripts/compare_build_systems.py` and got:
>
> ```
> Summary
> ══════════════════════════════════════════════════════════════════════
> debug (CMake: Debug ): ✗ MISMATCH
> Compilation: 3 files with flag diffs, 1 sources only in configure.py
> only-configure.py defines: -DSOME_FLAG (3 files)
> Link targets: 1 only in configure.py
> Linker: 2 targets with lib diffs
> lib only in CMake: boost_filesystem (2 targets)
> dev (CMake: Dev ): ✗ MISMATCH
> Compilation: 1 sources only in configure.py
> Link targets: 1 only in configure.py
> release (CMake: RelWithDebInfo ): ✓ MATCH
> sanitize (CMake: Sanitize ): ✓ MATCH
> coverage (CMake: Coverage ): ✓ MATCH
> ```
>
> Please fix all issues and commit according to project guidelines.
**3. The agent will:**
- Identify each discrepancy (missing sources, missing targets, extra
libraries, missing defines).
- Trace root causes — e.g., a test added to `configure.py` but not to
`test/boost/CMakeLists.txt`, or an unnecessary `Boost::filesystem`
link in a CMake target.
- Apply fixes to the appropriate `CMakeLists.txt` files.
- Re-run cmake and the comparison script to verify the fix.
- Commit each fix to the correct commit in the series (using
`git commit --fixup` + `git rebase --autosquash`).
### Tips
- **Paste the full summary block** — the inline diff details (compilation,
link targets, linker) give the agent enough context to act without
scrolling through verbose output.
- **Use `-v` for stubborn issues** — if the agent needs per-file or
per-target detail, re-run with `-v` and paste the relevant section.

docs/dev/counters.md
# Counters
Counters are special kinds of cells whose value can only be incremented, decremented, read and (with some limitations) deleted. In particular, once deleted, a counter cannot be used again. For example:
```cql
> UPDATE cf SET my_counter = my_counter + 6 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
0 | 6
(1 rows)
> UPDATE cf SET my_counter = my_counter - 1 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
0 | 5
(1 rows)
> DELETE my_counter FROM cf WHERE pk = 0;
> SELECT * FROM cf;
pk | my_counter
----+------------
(0 rows)
> UPDATE cf SET my_counter = my_counter + 3 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
(0 rows)
```
## Counters representation
Counters are represented as sets of so-called shards, which are triples containing:
* counter id: a UUID identifying the writer owning that shard (see below)
* logical clock: incremented each time the owning writer modifies the shard value
* current value: the sum of increments and decrements done by the owning writer
During each write operation one of the replicas is chosen as a leader. The leader reads its shard, increments its logical clock, updates the current value, and then sends the new version of its shard to the other replicas.
Shards owned by the same writer are merged (see below) so that each counter cell contains only one shard per counter id. Reading the actual counter value requires summing values of all shards.
### Counter id
The counter id is a 128-bit UUID that identifies which writer owns a shard. How it is assigned depends on whether the table uses vnodes or tablets.
**Vnodes:** the counter id is the host id of the node that owns the shard. Each node in the cluster gets a unique counter id, so the number of shards in a counter cell grows with the number of distinct nodes that have ever written to it.
**Tablets:** the counter id is rack-based rather than node-based. It is a deterministic type-3 (name-based) UUID derived from the string `"<datacenter>:<rack>"`. All nodes in the same rack share the same counter id.
During tablet migration, since there are two active replicas in a rack and in order to avoid conflicts, the node that is a *pending replica* uses the **negated** rack UUID as its counter id.
This bounds the number of shards in a counter cell to at most `2 × (number of racks)` regardless of node replacements.
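A sketch of the rack-based derivation in Python; the actual namespace UUID Scylla uses for the name-based UUID is not shown here, so `uuid.NAMESPACE_DNS` stands in for it, and "negating" is taken to mean the bitwise complement of the 128-bit value:

```python
import uuid

# Placeholder namespace: Scylla's real namespace UUID is an implementation detail.
NAMESPACE = uuid.NAMESPACE_DNS

def rack_counter_id(dc: str, rack: str) -> uuid.UUID:
    """Deterministic type-3 (name-based) UUID derived from "<dc>:<rack>"."""
    return uuid.uuid3(NAMESPACE, f"{dc}:{rack}")

def pending_counter_id(rack_id: uuid.UUID) -> uuid.UUID:
    """Counter id used by the pending replica during tablet migration:
    the bitwise complement of the rack's counter id."""
    return uuid.UUID(int=rack_id.int ^ ((1 << 128) - 1))
```

Because the derivation is deterministic, every node in the same rack computes the same counter id, and negating twice returns the original id.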
### Merging and reconciliation
Reconciliation of two counters requires merging all shards belonging to the same counter id. The rule is: the shard with the highest logical clock wins.
Since support for deleting counters is limited (once deleted, they cannot be used again), during reconciliation tombstones always win over live counter cells regardless of their timestamps.
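The merge rule can be sketched as follows, using a simplified shard model rather than Scylla's actual types:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Shard:
    counter_id: str
    clock: int      # logical clock
    value: int      # sum of this writer's increments and decrements

def merge(a, b):
    """Merge two shard sets (dicts keyed by counter id):
    for the same counter id, the shard with the highest logical clock wins."""
    out = dict(a)
    for cid, shard in b.items():
        cur = out.get(cid)
        if cur is None or shard.clock > cur.clock:
            out[cid] = shard
    return out

def counter_value(shards):
    """Reading the counter value means summing the values of all shards."""
    return sum(s.value for s in shards.values())
```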
### Digest
Computing a digest of counter cells needs to be done based solely on the shard contents (counter id, value, logical clock) rather than any structural metadata.
## Writes
1. A counter update starts with the client sending a counter delta as a long (CQL3 `bigint`) to the coordinator.
2. CQL3 creates a `CounterMutation` containing a `counter_update` cell, which is just the delta.
3. The coordinator chooses the leader of the counter update and sends it the mutation. The leader is always one of the replicas owning the partition the modified counter belongs to.
4. Now, the leader needs to transform counter deltas into shards. To do that it reads the current value of the shard it owns, and produces a new shard with the value modified by the delta and the logical clock incremented.
5. The mutation with the newly created shard is both used to update the memtable on the leader as well as sent to the other nodes for replication.
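The leader-side transform in step 4 can be sketched as follows, modeling a shard as a (clock, value) pair, a simplification of the real cell format:

```python
def apply_delta(local_shard, delta):
    """Leader-side step 4 (sketch): turn a counter_update delta into a new
    shard version by incrementing the logical clock and adding the delta.
    local_shard is None if the leader has never written this counter."""
    if local_shard is None:
        return (1, delta)
    clock, value = local_shard
    return (clock + 1, value + delta)
```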
### Choosing leader
Choosing the replica that becomes the leader for a counter update is completely at the coordinator's discretion. It is not a static role in any way, and any concurrent update could be forwarded to a different leader. This means that all problems related to leader election are avoided.
The coordinator chooses the leader using the following algorithm:
1. If the coordinator can be a leader it chooses itself.
2. Otherwise, a random replica from the local DC is chosen.
3. If there is no eligible node available in the local DC the replica closest to the coordinator (according to the snitch) is chosen.
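A sketch of that algorithm; the `dc_of` and `distance` inputs stand in for topology metadata and the snitch:

```python
import random

def choose_leader(coordinator, replicas, local_dc, dc_of, distance):
    """Coordinator-side leader choice (sketch of the three rules above)."""
    if coordinator in replicas:
        return coordinator                  # rule 1: the coordinator itself
    local = [r for r in replicas if dc_of[r] == local_dc]
    if local:
        return random.choice(local)         # rule 2: random local-DC replica
    return min(replicas, key=distance)      # rule 3: snitch-closest replica
```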
## Reads
Querying counter values is much simpler than updating them. The first part of the read operation is performed as for all other cell types. When counter cells from different sources are reconciled, their shards are merged. Once the final counter cell is known and the `CounterCell` is serialised, the current values of all shards are summed up, and the output of the serialisation is a long integer.


@@ -192,14 +192,10 @@ For example, to configure ScyllaDB to use listen address `10.0.0.5`:
$ docker run --name some-scylla -d scylladb/scylla --listen-address 10.0.0.5
```
**Since: 1.4**
#### `--alternator-address ADDR`
The `--alternator-address` command line option configures the Alternator API listen address. The default value is the same as `--listen-address`.
**Since: 3.2**
#### `--alternator-port PORT`
The `--alternator-port` command line option configures the Alternator API listen port. The Alternator API is disabled by default. You need to specify the port to enable it.
@@ -210,22 +206,16 @@ For example, to configure ScyllaDB to listen to Alternator API at port `8000`:
$ docker run --name some-scylla -d scylladb/scylla --alternator-port 8000
```
**Since: 3.2**
#### `--alternator-https-port PORT`
The `--alternator-https-port` option is similar to `--alternator-port`, except that it enables an encrypted (HTTPS) port. Either the `--alternator-https-port` or `--alternator-http-port`, or both, can be used to enable Alternator.
Note that the `--alternator-https-port` option also requires that files `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` be inserted into the image. These files contain an SSL certificate and key, respectively.
**Since: 4.2**
#### `--alternator-write-isolation policy`
The `--alternator-write-isolation` command line option chooses between four allowed write isolation policies described in docs/alternator/alternator.md. This option must be specified if Alternator is enabled - it does not have a default.
**Since: 4.1**
#### `--broadcast-address ADDR`
The `--broadcast-address` command line option configures the IP address the ScyllaDB instance tells other ScyllaDB nodes in the cluster to connect to.
@@ -304,8 +294,6 @@ For example, to skip running I/O setup:
$ docker run --name some-scylla -d scylladb/scylla --io-setup 0
```
**Since: 4.3**
#### `--cpuset CPUSET`
The `--cpuset` command line option restricts ScyllaDB to run only on the CPUs specified by `CPUSET`.
@@ -341,26 +329,18 @@ For example, to enable the User Defined Functions (UDF) feature:
$ docker run --name some-scylla -d scylladb/scylla --experimental-feature=udf
```
**Since: 2.0**
#### `--disable-version-check`
The `--disable-version-check` command line option disables the version validation check.
**Since: 2.2**
#### `--authenticator AUTHENTICATOR`
The `--authenticator` command line option specifies the authenticator class ScyllaDB will use. By default, ScyllaDB uses the `AllowAllAuthenticator`, which performs no credentials checks. The other option is the `PasswordAuthenticator`, which relies on username/password pairs to authenticate users.
**Since: 2.3**
#### `--authorizer AUTHORIZER`
The `--authorizer` command line option specifies the authorizer class ScyllaDB will use. By default, ScyllaDB uses the `AllowAllAuthorizer`, which allows any action by any user. The other option is the `CassandraAuthorizer`, which stores permissions in the `system.permissions` table.
**Since: 2025.4**
#### `--dc NAME`
The `--dc` command line option sets the datacenter name for the ScyllaDB node.

docs/dev/logstor.md
# Logstor
## Introduction
Logstor is a log-structured storage engine for ScyllaDB optimized for key-value workloads. It provides an alternative storage backend for key-value tables - tables with a partition key only, with no clustering columns.
Unlike the traditional LSM-tree based storage, logstor uses a log-structured approach with in-memory indexing, making it particularly suitable for workloads with frequent overwrites and point lookups.
## Architecture
Logstor consists of several key components:
### Components
#### Primary Index
The primary index is entirely in memory; it maps a partition key to its location in the log segments. It consists of one B-tree per table, ordered by token.
#### Segment Manager
The `segment_manager` handles the allocation and management of fixed-size segments (default 128KB). Segments are grouped into large files (default 32MB). Key responsibilities include:
- **Segment allocation**: Provides segments for writing new data
- **Space reclamation**: Tracks free space in each segment
- **Compaction**: Copies live data from sparse segments to reclaim space
- **Recovery**: Scans segments on startup to rebuild the index
- **Separator**: Rewrites segments that have records from different compaction groups into new segments that are separated by compaction group.
The data in the segments consists of records of type `log_record`. Each record contains the value for some key as a `canonical_mutation` and additional metadata.
The `segment_manager` receives new writes via a `write_buffer` and writes them sequentially to the active segment with 4k-block alignment.
#### Write Buffer
The `write_buffer` manages a buffer of log records and handles the serialization of the records including headers and alignment. It can be used to write multiple records to the buffer and then write the buffer to the segment manager.
The `buffered_writer` manages multiple write buffers for user writes, an active buffer and multiple flushing ones, to batch writes and manage backpressure.
### Data Flow
**Write Path:**
1. Application writes mutation to logstor
2. Mutation is converted to a log record
3. Record is written to write buffer
4. The buffer is switched and written to the active segment.
5. Index is updated with new record locations
6. Old record locations (for overwrites) are marked as free
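A toy model of the index bookkeeping in the write path; real segments hold serialized `log_record`s, while here a record is just a key and an offset is a slot number:

```python
class LogstorIndexSketch:
    """Sketch of steps 3-6: append writes to the active segment,
    point the index at the new location, and mark overwritten
    locations as free space for later compaction."""

    def __init__(self, segment_slots=4):
        self.index = {}            # partition key -> (segment, offset)
        self.dead = {}             # segment -> count of overwritten records
        self.active, self.offset = 0, 0
        self.segment_slots = segment_slots

    def write(self, key):
        old = self.index.get(key)
        if old is not None:
            # step 6: the old location of an overwrite is marked as free
            self.dead[old[0]] = self.dead.get(old[0], 0) + 1
        # steps 3-5: append to the active segment, update the index
        self.index[key] = (self.active, self.offset)
        self.offset += 1
        if self.offset == self.segment_slots:
            self.active, self.offset = self.active + 1, 0
```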
**Read Path:**
1. Application requests data for a partition key
2. Index lookup returns record location
3. Segment manager reads record from disk
4. Record is deserialized into a mutation and returned
**Separator:**
1. When a record is written to the active segment, it is also written to its compaction group's separator buffer. The separator buffer holds a reference to the original segment.
2. The separator buffer is flushed when it is full, or when requested to flush for some other reason. It is written into a new segment in the compaction group, and the locations of the records from the original mixed segments are updated to point to the new segments in the compaction group.
3. After the separator buffer is flushed and all records from the original segment have been moved, it releases its reference to the segment. When there are no more references to the segment, it is freed.
**Compaction:**
1. The amount of live data is tracked for each segment in its segment_descriptor. The segment descriptors are stored in a histogram by live data.
2. A segment set from a single compaction group is submitted for compaction.
3. Compaction picks segments for compaction from the segment set. It chooses the segments with the lowest utilization such that compacting them results in a net gain of free segments.
4. It reads the segments, finds all live records, and writes them into a write buffer. When the buffer is full, it is flushed into a new segment, and the index location of each record is updated to point to the new location.
5. After all live records are rewritten the old segments are freed.
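The segment-picking rule in step 3 can be sketched as a greedy prefix over segments sorted by live data; live data here is measured in the same units as `segment_size`:

```python
def pick_for_compaction(segments, segment_size):
    """Sketch: consider segments from least to most live data and keep the
    prefix with the best net gain of free segments (segments freed minus
    new segments needed to hold the surviving live data)."""
    order = sorted(segments, key=segments.get)
    best, best_gain, live = [], 0, 0
    for i, seg in enumerate(order, start=1):
        live += segments[seg]
        needed = -(-live // segment_size)   # ceil division
        gain = i - needed
        if gain > best_gain:
            best, best_gain = order[:i], gain
    return best
```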
## Usage
### Enabling Logstor
To use logstor, enable it in the configuration:
```yaml
enable_logstor: true
experimental_features:
- logstor
```
### Creating Tables
Tables using logstor must have no clustering columns and must be created with the `storage_engine` property set to `'logstor'`:
```cql
CREATE TABLE keyspace.user_profiles (
user_id uuid PRIMARY KEY,
name text,
email text,
metadata frozen<map<text, text>>
) WITH storage_engine = 'logstor';
```
### Basic Operations
**Insert/Update:**
```cql
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'value1');
INSERT INTO keyspace.table_name (pk, v) VALUES (2, 'value2');
-- Overwrite with new value
INSERT INTO keyspace.table_name (pk, v) VALUES (1, 'updated_value');
```
Currently, updates must write the full row. Updating individual columns is not yet supported. Each write replaces the entire partition.
**Select:**
```cql
SELECT * FROM keyspace.table_name WHERE pk = 1;
-- Returns: (1, 'updated_value')
SELECT pk, v FROM keyspace.table_name WHERE pk = 2;
-- Returns: (2, 'value2')
SELECT * FROM keyspace.table_name;
-- Returns: (1, 'updated_value'), (2, 'value2')
```
**Delete:**
```cql
DELETE FROM keyspace.table_name WHERE pk = 1;
```


@@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index
- index on map, set or list values: VALUES(v)
- index on map entries: ENTRIES(v)
Their serialization uses lowercase type names as prefixes, except for `full` which is serialized
as just the column name (without any prefix):
`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection
index on column `v` is stored simply as `"v"` (same as a regular index).
If the column name contains characters that could be confused with the above formats
(e.g., a name containing parentheses or braces), it is escaped using the CQL
quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the
name in double quotes and doubles any embedded double-quote characters. For example,
a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is
stored as `"keys(m)"`.
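A rough approximation of that escaping rule; the real implementation is `column_identifier::to_cql_string()`, and the quoting condition below is a simplification for illustration:

```python
import re

def escape_target_column(name):
    """Quote a column name if it could be confused with a collection-index
    prefix form or needs CQL quoted-identifier escaping; embedded double
    quotes are doubled."""
    needs_quoting = (
        re.fullmatch(r'(keys|values|entries)\(.*\)', name)  # looks like a prefix form
        or '"' in name
        or not name.islower()                               # mixed case, digits only, etc.
    )
    if needs_quoting:
        return '"' + name.replace('"', '""') + '"'
    return name
```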
## Local index

# System Keyspaces Overview
This page gives a high-level overview of several internal keyspaces and what they are used for.
## Table of Contents
- [system_replicated_keys](#system_replicated_keys)
- [system_distributed](#system_distributed)
- [system_distributed_everywhere](#system_distributed_everywhere)
- [system_auth](#system_auth)
- [system](#system)
- [system_schema](#system_schema)
- [system_traces](#system_traces)
- [system_audit/audit](#system_auditaudit)
## `system_replicated_keys`
Internal keyspace for encryption-at-rest key material used by the replicated key provider. It stores encrypted data keys so nodes can retrieve the correct key IDs when reading encrypted data.
This keyspace is created as an internal system keyspace and uses `EverywhereStrategy` so key metadata is available on every node. It is not intended for user data.
## `system_distributed`
Internal distributed metadata keyspace used for cluster-wide coordination data that is shared across nodes.
In practice, it is used for metadata such as:
- materialized view build coordination state
- CDC stream/timestamp metadata exposed to clients
- service level definitions used by workload prioritization
This keyspace is managed by Scylla and is not intended for application tables.
It is created as an internal keyspace (historically with `SimpleStrategy` and RF=3 by default).
## `system_distributed_everywhere`
Legacy keyspace. It is no longer used.
## `system_auth`
Legacy auth keyspace name kept primarily for compatibility.
Auth tables have moved to the `system` keyspace (`roles`, `role_members`, `role_permissions`, and related auth state). `system_auth` may still exist for compatibility with legacy tooling/queries, but it is no longer where current auth state is primarily stored.
## `system`
This keyspace is a local one, so each node has its own, independent content for the tables in this keyspace. For some tables, the content is coordinated at a higher level (Raft), but not via the traditional replication systems (storage proxy).
See the detailed table-level documentation here: [system_keyspace](system_keyspace.md)
## `system_schema`
This keyspace is a local one, so each node has its own, independent content for the tables in this keyspace. All tables in this keyspace are coordinated via the schema replication system.
See the detailed table-level documentation here: [system_schema_keyspace](system_schema_keyspace.md)
## `system_traces`
Internal tracing keyspace used for query tracing and slow-query logging records (`sessions`, `events`, and related index/log tables).
This keyspace is written by Scylla's tracing subsystem for diagnostics and observability. It is operational metadata, not user application data (historically created with `SimpleStrategy` and RF=2).
## `system_audit`/`audit`
Internal audit-logging keyspace used to persist audit events when table-backed auditing is enabled.
Scylla's audit table storage is implemented as an internal audit keyspace for audit records (for example, auth/admin/DCL activity, depending on the audit configuration). In current code this keyspace is named `audit`, while operational material may refer to it by its historical name (`system_audit`). It is intended for security/compliance observability, not for application data.


@@ -1611,6 +1611,7 @@ CREATE TABLE system.topology (
cleanup_status text,
datacenter text,
ignore_msb int,
intended_storage_mode text,
node_state text,
num_tokens int,
rack text,
@@ -1663,6 +1664,7 @@ CREATE TABLE system.topology (
- `tokens_string`: Alternative representation of tokens
- `shard_count`: Number of shards on the node
- `ignore_msb`: MSB bits to ignore for token calculation
- `intended_storage_mode`: Intended storage mode for tables under vnodes-to-tablets migration. The node switches to this mode on next restart.
- `cleanup_status`: Status of cleanup operations
- `supported_features`: Features supported by this node
- `request_id`: ID of the current topology request for this node


@@ -700,6 +700,7 @@ CREATE TABLE system.topology (
host_id uuid,
datacenter text,
ignore_msb int,
intended_storage_mode text,
node_state text,
num_tokens int,
rack text,
@@ -741,6 +742,7 @@ Each node has a clustering row in the table where its `host_id` is the clusterin
- `datacenter` - a name of the datacenter the node belongs to
- `rack` - a name of the rack the node belongs to
- `ignore_msb` - the value of the node's `murmur3_partitioner_ignore_msb_bits` parameter
- `intended_storage_mode` - if set, it indicates the intended storage mode for tables under vnodes-to-tablets migration
- `shard_count` - the node's `smp::count`
- `release_version` - the node's `version::current()` (corresponding to a Cassandra version, used by drivers)
- `node_state` - current state of the node (as described earlier)

docs/dev/vector_index.md
# Vector index in Scylla
Vector indexes are custom indexes (`USING 'vector_index'`). Their `target` option in `system_schema.indexes` uses the following format:
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.
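A sketch of decoding the formats above (a hypothetical helper for illustration, not part of Scylla):

```python
import json

def parse_vector_index_target(target):
    """Decode a vector index target option into
    (target_column, partition_key_columns, filtering_columns)."""
    if target.startswith("{"):
        t = json.loads(target)
        return t["tc"], t.get("pk", []), t.get("fc", [])
    return target, [], []       # plain (escaped) column name form
```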


@@ -289,7 +289,7 @@ Yes, but it will require running a full repair (or cleanup) to change the replic
- If you're reducing the replication factor, run ``nodetool cleanup <updated Keyspace>`` on the keyspace you modified to remove surplus replicated data.
Cleanup runs on a per-node basis.
- If you're increasing the replication factor, refer to :doc:`How to Safely Increase the RF </kb/rf-increase>`
- Note that you need to provide the keyspace name. If you do not, the cleanup or repair operation runs on all keyspaces for the specific node.
Why can't I set ``listen_address`` to listen to 0.0.0.0 (all my addresses)?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


@@ -52,7 +52,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
#. Install ScyllaDB packages.
@@ -125,7 +125,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
#. Install ScyllaDB packages.
@@ -133,19 +133,19 @@ Install ScyllaDB
sudo yum install scylla
Running the command installs the latest official version of ScyllaDB.
Alternatively, you can install a specific patch version:
.. code-block:: console
sudo yum install scylla-<your patch version>
Example: The following example shows installing ScyllaDB 2025.3.1.
.. code-block:: console
:class: hide-copy-button
sudo yum install scylla-2025.3.1
.. include:: /getting-started/_common/setup-after-install.rst


@@ -36,11 +36,8 @@ release versions, run:
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
Versions 2025.1 and Later
==============================
To install a non-default version, run the command with the ``--scylla-version``
option to specify the version you want to install.
**Example**
@@ -50,20 +47,4 @@ you want to install.
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
Versions Earlier than 2025.1
================================
To install a supported version of *ScyllaDB Enterprise*, run the command with:
* ``--scylla-product scylla-enterprise`` to specify that you want to install
ScyllaDB Enterprise.
* ``--scylla-version`` to specify the version you want to install.
For example:
.. code:: console
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
.. include:: /getting-started/_common/setup-after-install.rst


@@ -181,6 +181,7 @@ internode_compression controls whether traffic between nodes is compressed.
* all - all traffic is compressed.
* dc - traffic between different datacenters is compressed.
* rack - traffic between different racks is compressed.
* none - nothing is compressed (default).
Configuring TLS/SSL in scylla.yaml


@@ -57,12 +57,11 @@ To enable shared dictionaries:
internode_compression_enable_advanced: true
rpc_dict_training_when: when_leader
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
.. note::
Trained dictionaries contain randomly chosen samples of data transferred between
nodes. The data samples are persisted in the Raft log, which is not encrypted.
As a result, some data from otherwise encrypted tables might be stored on disk
unencrypted.
Reference


@@ -10,6 +10,7 @@ ScyllaDB Configuration Procedures
How to do a Rolling Restart <rolling-restart>
Advanced Internode (RPC) Compression <advanced-internode-compression>
Shared-dictionary compression for SSTables <sstable-dictionary-compression>
Migrate a Keyspace from Vnodes to Tablets <migrate-vnodes-to-tablets>
Procedures to change ScyllaDB Configuration settings.
@@ -22,3 +23,5 @@ Procedures to change ScyllaDB Configuration settings.
* :doc:`Advanced Internode (RPC) Compression </operating-scylla/procedures/config-change/advanced-internode-compression>`
* :doc:`Shared-dictionary compression for SSTables </operating-scylla/procedures/config-change/sstable-dictionary-compression>`
* :doc:`Migrate a Keyspace from Vnodes to Tablets </operating-scylla/procedures/config-change/migrate-vnodes-to-tablets>`

Migrate a Keyspace from Vnodes to Tablets
==========================================
This procedure describes how to migrate an existing keyspace from vnodes
to tablets. Tablets are designed to be the long-term replacement for vnodes,
offering numerous benefits such as faster topology operations, automatic load
balancing, automatic cleanups, and improved streaming performance. Migrating to
tablets is strongly recommended. See :doc:`Data Distribution with Tablets </architecture/tablets/>`
for details.
.. note::
The migration is an online operation. This means that the keyspace remains
fully available to users throughout the migration, provided that its
replication factor is greater than 1. Reads and writes continue to be served
using vnodes until the migration is finished.
.. warning::
During the migration, you should expect degraded performance on the migrating
keyspace. The reasons are the following:
* **Rolling restart**: Each node must upgrade its storage from vnodes to
tablets. This is an offline operation happening on startup, so a restart is
needed. Upon restart, each node performs a heavy and time-consuming
resharding operation to reorganize its data based on tablets, and remains
offline until this operation completes. Resharding may last from minutes to
hours, depending on the amount of data that the node holds. At this time,
the node cannot serve any requests.
* **Unbalanced tablets**: The initial tablet layout mirrors the vnode layout.
The tablet load balancer does not rebalance tablets until the migration is
finished, so some shards may carry more data than others during the
migration. The imbalance is expected to be more prominent in clusters with
very large nodes (hundreds of vCPUs).
* **Loss of shard awareness**: During the migration and until the rolling
restart is complete, the cluster is in a mixed state with some nodes using
vnodes and others using tablets. In this state, queries may cause
cross-shard operations within nodes, reducing performance.
The performance will return to normal after the migration finishes and the
tablet load balancer rebalances the data.
Prerequisites
-------------
* All nodes in the cluster must be **up and running**. You can check the status
of all nodes with
:doc:`nodetool status </operating-scylla/nodetool-commands/status/>`.
* All nodes must be running ScyllaDB 2026.2 or later.
Limitations
-----------
The current migration procedure has the following limitations:
* The total number of **vnode tokens** in the cluster must be a **power of two**
and the tokens must be **evenly spaced** across the token ring. This is
verified automatically when starting the migration.
* **No schema changes** during the migration. Do not create, alter, or drop
tables in the migrating keyspace until the migration is finished.
* **No topology changes** during the migration. Do not add, remove, decommission,
or replace nodes while a migration is in progress.
* **No TRUNCATE** on tables in the migrating keyspace during the migration.
* Only **CQL base tables** can be migrated. Materialized views, secondary
indexes, CDC tables, and Alternator tables are not supported.
* Tables with **counters** or **LWTs** cannot be migrated.
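The power-of-two and even-spacing prerequisite can be sketched as the following check; this is a simplification of what the server verifies, assuming the 64-bit Murmur3 token ring:

```python
def check_vnode_tokens(tokens):
    """Sketch of the prerequisite: the token count is a power of two and
    the tokens are evenly spaced on the 64-bit token ring."""
    n = len(tokens)
    if n == 0 or n & (n - 1):
        return False                        # not a power of two
    ring = 2 ** 64
    step = ring // n
    ts = sorted(tokens)
    # every gap between neighbors (including the wrap-around) must equal step
    return all((ts[(i + 1) % n] - ts[i]) % ring == step for i in range(n))
```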
Overview
--------
The migration consists of three phases:
1. **Prepare**: Create tablet maps for all tables in the keyspace. Each tablet
inherits its token range and replica set from the corresponding vnode range.
2. **Storage upgrade**: Restart each node one at a time, upgrading its storage
from vnodes to tablets. Upon restart, the node begins resharding data into
tablets. This is a storage-layer operation and is unrelated to ScyllaDB
version upgrades.
3. **Finalize**: Once all nodes have been upgraded, commit the migration by
clearing the migration state and switching the keyspace schema to tablets.
During the first two phases, the migration is reversible; you can roll back to
vnodes. However, once the migration is finalized, it cannot be reversed.
.. note::
In the following sections, any reference to "upgrade" or "downgrade" of a
node will refer to the migration of its storage from vnodes to tablets or
vice versa. Do not confuse it with version upgrades/downgrades.
Procedure
---------
#. Prepare the keyspace for migration:
#. Create tablet maps for all tables in the keyspace:
.. code-block:: console
scylla nodetool migrate-to-tablets start <keyspace>
#. Verify that the keyspace is in ``migrating_to_tablets`` state and all nodes are still using vnodes:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c uses vnodes
0b5fd6f6-9670-4faf-a480-ad58cf119007 uses vnodes
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
.. _upgrade-nodes:
#. Upgrade all nodes to tablets:
#. Pick a node.
#. Mark the node for upgrade to tablets:
.. note::
This is a node-local operation. Use the IP address of the node that
you are upgrading.
.. caution::
Do not mark more than one node for upgrade at the same time. Even if
you restart them serially, unexpected restarts can happen for various
reasons (crashes, power failures, etc.) leading to parallel node
upgrades which can reduce availability.
.. code-block:: console
scylla nodetool -h <node-ip> migrate-to-tablets upgrade
#. Verify that the node status changed from ``vnodes`` to ``migrating to tablets``:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c migrating to tablets <---
0b5fd6f6-9670-4faf-a480-ad58cf119007 uses vnodes
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
#. Drain and stop the node:
.. code-block:: console
scylla nodetool -h <node-ip> drain
.. include:: /rst_include/scylla-commands-stop-index.rst
#. Restart the node:
.. include:: /rst_include/scylla-commands-start-index.rst
#. Wait until the node is UP and has returned to the ScyllaDB cluster using :doc:`nodetool status </operating-scylla/nodetool-commands/status/>`.
This operation may take a long time due to resharding. To monitor
resharding progress, use the task manager API:
.. code-block:: console
scylla nodetool tasks list compaction -h <node-ip> --keyspace <keyspace> | grep -i reshard
#. Verify that the node status changed from ``migrating to tablets`` to ``uses tablets``:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c uses tablets <---
0b5fd6f6-9670-4faf-a480-ad58cf119007 uses vnodes
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
#. Move to the next node and repeat from step a until all nodes are upgraded.
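On large clusters it can help to script the completion check. A minimal sketch, assuming the status output format shown in the examples above (the sample output is hard-coded for illustration; in practice pipe the real ``status`` output into the same check):

```shell
# Sample status output, hard-coded for illustration; in practice:
#   status_output=$(scylla nodetool migrate-to-tablets status <keyspace>)
status_output='Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID                              Status
99d8de76-3954-4727-911a-6a07251b180c uses tablets
0b5fd6f6-9670-4faf-a480-ad58cf119007 uses tablets
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes'

# Count nodes that still report "uses vnodes".
remaining=$(printf '%s\n' "$status_output" | grep -c 'uses vnodes')
if [ "$remaining" -eq 0 ]; then
  echo "all nodes upgraded"
else
  echo "$remaining node(s) still on vnodes"
fi
```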
#. Finalize the migration:
.. warning::
Finalization **cannot be undone**. Once the migration is finalized, the
keyspace cannot be switched back to vnodes.
#. Issue the finalization request:
.. code-block:: console
scylla nodetool migrate-to-tablets finalize <keyspace>
#. Verify that the keyspace status changed to ``tablets``:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: tablets
Rollback Procedure
------------------
.. note::
Rollback is only possible **before finalization**. Once the migration is
finalized, it cannot be reversed.
If you need to abort the migration **before finalization**, you can roll back
by downgrading each node back to vnodes. The rollback procedure is the
following:
#. Find all nodes that have been upgraded to tablets (status: ``uses tablets``)
or are in the process of upgrading to tablets (status: ``migrating to tablets``):
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c uses tablets <---
0b5fd6f6-9670-4faf-a480-ad58cf119007 migrating to tablets <---
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
#. For **each upgraded or upgrading node** in the cluster, perform a downgrade
(one node at a time):
#. Mark the node for downgrade:
.. code-block:: console
scylla nodetool -h <node-ip> migrate-to-tablets downgrade
#. Check the node status. The status for a previously upgraded node should
change from ``uses tablets`` to ``migrating to vnodes``. The status for a
previously upgrading node should change from ``migrating to tablets`` to
``uses vnodes`` or ``migrating to vnodes``:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c migrating to vnodes <---
0b5fd6f6-9670-4faf-a480-ad58cf119007 migrating to tablets
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
#. If the node status is ``uses vnodes``, the downgrade is complete. Move to
the next node and repeat from step a.
#. If the node is ``migrating to vnodes``, restart it to complete the
downgrade:
#. Drain and stop the node:
.. code-block:: console
scylla nodetool -h <node-ip> drain
.. include:: /rst_include/scylla-commands-stop-index.rst
#. Restart the node:
.. include:: /rst_include/scylla-commands-start-index.rst
#. Wait until the node is UP and has returned to the ScyllaDB cluster using :doc:`nodetool status </operating-scylla/nodetool-commands/status/>`.
This operation may take a long time due to resharding. To monitor
resharding progress, use the task manager API:
.. code-block:: console
scylla nodetool tasks list compaction -h <node-ip> --keyspace <keyspace> | grep -i reshard
#. Verify that the node status changed from ``migrating to vnodes`` to ``uses vnodes``:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace>
**Example:**
.. code-block:: console
$ scylla nodetool migrate-to-tablets status ks
Keyspace: ks
Status: migrating_to_tablets
Nodes:
Host ID Status
99d8de76-3954-4727-911a-6a07251b180c uses vnodes <---
0b5fd6f6-9670-4faf-a480-ad58cf119007 migrating to tablets
017dd39a-3d06-4c8a-8ac4-379f9e595607 uses vnodes
#. Move to the next node and repeat from step a until all nodes are
downgraded.
#. Once all nodes have been downgraded, finalize the rollback:
.. code-block:: console
scylla nodetool migrate-to-tablets finalize <keyspace>
Migrating multiple keyspaces
----------------------------
Migrating multiple keyspaces simultaneously is supported. The procedure is the
same as with a single keyspace except that the preparation and finalization
steps need to be repeated for each keyspace. Note, however, that a new migration
cannot be started while another migration is in the upgrade phase; all migrations
must be prepared together and finalized together.
To migrate multiple keyspaces simultaneously, follow these steps:
#. For **each keyspace**, prepare it for migration:
.. code-block:: console
scylla nodetool migrate-to-tablets start <keyspace1>
scylla nodetool migrate-to-tablets start <keyspace2>
...
Verify that all keyspaces are in ``migrating_to_tablets`` state before
proceeding:
.. code-block:: console
scylla nodetool migrate-to-tablets status <keyspace1>
scylla nodetool migrate-to-tablets status <keyspace2>
...
#. Upgrade all nodes in the cluster following the same :ref:`procedure <upgrade-nodes>`
as for a single keyspace. Each node restart reshards all keyspaces under
migration in one pass.
#. For **each keyspace**, finalize the migration:
.. code-block:: console
scylla nodetool migrate-to-tablets finalize <keyspace1>
scylla nodetool migrate-to-tablets finalize <keyspace2>
...
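The multi-keyspace flow above can be sketched as a loop. This is a dry run that only prints the commands it would issue (keyspace names are placeholders; remove the ``echo`` prefix to actually execute them):

```shell
# Dry run: DRY_RUN=echo prints the commands instead of executing them.
DRY_RUN=echo
keyspaces="ks1 ks2"
for ks in $keyspaces; do
  $DRY_RUN scylla nodetool migrate-to-tablets start "$ks"
done
# ... upgrade all nodes as in the single-keyspace procedure ...
for ks in $keyspaces; do
  $DRY_RUN scylla nodetool migrate-to-tablets finalize "$ks"
done
```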

@@ -2,8 +2,8 @@
ScyllaDB Auditing Guide
========================
Auditing allows the administrator to monitor activities on a Scylla cluster, including queries and data changes.
The information is stored in a Syslog or a Scylla table.
Auditing allows the administrator to monitor activities on a ScyllaDB cluster, including CQL queries and data changes, as well as Alternator (DynamoDB-compatible API) requests.
The information is stored in a Syslog or a ScyllaDB table.
Prerequisite
------------
@@ -14,15 +14,15 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
Enabling Audit
---------------
By default, table auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
By default, auditing is **enabled** with the ``table`` backend. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
You can set the following options:
* ``none`` - Audit is disabled.
* ``table`` - Audit is enabled, and messages are stored in a Scylla table (default).
* ``table`` - Audit is enabled, and messages are stored in a ScyllaDB table (default).
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a ScyllaDB table and sent to Syslog.
Configuring any other value results in an error at Scylla startup.
Configuring any other value results in an error at ScyllaDB startup.
Configuring Audit
-----------------
@@ -34,7 +34,9 @@ Flag Default Value Description
================== ================================== ========================================================================================================================
audit_categories "DCL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
audit_tables “” Comma-separated list of table names that should be audited, in the format of <keyspacename>.<tablename>
audit_tables “” Comma-separated list of table names that should be audited, in the format ``<keyspace_name>.<table_name>``.
For Alternator tables use the ``alternator.<table_name>`` format (see :ref:`alternator-auditing`).
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
audit_keyspaces “” Comma-separated list of keyspaces that should be audited. You must specify at least one keyspace.
If you leave this option empty, no keyspace will be audited.
@@ -47,30 +49,137 @@ You can use DCL, AUTH, and ADMIN audit categories without including any keyspace
audit_categories parameter description
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
========= =========================================================================================
Parameter Logs Description
========= =========================================================================================
AUTH Logs login events
--------- -----------------------------------------------------------------------------------------
DML Logs insert, update, delete, and other data manipulation language (DML) events
--------- -----------------------------------------------------------------------------------------
DDL Logs object and role create, alter, drop, and other data definition language (DDL) events
--------- -----------------------------------------------------------------------------------------
DCL Logs grant, revoke, create role, drop role, and list roles events
--------- -----------------------------------------------------------------------------------------
QUERY Logs all queries
--------- -----------------------------------------------------------------------------------------
ADMIN Logs service level operations: create, alter, drop, attach, detach, list.
========= ========================================================================================= ====================
Parameter Logs Description Applies To
========= ========================================================================================= ====================
AUTH Logs login events CQL
--------- ----------------------------------------------------------------------------------------- --------------------
DML Logs insert, update, delete, and other data manipulation language (DML) events CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
DDL Logs object and role create, alter, drop, and other data definition language (DDL) events CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
DCL Logs grant, revoke, create role, drop role, and list roles events CQL
--------- ----------------------------------------------------------------------------------------- --------------------
QUERY Logs all queries CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
ADMIN Logs service level operations: create, alter, drop, attach, detach, list. CQL
For :ref:`service level <workload-priorization-service-level-management>`
auditing.
========= =========================================================================================
========= ========================================================================================= ====================
For details on auditing Alternator operations, see :ref:`alternator-auditing`.
Note that enabling audit may negatively impact performance and audit-to-table may consume extra storage. That's especially true when auditing DML and QUERY categories, which generate a high volume of audit messages.
.. _alternator-auditing:
Auditing Alternator Requests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When auditing is enabled, Alternator (DynamoDB-compatible API) requests are audited using the same
backends and the same filtering configuration (``audit_categories``, ``audit_keyspaces``,
``audit_tables``) as CQL operations. No additional configuration is needed.
Both successful and failed Alternator requests are audited.
Alternator Operation Categories
""""""""""""""""""""""""""""""""
Each Alternator API operation is assigned to one of the standard audit categories:
========= ====================================================================================================
Category Alternator Operations
========= ====================================================================================================
DDL CreateTable, DeleteTable, UpdateTable, TagResource, UntagResource, UpdateTimeToLive
--------- ----------------------------------------------------------------------------------------------------
DML PutItem, UpdateItem, DeleteItem, BatchWriteItem
--------- ----------------------------------------------------------------------------------------------------
QUERY GetItem, BatchGetItem, Query, Scan, DescribeTable, ListTables, DescribeEndpoints,
ListTagsOfResource, DescribeTimeToLive, DescribeContinuousBackups,
ListStreams, DescribeStream, GetShardIterator, GetRecords
========= ====================================================================================================
.. note:: AUTH, DCL, and ADMIN categories do not apply to Alternator operations. These categories
are specific to CQL authentication, authorization, and service-level management.
Operation Field Format
"""""""""""""""""""""""
For CQL operations, the ``operation`` field in the audit log contains the raw CQL query string.
For Alternator operations, the format is:
.. code-block:: none
<OperationName>|<JSON request body>
For example:
.. code-block:: none
PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"},"c":{"S":"ck_val"},"v":{"S":"data"}}}
.. note:: The full JSON request body is included in the ``operation`` field. For batch operations
(such as BatchWriteItem), this can be very large (up to 16 MB).
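Given the format above, the operation name and the JSON request body can be separated on the first ``|`` with plain parameter expansion. A sketch using the PutItem example from this section (shortened):

```shell
# Split an Alternator audit `operation` field on the first '|'.
op_field='PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"}}}'
op_name=${op_field%%|*}   # everything before the first '|'
op_body=${op_field#*|}    # everything after the first '|'
echo "$op_name"
```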
Keyspace and Table Filtering for Alternator
""""""""""""""""""""""""""""""""""""""""""""
The real keyspace name of an Alternator table ``T`` is ``alternator_T``.
The ``audit_tables`` config flag uses the shorthand format ``alternator.T`` to refer to such
tables -- the parser expands it to the real keyspace name automatically.
For ``audit_keyspaces``, use the real keyspace name directly.
For example, to audit an Alternator table called ``my_table_name`` use either of the below:
.. code-block:: yaml
# Using audit_tables - use 'alternator' as the keyspace name:
audit_tables: "alternator.my_table_name"
# Using audit_keyspaces - use the real keyspace name:
audit_keyspaces: "alternator_my_table_name"
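The shorthand expansion described above amounts to a simple string rewrite. A sketch (the table name is the example from this section):

```shell
# Expand the audit_tables shorthand `alternator.<table>` to the
# real keyspace name `alternator_<table>`.
entry="alternator.my_table_name"
table=${entry#alternator.}
keyspace="alternator_${table}"
echo "$keyspace"
```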
**Global and batch operations**: Some Alternator operations are not scoped to a single table:
* ``ListTables`` and ``DescribeEndpoints`` have no associated keyspace or table.
* ``BatchWriteItem`` and ``BatchGetItem`` may span multiple tables.
These operations are logged whenever their category matches ``audit_categories``, regardless of
``audit_keyspaces`` or ``audit_tables`` filters. Their ``keyspace_name`` field is empty, and for
batch operations the ``table_name`` field contains a pipe-separated (``|``) list of all involved table names.
**DynamoDB Streams operations**: For streams-related operations (``DescribeStream``, ``GetShardIterator``,
``GetRecords``), the ``table_name`` field contains the base table name and the CDC log table name
separated by a pipe (e.g., ``my_table|my_table_scylla_cdc_log``).
Alternator Audit Log Examples
""""""""""""""""""""""""""""""
Syslog output example (PutItem):
.. code-block:: shell
Mar 18 10:15:03 ip-10-143-2-108 scylla-audit[28387]: node="10.143.2.108", category="DML", cl="LOCAL_QUORUM", error="false", keyspace="alternator_my_table", query="PutItem|{\"TableName\":\"my_table\",\"Item\":{\"p\":{\"S\":\"pk_val\"}}}", client_ip="127.0.0.1", table="my_table", username="anonymous"
Table output example (PutItem):
.. code-block:: shell
SELECT * FROM audit.audit_log ;
returns:
.. code-block:: none
date | node | event_time | category | consistency | error | keyspace_name | operation | source | table_name | username |
-------------------------+--------------+--------------------------------------+----------+--------------+-------+-----------------------+----------------------------------------------------------------------------------+-----------+------------+-----------+
2026-03-18 00:00:00+0000 | 10.143.2.108 | 3429b1a5-2a94-11e8-8f4e-000000000001 | DML | LOCAL_QUORUM | False | alternator_my_table | PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"}}} | 127.0.0.1 | my_table | anonymous |
(1 row)
Configuring Audit Storage
---------------------------
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>` or both.
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a ScyllaDB :ref:`table <auditing-table-storage>` or both.
.. _auditing-syslog-storage:
@@ -99,13 +208,13 @@ Storing Audit Messages in Syslog
# All tables in those keyspaces will be audited
audit_keyspaces: "mykeyspace"
#. Restart the Scylla node.
#. Restart the ScyllaDB node.
.. include:: /rst_include/scylla-commands-restart-index.rst
By default, audit messages are written to the same destination as Scylla :doc:`logging </getting-started/logging>`, with ``scylla-audit`` as the process name.
By default, audit messages are written to the same destination as ScyllaDB :doc:`logging </getting-started/logging>`, with ``scylla-audit`` as the process name.
Logging output example (drop table):
Logging output example (CQL drop table):
.. code-block:: shell
@@ -123,7 +232,7 @@ To redirect the Syslog output to a file, follow the steps below (available only
Storing Audit Messages in a Table
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Messages are stored in a Scylla table named ``audit.audit_log``.
Messages are stored in a ScyllaDB table named ``audit.audit_log``.
For example:
@@ -170,11 +279,11 @@ For example:
# All tables in those keyspaces will be audited
audit_keyspaces: "mykeyspace"
#. Restart Scylla node.
#. Restart the ScyllaDB node.
.. include:: /rst_include/scylla-commands-restart-index.rst
Table output example (drop table):
Table output example (CQL drop table):
.. code-block:: shell
@@ -196,7 +305,7 @@ Storing Audit Messages in a Table and Syslog Simultaneously
**Procedure**
#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart scylla only once.
#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart ScyllaDB only once.
To have both syslog and table you need to specify both backends separated by a comma:

@@ -1,41 +0,0 @@
================
About Upgrade
================
ScyllaDB upgrade is a rolling procedure - it does not require a full cluster
shutdown and is performed without any downtime or disruption of service.
To ensure a successful upgrade, follow
the :doc:`documented upgrade procedures <upgrade-guides/index>` tested by
ScyllaDB. This means that:
* You should follow the upgrade policy:
* Starting with version **2025.4**, upgrades can **skip minor versions** if:
* They remain within the same major version (for example, upgrading
directly from *2025.1 → 2025.4* is supported).
* You upgrade to the next major version (for example, upgrading
directly from *2025.3 → 2026.1* is supported).
* For versions **prior to 2025.4**, upgrades must be performed consecutively—
each successive X.Y version must be installed in order, **without skipping
any major or minor version** (for example, upgrading directly from 2025.1 → 2025.3
is not supported).
* You cannot skip major versions. Upgrades must move from one major version to
the next using the documented major-version upgrade path.
* You should upgrade to a supported version of ScyllaDB.
See `ScyllaDB Version Support <https://docs.scylladb.com/stable/versioning/version-support.html>`_.
* Before you upgrade to the next version, the whole cluster (each node) must
be upgraded to the previous version.
* You cannot perform an upgrade by replacing the nodes in the cluster with new
nodes with a different ScyllaDB version. You should never add a new node with
a different version to a cluster - if you
:doc:`add a node </operating-scylla/procedures/cluster-management/add-node-to-cluster>`,
it must have the same X.Y.Z (major.minor.patch) version as the other nodes in
the cluster.
Upgrading to each patch version by following the Maintenance Release Upgrade
Guide is optional. However, we recommend upgrading to the latest patch release
for your version before upgrading to a new version.

@@ -5,7 +5,6 @@ Upgrade ScyllaDB
.. toctree::
:titlesonly:
About Upgrade <about-upgrade>
Upgrade Guides <upgrade-guides/index>

@@ -5,6 +5,7 @@ Upgrade ScyllaDB
.. toctree::
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
ScyllaDB Image <ami-upgrade>

@@ -20,7 +20,7 @@ This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS,
and Ubuntu. See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported versions. It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.
See :doc:`About Upgrade </upgrade/about-upgrade/>` for the ScyllaDB upgrade policy.
See `Upgrade Policy <https://docs.scylladb.com/stable/versioning/upgrade-policy.html>`_ for the ScyllaDB upgrade policy.
Before You Upgrade ScyllaDB
==============================

@@ -0,0 +1,268 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 2026.x.y
.. |NEW_VERSION| replace:: 2026.x.z
==========================================================================
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
==========================================================================
This document describes a step-by-step procedure for upgrading from
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "z" is
the latest available version), and rolling back to version |SRC_VERSION|
if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported versions.
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
See `Upgrade Policy <https://docs.scylladb.com/stable/versioning/upgrade-policy.html>`_ for the ScyllaDB upgrade policy.
Upgrade Procedure
=================
.. note::
Apply the following procedure **serially** on each node. Do not move to the next
node before validating that the node is up and running the new version.
A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
shutdown. For each of the nodes in the cluster, you will:
#. Drain the node and back up the data.
#. Back up the configuration file.
#. Stop ScyllaDB.
#. Download and install new ScyllaDB packages.
#. Start ScyllaDB.
#. Validate that the upgrade was successful.
**Before** upgrading, check which version you are running now using
``scylla --version``. Note the current version in case you want to roll back
the upgrade.
**During** the rolling upgrade it is highly recommended:
* Not to use new |NEW_VERSION| features.
* Not to run administration operations, such as repair, refresh, rebuild, or
adding or removing nodes. See
`sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
ScyllaDB Manager's scheduled or running repairs.
* Not to apply schema changes.
Upgrade Steps
=============
Back up the data
------------------------------
Back up all the data to an external device. We recommend using
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
to create backups.
Alternatively, you can use the ``nodetool snapshot`` command.
For **each** node in the cluster, run the following:
.. code:: sh
nodetool drain
nodetool snapshot
Take note of the directory name that nodetool gives you, and copy all
the directories with this name under ``/var/lib/scylla`` to a backup device.
When the upgrade is completed on all nodes, remove the snapshot with the
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
space.
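The snapshot directories can be located by the tag name that ``nodetool snapshot`` reports. A minimal sketch using a temporary directory as a stand-in for ``/var/lib/scylla`` (the tag value and table path are illustrative, not real nodetool output):

```shell
# Stand-in layout for /var/lib/scylla/data; the tag is illustrative.
data_dir=$(mktemp -d)
tag=1650000000000
mkdir -p "$data_dir/ks/tbl-1234abcd/snapshots/$tag"

# Every table that was snapshotted gets a directory named after the tag.
found=$(find "$data_dir" -type d -name "$tag" | grep -c .)
echo "$found snapshot dir(s) for tag $tag"
```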
Back up the configuration file
------------------------------
Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
in case you need to roll back the upgrade.
.. tabs::
.. group-tab:: Debian/Ubuntu
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
.. group-tab:: RHEL/CentOS
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
Gracefully stop the node
------------------------
.. code:: sh
sudo service scylla-server stop
Download and install the new release
------------------------------------
You don't need to update the ScyllaDB DEB or RPM repo when you upgrade to
a patch release.
.. tabs::
.. group-tab:: Debian/Ubuntu
To install a patch version on Debian or Ubuntu, run:
.. code:: sh
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
To install a patch version on RHEL or CentOS, run:
.. code:: sh
sudo yum clean all
sudo yum update scylla\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for upgrade instructions.
If you're using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to apply an extended upgrade procedure:
#. Install the new ScyllaDB version with the additional
``scylla-machine-image`` package:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
sudo apt-get dist-upgrade scylla-machine-image
#. Run ``scylla_setup`` without running ``io_setup``.
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
to check the ScyllaDB version.
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
#. Check again after 2 minutes to validate that no new issues are introduced.
Once you are sure the node upgrade is successful, move to the next node in
the cluster.
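The version check can be scripted by comparing the API response with the expected version. A sketch with hard-coded values (the version string is a placeholder; in practice capture the ``curl`` output, which is typically a JSON-quoted string):

```shell
# Placeholder for the target patch version.
expected="2026.1.2"
# In practice:
#   reported=$(curl -s "http://localhost:10000/storage_service/scylla_release_version")
reported='"2026.1.2"'
# Strip the JSON quoting before comparing.
reported=$(printf '%s' "$reported" | tr -d '"')
if [ "$reported" = "$expected" ]; then
  echo "node is running $expected"
else
  echo "version mismatch: $reported"
fi
```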
Rollback Procedure
==================
The following procedure describes a rollback from ScyllaDB release
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.
* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
* Execute the following commands one node at a time, moving to the next node only
after the rollback procedure is completed successfully.
ScyllaDB rollback is a rolling procedure that does **not** require a full
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:
#. Drain the node and stop ScyllaDB.
#. Downgrade to the previous release.
#. Restore the configuration file.
#. Restart ScyllaDB.
#. Validate the rollback success.
Rollback Steps
==============
Gracefully shutdown ScyllaDB
-----------------------------
.. code:: sh
nodetool drain
sudo service scylla-server stop
Downgrade to the previous release
----------------------------------
.. tabs::
.. group-tab:: Debian/Ubuntu
To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:
.. code-block:: console
:substitutions:
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
To downgrade to |SRC_VERSION| on RHEL or CentOS, run:
.. code-block:: console
:substitutions:
sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for downgrade instructions.
If you're using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to additionally downgrade
the ``scylla-machine-image`` package.
.. code-block:: console
:substitutions:
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
sudo apt-get install scylla-machine-image=|SRC_VERSION|\*
Answer y to the first two questions.
Restore the configuration file
------------------------------
.. code:: sh
sudo rm -rf /etc/scylla/scylla.yaml
sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
Follow the validation steps from the upgrade procedure above. Once you are sure
the node rollback is successful, move to the next node in the cluster.

@@ -227,13 +227,19 @@ Security
Indexing and Caching
^^^^^^^^^^^^^^^^^^^^^
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
| Options | Support |
+==============================================================+======================================================================================+
|:doc:`Secondary Index </features/secondary-indexes>` | |v| |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:doc:`Materialized Views </features/materialized-views>` | |v| |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
| Options | Support |
+================================================================+======================================================================================+
|:doc:`Secondary Index </features/secondary-indexes>` | |v| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|StorageAttachedIndex (SAI) | |x| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:ref:`SAI for vector search <cassandra-sai-compatibility>` | |v| :sup:`*` |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:doc:`Materialized Views </features/materialized-views>` | |v| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``
Additional Features

View File

@@ -727,7 +727,12 @@ public:
// now we need one page more to be able to save one for next lap
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
auto buf2 = co_await _input.read_exactly(fill_size);
// If the underlying stream is already at EOF (e.g. buf1 came from
// cached _next while the previous read_exactly drained the source),
// skip the read_exactly call — it would return empty anyway.
auto buf2 = _input.eof()
? temporary_buffer<char>()
: co_await _input.read_exactly(fill_size);
temporary_buffer<char> output(buf1.size() + buf2.size());

View File

@@ -380,7 +380,7 @@ public:
}
template<typename HostType, typename CacheType, typename ConfigType>
shared_ptr<HostType> get_host(const sstring& host, CacheType& cache, const ConfigType& config_map) {
shared_ptr<HostType> get_host(const sstring& host, CacheType& cache, const ConfigType& config_map, std::string_view config_entry_name) {
auto& host_cache = cache[this_shard_id()];
auto it = host_cache.find(host);
if (it != host_cache.end()) {
@@ -394,23 +394,26 @@ public:
return result;
}
throw std::invalid_argument("No such host: " + host);
throw std::invalid_argument(fmt::format(
"Encryption host \"{}\" is not defined in scylla.yaml. "
"Make sure it is listed under the \"{}\" section.",
host, config_entry_name));
}
shared_ptr<kmip_host> get_kmip_host(const sstring& host) override {
return get_host<kmip_host>(host, _per_thread_kmip_host_cache, _cfg->kmip_hosts());
return get_host<kmip_host>(host, _per_thread_kmip_host_cache, _cfg->kmip_hosts(), "kmip_hosts");
}
shared_ptr<kms_host> get_kms_host(const sstring& host) override {
return get_host<kms_host>(host, _per_thread_kms_host_cache, _cfg->kms_hosts());
return get_host<kms_host>(host, _per_thread_kms_host_cache, _cfg->kms_hosts(), "kms_hosts");
}
shared_ptr<gcp_host> get_gcp_host(const sstring& host) override {
return get_host<gcp_host>(host, _per_thread_gcp_host_cache, _cfg->gcp_hosts());
return get_host<gcp_host>(host, _per_thread_gcp_host_cache, _cfg->gcp_hosts(), "gcp_hosts");
}
shared_ptr<azure_host> get_azure_host(const sstring& host) override {
return get_host<azure_host>(host, _per_thread_azure_host_cache, _cfg->azure_hosts());
return get_host<azure_host>(host, _per_thread_azure_host_cache, _cfg->azure_hosts(), "azure_hosts");
}

View File

@@ -437,7 +437,6 @@ void ldap_connection::poll_results() {
const auto found = _msgid_to_promise.find(id);
if (found == _msgid_to_promise.end()) {
mylog.error("poll_results: got valid result for unregistered id {}, dropping it", id);
ldap_msgfree(result);
} else {
found->second.set_value(std::move(result_ptr));
_msgid_to_promise.erase(found);

View File

@@ -41,7 +41,7 @@ public:
_ip == other._ip;
}
endpoint_state(inet_address ip) noexcept
explicit endpoint_state(inet_address ip) noexcept
: _heart_beat_state()
, _update_timestamp(clk::now())
, _ip(ip)

View File

@@ -172,12 +172,14 @@ public:
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
gms::feature logstor { *this, "LOGSTOR"sv };
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
gms::feature tablets_intermediate_fallback_cleanup { *this, "TABLETS_INTERMEDIATE_FALLBACK_CLEANUP"sv };
gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv };
gms::feature vnodes_to_tablets_migrations { *this, "VNODES_TO_TABLETS_MIGRATIONS"sv };
public:
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;

View File

@@ -59,7 +59,6 @@ using clk = gossiper::clk;
static logging::logger logger("gossip");
constexpr std::chrono::milliseconds gossiper::INTERVAL;
constexpr std::chrono::hours gossiper::A_VERY_LONG_TIME;
constexpr generation_type::value_type gossiper::MAX_GENERATION_DIFFERENCE;
const sstring& gossiper::get_cluster_name() const noexcept {
@@ -648,7 +647,7 @@ future<> gossiper::do_apply_state_locally(locator::host_id node, endpoint_state
}
// Re-rake after apply_new_states
es = get_endpoint_state_ptr(node);
if (!is_alive(es->get_host_id()) && !is_dead_state(*es) && !shadow_round) { // unless of course, it was dead
if (!is_alive(es->get_host_id()) && !is_left(*es) && !shadow_round) { // unless of course, it was dead
mark_alive(es);
}
} else {
@@ -767,7 +766,7 @@ future<> gossiper::remove_endpoint(locator::host_id endpoint, permit_id pid) {
if (was_alive) {
try {
logger.info("InetAddress {}/{} is now DOWN, status = {}", state->get_host_id(), ip, get_gossip_status(*state));
logger.info("InetAddress {}/{} is now DOWN, status = {}", host_id, ip, get_node_status(host_id));
co_await do_on_dead_notifications(ip, std::move(state), pid);
} catch (...) {
logger.warn("Fail to call on_dead callback: {}", std::current_exception());
@@ -1174,10 +1173,10 @@ future<> gossiper::unregister_(shared_ptr<i_endpoint_state_change_subscriber> su
std::set<locator::host_id> gossiper::get_live_members() const {
std::set<locator::host_id> live_members(_live_endpoints.begin(), _live_endpoints.end());
auto myip = get_broadcast_address();
auto myid = my_host_id();
logger.debug("live_members before={}", live_members);
if (!is_shutdown(myip)) {
live_members.insert(my_host_id());
if (!is_shutdown(myid)) {
live_members.insert(myid);
}
logger.debug("live_members after={}", live_members);
return live_members;
@@ -1248,7 +1247,6 @@ future<> gossiper::evict_from_membership(locator::host_id hid, permit_id pid) {
}
g._endpoint_state_map.erase(hid);
});
_expire_time_endpoint_map.erase(hid);
logger.debug("evicting {} from gossip", hid);
}
@@ -1321,21 +1319,6 @@ future<> gossiper::replicate(endpoint_state es, permit_id pid) {
}
}
future<> gossiper::advertise_token_removed(locator::host_id host_id, permit_id pid) {
auto permit = co_await lock_endpoint(host_id, pid);
pid = permit.id();
auto eps = get_endpoint_state(host_id);
eps.update_timestamp(); // make sure we don't evict it too soon
eps.get_heart_beat_state().force_newer_generation_unsafe();
auto expire_time = compute_expire_time();
eps.add_application_state(application_state::STATUS, versioned_value::removed_nonlocal(host_id, expire_time.time_since_epoch().count()));
logger.info("Completing removal of {}", host_id);
add_expire_time_for_endpoint(host_id, expire_time);
co_await replicate(std::move(eps), pid);
// ensure at least one gossip round occurs before returning
co_await sleep_abortable(INTERVAL * 2, _abort_source);
}
future<> gossiper::assassinate_endpoint(sstring address) {
throw std::runtime_error("Assassinating endpoint is not supported in topology over raft mode");
}
@@ -1368,13 +1351,10 @@ future<> gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
std::uniform_real_distribution<double> dist(0, 1);
double rand_dbl = dist(_random_engine);
if (rand_dbl < prob) {
std::set<locator::host_id> addrs;
for (auto&& x : _unreachable_endpoints) {
// Ignore the node which is decommissioned
if (get_gossip_status(_address_map.get(x.first)) != sstring(versioned_value::STATUS_LEFT)) {
addrs.insert(x.first);
}
}
auto addrs = _unreachable_endpoints | std::ranges::views::keys | std::views::filter([this] (auto ep) {
// Ignore the node which is no longer part of the cluster
return !_topo_sm._topology.left_nodes.contains(raft::server_id(ep.uuid()));
}) | std::ranges::to<std::set>();
logger.trace("do_gossip_to_unreachable_member: live_endpoint nr={} unreachable_endpoints nr={}",
live_endpoint_count, unreachable_endpoint_count);
return send_gossip(message, addrs);
@@ -1383,17 +1363,6 @@ future<> gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
return make_ready_future<>();
}
clk::time_point gossiper::get_expire_time_for_endpoint(locator::host_id id) const noexcept {
/* default expire_time is A_VERY_LONG_TIME */
auto it = _expire_time_endpoint_map.find(id);
if (it == _expire_time_endpoint_map.end()) {
return compute_expire_time();
} else {
auto stored_time = it->second;
return stored_time;
}
}
endpoint_state_ptr gossiper::get_endpoint_state_ptr(locator::host_id ep) const noexcept {
auto it = _endpoint_state_map.find(ep);
if (it == _endpoint_state_map.end()) {
@@ -1420,7 +1389,7 @@ endpoint_state& gossiper::my_endpoint_state() {
auto ep = get_broadcast_address();
auto it = _endpoint_state_map.find(id);
if (it == _endpoint_state_map.end()) {
it = _endpoint_state_map.emplace(id, make_endpoint_state_ptr({ep})).first;
it = _endpoint_state_map.emplace(id, make_endpoint_state_ptr(endpoint_state{ep})).first;
}
return const_cast<endpoint_state&>(*it->second);
}
@@ -1634,9 +1603,8 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
}
// Do not mark a node with status shutdown as UP.
auto status = sstring(get_gossip_status(*es));
if (status == sstring(versioned_value::SHUTDOWN)) {
logger.warn("Skip marking node {} with status = {} as UP", host_id, status);
if (is_shutdown(*es)) {
logger.warn("Skip marking node {} with status = shutdown as UP", host_id);
co_return;
}
@@ -1649,7 +1617,6 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
auto [it_, inserted] = data.live.insert(addr);
was_live = !inserted;
});
_expire_time_endpoint_map.erase(host_id);
if (was_live) {
co_return;
}
@@ -1662,7 +1629,7 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
auto addr = es->get_ip();
logger.info("InetAddress {}/{} is now UP, status = {}", host_id, addr, status);
logger.info("InetAddress {}/{} is now UP, status = {}", host_id, addr, get_node_status(host_id));
co_await _subscribers.for_each([addr, host_id, es, pid = permit.id()] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) -> future<> {
co_await subscriber->on_alive(addr, host_id, es, pid);
@@ -1678,7 +1645,7 @@ future<> gossiper::mark_dead(locator::host_id addr, endpoint_state_ptr state, pe
data.live.erase(addr);
data.unreachable[addr] = now();
});
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(*state));
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_node_status(addr));
co_await do_on_dead_notifications(state->get_ip(), std::move(state), pid);
}
@@ -1688,14 +1655,14 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
endpoint_state_ptr eps_old = get_endpoint_state_ptr(ep);
if (!is_dead_state(eps) && !shadow_round) {
if (!is_left(eps) && !shadow_round) {
if (_endpoint_state_map.contains(ep)) {
logger.info("Node {} has restarted, now UP, status = {}", ep, get_gossip_status(eps));
logger.info("Node {} has restarted, now UP, status = {}", ep, get_node_status(ep));
} else {
logger.debug("Node {} is now part of the cluster, status = {}", ep, get_gossip_status(eps));
logger.debug("Node {} is now part of the cluster, status = {}", ep, get_node_status(ep));
}
}
logger.trace("Adding endpoint state for {}, status = {}", ep, get_gossip_status(eps));
logger.trace("Adding endpoint state for {}, status = {}", ep, get_node_status(ep));
co_await replicate(eps, pid);
if (shadow_round) {
@@ -1713,10 +1680,10 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
if (!ep_state) {
throw std::out_of_range(format("ep={}", ep));
}
if (!is_dead_state(*ep_state)) {
if (!is_left(*ep_state)) {
mark_alive(ep_state);
} else {
logger.debug("Not marking {} alive due to dead state {}", ep, get_gossip_status(eps));
logger.debug("Not marking {} alive due to dead state {}", ep, get_node_status(ep));
co_await mark_dead(ep, ep_state, pid);
}
@@ -1730,8 +1697,8 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
}
}
bool gossiper::is_dead_state(const endpoint_state& eps) const {
return std::ranges::any_of(DEAD_STATES, [state = get_gossip_status(eps)](const auto& deadstate) { return state == deadstate; });
bool gossiper::is_left(const endpoint_state& eps) const {
return _topo_sm._topology.left_nodes.contains(raft::server_id(eps.get_host_id().uuid()));
}
bool gossiper::is_shutdown(const locator::host_id& endpoint) const {
@@ -1746,10 +1713,6 @@ bool gossiper::is_normal(const locator::host_id& endpoint) const {
return get_gossip_status(endpoint) == versioned_value::STATUS_NORMAL;
}
bool gossiper::is_silent_shutdown_state(const endpoint_state& ep_state) const{
return std::ranges::any_of(SILENT_SHUTDOWN_STATES, [state = get_gossip_status(ep_state)](const auto& deadstate) { return state == deadstate; });
}
future<> gossiper::apply_new_states(endpoint_state local_state, const endpoint_state& remote_state, permit_id pid, bool shadow_round) {
// don't SCYLLA_ASSERT here, since if the node restarts the version will go back to zero
//int oldVersion = local_state.get_heart_beat_state().get_heart_beat_version();
@@ -2173,16 +2136,14 @@ future<> gossiper::do_stop_gossiping() {
logger.info("gossip is already stopped");
co_return;
}
auto my_ep_state = get_this_endpoint_state_ptr();
if (my_ep_state) {
logger.info("My status = {}", get_gossip_status(*my_ep_state));
}
if (my_ep_state && !is_silent_shutdown_state(*my_ep_state)) {
if (my_ep_state && _topo_sm._topology.normal_nodes.contains(raft::server_id(my_host_id().uuid()))) {
auto local_generation = my_ep_state->get_heart_beat_state().get_generation();
logger.info("Announcing shutdown");
co_await add_local_application_state(application_state::STATUS, versioned_value::shutdown(true));
auto live_endpoints = _live_endpoints;
for (locator::host_id id : live_endpoints) {
co_await coroutine::parallel_for_each(live_endpoints, [this, &local_generation] (locator::host_id id) -> future<> {
logger.info("Sending a GossipShutdown to {} with generation {}", id, local_generation);
try {
co_await ser::gossip_rpc_verbs::send_gossip_shutdown(&_messaging, id, get_broadcast_address(), local_generation.value());
@@ -2190,7 +2151,7 @@ future<> gossiper::do_stop_gossiping() {
} catch (...) {
logger.warn("Fail to send GossipShutdown to {}: {}", id, std::current_exception());
}
}
});
co_await sleep(std::chrono::milliseconds(_gcfg.shutdown_announce_ms));
} else {
logger.warn("No local state or state is in silent shutdown, not announcing shutdown");
@@ -2241,19 +2202,6 @@ bool gossiper::is_enabled() const {
return _enabled && !_abort_source.abort_requested();
}
void gossiper::add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time) {
auto now_ = now();
auto diff = std::chrono::duration_cast<std::chrono::seconds>(expire_time - now_).count();
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T %z}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::gmtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
now_.time_since_epoch().count(), diff);
_expire_time_endpoint_map[endpoint] = expire_time;
}
clk::time_point gossiper::compute_expire_time() {
return now() + A_VERY_LONG_TIME;
}
bool gossiper::is_alive(locator::host_id id) const {
if (id == my_host_id()) {
return true;
@@ -2373,91 +2321,22 @@ std::string_view gossiper::get_gossip_status(const locator::host_id& endpoint) c
return do_get_gossip_status(get_application_state_ptr(endpoint, application_state::STATUS));
}
bool gossiper::is_safe_for_bootstrap(inet_address endpoint) const {
// We allow to bootstrap a new node in only two cases:
// 1) The node is a completely new node and no state in gossip at all
// 2) The node has state in gossip and it is already removed from the
// cluster either by nodetool decommission or nodetool removenode
bool allowed = true;
auto host_id = try_get_host_id(endpoint);
if (!host_id) {
logger.debug("is_safe_for_bootstrap: node={}, status=no state in gossip, allowed_to_bootstrap={}", endpoint, allowed);
return allowed;
std::string gossiper::get_node_status(const locator::host_id& endpoint) const noexcept {
if (this_shard_id() != 0) {
on_internal_error(logger, "get_node_status should only be called on shard 0");
}
auto eps = get_endpoint_state_ptr(*host_id);
if (!eps) {
logger.debug("is_safe_for_bootstrap: node={}, status=no state in gossip, allowed_to_bootstrap={}", endpoint, allowed);
return allowed;
if (is_shutdown(endpoint)) {
return "shutdown";
}
auto status = get_gossip_status(*eps);
std::unordered_set<std::string_view> allowed_statuses{
versioned_value::STATUS_LEFT,
versioned_value::REMOVED_TOKEN,
};
allowed = allowed_statuses.contains(status);
logger.debug("is_safe_for_bootstrap: node={}, status={}, allowed_to_bootstrap={}", endpoint, status, allowed);
return allowed;
}
std::set<sstring> gossiper::get_supported_features(locator::host_id endpoint) const {
auto app_state = get_application_state_ptr(endpoint, application_state::SUPPORTED_FEATURES);
if (!app_state) {
return {};
}
return feature_service::to_feature_set(app_state->value());
}
std::set<sstring> gossiper::get_supported_features(const std::unordered_map<locator::host_id, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const {
std::unordered_map<locator::host_id, std::set<sstring>> features_map;
std::set<sstring> common_features;
for (auto& x : loaded_peer_features) {
auto features = feature_service::to_feature_set(x.second);
if (features.empty()) {
logger.warn("Loaded empty features for peer node {}", x.first);
} else {
features_map.emplace(x.first, std::move(features));
auto n = _topo_sm._topology.find(raft::server_id{endpoint.uuid()});
if (!n) {
if (_topo_sm._topology.left_nodes.contains(raft::server_id{endpoint.uuid()})) {
return "left";
}
return "unknown";
} else {
return fmt::format("{}", n->second.state);
}
for (auto& x : _endpoint_state_map) {
auto host_id = x.second->get_host_id();
auto features = get_supported_features(host_id);
if (ignore_local_node && host_id == my_host_id()) {
logger.debug("Ignore SUPPORTED_FEATURES of local node: features={}", features);
continue;
}
if (features.empty()) {
auto it = loaded_peer_features.find(host_id);
if (it != loaded_peer_features.end()) {
logger.info("Node {} does not contain SUPPORTED_FEATURES in gossip, using features saved in system table, features={}", host_id, feature_service::to_feature_set(it->second));
} else {
logger.warn("Node {} does not contain SUPPORTED_FEATURES in gossip or system table", host_id);
}
} else {
// Replace the features with live info
features_map[host_id] = std::move(features);
}
}
if (ignore_local_node) {
features_map.erase(my_host_id());
}
if (!features_map.empty()) {
common_features = features_map.begin()->second;
}
for (auto& x : features_map) {
auto& features = x.second;
std::set<sstring> result;
std::set_intersection(features.begin(), features.end(),
common_features.begin(), common_features.end(),
std::inserter(result, result.end()));
common_features = std::move(result);
}
common_features.erase("");
return common_features;
}
void gossiper::check_snitch_name_matches(sstring local_snitch_name) const {

View File

@@ -91,7 +91,6 @@ struct loaded_endpoint_state {
class gossiper : public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
public:
using clk = seastar::lowres_system_clock;
using ignore_features_of_local_node = bool_class<class ignore_features_of_local_node_tag>;
using generation_for_nodes = std::unordered_map<locator::host_id, generation_type>;
private:
using messaging_verb = netw::messaging_verb;
@@ -198,18 +197,7 @@ private:
endpoint_locks_map _endpoint_locks;
public:
static constexpr std::array DEAD_STATES{
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
};
static constexpr std::array SILENT_SHUTDOWN_STATES{
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
versioned_value::STATUS_BOOTSTRAPPING,
versioned_value::STATUS_UNKNOWN,
};
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};
// Maximum difference between remote generation value and generation
// value this node would get if this node were restarted that we are
@@ -241,7 +229,6 @@ private:
/* initial seeds for joining the cluster */
std::set<inet_address> _seeds;
std::map<locator::host_id, clk::time_point> _expire_time_endpoint_map;
bool _in_shadow_round = false;
@@ -341,13 +328,6 @@ private:
utils::chunked_vector<gossip_digest> make_random_gossip_digest() const;
public:
/**
* Handles switching the endpoint's state from REMOVING_TOKEN to REMOVED_TOKEN
*
* @param endpoint
* @param host_id
*/
future<> advertise_token_removed(locator::host_id host_id, permit_id);
/**
* Do not call this method unless you know what you are doing.
@@ -363,7 +343,6 @@ public:
future<generation_type> get_current_generation_number(locator::host_id endpoint) const;
future<version_type> get_current_heart_beat_version(locator::host_id endpoint) const;
bool is_safe_for_bootstrap(inet_address endpoint) const;
private:
/**
* Returns true if the chosen target was also a seed. False otherwise
@@ -383,7 +362,6 @@ private:
future<> do_gossip_to_unreachable_member(gossip_digest_syn message);
public:
clk::time_point get_expire_time_for_endpoint(locator::host_id endpoint) const noexcept;
// Gets a shared pointer to the endpoint_state, if exists.
// Otherwise, returns a null ptr.
@@ -467,7 +445,7 @@ private:
public:
bool is_alive(locator::host_id id) const;
bool is_dead_state(const endpoint_state& eps) const;
bool is_left(const endpoint_state& eps) const;
// Wait for nodes to be alive on all shards
future<> wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout);
future<> wait_alive(std::vector<locator::host_id> nodes, std::chrono::milliseconds timeout);
@@ -588,17 +566,12 @@ public:
public:
bool is_enabled() const;
public:
void add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time);
static clk::time_point compute_expire_time();
public:
bool is_seed(const inet_address& endpoint) const;
bool is_shutdown(const locator::host_id& endpoint) const;
bool is_shutdown(const endpoint_state& eps) const;
bool is_normal(const locator::host_id& endpoint) const;
bool is_cql_ready(const locator::host_id& endpoint) const;
bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
void force_newer_generation();
public:
std::string_view get_gossip_status(const endpoint_state& ep_state) const noexcept;
@@ -615,12 +588,8 @@ private:
gossip_address_map& _address_map;
gossip_config _gcfg;
condition_variable _failure_detector_loop_cv;
// Get features supported by a particular node
std::set<sstring> get_supported_features(locator::host_id endpoint) const;
locator::token_metadata_ptr get_token_metadata_ptr() const noexcept;
public:
// Get features supported by all the nodes this node knows about
std::set<sstring> get_supported_features(const std::unordered_map<locator::host_id, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const;
std::string get_node_status(const locator::host_id& endpoint) const noexcept;
private:
seastar::metrics::metric_groups _metrics;
public:

Some files were not shown because too many files have changed in this diff.