docs: document alternator_streams_increased_compatibility option in compatibility.md

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Initial plan
2026-04-20 16:40:35 +00:00 · 2026-03-09 16:37:52 +00:00 · 2026-03-09 16:35:57 +00:00 · 2026-03-09 11:42:35 +01:00 · 2026-03-09 12:12:04 +02:00 · 2026-03-09 10:50:09 +01:00
3729 changed files with 13451 additions and 8651 deletions
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches: 
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_sync_milestone_to_jira.yml
+++ b/.github/workflows/call_sync_milestone_to_jira.yml
@@ -9,6 +9,6 @@ jobs:
    uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
    with:
      # Comma-separated list of Jira project keys
-      jira_project_keys: "SCYLLADB,CUSTOMER,SMI"
+      jira_project_keys: "SCYLLADB,CUSTOMER,SMI,RELENG"
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -12,16 +12,38 @@ jobs:
    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
+      - name: Verify Org Membership
+        id: verify_author
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+          PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
+          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
+          COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
+        shell: bash
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
+            AUTHOR="$PR_AUTHOR"
+            ASSOCIATION="$PR_ASSOCIATION"
+          else
+            AUTHOR="$COMMENT_AUTHOR"
+            ASSOCIATION="$COMMENT_ASSOCIATION"
+          fi
+          if [[ "$ASSOCIATION" == "MEMBER" || "$ASSOCIATION" == "OWNER" ]]; then
+            echo "member=true" >> $GITHUB_OUTPUT
+          else
+            echo "::warning::${AUTHOR} is not a member of scylladb (association: ${ASSOCIATION}); skipping CI trigger."
+            echo "member=false" >> $GITHUB_OUTPUT
+          fi
+
      - name: Validate Comment Trigger
        if: github.event_name == 'issue_comment'
        id: verify_comment
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
        shell: bash
        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
+          CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')

          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
            echo "trigger=true" >> $GITHUB_OUTPUT
@@ -30,13 +52,13 @@ jobs:
          fi

      - name: Trigger Scylla-CI-Route Jenkins Job
-        if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
+        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
+          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
+          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
        run: |
-          PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
-          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
+            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,7 +300,6 @@ add_subdirectory(locator)
 add_subdirectory(message)
 add_subdirectory(mutation)
 add_subdirectory(mutation_writer)
-add_subdirectory(node_ops)
 add_subdirectory(readers)
 add_subdirectory(replica)
 add_subdirectory(raft)
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -63,6 +63,7 @@
 #include "types/types.hh"
 #include "db/system_keyspace.hh"
 #include "cql3/statements/ks_prop_defs.hh"
+#include "alternator/ttl_tag.hh"

 using namespace std::chrono_literals;

@@ -164,7 +165,7 @@ static map_type attrs_type() {

 static const column_definition& attrs_column(const schema& schema) {
    const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
-    SCYLLA_ASSERT(cdef);
+    throwing_assert(cdef);
    return *cdef;
 }

@@ -1649,7 +1650,7 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
 }

 future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
-    SCYLLA_ASSERT(this_shard_id() == 0);
+    throwing_assert(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
@@ -2837,14 +2838,12 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        }
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
-        SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
+        throwing_assert(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
            return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
        });
    }
-    if (!cas_shard) {
-        on_internal_error(elogger, "cas_shard is not set");
-    }
+    throwing_assert(cas_shard);
    // If we're still here, we need to do this write using LWT:
    global_stats.write_using_lwt++;
    per_table_stats.write_using_lwt++;
@@ -5413,7 +5412,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
 }

 static dht::token token_for_segment(int segment, int total_segments) {
-    SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
+    throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments);
    uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
 }
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -710,7 +710,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        ++_executor._stats.requests_blocked_memory;
    }
    auto units = co_await std::move(units_fut);
-    SCYLLA_ASSERT(req->content_stream);
+    throwing_assert(req->content_stream);
    chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
    // If the request had no Content-Length, we reserved too many units
    // so need to return some
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -46,6 +46,7 @@
 #include "alternator/executor.hh"
 #include "alternator/controller.hh"
 #include "alternator/serialization.hh"
+#include "alternator/ttl_tag.hh"
 #include "dht/sharder.hh"
 #include "db/config.hh"
 #include "db/tags/utils.hh"
@@ -57,19 +58,10 @@ static logging::logger tlogger("alternator_ttl");

 namespace alternator {

-// We write the expiration-time attribute enabled on a table in a
-// tag TTL_TAG_KEY.
-// Currently, the *value* of this tag is simply the name of the attribute,
-// and the expiration scanner interprets it as an Alternator attribute name -
-// It can refer to a real column or if that doesn't exist, to a member of
-// the ":attrs" map column. Although this is designed for Alternator, it may
-// be good enough for CQL as well (there, the ":attrs" column won't exist).
-extern const sstring TTL_TAG_KEY;
-
 future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_time_to_live++;
    if (!_proxy.features().alternator_ttl) {
-        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
+        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Upgrade all nodes to a version that supports it.");
    }

    schema_ptr schema = get_table(_proxy, request);
@@ -324,9 +316,7 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
    const auto& tm = *erm->get_token_metadata_ptr();
    const auto& sorted_tokens = tm.sorted_tokens();
    std::vector<std::pair<dht::token_range, locator::host_id>> ret;
-    if (sorted_tokens.empty()) {
-        on_internal_error(tlogger, "Token metadata is empty");
-    }
+    throwing_assert(!sorted_tokens.empty());
    auto prev_tok = sorted_tokens.back();
    for (const auto& tok : sorted_tokens) {
        co_await coroutine::maybe_yield();
@@ -563,7 +553,7 @@ static future<> scan_table_ranges(
        expiration_service::stats& expiration_stats)
 {
    const schema_ptr& s = scan_ctx.s;
-    SCYLLA_ASSERT (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
+    throwing_assert(partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
    auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
            *scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
    while (!p->is_exhausted()) {
@@ -640,13 +630,38 @@ static future<> scan_table_ranges(
                }
            } else {
                // For a real column to contain an expiration time, it
-                // must be a numeric type.
-                // FIXME: Currently we only support decimal_type (which is
-                // what Alternator uses), but other numeric types can be
-                // supported as well to make this feature more useful in CQL.
-                // Note that kind::decimal is also checked above.
-                big_decimal n = value_cast<big_decimal>(v);
-                expired = is_expired(n, now);
+                // must be a numeric type. We currently support decimal
+                // (used by Alternator TTL) as well as bigint, int and
+                // timestamp (used by CQL per-row TTL).
+                switch (meta[*expiration_column]->type->get_kind()) {
+                    case abstract_type::kind::decimal:
+                        // Used by Alternator TTL for key columns not stored
+                        // in the map. The value is in seconds, fractional
+                        // part is ignored.
+                        expired = is_expired(value_cast<big_decimal>(v), now);
+                        break;
+                    case abstract_type::kind::long_kind:
+                        // Used by CQL per-row TTL. The value is in seconds.
+                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int64_t>(v))), now);
+                        break;
+                    case abstract_type::kind::int32:
+                        // Used by CQL per-row TTL. The value is in seconds.
+                        // Using int type is not recommended because it will
+                        // overflow in 2038, but we support it to allow users
+                        // to use existing int columns for expiration.
+                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int32_t>(v))), now);
+                        break;
+                    case abstract_type::kind::timestamp:
+                        // Used by CQL per-row TTL. The value is in milliseconds
+                        // but we truncate it to gc_clock's precision (whole seconds).
+                        expired = is_expired(gc_clock::time_point(std::chrono::duration_cast<gc_clock::duration>(value_cast<db_clock::time_point>(v).time_since_epoch())), now);
+                        break;
+                    default:
+                        // Should never happen - we verified the column's type
+                        // before starting the scan.
+                        [[unlikely]]
+                        on_internal_error(tlogger, format("expiration scanner value of unsupported type {} in column {}", meta[*expiration_column]->type->cql3_type_name(), scan_ctx.column_name) );
+                }
            }
            if (expired) {
                expiration_stats.items_deleted++;
@@ -708,16 +723,12 @@ static future<bool> scan_table(
        co_return false;
    }
    // attribute_name may be one of the schema's columns (in Alternator, this
-    // means it's a key column), or an element in Alternator's attrs map
-    // encoded in Alternator's JSON encoding.
-    // FIXME: To make this less Alternators-specific, we should encode in the
-    // single key's value three things:
-    // 1. The name of a column
-    // 2. Optionally if column is a map, a member in the map
-    // 3. The deserializer for the value: CQL or Alternator (JSON).
-    // The deserializer can be guessed: If the given column or map item is
-    // numeric, it can be used directly. If it is a "bytes" type, it needs to
-    // be deserialized using Alternator's deserializer.
+    // means a key column, in CQL it's a regular column), or an element in
+    // Alternator's attrs map encoded in Alternator's JSON encoding (which we
+    // decode). If attribute_name is a real column, in Alternator it will have
+    // the type decimal, counting seconds since the UNIX epoch, while in CQL
+    // it will be one of the types bigint or int (counting seconds) or timestamp
+    // (counting milliseconds).
    bytes column_name = to_bytes(*attribute_name);
    const column_definition *cd = s->get_column_definition(column_name);
    std::optional<std::string> member;
@@ -736,11 +747,14 @@ static future<bool> scan_table(
    data_type column_type = cd->type;
    // Verify that the column has the right type: If "member" exists
    // the column must be a map, and if it doesn't, the column must
-    // (currently) be a decimal_type. If the column has the wrong type
-    // nothing can get expired in this table, and it's pointless to
-    // scan it.
+    // be decimal_type (Alternator), bigint, int or timestamp (CQL).
+    // If the column has the wrong type nothing can get expired in
+    // this table, and it's pointless to scan it.
    if ((member && column_type->get_kind() != abstract_type::kind::map) ||
-        (!member && column_type->get_kind() != abstract_type::kind::decimal)) {
+        (!member && column_type->get_kind() != abstract_type::kind::decimal &&
+         column_type->get_kind() != abstract_type::kind::long_kind &&
+         column_type->get_kind() != abstract_type::kind::int32 &&
+         column_type->get_kind() != abstract_type::kind::timestamp)) {
        tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
        co_return false;
    }
@@ -767,7 +781,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -878,12 +892,10 @@ future<> expiration_service::run() {
 future<> expiration_service::start() {
    // Called by main() on each shard to start the expiration-service
    // thread. Just runs run() in the background and allows stop().
-    if (_db.features().alternator_ttl) {
-        if (!shutting_down()) {
-            _end = run().handle_exception([] (std::exception_ptr ep) {
-                tlogger.error("expiration_service failed: {}", ep);
-            });
-        }
+    if (!shutting_down()) {
+        _end = run().handle_exception([] (std::exception_ptr ep) {
+            tlogger.error("expiration_service failed: {}", ep);
+        });
    }
    return make_ready_future<>();
 }
--- a/alternator/ttl_tag.hh
+++ b/alternator/ttl_tag.hh
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2026-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "seastarx.hh"
+#include <seastar/core/sstring.hh>
+
+namespace alternator {
+// We use the table tag TTL_TAG_KEY ("system:ttl_attribute") to remember
+// which attribute was chosen as the expiration-time attribute for
+// Alternator's TTL and CQL's per-row TTL features.
+// Currently, the *value* of this tag is simply the name of the attribute:
+// It can refer to a real column or if that doesn't exist, to a member of
+// the ":attrs" map column (which Alternator uses).
+extern const sstring TTL_TAG_KEY;
+} // namespace alternator
+
+// let users use TTL_TAG_KEY without the "alternator::" prefix,
+// to make it easier to move it to a different namespace later.
+using alternator::TTL_TAG_KEY;
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -3085,6 +3085,48 @@
            }
         ]
      },
+
+      {
+         "path":"/storage_service/tablets/snapshots",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Takes the snapshot for the given keyspaces/tables. A snapshot name must be specified.",
+               "type":"void",
+               "nickname":"take_cluster_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"tag",
+                     "description":"the tag given to the snapshot",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"keyspace",
+                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"table",
+                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+
      {
         "path":"/storage_service/quiesce_topology",
         "operations":[
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -783,17 +783,13 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        apilog.info("cleanup_all global={}", global);

-        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
-            if (!ss.is_topology_coordinator_enabled()) {
-                co_return false;
-            }
-            co_await ss.do_clusterwide_vnodes_cleanup();
-            co_return true;
-        });
-        if (done) {
+        if (global) {
+            co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+                co_return co_await ss.do_clusterwide_vnodes_cleanup();
+            });
            co_return json::json_return_type(0);
        }
-        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
+        // a global cleanup was not requested, so perform the local cleanup
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
@@ -801,9 +797,7 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        // Mark this node as clean
        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
-            if (ss.is_topology_coordinator_enabled()) {
-                co_await ss.reset_cleanup_needed();
-            }
+            co_await ss.reset_cleanup_needed();
        });

        co_return json::json_return_type(0);
@@ -814,9 +808,6 @@ future<json::json_return_type>
 rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
        apilog.info("reset_cleanup_needed");
        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
-            if (!ss.is_topology_coordinator_enabled()) {
-                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
-            }
            return ss.reset_cleanup_needed();
        });
        co_return json_void();
@@ -1574,16 +1565,7 @@ rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::
 static
 future<json::json_return_type>
 rest_upgrade_to_raft_topology(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("Requested to schedule upgrade to raft topology");
-        try {
-            co_await ss.invoke_on(0, [] (auto& ss) {
-                return ss.start_upgrade_to_raft_topology();
-            });
-        } catch (...) {
-            auto ex = std::current_exception();
-            apilog.error("Failed to schedule upgrade to raft topology: {}", ex);
-            std::rethrow_exception(std::move(ex));
-        }
+        apilog.info("Requested to schedule upgrade to raft topology, but this version does not need it since it uses raft topology by default.");
        co_return json_void();
 }

@@ -2025,6 +2007,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
+        auto tcopt = req->get_query_param("tc");
+
        db::snapshot_options opts = {
            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
        };
@@ -2049,6 +2033,27 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        }
    });

+    ss::take_cluster_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        apilog.info("take_cluster_snapshot: {}", req->get_query_params());
+        auto tag = req->get_query_param("tag");
+        auto column_families = split(req->get_query_param("table"), ",");
+        // Note: "skip_flush" is not a published (documented) parameter; it is retained as an internal option only.
+        auto sfopt = req->get_query_param("skip_flush");
+
+        db::snapshot_options opts = {
+            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
+        };
+
+        std::vector<sstring> keynames = split(req->get_query_param("keyspace"), ",");
+        try {
+            co_await snap_ctl.local().take_cluster_column_family_snapshot(keynames, column_families, tag, opts);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("take_cluster_snapshot failed: {}", std::current_exception());
+            throw;
+        }
+    });
+
    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        apilog.info("del_snapshot: {}", req->get_query_params());
        auto tag = req->get_query_param("tag");
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -25,6 +25,7 @@ target_sources(scylla_auth
    service.cc
    standard_role_manager.cc
    transitional.cc
+    maintenance_socket_authenticator.cc
    maintenance_socket_role_manager.cc)
 target_include_directories(scylla_auth
  PUBLIC
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,19 +9,9 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 constexpr std::string_view allow_all_authenticator_name("org.apache.cassandra.auth.AllowAllAuthenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        allow_all_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
-
 }
--- a/auth/allow_all_authorizer.cc
+++ b/auth/allow_all_authorizer.cc
@@ -9,18 +9,9 @@
 #include "auth/allow_all_authorizer.hh"

 #include "auth/common.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 constexpr std::string_view allow_all_authorizer_name("org.apache.cassandra.auth.AllowAllAuthorizer");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-    authorizer,
-    allow_all_authorizer,
-    cql3::query_processor&,
-    ::service::raft_group0_client&,
-    ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthorizer");
-
 }
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -110,15 +110,23 @@ future<> cache::prune(const resource& r) {
 future<> cache::reload_all_permissions() noexcept {
    SCYLLA_ASSERT(_permission_loader);
    auto units = co_await get_units(_loading_sem, 1, _as);
+    auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) {
+        std::vector<resource> keys;
+        keys.reserve(m.size());
+        for (const auto& [res, _] : m) {
+            keys.push_back(res);
+        }
+        return keys;
+    };
    const role_or_anonymous anon;
-    for (auto& [res, perms] : _anonymous_permissions) {
-        perms = co_await _permission_loader(anon, res);
+    for (const auto& res : copy_keys(_anonymous_permissions)) {
+        _anonymous_permissions[res] = co_await _permission_loader(anon, res);
    }
    for (auto& [role, entry] : _roles) {
        auto& perms_cache = entry->cached_permissions;
        auto r = role_or_anonymous(role);
-        for (auto& [res, perms] : perms_cache) {
-            perms = co_await _permission_loader(r, res);
+        for (const auto& res : copy_keys(perms_cache)) {
+            perms_cache[res] = co_await _permission_loader(r, res);
        }
    }
    logger.debug("Reloaded auth cache with {} entries", _roles.size());
@@ -228,6 +236,7 @@ future<> cache::load_all() {
        co_await distribute_role(name, role);
    }
    co_await container().invoke_on_others([this](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        c._current_version = _current_version;
        co_await c.prune_all();
    });
@@ -287,10 +296,11 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {

 future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    auto role_ptr = role.get();
-    co_await container().invoke_on_others([&name, role_ptr](cache& c) {
+    co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        if (!role_ptr) {
            c.remove_role(name);
-            return;
+            co_return;
        }
        auto role_copy = make_lw_shared<role_record>(*role_ptr);
        c.add_role(name, std::move(role_copy));
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -13,14 +13,11 @@
 #include <boost/regex.hpp>
 #include <fmt/ranges.h>

-#include "utils/class_registrator.hh"
 #include "utils/to_string.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "cql3/query_processor.hh"
 #include "db/config.hh"

-static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
-const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);

 static logging::logger clogger("certificate_authenticator");

@@ -30,13 +27,6 @@ static const std::string cfg_query_attr = "query";
 static const std::string cfg_source_subject = "SUBJECT";
 static const std::string cfg_source_altname = "ALTNAME";

-static const class_registrator<auth::authenticator
-    , auth::certificate_authenticator
-    , cql3::query_processor&
-    , ::service::raft_group0_client&
-    , ::service::migration_manager&
-    , auth::cache&> cert_auth_reg(CERT_AUTH_NAME);
-
 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };
@@ -99,7 +89,7 @@ future<> auth::certificate_authenticator::stop() {
 }

 std::string_view auth::certificate_authenticator::qualified_java_name() const {
-    return certificate_authenticator_name;
+    return "com.scylladb.auth.CertificateAuthenticator"; // literal has static storage, so the returned view stays valid
 }

 bool auth::certificate_authenticator::require_authentication() const {
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -27,8 +27,6 @@ namespace auth {

 class cache;

-extern const std::string_view certificate_authenticator_name;
-
 class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -26,7 +26,6 @@ extern "C" {
 #include "cql3/untyped_result_set.hh"
 #include "exceptions/exceptions.hh"
 #include "utils/log.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

@@ -40,14 +39,6 @@ static constexpr std::string_view PERMISSIONS_NAME = "permissions";

 static logging::logger alogger("default_authorizer");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authorizer,
-        default_authorizer,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.CassandraAuthorizer");
-
 default_authorizer::default_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
        : _qp(qp)
        , _migration_manager(mm) {
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -24,7 +24,6 @@
 #include "exceptions/exceptions.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"
 #include "db/config.hh"
 #include "utils/exponential_backoff_retry.hh"

@@ -72,20 +71,10 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
    return values;
 }

-const char* ldap_role_manager_full_name = "com.scylladb.auth.LDAPRoleManager";
-
 } // anonymous namespace

 namespace auth {

-static const class_registrator<
-    role_manager,
-    ldap_role_manager,
-    cql3::query_processor&,
-    ::service::raft_group0_client&,
-    ::service::migration_manager&,
-    cache&> registration(ldap_role_manager_full_name);
-
 ldap_role_manager::ldap_role_manager(
        std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
        uint32_t permissions_update_interval_in_ms,
@@ -115,7 +104,7 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
 }

 std::string_view ldap_role_manager::qualified_java_name() const noexcept {
-    return ldap_role_manager_full_name;
+    return "com.scylladb.auth.LDAPRoleManager"; // literal has static storage, so the returned view stays valid
 }

 const resource_set& ldap_role_manager::protected_resources() const {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -57,8 +57,7 @@ class ldap_role_manager : public role_manager {
            cache& cache ///< Passed to standard_role_manager.
    );

-    /// Retrieves LDAP configuration entries from qp and invokes the other constructor.  Required by
-    /// class_registrator<role_manager>.
+    /// Retrieves LDAP configuration entries from qp and invokes the other constructor.
    ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache);

    /// Thrown when query-template parsing fails.
--- a/auth/maintenance_socket_authenticator.cc
+++ b/auth/maintenance_socket_authenticator.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#include "auth/maintenance_socket_authenticator.hh"
+
+
+namespace auth {
+
+// Nothing extra to release here; kept out of line to match the declaration in the header.
+maintenance_socket_authenticator::~maintenance_socket_authenticator() = default;
+
+future<> maintenance_socket_authenticator::start() { // overrides password_authenticator::start() without calling it
+    return make_ready_future<>(); // nothing to initialize here; completes immediately
+}
+
+future<> maintenance_socket_authenticator::ensure_superuser_is_created() const { // this override performs no superuser bootstrap
+    return make_ready_future<>(); // resolves immediately
+}
+
+bool maintenance_socket_authenticator::require_authentication() const {
+    return false; // maintenance-socket clients are not asked for credentials
+}
+
+} // namespace auth
--- a/auth/maintenance_socket_authenticator.hh
+++ b/auth/maintenance_socket_authenticator.hh
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include <seastar/core/shared_future.hh>
+
+#include "password_authenticator.hh"
+
+namespace auth {
+
+// Authenticator for clients that connect through the maintenance socket.
+// It never demands authentication, yet it keeps password_authenticator's
+// ability to manage roles and their credentials.
+class maintenance_socket_authenticator : public password_authenticator {
+public:
+    using password_authenticator::password_authenticator; // reuse the base-class constructors
+
+    virtual ~maintenance_socket_authenticator();
+
+    virtual future<> start() override; // returns a ready future (see the .cc definition)
+
+    virtual future<> ensure_superuser_is_created() const override; // returns a ready future as well
+
+    virtual bool require_authentication() const override; // always false
+};
+
+} // namespace auth
+
--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -13,23 +13,48 @@
 #include <string_view>
 #include "auth/cache.hh"
 #include "cql3/description.hh"
-#include "utils/class_registrator.hh"
+#include "utils/log.hh"
+#include "utils/on_internal_error.hh"

 namespace auth {

-constexpr std::string_view maintenance_socket_role_manager_name = "com.scylladb.auth.MaintenanceSocketRoleManager";
+static logging::logger log("maintenance_socket_role_manager");

-static const class_registrator<
-        role_manager,
-        maintenance_socket_role_manager,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration(sstring{maintenance_socket_role_manager_name});
+future<> maintenance_socket_role_manager::ensure_role_operations_are_enabled() {
+    if (_is_maintenance_mode) { // maintenance mode never gets a delegate
+        on_internal_error(log, "enabling role operations not allowed in maintenance mode");
+    }

+    if (_std_mgr) { // enabling twice is treated as an internal error
+        on_internal_error(log, "role operations are already enabled");
+    }
+    // Construct the standard_role_manager delegate in place and hand back its start() future.
+    auto& delegate = _std_mgr.emplace(_qp, _group0_client, _migration_manager, _cache);
+    return delegate.start();
+}
+
+void maintenance_socket_role_manager::set_maintenance_mode() {
+    if (_std_mgr) { // too late: the delegate already exists, and the two states are mutually exclusive
+        on_internal_error(log, "cannot enter maintenance mode after role operations have been enabled");
+    }
+    _is_maintenance_mode = true; // from here on validate_operation() fails with the maintenance-mode error
+}
+
+maintenance_socket_role_manager::maintenance_socket_role_manager(
+        cql3::query_processor& qp,
+        ::service::raft_group0_client& group0_client,
+        ::service::migration_manager& migration_manager,
+        cache& auth_cache)
+    : _qp(qp)
+    , _group0_client(group0_client)
+    , _migration_manager(migration_manager)
+    , _cache(auth_cache) // the four references are only stored; the delegate is built from them later
+    , _std_mgr() // empty until ensure_role_operations_are_enabled() creates the delegate
+    , _is_maintenance_mode(false) {
+}

 std::string_view maintenance_socket_role_manager::qualified_java_name() const noexcept {
-    return maintenance_socket_role_manager_name;
+    return "com.scylladb.auth.MaintenanceSocketRoleManager"; // literal has static storage, so the returned view stays valid
 }

 const resource_set& maintenance_socket_role_manager::protected_resources() const {
@@ -43,81 +68,161 @@ future<> maintenance_socket_role_manager::start() {
 }

 future<> maintenance_socket_role_manager::stop() {
-    return make_ready_future<>();
+    return _std_mgr ? _std_mgr->stop() : make_ready_future<>(); // only a created delegate needs stopping
 }

 future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
-    return make_ready_future<>();
+    return _std_mgr ? _std_mgr->ensure_superuser_is_created() : make_ready_future<>(); // no-op until the delegate has been created
 }

 template<typename T = void>
-future<T> operation_not_supported_exception(std::string_view operation) {
+future<T> operation_not_available_in_maintenance_mode_exception(std::string_view operation) { // builds an already-failed future<T>
     return make_exception_future<T>(
-        std::runtime_error(fmt::format("role manager: {} operation not supported through maintenance socket", operation)));
+        std::runtime_error(fmt::format("role manager: {} operation not available through maintenance socket in maintenance mode", operation)));
 }

-future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config&, ::service::group0_batch&) {
-    return operation_not_supported_exception("CREATE");
+template<typename T = void>
+future<T> manager_not_ready_exception(std::string_view operation) { // already-failed future<T>: no delegate yet
+    const auto what = fmt::format("role manager: {} operation not available because manager not ready yet (role operations not enabled)", operation);
+    return make_exception_future<T>(std::runtime_error(what));
+}
+
+future<> maintenance_socket_role_manager::validate_operation(std::string_view name) const {
+    // Maintenance mode is tested first, so it takes precedence over the "not ready" diagnosis.
+    if (_is_maintenance_mode) {
+        return operation_not_available_in_maintenance_mode_exception(name);
+    }
+    // Outside maintenance mode the operation may proceed only once the delegate exists.
+    return _std_mgr.has_value() ? make_ready_future<>()
+                                : manager_not_ready_exception(name);
+}
+
+future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
+    auto f = validate_operation("CREATE"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->create(role_name, c, mc); // safe: a passed validation means _std_mgr is engaged
 }

 future<> maintenance_socket_role_manager::drop(std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("DROP");
+    auto f = validate_operation("DROP"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->drop(role_name, mc); // delegate to the standard_role_manager
 }

-future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) {
-    return operation_not_supported_exception("ALTER");
+future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) {
+    auto f = validate_operation("ALTER"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->alter(role_name, u, mc); // delegate to the standard_role_manager
 }

 future<> maintenance_socket_role_manager::grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("GRANT");
+    auto f = validate_operation("GRANT"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->grant(grantee_name, role_name, mc); // delegate to the standard_role_manager
 }

 future<> maintenance_socket_role_manager::revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("REVOKE");
+    auto f = validate_operation("REVOKE"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->revoke(revokee_name, role_name, mc); // delegate to the standard_role_manager
 }

-future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
-    return operation_not_supported_exception<role_set>("QUERY GRANTED");
+future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query m) {
+    auto f = validate_operation("QUERY GRANTED"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<role_set>(f.get_exception()); // f is future<>; re-wrap its error in the declared return type
+    }
+    return _std_mgr->query_granted(grantee_name, m); // delegate to the standard_role_manager
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
-    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state& qs) {
+    auto f = validate_operation("QUERY ALL DIRECTLY GRANTED"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<role_to_directly_granted_map>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->query_all_directly_granted(qs); // delegate to the standard_role_manager
 }

-future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
-    return operation_not_supported_exception<role_set>("QUERY ALL");
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state& qs) {
+    auto f = validate_operation("QUERY ALL"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<role_set>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->query_all(qs); // delegate to the standard_role_manager
 }

 future<bool> maintenance_socket_role_manager::exists(std::string_view role_name) {
-    return operation_not_supported_exception<bool>("EXISTS");
+    auto f = validate_operation("EXISTS"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->exists(role_name); // delegate to the standard_role_manager
 }

 future<bool> maintenance_socket_role_manager::is_superuser(std::string_view role_name) {
-    return make_ready_future<bool>(true);
+    auto f = validate_operation("IS SUPERUSER"); // NOTE(review): this used to return true unconditionally; it now fails whenever validation fails - confirm callers expect that
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->is_superuser(role_name); // the answer now comes from the standard_role_manager
 }

 future<bool> maintenance_socket_role_manager::can_login(std::string_view role_name) {
-    return make_ready_future<bool>(true);
+    auto f = validate_operation("CAN LOGIN"); // NOTE(review): this used to return true unconditionally; it now fails whenever validation fails - confirm callers expect that
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->can_login(role_name); // the answer now comes from the standard_role_manager
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
-    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    auto f = validate_operation("GET ATTRIBUTE"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<std::optional<sstring>>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->get_attribute(role_name, attribute_name, qs); // delegate to the standard_role_manager
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
-    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    auto f = validate_operation("QUERY ATTRIBUTE FOR ALL"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<role_manager::attribute_vals>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->query_attribute_for_all(attribute_name, qs); // delegate to the standard_role_manager
 }

 future<> maintenance_socket_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("SET ATTRIBUTE");
+    auto f = validate_operation("SET ATTRIBUTE"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->set_attribute(role_name, attribute_name, attribute_value, mc); // delegate to the standard_role_manager
 }

 future<> maintenance_socket_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("REMOVE ATTRIBUTE");
+    auto f = validate_operation("REMOVE ATTRIBUTE"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return f; // propagate the failed future unchanged
+    }
+    return _std_mgr->remove_attribute(role_name, attribute_name, mc); // delegate to the standard_role_manager
 }

 future<std::vector<cql3::description>> maintenance_socket_role_manager::describe_role_grants() {
-    return operation_not_supported_exception<std::vector<cql3::description>>("DESCRIBE SCHEMA WITH INTERNALS");
+    auto f = validate_operation("DESCRIBE ROLE GRANTS"); // fails in maintenance mode or while the delegate is missing
+    if (f.failed()) {
+        return make_exception_future<std::vector<cql3::description>>(f.get_exception()); // re-wrap the error in the declared return type
+    }
+    return _std_mgr->describe_role_grants(); // delegate to the standard_role_manager
 }

 } // namespace auth
--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -11,6 +11,7 @@
 #include "auth/cache.hh"
 #include "auth/resource.hh"
 #include "auth/role_manager.hh"
+#include "auth/standard_role_manager.hh"
 #include <seastar/core/future.hh>

 namespace cql3 {
@@ -24,13 +25,26 @@ class raft_group0_client;

 namespace auth {

-extern const std::string_view maintenance_socket_role_manager_name;
-
-// This role manager is used by the maintenance socket. It has disabled all role management operations to not depend on
-// system_auth keyspace, which may be not yet created when the maintenance socket starts listening.
+// This role manager is used by the maintenance socket. In maintenance mode all role management
+// operations are disabled. In normal mode it delegates all operations to a standard_role_manager,
+// which is created on demand once the node has joined the cluster.
 class maintenance_socket_role_manager final : public role_manager {
+    cql3::query_processor& _qp;
+    ::service::raft_group0_client& _group0_client;
+    ::service::migration_manager& _migration_manager;
+    cache& _cache;
+    std::optional<standard_role_manager> _std_mgr;
+    bool _is_maintenance_mode;
+
 public:
-    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {}
+    void set_maintenance_mode() override;
+
+    // Ensures role management operations are enabled.
+    // It must be called once the node has joined the cluster.
+    // In the meantime all role management operations will fail.
+    future<> ensure_role_operations_are_enabled() override;
+
+    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    virtual std::string_view qualified_java_name() const noexcept override;

@@ -42,21 +56,21 @@ public:

    virtual future<> ensure_superuser_is_created() override;

-    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;
+    virtual future<> create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;

-    virtual future<> alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) override;
+    virtual future<> alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) override;

    virtual future<> grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) override;

    virtual future<> revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) override;

-    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
+    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query m) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& qs) override;

-    virtual future<role_set> query_all(::service::query_state&) override;
+    virtual future<role_set> query_all(::service::query_state& qs) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -64,15 +78,19 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

    virtual future<> remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) override;

    virtual future<std::vector<cql3::description>> describe_role_grants() override;
+
+private:
+    future<> validate_operation(std::string_view name) const;
+
 };

 }
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -26,7 +26,6 @@
 #include "cql3/untyped_result_set.hh"
 #include "utils/log.hh"
 #include "service/migration_manager.hh"
-#include "utils/class_registrator.hh"
 #include "replica/database.hh"
 #include "cql3/query_processor.hh"
 #include "db/config.hh"
@@ -37,27 +36,18 @@ constexpr std::string_view password_authenticator_name("org.apache.cassandra.aut

 // name of the hash column.
 static constexpr std::string_view SALTED_HASH = "salted_hash";
-static constexpr std::string_view DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
 static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);

 static logging::logger plogger("password_authenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        password_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
-
 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

-static std::string_view get_config_value(std::string_view value, std::string_view def) {
-    return value.empty() ? def : value;
-}
-std::string password_authenticator::default_superuser(const db::config& cfg) {
-    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
+std::string password_authenticator::default_superuser(cql3::query_processor& qp) {
+    if (legacy_mode(qp)) { // legacy auth ignores the configured name and uses the built-in default
+        return std::string(meta::DEFAULT_SUPERUSER_NAME);
+    }
+
+    return qp.db().get_config().auth_superuser_name(); // may be empty; see the _superuser.empty() checks in this file
 }

 password_authenticator::~password_authenticator() {
@@ -69,7 +59,6 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
    , _migration_manager(mm)
    , _cache(cache)
    , _stopped(make_ready_future<>()) 
-    , _superuser(default_superuser(qp.db().get_config()))
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -123,11 +112,14 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 }

 future<> password_authenticator::legacy_create_default_if_missing() {
+    if (_superuser.empty()) {
+        on_internal_error(plogger, "Legacy auth default superuser name is empty");
+    }
    const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
    if (exists) {
        co_return;
    }
-    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    std::string salted_pwd(_qp.db().get_config().auth_superuser_salted_password());
    if (salted_pwd.empty()) {
        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
@@ -147,6 +139,9 @@ future<> password_authenticator::legacy_create_default_if_missing() {

 future<> password_authenticator::maybe_create_default_password() {
    auto needs_password = [this] () -> future<bool> {
+        if (_superuser.empty()) {
+            co_return false;
+        }
        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
        auto results = co_await _qp.execute_internal(query,
                db::consistency_level::LOCAL_ONE,
@@ -178,9 +173,9 @@ future<> password_authenticator::maybe_create_default_password() {
        co_return;
    }
    // Set default superuser's password.
-    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    std::string salted_pwd(_qp.db().get_config().auth_superuser_salted_password());
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
+        co_return;
    }
    const auto update_query = update_row_query();
    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
@@ -210,6 +205,8 @@ future<> password_authenticator::maybe_create_default_password_with_retries() {

 future<> password_authenticator::start() {
    return once_among_shards([this] {
+        _superuser = default_superuser(_qp);
+
        // Verify that at least one hashing scheme is supported.
        passwords::detail::verify_scheme(_scheme);
        plogger.info("Using password hashing scheme: {}", passwords::detail::prefix_for_scheme(_scheme));
@@ -217,6 +214,9 @@ future<> password_authenticator::start() {
        _stopped = do_after_system_ready(_as, [this] {
            return async([this] {
                if (legacy_mode(_qp)) {
+                    if (_superuser.empty()) {
+                        on_internal_error(plogger, "Legacy auth default superuser name is empty");
+                    }
                    if (!_superuser_created_promise.available()) {
                        // Counterintuitively, we mark promise as ready before any startup work
                        // because wait_for_schema_agreement() below will block indefinitely
@@ -251,6 +251,9 @@ future<> password_authenticator::start() {
        });

        if (legacy_mode(_qp)) {
+            if (_superuser.empty()) {
+                on_internal_error(plogger, "Legacy auth default superuser name is empty");
+            }
            static const sstring create_roles_query = fmt::format(
                    "CREATE TABLE {}.{} ("
                    "  {} text PRIMARY KEY,"
@@ -280,7 +283,7 @@ future<> password_authenticator::stop() {
 db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
    // TODO: this is plain dung. Why treat hardcoded default special, but for example a user-created
    // super user uses plain LOCAL_ONE?
-    if (role_name == DEFAULT_USER_NAME) {
+    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
        return db::consistency_level::QUORUM;
    }
    return db::consistency_level::LOCAL_ONE;
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -51,7 +51,7 @@ class password_authenticator : public authenticator {

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
-    static std::string default_superuser(const db::config&);
+    static std::string default_superuser(cql3::query_processor& qp);

    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -112,6 +112,11 @@ public:

    virtual future<> stop() = 0;

+    ///
+    /// Notify the role manager that maintenance mode is starting. The default implementation does nothing.
+    ///
+    virtual void set_maintenance_mode() {}
+
    ///
    /// Ensure that superuser role exists.
    ///
@@ -119,6 +124,11 @@ public:
    ///
    virtual future<> ensure_superuser_is_created() = 0;

+    ///
+    /// Ensure role management operations are enabled. Some role managers may defer initialization; by default this is a no-op.
+    ///
+    virtual future<> ensure_role_operations_are_enabled() { return make_ready_future<>(); }
+
    ///
    /// \returns an exceptional future with \ref role_already_exists for a role that has previously been created.
    ///
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -22,21 +22,11 @@
 #include "db/config.hh"
 #include "utils/log.hh"
 #include "seastarx.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 static logging::logger mylog("saslauthd_authenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        saslauthd_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
-
 saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -16,6 +16,8 @@
 #include <algorithm>
 #include <chrono>

+#include <boost/algorithm/string.hpp>
+
 #include <seastar/core/future-util.hh>
 #include <seastar/core/shard_id.hh>
 #include <seastar/core/sharded.hh>
@@ -23,8 +25,17 @@

 #include "auth/allow_all_authenticator.hh"
 #include "auth/allow_all_authorizer.hh"
+#include "auth/certificate_authenticator.hh"
 #include "auth/common.hh"
+#include "auth/default_authorizer.hh"
+#include "auth/ldap_role_manager.hh"
+#include "auth/maintenance_socket_authenticator.hh"
+#include "auth/maintenance_socket_role_manager.hh"
+#include "auth/password_authenticator.hh"
 #include "auth/role_or_anonymous.hh"
+#include "auth/saslauthd_authenticator.hh"
+#include "auth/standard_role_manager.hh"
+#include "auth/transitional.hh"
 #include "cql3/functions/functions.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/description.hh"
@@ -43,7 +54,6 @@
 #include "service/raft/raft_group0_client.hh"
 #include "mutation/timestamp.hh"
 #include "utils/assert.hh"
-#include "utils/class_registrator.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "data_dictionary/keyspace_metadata.hh"
 #include "service/storage_service.hh"
@@ -176,8 +186,9 @@ service::service(
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
        ::service::migration_notifier& mn,
-        ::service::migration_manager& mm,
-        const service_config& sc,
+        authorizer_factory authorizer_factory,
+        authenticator_factory authenticator_factory,
+        role_manager_factory role_manager_factory,
        maintenance_socket_enabled used_by_maintenance_socket,
        cache& cache)
            : service(
@@ -185,9 +196,9 @@ service::service(
                      qp,
                      g0,
                      mn,
-                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
-                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
+                      authorizer_factory(),
+                      authenticator_factory(),
+                      role_manager_factory(),
                      used_by_maintenance_socket) {
 }

@@ -307,6 +318,10 @@ future<permission_set> service::get_permissions(const role_or_anonymous& maybe_r
    return _cache.get_permissions(maybe_role, r);
 }

+void service::set_maintenance_mode() { // Service-level entry point for the maintenance-mode notification.
+    _role_manager->set_maintenance_mode(); // Forwarded unchanged to the role manager held by this service.
+}
+
 future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
    for (const auto& role : roles) {
        if (co_await _role_manager->is_superuser(role)) {
@@ -342,6 +357,10 @@ static void validate_authentication_options_are_supported(
    }
 }

+future<> service::ensure_role_operations_are_enabled() { // Resolves (or fails) exactly as the role manager's future does.
+    return _role_manager->ensure_role_operations_are_enabled(); // No extra logic here: the role manager decides.
+}
+
 future<> service::create_role(std::string_view name,
        const role_config& config,
        const authentication_options& options,
@@ -659,6 +678,10 @@ future<std::vector<cql3::description>> service::describe_auth(bool with_hashed_p
 // Free functions.
 //

+void set_maintenance_mode(service& ser) { // Free-function form of service::set_maintenance_mode().
+    ser.set_maintenance_mode();
+}
+
 future<bool> has_superuser(const service& ser, const authenticated_user& u) {
    if (is_anonymous(u)) {
        return make_ready_future<bool>(false);
@@ -667,6 +690,10 @@ future<bool> has_superuser(const service& ser, const authenticated_user& u) {
    return ser.has_superuser(*u.name);
 }

+future<> ensure_role_operations_are_enabled(service& ser) { // Free-function form of service::ensure_role_operations_are_enabled().
+    return ser.ensure_role_operations_are_enabled(); // Use the service API, as set_maintenance_mode(service&) does, not the role manager directly.
+}
+
 future<role_set> get_roles(const service& ser, const authenticated_user& u) {
    if (is_anonymous(u)) {
        return make_ready_future<role_set>();
@@ -928,4 +955,111 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
            std::nullopt);
 }

+namespace {
+
+/// Strips the package qualifier from a class name ("a.b.Name" -> "Name").
+std::string_view get_short_name(std::string_view name) {
+    // rfind yields npos when there is no dot; the name is then returned as-is.
+    const auto last_dot = name.rfind('.');
+    const bool qualified = last_dot != std::string_view::npos;
+    return qualified ? name.substr(last_dot + 1) : name;
+}
+
+} // anonymous namespace
+
+authorizer_factory make_authorizer_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm) {
+    std::string_view short_name = get_short_name(name); // Drop any "pkg." prefix so qualified and bare names both match.
+    // Names are compared case-insensitively. Each factory captures qp/g0/mm by reference and calls .local() only when invoked.
+    if (boost::iequals(short_name, "AllowAllAuthorizer")) {
+        return [&qp, &g0, &mm] {
+            return std::make_unique<allow_all_authorizer>(qp.local(), g0, mm.local());
+        };
+    } else if (boost::iequals(short_name, "CassandraAuthorizer")) {
+        return [&qp, &g0, &mm] {
+            return std::make_unique<default_authorizer>(qp.local(), g0, mm.local());
+        };
+    } else if (boost::iequals(short_name, "TransitionalAuthorizer")) {
+        return [&qp, &g0, &mm] {
+            return std::make_unique<transitional_authorizer>(qp.local(), g0, mm.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown authorizer: {}", name)); // Message carries the name as passed in, not the shortened form.
+}
+
+authenticator_factory make_authenticator_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    std::string_view short_name = get_short_name(name); // Drop any "pkg." prefix so qualified and bare names both match.
+    // Names are compared case-insensitively. Each factory captures its arguments by reference and calls .local() only when invoked.
+    if (boost::iequals(short_name, "AllowAllAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<allow_all_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "PasswordAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<password_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "CertificateAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<certificate_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "SaslauthdAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<saslauthd_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "TransitionalAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<transitional_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown authenticator: {}", name)); // Message carries the name as passed in, not the shortened form.
+}
+
+role_manager_factory make_role_manager_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    std::string_view short_name = get_short_name(name); // Drop any "pkg." prefix so qualified and bare names both match.
+    // Names are compared case-insensitively. Each factory captures its arguments by reference and calls .local() only when invoked.
+    if (boost::iequals(short_name, "CassandraRoleManager")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<standard_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "LDAPRoleManager")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<ldap_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown role manager: {}", name)); // Message carries the name as passed in, not the shortened form.
+}
+
+authenticator_factory make_maintenance_socket_authenticator_factory( // No name lookup: always yields maintenance_socket_authenticator.
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    return [&qp, &g0, &mm, &auth_cache] { // By-reference captures: the referenced services must outlive the returned factory.
+        return std::make_unique<maintenance_socket_authenticator>(qp.local(), g0, mm.local(), auth_cache.local()); // .local() is evaluated at call time.
+    };
+}
+
+role_manager_factory make_maintenance_socket_role_manager_factory( // No name lookup: always yields maintenance_socket_role_manager.
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    return [&qp, &g0, &mm, &auth_cache] { // By-reference captures: the referenced services must outlive the returned factory.
+        return std::make_unique<maintenance_socket_role_manager>(qp.local(), g0, mm.local(), auth_cache.local()); // .local() is evaluated at call time.
+    };
+}
+
 }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -44,11 +44,10 @@ namespace auth {

 class role_or_anonymous;

-struct service_config final {
-    sstring authorizer_java_name;
-    sstring authenticator_java_name;
-    sstring role_manager_java_name;
-};
+/// Factory function types for creating auth module instances on each shard.
+using authorizer_factory = std::function<std::unique_ptr<authorizer>()>;
+using authenticator_factory = std::function<std::unique_ptr<authenticator>()>;
+using role_manager_factory = std::function<std::unique_ptr<role_manager>()>;

 ///
 /// Due to poor (in this author's opinion) decisions of Apache Cassandra, certain choices of one role-manager,
@@ -108,15 +107,16 @@ public:

    ///
    /// This constructor is intended to be used when the class is sharded via \ref seastar::sharded. In that case, the
-    /// arguments must be copyable, which is why we delay construction with instance-construction instructions instead
+    /// arguments must be copyable, which is why we delay construction with instance-construction factories instead
    /// of the instances themselves.
    ///
    service(
            cql3::query_processor&,
            ::service::raft_group0_client&,
            ::service::migration_notifier&,
-            ::service::migration_manager&,
-            const service_config&,
+            authorizer_factory,
+            authenticator_factory,
+            role_manager_factory,
            maintenance_socket_enabled,
            cache&);

@@ -138,6 +138,11 @@ public:
    ///
    future<permission_set> get_uncached_permissions(const role_or_anonymous&, const resource&) const;

+    ///
+    /// Notify the service that the node is entering maintenance mode.
+    ///
+    void set_maintenance_mode();
+
    ///
    /// Query whether the named role has been granted a role that is a superuser.
    ///
@@ -147,6 +152,11 @@ public:
    ///
    future<bool> has_superuser(std::string_view role_name) const;

+    ///
+    /// Ensure that role operations are enabled (delegates to the role manager). Some role managers defer initialization.
+    ///
+    future<> ensure_role_operations_are_enabled();
+
    ///
    /// Create a role with optional authentication information.
    ///
@@ -208,8 +218,12 @@ private:
    future<std::vector<cql3::description>> describe_permissions() const;
 };

+void set_maintenance_mode(service&);
+
 future<bool> has_superuser(const service&, const authenticated_user&);

+future<> ensure_role_operations_are_enabled(service&);
+
 future<role_set> get_roles(const service&, const authenticated_user&);

 future<permission_set> get_permissions(const service&, const authenticated_user&, const resource&);
@@ -396,4 +410,52 @@ future<> commit_mutations(service& ser, ::service::group0_batch&& mc);
 // Migrates data from old keyspace to new one which supports linearizable writes via raft.
 future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as);

+///
+/// Factory helper functions for creating auth module instances.
+/// These are intended for use with sharded<service>::start() where copyable arguments are required.
+/// The returned factories capture the sharded references and call .local() when invoked on each shard.
+///
+
+/// Creates an authorizer factory for config-selectable authorizer types.
+/// @param name The authorizer class name (e.g., "CassandraAuthorizer", "AllowAllAuthorizer")
+authorizer_factory make_authorizer_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm);
+
+/// Creates an authenticator factory for config-selectable authenticator types.
+/// @param name The authenticator class name (e.g., "PasswordAuthenticator", "AllowAllAuthenticator")
+authenticator_factory make_authenticator_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a role_manager factory for config-selectable role manager types.
+/// @param name The role manager class name (e.g., "CassandraRoleManager", "LDAPRoleManager")
+role_manager_factory make_role_manager_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a factory for the maintenance socket authenticator.
+/// This authenticator is not config-selectable and is only used for the maintenance socket.
+authenticator_factory make_maintenance_socket_authenticator_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a factory for the maintenance socket role manager.
+/// This role manager is not config-selectable and is only used for the maintenance socket.
+role_manager_factory make_maintenance_socket_role_manager_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
 }
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -34,7 +34,6 @@
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"
 #include "service/migration_manager.hh"
 #include "password_authenticator.hh"
 #include "utils/managed_string.hh"
@@ -44,14 +43,6 @@ namespace auth {

 static logging::logger log("standard_role_manager");

-static const class_registrator<
-        role_manager,
-        standard_role_manager,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");
-
 static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
        return db::consistency_level::QUORUM;
@@ -123,7 +114,6 @@ standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::servic
    , _migration_manager(mm)
    , _cache(cache)
    , _stopped(make_ready_future<>())
-    , _superuser(password_authenticator::default_superuser(qp.db().get_config()))
 {}

 std::string_view standard_role_manager::qualified_java_name() const noexcept {
@@ -186,6 +176,9 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
 }

 future<> standard_role_manager::legacy_create_default_role_if_missing() {
+    if (_superuser.empty()) {
+        on_internal_error(log, "Legacy auth default superuser name is empty");
+    }
    try {
        const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
@@ -209,6 +202,9 @@ future<> standard_role_manager::legacy_create_default_role_if_missing() {
 }

 future<> standard_role_manager::maybe_create_default_role() {
+    if (_superuser.empty()) {
+        co_return;
+    }
    auto has_superuser = [this] () -> future<bool> {
        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
        auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
@@ -300,6 +296,8 @@ future<> standard_role_manager::migrate_legacy_metadata() {

 future<> standard_role_manager::start() {
    return once_among_shards([this] () -> future<> {
+        _superuser = password_authenticator::default_superuser(_qp);
+
        if (legacy_mode(_qp)) {
            co_await create_legacy_metadata_tables_if_missing();
        }
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -8,244 +8,200 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "auth/transitional.hh"
 #include "auth/authenticated_user.hh"
-#include "auth/authenticator.hh"
-#include "auth/authorizer.hh"
 #include "auth/default_authorizer.hh"
 #include "auth/password_authenticator.hh"
-#include "auth/cache.hh"
 #include "auth/permission.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

-static const sstring PACKAGE_NAME("com.scylladb.auth.");
-
-static const sstring& transitional_authenticator_name() {
-    static const sstring name = PACKAGE_NAME + "TransitionalAuthenticator";
-    return name;
+transitional_authenticator::transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
+        : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
 }

-static const sstring& transitional_authorizer_name() {
-    static const sstring name = PACKAGE_NAME + "TransitionalAuthorizer";
-    return name;
+transitional_authenticator::transitional_authenticator(std::unique_ptr<authenticator> a)
+        : _authenticator(std::move(a)) {
 }

-class transitional_authenticator : public authenticator {
-    std::unique_ptr<authenticator> _authenticator;
+future<> transitional_authenticator::start() {
+    return _authenticator->start();
+}

-public:
-    static const sstring PASSWORD_AUTHENTICATOR_NAME;
+future<> transitional_authenticator::stop() {
+    return _authenticator->stop();
+}

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
+std::string_view transitional_authenticator::qualified_java_name() const {
+    return "com.scylladb.auth.TransitionalAuthenticator";
+}
+
+bool transitional_authenticator::require_authentication() const {
+    return true;
+}
+
+authentication_option_set transitional_authenticator::supported_options() const {
+    return _authenticator->supported_options();
+}
+
+authentication_option_set transitional_authenticator::alterable_options() const {
+    return _authenticator->alterable_options();
+}
+
+future<authenticated_user> transitional_authenticator::authenticate(const credentials_map& credentials) const {
+    auto i = credentials.find(authenticator::USERNAME_KEY);
+    if ((i == credentials.end() || i->second.empty())
+            && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+        // return anon user
+        return make_ready_future<authenticated_user>(anonymous_user());
    }
-    transitional_authenticator(std::unique_ptr<authenticator> a)
-            : _authenticator(std::move(a)) {
-    }
-
-    virtual future<> start() override {
-        return _authenticator->start();
-    }
-
-    virtual future<> stop() override {
-        return _authenticator->stop();
-    }
-
-    virtual std::string_view qualified_java_name() const override {
-        return transitional_authenticator_name();
-    }
-
-    virtual bool require_authentication() const override {
-        return true;
-    }
-
-    virtual authentication_option_set supported_options() const override {
-        return _authenticator->supported_options();
-    }
-
-    virtual authentication_option_set alterable_options() const override {
-        return _authenticator->alterable_options();
-    }
-
-    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override {
-        auto i = credentials.find(authenticator::USERNAME_KEY);
-        if ((i == credentials.end() || i->second.empty())
-                && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+    return make_ready_future().then([this, &credentials] {
+        return _authenticator->authenticate(credentials);
+    }).handle_exception([](auto ep) {
+        try {
+            std::rethrow_exception(ep);
+        } catch (const exceptions::authentication_exception&) {
            // return anon user
            return make_ready_future<authenticated_user>(anonymous_user());
        }
-        return make_ready_future().then([this, &credentials] {
-            return _authenticator->authenticate(credentials);
-        }).handle_exception([](auto ep) {
-            try {
-                std::rethrow_exception(ep);
-            } catch (const exceptions::authentication_exception&) {
-                // return anon user
-                return make_ready_future<authenticated_user>(anonymous_user());
-            }
-        });
-    }
-
-    virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
-        return _authenticator->create(role_name, options, mc);
-    }
-
-    virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
-        return _authenticator->alter(role_name, options, mc);
-    }
-
-    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override {
-        return _authenticator->drop(role_name, mc);
-    }
-
-    virtual future<custom_options> query_custom_options(std::string_view role_name) const override {
-        return _authenticator->query_custom_options(role_name);
-    }
-
-    virtual bool uses_password_hashes() const override {
-        return _authenticator->uses_password_hashes();
-    }
-
-    virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override {
-        return _authenticator->get_password_hash(role_name);
-    }
-
-    virtual const resource_set& protected_resources() const override {
-        return _authenticator->protected_resources();
-    }
-
-    virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override {
-        class sasl_wrapper : public sasl_challenge {
-        public:
-            sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
-                    : _sasl(std::move(sasl)) {
-            }
-
-            virtual bytes evaluate_response(bytes_view client_response) override {
-                try {
-                    return _sasl->evaluate_response(client_response);
-                } catch (const exceptions::authentication_exception&) {
-                    _complete = true;
-                    return {};
-                }
-            }
-
-            virtual bool is_complete() const override {
-                return _complete || _sasl->is_complete();
-            }
-
-            virtual future<authenticated_user> get_authenticated_user() const override {
-                return futurize_invoke([this] {
-                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
-                        try {
-                            std::rethrow_exception(ep);
-                        } catch (const exceptions::authentication_exception&) {
-                            // return anon user
-                            return make_ready_future<authenticated_user>(anonymous_user());
-                        }
-                    });
-                });
-	    }
-
-            const sstring& get_username() const override {
-                return _sasl->get_username();
-            }
-
-        private:
-            ::shared_ptr<sasl_challenge> _sasl;
-
-            bool _complete = false;
-        };
-        return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
-    }
-
-    virtual future<> ensure_superuser_is_created() const override {
-        return _authenticator->ensure_superuser_is_created();
-    }
-};
-
-class transitional_authorizer : public authorizer {
-    std::unique_ptr<authorizer> _authorizer;
-
-public:
-    transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-            : transitional_authorizer(std::make_unique<default_authorizer>(qp, g0, mm)) {
-    }
-    transitional_authorizer(std::unique_ptr<authorizer> a)
-            : _authorizer(std::move(a)) {
-    }
-
-    ~transitional_authorizer() {
-    }
-
-    virtual future<> start() override {
-        return _authorizer->start();
-    }
-
-    virtual future<> stop() override {
-        return _authorizer->stop();
-    }
-
-    virtual std::string_view qualified_java_name() const override {
-        return transitional_authorizer_name();
-    }
-
-    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
-        static const permission_set transitional_permissions =
-                permission_set::of<
-                        permission::CREATE,
-                        permission::ALTER,
-                        permission::DROP,
-                        permission::SELECT,
-                        permission::MODIFY>();
-
-        return make_ready_future<permission_set>(transitional_permissions);
-    }
-
-    virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc)  override {
-        return _authorizer->grant(s, std::move(ps), r, mc);
-    }
-
-    virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override {
-        return _authorizer->revoke(s, std::move(ps), r, mc);
-    }
-
-    virtual future<std::vector<permission_details>> list_all() const override {
-        return _authorizer->list_all();
-    }
-
-    virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override {
-        return _authorizer->revoke_all(s, mc);
-    }
-
-    virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override {
-        return _authorizer->revoke_all(r, mc);
-    }
-
-    virtual const resource_set& protected_resources() const override {
-        return _authorizer->protected_resources();
-    }
-};
-
+    });
 }

-//
-// To ensure correct initialization order, we unfortunately need to use string literals.
-//
+future<> transitional_authenticator::create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
+    return _authenticator->create(role_name, options, mc);
+}

-static const class_registrator<
-        auth::authenticator,
-        auth::transitional_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+future<> transitional_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
+    return _authenticator->alter(role_name, options, mc);
+}

-static const class_registrator<
-        auth::authorizer,
-        auth::transitional_authorizer,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&> transitional_authorizer_reg(auth::PACKAGE_NAME + "TransitionalAuthorizer");
+future<> transitional_authenticator::drop(std::string_view role_name, ::service::group0_batch& mc) {
+    return _authenticator->drop(role_name, mc);
+}
+
+future<custom_options> transitional_authenticator::query_custom_options(std::string_view role_name) const {
+    return _authenticator->query_custom_options(role_name);
+}
+
+bool transitional_authenticator::uses_password_hashes() const {
+    return _authenticator->uses_password_hashes();
+}
+
+future<std::optional<sstring>> transitional_authenticator::get_password_hash(std::string_view role_name) const {
+    return _authenticator->get_password_hash(role_name);
+}
+
+const resource_set& transitional_authenticator::protected_resources() const {
+    return _authenticator->protected_resources();
+}
+
+::shared_ptr<sasl_challenge> transitional_authenticator::new_sasl_challenge() const { // Returns the wrapped authenticator's challenge behind a lenient wrapper.
+    class sasl_wrapper : public sasl_challenge { // Turns authentication_exception from the inner challenge into anonymous access.
+    public:
+        sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
+                : _sasl(std::move(sasl)) {
+        }
+
+        virtual bytes evaluate_response(bytes_view client_response) override {
+            try {
+                return _sasl->evaluate_response(client_response);
+            } catch (const exceptions::authentication_exception&) {
+                _complete = true; // Mark the exchange finished so is_complete() reports true after a failure.
+                return {}; // Empty reply instead of propagating the failure.
+            }
+        }
+
+        virtual bool is_complete() const override {
+            return _complete || _sasl->is_complete(); // Complete either by swallowed failure or by the inner challenge's own state.
+        }
+
+        virtual future<authenticated_user> get_authenticated_user() const override {
+            return futurize_invoke([this] { // futurize_invoke turns a synchronous throw into a failed future.
+                return _sasl->get_authenticated_user().handle_exception([](auto ep) {
+                    try {
+                        std::rethrow_exception(ep); // Any exception type not caught below escapes this handler.
+                    } catch (const exceptions::authentication_exception&) {
+                        // Authentication failure is tolerated: fall back to the anonymous user.
+                        return make_ready_future<authenticated_user>(anonymous_user());
+                    }
+                });
+            });
+        }
+
+        const sstring& get_username() const override {
+            return _sasl->get_username();
+        }
+
+    private:
+        ::shared_ptr<sasl_challenge> _sasl;
+        // Set once evaluate_response() has swallowed an authentication_exception.
+        bool _complete = false;
+    };
+    return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
+}
+
+future<> transitional_authenticator::ensure_superuser_is_created() const {
+    return _authenticator->ensure_superuser_is_created();
+}
+
+transitional_authorizer::transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
+        : transitional_authorizer(std::make_unique<default_authorizer>(qp, g0, mm)) {
+}
+
+transitional_authorizer::transitional_authorizer(std::unique_ptr<authorizer> a)
+        : _authorizer(std::move(a)) {
+}
+
+transitional_authorizer::~transitional_authorizer() {
+}
+
+future<> transitional_authorizer::start() {
+    return _authorizer->start();
+}
+
+future<> transitional_authorizer::stop() {
+    return _authorizer->stop();
+}
+
+std::string_view transitional_authorizer::qualified_java_name() const {
+    return "com.scylladb.auth.TransitionalAuthorizer";
+}
+
+future<permission_set> transitional_authorizer::authorize(const role_or_anonymous&, const resource&) const { // Both arguments are ignored.
+    static const permission_set transitional_permissions = // Built once; the same set is returned for every role and resource.
+            permission_set::of<
+                    permission::CREATE,
+                    permission::ALTER,
+                    permission::DROP,
+                    permission::SELECT,
+                    permission::MODIFY>();
+    // The wrapped authorizer is not consulted here; the result is always ready.
+    return make_ready_future<permission_set>(transitional_permissions);
+}
+
+future<> transitional_authorizer::grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->grant(s, std::move(ps), r, mc);
+}
+
+future<> transitional_authorizer::revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->revoke(s, std::move(ps), r, mc);
+}
+
+future<std::vector<permission_details>> transitional_authorizer::list_all() const {
+    return _authorizer->list_all();
+}
+
+future<> transitional_authorizer::revoke_all(std::string_view s, ::service::group0_batch& mc) {
+    return _authorizer->revoke_all(s, mc);
+}
+
+future<> transitional_authorizer::revoke_all(const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->revoke_all(r, mc);
+}
+
+const resource_set& transitional_authorizer::protected_resources() const {
+    return _authorizer->protected_resources();
+}
+
+}
--- a/auth/transitional.hh
+++ b/auth/transitional.hh
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "auth/authenticator.hh"
+#include "auth/authorizer.hh"
+#include "auth/cache.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace service {
+class raft_group0_client;
+class migration_manager;
+}
+
+namespace auth {
+
+///
+/// Transitional authenticator that wraps another authenticator and allows anonymous access when
+/// credentials are not provided or authentication fails. Used for migration scenarios.
+///
+class transitional_authenticator : public authenticator {
+    std::unique_ptr<authenticator> _authenticator; // The wrapped authenticator that most operations delegate to.
+
+public:
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache); // Wraps a password_authenticator built from these arguments.
+    transitional_authenticator(std::unique_ptr<authenticator> a); // Wraps the given authenticator.
+
+    virtual future<> start() override;
+    virtual future<> stop() override;
+    virtual std::string_view qualified_java_name() const override; // "com.scylladb.auth.TransitionalAuthenticator"
+    virtual bool require_authentication() const override; // Always true.
+    virtual authentication_option_set supported_options() const override;
+    virtual authentication_option_set alterable_options() const override;
+    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override; // Anonymous user when username and password are both missing/empty, or on authentication_exception.
+    virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
+    virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
+    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
+    virtual future<custom_options> query_custom_options(std::string_view role_name) const override;
+    virtual bool uses_password_hashes() const override;
+    virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override;
+    virtual const resource_set& protected_resources() const override;
+    virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override; // Wrapped challenge that maps authentication_exception to the anonymous user.
+    virtual future<> ensure_superuser_is_created() const override;
+};
+
+///
+/// Transitional authorizer that grants a fixed set of permissions to all users, while permission
+/// management calls are forwarded to a wrapped authorizer. Used for migration scenarios.
+///
+class transitional_authorizer : public authorizer {
+    std::unique_ptr<authorizer> _authorizer; // The wrapped authorizer that grant/revoke/list operations delegate to.
+
+public:
+    transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm); // Wraps a default_authorizer built from these arguments.
+    transitional_authorizer(std::unique_ptr<authorizer> a); // Wraps the given authorizer.
+    ~transitional_authorizer();
+
+    virtual future<> start() override;
+    virtual future<> stop() override;
+    virtual std::string_view qualified_java_name() const override; // "com.scylladb.auth.TransitionalAuthorizer"
+    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override; // Always CREATE, ALTER, DROP, SELECT, MODIFY; arguments are ignored.
+    virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
+    virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
+    virtual future<std::vector<permission_details>> list_all() const override;
+    virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override;
+    virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override;
+    virtual const resource_set& protected_resources() const override;
+};
+
+} // namespace auth
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -618,7 +618,7 @@ static void set_default_properties_log_table(schema_builder& b, const schema& s,
    b.set_caching_options(caching_options::get_disabled_caching_options());

    auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
-    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata(), false));
+    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, false));
    b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
 }

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -598,8 +598,7 @@ protected:
    // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
    std::vector<sstables::shared_sstable> _used_garbage_collected_sstables;
    utils::observable<> _stop_request_observable;
-    // optional tombstone_gc_state that is used when gc has to check only the compacting sstables to collect tombstones.
-    std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
+    tombstone_gc_state _tombstone_gc_state;
    int64_t _output_repaired_at = 0;
 private:
    // Keeps track of monitors for input sstable.
@@ -649,9 +648,12 @@ protected:
        , _owned_ranges(std::move(descriptor.owned_ranges))
        , _sharder(descriptor.sharder)
        , _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
-        , _tombstone_gc_state_with_commitlog_check_disabled(descriptor.gc_check_only_compacting_sstables ? std::make_optional(_table_s.get_tombstone_gc_state().with_commitlog_check_disabled()) : std::nullopt)
+        , _tombstone_gc_state(_table_s.get_tombstone_gc_state())
        , _progress_monitor(progress_monitor)
    {
+        if (descriptor.gc_check_only_compacting_sstables) {
+            _tombstone_gc_state = _tombstone_gc_state.with_commitlog_check_disabled();
+        }
        std::unordered_set<sstables::run_id> ssts_run_ids;
        _contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (sstables::shared_sstable& sst) {
            return !ssts_run_ids.insert(sst->run_identifier()).second;
@@ -849,8 +851,8 @@ private:
        return _table_s.get_compaction_strategy().make_sstable_set(_table_s);
    }

-    const tombstone_gc_state& get_tombstone_gc_state() const {
-        return _tombstone_gc_state_with_commitlog_check_disabled ? _tombstone_gc_state_with_commitlog_check_disabled.value() : _table_s.get_tombstone_gc_state();
+    tombstone_gc_state get_tombstone_gc_state() const {
+        return _tombstone_gc_state;
    }

    future<> setup() {
@@ -1050,7 +1052,7 @@ private:
            return can_never_purge;
        }
        return [this] (const dht::decorated_key& dk, is_shadowable is_shadowable) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp, _tombstone_gc_state_with_commitlog_check_disabled.has_value(), is_shadowable);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp, !_tombstone_gc_state.is_commitlog_check_enabled(), is_shadowable);
        };
    }

--- a/compaction/compaction_group_view.hh
+++ b/compaction/compaction_group_view.hh
@@ -54,7 +54,7 @@ public:
    virtual future<> on_compaction_completion(compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
    virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
    virtual bool tombstone_gc_enabled() const noexcept = 0;
-    virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
+    virtual tombstone_gc_state get_tombstone_gc_state() const noexcept = 0;
    virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
    virtual const std::string get_group_id() const noexcept = 0;
    virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -778,6 +778,7 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
        cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -791,6 +792,7 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
        cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1040,7 +1042,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
        _compaction_controller.set_max_shares(max_shares);
    }))
    , _strategy_control(std::make_unique<strategy_control>(*this))
-    , _tombstone_gc_state(_shared_tombstone_gc_state) {
+{
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
    register_metrics();
    // Bandwidth throttling is node-wide, updater is needed on single shard
@@ -1064,7 +1066,7 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
    , _compaction_max_shares_observer(_cfg.max_shares.observe([] (const float& max_shares) {}))
    , _strategy_control(std::make_unique<strategy_control>(*this))
-    , _tombstone_gc_state(_shared_tombstone_gc_state) {
+{
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
    // No metric registration because this constructor is supposed to be used only by the testing
    // infrastructure.
@@ -1519,7 +1521,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
            | std::ranges::to<std::unordered_set>());
    };
-    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
+    const auto injected_threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold");
+    const auto threshold = injected_threshold.value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
+
    auto count = co_await num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1534,9 +1538,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
    auto& cstate = get_compaction_state(&t);
    try {
        while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait([this, &t] {
-                return !can_perform_regular_compaction(t);
-            });
+            co_await cstate.compaction_done.when();
        }
    } catch (const broken_condition_variable&) {
        co_return;
@@ -2387,6 +2389,8 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
    if (!c_state.gate.is_closed()) {
        auto close_gate = c_state.gate.close();
        co_await stop_ongoing_compactions(reason, &t);
+        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
+        co_await c_state.incremental_repair_lock.write_lock();
        co_await std::move(close_gate);
    }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -167,10 +167,6 @@ private:
    std::unique_ptr<strategy_control> _strategy_control;

    shared_tombstone_gc_state _shared_tombstone_gc_state;
-    // TODO: tombstone_gc_state should now have value semantics, but the code
-    // still uses it with reference semantics (inconsistently though).
-    // Drop this member, once the code is converted into using value semantics.
-    tombstone_gc_state _tombstone_gc_state;

    utils::disk_space_monitor::subscription _out_of_space_subscription;
 private:
@@ -456,10 +452,6 @@ public:

    compaction::strategy_control& get_strategy_control() const noexcept;

-    const tombstone_gc_state& get_tombstone_gc_state() const noexcept {
-        return _tombstone_gc_state;
-    };
-
    shared_tombstone_gc_state& get_shared_tombstone_gc_state() noexcept {
        return _shared_tombstone_gc_state;
    };
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -639,7 +639,7 @@ strict_is_not_null_in_views: true
 # * workdir: the node will open the maintenance socket on the path <scylla's workdir>/cql.m,
 #            where <scylla's workdir> is a path defined by the workdir configuration option,
 # * <socket path>: the node will open the maintenance socket on the path <socket path>.
-maintenance_socket: ignore
+maintenance_socket: workdir

 # If set to true, configuration parameters defined with LiveUpdate option can be updated in runtime with CQL
 # by updating system.config virtual table. If we don't want any configuration parameter to be changed in runtime
@@ -648,10 +648,9 @@ maintenance_socket: ignore
 # e.g. for cloud users, for whom scylla's configuration should be changed only by support engineers.
 # live_updatable_config_params_changeable_via_cql: true

-# ****************
-# *  GUARDRAILS  *
-# ****************
-
+#
+# Guardrails options
+#
 # Guardrails to warn or fail when Replication Factor is smaller/greater than the threshold.
 # Please note that the value of 0 is always allowed,
 # which means that having no replication at all, i.e. RF = 0, is always valid.
@@ -661,6 +660,27 @@ maintenance_socket: ignore
 # minimum_replication_factor_warn_threshold:  3
 # maximum_replication_factor_warn_threshold: -1
 # maximum_replication_factor_fail_threshold: -1
+#
+# Guardrails to warn about or disallow creating a keyspace with specific replication strategy.
+# Each of these 2 settings is a list storing replication strategies considered harmful.
+# The replication strategies to choose from are:
+# 1) SimpleStrategy,
+# 2) NetworkTopologyStrategy,
+# 3) LocalStrategy,
+# 4) EverywhereStrategy
+#
+# replication_strategy_warn_list:
+#  - SimpleStrategy
+# replication_strategy_fail_list:
+#
+# Guardrail to enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.
+# enable_create_table_with_compact_storage: false
+#
+# Guardrails to warn about or disallow writes at selected consistency levels.
+# Note: each warned write adds a warning to its CQL response, which can
+# significantly increase network traffic and decrease overall throughput.
+# write_consistency_levels_warned: []
+# write_consistency_levels_disallowed: []

 #
 # System information encryption settings
@@ -838,21 +858,6 @@ maintenance_socket: ignore
 #   key_namespace: <kmip key namespace> (optional)
 #

-# Guardrails to warn about or disallow creating a keyspace with specific replication strategy.
-# Each of these 2 settings is a list storing replication strategies considered harmful.
-# The replication strategies to choose from are:
-# 1) SimpleStrategy,
-# 2) NetworkTopologyStrategy,
-# 3) LocalStrategy,
-# 4) EverywhereStrategy
-#
-# replication_strategy_warn_list:
-#  - SimpleStrategy
-# replication_strategy_fail_list:
-
-# Guardrail to enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.
-# enable_create_table_with_compact_storage: false
-
 # Control tablets for new keyspaces.
 # Can be set to: disabled|enabled|enforced
 #
@@ -874,7 +879,16 @@ maintenance_socket: ignore
 # The `tablets` option cannot be changed using `ALTER KEYSPACE`.
 tablets_mode_for_new_keyspaces: enabled

-# Enforce RF-rack-valid keyspaces.
+# Require every tablet-enabled keyspace to be RF-rack-valid.
+#
+# A tablet-enabled keyspace is RF-rack-valid when, for each data center,
+# its replication factor (RF) is 0, 1, or exactly equal to the number of
+# racks in that data center. Setting the RF to the number of racks ensures
+# that a single rack failure never results in data unavailability.
+#
+# When set to true, CREATE KEYSPACE and ALTER KEYSPACE statements that
+# would produce an RF-rack-invalid keyspace are rejected.
+# When set to false, such statements are allowed but emit a warning.
 rf_rack_valid_keyspaces: false

 #
--- a/configure.py
+++ b/configure.py
@@ -1192,6 +1192,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/azure/identity/default_credentials.cc',
                'utils/gcp/gcp_credentials.cc',
                'utils/gcp/object_storage.cc',
+                'utils/gcp/object_storage_retry_strategy.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -1203,6 +1204,7 @@ scylla_core = (['message/messaging_service.cc',
                'gms/application_state.cc',
                'gms/inet_address.cc',
                'dht/i_partitioner.cc',
+                'dht/fixed_shard.cc',
                'dht/token.cc',
                'dht/murmur3_partitioner.cc',
                'dht/boot_strapper.cc',
@@ -1274,6 +1276,7 @@ scylla_core = (['message/messaging_service.cc',
                'auth/resource.cc',
                'auth/roles-metadata.cc',
                'auth/passwords.cc',
+                'auth/maintenance_socket_authenticator.cc',
                'auth/password_authenticator.cc',
                'auth/permission.cc',
                'auth/service.cc',
@@ -1339,6 +1342,7 @@ scylla_core = (['message/messaging_service.cc',
                'service/strong_consistency/groups_manager.cc',
                'service/strong_consistency/coordinator.cc',
                'service/strong_consistency/state_machine.cc',
+                'service/strong_consistency/raft_groups_storage.cc',
                'service/raft/group0_state_id_handler.cc',
                'service/raft/group0_state_machine.cc',
                'service/raft/group0_state_machine_merger.cc',
@@ -1360,7 +1364,6 @@ scylla_core = (['message/messaging_service.cc',
                'service/topology_state_machine.cc',
                'service/topology_mutation.cc',
                'service/topology_coordinator.cc',
-                'node_ops/node_ops_ctl.cc',
                'node_ops/task_manager_module.cc',
                'reader_concurrency_semaphore_group.cc',
                'utils/disk_space_monitor.cc',
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -874,8 +874,8 @@ cfamDefinition[cql3::statements::create_table_statement::raw_statement& expr]
    ;

 cfamColumns[cql3::statements::create_table_statement::raw_statement& expr]
-    @init { bool is_static=false; }
-    : k=ident v=comparatorType (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static); }
+    @init { bool is_static=false, is_ttl=false; }
+    : k=ident v=comparatorType (K_TTL {is_ttl = true;})? (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static, is_ttl); }
        (K_PRIMARY K_KEY { $expr.add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); })?
    | K_PRIMARY K_KEY '(' pkDef[expr] (',' c=ident { $expr.add_column_alias(c); } )* ')'
    ;
@@ -1042,6 +1042,7 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
        std::vector<alter_table_statement::column_change> column_changes;
        std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>, shared_ptr<cql3::column_identifier::raw>>> renames;
        auto attrs = std::make_unique<cql3::attributes::raw>();
+        shared_ptr<cql3::column_identifier::raw> ttl_change;
    }
    : K_ALTER K_COLUMNFAMILY cf=columnFamilyName
          ( K_ALTER id=cident K_TYPE v=comparatorType { type = alter_table_statement::type::alter; column_changes.emplace_back(alter_table_statement::column_change{id, v}); }
@@ -1060,9 +1061,11 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
          | K_RENAME                                  { type = alter_table_statement::type::rename; }
               id1=cident K_TO toId1=cident { renames.emplace_back(id1, toId1); }
               ( K_AND idn=cident K_TO toIdn=cident { renames.emplace_back(idn, toIdn); } )*
+          | K_TTL                                     { type = alter_table_statement::type::ttl; }
+               ( id=cident { ttl_change = id; } | K_NULL )
          )
    {
-        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs));
+        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs), std::move(ttl_change));
    }
    ;

@@ -2071,7 +2074,21 @@ vector_type returns [shared_ptr<cql3::cql3_type::raw> pt]
        {
            if ($d.text[0] == '-')
                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
-            $pt = cql3::cql3_type::raw::vector(t, std::stoul($d.text));
+            unsigned long parsed_dimension;
+            try {
+                parsed_dimension = std::stoul($d.text);
+            } catch (const std::exception& e) {
+                throw exceptions::invalid_request_exception(format("Invalid vector dimension: {}", $d.text));
+            }
+            static_assert(sizeof(unsigned long) >= sizeof(vector_dimension_t));
+            if (parsed_dimension == 0) {
+                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
+            }
+            if (parsed_dimension > cql3::cql3_type::MAX_VECTOR_DIMENSION) {
+                throw exceptions::invalid_request_exception(
+                        format("Vectors must have a dimension less than or equal to {}", cql3::cql3_type::MAX_VECTOR_DIMENSION));
+            }
+            $pt = cql3::cql3_type::raw::vector(t, static_cast<vector_dimension_t>(parsed_dimension));
        }
    ;

--- a/cql3/assignment_testable.hh
+++ b/cql3/assignment_testable.hh
@@ -27,7 +27,7 @@ public:

    struct vector_test_result {
        test_result result;
-        std::optional<size_t> dimension_opt;
+        std::optional<vector_dimension_t> dimension_opt;
    };

    static bool is_assignable(test_result tr) {
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -307,17 +307,14 @@ public:

 class cql3_type::raw_vector : public raw {
    shared_ptr<raw> _type;
-    size_t _dimension;
-
-    // This limitation is acquired from the maximum number of dimensions in OpenSearch. 
-    static constexpr size_t MAX_VECTOR_DIMENSION = 16000;
+    vector_dimension_t _dimension;

    virtual sstring to_string() const override {
        return seastar::format("vector<{}, {}>", _type, _dimension);
    }

 public:
-    raw_vector(shared_ptr<raw> type, size_t dimension)
+    raw_vector(shared_ptr<raw> type, vector_dimension_t dimension)
            : _type(std::move(type)), _dimension(dimension) {
    }

@@ -417,7 +414,7 @@ cql3_type::raw::tuple(std::vector<shared_ptr<raw>> ts) {
 }

 shared_ptr<cql3_type::raw>
-cql3_type::raw::vector(shared_ptr<raw> t, size_t dimension) {
+cql3_type::raw::vector(shared_ptr<raw> t, vector_dimension_t dimension) {
    return ::make_shared<raw_vector>(std::move(t), dimension);
 }

--- a/cql3/cql3_type.hh
+++ b/cql3/cql3_type.hh
@@ -39,6 +39,9 @@ public:
    data_type get_type() const { return _type; }
    const sstring& to_string() const { return _type->cql3_type_name(); }

+    // This limit is derived from the maximum number of vector dimensions supported by OpenSearch.
+    static constexpr vector_dimension_t MAX_VECTOR_DIMENSION = 16000;
+
    // For UserTypes, we need to know the current keyspace to resolve the
    // actual type used, so Raw is a "not yet prepared" CQL3Type.
    class raw {
@@ -64,7 +67,7 @@ public:
        static shared_ptr<raw> list(shared_ptr<raw> t);
        static shared_ptr<raw> set(shared_ptr<raw> t);
        static shared_ptr<raw> tuple(std::vector<shared_ptr<raw>> ts);
-        static shared_ptr<raw> vector(shared_ptr<raw> t, size_t dimension);
+        static shared_ptr<raw> vector(shared_ptr<raw> t, vector_dimension_t dimension);
        static shared_ptr<raw> frozen(shared_ptr<raw> t);
        friend sstring format_as(const raw& r) {
            return r.to_string();
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -502,8 +502,8 @@ vector_validate_assignable_to(const collection_constructor& c, data_dictionary::
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

-    size_t expected_size = vt->get_dimension();
-    if (!expected_size) {
+    vector_dimension_t expected_size = vt->get_dimension();
+    if (expected_size == 0) {
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {}: type {} expects at least one element",
                                                            *receiver.name, receiver.type->as_cql3_type()));
    }
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,15 +10,16 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
-#include <span>
 #include <bit>
+#include <span>
+#include <seastar/core/byteorder.hh>

 namespace cql3 {
 namespace functions {

 namespace detail {

-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
+std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension) {
    if (!param) {
        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
    }
@@ -30,14 +31,10 @@ std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension
                       expected_size, dimension, param->size()));
    }

-    std::vector<float> result;
-    result.reserve(dimension);
-
-    bytes_view view(*param);
+    std::vector<float> result(dimension);
+    const char* p = reinterpret_cast<const char*>(param->data());
    for (size_t i = 0; i < dimension; ++i) {
-        // read_simple handles network byte order (big-endian) conversion
-        uint32_t raw = read_simple<uint32_t>(view);
-        result.push_back(std::bit_cast<float>(raw));
+        result[i] = std::bit_cast<float>(consume_be<uint32_t>(p));
    }

    return result;
@@ -55,13 +52,14 @@ namespace {
 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
 float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
-    double dot_product = 0.0;
-    double squared_norm_a = 0.0;
-    double squared_norm_b = 0.0;
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float dot_product = 0.0;
+    float squared_norm_a = 0.0;
+    float squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        float a = v1[i];
+        float b = v2[i];

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -69,7 +67,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+        return std::numeric_limits<float>::quiet_NaN();
    }

    // The cosine similarity is in the range [-1, 1].
@@ -79,13 +77,14 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
 }

 float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
-    double sum = 0.0;
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        float a = v1[i];
+        float b = v2[i];

-        double diff = a - b;
+        float diff = a - b;
        sum += diff * diff;
    }

@@ -98,11 +97,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl
 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
 float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
-    double dot_product = 0.0;
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        float a = v1[i];
+        float b = v2[i];
        dot_product += a * b;
    }

@@ -156,7 +156,7 @@ std::vector<data_type> retrieve_vector_arg_types(const function_name& name, cons
        }
    }

-    size_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
+    vector_dimension_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
    auto type = vector_type_impl::get_instance(float_type, dimension);
    return {type, type};
 }
@@ -170,7 +170,7 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)

    // Extract dimension from the vector type
    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
-    size_t dimension = type.get_dimension();
+    vector_dimension_t dimension = type.get_dimension();

    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -39,7 +39,7 @@ namespace detail {
 // Extract float vector directly from serialized bytes, bypassing data_value overhead.
 // This is an internal API exposed for testing purposes.
 // Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
+std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension);

 } // namespace detail

--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -91,7 +91,11 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
        , _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
        , _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
        , _lang_manager(langm)
+        , _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) { _write_consistency_levels_warned = to_consistency_level_set(v); }))
+        , _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) { _write_consistency_levels_disallowed = to_consistency_level_set(v); }))
        {
+    _write_consistency_levels_warned = to_consistency_level_set(_db.get_config().write_consistency_levels_warned());
+    _write_consistency_levels_disallowed = to_consistency_level_set(_db.get_config().write_consistency_levels_disallowed());
    namespace sm = seastar::metrics;
    namespace stm = statements;
    using clevel = db::consistency_level;
@@ -508,6 +512,32 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
                                            "i.e. attempts to set a forbidden replication strategy in a keyspace via CREATE/ALTER KEYSPACE.")).set_skip_when_empty(),
            });

+    std::vector<sm::metric_definition> cql_cl_group;
+    for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
+        cql_cl_group.push_back(
+            sm::make_counter(
+                "writes_per_consistency_level",
+                _cql_stats.writes_per_consistency_level[cl],
+                sm::description("Counts the number of writes for each consistency level."),
+                {cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
+    }
+    _metrics.add_group("cql", cql_cl_group);
+
+    _metrics.add_group("cql", {
+        sm::make_counter(
+            "write_consistency_levels_disallowed_violations",
+            _cql_stats.write_consistency_levels_disallowed_violations,
+            sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
+                            "i.e. attempts to write with a forbidden consistency level."),
+            {basic_level}),
+        sm::make_counter(
+            "write_consistency_levels_warned_violations",
+            _cql_stats.write_consistency_levels_warned_violations,
+            sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
+                            "i.e. attempts to write with a discouraged consistency level."),
+            {basic_level}),
+    });
+
    _mnotifier.register_listener(_migration_subscriber.get());
 }

@@ -1233,6 +1263,14 @@ shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_s
    return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard, std::move(cached_fn_calls));
 }

+query_processor::consistency_level_set query_processor::to_consistency_level_set(const query_processor::cl_option_list& levels) { // Builds a consistency_level_set from a list of configured enum options.
+    query_processor::consistency_level_set result; // Default-constructed; presumably empty — confirm enum_set's default state.
+    for (const auto& opt : levels) {
+        result.set(static_cast<db::consistency_level>(opt)); // Cast each option to db::consistency_level and mark it; a repeated option just marks the same entry again.
+    }
+    return result;
+}
+
 void query_processor::update_authorized_prepared_cache_config() {
    utils::loading_cache_config cfg;
    cfg.max_size = _mcfg.authorized_prepared_cache_size;
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -34,6 +34,9 @@
 #include "service/raft/raft_group0_client.hh"
 #include "types/types.hh"
 #include "db/auth_version.hh"
+#include "db/consistency_level_type.hh"
+#include "db/config.hh"
+#include "utils/enum_option.hh"
 #include "service/storage_proxy_fwd.hh"


@@ -142,6 +145,30 @@ private:
    std::unordered_map<sstring, std::unique_ptr<statements::prepared_statement>> _internal_statements;

    lang::manager& _lang_manager;
+
+    using cl_option_list = std::vector<enum_option<db::consistency_level_restriction_t>>;
+
+    /// Efficient bitmask-based set of consistency levels.
+    using consistency_level_set = enum_set<super_enum<db::consistency_level,
+        db::consistency_level::ANY,
+        db::consistency_level::ONE,
+        db::consistency_level::TWO,
+        db::consistency_level::THREE,
+        db::consistency_level::QUORUM,
+        db::consistency_level::ALL,
+        db::consistency_level::LOCAL_QUORUM,
+        db::consistency_level::EACH_QUORUM,
+        db::consistency_level::SERIAL,
+        db::consistency_level::LOCAL_SERIAL,
+        db::consistency_level::LOCAL_ONE>>;
+
+
+    consistency_level_set _write_consistency_levels_warned;
+    consistency_level_set _write_consistency_levels_disallowed;
+    utils::observer<cl_option_list> _write_consistency_levels_warned_observer;
+    utils::observer<cl_option_list> _write_consistency_levels_disallowed_observer;
+
+    static consistency_level_set to_consistency_level_set(const cl_option_list& levels);
 public:
    static const sstring CQL_VERSION;

@@ -493,6 +520,21 @@ public:
            int32_t page_size = -1,
            service::node_local_only node_local_only = service::node_local_only::no) const;

+    enum class write_consistency_guardrail_state { NONE, WARN, FAIL };
+    // Counts one write at `cl`, then classifies it; the disallowed set is checked before the warned set.
+    inline write_consistency_guardrail_state check_write_consistency_levels_guardrail(db::consistency_level cl) {
+        _cql_stats.writes_per_consistency_level[size_t(cl)]++;
+        if (_write_consistency_levels_disallowed.contains(cl)) [[unlikely]] {
+            _cql_stats.write_consistency_levels_disallowed_violations++;
+            return write_consistency_guardrail_state::FAIL;
+        }
+        if (!_write_consistency_levels_warned.contains(cl)) [[likely]] {
+            return write_consistency_guardrail_state::NONE;
+        }
+        _cql_stats.write_consistency_levels_warned_violations++;
+        return write_consistency_guardrail_state::WARN;
+    }
+
 private:
    // Keep the holder until you stop using the `remote` services.
    std::pair<std::reference_wrapper<remote>, gate::holder> remote();
--- a/cql3/query_result_printer.hh
+++ b/cql3/query_result_printer.hh
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <ostream>
+
+namespace cql3 {
+
+class result;
+
+void print_query_results_text(std::ostream& os, const result& result); // padded text table: header, dashed rule, then rows
+void print_query_results_json(std::ostream& os, const result& result); // JSON array with one object per row, keyed by column name
+
+} // namespace cql3
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -9,8 +9,10 @@
 */

 #include <cstdint>
+#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
+#include "utils/rjson.hh"
 #include "cql3/result_set.hh"

 namespace cql3 {
@@ -195,4 +197,85 @@ make_empty_metadata() {
    return empty_metadata_cache;
 }

+void print_query_results_text(std::ostream& os, const cql3::result& result) {
+    const auto& meta = result.get_metadata();
+    const auto& specs = meta.get_names();
+
+    // One output column: its rendered cells (header first) and the widest cell seen so far.
+    struct text_column {
+        std::vector<sstring> cells;
+        size_t width{0};
+        sstring header_format;
+        sstring row_format;
+
+        void add(sstring cell) {
+            width = std::max(width, cell.size());
+            cells.push_back(std::move(cell));
+        }
+    };
+
+    std::vector<text_column> table(specs.size());
+    for (size_t c = 0; c < specs.size(); ++c) {
+        table[c].add(specs[c]->name->text());
+    }
+    for (const auto& row : result.result_set().rows()) {
+        for (size_t c = 0; c < row.size(); ++c) {
+            if (!row[c]) {
+                table[c].add("");
+                continue;
+            }
+            table[c].add(specs[c]->type->to_string(linearized(managed_bytes_view(*row[c]))));
+        }
+    }
+
+    // Widths are final here: derive the padded format specs and the dashed rule segment per column.
+    std::vector<sstring> rule;
+    for (auto& col : table) {
+        col.header_format = seastar::format(" {{:<{}}} ", col.width);
+        col.row_format = seastar::format(" {{:>{}}} ", col.width);
+        sstring dashes;
+        for (size_t n = col.width; n > 0; --n) {
+            dashes += "-";
+        }
+        rule.push_back(std::move(dashes));
+    }
+
+    const auto line_count = result.result_set().rows().size() + 1;
+    for (size_t line = 0; line < line_count; ++line) {
+        std::vector<sstring> cells;
+        cells.reserve(table.size());
+        for (const auto& col : table) {
+            const auto& spec = line == 0 ? col.header_format : col.row_format;
+            cells.push_back(fmt::format(fmt::runtime(std::string_view(spec)), col.cells[line]));
+        }
+        fmt::print(os, "{}\n", fmt::join(cells, "|"));
+        if (line == 0) {
+            fmt::print(os, "-{}-\n", fmt::join(rule, "-+-"));
+        }
+    }
+}
+
+void print_query_results_json(std::ostream& os, const cql3::result& result) {
+    const auto& meta = result.get_metadata();
+    const auto& specs = meta.get_names();
+    // Streams a JSON array with one object per row, keyed by column name; absent or empty cells are written as null.
+    rjson::streaming_writer out(os);
+    out.StartArray();
+    for (const auto& row : result.result_set().rows()) {
+        out.StartObject();
+        for (size_t c = 0; c < row.size(); ++c) {
+            const auto& cell = row[c];
+            out.Key(specs[c]->name->text());
+            if (cell && !cell->empty()) {
+                const auto json_text = to_json_string(*specs[c]->type, *cell);
+                const auto json_kind = to_json_type(*specs[c]->type, *cell);
+                out.RawValue(json_text, json_kind);
+            } else {
+                out.Null();
+            }
+        }
+        out.EndObject();
+    }
+    out.EndArray();
+}
+
 }
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -212,11 +212,20 @@ public:
    }

    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_post_processing(c);
+        auto it = std::find_if(_selectors.begin(), _selectors.end(), [&c](const expr::expression& e) {
+            auto col = expr::as_if<expr::column_value>(&e);
+            return col && col->col == &c;
+        });
+        if (it != _selectors.end()) {
+            return std::distance(_selectors.begin(), it);
+        }
+
+        add_column(c);
+        get_result_metadata()->add_non_serialized_column(c.column_specification);
        _selectors.push_back(expr::column_value(&c));
        if (_inner_loop.empty()) {
            // Simple case: no aggregation
-            return index;
+            return _selectors.size() - 1;
        } else {
            // Complex case: aggregation, must pass through temporary
            auto first_func = cql3::functions::aggregate_fcts::make_first_function(c.type);
@@ -470,10 +479,21 @@ std::vector<const column_definition*> selection::wildcard_columns(schema_ptr sch
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_post_processing(const column_definition& c) {
+selection::add_column_result selection::add_column(const column_definition& c) {
+    auto index = index_of(c);
+    if (index != -1) { // any value other than -1 is taken as c's existing position
+        return {index, false}; // reuse that position; nothing is appended
+    }
     _columns.push_back(&c);
-    _metadata->add_non_serialized_column(c.column_specification);
-    return _columns.size() - 1;
+    return {_columns.size() - 1, true}; // c was just appended, so it is the last entry of _columns
+}
+
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
+    auto col = add_column(c); // appends c to _columns unless it is already there
+    if (col.added) { // only a newly appended column gets a non-serialized metadata entry
+        _metadata->add_non_serialized_column(c.column_specification);
+    }
+    return col.index; // position of c in _columns, whether new or pre-existing
 }

 ::shared_ptr<selection> selection::from_selectors(data_dictionary::database db, schema_ptr schema, const sstring& ks, const std::vector<prepared_selector>& prepared_selectors) {
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -130,6 +130,14 @@ public:
    virtual std::vector<shared_ptr<functions::function>> used_functions() const { return {}; }

    query::partition_slice::option_set get_query_options();
+protected:
+    // Result of add_column(): where the column sits in _columns and whether this call put it there.
+    struct add_column_result {
+        uint32_t index; // position of the column in _columns
+        bool added; // true if the call appended the column, false if it was already present
+    };
+    // Appends c to _columns unless it is already present; returns its index and whether it was appended.
+    add_column_result add_column(const column_definition& c);
 private:
    static bool processes_selection(const std::vector<prepared_selector>& prepared_selectors);

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -10,6 +10,7 @@

 #include "cdc/log.hh"
 #include "index/vector_index.hh"
+#include "types/types.hh"
 #include "utils/assert.hh"
 #include <seastar/core/coroutine.hh>
 #include "cql3/query_options.hh"
@@ -30,6 +31,9 @@
 #include "cql3/query_processor.hh"
 #include "cdc/cdc_extension.hh"
 #include "cdc/cdc_partitioner.hh"
+#include "db/tags/extension.hh"
+#include "db/tags/utils.hh"
+#include "alternator/ttl_tag.hh"

 namespace cql3 {

@@ -43,7 +47,8 @@ alter_table_statement::alter_table_statement(uint32_t bound_terms,
                                             std::vector<column_change> column_changes,
                                             std::optional<cf_prop_defs> properties,
                                             renames_type renames,
-                                             std::unique_ptr<attributes> attrs)
+                                             std::unique_ptr<attributes> attrs,
+                                             shared_ptr<column_identifier::raw> ttl_change)
    : schema_altering_statement(std::move(name))
    , _bound_terms(bound_terms)
    , _type(t)
@@ -51,6 +56,7 @@ alter_table_statement::alter_table_statement(uint32_t bound_terms,
    , _properties(std::move(properties))
    , _renames(std::move(renames))
    , _attrs(std::move(attrs))
+    , _ttl_change(std::move(ttl_change))
 {
 }

@@ -380,6 +386,21 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
            throw exceptions::invalid_request_exception("Cannot drop columns from a non-CQL3 table");
        }
        invoke_column_change_fn(std::mem_fn(&alter_table_statement::drop_column));
+
+        // If we dropped the column used for per-row TTL, we need to remove the tag.
+        if (std::optional<std::string> ttl_column = db::find_tag(*s, TTL_TAG_KEY)) {
+            for (auto& [raw_name, raw_validator, is_static] : _column_changes) {
+                if (*ttl_column == raw_name->text()) {
+                    const std::map<sstring, sstring>* tags_ptr = db::get_tags_of_table(s);
+                    if (tags_ptr) {
+                        std::map<sstring, sstring> tags_map = *tags_ptr;
+                        tags_map.erase(TTL_TAG_KEY);
+                        cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
+                    }
+                    break;
+                }
+            }
+        }
        break;

    case alter_table_statement::type::opts:
@@ -434,6 +455,7 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        break;

    case alter_table_statement::type::rename:
+    {
        for (auto&& entry : _renames) {
            auto from = entry.first->prepare_column_identifier(*s);
            auto to = entry.second->prepare_column_identifier(*s);
@@ -470,6 +492,53 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        }
        return make_pair(std::move(new_base_schema), std::move(view_updates));
    }
+    case alter_table_statement::type::ttl:
+        if (!db.features().cql_row_ttl) {
+            throw exceptions::invalid_request_exception("The CQL per-row TTL feature is not yet supported by this cluster. Upgrade all nodes to use it.");
+        }
+        if (_ttl_change) {
+            // Enable per-row TTL with chosen column for expiration time
+            const column_definition *cdef = 
+                s->get_column_definition(to_bytes(_ttl_change->text()));
+            if (!cdef) {
+                throw exceptions::invalid_request_exception(fmt::format("Column '{}' does not exist in table {}.{}", _ttl_change->text(), keyspace(), column_family()));
+            }
+            if (cdef->type != timestamp_type && cdef->type != long_type && cdef->type != int32_type) {
+                throw exceptions::invalid_request_exception(fmt::format("TTL column {} must be of type timestamp, bigint or int, can't be {}", _ttl_change->text(), cdef->type->as_cql3_type().to_string()));
+            }
+            if (cdef->is_primary_key()) {
+                throw exceptions::invalid_request_exception(fmt::format("Cannot use a primary key column {} as a TTL column", _ttl_change->text()));
+            }
+            if (cdef->is_static()) {
+                throw exceptions::invalid_request_exception(fmt::format("Cannot use a static column {} as a TTL column", _ttl_change->text()));
+            }
+            std::optional<std::string> old_ttl_column = db::find_tag(*s, TTL_TAG_KEY);
+            if (old_ttl_column) {
+                throw exceptions::invalid_request_exception(fmt::format("Cannot set TTL column, table {}.{} already has a TTL column defined: {}", keyspace(), column_family(), *old_ttl_column));
+            }
+            const std::map<sstring, sstring>* old_tags_ptr = db::get_tags_of_table(s);
+            std::map<sstring, sstring> tags_map;
+            if (old_tags_ptr) {
+                // old_tags_ptr is a constant pointer to schema data. To modify
+                // it, we must make a copy.
+                tags_map = *old_tags_ptr;
+            }
+            tags_map[TTL_TAG_KEY] = _ttl_change->text();
+            cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
+        } else {
+            // Disable per-row TTL
+            const std::map<sstring, sstring>* tags_ptr = db::get_tags_of_table(s);
+            if (!tags_ptr || tags_ptr->find(TTL_TAG_KEY) == tags_ptr->end()) {
+                throw exceptions::invalid_request_exception(fmt::format("Cannot unset TTL column, table {}.{} does not have a TTL column set", keyspace(), column_family()));
+            }
+            // tags_ptr is a constant pointer to schema data. To modify it, we
+            // must make a copy.
+            std::map<sstring, sstring> tags_map = *tags_ptr;
+            tags_map.erase(TTL_TAG_KEY);
+            cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
+        }
+        break;
+    }

    return make_pair(cfm.build(), std::move(view_updates));
 }
@@ -508,13 +577,15 @@ alter_table_statement::raw_statement::raw_statement(cf_name name,
                                                    std::vector<column_change> column_changes,
                                                    std::optional<cf_prop_defs> properties,
                                                    renames_type renames,
-                                                    std::unique_ptr<attributes::raw> attrs)
+                                                    std::unique_ptr<attributes::raw> attrs,
+                                                    shared_ptr<column_identifier::raw> ttl_change)
    : cf_statement(std::move(name))
    , _type(t)
    , _column_changes(std::move(column_changes))
    , _properties(std::move(properties))
    , _renames(std::move(renames))
    , _attrs(std::move(attrs))
+    , _ttl_change(std::move(ttl_change))
    {}

 std::unique_ptr<cql3::statements::prepared_statement>
@@ -539,7 +610,8 @@ alter_table_statement::raw_statement::prepare(data_dictionary::database db, cql_
                _column_changes,
                _properties,
                _renames,
-                std::move(prepared_attrs)
+                std::move(prepared_attrs),
+                _ttl_change
            ),
            ctx,
            // since alter table is `cql_statement_no_metadata` (it doesn't return any metadata when preparing)
--- a/cql3/statements/alter_table_statement.hh
+++ b/cql3/statements/alter_table_statement.hh
@@ -32,6 +32,7 @@ public:
        drop,
        opts,
        rename,
+        ttl,
    };
    using renames_type = std::vector<std::pair<shared_ptr<column_identifier::raw>,
                                               shared_ptr<column_identifier::raw>>>;
@@ -50,6 +51,7 @@ private:
    const std::optional<cf_prop_defs> _properties;
    const renames_type _renames;
    const std::unique_ptr<attributes> _attrs;
+    shared_ptr<column_identifier::raw> _ttl_change;
 public:
    alter_table_statement(uint32_t bound_terms,
                          cf_name name,
@@ -57,7 +59,8 @@ public:
                          std::vector<column_change> column_changes,
                          std::optional<cf_prop_defs> properties,
                          renames_type renames,
-                          std::unique_ptr<attributes> attrs);
+                          std::unique_ptr<attributes> attrs,
+                          shared_ptr<column_identifier::raw> ttl_change);

    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
@@ -78,6 +81,7 @@ class alter_table_statement::raw_statement : public raw::cf_statement {
    const std::optional<cf_prop_defs> _properties;
    const alter_table_statement::renames_type _renames;
    const std::unique_ptr<attributes::raw> _attrs;
+    shared_ptr<column_identifier::raw> _ttl_change;

 public:
    raw_statement(cf_name name,
@@ -85,7 +89,8 @@ public:
                  std::vector<column_change> column_changes,
                  std::optional<cf_prop_defs> properties,
                  renames_type renames,
-                  std::unique_ptr<attributes::raw> attrs);
+                  std::unique_ptr<attributes::raw> attrs,
+                  shared_ptr<column_identifier::raw> ttl_change);
    
    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -259,6 +259,15 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    if (options.getSerialConsistency() == null)
        throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+
+    const auto cl = options.get_consistency();
+    const query_processor::write_consistency_guardrail_state guardrail_state = qp.check_write_consistency_levels_guardrail(cl);
+    if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
+        return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
+                exceptions::invalid_request_exception(
+                        format("Consistency level {} is not allowed for write operations", cl)));
+    }
+
    for (size_t i = 0; i < _statements.size(); ++i) {
        _statements[i].statement->restrictions().validate_primary_key(options.for_statement(i));
    }
@@ -266,23 +275,31 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    if (_has_conditions) {
        ++_stats.cas_batches;
        _stats.statements_in_cas_batches += _statements.size();
-        return execute_with_conditions(qp, options, query_state);
+        return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
+            if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
+                result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
+            }
+            return result;
+        });
    }

    ++_stats.batches;
    _stats.statements_in_batches += _statements.size();

    auto timeout = db::timeout_clock::now() + get_timeout(query_state.get_client_state(), options);
-    return get_mutations(qp, options, timeout, local, now, query_state).then([this, &qp, &options, timeout, tr_state = query_state.get_trace_state(),
+    return get_mutations(qp, options, timeout, local, now, query_state).then([this, &qp, cl, timeout, tr_state = query_state.get_trace_state(),
                                                                                                                               permit = query_state.get_permit()] (utils::chunked_vector<mutation> ms) mutable {
-        return execute_without_conditions(qp, std::move(ms), options.get_consistency(), timeout, std::move(tr_state), std::move(permit));
-    }).then([] (coordinator_result<> res) {
+        return execute_without_conditions(qp, std::move(ms), cl, timeout, std::move(tr_state), std::move(permit));
+    }).then([guardrail_state, cl] (coordinator_result<> res) {
        if (!res) {
            return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
                    seastar::make_shared<cql_transport::messages::result_message::exception>(std::move(res).assume_error()));
        }
-        return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
-                make_shared<cql_transport::messages::result_message::void_message>());
+        auto result = make_shared<cql_transport::messages::result_message::void_message>();
+        if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
+            result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
+        }
+        return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
    });
 }

--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -30,6 +30,9 @@
 #include "service/storage_proxy.hh"
 #include "db/config.hh"
 #include "compaction/time_window_compaction_strategy.hh"
+#include "db/tags/extension.hh"
+#include "db/tags/utils.hh"
+#include "alternator/ttl_tag.hh"

 namespace cql3 {

@@ -41,10 +44,12 @@ create_table_statement::create_table_statement(cf_name name,
                                               ::shared_ptr<cf_prop_defs> properties,
                                               bool if_not_exists,
                                               column_set_type static_columns,
+                                               ::shared_ptr<column_identifier> ttl_column,
                                               const std::optional<table_id>& id)
    : schema_altering_statement{name}
    , _use_compact_storage(false)
    , _static_columns{static_columns}
+    , _ttl_column{ttl_column}
    , _properties{properties}
    , _if_not_exists{if_not_exists}
    , _id(id)
@@ -123,6 +128,13 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
 #endif

    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
+    // Remembering which column was designated as the TTL column for row-based
+    // TTL is done using a "tag" extension. If there is no TTL column,
+    // we don't need this extension at all.
+    if (_ttl_column) {
+        std::map<sstring, sstring> tags_map = {{TTL_TAG_KEY, _ttl_column->text()}};
+        builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
+    }
 }

 void create_table_statement::add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind) const
@@ -198,7 +210,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    }
    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

-    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());
+    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _ttl_column, _properties.properties()->get_id());

    bool ks_uses_tablets;
    try {
@@ -403,6 +415,27 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
        }
    }

+    // If a TTL column is defined, it must be a regular column - not a static
+    // column or part of the primary key.
+    if (_ttl_column) {
+        if (!db.features().cql_row_ttl) {
+            throw exceptions::invalid_request_exception("The CQL per-row TTL feature is not yet supported by this cluster. Upgrade all nodes to use it.");
+        }
+        for (const auto& alias : key_aliases) {
+            if (alias->text() == _ttl_column->text()) {
+                throw exceptions::invalid_request_exception(format("TTL column {} cannot be part of the PRIMARY KEY", alias->text()));
+            }
+        }
+        for (const auto& alias : _column_aliases) {
+            if (alias->text() == _ttl_column->text()) {
+                throw exceptions::invalid_request_exception(format("TTL column {} cannot be part of the PRIMARY KEY", alias->text()));
+            }
+        }
+        if (_static_columns.contains(_ttl_column)) {
+            throw exceptions::invalid_request_exception(format("TTL column {} cannot be a static column", _ttl_column->text()));
+        }
+    }
+
    return std::make_unique<prepared_statement>(audit_info(), stmt, std::move(stmt_warnings));
 }

@@ -425,12 +458,23 @@ data_type create_table_statement::raw_statement::get_type_and_remove(column_map_
    return _properties.get_reversable_type(*t, type);
 }

-void create_table_statement::raw_statement::add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static) {
+void create_table_statement::raw_statement::add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static, bool is_ttl) {
     _defined_names.emplace(def);
     _definitions.emplace(def, type);
     if (is_static) {
         _static_columns.emplace(def);
     }
+    if (is_ttl) { // caller marked this definition as the table's row-TTL column
+        if (_ttl_column) { // an earlier definition already claimed the TTL role
+            throw exceptions::invalid_request_exception(fmt::format("Cannot have more than one TTL column in a table. Saw {} and {}", _ttl_column->text(), def->text()));
+        }
+        // FIXME: find a way to check cql3_type::raw without fmt::format (the type is compared by its formatted name below)
+        auto type_name = fmt::format("{}", type);
+        if (type_name != "timestamp" && type_name != "bigint" && type_name != "int") {
+            throw exceptions::invalid_request_exception(fmt::format("TTL column '{}' must be of type timestamp, bigint or int, can't be {}", def->text(), type_name));
+        }
+        _ttl_column = def; // recorded only after both checks above have passed
+    }
 }

 void create_table_statement::raw_statement::add_key_aliases(const std::vector<::shared_ptr<column_identifier>> aliases) {
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -57,6 +57,7 @@ class create_table_statement : public schema_altering_statement {
                           shared_ptr_equal_by_value<column_identifier>>;
    column_map_type _columns;
    column_set_type _static_columns;
+    ::shared_ptr<column_identifier> _ttl_column; // for row-based TTL
    const ::shared_ptr<cf_prop_defs> _properties;
    const bool _if_not_exists;
    std::optional<table_id> _id;
@@ -65,6 +66,7 @@ public:
                           ::shared_ptr<cf_prop_defs> properties,
                           bool if_not_exists,
                           column_set_type static_columns,
+                           ::shared_ptr<column_identifier> ttl_column,
                           const std::optional<table_id>& id);

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
@@ -100,6 +102,7 @@ private:
    std::vector<std::vector<::shared_ptr<column_identifier>>> _key_aliases;
    std::vector<::shared_ptr<column_identifier>> _column_aliases;
    create_table_statement::column_set_type _static_columns;
+    ::shared_ptr<column_identifier> _ttl_column; // for row-based TTL

    std::multiset<::shared_ptr<column_identifier>,
            indirect_less<::shared_ptr<column_identifier>, column_identifier::text_comparator>> _defined_names;
@@ -116,7 +119,7 @@ public:

    data_type get_type_and_remove(column_map_type& columns, ::shared_ptr<column_identifier> t);

-    void add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static);
+    void add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static, bool is_ttl);

    void add_key_aliases(const std::vector<::shared_ptr<column_identifier>> aliases);

--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -659,8 +659,7 @@ future<std::vector<std::vector<managed_bytes_opt>>> schema_describe_statement::d
            auto& auth_service = *client_state.get_auth_service();

            if (config.with_hashed_passwords) {
-                const auto maybe_user = client_state.user();
-                if (!maybe_user || !co_await auth::has_superuser(auth_service, *maybe_user)) {
+                if (!co_await client_state.has_superuser()) {
                    co_await coroutine::return_exception(exceptions::unauthorized_exception(
                            "DESCRIBE SCHEMA WITH INTERNALS AND PASSWORDS can only be issued by a superuser"));
                }
--- a/cql3/statements/list_permissions_statement.cc
+++ b/cql3/statements/list_permissions_statement.cc
@@ -49,7 +49,7 @@ future<> cql3::statements::list_permissions_statement::check_access(query_proces
    const auto& as = *state.get_auth_service();
    const auto user = state.user();

-    return auth::has_superuser(as, *user).then([this, &as, user](bool has_super) {
+    return state.has_superuser().then([this, &as, user](bool has_super) {
        if (has_super) {
            return make_ready_future<>();
        }
--- a/cql3/statements/list_users_statement.cc
+++ b/cql3/statements/list_users_statement.cc
@@ -74,7 +74,7 @@ cql3::statements::list_users_statement::execute(query_processor& qp, service::qu
    const auto& cs = state.get_client_state();
    const auto& as = *cs.get_auth_service();

-    return auth::has_superuser(as, *cs.user()).then([&cs, &as, make_results = std::move(make_results)](bool has_superuser) mutable {
+    return cs.has_superuser().then([&cs, &as, make_results = std::move(make_results)](bool has_superuser) mutable {
        if (has_superuser) {
            return as.underlying_role_manager().query_all().then([&as, make_results = std::move(make_results)](std::unordered_set<sstring> roles) mutable {
                return make_results(as, std::move(roles));
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -268,10 +268,22 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs

    inc_cql_stats(qs.get_client_state().is_internal());

+    const auto cl = options.get_consistency();
+    const query_processor::write_consistency_guardrail_state guardrail_state = qp.check_write_consistency_levels_guardrail(cl);
+    if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
+        co_return coroutine::exception(
+                std::make_exception_ptr(exceptions::invalid_request_exception(
+                        format("Consistency level {} is not allowed for write operations", cl))));
+    }
+
    _restrictions->validate_primary_key(options);

    if (has_conditions()) {
-        co_return co_await execute_with_condition(qp, qs, options);
+        auto result = co_await execute_with_condition(qp, qs, options);
+        if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
+            result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
+        }
+        co_return result;
    }

    json_cache_opt json_cache = maybe_prepare_json_cache(options);
@@ -290,6 +302,9 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
    }

    auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
+    if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
+        result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
+    }
    if (keys_size_one) {
        auto&& table = s->table();
        if (_may_use_token_aware_routing && table.uses_tablets() && qs.get_client_state().is_protocol_extension_set(cql_transport::cql_protocol_extension::TABLETS_ROUTING_V1)) {
--- a/cql3/statements/role-management-statements.cc
+++ b/cql3/statements/role-management-statements.cc
@@ -94,7 +94,7 @@ future<> create_role_statement::check_access(query_processor& qp, const service:
            return;
        }

-        const bool has_superuser = auth::has_superuser(*state.get_auth_service(), *state.user()).get();
+        const bool has_superuser = state.has_superuser().get();

        if (_options.hashed_password && !has_superuser) {
            throw exceptions::unauthorized_exception("Only superusers can create a role with a hashed password.");
@@ -213,7 +213,7 @@ future<> alter_role_statement::check_access(query_processor& qp, const service::
        auto& as = *state.get_auth_service();

        const auto& user = *state.user();
-        const bool user_is_superuser = auth::has_superuser(as, user).get();
+        const bool user_is_superuser = state.has_superuser().get();

        if (_options.is_superuser) {
            if (!user_is_superuser) {
@@ -306,7 +306,7 @@ future<> drop_role_statement::check_access(query_processor& qp, const service::c

        auto& as = *state.get_auth_service();

-        const bool user_is_superuser = auth::has_superuser(as, *state.user()).get();
+        const bool user_is_superuser = state.has_superuser().get();

        const bool role_has_superuser = [this, &as] {
            try {
@@ -442,7 +442,7 @@ list_roles_statement::execute(query_processor& qp, service::query_state& state,
    const auto& cs = state.get_client_state();
    const auto& as = *cs.get_auth_service();

-    return auth::has_superuser(as, *cs.user()).then([this, &cs, &as, make_results = std::move(make_results)](bool super) mutable {
+    return cs.has_superuser().then([this, &cs, &as, make_results = std::move(make_results)](bool super) mutable {
        auto& rm = as.underlying_role_manager();
        const auto& a = as.underlying_authenticator();
        const auto query_mode = _recursive ? auth::recursive_role_query::yes : auth::recursive_role_query::no;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -2004,9 +2004,7 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(

    auto indexes = sim.list_indexes();
    auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
-        return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
-                       ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
-               (ind.target_column() == prepared_ann_ordering.first->name_as_text());
+        return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
    });

    if (it == indexes.end()) {
@@ -2759,11 +2757,7 @@ select_statement::ordering_comparator_type select_statement::get_ordering_compar
    // even if we don't
    // ultimately ship them to the client (CASSANDRA-4911).
    for (auto&& [column_def, is_descending] : orderings) {
-        auto index = selection.index_of(*column_def);
-        if (index < 0) {
-            index = selection.add_column_for_post_processing(*column_def);
-        }
-
+        auto index = selection.add_column_for_post_processing(*column_def);
        sorters.emplace_back(index, column_def->type);
    }

@@ -2866,9 +2860,7 @@ void select_statement::ensure_filtering_columns_retrieval(data_dictionary::datab
                                        selection::selection& selection,
                                        const restrictions::statement_restrictions& restrictions) {
    for (auto&& cdef : restrictions.get_column_defs_for_filtering(db)) {
-        if (!selection.has_column(*cdef)) {
-            selection.add_column_for_post_processing(*cdef);
-        }
+        selection.add_column_for_post_processing(*cdef);
    }
 }

--- a/cql3/stats.hh
+++ b/cql3/stats.hh
@@ -11,6 +11,7 @@
 #pragma once

 #include "cql3/statements/statement_type.hh"
+#include "db/consistency_level_type.hh"

 #include <cstdint>

@@ -87,6 +88,9 @@ struct cql_stats {

    uint64_t replication_strategy_warn_list_violations = 0;
    uint64_t replication_strategy_fail_list_violations = 0;
+    uint64_t writes_per_consistency_level[size_t(db::consistency_level::MAX_VALUE) + 1] = {};
+    uint64_t write_consistency_levels_disallowed_violations = 0;
+    uint64_t write_consistency_levels_warned_violations = 0;

 private:
    uint64_t _unpaged_select_queries[(size_t)ks_selector::SIZE] = {0ul};
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -55,8 +55,21 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
    return hash & ((1ULL << batchlog_shard_bits) - 1);
 }

+bool is_batchlog_v1(const schema& schema) { // true when `schema` is the v1 batchlog table
+    return schema.cf_name() == system_keyspace::BATCHLOG; // decided by table name alone; the keyspace is not checked here
+}
+
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    if (is_batchlog_v1(schema)) {
+        if (!id) {
+            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
+        }
+        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
+        auto ckey = clustering_key::make_empty();
+        return std::pair(std::move(pkey), std::move(ckey));
+    }
+
    auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});

    std::vector<bytes> ckey_components;
@@ -85,6 +98,14 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
    auto cdef_data = schema->get_column_definition(to_bytes("data"));
    m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));

+    if (is_batchlog_v1(*schema)) {
+        auto cdef_version = schema->get_column_definition(to_bytes("version"));
+        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
+
+        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
+        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
+    }
+
    return m;
 }

@@ -122,9 +143,10 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

-db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
+db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
+        , _fs(fs)
        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
@@ -300,23 +322,156 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
    });
 }

-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
-    co_await maybe_migrate_v1_to_v2();
+namespace {

-    typedef db_clock::rep clock_type;
+using clock_type = db_clock::rep;

+struct replay_stats { // per-batch-shard bookkeeping filled in by process_batch()
+    std::optional<db_clock::time_point> min_too_fresh; // earliest written_at among batches skipped as "too fresh"; empty if none were skipped
+    bool need_cleanup = false; // set once process_batch() has deleted a batchlog row for this shard
+};
+
+} // anonymous namespace
+
+static future<db::all_batches_replayed> process_batch( // replays (or skips) one batchlog row; the result says whether that row was fully handled
+        cql3::query_processor& qp, // used for table lookups and to reach the storage proxy
+        db::batchlog_manager::stats& stats, // write_attempts is bumped by the number of mutations sent
+        db::batchlog_manager::post_replay_cleanup cleanup, // when set, a failed send may move the batch to the failed_replay stage
+        utils::rate_limiter& limiter, // charged with the serialized batch size before each send
+        schema_ptr schema, // schema of the batchlog table `row` was read from (v1 or v2)
+        std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard, // keyed by batch shard; updated in place
+        const db_clock::time_point now, // reference time for the "too fresh" check
+        db_clock::duration replay_timeout, // minimum batch age before replay; may be lowered per table below
+        std::chrono::seconds write_timeout, // added to timeout_clock::now() to form the send deadline
+        const cql3::untyped_result_set::row& row) { // the batchlog row to process
+    const bool is_v1 = db::is_batchlog_v1(*schema);
+    const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage")); // "stage" is not read for v1 rows
+    const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard"); // "shard" is not read for v1 rows; they all map to shard 0
+    auto written_at = row.get_as<db_clock::time_point>("written_at");
+    auto id = row.get_as<utils::UUID>("id");
+    // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+    auto timeout = replay_timeout;
+
+    if (utils::get_local_injector().is_enabled("skip_batch_replay")) { // error-injection hook: report the batch as not replayed without touching it
+        blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+        co_return db::all_batches_replayed::no;
+    }
+
+    auto data = row.get_blob_unfragmented("data");
+
+    blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+    utils::chunked_vector<mutation> mutations;
+    bool send_failed = false;
+
+    auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second; // stats entry for this batch shard, created on first use
+
+    try {
+        utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
+        auto in = ser::as_input_stream(data);
+        while (in.size()) { // `data` holds a sequence of serialized canonical mutations
+            auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+            const auto tbl = qp.db().try_find_table(fm.column_family_id());
+            if (!tbl) { // target table not found: drop this mutation
+                continue;
+            }
+            if (written_at <= tbl->get_truncation_time()) { // batch is not newer than the table's truncation time: drop this mutation
+                continue;
+            }
+            schema_ptr s = tbl->schema();
+            if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds())); // repair-mode tombstone GC: wait no longer than the propagation delay
+            }
+            fms.emplace_back(std::move(fm), std::move(s));
+        }
+
+        if (now < written_at + timeout) {
+            blogger.debug("Skipping replay of {}, too fresh", id);
+
+            shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at); // track the earliest skipped written_at per shard
+
+            co_return db::all_batches_replayed::no; // NOTE(review): the pre-refactor lambda left all_replayed unchanged on this path -- confirm reporting `no` is intended
+        }
+
+        auto size = data.size(); // serialized batch size, used as the rate-limiter cost
+
+        for (const auto& [fm, s] : fms) {
+            mutations.emplace_back(fm.to_mutation(s));
+            co_await coroutine::maybe_yield();
+        }
+
+        if (!mutations.empty()) {
+            const auto ttl = [written_at]() -> clock_type {
+                /*
+                * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                * This ensures that deletes aren't "undone" by an old batch replay.
+                */
+                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                warn(unimplemented::cause::HINT);
+#if 0
+                for (auto& m : *mutations) {
+                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                }
+#endif
+                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+            }();
+
+            if (ttl > 0) {
+                // Origin does the send manually, however I can't see a super great reason to do so.
+                // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                // in both cases.
+                // FIXME: verify that the above is reasonably true.
+                co_await limiter.reserve(size);
+                stats.write_attempts += mutations.size();
+                auto timeout = db::timeout_clock::now() + write_timeout; // shadows the replay `timeout` above; this one is the send deadline
+                if (cleanup) { // pass by reference: the failure path below re-stores `mutations` under failed_replay
+                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                } else { // without cleanup the failure path returns early, so the vector can be moved
+                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                }
+            }
+        }
+    } catch (data_dictionary::no_such_keyspace& ex) {
+        // should probably ignore and drop the batch
+    } catch (const data_dictionary::no_such_column_family&) {
+        // As above -- we should drop the batch if the table doesn't exist anymore.
+    } catch (...) {
+        blogger.warn("Replay failed (will retry): {}", std::current_exception());
+        // timeout, overload etc.
+        // Do _not_ remove the batch, assuming we got a node write error.
+        // Since we don't have hints (which origin is satisfied with),
+        // we have to resort to keeping this batch to next lap.
+        if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) { // leave the row where it is for the next pass
+            co_return db::all_batches_replayed::no;
+        }
+        send_failed = true; // v2 row, cleanup requested, not yet in failed_replay: move it there below
+    }
+
+    auto& sp = qp.proxy();
+
+    if (send_failed) {
+        blogger.debug("Moving batch {} to stage failed_replay", id);
+        auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
+        co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    }
+
+    // delete the row at its original stage (reached after a successful replay, a dropped keyspace/table, or the failed_replay rewrite above)
+    auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+    co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+    shard_written_at.need_cleanup = true; // a row was deleted for this batch shard
+
+    co_return db::all_batches_replayed(!send_failed); // `no` only when the batch had to be moved to failed_replay
+}
+
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
+    utils::rate_limiter limiter(throttle);

-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
-
-    struct replay_stats {
-        std::optional<db_clock::time_point> min_too_fresh;
-        bool need_cleanup = false;
-    };
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);

    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

@@ -324,125 +479,49 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
    // same across a while prefix of written_at (across all ids).
    const auto now = db_clock::now();

-    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
-        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
-        const auto batch_shard = row.get_as<int32_t>("shard");
-        auto written_at = row.get_as<db_clock::time_point>("written_at");
-        auto id = row.get_as<utils::UUID>("id");
-        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-        auto timeout = _replay_timeout;
+    auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
+        all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+        co_return stop_iteration::no;
+    };

-        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
-            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
-            all_replayed = all_batches_replayed::no;
-            co_return stop_iteration::no;
-        }
+    co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
+        blogger.debug("Started replayAllFailedBatches");
+        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));

-        auto data = row.get_blob_unfragmented("data");
+        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);

-        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+        co_await _qp.query_internal(
+                format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
+                db::consistency_level::ONE,
+                {},
+                page_size,
+                batch);

-        utils::chunked_vector<mutation> mutations;
-        bool send_failed = false;
+        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
+    });

-        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+    co_return all_replayed;
+}

-        try {
-            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
-            auto in = ser::as_input_stream(data);
-            while (in.size()) {
-                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
-                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
-                if (!tbl) {
-                    continue;
-                }
-                if (written_at <= tbl->get_truncation_time()) {
-                    continue;
-                }
-                schema_ptr s = tbl->schema();
-                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
-                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
-                }
-                fms.emplace_back(std::move(fm), std::move(s));
-            }
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
+    co_await maybe_migrate_v1_to_v2();

-            if (now < written_at + timeout) {
-                blogger.debug("Skipping replay of {}, too fresh", id);
+    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
+    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
+    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
+    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
+    utils::rate_limiter limiter(throttle);

-                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);

-                co_return stop_iteration::no;
-            }
+    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

-            auto size = data.size();
-
-            for (const auto& [fm, s] : fms) {
-                mutations.emplace_back(fm.to_mutation(s));
-                co_await coroutine::maybe_yield();
-            }
-
-            if (!mutations.empty()) {
-                const auto ttl = [written_at]() -> clock_type {
-                    /*
-                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                    * This ensures that deletes aren't "undone" by an old batch replay.
-                    */
-                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                    warn(unimplemented::cause::HINT);
-#if 0
-                    for (auto& m : *mutations) {
-                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-                    }
-#endif
-                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-                }();
-
-                if (ttl > 0) {
-                    // Origin does the send manually, however I can't see a super great reason to do so.
-                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-                    // in both cases.
-                    // FIXME: verify that the above is reasonably true.
-                    co_await limiter->reserve(size);
-                    _stats.write_attempts += mutations.size();
-                    auto timeout = db::timeout_clock::now() + write_timeout;
-                    if (cleanup) {
-                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
-                    } else {
-                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
-                    }
-                }
-            }
-        } catch (data_dictionary::no_such_keyspace& ex) {
-            // should probably ignore and drop the batch
-        } catch (const data_dictionary::no_such_column_family&) {
-            // As above -- we should drop the batch if the table doesn't exist anymore.
-        } catch (...) {
-            blogger.warn("Replay failed (will retry): {}", std::current_exception());
-            all_replayed = all_batches_replayed::no;
-            // timeout, overload etc.
-            // Do _not_ remove the batch, assuning we got a node write error.
-            // Since we don't have hints (which origin is satisfied with),
-            // we have to resort to keeping this batch to next lap.
-            if (!cleanup || stage == batchlog_stage::failed_replay) {
-                co_return stop_iteration::no;
-            }
-            send_failed = true;
-        }
-
-        auto& sp = _qp.proxy();
-
-        if (send_failed) {
-            blogger.debug("Moving batch {} to stage failed_replay", id);
-            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
-            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-        }
-
-        // delete batch
-        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
-        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-
-        shard_written_at.need_cleanup = true;
+    // Use a stable `now` across all batches, so skip/replay decisions are the
+    // same across a whole prefix of written_at (across all ids).
+    const auto now = db_clock::now();

+    auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
+        all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
        co_return stop_iteration::no;
    };

@@ -501,3 +580,10 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    co_return all_replayed;
 }
+
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+    // Dispatch on the batchlog_v2 feature: use the v2 replay path when it is
+    // enabled, otherwise fall back to the v1 path.
+    return _fs.batchlog_v2 ? replay_all_failed_batches_v2(cleanup)
+                           : replay_all_failed_batches_v1(cleanup);
+}
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -27,6 +27,12 @@ class query_processor;

 } // namespace cql3

+namespace gms {
+
+class feature_service;
+
+} // namespace gms
+
 namespace db {

 class system_keyspace;
@@ -49,6 +55,11 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

+    struct stats {
+        uint64_t write_attempts = 0;
+    };
+
+
 private:
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -56,14 +67,13 @@ private:

    using clock_type = lowres_clock;

-    struct stats {
-        uint64_t write_attempts = 0;
-    } _stats;
+    stats _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
+    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
@@ -84,12 +94,14 @@ private:

    future<> maybe_migrate_v1_to_v2();

+    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
+    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
    // shard qp (which is what you feed here).
-    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);
+    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
@@ -102,7 +114,7 @@ public:
        return _last_replay;
    }

-    const stats& stats() const {
+    const stats& get_stats() const {
        return _stats;
    }
 private:
--- a/db/cache_mutation_reader.hh
+++ b/db/cache_mutation_reader.hh
@@ -199,18 +199,9 @@ class cache_mutation_reader final : public mutation_reader::impl {
        return *_snp->schema();
    }

-    gc_clock::time_point get_read_time() {
-        return _read_context.tombstone_gc_state() ? gc_clock::now() : gc_clock::time_point::min();
-    }
-
    gc_clock::time_point get_gc_before() {
        if (!_gc_before.has_value()) {
-            auto gc_state = _read_context.tombstone_gc_state();
-            if (gc_state) {
-                _gc_before = gc_state->with_commitlog_check_disabled().get_gc_before_for_key(_schema, _dk, _read_time);
-            } else {
-                _gc_before = gc_clock::time_point::min();
-            }
+            _gc_before = _read_context.tombstone_gc_state().with_commitlog_check_disabled().get_gc_before_for_key(_schema, _dk, _read_time);
        }
        return *_gc_before;
    }
@@ -242,7 +233,7 @@ public:
        , _read_context_holder()
        , _read_context(ctx)    // ctx is owned by the caller, who's responsible for closing it.
        , _next_row(*_schema, *_snp, false, _read_context.is_reversed())
-        , _read_time(get_read_time())
+        , _read_time(gc_clock::now())
    {
        clogger.trace("csm {}: table={}.{}, dk={}, reversed={}, snap={}",
                fmt::ptr(this),
@@ -801,7 +792,7 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {
    if (_next_row_in_range) {
        bool remove_row = false;

-        if (_read_context.tombstone_gc_state() // do not compact rows when tombstone_gc_state is not set (used in some unit tests)
+        if (_read_context.tombstone_gc_state().is_gc_enabled() // do not compact rows when set to no_gc() (used in some unit tests)
            && !_next_row.dummy()
            && _snp->at_latest_version()
            && _snp->at_oldest_version()) {
--- a/db/config.cc
+++ b/db/config.cc
@@ -266,6 +266,13 @@ const config_type& config_type_for<std::vector<enum_option<db::replication_strat
    return ct;
 }

+template <>
+const config_type& config_type_for<std::vector<enum_option<db::consistency_level_restriction_t>>>() { // config_type descriptor for a list of consistency-level restriction options
+    static config_type ct( // function-local static: built on first call, the same object is returned afterwards
+        "consistency level list", printable_vector_to_json<enum_option<db::consistency_level_restriction_t>>);
+    return ct;
+}
+
 template <>
 const config_type& config_type_for<enum_option<db::tri_mode_restriction_t>>() {
    static config_type ct(
@@ -415,6 +422,23 @@ public:
    }
 };

+template <>
+class convert<enum_option<db::consistency_level_restriction_t>> { // decodes one config node into a consistency-level restriction option
+public:
+    static bool decode(const Node& node, enum_option<db::consistency_level_restriction_t>& rhs) { // returns false when the node cannot be decoded; rhs is written by the stream extraction
+        std::string name;
+        if (!convert<std::string>::decode(node, name)) { // node is not decodable as a string: reject
+            return false;
+        }
+        try {
+            std::istringstream(name) >> rhs; // parse via operator>>; presumably throws invalid_option_value for unknown names (see catch) -- confirm
+        } catch (boost::program_options::invalid_option_value&) { // report a bad value as a decode failure instead of propagating
+            return false;
+        }
+        return true;
+    }
+};
+
 template <>
 class convert<enum_option<db::tri_mode_restriction_t>> {
 public:
@@ -1066,7 +1090,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Enable or disable the native transport server. Uses the same address as the rpc_address, but the port is different from the rpc_port. See native_transport_port.")
    , native_transport_port(this, "native_transport_port", "cql_port", value_status::Used, 9042,
        "Port on which the CQL native transport listens for clients.")
-    , maintenance_socket(this, "maintenance_socket", value_status::Used, "ignore",
+    , maintenance_socket(this, "maintenance_socket", value_status::Used, "workdir",
        "The Unix Domain Socket the node uses for maintenance socket.\n"
        "The possible options are:\n"
        "\tignore         the node will not open the maintenance socket.\n"
@@ -1292,7 +1316,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , fd_initial_value_ms(this, "fd_initial_value_ms", value_status::Used, 2 * 1000, "The initial failure_detector interval time in milliseconds.")
    , shutdown_announce_in_ms(this, "shutdown_announce_in_ms", value_status::Used, 2 * 1000, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.")
    , developer_mode(this, "developer_mode", value_status::Used, DEVELOPER_MODE_DEFAULT, "Relax environment checks. Setting to true can reduce performance and reliability significantly.")
-    , skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Used, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
+    , skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Deprecated, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
    , force_gossip_generation(this, "force_gossip_generation", liveness::LiveUpdate, value_status::Used, -1 , "Force gossip to use the generation number provided by user.")
    , experimental_features(this, "experimental_features", value_status::Used, {}, experimental_features_help_string())
    , lsa_reclamation_step(this, "lsa_reclamation_step", value_status::Used, 1, "Minimum number of segments to reclaim in a single step.")
@@ -1515,10 +1539,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Ignored if authentication tables already contain a super user password.")
    , auth_certificate_role_queries(this, "auth_certificate_role_queries", value_status::Used, { { { "source", "SUBJECT" }, {"query", "CN=([^,]+)" } } },
        "Regular expression used by CertificateAuthenticator to extract role name from an accepted transport authentication certificate subject info.")
+    , enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.  This feature will eventually be removed in a future version.")
    , minimum_replication_factor_fail_threshold(this, "minimum_replication_factor_fail_threshold", liveness::LiveUpdate, value_status::Used, -1, "")
    , minimum_replication_factor_warn_threshold(this, "minimum_replication_factor_warn_threshold", liveness::LiveUpdate, value_status::Used,  3, "")
-    , maximum_replication_factor_warn_threshold(this, "maximum_replication_factor_warn_threshold", liveness::LiveUpdate, value_status::Used, -1, "")
    , maximum_replication_factor_fail_threshold(this, "maximum_replication_factor_fail_threshold", liveness::LiveUpdate, value_status::Used, -1, "")
+    , maximum_replication_factor_warn_threshold(this, "maximum_replication_factor_warn_threshold", liveness::LiveUpdate, value_status::Used, -1, "")
+    , replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
+    , replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
+    , write_consistency_levels_disallowed(this, "write_consistency_levels_disallowed", liveness::LiveUpdate, value_status::Used, {}, "A list of consistency levels that are not allowed for write operations. Requests using these levels will fail.")
+    , write_consistency_levels_warned(this, "write_consistency_levels_warned", liveness::LiveUpdate, value_status::Used, {}, "A list of consistency levels that will trigger a warning when used in write operations. Requests using these levels will contain a warning in the query response.")
    , tablets_initial_scale_factor(this, "tablets_initial_scale_factor", liveness::LiveUpdate, value_status::Used, 10,
         "Minimum average number of tablet replicas per shard per table. Suppressed by tablet options in table's schema: min_per_shard_tablet_count and min_tablet_count")
    , tablets_per_shard_goal(this, "tablets_per_shard_goal", liveness::LiveUpdate, value_status::Used, 100,
@@ -1531,8 +1560,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
         "Maximum number of tablets which may be leaving a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
    , tablet_streaming_write_concurrency_per_shard(this, "tablet_streaming_write_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
         "Maximum number of tablets which may be pending on a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
-    , replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
-    , replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
    , service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")

    , audit(this, "audit", value_status::Used, "table",
@@ -1570,7 +1597,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , disk_space_monitor_high_polling_interval_in_seconds(this, "disk_space_monitor_high_polling_interval_in_seconds", value_status::Used, 1, "Disk-space polling interval at or above polling threshold")
    , disk_space_monitor_polling_interval_threshold(this, "disk_space_monitor_polling_interval_threshold", value_status::Used, 0.9, "Disk-space polling threshold. Polling interval is increased when disk utilization is greater than or equal to this threshold")
    , critical_disk_utilization_level(this, "critical_disk_utilization_level", liveness::LiveUpdate, value_status::Used, 0.98, "Disk utilization level above which mechanisms preventing a node getting out of space are activated")
-    , enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.  This feature will eventually be removed in a future version.")
    , rf_rack_valid_keyspaces(this, "rf_rack_valid_keyspaces", liveness::MustRestart, value_status::Used, false,
        "Enforce RF-rack-valid keyspaces. Additionally, if there are existing RF-rack-invalid "
        "keyspaces, attempting to start a node with this option ON will fail. "
@@ -1843,6 +1869,30 @@ std::unordered_map<sstring, locator::replication_strategy_type> db::replication_
            {"EverywhereStrategy", locator::replication_strategy_type::everywhere_topology}};
 }

+std::unordered_map<sstring, db::consistency_level> db::consistency_level_restriction_t::map() {
+    using cl = db::consistency_level;
+    std::unordered_map<sstring, cl> result = {
+        {"ANY", cl::ANY},
+        {"ONE", cl::ONE},
+        {"TWO", cl::TWO},
+        {"THREE", cl::THREE},
+        {"QUORUM", cl::QUORUM},
+        {"ALL", cl::ALL},
+        {"LOCAL_QUORUM", cl::LOCAL_QUORUM},
+        {"EACH_QUORUM", cl::EACH_QUORUM},
+        {"SERIAL", cl::SERIAL},
+        {"LOCAL_SERIAL", cl::LOCAL_SERIAL},
+        {"LOCAL_ONE", cl::LOCAL_ONE},
+    };
+    // Sanity check: the table is expected to hold one entry per value in [cl::MIN_VALUE, cl::MAX_VALUE].
+    constexpr auto expected_size = static_cast<size_t>(cl::MAX_VALUE) - static_cast<size_t>(cl::MIN_VALUE) + 1;
+    if (result.size() != expected_size) {
+        on_internal_error_noexcept(dblog, format("consistency_level_restriction_t::map() has {} entries but expected {}", result.size(), expected_size));
+    }
+
+    return result;
+}
+
 std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
    std::vector<enum_option<db::experimental_features_t>> ret;
    for (const auto& f : db::experimental_features_t::map()) {
--- a/db/config.hh
+++ b/db/config.hh
@@ -24,6 +24,7 @@
 #include "utils/error_injection.hh"
 #include "message/dict_trainer.hh"
 #include "message/advanced_rpc_compressor.hh"
+#include "db/consistency_level_type.hh"
 #include "db/tri_mode_restriction.hh"
 #include "sstables/compressor.hh"

@@ -126,6 +127,10 @@ struct replication_strategy_restriction_t {
    static std::unordered_map<sstring, locator::replication_strategy_type> map(); // for enum_option<>
 };

+struct consistency_level_restriction_t {
+    static std::unordered_map<sstring, db::consistency_level> map(); // for enum_option<>
+};
+
 constexpr unsigned default_murmur3_partitioner_ignore_msb_bits = 12;

 struct tablets_mode_t {
@@ -534,10 +539,16 @@ public:

    named_value<std::vector<std::unordered_map<sstring, sstring>>> auth_certificate_role_queries;

+    // guardrails options
+    named_value<bool> enable_create_table_with_compact_storage;
    named_value<int> minimum_replication_factor_fail_threshold;
    named_value<int> minimum_replication_factor_warn_threshold;
-    named_value<int> maximum_replication_factor_warn_threshold;
    named_value<int> maximum_replication_factor_fail_threshold;
+    named_value<int> maximum_replication_factor_warn_threshold;
+    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;
+    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
+    named_value<std::vector<enum_option<consistency_level_restriction_t>>> write_consistency_levels_disallowed;
+    named_value<std::vector<enum_option<consistency_level_restriction_t>>> write_consistency_levels_warned;

    named_value<double> tablets_initial_scale_factor;
    named_value<unsigned> tablets_per_shard_goal;
@@ -545,9 +556,6 @@ public:
    named_value<unsigned> tablet_streaming_read_concurrency_per_shard;
    named_value<unsigned> tablet_streaming_write_concurrency_per_shard;

-    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
-    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;
-
    named_value<uint32_t> service_levels_interval;

    named_value<sstring> audit;
@@ -598,8 +606,6 @@ public:
    named_value<float> disk_space_monitor_polling_interval_threshold;
    named_value<float> critical_disk_utilization_level;

-    named_value<bool> enable_create_table_with_compact_storage;
-
    named_value<bool> rf_rack_valid_keyspaces;
    named_value<bool> enforce_rack_list;

--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -154,7 +154,10 @@ hint_sender::~hint_sender() {


 future<> hint_sender::stop(drain should_drain) noexcept {
-    return seastar::async([this, should_drain] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    return seastar::async(std::move(attr), [this, should_drain] {
        set_stopping();
        _stop_as.request_abort();
        _stopped.get();
--- a/db/marshal/type_parser.cc
+++ b/db/marshal/type_parser.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <tuple>

+#include "cql3/cql3_type.hh"
 #include "types/user.hh"
 #include "types/map.hh"
 #include "types/list.hh"
@@ -113,7 +114,7 @@ std::vector<data_type> type_parser::get_type_parameters(bool multicell)
    throw parse_exception(_str, _idx, "unexpected end of string");
 }

-std::tuple<data_type, size_t> type_parser::get_vector_parameters()
+std::tuple<data_type, vector_dimension_t> type_parser::get_vector_parameters()
 {
    if (is_eos() || _str[_idx] != '(') {
        throw std::logic_error("internal error");
@@ -128,7 +129,7 @@ std::tuple<data_type, size_t> type_parser::get_vector_parameters()
    }

    data_type type = do_parse(true);
-    size_t size = 0;
+    vector_dimension_t size = 0;
    if (_str[_idx] == ',') {
        ++_idx;
        skip_blank();
@@ -142,7 +143,20 @@ std::tuple<data_type, size_t> type_parser::get_vector_parameters()
            throw parse_exception(_str, _idx, "expected digit or ')'");
        }

-        size = std::stoul(_str.substr(i, _idx - i));
+        unsigned long parsed_size;
+        try {
+            parsed_size = std::stoul(_str.substr(i, _idx - i));
+        } catch (const std::exception& e) {
+            throw parse_exception(_str, i, format("Invalid vector dimension: {}", e.what()));
+        }
+        static_assert(sizeof(unsigned long) >= sizeof(vector_dimension_t));
+        if (parsed_size == 0) {
+            throw parse_exception(_str, _idx, "Vectors must have a dimension greater than 0");
+        }
+        if (parsed_size > cql3::cql3_type::MAX_VECTOR_DIMENSION) {
+            throw parse_exception(_str, _idx, format("Vectors must have a dimension less than or equal to {}", cql3::cql3_type::MAX_VECTOR_DIMENSION));
+        }
+        size = static_cast<vector_dimension_t>(parsed_size);

        ++_idx; // skipping ')'
        return std::make_tuple(type, size);
--- a/db/marshal/type_parser.hh
+++ b/db/marshal/type_parser.hh
@@ -97,7 +97,7 @@ public:
    }
 #endif
    std::vector<data_type> get_type_parameters(bool multicell=true);
-    std::tuple<data_type, size_t> get_vector_parameters();
+    std::tuple<data_type, vector_dimension_t> get_vector_parameters();
    std::tuple<sstring, bytes, std::vector<bytes>, std::vector<data_type>> get_user_type_parameters();
    data_type do_parse(bool multicell = true);

--- a/db/read_context.hh
+++ b/db/read_context.hh
@@ -125,7 +125,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
    tracing::trace_state_ptr _trace_state;
    mutation_reader::forwarding _fwd_mr;
    bool _range_query;
-    const tombstone_gc_state* _tombstone_gc_state;
+    tombstone_gc_state _tombstone_gc_state;
    max_purgeable_fn _get_max_purgeable;
    // When reader enters a partition, it must be set up for reading that
    // partition from the underlying mutation source (_underlying) in one of two ways:
@@ -149,7 +149,7 @@ public:
            reader_permit permit,
            const dht::partition_range& range,
            const query::partition_slice& slice,
-            const tombstone_gc_state* gc_state,
+            tombstone_gc_state gc_state,
            max_purgeable_fn get_max_purgeable,
            tracing::trace_state_ptr trace_state,
            mutation_reader::forwarding fwd_mr)
@@ -161,7 +161,7 @@ public:
        , _trace_state(std::move(trace_state))
        , _fwd_mr(fwd_mr)
        , _range_query(!query::is_single_partition(range))
-        , _tombstone_gc_state(gc_state)
+        , _tombstone_gc_state(std::move(gc_state))
        , _get_max_purgeable(std::move(get_max_purgeable))
        , _underlying(_cache, *this)
    {
@@ -197,7 +197,7 @@ public:
    bool partition_exists() const { return _partition_exists; }
    void on_underlying_created() { ++_underlying_created; }
    bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
-    const tombstone_gc_state* tombstone_gc_state() const { return _tombstone_gc_state; }
+    const tombstone_gc_state& tombstone_gc_state() const { return _tombstone_gc_state; }
    max_purgeable get_max_purgeable(const dht::decorated_key& dk, is_shadowable is) const { return _get_max_purgeable(dk, is); }
 public:
    future<> ensure_underlying() {
--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -775,7 +775,7 @@ row_cache::make_reader_opt(schema_ptr s,
                       reader_permit permit,
                       const dht::partition_range& range,
                       const query::partition_slice& slice,
-                       const tombstone_gc_state* gc_state,
+                       tombstone_gc_state gc_state,
                       max_purgeable_fn get_max_purgeable,
                       tracing::trace_state_ptr trace_state,
                       streamed_mutation::forwarding fwd,
--- a/db/row_cache.hh
+++ b/db/row_cache.hh
@@ -373,7 +373,7 @@ public:
                                     tracing::trace_state_ptr trace_state = nullptr,
                                     streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
                                     mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no,
-                                     const tombstone_gc_state* gc_state = nullptr,
+                                     tombstone_gc_state gc_state = tombstone_gc_state::no_gc(),
                                     max_purgeable_fn get_max_purgeable = can_never_purge) {
        if (auto reader_opt = make_reader_opt(s, permit, range, slice, gc_state, std::move(get_max_purgeable), std::move(trace_state), fwd, fwd_mr)) {
            return std::move(*reader_opt);
@@ -386,7 +386,7 @@ public:
                                     reader_permit permit,
                                     const dht::partition_range&,
                                     const query::partition_slice&,
-                                     const tombstone_gc_state*,
+                                     tombstone_gc_state,
                                     max_purgeable_fn get_max_purgeable,
                                     tracing::trace_state_ptr trace_state = nullptr,
                                     streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
@@ -395,7 +395,7 @@ public:
    mutation_reader make_reader(schema_ptr s,
                                    reader_permit permit,
                                    const dht::partition_range& range = query::full_partition_range,
-                                    const tombstone_gc_state* gc_state = nullptr,
+                                    tombstone_gc_state gc_state = tombstone_gc_state::no_gc(),
                                    max_purgeable_fn get_max_purgeable = can_never_purge) {
        auto& full_slice = s->full_slice();
        return make_reader(std::move(s), std::move(permit), range, full_slice, nullptr,
--- a/db/schema_applier.cc
+++ b/db/schema_applier.cc
@@ -1139,14 +1139,17 @@ future<> schema_applier::finalize_tables_and_views() {
    // was already dropped (see https://github.com/scylladb/scylla/issues/5614)
    for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
        auto s = dropped_view.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
        auto s = dropped_table.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
        auto s = dropped_cdc.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }

--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -105,7 +105,7 @@ namespace {
        schema_builder::register_schema_initializer([](schema_builder& builder) {
            if (builder.ks_name() == schema_tables::NAME) {
                // all schema tables are group0 tables
-                builder.set_is_group0_table(true);
+                builder.set_is_group0_table();
            }
        });
 }
@@ -2840,20 +2840,15 @@ void check_no_legacy_secondary_index_mv_schema(replica::database& db, const view
 static auto GET_COLUMN_MAPPING_QUERY = format("SELECT column_name, clustering_order, column_name_bytes, kind, position, type FROM system.{} WHERE cf_id = ? AND schema_version = ?",
    db::schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY);

-future<column_mapping> get_column_mapping(db::system_keyspace& sys_ks, ::table_id table_id, table_schema_version version) {
-    shared_ptr<cql3::untyped_result_set> results = co_await sys_ks._qp.execute_internal(
+future<std::optional<column_mapping>> get_column_mapping_if_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version) {
+    shared_ptr<cql3::untyped_result_set> results = co_await sys_ks.query_processor().execute_internal(
        GET_COLUMN_MAPPING_QUERY,
        db::consistency_level::LOCAL_ONE,
        {table_id.uuid(), version.uuid()},
        cql3::query_processor::cache_internal::no
    );
    if (results->empty()) {
-        // If we don't have a stored column_mapping for an obsolete schema version
-        // then it means it's way too old and been cleaned up already.
-        // Fail the whole learn stage in this case.
-        co_await coroutine::return_exception(std::runtime_error(
-            format("Failed to look up column mapping for schema version {}",
-                version)));
+        co_return std::nullopt;
    }
    std::vector<column_definition>  static_columns, regular_columns;
    for (const auto& row : *results) {
@@ -2881,6 +2876,18 @@ future<column_mapping> get_column_mapping(db::system_keyspace& sys_ks, ::table_i
    co_return std::move(cm);
 }

+future<column_mapping> get_column_mapping(db::system_keyspace& sys_ks, ::table_id table_id, table_schema_version version) {
+    auto cm_opt = co_await schema_tables::get_column_mapping_if_exists(sys_ks, table_id, version);
+    if (!cm_opt) {
+        // If we don't have a stored column_mapping for an obsolete schema version
+        // then it means it's way too old and been cleaned up already.
+        co_await coroutine::return_exception(std::runtime_error(
+            format("Failed to look up column mapping for schema version {}",
+                version)));
+    }
+    co_return std::move(*cm_opt);
+}
+
 future<bool> column_mapping_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version) {
    shared_ptr<cql3::untyped_result_set> results = co_await sys_ks._qp.execute_internal(
        GET_COLUMN_MAPPING_QUERY,
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -320,6 +320,8 @@ std::optional<std::map<K, V>> get_map(const query::result_set_row& row, const ss
 future<> store_column_mapping(sharded<service::storage_proxy>& proxy, schema_ptr s, bool with_ttl);
 /// Query column mapping for a given version of the table locally.
 future<column_mapping> get_column_mapping(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
+/// Like `get_column_mapping()`, but returns std::nullopt instead of failing when no mapping is stored for the given version.
+future<std::optional<column_mapping>> get_column_mapping_if_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
 /// Check that column mapping exists for a given version of the table
 future<bool> column_mapping_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
 /// Delete matching column mapping entries from the `system.scylla_table_schema_history` table
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -21,14 +21,16 @@
 #include "replica/database.hh"
 #include "replica/global_table_ptr.hh"
 #include "sstables/sstables_manager.hh"
+#include "service/storage_proxy.hh"

 logging::logger snap_log("snapshots");

 namespace db {

-snapshot_ctl::snapshot_ctl(sharded<replica::database>& db, tasks::task_manager& tm, sstables::storage_manager& sstm, config cfg)
+snapshot_ctl::snapshot_ctl(sharded<replica::database>& db, sharded<service::storage_proxy>& sp, tasks::task_manager& tm, sstables::storage_manager& sstm, config cfg)
    : _config(std::move(cfg))
    , _db(db)
+    , _sp(sp)
    , _ops("snapshot_ctl")
    , _task_manager_module(make_shared<snapshot::task_manager_module>(tm))
    , _storage_manager(sstm)
@@ -104,6 +106,45 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
    });
 }

+future<> snapshot_ctl::take_cluster_column_family_snapshot(std::vector<sstring> ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
+    if (tag.empty()) {
+        throw std::invalid_argument("You must supply a snapshot name.");
+    }
+    if (ks_names.size() != 1 && !tables.empty()) {
+        throw std::invalid_argument("Cannot name tables when doing multiple keyspaces snapshot");
+    }
+    if (ks_names.empty()) {
+        std::ranges::copy(_db.local().get_keyspaces() | std::views::keys, std::back_inserter(ks_names));
+    }
+
+    return run_snapshot_modify_operation([this, ks_names = std::move(ks_names), tables = std::move(tables), tag = std::move(tag), opts] () mutable {
+        return do_take_cluster_column_family_snapshot(std::move(ks_names), std::move(tables), std::move(tag), opts);
+    });
+}
+
+future<> snapshot_ctl::do_take_cluster_column_family_snapshot(std::vector<sstring> ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
+    if (tables.empty()) {
+        co_await coroutine::parallel_for_each(ks_names, [tag, this] (const auto& ks_name) {
+            return check_snapshot_not_exist(ks_name, tag);
+        });
+        co_await _sp.local().snapshot_keyspace(
+            ks_names | std::views::transform([&](auto& ks) { return std::make_pair(ks, sstring{}); })
+                | std::ranges::to<std::unordered_multimap>(),
+                tag, opts
+        );
+        co_return;
+    }
+    // Tables were named: take_cluster_column_family_snapshot() only allows that with exactly one keyspace.
+    auto ks = ks_names[0];
+    co_await check_snapshot_not_exist(ks, tag, tables);
+
+    co_await _sp.local().snapshot_keyspace(
+        tables | std::views::transform([&](auto& cf) { return std::make_pair(ks, cf); })
+            | std::ranges::to<std::unordered_multimap>(),
+            tag, opts
+    );
+}
+
 future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
    co_await check_snapshot_not_exist(ks_name, tag, tables);
    co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), opts);
@@ -185,4 +226,4 @@ future<int64_t> snapshot_ctl::true_snapshots_size(sstring ks, sstring cf) {
    }));
 }

-}
+}
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -24,6 +24,7 @@
 using namespace seastar;

 namespace sstables { class storage_manager; }
+namespace service { class storage_proxy; }

 namespace db {

@@ -63,7 +64,7 @@ public:

    using db_snapshot_details = std::vector<table_snapshot_details_ext>;

-    snapshot_ctl(sharded<replica::database>& db, tasks::task_manager& tm, sstables::storage_manager& sstm, config cfg);
+    snapshot_ctl(sharded<replica::database>& db, sharded<service::storage_proxy>&, tasks::task_manager& tm, sstables::storage_manager& sstm, config cfg);

    future<> stop();

@@ -95,6 +96,17 @@ public:
     */
    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});

+    /**
+     * Takes a snapshot of multiple tables, a whole keyspace, or all keyspaces,
+     * using a global, cluster-wide, topology-coordinated operation.
+     * A snapshot name must be specified.
+     *
+     * @param ks_names the keyspaces to snapshot; if empty, all keyspaces are snapshotted
+     * @param tables optional - a vector of table names to snapshot; only allowed with exactly one keyspace
+     * @param tag the tag given to the snapshot; must not be empty
+     */
+    future<> take_cluster_column_family_snapshot(std::vector<sstring>  ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
+
    /**
     * Remove the snapshot with the given name from the given keyspaces.
     * If no tag is specified we will remove all snapshots.
@@ -111,6 +123,7 @@ public:
 private:
    config _config;
    sharded<replica::database>& _db;
+    sharded<service::storage_proxy>& _sp;
    seastar::rwlock _lock;
    seastar::named_gate _ops;
    shared_ptr<snapshot::task_manager_module> _task_manager_module;
@@ -133,6 +146,7 @@ private:

    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts = {}  );
    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
+    future<> do_take_cluster_column_family_snapshot(std::vector<sstring> ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
 };

-}
+}
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -770,13 +770,6 @@ system_distributed_keyspace::get_cdc_desc_v1_timestamps(context ctx) {
    co_return res;
 }

-bool system_distributed_keyspace::workload_prioritization_tables_exists() {
-    auto wp_table = get_updated_service_levels(_qp.db(), true);
-    auto table = _qp.db().try_find_table(NAME, wp_table->cf_name());
-
-    return table && table->schema()->equal_columns(*wp_table);
-}
-
 future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
 }
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -117,7 +117,6 @@ public:
    future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
    future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
    future<> drop_service_level(sstring service_level_name) const;
-    bool workload_prioritization_tables_exists();

 private:
    future<> create_tables(std::vector<schema_ptr> tables);
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -87,31 +87,15 @@ namespace {
        static const std::unordered_set<sstring> tables = {
            schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
            system_keyspace::BROADCAST_KV_STORE,
-            system_keyspace::CDC_GENERATIONS_V3,
            system_keyspace::RAFT,
            system_keyspace::RAFT_SNAPSHOTS,
            system_keyspace::RAFT_SNAPSHOT_CONFIG,
            system_keyspace::GROUP0_HISTORY,
            system_keyspace::DISCOVERY,
-            system_keyspace::TABLETS,
-            system_keyspace::TOPOLOGY,
-            system_keyspace::TOPOLOGY_REQUESTS,
            system_keyspace::LOCAL,
            system_keyspace::PEERS,
-            system_keyspace::SCYLLA_LOCAL,
            system_keyspace::COMMITLOG_CLEANUPS,
-            system_keyspace::SERVICE_LEVELS_V2,
-            system_keyspace::VIEW_BUILD_STATUS_V2,
-            system_keyspace::CDC_STREAMS_STATE,
-            system_keyspace::CDC_STREAMS_HISTORY,
-            system_keyspace::ROLES,
-            system_keyspace::ROLE_MEMBERS,
-            system_keyspace::ROLE_ATTRIBUTES,
-            system_keyspace::ROLE_PERMISSIONS,
            system_keyspace::CDC_LOCAL,
-            system_keyspace::DICTS,
-            system_keyspace::VIEW_BUILDING_TASKS,
-            system_keyspace::CLIENT_ROUTES,
        };
        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
            builder.enable_schema_commitlog();
@@ -143,7 +127,7 @@ namespace {
                system_keyspace::REPAIR_TASKS,
            };
            if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
-                builder.set_is_group0_table(true);
+                builder.set_is_group0_table();
            }
        });
 }
@@ -335,6 +319,10 @@ schema_ptr system_keyspace::topology_requests() {
            .with_column("truncate_table_id", uuid_type)
            .with_column("new_keyspace_rf_change_ks_name", utf8_type)
            .with_column("new_keyspace_rf_change_data", map_type_impl::get_instance(utf8_type, utf8_type, false))
+            .with_column("snapshot_table_ids", set_type_impl::get_instance(uuid_type, false))
+            .with_column("snapshot_tag", utf8_type)
+            .with_column("snapshot_expiry", timestamp_type)
+            .with_column("snapshot_skip_flush", boolean_type)
            .set_comment("Topology request tracking")
            .with_hash_version()
            .build();
@@ -412,26 +400,7 @@ schema_ptr system_keyspace::cdc_streams_history() {
 }

 schema_ptr system_keyspace::raft() {
-    static thread_local auto schema = [] {
-        auto id = generate_legacy_id(NAME, RAFT);
-        return schema_builder(NAME, RAFT, std::optional(id))
-            .with_column("group_id", timeuuid_type, column_kind::partition_key)
-            // raft log part
-            .with_column("index", long_type, column_kind::clustering_key)
-            .with_column("term", long_type)
-            .with_column("data", bytes_type) // decltype(raft::log_entry::data) - serialized variant
-            // persisted term and vote
-            .with_column("vote_term", long_type, column_kind::static_column)
-            .with_column("vote", uuid_type, column_kind::static_column)
-            // id of the most recent persisted snapshot
-            .with_column("snapshot_id", uuid_type, column_kind::static_column)
-            .with_column("commit_idx", long_type, column_kind::static_column)
-
-            .set_comment("Persisted RAFT log, votes and snapshot info")
-            .with_hash_version()
-            .set_caching_options(caching_options::get_disabled_caching_options())
-            .build();
-    }();
+    static thread_local auto schema = replica::make_raft_schema(db::system_keyspace::RAFT, true);
    return schema;
 }

@@ -439,35 +408,32 @@ schema_ptr system_keyspace::raft() {
 // on user-provided state machine and could be stored anywhere else in any other form.
 // This should be seen as a snapshot descriptor, instead.
 schema_ptr system_keyspace::raft_snapshots() {
-    static thread_local auto schema = [] {
-        auto id = generate_legacy_id(NAME, RAFT_SNAPSHOTS);
-        return schema_builder(NAME, RAFT_SNAPSHOTS, std::optional(id))
-            .with_column("group_id", timeuuid_type, column_kind::partition_key)
-            .with_column("snapshot_id", uuid_type)
-            // Index and term of last entry in the snapshot
-            .with_column("idx", long_type)
-            .with_column("term", long_type)
-
-            .set_comment("Persisted RAFT snapshot descriptors info")
-            .with_hash_version()
-            .build();
-    }();
+    static thread_local auto schema = replica::make_raft_snapshots_schema(db::system_keyspace::RAFT_SNAPSHOTS, true);
    return schema;
 }

 schema_ptr system_keyspace::raft_snapshot_config() {
-    static thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_keyspace::NAME, RAFT_SNAPSHOT_CONFIG);
-        return schema_builder(system_keyspace::NAME, RAFT_SNAPSHOT_CONFIG, std::optional(id))
-            .with_column("group_id", timeuuid_type, column_kind::partition_key)
-            .with_column("disposition", ascii_type, column_kind::clustering_key) // can be 'CURRENT` or `PREVIOUS'
-            .with_column("server_id", uuid_type, column_kind::clustering_key)
-            .with_column("can_vote", boolean_type)
+    static thread_local auto schema = replica::make_raft_snapshot_config_schema(db::system_keyspace::RAFT_SNAPSHOT_CONFIG, true);
+    return schema;
+}

-            .set_comment("RAFT configuration for the latest snapshot descriptor")
-            .with_hash_version()
-            .build();
-    }();
+// Raft tables for strongly consistent tablets.
+// These tables have partition keys of the form (shard, group_id), allowing the data
+// to be co-located with the tablet replica that owns the raft group.
+// The fixed_shard_partitioner (dht/fixed_shard.hh) creates tokens that map to the specified shard.
+
+schema_ptr system_keyspace::raft_groups() {
+    static thread_local auto schema = replica::make_raft_schema(db::system_keyspace::RAFT_GROUPS, false); // built once per thread; raft() passes true to the same factory
+    return schema;
+}
+
+schema_ptr system_keyspace::raft_groups_snapshots() {
+    static thread_local auto schema = replica::make_raft_snapshots_schema(db::system_keyspace::RAFT_GROUPS_SNAPSHOTS, false); // built once per thread; raft_snapshots() passes true to the same factory
+    return schema;
+}
+
+schema_ptr system_keyspace::raft_groups_snapshot_config() {
+    static thread_local auto schema = replica::make_raft_snapshot_config_schema(db::system_keyspace::RAFT_GROUPS_SNAPSHOT_CONFIG, false); // built once per thread; raft_snapshot_config() passes true to the same factory
     return schema;
 }

@@ -2312,21 +2278,29 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
        r.insert(r.end(), {sstables_registry()});
    }

+    if (cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
+        r.insert(r.end(), {raft_groups(), raft_groups_snapshots(), raft_groups_snapshot_config()});
+    }
+
    return r;
 }

-static bool maybe_write_in_user_memory(schema_ptr s) {
+static bool maybe_write_in_user_memory(schema_ptr s, replica::database& db) {
+    bool strongly_consistent = db.get_config().check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES);
    return (s.get() == system_keyspace::batchlog().get())
            || (s.get() == system_keyspace::batchlog_v2().get())
            || (s.get() == system_keyspace::paxos().get())
-            || s == system_keyspace::scylla_views_builds_in_progress();
+            || s == system_keyspace::scylla_views_builds_in_progress()
+            || (strongly_consistent && s == system_keyspace::raft_groups())
+            || (strongly_consistent && s == system_keyspace::raft_groups_snapshots())
+            || (strongly_consistent && s == system_keyspace::raft_groups_snapshot_config());
 }

 future<> system_keyspace::make(
        locator::effective_replication_map_factory& erm_factory,
        replica::database& db) {
    for (auto&& table : system_keyspace::all_tables(db.get_config())) {
-        co_await db.create_local_system_table(table, maybe_write_in_user_memory(table), erm_factory);
+        co_await db.create_local_system_table(table, maybe_write_in_user_memory(table, db), erm_factory);
        co_await db.find_column_family(table).init_storage();
    }

@@ -3581,6 +3555,18 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
        entry.new_keyspace_rf_change_ks_name = row.get_as<sstring>("new_keyspace_rf_change_ks_name");
        entry.new_keyspace_rf_change_data = row.get_map<sstring,sstring>("new_keyspace_rf_change_data");
    }
+    // Snapshot request fields: read only when the row has a snapshot_table_ids value.
+    if (row.has("snapshot_table_ids")) {
+        entry.snapshot_tag = row.get_as<sstring>("snapshot_tag");
+        entry.snapshot_skip_flush = row.get_as<bool>("snapshot_skip_flush");
+        // Wrap each stored raw UUID in the table_id type.
+        entry.snapshot_table_ids = row.get_set<utils::UUID>("snapshot_table_ids")
+            | std::views::transform([](auto& uuid) { return table_id(uuid); })
+            | std::ranges::to<std::unordered_set>();
+        if (row.has("snapshot_expiry")) {
+            entry.snapshot_expiry = row.get_as<db_clock::time_point>("snapshot_expiry");
+        }
+    }

    return entry;
 }
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -191,6 +191,9 @@ public:
    static constexpr auto RAFT = "raft";
    static constexpr auto RAFT_SNAPSHOTS = "raft_snapshots";
    static constexpr auto RAFT_SNAPSHOT_CONFIG = "raft_snapshot_config";
+    static constexpr auto RAFT_GROUPS = "raft_groups";
+    static constexpr auto RAFT_GROUPS_SNAPSHOTS = "raft_groups_snapshots";
+    static constexpr auto RAFT_GROUPS_SNAPSHOT_CONFIG = "raft_groups_snapshot_config";
    static constexpr auto REPAIR_HISTORY = "repair_history";
    static constexpr auto REPAIR_TASKS = "repair_tasks";
    static constexpr auto GROUP0_HISTORY = "group0_history";
@@ -244,6 +247,9 @@ public:
    static schema_ptr scylla_local();
    static schema_ptr raft();
    static schema_ptr raft_snapshots();
+    static schema_ptr raft_groups();
+    static schema_ptr raft_groups_snapshots();
+    static schema_ptr raft_groups_snapshot_config();
    static schema_ptr repair_history();
    static schema_ptr repair_tasks();
    static schema_ptr group0_history();
@@ -417,6 +423,10 @@ public:
        std::optional<sstring> new_keyspace_rf_change_ks_name;
        // The KS options to be used when executing the scheduled ALTER KS statement
        std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
+        std::optional<std::unordered_set<table_id>> snapshot_table_ids;
+        std::optional<sstring> snapshot_tag;
+        std::optional<db_clock::time_point> snapshot_expiry; // stays disengaged when the row has no snapshot_expiry value
+        bool snapshot_skip_flush = false; // initialized: it is only assigned when the row has snapshot_table_ids
    };
    using topology_requests_entries = std::unordered_map<utils::UUID, system_keyspace::topology_requests_entry>;

--- a/db/tablet_options.cc
+++ b/db/tablet_options.cc
@@ -9,6 +9,7 @@

 #include "exceptions/exceptions.hh"
 #include "db/tablet_options.hh"
+#include <seastar/core/bitops.hh>
 #include "utils/log.hh"

 extern logging::logger dblog;
@@ -23,6 +24,11 @@ tablet_options::tablet_options(const map_type& map) {
                min_tablet_count.emplace(value);
            }
            break;
+        case tablet_option_type::max_tablet_count:
+            if (auto value = std::atol(value_str.c_str())) {
+                max_tablet_count.emplace(value);
+            }
+            break;
        case tablet_option_type::min_per_shard_tablet_count:
            if (auto value = std::atof(value_str.c_str())) {
                min_per_shard_tablet_count.emplace(value);
@@ -40,6 +46,7 @@ tablet_options::tablet_options(const map_type& map) {
 sstring tablet_options::to_string(tablet_option_type hint) {
    switch (hint) {
    case tablet_option_type::min_tablet_count: return "min_tablet_count";
+    case tablet_option_type::max_tablet_count: return "max_tablet_count";
    case tablet_option_type::min_per_shard_tablet_count: return "min_per_shard_tablet_count";
    case tablet_option_type::expected_data_size_in_gb: return "expected_data_size_in_gb";
    }
@@ -48,6 +55,8 @@ sstring tablet_options::to_string(tablet_option_type hint) {
 tablet_option_type tablet_options::from_string(sstring hint_desc) {
    if (hint_desc == "min_tablet_count") {
        return tablet_option_type::min_tablet_count;
+    } else if (hint_desc == "max_tablet_count") {
+        return tablet_option_type::max_tablet_count;
    } else if (hint_desc == "min_per_shard_tablet_count") {
        return tablet_option_type::min_per_shard_tablet_count;
    } else if (hint_desc == "expected_data_size_in_gb") {
@@ -62,6 +71,9 @@ std::map<sstring, sstring> tablet_options::to_map() const {
    if (min_tablet_count) {
        res[to_string(tablet_option_type::min_tablet_count)] = fmt::to_string(*min_tablet_count);
    }
+    if (max_tablet_count) {
+        res[to_string(tablet_option_type::max_tablet_count)] = fmt::to_string(*max_tablet_count);
+    }
    if (min_per_shard_tablet_count) {
        res[to_string(tablet_option_type::min_per_shard_tablet_count)] = fmt::to_string(*min_per_shard_tablet_count);
    }
@@ -72,11 +84,23 @@ std::map<sstring, sstring> tablet_options::to_map() const {
 }

 void tablet_options::validate(const map_type& map) {
+    std::optional<ssize_t> min_tablets;
+    std::optional<ssize_t> max_tablets;
+    
    for (auto& [key, value_str] : map) {
        switch (tablet_options::from_string(key)) {
        case tablet_option_type::min_tablet_count:
            if (auto value = std::atol(value_str.c_str()); value < 0) {
                throw exceptions::configuration_exception(format("Invalid value '{}' for min_tablet_count", value));
+            } else {
+                min_tablets = value;
+            }
+            break;
+        case tablet_option_type::max_tablet_count:
+            if (auto value = std::atol(value_str.c_str()); value <= 0) {
+                throw exceptions::configuration_exception(format("Invalid value '{}' for max_tablet_count", value));
+            } else {
+                max_tablets = value;
            }
            break;
        case tablet_option_type::min_per_shard_tablet_count:
@@ -91,6 +115,20 @@ void tablet_options::validate(const map_type& map) {
            break;
        }
    }
+
+    if (min_tablets && max_tablets) {
+        // Shift a size_t (a 32-bit 1u overflows for large counts); min may be 0, which log2ceil cannot take, so clamp it to 1.
+        auto effective_min = size_t(1) << log2ceil(static_cast<size_t>(*min_tablets > 0 ? *min_tablets : 1));
+        auto effective_max = size_t(1) << log2floor(static_cast<size_t>(*max_tablets));
+        if (effective_min > effective_max) {
+            throw exceptions::configuration_exception(
+                    format("Invalid tablet count range: min_tablet_count={} (effective: {}) and max_tablet_count={} (effective: {}) "
+                           "result in conflicting constraints after rounding to powers of 2. "
+                           "Since tablet counts must be powers of 2, min_tablet_count rounds up and max_tablet_count rounds down. "
+                           "Please adjust the values so that the smallest power of 2 >= min_tablet_count is <= the largest power of 2 <= max_tablet_count.",
+                            *min_tablets, effective_min, *max_tablets, effective_max));
+        }
+    }
 }

 } // namespace db
--- a/db/tablet_options.hh
+++ b/db/tablet_options.hh
@@ -18,6 +18,7 @@ namespace db {
 // Per-table tablet options
 enum class tablet_option_type {
    min_tablet_count,
+    max_tablet_count,
    min_per_shard_tablet_count,
    expected_data_size_in_gb,
 };
@@ -26,6 +27,7 @@ struct tablet_options {
    using map_type = std::map<sstring, sstring>;

    std::optional<ssize_t> min_tablet_count;
+    std::optional<ssize_t> max_tablet_count;
    std::optional<double> min_per_shard_tablet_count;
    std::optional<ssize_t> expected_data_size_in_gb;

@@ -33,7 +35,7 @@ struct tablet_options {
    explicit tablet_options(const map_type& map);

    operator bool() const noexcept {
-        return min_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb;
+        return min_tablet_count || max_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb;
    }

    map_type to_map() const;
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -932,8 +932,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
    const row& existing_row = existing.cells();
    const row& updated_row = update.cells();

-    const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
-    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
+    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
        const auto view_it = _view->columns_by_name().find(cdef.name());
        const bool column_is_selected = view_it != _view->columns_by_name().end();

@@ -941,7 +940,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
        // as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
        // Because of that, we don't generate view updates when the value in an unselected column is created
        // or changes.
-        if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
+        if (!column_is_selected) {
            return true;
        }

@@ -950,40 +949,20 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
            return false;
        }

-        // We cannot skip if the value was created or deleted, unless we have a non-expiring marker
+        // We cannot skip if the value was created or deleted
        const auto* existing_cell = existing_row.find_cell(cdef.id);
        const auto* updated_cell = updated_row.find_cell(cdef.id);
        if (existing_cell == nullptr || updated_cell == nullptr) {
-            return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
+            return existing_cell == updated_cell;
        }
        atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
        atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);

        // We cannot skip when a selected column is changed
-        if (column_is_selected) {
-            if (view_it->second->is_view_virtual()) {
-                return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
-            }
-            return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
+        if (view_it->second->is_view_virtual()) {
+            return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
        }
-
-        // With non-expiring row marker, liveness checks below are not relevant
-        if (base_has_nonexpiring_marker) {
-            return true;
-        }
-
-        if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
-            return false;
-        }
-
-        // We cannot skip if the change updates TTL
-        const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
-        const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
-        if (existing_has_ttl || updated_has_ttl) {
-            return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
-        }
-
-        return true;
+        return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
    });
 }

@@ -1460,7 +1439,7 @@ void view_update_builder::generate_update(clustering_row&& update, std::optional
    }

    auto dk = dht::decorate_key(*_schema, _key);
-    const auto& gc_state = _base.get_compaction_manager().get_tombstone_gc_state();
+    const auto gc_state = _base.get_tombstone_gc_state();
    auto gc_before = gc_state.get_gc_before_for_key(_schema, dk, _now);

    // We allow existing to be disengaged, which we treat the same as an empty row.
@@ -1489,7 +1468,7 @@ void view_update_builder::generate_update(static_row&& update, const tombstone&
    }

    auto dk = dht::decorate_key(*_schema, _key);
-    const auto& gc_state = _base.get_compaction_manager().get_tombstone_gc_state();
+    const auto gc_state = _base.get_tombstone_gc_state();
    auto gc_before = gc_state.get_gc_before_for_key(_schema, dk, _now);

    // We allow existing to be disengaged, which we treat the same as an empty row.
@@ -2308,6 +2287,7 @@ future<> view_builder::drain() {
    vlogger.info("Draining view builder");
    _as.request_abort();
    co_await _mnotifier.unregister_listener(this);
+    co_await _ops_gate.close();
    co_await _vug.drain();
    co_await _sem.wait();
    _sem.broken();
@@ -2742,30 +2722,48 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
    }

    // Do it in the background, serialized and broadcast from shard 0.
-    static_cast<void>(dispatch_create_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_create_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
        vlogger.warn("Failed to dispatch view creation {}.{}: {}", ks_name, view_name, ep);
    }));
 }

-void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
+future<> view_builder::dispatch_update_view(sstring ks_name, sstring view_name) {
    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return;
+        co_return;
    }

+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+
+    auto view = view_ptr(_db.find_schema(ks_name, view_name));
+    auto step_it = _base_to_build_step.find(view->view_info()->base_id());
+    if (step_it == _base_to_build_step.end()) {
+        co_return; // In case all the views for this CF have finished building already.
+    }
+    auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
+        return bs.view->id() == view->id();
+    });
+    if (status_it != step_it->second.build_status.end()) {
+        status_it->view = std::move(view);
+    }
+}
+
+void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
    // Do it in the background, serialized.
-    (void)with_semaphore(_sem, view_builder_semaphore_units, [ks_name, view_name, this] {
-        auto view = view_ptr(_db.find_schema(ks_name, view_name));
-        auto step_it = _base_to_build_step.find(view->view_info()->base_id());
-        if (step_it == _base_to_build_step.end()) {
-            return;// In case all the views for this CF have finished building already.
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_update_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+        try {
+            std::rethrow_exception(ep);
+        } catch (const seastar::gate_closed_exception&) {
+            vlogger.warn("Ignoring gate_closed_exception during view update {}.{}", ks_name, view_name);
+        } catch (const seastar::broken_named_semaphore&) {
+            vlogger.warn("Ignoring broken_named_semaphore during view update {}.{}", ks_name, view_name);
+        } catch (const replica::no_such_column_family&) {
+            vlogger.warn("Ignoring no_such_column_family during view update {}.{}", ks_name, view_name);
        }
-        auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
-            return bs.view->id() == view->id();
-        });
-        if (status_it != step_it->second.build_status.end()) {
-            status_it->view = std::move(view);
-        }
-    }).handle_exception_type([] (replica::no_such_column_family&) { });
+    }));
 }

 future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
@@ -2827,7 +2825,9 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
    }

    // Do it in the background, serialized and broadcast from shard 0.
-    static_cast<void>(dispatch_drop_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_drop_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
        vlogger.warn("Failed to dispatch view drop {}.{}: {}", ks_name, view_name, ep);
    }));
 }
@@ -3300,7 +3300,7 @@ void view_builder::execute(build_step& step, exponential_backoff_retry r) {
            step.pslice,
            batch_size,
            query::max_partitions,
-            tombstone_gc_state(nullptr));
+            tombstone_gc_state::no_gc());
    auto consumer = compact_for_query<view_builder::consumer>(compaction_state, view_builder::consumer{*this, _vug.shared_from_this(), step, now});
    auto built = step.reader.consume_in_thread(std::move(consumer));
    if (auto ds = std::move(*compaction_state).detach_state()) {
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -16,6 +16,7 @@

 #include <seastar/core/abort_source.hh>
 #include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/condition-variable.hh>
 #include <seastar/core/sharded.hh>
@@ -190,6 +191,7 @@ class view_builder final : public service::migration_listener::only_view_notific
    // Guard the whole startup routine with a semaphore so that it's not intercepted by
    // `on_drop_view`, `on_create_view`, or `on_update_view` events.
    seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
+    seastar::gate _ops_gate;
    seastar::abort_source _as;
    future<> _step_fiber = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
@@ -284,6 +286,7 @@ private:
    future<> mark_as_built(view_ptr);
    void setup_metrics();
    future<> dispatch_create_view(sstring ks_name, sstring view_name);
+    future<> dispatch_update_view(sstring ks_name, sstring view_name);
    future<> dispatch_drop_view(sstring ks_name, sstring view_name);
    future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
    future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -610,7 +610,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
                slice,
                query::max_rows,
                query::max_partitions,
-                base_cf->get_compaction_manager().get_tombstone_gc_state());
+                base_cf->get_tombstone_gc_state());
        auto consumer = compact_for_query<view_building_worker::consumer>(compaction_state, view_building_worker::consumer(
                _db,
                views_ids,
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -98,14 +98,13 @@ public:
                auto hostid = eps.get_host_id();

                set_cell(cr, "up", gossiper.is_alive(hostid));
-                if (!ss.raft_topology_change_enabled() || gossiper.is_shutdown(endpoint)) {
+                if (gossiper.is_shutdown(endpoint)) {
                    set_cell(cr, "status", gossiper.get_gossip_status(endpoint));
+                } else {
+                    set_cell(cr, "status", boost::to_upper_copy<std::string>(fmt::format("{}", ss.get_node_state(hostid))));
                }
                set_cell(cr, "load", gossiper.get_application_state_value(endpoint, gms::application_state::LOAD));

-                if (ss.raft_topology_change_enabled() && !gossiper.is_shutdown(endpoint)) {
-                    set_cell(cr, "status", boost::to_upper_copy<std::string>(fmt::format("{}", ss.get_node_state(hostid))));
-                }
                set_cell(cr, "host_id", hostid.uuid());

                if (tm.get_topology().has_node(hostid)) {
@@ -835,7 +834,10 @@ class clients_table : public streaming_virtual_table {
            auto& clients = cd_map[dip.ip];

            std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
-                return a->port < b->port || a->client_type_str() < b->client_type_str();
+                if (a->port != b->port) {
+                    return a->port < b->port;
+                }
+                return a->client_type_str() < b->client_type_str();
            });

            for (const auto& cd : clients) {
--- a/dht/CMakeLists.txt
+++ b/dht/CMakeLists.txt
@@ -4,6 +4,7 @@ add_library(scylla_dht STATIC)
 target_sources(scylla_dht
  PRIVATE
    boot_strapper.cc
+    fixed_shard.cc
    i_partitioner.cc
    murmur3_partitioner.cc
    range_streamer.cc
--- a/dht/fixed_shard.cc
+++ b/dht/fixed_shard.cc
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include <seastar/core/on_internal_error.hh>
+
+#include "dht/fixed_shard.hh"
+#include "dht/token.hh"
+#include "schema/schema.hh"
+#include "sstables/key.hh"
+#include "utils/class_registrator.hh"
+#include "keys/keys.hh"
+#include "keys/compound_compat.hh"
+#include "utils/murmur_hash.hh"
+#include "utils/log.hh"
+
+namespace dht {
+
+static logging::logger fslog("fixed_shard");
+
+const sstring fixed_shard_partitioner::classname = "com.scylladb.dht.FixedShardPartitioner";
+
+const sstring fixed_shard_partitioner::name() const {
+    return classname; // the fully qualified class name string defined above
+}
+
+dht::token fixed_shard_partitioner::token_for_shard(uint16_t shard, uint64_t hash_bits) {
+    // The shard id is placed above shard_shift; hash_mask keeps only the hash bits that fit below it.
+    return dht::token((static_cast<int64_t>(shard) << shard_shift) | static_cast<int64_t>(hash_bits & hash_mask));
+}
+
+unsigned fixed_shard_partitioner::shard_of(dht::token token) {
+    // Shift the raw token as an unsigned value so only the bits above shard_shift remain.
+    return static_cast<unsigned>(static_cast<uint64_t>(token.raw()) >> shard_shift);
+}
+
+// Decodes the shard id from the first partition key component (2 bytes, network byte order).
+// A wrong size, or a value above max_shard, is reported through on_internal_error.
+static uint16_t compute_shard(managed_bytes_view mb) {
+    constexpr auto expected_size = sizeof(uint16_t);
+    if (mb.size() != expected_size) {
+        on_internal_error(fslog, format("Invalid shard value size: expected {}, got {}", expected_size, mb.size()));
+    }
+    // No need to linearize, 2 bytes are represented as a single fragment
+    const uint16_t decoded = net::ntoh(read_unaligned<uint16_t>(mb.current_fragment().begin()));
+    constexpr auto limit = fixed_shard_partitioner::max_shard;
+    if (decoded > limit) {
+        on_internal_error(fslog, format("Shard value {} exceeds maximum allowed shard {}", decoded, limit));
+    }
+
+    return decoded;
+}
+
+dht::token fixed_shard_partitioner::get_token(const schema& s, partition_key_view key) const {
+    uint16_t shard_value = compute_shard(*key.begin()); // the first key component carries the target shard
+    std::array<uint64_t, 2> hash;
+    auto&& legacy = key.legacy_form(s);
+    utils::murmur_hash::hash3_x64_128(legacy.begin(), legacy.size(), 0, hash); // hashed over the key's whole legacy form
+    auto token = fixed_shard_partitioner::token_for_shard(shard_value, hash[0]); // only hash[0] feeds the token
+    fslog.trace("get_token: shard={}, token={}", shard_value, token);
+    return token;
+}
+
+dht::token fixed_shard_partitioner::get_token(const sstables::key_view& key) const {
+    return key.with_linearized([&](bytes_view v) { // v: the key bytes as a single bytes_view
+        auto comp = composite_view(v, true); // NOTE(review): the second argument presumably marks the key as compound - confirm
+        uint16_t shard_value = compute_shard(comp.begin()->first); // the first component carries the target shard
+        std::array<uint64_t, 2> hash;
+        utils::murmur_hash::hash3_x64_128(v, 0, hash); // hashed over all of the key bytes
+        auto token = fixed_shard_partitioner::token_for_shard(shard_value, hash[0]); // only hash[0] feeds the token
+        fslog.trace("get_token: shard={}, token={}", shard_value, token);
+        return token;
+    });
+}
+
+using registry = class_registrator<dht::i_partitioner, fixed_shard_partitioner>;
+static registry registrator(fixed_shard_partitioner::classname); // registered under the fully qualified class name
+static registry registrator_short_name("FixedShardPartitioner"); // and under the short name
+
+fixed_shard_sharder& fixed_shard_sharder::instance() {
+    static thread_local fixed_shard_sharder sharder; // one instance per thread, constructed on first use
+    return sharder;
+}
+
+fixed_shard_sharder::fixed_shard_sharder()
+    : static_sharder(smp::count, 0) // shard count is smp::count; NOTE(review): the 0 is presumably the ignore-msb-bits setting - confirm
+{
+}
+
+unsigned fixed_shard_sharder::shard_of(const dht::token& t) const {
+    if (t.is_minimum()) {
+        return dht::token::shard_of_minimum_token();
+    } else if (t.is_maximum()) {
+        return shard_count() - 1;
+    }
+    // A shard id encoded in the token that is not below shard_count() is clamped to the last shard.
+    const auto owner = std::min(fixed_shard_partitioner::shard_of(t), shard_count() - 1);
+    fslog.trace("shard_of({}) = {}", t, owner);
+    return owner;
+}
+
+std::optional<unsigned> fixed_shard_sharder::try_get_shard_for_reads(const dht::token& t) const {
+    return shard_of(t); // always engaged: shard_of returns a shard for every token
+}
+
+dht::shard_replica_set fixed_shard_sharder::shard_for_writes(const dht::token& t, std::optional<dht::write_replica_set_selector>) const {
+    // We don't support migrations of the data in raft tables for strongly consistent tables.
+    // When migrating a strongly consistent tablet, we'll need to move its metadata
+    // explicitly to the new shard along with its raft group data.
+    auto shard = try_get_shard_for_reads(t); // writes target the same shard that serves reads
+    if (!shard) {
+        return {};
+    }
+    return { *shard }; // a single-shard set; the selector argument is not used
+}
+
+dht::token fixed_shard_sharder::token_for_next_shard(const dht::token& t, shard_id shard, unsigned spans) const {
+    return token_for_next_shard_for_reads(t, shard, spans); // same result as the for-reads variant
+}
+
+dht::token fixed_shard_sharder::token_for_next_shard_for_reads(const dht::token& t, shard_id shard, unsigned spans) const {
+    // With the fixed_shard_partitioner, there's only one token range per shard, so spans > 1 always overflows.
+    if (spans > 1 || shard >= shard_count() || t.is_maximum()) {
+        return dht::maximum_token();
+    }
+
+    int64_t token_value = t.is_minimum() ? 0 : t.raw(); // the minimum token is compared as raw value 0
+    int64_t start = static_cast<int64_t>(shard) << fixed_shard_partitioner::shard_shift; // lowest raw token carrying this shard id
+    if (token_value < start) {
+        return dht::token(start);
+    }
+    return dht::maximum_token(); // t is already at or past the start of this shard's range
+}
+
+std::optional<dht::shard_and_token> fixed_shard_sharder::next_shard(const dht::token& t) const {
+    auto shard = try_get_shard_for_reads(t);
+    if (!shard || *shard + 1 >= shard_count()) { // t belongs to the last shard: nothing follows it
+        return std::nullopt;
+    }
+    auto next_shard = *shard + 1;
+    auto next_token = token_for_next_shard_for_reads(t, next_shard); // start of the following shard's range, or the maximum token
+    if (next_token.is_maximum()) {
+        return std::nullopt;
+    }
+    return dht::shard_and_token{next_shard, next_token};
+}
+
+std::optional<dht::shard_and_token> fixed_shard_sharder::next_shard_for_reads(const dht::token& t) const {
+    return next_shard(t); // same result as next_shard
+}
+
+} // namespace dht
--- a/dht/fixed_shard.hh
+++ b/dht/fixed_shard.hh
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+#include "dht/i_partitioner.hh"
+#include "dht/token-sharding.hh"
+
+class schema;
+
+namespace sstables {
+
+class key_view;
+
+}
+
+namespace dht {
+
+/// A partitioner mainly for Raft metadata tables for strongly consistent tables
+/// (raft_groups, raft_groups_snapshots, raft_groups_snapshot_config).
+///
+/// These tables have partition keys with the shard as the first column.
+/// The partitioner creates tokens that will be assigned to the shard specified
+/// in the partition key's first column.
+///
+/// Token encoding:
+///   [shard: 16 bits][hash: 48 bits]
+///
+/// To skip converting between signed and unsigned tokens (biasing), the top bit
+/// is always 0, so we can effectively use only 15 bits for the shard.
+/// This correlates with the limit enforced by the Raft tables' schema which uses
+/// a signed int (smallint) for the shard column, where allowing only non-negative
+/// shard values caps the shard id at 32767 ((1 << 15) - 1).
+///
+/// The lower 48 bits are a hash of the entire partition key.
+///
+/// This encoding is shard-count independent - the shard can be extracted by simple
+/// bit shifting regardless of how many shards exist in the cluster.
+struct fixed_shard_partitioner final : public dht::i_partitioner {
+    /// Number of high token bits reserved for the shard.
+    static constexpr unsigned shard_bits = 16;
+    /// Left-shift that moves a shard id into the high bits of a 64-bit token (64 - 16 = 48).
+    static constexpr unsigned shard_shift = 64 - shard_bits;
+    /// Largest shard id that can be encoded: the maximum of a signed 16-bit integer (32767).
+    static constexpr uint16_t max_shard = std::numeric_limits<int16_t>::max();
+    /// Mask selecting the low `shard_shift` bits of a token, i.e. the hash part.
+    static constexpr uint64_t hash_mask = (uint64_t(1) << shard_shift) - 1;
+    /// Partitioner class name; defined out of line.
+    static const sstring classname;
+
+    fixed_shard_partitioner() = default;
+    virtual const sstring name() const override;
+    /// Token for a partition key of schema `s`.
+    virtual dht::token get_token(const schema& s, partition_key_view key) const override;
+    /// Token for an sstable key view.
+    virtual dht::token get_token(const sstables::key_view& key) const override;
+
+    /// Builds a token from a shard id and hash bits.
+    /// NOTE(review): presumably follows the encoding described above; the definition is not in this header.
+    static dht::token token_for_shard(uint16_t shard, uint64_t hash_bits);
+    /// Shard for `token`.
+    /// NOTE(review): presumably decoded from the token's high bits; the definition is not in this header.
+    static unsigned shard_of(dht::token token);
+};
+
+/// A sharder for Raft metadata tables for strongly consistent tables (raft_groups,
+/// raft_groups_snapshots, raft_groups_snapshot_config).
+///
+/// These tables store raft group state for all tablets of strongly consistent tables.
+/// The sharder allows specifying the shard where the metadata should be located by
+/// including the shard id in the partition key.
+///
+/// The shard is encoded in the token by fixed_shard_partitioner. The sharder extracts
+/// the shard by decoding the token bits used for shard encoding.
+///
+/// We inherit from static_sharder because that's what we use for system tables.
+class fixed_shard_sharder : public dht::static_sharder {
+public:
+    /// Singleton instance for the raft tablet sharder.
+    static fixed_shard_sharder& instance();
+
+    fixed_shard_sharder();
+    virtual ~fixed_shard_sharder() = default;
+
+    /// Returns the shard for a token by extracting it from the token's high bits.
+    /// This overrides static_sharder::shard_of to use the bit-based encoding.
+    virtual unsigned shard_of(const dht::token& t) const override;
+
+    /// Read-side shard lookup; shard_for_writes() and next_shard() are built on top of it.
+    virtual std::optional<unsigned> try_get_shard_for_reads(const dht::token& t) const override;
+    /// Returns the shard reported by try_get_shard_for_reads() as the only write replica,
+    /// or an empty set when there is none. `sel` is ignored.
+    virtual dht::shard_replica_set shard_for_writes(const dht::token& t, std::optional<dht::write_replica_set_selector> sel) const override;
+    /// Forwards to token_for_next_shard_for_reads().
+    virtual dht::token token_for_next_shard(const dht::token& t, shard_id shard, unsigned spans = 1) const override;
+    /// Returns the first token of `shard`'s range if it lies strictly after `t`.
+    /// Returns the maximum token otherwise, and also when `spans > 1`,
+    /// `shard >= shard_count()` or `t` is the maximum token.
+    virtual dht::token token_for_next_shard_for_reads(const dht::token& t, shard_id shard, unsigned spans = 1) const override;
+    /// Returns the shard following the read shard of `t` together with its first token.
+    /// Returns nullopt when `t` has no read shard, when that shard is the last one,
+    /// or when the following shard has no range after `t`.
+    virtual std::optional<dht::shard_and_token> next_shard(const dht::token& t) const override;
+    /// Forwards to next_shard().
+    virtual std::optional<dht::shard_and_token> next_shard_for_reads(const dht::token& t) const override;
+};
+
+} // namespace dht
--- a/dist/common/kernel_conf/post_install.sh
+++ b/dist/common/kernel_conf/post_install.sh
@@ -31,6 +31,23 @@ EOS
    sysctl -p /etc/sysctl.d/99-scylla-perfevent.conf
 fi

+# Tune tcp_mem to max out at 3% of total system memory.
+# Seastar defaults to allocating 93% of physical memory. The kernel's default
+# allocation for TCP is ~9%. This adds up to 102%. Reduce the TCP allocation
+# to 3% to avoid OOM.
+PAGE_SIZE=$(getconf PAGE_SIZE)
+TOTAL_MEM_KB=$(sed -n 's/^MemTotal:[[:space:]]*\([0-9]*\).*/\1/p' /proc/meminfo)
+# Only tune when both inputs are positive integers. If MemTotal cannot be read
+# (e.g. /proc is not mounted) the arithmetic below would evaluate to 0 and we
+# would persist "tcp_mem = 0 0 0"; an empty PAGE_SIZE would divide by zero.
+if [ "${TOTAL_MEM_KB:-0}" -gt 0 ] 2>/dev/null && [ "${PAGE_SIZE:-0}" -gt 0 ] 2>/dev/null; then
+    TOTAL_MEM_BYTES=$((TOTAL_MEM_KB * 1024))
+    TCP_MEM_MAX=$((TOTAL_MEM_BYTES * 3 / 100))
+    # tcp_mem takes three values in pages: min, pressure and max.
+    # They are set to 1/2, 2/3 and all of the 3% budget respectively.
+    TCP_MEM_MAX_PAGES=$((TCP_MEM_MAX / PAGE_SIZE))
+    TCP_MEM_MID_PAGES=$((TCP_MEM_MAX * 2 / 3 / PAGE_SIZE))
+    TCP_MEM_MIN_PAGES=$((TCP_MEM_MAX / 2 / PAGE_SIZE))
+    cat << EOS > /etc/sysctl.d/99-scylla-tcp.conf
+# Scylla: limit TCP memory to 3% of total system memory
+net.ipv4.tcp_mem = $TCP_MEM_MIN_PAGES $TCP_MEM_MID_PAGES $TCP_MEM_MAX_PAGES
+EOS
+    # Best effort: applying the setting may fail (e.g. inside a container).
+    sysctl -p /etc/sysctl.d/99-scylla-tcp.conf || :
+fi
+
+
 if [ ! -d /run/systemd/system ]; then
    exit 0
 fi
--- a/dist/debian/control.template
+++ b/dist/debian/control.template
@@ -39,7 +39,7 @@ Description: debugging symbols for %{product}-server

 Package: %{product}-kernel-conf
 Architecture: any
-Depends: procps
+Depends: procps, sed
 Replaces: scylla-enterprise-kernel-conf (<< 2025.1.0~)
 Breaks: scylla-enterprise-kernel-conf (<< 2025.1.0~)
 Description: Scylla kernel tuning configuration
--- a/dist/debian/debian/scylla-kernel-conf.postrm
+++ b/dist/debian/debian/scylla-kernel-conf.postrm
@@ -6,6 +6,7 @@ case "$1" in
    purge|remove)
        if [ "$1" = "purge" ]; then
            rm -f /etc/sysctl.d/99-scylla-perfevent.conf
+            rm -f /etc/sysctl.d/99-scylla-tcp.conf
        fi
        ;;
 esac
--- a/dist/docker/redhat/build_docker.sh
+++ b/dist/docker/redhat/build_docker.sh
@@ -97,7 +97,9 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

 run microdnf clean all
 run microdnf --setopt=tsflags=nodocs -y update
-run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
+run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip cpio
+# Extract only systemctl binary from systemd package to avoid installing the whole systemd in the container.
+run bash -rc "microdnf download systemd && rpm2cpio systemd-*.rpm | cpio -idmv ./usr/bin/systemctl && rm -rf systemd-*.rpm"
 run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
 run pip3 install --no-cache-dir --prefix /usr supervisor
 run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
@@ -106,6 +108,8 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
 run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/sysconfig/scylla-server
+# Cleanup packages not needed in the final image and clean package manager cache to reduce image size.
+run bash -rc "microdnf remove -y cpio && microdnf clean all"

 run mkdir -p /opt/scylladb/supervisor
 run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -186,7 +186,7 @@ This package contains the main scylla configuration file.
 %package kernel-conf
 Group:          Applications/Databases
 Summary:        Scylla configuration package for the Linux kernel
-Requires:       kmod
+Requires:       kmod sed
 # tuned overwrites our sysctl settings
 Obsoletes:      tuned >= 2.11.0
 Provides:       scylla-enterprise-kernel-conf = %{version}-%{release}
@@ -220,6 +220,7 @@ fi
 %{_unitdir}/scylla-tune-sched.service
 /opt/scylladb/kernel_conf/*
 %ghost /etc/sysctl.d/99-scylla-perfevent.conf
+%ghost /etc/sysctl.d/99-scylla-tcp.conf


 %package node-exporter
--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -142,10 +142,6 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
 Alternator implements such requests by reading the entire top-level
 attribute a, modifying only a.b[3].c, and then writing back a.

-Currently, Alternator doesn't use Tablets. That's because Alternator relies
-on LWT (lightweight transactions), and LWT is not supported in keyspaces
-with Tablets enabled.
-
 ```{eval-rst}
 .. toctree::
    :maxdepth: 2
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -316,6 +316,17 @@ experimental:
    example, a single PutItem is represented by a REMOVE + MODIFY event,
    instead of just a single MODIFY or INSERT.
    <https://github.com/scylladb/scylla/issues/6930>
+  * Alternator Streams cannot always distinguish between INSERT and MODIFY
+    events - the distinction depends on whether the item existed before the
+    change. Alternator Streams may also produce spurious REMOVE or MODIFY
+    events when a non-existent item is deleted or when an item is set to the
+    same value it already had.
+    This incompatibility can be resolved by setting the configuration option
+    ``alternator_streams_increased_compatibility=true``, but this comes with
+    a performance penalty because Alternator needs to read the old value of
+    the item during data-modifying operations on tables with Alternator
+    Streams enabled. By default (``alternator_streams_increased_compatibility=false``),
+    this incompatibility remains.
    <https://github.com/scylladb/scylla/issues/6918>
  * In GetRecords responses, Alternator sets `eventSource` to
    `scylladb:alternator`, rather than `aws:dynamodb`, and doesn't set the
--- a/Show More
+++ b/Show More