Compare commits

...

238 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
d2ecf491c5 Initial plan 2025-11-19 11:28:59 +00:00
Patryk Jędrzejczak
e35ba974ce test: test_raft_recovery_stuck: ensure mutual visibility before using driver
Not waiting for nodes to see each other as alive can cause the driver to
fail the request sent in `wait_for_upgrade_state()`.

scylladb/scylladb#19771 already replaced concurrent restarts with
`ManagerClient.rolling_restart()`, but it missed this single place,
probably because we do concurrent starts here.

Fixes #27055

Closes scylladb/scylladb#27075
2025-11-19 05:54:12 +01:00
David Garcia
3f2655a351 docs: add liveness::MustRestart support
Closes scylladb/scylladb#27079
2025-11-18 15:28:55 +01:00
Szymon Wasik
f714876eaf Add documentation about lack of returning similarity distances
This patch adds the missing warning that returning the similarity
distance is not possible yet. Support for this will be added in the
next iteration.

Fixes #27086

It has to be backported to 2025.4, as this limitation exists in 2025.4.

Closes scylladb/scylladb#27096
2025-11-18 13:50:36 +01:00
Avi Kivity
f7413a47e4 sstables: writer: avoid recursion in variadic write()
Following 9b6ce030d0 ("sstables: remove quadratic (and possibly
exponential) compile time in parse()"), where we removed recursion
in reading, we do the same here for variadic write. This results
in a small reduction in compile time.

Note the problem isn't very bad here. This is tail-recursion, so likely
removed by the compiler during optimization, and we don't have additional
amplification due to future::then() double-compiling the ready-future
and unready-future paths. Still, better to avoid quadratic compile
times.

Closes scylladb/scylladb#27050
2025-11-18 08:17:17 +02:00
Botond Dénes
2ca66133a4 Revert "db/config: don't use RBNO for scaling"
This reverts commit 43738298be.

The reverted commit causes instability in dtests. Several non-gating dtests
started failing, as well as some gating ones; see #27047.

Closes scylladb/scylladb#27067

Fixes #27047
2025-11-18 08:17:17 +02:00
Botond Dénes
0dbad38eed Merge 'docs/dev/topology-over-raft: make various updates' from Patryk Jędrzejczak
The updates include:
- adding missing parts like topology states and table rows,
- documenting zero-token nodes,
- replacing the old recovery procedure with the new one.

Fixes #26412

Updates of internal docs (usually read on master) don't require
backporting.

Closes scylladb/scylladb#27022

* github.com:scylladb/scylladb:
  docs/dev/topology-over-raft: update the recovery section
  docs/dev/topology-over-raft: document zero-token nodes
  docs/dev/topology-over-raft: clarify the lack of tablet-specific states
  docs/dev/topology-over-raft: add the missing join_group0 state
  docs/dev/topology-over-raft: update the topology columns
2025-11-18 08:17:17 +02:00
Patryk Jędrzejczak
adaa0560d9 Merge 'Automatic cleanup improvements' from Gleb Natapov
This series allows an operator to reset the 'cleanup needed' flag if they have already cleaned up the node, so that automatic cleanup will not do it again. We also change 'nodetool cleanup' back to running cleanup on one node only (and resetting the 'cleanup needed' flag at the end), but the new '--global' option allows running cleanup simultaneously on all nodes that need it.

Fixes https://github.com/scylladb/scylladb/issues/26866

Backport to all supported versions, since the current automatic cleanup behaviour may create load during cluster resizing that the operator does not expect.

Closes scylladb/scylladb#26868

* https://github.com/scylladb/scylladb:
  cleanup: introduce "nodetool cluster cleanup" command to run cleanup on all dirty nodes in the cluster
  cleanup: Add RESTful API to allow reset cleanup needed flag
2025-11-18 08:17:17 +02:00
Pavel Emelyanov
02513ac2b8 alternator: Get feature service from proxy directly
The executor::add_stream_options() obtains a local database reference from
the proxy just to get the feature service from it.

A similar chain is used in executor::update_time_to_live().

It's shorter to get the features from the proxy itself.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#26973
2025-11-18 08:17:16 +02:00
Botond Dénes
514c1fc719 Merge 'db: batchlog_manager: update _last_replay only if all batches were replayed' from Aleksandra Martyniuk

Currently, if flushing hints falls within the repair cache timeout, then the flush_time is set to batchlog_manager::_last_replay. _last_replay is updated on each replay, even if some batches weren't replayed. Due to that, we risk data resurrection.

Update _last_replay only if all batches were replayed.

Fixes: https://github.com/scylladb/scylladb/issues/24415.

Needs backport to all live versions.

Closes scylladb/scylladb#26793

* github.com:scylladb/scylladb:
  test: extend test_batchlog_replay_failure_during_repair
  db: batchlog_manager: update _last_replay only if all batches were replayed
2025-11-18 08:17:16 +02:00
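The invariant in the commit above can be sketched in a few lines of Python (the names `BatchlogManager` and `replay_batches` are illustrative, not ScyllaDB's actual code): the replay start time is published as the new last-replay point only when every batch was replayed.

```python
import time

def replay_batches(batches, replay_one):
    """Replay all pending batches; return the new last-replay timestamp.

    The replay start time is recorded up front, but it is only published
    if *every* batch was replayed. If any batch fails, None is returned
    so the caller keeps the previous value and the next run retries.
    """
    started_at = time.time()
    all_replayed = True
    for batch in batches:
        if not replay_one(batch):
            all_replayed = False  # keep going, but don't advance last_replay
    return started_at if all_replayed else None

class BatchlogManager:
    def __init__(self):
        self.last_replay = 0.0

    def do_replay(self, batches, replay_one):
        new_ts = replay_batches(batches, replay_one)
        if new_ts is not None:  # only advance on a fully successful pass
            self.last_replay = new_ts
```

With this shape, a partially failed replay leaves `last_replay` untouched, which is what prevents the resurrection window described above.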
Botond Dénes
d54d409a52 Merge 'audit: write out to both table and syslog' from Dario Mirovic
This patch adds support for multiple audit log outputs.

If only one audit log output is enabled, the behavior does not change.
If multiple audit log outputs are enabled, then the `audit_composite_storage_helper` class is used. It has a collection
of `storage_helper` objects.

Performance testing shows that read query throughput and auth request throughput remain consistent even at high reactor utilization, while read query latency increases slightly.

Read query ops = 60k/s
AUTH ops = 200/s

| Audit Mode   | QUERY latency (p99) | Δ% vs none |
|--------------|---------------------|------------|
| none         | 777                 | 0          |
| table        | 801                 | +3.09%     |
| syslog       | 803                 | +3.35%     |
| table,syslog | 818                 | +5.28%     |

Read query ops = 50k/s
AUTH ops = 200/s

| Audit Mode   | QUERY latency (p99) | Δ% vs none |
|--------------|---------------------|------------|
| none         | 643                 | 0          |
| table        | 647                 | +0.62%     |
| syslog       | 648                 | +0.78%     |
| table,syslog | 656                 | +2.02%     |

Detailed performance results are in the following Confluence document: [Audit performance impact test](https://scylladb.atlassian.net/wiki/spaces/RND/pages/148308005/Audit+performance+impact+test)

Fixes #26022

Backport:

The decision is to not backport for now. After making sure it works on the latest release, and if there is a need, we can do it.

Closes scylladb/scylladb#26613

* github.com:scylladb/scylladb:
  test: dtest: audit_test.py: add AuditBackendComposite
  test: dtest: audit_test.py: group logs in dict per audit mode
  audit: write out to both table and syslog
  audit: move storage helper creation from `audit::start` to `audit::audit`
  audit: fix formatting in `audit::start_audit`
  audit: unify `create_audit` and `start_audit`
2025-11-17 15:04:15 +02:00
Gleb Natapov
0f0ab11311 cleanup: introduce "nodetool cluster cleanup" command to run cleanup on all dirty nodes in the cluster
97ab3f6622 changed "nodetool cleanup" (without arguments) to run
cleanup on all dirty nodes in the cluster. This was somewhat unexpected,
so this patch changes it back to run cleanup on the target node only (and
reset "cleanup needed" flag afterwards) and it adds "nodetool cluster
cleanup" command that runs the cleanup on all dirty nodes in the
cluster.
2025-11-17 15:00:51 +02:00
Piotr Dulikowski
c29efa2cdb Merge 'vector_search: Improve vector-store health checking' from Karol Nowacki
A Vector Store node is now considered down if it returns an HTTP 500
server error. This can happen, for example, if the node fails to
connect to the database or has not completed its initial full scan.

The logic for marking a node as 'up' is also enhanced. A node is now
only considered up when its status is explicitly 'SERVING'.

Fixes: VECTOR-187

Backport to 2025.4 as this feature is expected to be available in 2025.4.

Closes scylladb/scylladb#26413

* github.com:scylladb/scylladb:
  vector_search: Improve vector-store health checking
  vector_search: Move response_content_to_sstring to utils.hh
  vector_search: Add unit tests for client error handling
  vector_search: Enable mocking of status requests
  vector_search: Extract abort_source_timeout and repeat_until
  vector_search: Move vs_mock_server to dedicated files
2025-11-17 12:16:07 +01:00
Patryk Jędrzejczak
b5f38e4590 docs/dev/topology-over-raft: update the recovery section
We have the new recovery procedure now, but this doc hasn't been
updated. It still describes the old recovery procedure.

For comparison, external docs can be found here:
https://docs.scylladb.com/manual/master/troubleshooting/handling-node-failures.html#manual-recovery-procedure

Fixes #26412
2025-11-17 10:40:23 +01:00
Patryk Jędrzejczak
785a3302e6 docs/dev/topology-over-raft: document zero-token nodes
The topology transitions are a bit different for zero-token nodes, which
is worth mentioning.
2025-11-17 10:40:23 +01:00
Patryk Jędrzejczak
d75558e455 docs/dev/topology-over-raft: clarify the lack of tablet-specific states
Tablets are never mentioned before this part of the doc, so it may be
confusing why some topology states are missing.
2025-11-17 10:40:23 +01:00
Patryk Jędrzejczak
c362ea4dcb docs/dev/topology-over-raft: add the missing join_group0 state
This state was added as a part of the join procedure, and we didn't
update this part of the doc.
2025-11-17 10:40:23 +01:00
Patryk Jędrzejczak
182d416949 docs/dev/topology-over-raft: update the topology columns
Some of the columns were added, but the doc wasn't updated.

`upgrade_state` was updated in only one of the two places.

`ignore_nodes` was changed to a static column.
2025-11-17 10:40:20 +01:00
Piotr Dulikowski
f0039381d2 Merge 'db/view/view_building_worker: support staging sstables intra-node migration and tablet merge' from Michał Jadwiszczak
This PR fixes staging sstables handling by the view building coordinator in the case of intra-node tablet migration or tablet merge.

To support tablet merge, the worker stores the sstables grouped only by `table_id`, instead of by the `(table_id, last_token)` pair.
There shouldn't be that many staging sstables, so selecting the relevant ones for each `process_staging` task is fine.
For the intra-node migration support, the patch adds methods to load migrated sstables on the destination shard and to clean them up on the source shard.

The patch should be backported to 2025.4

Fixes https://github.com/scylladb/scylladb/issues/26244

Closes scylladb/scylladb#26454

* github.com:scylladb/scylladb:
  service/storage_service: migrate staging sstables in view building worker during intra-node migration
  db/view/view_building_worker: support sstables intra-node migration
  db/view_building_worker: fix indent
  db/view/view_building_worker: don't organize staging sstables by last token
2025-11-17 08:53:19 +01:00
Karol Nowacki
7f45f15237 vector_search: Improve vector-store health checking
A Vector Store node is now considered down if it returns an HTTP 5xx status.
This can happen, for example, if the node fails to
connect to the database or has not completed its initial full scan.

The logic for marking a node as 'up' is also enhanced. A node is now
only considered up when its status is 'SERVING'.
2025-11-17 06:21:31 +01:00
Karol Nowacki
5c30994bc5 vector_search: Move response_content_to_sstring to utils.hh
Move the response_content_to_sstring utility function from
vector_store_client.cc to utils.hh to enable reuse across
multiple files.

This refactoring prepares for the upcoming `client.cc` implementation
that will also need this functionality.
2025-11-17 06:21:31 +01:00
Karol Nowacki
4bbba099d7 vector_search: Add unit tests for client error handling
Introduce dedicated unit tests for the client class to verify existing
functionality and serve as regression tests.
These tests ensure that invalid client requests do not cause nodes to
be marked as down.
2025-11-17 06:21:31 +01:00
Karol Nowacki
cb654d2286 vector_search: Enable mocking of status requests
Extend the mock server to allow inspecting incoming status requests and
configuring their responses.

This enables client unit tests to simulate various server behaviors,
such as handling node failures and backoff logic.
2025-11-17 06:21:31 +01:00
Karol Nowacki
f665564537 vector_search: Extract abort_source_timeout and repeat_until
The `abort_source_timeout` and `repeat_until` functions are moved to
the shared utility header `test/vector_search/utils.hh`.

This allows them to be reused by upcoming `client` unit tests, avoiding
code duplication.
2025-11-17 06:21:31 +01:00
Karol Nowacki
ee3b83c9b0 vector_search: Move vs_mock_server to dedicated files
The mock server utility is extracted into its own files so it can be
reused by future `client` unit tests.
2025-11-17 06:21:30 +01:00
Artsiom Mishuta
696596a9ef test.py: shutdown ManagerClient only in current loop
In Python 3.14 there is a stricter policy regarding asyncio loops,
which means we cannot close clients from different loops. This change
ensures that we close only the client in the current loop.
Closes scylladb/scylladb#26911
2025-11-16 19:19:46 +02:00
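The loop-affinity check described in the commit above can be sketched as follows (the `ManagerClient` and `shutdown_clients` names here are illustrative stand-ins, not the real test harness classes): each client remembers the loop it was created on, and shutdown skips clients bound to a different loop.

```python
import asyncio

class ManagerClient:
    """Illustrative stand-in for the test harness client (not the real class)."""
    def __init__(self):
        # remember the loop the client was created on
        self.loop = asyncio.get_running_loop()
        self.closed = False

    async def close(self):
        self.closed = True

async def shutdown_clients(clients):
    """Close only clients that belong to the currently running loop.

    Awaiting close() on a client bound to a *different* loop raises in
    newer Python, so such clients are simply skipped here.
    """
    current = asyncio.get_running_loop()
    for client in clients:
        if client.loop is current:
            await client.close()
```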
Jenkins Promoter
3672715211 Update pgo profiles - x86_64 2025-11-16 11:42:41 +02:00
Jenkins Promoter
41933b3f5d Update pgo profiles - aarch64 2025-11-15 05:27:38 +02:00
Botond Dénes
8579e20bd1 Merge 'Enable digest+checksum verification for streaming/repair' from Taras Veretilnyk
This PR enables integrity check of both checksum and digest for repair/streaming.
In the past, streaming readers only verified the checksum of compressed SSTables.

This change extends the checks to include the digest and the checksum (CRC) for both compressed and uncompressed SSTables. These additional checks require reading the digest and CRC components from disk, which may cause some I/O overhead. For uncompressed SSTables, this involves loading and computing checksums and the digest from the data, while for compressed SSTables - where checksums are already verified inline - the only extra cost is reading and verifying the digest. If the reader range doesn't cover the full SSTable, the digest is not loaded and the check is skipped.

To support testing of these changes, a new option was added to the random_mutation_generator that allows disabling compression.
Several new test cases were added to verify that the repair_reader correctly detects corruption. These tests corrupt digest or data component of an SSTable and confirm that the system throws the expected `malformed_sstable_exception`.

Backport is not required; it is an improvement.

Refs #21776

Closes scylladb/scylladb#26444

* github.com:scylladb/scylladb:
  boost/repair_test: add repair reader integrity verification test cases
  test/lib: allow to disable compression in random_mutation_generator
  sstables: Skip checksum and digest reads for unlinked SSTables
  table: enable integrity checks for streaming reader
  table: Add integrity option to table::make_sstable_reader()
  sstables: Add integrity option to create_single_key_sstable_reader
2025-11-14 18:00:33 +02:00
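A simplified model of the digest check above, in Python (the function names and the use of CRC32 as the digest are illustrative; the real implementation lives in ScyllaDB's sstable reader): the digest covers the whole file, so it is only verified when the reader consumed the full range.

```python
import zlib

class MalformedSSTableError(Exception):
    pass

def verify_digest(data: bytes, stored_digest: int) -> None:
    """Compare the CRC32 of the full data against the stored digest."""
    actual = zlib.crc32(data) & 0xFFFFFFFF
    if actual != stored_digest:
        raise MalformedSSTableError(
            f"digest mismatch: stored {stored_digest:#010x}, computed {actual:#010x}")

def read_with_integrity(data: bytes, stored_digest: int, full_range: bool) -> bytes:
    # If the reader range doesn't cover the full sstable, the digest
    # cannot be computed over the whole file, so the check is skipped.
    if full_range:
        verify_digest(data, stored_digest)
    return data
```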
Benny Halevy
f9ce98384a scylla-sstable: correctly dump sharding_metadata
This patch fixes two issues in one go:

First, sstables::load currently clears the sharding metadata
(via open_data()), so scylla-sstable always prints
an empty array for it.

Second, printing token values would generate invalid JSON,
as they are currently printed as binary bytes; they
should be printed simply as numbers, as we do elsewhere,
for example for the first and last keys.

Fixes #26982

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#26991
2025-11-14 17:55:41 +02:00
Aleksandra Martyniuk
e3dcb7e827 test: extend test_batchlog_replay_failure_during_repair
Modify test_batchlog_replay_failure_during_repair to also check
that there isn't data resurrection if flushing hints falls within
the repair cache timeout.
2025-11-14 14:18:07 +01:00
Pavel Emelyanov
1c9c4c8c8c Merge 'service: attach storage_service to migration_manager using pluggable' from Marcin Maliszkiewicz
Migration manager depends on storage service. For instance,
it has a reload_schema_in_bg background task which calls
_ss.local() so it expects that storage service is not stopped
before it stops.

To solve this we use permit approach, and during storage_service
stop:
- we ignore *new* code execution in migration_manager which would use
  storage_service
- but wait with storage_service shutdown until all *existing*
  executions are done

Fixes scylladb/scylladb#26734

Backport: not needed. The problem has existed for a very long time; the code restructure in https://github.com/scylladb/scylladb/commit/389afcd (and following commits) made
it hit more often, as _ss was called earlier, but that restructure hasn't been released yet.

Closes scylladb/scylladb#26779

* github.com:scylladb/scylladb:
  service: attach storage_service to migration_manager using pluggable
  service: migration_manager: coroutinize merge_schema_from
  service: migration_manager: coroutinize reload_schema
2025-11-14 15:14:28 +03:00
Piotr Dulikowski
2ccc94c496 Merge 'topology_coordinator: include joining node in barrier' from Michael Litvak
Previously, only nodes in the 'normal' state and decommissioning nodes
were included in the set of nodes participating in barrier and
barrier_and_drain commands. Joining nodes were not included because they
don't coordinate requests, given that their cql port is closed.

However, joining nodes may receive mutations from other nodes, for which
they may generate and coordinate materialized view updates. If their
group0 state is not synchronized it could cause lost view updates.
For example:

1. On the topology coordinator, the join completes and the joining node
   becomes normal, but the joining node's state lags behind. Since it's
   not synchronized by the barrier, it could be in an old state such as
   `write_both_read_old`.
2. A normal node coordinates a write and sends it to the new node as the
   new replica.
3. The new node applies the base mutation but doesn't generate a view
   update for it, because it calculates the base-view pairing according
   to its own state and replication map, and determines that it doesn't
   participate in the base-view pairing.

Therefore, since the joining node participates as a coordinator for view
updates, it should be included in these barriers as well. This ensures
that before the join completes, the joining node's state is
`write_both_read_new`, where it does generate view updates.

Fixes https://github.com/scylladb/scylladb/issues/26976

backport to previous versions since it fixes a bug in MV with vnodes

Closes scylladb/scylladb#27008

* github.com:scylladb/scylladb:
  test: add mv write during node join test
  topology_coordinator: include joining node in barrier
2025-11-14 12:41:16 +01:00
Patryk Jędrzejczak
1141342c4f Merge 'topology: refactor excluded nodes' from Petr Gusev
This PR refactors excluded-nodes handling for tablets and topology. For tablets, a dedicated variable `topology::excluded_tablet_nodes` is introduced; for topology operations, the method get_excluded_nodes() is inlined into topology_coordinator and renamed to `get_excluded_nodes_for_topology_request`.

The PR improves code readability and efficiency; there are no behavior changes.

backport: this is a refactoring/optimization, no need to backport

Closes scylladb/scylladb#26907

* https://github.com/scylladb/scylladb:
  topology_coordinator: drop unused exec_global_command overload
  topology_coordinator: rename get_excluded_nodes -> get_excluded_nodes_for_topology_request
  topology_state_machine: inline get_excluded_nodes
  messaging_service: simplify and optimize ban_host
  storage_service: topology_state_load: extract topology variable
  topology_coordinator: excluded_tablet_nodes -> ignored_nodes
  topology_state_machine: add excluded_tablet_nodes field
2025-11-14 11:52:00 +01:00
Piotr Dulikowski
68407a09ed Merge 'vector_store_client: Add support for failed-node backoff' from Karol Nowacki
vector_search: Add backoff for failed nodes

Introduces logic to mark nodes that fail to answer an ANN request as
"down". Down nodes are omitted from further requests until they
successfully respond to a health check.

Health checks for down nodes are performed in the background using the
`status` endpoint, with an exponential backoff retry policy ranging
from 100ms to 20s.

Client list management is moved to separate files (clients.cc/clients.hh)
to improve code organization and modularity.

References: VECTOR-187.

Backport to 2025.4 as this feature is expected to be available in 2025.4.

Closes scylladb/scylladb#26308

* github.com:scylladb/scylladb:
  vector_search: Set max backoff delay to 2x read request timeout
  vector_search: Report status check exception via on_internal_error_noexcept
  vector_search: Extract client management into dedicated class
  vector_search: Add backoff for failed clients
  vector_search: Make endpoint available
  vector_search: Use std::expected for low-level client errors
  vector_search: Extract client class
2025-11-14 11:49:18 +01:00
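The retry policy described above (exponential backoff from 100ms up to 20s) can be sketched as a small generator; this is an illustrative model, not ScyllaDB code, and the cap is a parameter so it could equally be tied to, for example, twice the read request timeout, as a later commit in this log does.

```python
def backoff_delays(initial=0.1, cap=20.0, factor=2.0):
    """Yield exponentially growing retry delays, clamped at a cap.

    Defaults match the policy described above: 100ms initial delay,
    doubling on each failed health check, never exceeding 20s.
    """
    delay = initial
    while True:
        yield delay
        delay = min(delay * factor, cap)
```

A caller would sleep for each yielded delay between health checks of a down node, and abandon the iterator as soon as the node responds with 'SERVING'.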
Piotr Dulikowski
833b824905 Merge 'service/qos: Fall back to default scheduling group when using maintenance socket' from Dawid Mędrek
The service level controller relies on `auth::service` to collect
information about roles and the relation between them and the service
levels (those attached to them). Unfortunately, the service level
controller is initialized way earlier than `auth::service` and so we
had to prevent potential invalid queries of user service levels
(cf. 46193f5e79).

Unfortunately, that came at a price: it made the maintenance socket
incompatible with the current implementation of the service level
controller. The maintenance socket starts early, before the
`auth::service` is fully initialized and registered, and is exposed
almost immediately. If the user attempts to connect to Scylla via the
maintenance socket within this time window, one of the things that
will happen is choosing the right service level for the connection.
Since the `auth::service` is not registered, Scylla will fail an
assertion and crash.

A similar scenario occurs when using maintenance mode. The maintenance
socket is how the user communicates with the database, and we're not
prepared for that either.

To avoid unnecessary crashes, we add new branches if the passed user is
absent or if it corresponds to the anonymous role. Since the role
corresponding to a connection via the maintenance socket is the anonymous
role, that solves the problem.

Some accesses to `auth::service` are not affected and we do not modify
those.

Fixes scylladb/scylladb#26816

Backport: yes. This is a fix of a regression.

Closes scylladb/scylladb#26856

* github.com:scylladb/scylladb:
  test/cluster/test_maintenance_mode.py: Wait for initialization
  test: Disable maintenance mode correctly in test_maintenance_mode.py
  test: Fix keyspace in test_maintenance_mode.py
  service/qos: Do not crash Scylla if auth_integration absent
2025-11-14 11:12:28 +01:00
Botond Dénes
43738298be db/config: don't use RBNO for scaling
Remove bootstrap and decommission from allowed_repair_based_node_ops.
Using RBNO over streaming for these operations has no benefits, as they
are not exposed to the out-of-date replica problem that replace,
removenode and rebuild are.
On top of that, RBNO is known to have problems with empty user tables.
Using streaming for bootstrap and decommission is safe and faster
than RBNO in all conditions, especially when the table is small.

One test needs adjustment as it relies on RBNO being used for all node
ops.

Fixes: #24664

Closes scylladb/scylladb#26330
2025-11-14 13:03:50 +03:00
Piotr Dulikowski
43506e5f28 Merge 'db/view: Add backoff when RPC fails' from Dawid Mędrek
The view building coordinator manages the process by sending RPC
requests to all nodes in the cluster, instructing them what to do.
If processing that message fails, the coordinator decides if it
wants to retry it or (temporarily) abandon the work.

An example of the latter scenario could be if one of the target nodes
dies and any attempts to communicate with it would fail.

Unfortunately, the current approach to it is not perfect and may result
in a storm of warnings, effectively clogging the logs. As an example,
take a look at scylladb/scylladb#26686: the gossiper failed to mark
one of the dead nodes as DOWN fast enough, and it resulted in a warning storm.

To prevent situations like that, we implement a form of backoff.
If processing an RPC message fails, we postpone finishing the task for
a second. That should reduce the number of messages in the logs and avoid
retries that are likely to fail as well.

We provide a reproducer test.

Fixes scylladb/scylladb#26686

Backport: impact on the user. We should backport it to 2025.4.

Closes scylladb/scylladb#26729

* github.com:scylladb/scylladb:
  test/cluster/mv: Clean up test_backoff_when_node_fails_task_rpc
  db/view/view_building_coordinator: Rate limit logging failed RPC
  db/view: Add backoff when RPC fails
2025-11-14 10:17:57 +01:00
Piotr Dulikowski
308c5d0563 Merge 'cdc: set column drop timestamp in the future' from Michael Litvak
When dropping a column from a CDC log table, set the column drop
timestamp several seconds into the future.

If a value is written to a column concurrently with dropping that
column, the value's timestamp may be after the column drop timestamp. If
this value is also flushed to an SSTable, the SSTable would be
corrupted, because it considers the column missing after the drop
timestamp and doesn't allow values for it.

While this issue affects general tables, it especially impacts CDC tables
because this scenario can occur when writing to a table with CDC preimage
enabled while dropping a column from the base table. This happens even if
the base mutation doesn't write to the dropped column, because CDC log
mutations can generate values for a column even if the base mutation doesn't.
For general tables, this issue can be avoided by simply not writing to a
column while dropping it.

We fix this for the more problematic case of CDC log tables by setting
the column drop timestamp several seconds into the future, ensuring that
writes concurrent with column drops are much less likely to have
timestamps greater than the column drop timestamp.

Fixes https://github.com/scylladb/scylladb/issues/26340

the issue affects all previous releases, backport to improve stability

Closes scylladb/scylladb#26533

* github.com:scylladb/scylladb:
  test: test concurrent writes with column drop with cdc preimage
  cdc: check if recreating a column too soon
  cdc: set column drop timestamp in the future
  migration_manager: pass timestamp to pre_create
2025-11-14 08:52:34 +01:00
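The timestamp reasoning above can be made concrete with a small sketch (the margin constant and function names are illustrative, not ScyllaDB's values): a write racing with the drop uses roughly the current time as its write timestamp, so pushing the drop timestamp several seconds into the future keeps such writes below it.

```python
import time

DROP_TIMESTAMP_MARGIN_US = 5_000_000  # illustrative "several seconds" margin

def now_us():
    return int(time.time() * 1_000_000)

def column_drop_timestamp():
    """Place the drop timestamp several seconds in the future, so writes
    that race with the drop (and use the current time as their write
    timestamp) still fall *before* the drop and remain valid."""
    return now_us() + DROP_TIMESTAMP_MARGIN_US

def write_is_shadowed_by_drop(write_ts_us, drop_ts_us):
    # A value is only considered dropped if it was written after the
    # drop timestamp; with a future drop timestamp, concurrent writes
    # are not, which avoids the corrupted-SSTable case described above.
    return write_ts_us > drop_ts_us
```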
Marcin Maliszkiewicz
958d04c349 service: attach storage_service to migration_manager using pluggable
Migration manager depends on storage service. For instance,
it has a reload_schema_in_bg background task which calls
_ss.local() so it expects that storage service is not stopped
before it stops.

To solve this we use permit approach, and during storage_service
stop:
- we ignore *new* code execution in migration_manager which'd use
  storage_service
- but wait with storage_service shutdown until all *existing*
  executions are done

Fixes scylladb/scylladb#26734
2025-11-14 08:50:19 +01:00
Marcin Maliszkiewicz
cf9b2de18b service: migration_manager: coroutinize merge_schema_from
It's needed to easily keep alive the pluggable storage_service
permit in a following commit.
2025-11-14 08:50:19 +01:00
Marcin Maliszkiewicz
5241e9476f service: migration_manager: coroutinize reload_schema
It's needed to easily keep alive the pluggable storage_service
permit in a following commit.
2025-11-14 08:50:18 +01:00
Tomasz Grabiec
27e74fa567 tools: scylla-sstable: Print filename and tablet ids on error
Since the error is not printed to stdout, when working with multiple
files we don't know which sstable the error is associated with.

Closes scylladb/scylladb#27009
2025-11-14 09:47:38 +02:00
Karol Nowacki
1972fb315b vector_search: Set max backoff delay to 2x read request timeout
The maximum backoff delay for status checking now depends on the
`read_request_timeout_in_ms` configuration option. The delay is set
to twice the value of this parameter.
2025-11-14 08:05:21 +01:00
Karol Nowacki
097c0f9592 vector_search: Report status check exception via on_internal_error_noexcept
This exception should only occur due to internal errors, not client or external issues.
If triggered, it indicates an internal problem. Therefore, we notify about this exception
using on_internal_error_noexcept.
2025-11-14 08:05:21 +01:00
Karol Nowacki
940ed239b2 vector_search: Extract client management into dedicated class
Refactor client list management by moving it to separate files
(clients.cc/clients.hh) to improve code organization and modularity.
2025-11-14 08:05:21 +01:00
Karol Nowacki
009d3ea278 vector_search: Add backoff for failed clients
Introduces logic to mark clients that fail to answer an ANN request as
"down". Down clients are omitted from further requests until they
successfully respond to a health check.

Health checks for down clients are performed in the background using the
`status` endpoint, with an exponential backoff retry policy ranging
from 100ms to 20s.
2025-11-14 07:38:01 +01:00
Karol Nowacki
190459aefa vector_search: Make endpoint available
In preparation for a new feature, the tests need the ability to make
an endpoint that was previously unavailable, available again.

This is achieved by adding an `unavailable_server::take_socket` method.
This method allows transferring the listening socket from the
`unavailable_server` to the `mock_vs_server`, ensuring they both
operate on the same endpoint.
2025-11-14 07:23:40 +01:00
Karol Nowacki
49a177b51e vector_search: Use std::expected for low-level client errors
To unify error handling, the low-level client methods now return
`std::expected` instead of throwing exceptions. This allows for
consistent and explicit error propagation from the client up to the
caller.

The relevant error types have been moved to a new `vector_search/error.hh`
header to centralize their definitions.
2025-11-14 07:23:40 +01:00
Karol Nowacki
62f8b26bd7 vector_search: Extract client class
This refactoring extracts low-level client logic into a new, dedicated
`client` class. The new class is responsible for connecting to the
server and serializing requests.

This change prepares for extending the `vector_store_client` to check
node status via the `api/v1/status` endpoint.

`/ann` Response deserialization remains in the `vector_store_client` as it
is schema-dependent.
2025-11-14 07:23:40 +01:00
Lakshmi Narayanan Sreethar
3eba90041f sstables: prevent oversized allocation when parsing summary positions
During sstable summary parsing, the entire header was read into a single
buffer upfront and then parsed to obtain the positions. If the header
was too large, it could trigger oversized allocation warnings.

This commit updates the parse method to read one position at a time from
the input stream instead of reading the entire header at once. Since
`random_access_reader` already maintains an internal buffer of 128 KB,
there is no need to pre-read the entire header upfront.

Fixes #24428

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#26846
2025-11-14 06:40:53 +02:00
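The streaming approach above can be sketched in Python (the 4-byte little-endian field width is illustrative; the real on-disk format differs): instead of slurping the whole header into one potentially huge allocation, each position is read from the buffered stream individually, so the only allocation is the stream's fixed internal buffer.

```python
import io
import struct

def parse_summary_positions(stream: io.BufferedIOBase, count: int):
    """Read `count` little-endian 32-bit positions one at a time.

    Reading field by field keeps per-read allocations tiny even when
    `count` is large, at the cost of more (buffered, hence cheap) reads.
    """
    positions = []
    for _ in range(count):
        raw = stream.read(4)
        if len(raw) != 4:
            raise EOFError("truncated summary header")
        positions.append(struct.unpack("<I", raw)[0])
    return positions
```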
Dawid Mędrek
393f1ca6e6 test/cluster/mv: Clean up test_backoff_when_node_fails_task_rpc
After the changes in the test, we clean up its syntax. It boils
down to very simple modifications.
2025-11-13 17:57:33 +01:00
Dawid Mędrek
acd9120181 db/view/view_building_coordinator: Rate limit logging failed RPC
The view building coordinator sends tasks in form of RPC messages
to other nodes in the cluster. If processing that RPC fails, the
coordinator logs the error.

However, since tasks are per replica (so per shard), it may happen
that we end up with a large number of similar messages, e.g. if the
target node has died, because every shard will fail to process its
RPC message. It might become even worse in the case of a network
partition.

To mitigate that, we rate limit the logging to at most one message per second.

We extend the test `test_backoff_when_node_fails_task_rpc` so that
it allows the view building coordinator to have multiple tablet
replica targets. If not for rate limiting the warning messages,
we would start getting more of them, potentially leading to
a test failure.
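A minimal sketch of the per-message rate limiting described above, assuming a 1-second window; the class and method names are illustrative, not Scylla's actual logger API:

```cpp
#include <cassert>
#include <chrono>
#include <iostream>
#include <optional>
#include <string>

// Illustrative sketch: suppress repeated warnings inside a fixed window.
class rate_limited_logger {
    using clock = std::chrono::steady_clock;
    std::optional<clock::time_point> _last; // empty until the first emit
    clock::duration _period;
public:
    explicit rate_limited_logger(clock::duration period) : _period(period) {}
    // Returns true if the warning was emitted, false if suppressed.
    bool warn(const std::string& msg, clock::time_point now) {
        if (_last && now - *_last < _period) {
            return false; // still inside the window: drop the duplicate
        }
        _last = now;
        std::cout << "WARN: " << msg << '\n';
        return true;
    }
};
```

Passing `now` explicitly keeps the limiter deterministic and easy to test; many shards failing at once then produce one warning per second instead of one per shard.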
2025-11-13 17:57:23 +01:00
Dawid Mędrek
4a5b1ab40a db/view: Add backoff when RPC fails
The view building coordinator manages the process of view building
by sending RPC requests to all nodes in the cluster, instructing them
what to do. If processing that message fails, the coordinator decides
if it wants to retry it or (temporarily) abandon the work.

An example of the latter scenario could be if one of the target nodes
dies and any attempts to communicate with it would fail.

Unfortunately, the current approach to it is not perfect and may result
in a storm of warnings, effectively clogging the logs. As an example,
take a look at scylladb/scylladb#26686: the gossiper failed to mark
one of the dead nodes as DOWN fast enough, and it resulted in a warning storm.

To prevent situations like that, we implement a form of backoff.
If processing an RPC message fails, we postpone finishing the task for
a second. That should reduce the number of messages in the logs and avoid
retries that are likely to fail as well.

We provide a reproducer test: it fails before this commit and succeeds
with it.

Fixes scylladb/scylladb#26686
2025-11-13 17:55:41 +01:00
Michał Hudobski
7646dde25b select_statement: add a warning about unsupported paging for vs queries
Currently we do not support paging for vector search queries.
When we get such a query with paging enabled we ignore the paging
and return the entire result. This behavior can be confusing for users,
as there is no warning about paging not working with vector search.
This patch fixes that by adding a warning to the result of ANN queries
with paging enabled.

Closes scylladb/scylladb#26384
2025-11-13 18:47:05 +02:00
Michael Litvak
e85051068d test: test concurrent writes with column drop with cdc preimage
add a test that writes to a table concurrently with dropping a column,
where the table has CDC enabled with preimage.

the test reproduces issue #26340 where this results in a malformed
sstable.
2025-11-13 17:00:08 +01:00
Michael Litvak
039323d889 cdc: check if recreating a column too soon
When we drop a column from a CDC log table, we set the column drop
timestamp a few seconds into the future. This can cause unexpected
problems if a user tries to recreate a CDC column too soon, before
the drop timestamp has passed.

To prevent this issue, when creating a CDC column we check its
creation timestamp against the existing drop timestamp, if any, and
fail with an informative error if the recreation attempt is too soon.
2025-11-13 17:00:07 +01:00
Michael Litvak
48298e38ab cdc: set column drop timestamp in the future
When dropping a column from a CDC log table, set the column drop
timestamp several seconds into the future.

If a value is written to a column concurrently with dropping that
column, the value's timestamp may be after the column drop timestamp. If
this value is also flushed to an SSTable, the SSTable would be
corrupted, because it considers the column missing after the drop
timestamp and doesn't allow values for it.

While this issue affects general tables, it especially impacts CDC tables
because this scenario can occur when writing to a table with CDC preimage
enabled while dropping a column from the base table. This happens even if
the base mutation doesn't write to the dropped column, because CDC log
mutations can generate values for a column even if the base mutation doesn't.
For general tables, this issue can be avoided by simply not writing to a
column while dropping it.

We fix this for the more problematic case of CDC log tables by setting
the column drop timestamp several seconds into the future, ensuring that
writes concurrent with column drops are much less likely to have
timestamps greater than the column drop timestamp.

Fixes scylladb/scylladb#26340
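The timestamp reasoning above can be modeled in a few lines. This is a toy model, not the actual Scylla code: timestamps are microseconds since the epoch as in CQL write timestamps, and the names and the exact grace value are assumptions:

```cpp
#include <cassert>
#include <cstdint>

using api_timestamp = int64_t; // microseconds, as in CQL write timestamps

// "Several seconds" of grace; the exact value is an assumption here.
constexpr api_timestamp drop_grace_us = 5000000;

// A value written at write_ts is only valid if it precedes the column's
// drop timestamp; otherwise the sstable would consider the column dropped
// and the value would corrupt it.
bool value_survives_drop(api_timestamp write_ts, api_timestamp drop_ts) {
    return write_ts < drop_ts;
}

// Setting the drop timestamp ahead of "now" makes concurrent writes,
// whose timestamps cluster around now, fall safely before it.
api_timestamp make_drop_timestamp(api_timestamp now_us) {
    return now_us + drop_grace_us;
}
```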
2025-11-13 16:59:43 +01:00
Michael Litvak
eefae4cc4e migration_manager: pass timestamp to pre_create
pass the write timestamp as parameter to the
on_pre_create_column_families notification.
2025-11-13 16:59:43 +01:00
Piotr Dulikowski
7f482c39eb Merge '[schema] Speculative retry rounding fix' from Dario Mirovic
This patch series re-enables support for the speculative retry values `0` and `100`. These values were supported until [schema: fix issue 21825: add validation for PERCENTILE values in speculative_retry configuration #21879](https://github.com/scylladb/scylladb/pull/21879). When that PR prevented invalid values such as `101PERCENTILE`, the valid `100PERCENTILE` and `0PERCENTILE` values were prevented too.

Reproduction steps from [[Bug]: drop schema and all tables after apply speculative_retry = '99.99PERCENTILE' #26369](https://github.com/scylladb/scylladb/issues/26369) no longer reproduce the issue after the fix. A test is added to make sure the inclusive border values `0` and `100` are supported.

Documentation is updated to give more information to the users. It now states that these border values are inclusive, and also that the precision, with automatic rounding, is 1 decimal digit.

Fixes #26369

This is a bug fix. Whenever a client tries to use a value >= 99.5 and < 100, a Raft error occurs. A backport is needed. The code that introduced the inconsistency first appeared in 2025.2, so no backport to 2025.1 is required.

Closes scylladb/scylladb#26909

* github.com:scylladb/scylladb:
  test: cqlpy: add test case for non-numeric PERCENTILE value
  schema: speculative_retry: update exception type for sstring ops
  docs: cql: ddl.rst: update speculative-retry-options
  test: cqlpy: add test for valid speculative_retry values
  schema: speculative_retry: allow 0 and 100 PERCENTILE values
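The inclusive-bounds validation with 1-decimal-digit rounding described above can be sketched like this; the function name and error type are illustrative, not Scylla's actual implementation:

```cpp
#include <cassert>
#include <cmath>
#include <stdexcept>

// Sketch: 0 and 100 are valid (inclusive bounds); only values strictly
// outside the range are rejected. The stored value is rounded to one
// decimal digit, matching the documented precision.
double validate_percentile(double value) {
    if (value < 0.0 || value > 100.0) {
        throw std::invalid_argument(
            "PERCENTILE must be between 0 and 100 inclusive");
    }
    return std::round(value * 10.0) / 10.0;
}
```

Note how a value such as `99.99` rounds to `100.0`, which is why the bounds must be inclusive for the rounding not to produce a rejected value.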
2025-11-13 15:27:45 +01:00
Petr Gusev
d3bd8c924d topology_coordinator: drop unused exec_global_command overload 2025-11-13 14:19:03 +01:00
Petr Gusev
45d1302066 topology_coordinator: rename get_excluded_nodes -> get_excluded_nodes_for_topology_request
This method is specific to topology requests -- node joining, replacing,
decommissioning etc, everything that goes through
topology::transition_state::write_both_read_old and
raft_topology_cmd::command::stream_ranges. It shouldn't be used in
other contexts -- to handle global topology requests
(e.g. truncate table) or for tablets. Rename the method to make this
more explicit.
2025-11-13 14:19:03 +01:00
Petr Gusev
bf8cc5358b topology_state_machine: inline get_excluded_nodes
The method is specific to topology_coordinator, which already contains
a wrapper for it, so inline the topology method into it.

Also, make the logic of the method more explicit and remove multiple
transition_nodes lookups.
2025-11-13 14:18:46 +01:00
Taras Veretilnyk
e7ceb13c3b boost/repair_test: add repair reader integrity verification test cases
Adds test cases to verify that repair_reader correctly detects SSTable (both compressed and uncompressed) checksum mismatches.
Digest mismatch verification is not possible, as the repair reader may skip some sstable data, which automatically disables digest verification.

Each test corrupts the Data component on disk and ensures the reader throws a malformed_sstable_exception with the expected error message.
2025-11-13 14:08:33 +01:00
Taras Veretilnyk
554ce17769 test/lib: allow to disable compression in random_mutation_generator
Adds a compress flag to random_mutation_generator, allowing tests to disable compression in generated mutations.
When set to compress::no, the schema builder uses no_compression() parameters.
2025-11-13 14:08:33 +01:00
Taras Veretilnyk
add60d7576 sstables: Skip checksum and digest reads for unlinked SSTables
Add an _unlinked flag to track SSTable unlink state and check it in
read_digest() and read_checksum() methods to skip file reads for
unlinked SSTables, preventing potential file not found errors.
2025-11-13 14:08:26 +01:00
Michael Litvak
b925e047be test: add mv write during node join test
Add a test that reproduces the issue scylladb/scylladb#26976.

The test adds a new node with delayed group0 apply, and does writes with
MV updates right after the join completes on the coordinator and while
the joining node's state is behind.

The test fails before fixing the issue and passes after.
2025-11-13 12:24:32 +01:00
Michael Litvak
13d94576e5 topology_coordinator: include joining node in barrier
Previously, only nodes in the 'normal' state and decommissioning nodes
were included in the set of nodes participating in barrier and
barrier_and_drain commands. Joining nodes are not included because they
don't coordinate requests, since their CQL port is closed.

However, joining nodes may receive mutations from other nodes, for which
they may generate and coordinate materialized view updates. If their
group0 state is not synchronized it could cause lost view updates.
For example:

1. On the topology coordinator, the join completes and the joining node
   becomes normal, but the joining node's state lags behind. Since it's
   not synchronized by the barrier, it could be in an old state such as
   `write_both_read_old`.
2. A normal node coordinates a write and sends it to the new node as the
   new replica.
3. The new node applies the base mutation but doesn't generate a view
   update for it, because it calculates the base-view pairing according
   to its own state and replication map, and determines that it doesn't
   participate in the base-view pairing.

Therefore, since the joining node participates as a coordinator for view
updates, it should be included in these barriers as well. This ensures
that before the join completes, the joining node's state is
`write_both_read_new`, where it does generate view updates.

Fixes scylladb/scylladb#26976
2025-11-13 12:24:31 +01:00
Dawid Mędrek
b357c8278f test/cluster/test_maintenance_mode.py: Wait for initialization
If we try to perform queries too early, before the call to
`storage_service::start_maintenance_mode` has finished, we will
fail with the following error:

```
ERROR 2025-11-12 20:32:27,064 [shard 0:sl:d] token_metadata - sorted_tokens is empty in first_token_index!
```

To avoid that, we should wait until initialization is complete.
2025-11-13 11:07:45 +01:00
Aleksandra Martyniuk
4d0de1126f db: batchlog_manager: update _last_replay only if all batches were replayed
Currently, if flushing hints falls within the repair cache timeout,
then the flush_time is set to batchlog_manager::_last_replay.
_last_replay is updated on each replay, even if some batches weren't
replayed. Due to that, we risk data resurrection.

Update _last_replay only if all batches were replayed.

Fixes: https://github.com/scylladb/scylladb/issues/24415.
2025-11-13 10:40:19 +01:00
Piotr Dulikowski
2e5eb92f21 Merge 'cdc: use CDC schema that is compatible with the base schema' from Michael Litvak
When generating CDC log mutations for some base mutation, use a CDC schema that is compatible with the base schema.

The compatible CDC schema has, for every base column, a corresponding CDC column with the same name. If using a non-compatible schema, we may encounter a situation, especially during ALTER, in which we have a mutation with a base column set to some value, but the CDC schema doesn't have a column by that name. This would cause the user request to fail with an error.

We add to the schema object a schema_ptr that for CDC-enabled tables points to the schema object of the CDC table that is compatible with the schema. It is set by the schema merge algorithm when creating the schema for a table that is created or altered. We use the fact that a base table and its CDC table are created and altered in the same group0 operation, and this way we can find and set the cdc schema for a base table.

When transporting the base schema as a frozen schema between shards, we transport with it the frozen cdc schema as well.

The patch starts with a series of refactoring commits that make extending the frozen schema easier and clean up some duplication in the code around the frozen schema. We combine the two types `frozen_schema_with_base_info` and `view_schema_and_base_info` into a single type `extended_frozen_schema` that holds a frozen schema with additional data that is not part of the schema mutations but needs to be transported with it to unfreeze it: the base_info, and the frozen cdc schema, which is added in a later commit.

Fixes https://github.com/scylladb/scylladb/issues/26405

backport not needed - enhancement

Closes scylladb/scylladb#24960

* github.com:scylladb/scylladb:
  test: cdc: test cdc compatible schema
  cdc: use compatible cdc schema
  db: schema_applier: create schema with pointer to CDC schema
  db: schema_applier: extract cdc tables
  schema: add pointer to CDC schema
  schema_registry: remove base_info from global_schema_ptr
  schema_registry: use extended_frozen_schema in schema load
  schema_registry: replace frozen_schema+base_info with extended_frozen_schema
  frozen_schema: extract info from schema_ptr in the constructor
  frozen_schema: rename frozen_schema_with_base_info to extended_frozen_schema
2025-11-13 10:11:54 +01:00
Pavel Emelyanov
f47f2db710 Merge 'Support local primary-replica-only for native restore' from Robert Bindar
This PR extends the restore API so that it accepts primary_replica_only as a parameter, and it combines the concepts of primary-replica-only with scoped streaming so that with:
- `scope=all primary_replica_only=true` The restoring node will stream to the global primary replica only
- `scope=dc primary_replica_only=true` The restoring node will stream to the local primary replica only.
- `scope=rack primary_replica_only=true` The restoring node will stream only to the primary replica from within its own rack (with rf=#racks, the restoring node will stream only to itself)
- `scope=node primary_replica_only=true` is not allowed: the restoring node always streams only to itself, so the primary_replica_only parameter wouldn't make sense.

The PR also adjusts the `nodetool refresh` restriction on running restore with both primary_replica_only and scope, adds primary_replica_only to `nodetool restore`, and adds cluster tests for primary replica within scope.

Fixes #26584

Closes scylladb/scylladb#26609

* github.com:scylladb/scylladb:
  Add cluster tests for checking scoped primary_replica_only streaming
  Improve choice distribution for primary replica
  Refactor cluster/object_store/test_backup
  nodetool restore: add primary-replica-only option
  nodetool refresh: Enable scope={all,dc,rack} with primary_replica_only
  Enable scoped primary replica only streaming
  Support primary_replica_only for native restore API
2025-11-13 12:11:18 +03:00
Tomasz Grabiec
10b893dc27 Merge 'load_stats: fix bug in migrate_tablet_size()' from Ferenc Szili
`topology_coordinator::migrate_tablet_size()` was introduced in 10f07fb95a. It has a bug where the has_tablet_size() lambda always returns false because of a bad comparison of iterators after a table and tablet search:

```
if (auto table_i = tables.find(gid.table); table_i != tables.find(gid.table)) {
    if (auto size_i = table_i->second.find(trange); size_i != table_i->second.find(trange)) {
```

This change also fixes a problem where the `migrate_tablet_size()` would crash with a `std::out_of_range` if the pending node was not present in load_stats.

This change fixes these two problems and moves the functionality into a separate method of `load_stats`. It also adds tests for the new method.

A version containing this bug has not been released yet, so no backport is needed.

Closes scylladb/scylladb#26946

* github.com:scylladb/scylladb:
  load_stats: add test for migrate_tablet_size()
  load_stats: fix problem with tablet size migration
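For the lambda quoted above, the correct lookup pattern compares the search iterator against `end()`, not against a second `find()` call — two `find()` calls for the same key return equal iterators, so `it != tables.find(key)` is always false. A sketch with simplified stand-in types (the real table id and token-range types differ):

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// Simplified stand-ins for the real types in load_stats.
using table_sizes = std::map<std::string, uint64_t>; // token range -> size
using all_sizes = std::map<int, table_sizes>;        // table id -> sizes

// The fixed comparison: find() against end() at both levels.
bool has_tablet_size(const all_sizes& tables, int table_id,
                     const std::string& range) {
    if (auto table_i = tables.find(table_id); table_i != tables.end()) {
        if (auto size_i = table_i->second.find(range);
            size_i != table_i->second.end()) {
            return true;
        }
    }
    return false;
}
```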
2025-11-12 23:48:37 +01:00
Nadav Har'El
5839574294 Merge 'cql3: Fix std::bad_cast when deserializing vectors of collections' from Karol Nowacki
cql3: Fix std::bad_cast when deserializing vectors of collections

This PR fixes a bug where attempting to INSERT a vector containing collections (e.g., `vector<set<int>,1>`) would fail. On the client side, this manifested as a `ServerError: std::bad_cast`.

The cause was a "type slicing" issue in the reserialize_value function. When retrieving the vector's element type, the result was being assigned by value (using auto) instead of by reference.
This "sliced" the polymorphic abstract_type object, stripping it of its actual derived type information. As a result, a subsequent dynamic_cast would fail, even if the underlying type was correct.

To prevent this entire class of bugs from happening again, I've made the polymorphic base class `abstract_type` explicitly uncopyable.

Fixes: #26704

This fix needs to be backported as these releases are affected: `2025.4` , `2025.3`.

Closes scylladb/scylladb#26740

* github.com:scylladb/scylladb:
  cql3: Make abstract_type explicitly noncopyable
  cql3: Fix std::bad_cast when deserializing vectors of collections
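A minimal reproduction of this class of bug, with invented toy types rather than Scylla's actual abstract_type hierarchy. Binding a polymorphic object by value slices it down to the base class; binding by reference preserves the dynamic type, so `dynamic_cast` succeeds. Deleting the copy operations, as the hardening commit does, turns the accidental `auto copy = ...;` into a compile error:

```cpp
#include <cassert>

// Toy stand-ins for the polymorphic type hierarchy.
struct base_type {
    base_type() = default;
    base_type(const base_type&) = delete;            // the hardening: with
    base_type& operator=(const base_type&) = delete; // copies deleted, an
    virtual ~base_type() = default;                  // accidental slicing
};                                                   // copy cannot compile.

struct vector_type : base_type {
    int dimension = 3;
};

// Taking the argument by reference keeps the dynamic type intact, so the
// downcast works; a by-value copy would have sliced it to base_type.
const vector_type* as_vector(const base_type& t) {
    return dynamic_cast<const vector_type*>(&t);
}
```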
2025-11-13 00:24:25 +02:00
Petr Gusev
9fed80c4be messaging_service: simplify and optimize ban_host
We do one cross-shard call for all left+ignored nodes.
2025-11-12 12:27:44 +01:00
Petr Gusev
52cccc999e storage_service: topology_state_load: extract topology variable
It's inconvenient to always write the long expression
_topology_state_machine._topology.
2025-11-12 12:27:44 +01:00
Petr Gusev
66063f202b topology_coordinator: excluded_tablet_nodes -> ignored_nodes
ignored_nodes is sufficient in these cases. excluded_tablet_nodes
also includes left_nodes_rs, which are not needed
here — global_token_metadata_barrier runs the barrier only
on normal and transition nodes, not on left nodes.
2025-11-12 12:27:44 +01:00
Petr Gusev
82da83d0e5 topology_state_machine: add excluded_tablet_nodes field
The topology_coordinator::is_excluded() creates a temporary hash
map for each call. This is probably not a performance problem, since
left_nodes_rs contains only those left nodes that are referenced
from tablet replicas; this happens temporarily while e.g. a replaced
node is being rebuilt. On the other hand, why not just have a
dedicated field in the topology_state_machine? Then this code wouldn't
look suspicious.
2025-11-12 12:27:43 +01:00
Gleb Natapov
e872f9cb4e cleanup: Add RESTful API to allow reset cleanup needed flag
Cleaning up a node using per keyspace/table interface does not reset cleanup
needed flag in the topology. The assumption was that running cleanup on
already clean node does nothing and completes quickly. But due to
https://github.com/scylladb/scylladb/issues/12215 (which is closed as
WONTFIX) this is not the case. This patch provides the ability to reset
the flag in the topology if operator cleaned up the node manually
already.
2025-11-12 10:56:57 +02:00
Nadav Har'El
4de88a7fdc test/cqlpy: fix run script for materialized views on tablets
Recently we enabled tablets by default, but it is necessary to
enable rf_rack_valid_keyspaces if materialized views are to be used
with tablets, and this option is *not* the default.

We did add this option in test/pylib/scylla_cluster.py which is
used by test.py, but we didn't add it to test/cqlpy/run.py, so
the test/cqlpy/run script is no longer able to run tests with
materialized views. So this patch adds the missing configuration
to run.py.

Fixes #26918

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#26919
2025-11-12 11:56:21 +03:00
Karol Nowacki
77da4517d2 cql3: Make abstract_type explicitly noncopyable
The polymorphic abstract_type class serves as an interface and should not be copied.
To prevent accidental and unsafe copies, make it explicitly uncopyable.
2025-11-12 09:11:56 +01:00
Karol Nowacki
960fe3da60 cql3: Fix std::bad_cast when deserializing vectors of collections
When deserializing a vector whose elements are collections (e.g., set, list),
the operation raises a `std::bad_cast` exception.

This was caused by type slicing due to an incorrect assignment of a
polymorphic type by value instead of by reference. This resulted in a
failed `dynamic_cast` even when the underlying type was correct.
2025-11-12 09:11:56 +01:00
Botond Dénes
6f6ee5581e Merge 'encryption::kms_host: Add exponential backoff-retry for 503 errors' from Calle Wilund
Refs #26822

AWS says to treat 503 errors, at least in the case of the ec2 metadata query, as backoff-retry (generally, we do _not_ retry at the provider level, but delegate this to higher levels). This patch adds special treatment for 503s (service unavailable) for both ec2 meta and the actual endpoint, doing exponential backoff.

Note: we do _not_ retry forever.
Not tested as such, since I don't get any errors when testing (doh!). Should try to set up a mock ec2 meta with injected errors maybe.

Closes scylladb/scylladb#26934

* github.com:scylladb/scylladb:
  encryption::kms_host: Add exponential backoff-retry for 503 errors
  encryption::kms_host: Include http error code in kms_error
2025-11-12 08:33:33 +02:00
Yaron Kaikov
3ade3d8f5b auto-backport: Add support for JIRA issue references
- Added support for JIRA issue references in PR body and commit messages
- Supports both short format (PKG-92) and full URL format
- Maintains existing GitHub issue reference support
- JIRA pattern matches https://scylladb.atlassian.net/browse/{PROJECT-ID}
- Allows backporting for PRs that reference JIRA issues with 'fixes' keyword

Fixes: https://github.com/scylladb/scylladb/issues/26955

Closes scylladb/scylladb#26954
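A sketch of matching both reference styles listed above; the exact pattern used by the backport script is an assumption here, and `references_jira_issue` is an invented name for illustration:

```cpp
#include <cassert>
#include <regex>
#include <string>

// Illustrative: accept either the short form (PKG-92) or the full
// Atlassian browse URL, as described in the commit message.
bool references_jira_issue(const std::string& text) {
    static const std::regex short_form(R"(\b[A-Z][A-Z0-9]+-\d+\b)");
    static const std::regex url_form(
        R"(https://scylladb\.atlassian\.net/browse/[A-Z][A-Z0-9]+-\d+)");
    return std::regex_search(text, url_form)
        || std::regex_search(text, short_form);
}
```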
2025-11-12 08:15:06 +02:00
Calle Wilund
d22e0acf0b encryption::kms_host: Add exponential backoff-retry for 503 errors
Refs #26822

AWS says to treat 503 errors, at least in the case of ec2 metadata
query, as backoff-retry (generally, we do _not_ retry on provider
level, but delegate this to higher levels). This patch adds special
treatment for 503s (service unavailable) for both ec2 meta and
actual endpoint, doing exponential backoff.

Note: we do _not_ retry forever.
Not tested as such, since I don't get any errors when testing
(doh!). Should try to set up a mock ec2 meta with injected errors
maybe.

v2:
* Use utils::exponential_backoff_retry
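A hedged sketch of the delay schedule such a retry loop produces. Scylla uses utils::exponential_backoff_retry, whose actual interface differs; the function below, its parameters, and the cap are illustrative assumptions:

```cpp
#include <algorithm>
#include <cassert>
#include <chrono>

// Illustrative: the delay doubles per attempt, bounded by a cap so we
// never wait arbitrarily long between tries; the caller separately bounds
// the number of attempts (we do _not_ retry forever).
std::chrono::milliseconds backoff_delay(unsigned attempt,
                                        std::chrono::milliseconds base,
                                        std::chrono::milliseconds cap) {
    auto d = base * (1u << std::min(attempt, 16u)); // clamp the shift
    return std::min(d, cap);
}
```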
2025-11-11 21:02:32 +00:00
Calle Wilund
190e3666cb encryption::kms_host: Include http error code in kms_error
Keep track of actual HTTP failure.
2025-11-11 21:02:32 +00:00
Ferenc Szili
fcbc239413 load_stats: add test for migrate_tablet_size()
This change adds tests which validate the functionality of
load_stats::migrate_tablet_size()
2025-11-11 14:28:31 +01:00
Ferenc Szili
b77ea1b8e1 load_stats: fix problem with tablet size migration
This patch fixes a bug with tablet size migration in load_stats.
has_tablet_size() lambda in topology_coordinator::migrate_tablet_size()
was returning false in all cases due to incorrect search iterator
comparison after a table and tablet saeach.

This change moves load_stats migrate_tablet_sizes() functionaility
into a separate method of load_stats.
2025-11-11 14:26:09 +01:00
Yehuda Lebi
a05ebbbfbb dist/docker: add configurable blocked-reactor-notify-ms parameter
Add --blocked-reactor-notify-ms argument to allow overriding the default
blocked reactor notification timeout value of 25 ms.

This change provides users the flexibility to customize the reactor
notification timeout as needed.

Fixes: scylladb/scylla-enterprise#5525

Closes scylladb/scylladb#26892
2025-11-11 12:38:40 +02:00
Benny Halevy
a290505239 utils: stall_free: add dispose_gently
dispose_gently consumes the object moved to it,
clearing it gently before it's destroyed.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#26356
2025-11-11 12:20:18 +02:00
Yaron Kaikov
c601371b57 install-dependencies.sh: update node_exporter to 1.10.2
Update node exporter to solve CVE-2025-22871

[regenerate frozen toolchain with optimized clang from
	https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-aarch64.tar.gz
	https://devpkg.scylladb.com/clang/clang-20.1.8-Fedora-42-x86_64.tar.gz
]
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-5

Closes scylladb/scylladb#26916
2025-11-11 11:36:13 +02:00
Nadav Har'El
b659dfcbe9 test/cqlpy: comment out Cassandra check that is no longer relevant
In the test translated from Cassandra validation/operations/alter_test.py
we had two lines in the beginning of an unrelated test that verified
that CREATE KEYSPACE is not allowed without replication parameters.
But starting recently, ScyllaDB does have defaults and does allow these
CREATE KEYSPACE. So comment out these two test lines.

We didn't notice that this test started to fail, because it was already
marked xfail, because in the main part of this test, it reproduces a
different issue!

The annoying side effect of these no-longer-passing checks was that
because the test expected a CREATE KEYSPACE to fail, it didn't bother
to delete this keyspace when it finished, which causes test.py to
report that there's a problem because some keyspaces still exist at the
end of the test. Now that we fixed this problem, we no longer need to
list this test in test/cqlpy/suite.yaml as a test that leaves behind
undeleted keyspaces.

Fixes #26292

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#26341
2025-11-11 10:34:27 +02:00
Nikos Dragazis
56e5dfc14b migration_manager: Add missing validations for schema extensions
The migration manager offers some free functions to prepare mutations
for a new/updated table/view. Most of them include a validation check
for the schema extensions, but in the following ones it's missing:

* `prepare_new_column_family_announcement` (overload with vector as out parameter)
* `prepare_new_column_families_announcement`

Presumably, this was just an omission. It's also not a very important
one since the only extension having validation logic is the
`encryption_schema_extension`, but none of these functions is connected
to user queries where encryption options can be provided in the schema.
User queries go through the other
`prepare_new_column_family_announcement` overload, which does perform a
validation check.

Add validation in the missing places.

Fixes #26470.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#26487
2025-11-11 10:08:58 +02:00
Botond Dénes
042303f0c9 Merge 'Alternator: enable tablets by default - depending on tablets_mode_for_new_keyspaces' from Nadav Har'El
Before this series, Alternator's CreateTable operation defaults to creating a table replicated with vnodes, not tablets. The reasons for this default included missing support for LWT, Materialized Views, Alternator TTL and Alternator Streams if tablets are used. But today, all of these (except the still-experimental Alternator Streams) are now fully available with tablets, so we are finally ready to switch Alternator to use tablets by default in new tables.

We will use the same configuration parameter that CQL uses, tablets_mode_for_new_keyspaces, to determine whether new keyspaces use tablets by default. If set to `enabled`, tablets are used by default on new tables. If set to `disabled`, tablets will not be used by default (i.e., vnodes will be used, as before). A third value, `enforced`, is similar to `enabled` but forbids overriding the default to vnodes when creating a table.

As before, the user can set a tag during the CreateTable operation to override the default choice of tablets or vnodes (unless in `enforced` mode). This tag is now named `system:initial_tablets` - whereas before this patch it was called `experimental:initial_tablets`. The rules stay the same as with the earlier, experimental:initial_tablets tag: when supplied with a numeric value, the table will use tablets. When supplied with something else (like a string "none"), the table will use vnodes.

Fixes https://github.com/scylladb/scylladb/issues/22463

Backport to 2025.4, it's important not to delay phasing out vnodes.

Closes scylladb/scylladb#26836

* github.com:scylladb/scylladb:
  test,alternator: use 3-rack clusters in tests
  alternator: improve error in tablets_mode_for_new_keyspaces=enforced
  config: make tablets_mode_for_new_keyspaces live-updatable
  alternator: improve comment about non-hidden system tags
  alternator: Fix test_ttl_expiration_streams()
  alternator: Fix test_scan_paging_missing_limit()
  alternator: Don't require vnodes for TTL tests
  alternator: Remove obsolete test from test_table.py
  alternator: Fix tag name to request vnodes
  alternator: Fix test name clash in test_tablets.py
  alternator: test_tablets.py handles new policy reg. tablets
  alternator: Update doc regarding tablets support
  alternator: Support `tablets_mode_for_new_keyspaces` config flag
  Fix incorrect hint for tablets_mode_for_new_keyspaces
  Fix comment for tablets_mode_for_new_keyspaces
2025-11-11 09:45:29 +02:00
Avi Kivity
bae2654b34 tools: dbuild: avoid test -v incompatibility with MacOS shell
`test -v` isn't present on the MacOS shell. Since dbuild is intended
as a compatibility bridge between the host environment and the build
environment, don't use it there.

Use ${var+text_if_set} expansion as a workaround.

Fixes #26937

Closes scylladb/scylladb#26939
2025-11-11 09:43:14 +02:00
Nikos Dragazis
94c4f651ca test/cqlpy: Test secondary index with short reads
Add a test to check that paged secondary index queries behave correctly
when pages are short. This is currently failing in Scylla, but passes in
Cassandra 5, therefore marked as "xfailing". Refer to the test's
docstring for more details.

The bug is a regression introduced by commit f6f18b1.
`test/cqlpy/run --release ...` shows that the test passes in 5.1 but
fails in 5.2 onwards.

Refs #25839.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#25843
2025-11-11 09:28:45 +02:00
Robert Bindar
a04ebb829c Add cluster tests for checking scoped primary_replica_only streaming
This commit adds tests checking various scenarios of restoring
via load and stream with primary_replica_only and a scope specified.

The tests check that in a few topologies, a mutation is replicated
the correct number of times given primary_replica_only and that
streaming happens according to the scope rule passed.

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:18:01 +02:00
Robert Bindar
817fdadd49 Improve choice distribution for primary replica
I noticed during tests that `maybe_get_primary_replica`
would not distribute the choice of primary replica uniformly,
because `info.replicas` on some shards would have one order while
on others it would be ordered differently, making the function choose
the same node as primary replica multiple times when it clearly could
have chosen different nodes.

This patch sorts the replica set before passing it through the
scope filter.

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:18:01 +02:00
Robert Bindar
d4e43bd34c Refactor cluster/object_store/test_backup
This PR splits the support code from test_backup.py
into multiple functions so that new tests using it
produce less duplicated code. It also makes the code
a bit easier to understand.

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:18:01 +02:00
Robert Bindar
c1b3fe30be nodetool restore: add primary-replica-only option
Add --primary-replica-only and update docs page for
nodetool restore.

The relationship with the scope parameter is:
- scope=all primary_replica_only=true gets the global primary replica
- scope=dc primary_replica_only=true gets the local primary replica
- scope=rack primary_replica_only=true is effectively a no-op; it gets the only
  replica in the rack (rf=#racks)
- scope=node primary_replica_only=true is not allowed

Fixes #26584

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
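The scope rules above can be encoded as a small validation check. The enum and function names are assumptions for illustration, not the actual API:

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

// Illustrative encoding of the restore scope options.
enum class stream_scope { all, dc, rack, node };

// scope=node with primary_replica_only is rejected, because a node-scoped
// restore already streams only to the node itself; all other combinations
// are allowed.
void validate_restore_options(stream_scope scope, bool primary_replica_only) {
    if (primary_replica_only && scope == stream_scope::node) {
        throw std::invalid_argument(
            "primary_replica_only is not allowed with scope=node");
    }
}
```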
2025-11-11 09:18:01 +02:00
Robert Bindar
83aee954b4 nodetool refresh: Enable scope={all,dc,rack} with primary_replica_only
So far it was not allowed to pass a scope when using
the primary_replica_only option. This patch enables
it because the concepts are now combined so that:
- scope=all primary_replica_only=true gets the global primary replica
- scope=dc primary_replica_only=true gets the local primary replica
- scope=rack primary_replica_only=true is effectively a no-op, it gets the
  only replica in the rack (rf=#racks)
- scope=node primary_replica_only=true is not allowed

Fixes #26584

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:18:01 +02:00
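The scope rules listed above amount to one validation plus a per-scope meaning; a minimal Python sketch (the helper `validate_scope` is illustrative, not Scylla's actual validation code):

```python
def validate_scope(scope, primary_replica_only):
    """Illustrative check of the scope/primary_replica_only rules;
    not the actual implementation."""
    if primary_replica_only and scope == "node":
        raise ValueError("scope=node with primary_replica_only is not allowed")
    # scope=all  -> global primary replica
    # scope=dc   -> local (per-DC) primary replica
    # scope=rack -> effectively a no-op: the only replica in the rack
    return True

assert validate_scope("all", True)
assert validate_scope("node", False)  # only the combination is rejected
```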
Robert Bindar
136b45d657 Enable scoped primary replica only streaming
This patch removes the restriction on streaming
to the primary replica only within a scope.
Node-scope streaming to the primary replica remains disallowed.

Fixes #26584

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:18:01 +02:00
Robert Bindar
965a16ce6f Support primary_replica_only for native restore API
The current native restore does not support primary_replica_only; it is
hard-coded to disabled, which may lead to data amplification issues.

This patch extends the restore REST API to accept a
primary_replica_only parameter and propagates it to
sstables_loader so it gets correctly passed to
load_and_stream.

Fixes #26584

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
2025-11-11 09:17:52 +02:00
Dawid Mędrek
394207fd69 test: Disable maintenance mode correctly in test_maintenance_mode.py
Although setting the value of `maintenance_mode` to the string `"false"`
disables maintenance mode, the testing framework misinterprets the value
and thinks that it's actually enabled. As a result, it might try to
connect to Scylla via the maintenance socket, which we don't want.
2025-11-10 19:22:06 +01:00
Dawid Mędrek
222eab45f8 test: Fix keyspace in test_maintenance_mode.py
The keyspace used in the test is not necessarily called `ks`.
2025-11-10 19:21:58 +01:00
Dawid Mędrek
c0f7622d12 service/qos: Do not crash Scylla if auth_integration absent
If the user connects to Scylla via the maintenance socket, it may happen
that `auth_integration` has not been registered in the service level
controller yet. One example is maintenance mode, where that will never
happen; another is when the connection occurs before Scylla is fully
initialized.

To avoid unnecessary crashes, we add new branches if the passed user is
absent or if it corresponds to the anonymous role. Since the role
corresponding to a connection via the maintenance socket is the anonymous
role, that solves the problem.

In those cases, we completely circumvent any calls to `auth_integration`
and handle them separately. The modified methods are:

* `get_user_scheduling_group`,
* `with_user_service_level`,
* `describe_service_levels`.

For the first two, the new behavior is in line with the previous
implementation of those functions. The last behaves differently now,
but since it's a soft error, crashing the node is not necessary anyway.
We throw an exception instead, whose error message should give the user
a hint of what might be wrong.

The other uses of `auth_integration` within the service level controller
are not problematic:

* `find_effective_service_level`,
* `find_cached_effective_service_level`.

They take the name of a role as their argument. Since the anonymous role
doesn't have a name, it's not possible to call them with it.

Fixes scylladb/scylladb#26816
2025-11-10 19:21:36 +01:00
Yaron Kaikov
850ec2c2b0 Trigger scylla-ci directly from PR instead of scylla-ci-route job
Refactor scylla-ci to be triggered directly from each PR using a GitHub action. This allows us to skip triggering CI when only the PR commit message was updated (saving unneeded CI runs). It also lets us remove the `Scylla-CI-route` pipeline, which routed each PR to the proper CI job under the release (the GitHub action does this automatically), reducing complexity.

Fixes: https://scylladb.atlassian.net/browse/PKG-69

Closes scylladb/scylladb#26799
2025-11-10 15:10:11 +02:00
Pavel Emelyanov
decf86b146 Merge 'Make AWS & Azure KMS boost testing use fixture + include Azure in pytests' from Calle Wilund
* Adds test fixture for AWS KMS
* Adds test fixture for Azure KMS
* Adds key provider proxy for Azure to pytests (ported dtests)
* Make test gather for boost tests handle suites
* Fix GCP test snafu

Fixes #26781
Fixes #26780
Fixes #26776
Fixes #26775

Closes scylladb/scylladb#26785

* github.com:scylladb/scylladb:
  gcp_object_storage_test: Re-enable parallelism.
  test::pylib: Add azure (mock) testing to EAR matrix
  test::boost::encryption_at_rest: Remove redundant azure test indent
  test::boost::encryption_at_rest: Move azure tests to use fixture
  test::lib: Add azure mock/real server fixture
  test::pylib::boost: Fix test gather to handle test suites
  utils::gcp::object_storage: Fix typo in semaphore init
  test::boost::encryption_at_rest_test: Remove redundant indent
  test::boost::test_encryption_at_rest: Move to AWS KMS fixture for kms test
  test::boost::test_encryption_at_rest: Reorder tests and helpers
  ent::encryption: Make text helper routines take std::string
  test::pylib::dockerized_service: Handle docker/podman bind error message
  test::lib::aws_kms_fixture: Add a fixture object to run mock AWS KMS
  test::lib::gcs_fixture: Only set port if running docker image + more retry
2025-11-10 14:35:05 +03:00
Michał Jadwiszczak
9345c33d27 service/storage_service: migrate staging sstables in view building
worker during intra-node migration

Use the methods introduced in the previous commit and:
- load staging sstables to the view building worker on the target
  shard, at the end of `streaming` stage
- clear migrated staging sstables on source shard in `cleanup` stage

This patch also removes skip mark in `test_staging_sstables_with_tablet_merge`.

Fixes scylladb/scylladb#26244
2025-11-10 10:38:08 +01:00
Michał Jadwiszczak
4bc6361766 db/view/view_building_worker: support sstables intra-node migration
We need to be able to load sstables on the target shard during
intra-node tablet migration and to cleanup migrated sstables on the
source shard.
2025-11-10 10:36:32 +01:00
Michał Jadwiszczak
c99231c4c2 db/view_building_worker: fix indent 2025-11-10 09:02:16 +01:00
Michał Jadwiszczak
2e8c096930 db/view/view_building_worker: don't organize staging sstables by last token
There was a problem with staging sstables after a tablet merge.
Let's say there were 2 tablets and tablet 1 (lower last token)
had a staging sstable. Then a tablet merge occurred, so there is only
one tablet now (higher last token).
But entries in `_staging_sstables`, which are grouped by last token, are
never adjusted.

Since there shouldn't be thousands of sstables, we can just hold a list
of sstables per table and filter the necessary entries when doing the
`process_staging` view building task.
2025-11-10 09:02:16 +01:00
Nadav Har'El
35f3a8d7db docs/alternator: fix small mistake in compatibility.md
docs/alternator/compatibility.md describes support for global (multi-DC)
tables, and suggests that the CQL command "ALTER TABLE" should be used
to change the replication of an Alternator table. But actually, the
right command is "ALTER KEYSPACE", not "ALTER TABLE". So fix the
document.

Fixes #26737

Closes scylladb/scylladb#26872
2025-11-10 08:48:18 +03:00
Yauheni Khatsianevich
d3e62b15db fix(test): minor typo fix, removing redundant param from logging
Closes scylladb/scylladb#26901
2025-11-10 08:42:11 +03:00
Dario Mirovic
d364904ebe test: dtest: audit_test.py: add AuditBackendComposite
Add `AuditBackendComposite`, a test class which allows testing multiple
audit outputs in a single run, implemented in the
`audit_composite_storage_helper` class.

Add two more tests.
`test_composite_audit_type_invalid` tests if an invalid audit mode among
correct ones causes the same error as when it is the only specified audit mode.
`test_composite_audit_empty_settings` tests if the `'none'` audit mode, when
specified alongside other audit modes, properly disables audit logging.

Refs #26022
2025-11-10 00:31:34 +01:00
Dario Mirovic
a8ed607440 test: dtest: audit_test.py: group logs in dict per audit mode
Before this patch, the audit test could only process audit logs from a
single audit output. This patch adds support for multiple audit outputs
in the same run. The change is needed in order to test
`audit_composite_storage_helper`, which can write to multiple
audit outputs.

Refs #26022
2025-11-10 00:31:34 +01:00
Dario Mirovic
afca230890 audit: write out to both table and syslog
This patch adds support for multiple audit log outputs.
If only one audit log output is enabled, the behavior does not change.
If multiple audit log outputs are enabled, then the
`audit_composite_storage_helper` class is used. It has a collection
of `storage_helper` objects.

Fixes #26022
2025-11-10 00:31:30 +01:00
Dario Mirovic
7ec9e23ee3 test: cqlpy: add test case for non-numeric PERCENTILE value
Add test case for non-numeric PERCENTILE value, which raises an error
different to the out-of-range invalid values. Regex in the test
test_invalid_percentile_speculative_retry_values is expanded.

Refs #26369
2025-11-09 13:59:36 +01:00
Dario Mirovic
85f059c148 schema: speculative_retry: update exception type for sstring ops
Change speculative_retry::to_sstring and speculative_retry::from_sstring
to throw exceptions::configuration_exception instead of std::invalid_argument.
These errors can be triggered by CQL, so the appropriate CQL exception
should be used.
Reference: https://github.com/scylladb/scylladb/issues/24748#issuecomment-3025213304

Refs #26369
2025-11-09 13:55:57 +01:00
Dario Mirovic
aba4c006ba docs: cql: ddl.rst: update speculative-retry-options
Clarify how the value of `XPERCENTILE` is handled:
- Values 0 and 100 are supported
- The percentile value is rounded to the nearest 0.1 (1 decimal place)

Refs #26369
2025-11-09 13:23:29 +01:00
Dario Mirovic
5d1913a502 test: cqlpy: add test for valid speculative_retry values
test_valid_percentile_speculative_retry_values is introduced to test that
valid values for speculative_retry are properly accepted.

Some of the values are moved from the
test_invalid_percentile_speculative_retry_values test, because
the previous commit added support for them.

Refs #26369
2025-11-09 13:23:26 +01:00
Dario Mirovic
da2ac90bb6 schema: speculative_retry: allow 0 and 100 PERCENTILE values
This patch allows specifying 0 and 100 PERCENTILE values in speculative_retry.
It was possible to specify these values before #21825. #21825 prevented specifying
invalid values, like -1 and 101, but also prevented using 0 and 100.

On top of that, speculative_retry::to_sstring function did rounding when
formatting the string, which introduced inconsistency.

Fixes #26369
2025-11-09 12:26:27 +01:00
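The documented percentile rules can be captured in a short sketch (the helper name `parse_percentile` is illustrative, not the actual C++ code): 0 and 100 are accepted, non-numeric and out-of-range input raise distinct errors, and the value is rounded to one decimal place.

```python
def parse_percentile(value):
    # Non-numeric input raises ValueError here -- a different error
    # from the out-of-range case, mirroring the tests above.
    v = float(value)
    if not 0.0 <= v <= 100.0:
        raise ValueError(f"PERCENTILE must be between 0 and 100, got {v}")
    # The percentile value is rounded to the nearest 0.1 (1 decimal place).
    return round(v, 1)

assert parse_percentile("0") == 0.0      # boundary values are now allowed
assert parse_percentile("100") == 100.0
```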
Nadav Har'El
65ed678109 test,alternator: use 3-rack clusters in tests
With tablets enabled, we can't create an Alternator table on a three-
node cluster with a single rack, since Scylla refuses RF=3 with just
one rack and we get the error:

    An error occurred (InternalServerError) when calling the CreateTable
    operation: ... Replication factor 3 exceeds the number of racks (1) in
    dc datacenter1

So in test/cluster/test_alternator.py we need to use the incantation
"auto_rack_dc='dc1'" every time that we create a three-node cluster.

Before this patch, several tests in test/cluster/test_alternator.py
failed on this error, with this patch all of them pass.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-11-09 12:52:29 +02:00
Nadav Har'El
c03081eb12 alternator: improve error in tablets_mode_for_new_keyspaces=enforced
When in tablets_mode_for_new_keyspaces=enforced mode, Alternator is
supposed to fail when CreateTable asks explicitly for vnodes. Before
this patch, this error was an ugly "Internal Server Error" (an
exception thrown from deep inside the implementation), this patch
checks for this case in the right place, to generate a proper
ValidationException with a proper error message.

We also enable the test test_tablets_tag_vs_config which should have
caught this error, but didn't because it was marked xfail because
tablets_mode_for_new_keyspaces had not been live-updatable. Now that
it is, we can enable the test. I also improved the test to be slightly
faster (no need to change the configuration so many times) and also
check the ordinary case - where the schema explicitly chooses neither
vnodes nor tablets and we should just use the default.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-11-09 12:52:29 +02:00
Nadav Har'El
25439127c8 config: make tablets_mode_for_new_keyspaces live-updatable
We have a configuration option "tablets_mode_for_new_keyspaces" which
determines whether new keyspaces should use tablets or vnodes.

For some reason, this configuration parameter was never marked live-
updatable, so in this patch we add the flag. No other changes are needed -
the existing code that uses this flag always reads it through the
up-to-date configuration.

In the previous patches we start to honor tablets_mode_for_new_keyspaces
also in Alternator CreateTable, and we wanted to test this but couldn't
do this in test/alternator because the option was not live-updatable.
Now that it will be, we'll be able to test this feature in
test/alternator.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-11-09 12:52:29 +02:00
Nadav Har'El
b34f28dae2 alternator: improve comment about non-hidden system tags
The previous patches added a somewhat misleading comment in front of
system:initial_tablets, which this patch improves.

That tag is NOT where Alternator "stores" table properties like the
existing comment claimed. In fact, the whole point is that it's the
opposite - Alternator never writes to this tag - it's a user-writable
tag which Alternator *reads*, to configure the new table. And this is
why it obviously can't be hidden from the user.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
eeb3a40afb alternator: Fix test_ttl_expiration_streams()
The test is now aware of the new name of the
`system:initial_tablets` tag.
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
a659698c6d alternator: Fix test_scan_paging_missing_limit()
With tablets, the test began failing. The failure was correlated with
the number of initial tablets, which when kept at default, equals
4 tablets per shard in release build and 2 tablets per shard in dev
build.

In this patch we split the test into two - one with more data in
the table to check the original purpose of this test - that Scan
doesn't return the entire table in one page if "Limit" is missing.
The other test reproduces issue #10327 - that when the table is
small, Scan's page size isn't strictly limited to 1MB as it is in
DynamoDB.

Experimentally, 8000 KB of data (compared to 6000 KB before this patch)
is enough when we have up to 4 initial tablets per shard (so 8 initial
tablets on a two-shard node as we typically run in tests).

Original patch by Piotr Szymaniak <piotr.szymaniak@scylladb.com>
modified by Nadav Har'El <nyh@scylladb.com>
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
345747775b alternator: Don't require vnodes for TTL tests
Since #23662 Alternator supports TTL with tablets too. Let's clear some
leftovers causing Alternator to test TTL with vnodes instead of with
what is default for Alternator (tablets or vnodes).
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
274d0b6d62 alternator: Remove obsolete test from test_table.py
Since Alternator is capable of running with tablets according to the
flag in the config, remove the obsolete test that made sure
Alternator runs with vnodes.
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
63897370cb alternator: Fix tag name to request vnodes
The tag was recently renamed from `experimental:initial_tablets` to
`system:initial_tablets`. This commit fixes both the tests and
the exceptions sent to the user instructing how to create a table with
vnodes.
2025-11-09 12:52:29 +02:00
Piotr Szymaniak
c7de7e76f4 alternator: Fix test name clash in test_tablets.py 2025-11-09 12:52:28 +02:00
Piotr Szymaniak
7466325028 alternator: test_tablets.py handles new policy reg. tablets
Adjust the tests so they are in line with the config flag
`tablets_mode_for_new_keyspaces` that Alternator learned to honour.
2025-11-09 12:52:28 +02:00
Piotr Szymaniak
35216d2f01 alternator: Update doc regarding tablets support
Reflect honouring by Alternator the value of the config flag
`tablets_mode_for_new_keyspaces`, as well as renaming of the tag
`experimental:initial_tablets` into `system:initial_tablets`.
2025-11-09 12:52:28 +02:00
Piotr Szymaniak
376a2f2109 alternator: Support tablets_mode_for_new_keyspaces config flag
Until now, tablets in Alternator were an experimental feature enabled only
when a TAG "experimental:initial_tablets" was present when creating a
table and associated with a numeric value.

After this patch, Alternator honours the value of
`tablets_mode_for_new_keyspaces` config flag.

Each table can be overridden to use tablets or not by supplying a new TAG
"system:initial_tablets". The rules stay the same as with the earlier,
experimental tag: when supplied with a numeric value, the table will use
tablets (as long as they are supported). When supplied with something
else (like a string "none"), the table will use vnodes, provided that
tablets are not `enforced` by the config flag.

Fixes #22463
2025-11-09 12:52:17 +02:00
Piotr Szymaniak
af00b59930 Fix incorrect hint for tablets_mode_for_new_keyspaces 2025-11-09 10:49:46 +02:00
Piotr Szymaniak
403068cb3d Fix comment for tablets_mode_for_new_keyspaces
The comment did not list all 3 possible values correctly,
even though the explanation just below covers all 3.
2025-11-09 10:49:46 +02:00
Botond Dénes
cdba3bebda Merge 'Generalize directory checks in database_test's snapshot test cases' from Pavel Emelyanov
Those test cases use lister::scan_dir() to validate the contents of the snapshot directory of a table against the table's base directory. This PR generalizes the listing code, making it shorter.

Also, the snapshot_skip_flush_works case is missing the check for the "schema.cql" file. Nothing is wrong with that, but the test is more accurate when checking it.

Also, the snapshot_with_quarantine_works case tries to check whether one set of names is a subset of another using lengthy code. Using std::includes improves the test's readability a lot.

Also, the PR replaces lister::scan_dir() with directory_lister. The former is going to be removed some day (see also #26586)

Improving existing working test, no backport is needed.

Closes scylladb/scylladb#26693

* github.com:scylladb/scylladb:
  database_test: Simplify snapshot_with_quarantine_works() test
  database_test: Improve snapshot_skip_flush_works test
  database_test: Simplify snapshot_works() tests
  database_test: Use collect_files() to remove files
  database_test: Use collect_files() to count files in directory
  database_test: Introduce collect_files() helper
2025-11-07 16:04:02 +02:00
Michał Chojnowski
b82c2aec96 sstables/trie: fix an assertion violation in bti_partition_index_writer_impl::write_last_key
_last_key is a multi-fragment buffer.

Some prefix of _last_key (up to _last_key_mismatch) is
unneeded because it's already a part of the trie.
Some suffix of _last_key (after needed_prefix) is unneeded
because _last_key can be differentiated from its neighbors even without it.

The job of write_last_key() is to find the middle fragments,
(containing the range `[_last_key_mismatch, needed_prefix)`)
trim the first and last of the middle fragments appropriately,
and feed them to the trie writer.

But there's an error in the current logic,
in the case where `_last_key_mismatch` falls on a fragment boundary.
To describe it with an example, if the key is fragmented like
`aaa|bbb|ccc`, `_last_key_mismatch == 3`, and `needed_prefix == 7`,
then the intended output to the trie writer is `bbb|c`,
but the actual output is `|bbb|c`. (I.e. the first fragment is empty).

Technically the trie writer could handle empty fragments,
but it has an assertion against them, because they are a questionable thing.

Fix that.

We also extend bti_index_test so that it's able to hit the assert
violation (before the patch). The reason why it wasn't able to do that
before the patch is that the violation requires decorated keys to differ
on the _first_ byte of a partition key column, but the keys generated
by the test only differed on the last byte of the column.
(Because the test was using sequential integers to make the values more
human-readable during debugging). So we modify the key generation
to use random values that can differ on any position.

Fixes scylladb/scylladb#26819

Closes scylladb/scylladb#26839
2025-11-07 11:25:07 +02:00
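The trimming logic described above can be illustrated with a small Python sketch (a hypothetical helper, not the actual C++ implementation): emit only the non-empty slices of the fragmented key covering `[_last_key_mismatch, needed_prefix)`, so a range starting exactly on a fragment boundary no longer produces an empty leading fragment.

```python
def middle_fragments(fragments, start, end):
    """Yield the slices of a fragmented buffer covering [start, end),
    skipping any slice that would come out empty."""
    pos = 0
    out = []
    for frag in fragments:
        lo, hi = max(start, pos), min(end, pos + len(frag))
        if lo < hi:                        # skip empty slices entirely
            out.append(frag[lo - pos:hi - pos])
        pos += len(frag)
    return out

# The example from the commit message: key fragmented as "aaa|bbb|ccc",
# _last_key_mismatch == 3, needed_prefix == 7 -> "bbb|c", with no
# empty first fragment.
assert middle_fragments(["aaa", "bbb", "ccc"], 3, 7) == ["bbb", "c"]
```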
Abhinav Jha
ab0e0eab90 raft topology: skip non-idempotent steps in decommission path to avoid problems during races
In the present scenario, there are issues in left_token_ring transition state
execution in the decommissioning path. In case of concurrent mutation race
conditions, we enter left_token_ring more than once, and apparently if
we enter left token ring second time, we try to barrier the decommisioned
node, which at this point is no longer possible. That's what causes the errors.

This pr resolves the issue by adding a check right in the start of
left_token_ring to check if the first topology state update, which marks
the request as done is completed. In this case, its confirmed that this
is the second time flow is entering left_token_ring and the steps preceding
the request status update should be skipped. In such cases, all the rest
steps are skipped and topology node status update( which threw error in
previous trial) is executed directly. Node removal status from group0 is
also checked and remove operation is retried if failed last time.

Although these changes target the decommission operation's behavior in the
`left_token_ring` transition state, the PR doesn't interfere with the core
logic, so it should not derail any rollback-specific logic. The changes
just prevent some non-idempotent operations from re-occurring in case of
failures. The rest of the core logic remains intact.

A test is also added to confirm this works properly.

Fixes: scylladb/scylladb#20865

Backport is not needed, since this is not a super critical bug fix.

Closes scylladb/scylladb#26717
2025-11-07 10:07:49 +01:00
Ran Regev
aaf53e9c42 nodetool refresh primary-replica-only
Fixes: #26440

1. Added a description to the primary-replica-only option
2. Fixed the docs text to better reflect the constraints checked in the
   code itself, namely: that both primary-replica-only and scope may be
   applied only if load and stream is applied too, and that they are
   mutually exclusive.
Note: when https://github.com/scylladb/scylladb/issues/26584 is
implemented (with #26609) there will be a need to align the docs as
well - namely, primary-replica-only and scope will no longer be
mutually exclusive

Signed-off-by: Ran Regev <ran.regev@scylladb.com>

Closes scylladb/scylladb#26480
2025-11-07 10:59:27 +02:00
Avi Kivity
245173cc33 tools: toolchain: optimized_clang: remove unused variable CLANG_SUFFIX
The variable was unused since cae999c094 ("toolchain: change
optimized clang install method to standard one"), and now causes
the differential shellcheck continuous integration test to fail whenever
it is changed. Remove it.

Closes scylladb/scylladb#26796
2025-11-07 10:08:23 +02:00
Patryk Jędrzejczak
d6c64097ad Merge 'storage_proxy: use gates to track write handlers destruction' from Petr Gusev
In [#26408](https://github.com/scylladb/scylladb/pull/26408) a `write_handler_destroy_promise` class was introduced to wait for the destruction of `abstract_write_response_handler` instances. We strived to minimize the memory footprint of `abstract_write_response_handler`; with `write_handler_destroy_promise`-es we required only a single additional int. It turned out that in some cases a lot of write handlers can be scheduled for deletion at the same time; in such cases the vector can become big and cause 'oversized allocation' seastar warnings.

Another concern with `write_handler_destroy_promise`-es [was that they were more complicated than it was worth](https://github.com/scylladb/scylladb/pull/26408#pullrequestreview-3361001103).

In this commit we replace `write_handler_destroy_promise` with simple gates. One or more gates can be attached to an `abstract_write_response_handler` to wait for its destruction. We use `utils::small_vector` to store the attached gates. The limit 2 was chosen because we expect two gates at the same time in most cases. One is `storage_proxy::_write_handlers_gate`, which is used to wait for all handlers in `cancel_all_write_response_handlers`. Another one can be attached by a caller of `cancel_write_handlers`. Nothing stops several cancel_write_handlers calls from happening at the same time, but it should be rare.

`sizeof(utils::small_vector)` is 40; this is a `40.0 / 488 * 100 ~ 8%` increase in `sizeof(abstract_write_response_handler)`, which seems acceptable.

Fixes [scylladb/scylladb#26788](https://github.com/scylladb/scylladb/issues/26788)

backport: need to backport to 2025.4 (LWT for tablets release)

Closes scylladb/scylladb#26827

* https://github.com/scylladb/scylladb:
  storage_proxy: use coroutine::maybe_yield();
  storage_proxy: use gates to track write handlers destruction
2025-11-06 10:17:04 +01:00
Nadav Har'El
b8da623574 Update tools/cqlsh submodule
* tools/cqlsh f852b1f5...19445a5c (2):
  > Update scylla-driver version to 3.29.4

Update tools/cqlsh submodule for scylla-driver 3.29.4

The motivation for this update is to resolve a driver-side serialization bug that was blocking work on #26740. The bug affected vector<collection> types (e.g., vector<set<int>,1>) and is fixed in scylla-driver versions 3.29.2+.

Refs #26704
2025-11-06 10:01:26 +02:00
Asias He
dbeca7c14d repair: Add metric for time spent on tablet repair
It is useful to check time spent on tablet repair. It can be used to
compare incremental repair and non-incremental repair. The time does not
include the time waiting for the tablet scheduler to schedule the tablet
repair task.

Fixes #26505

Closes scylladb/scylladb#26502
2025-11-06 10:00:20 +03:00
Dario Mirovic
c3a673d37f audit: move storage helper creation from audit::start to audit::audit
Extract storage helper creation into `create_storage_helper` function.
Call this function from `audit::audit`. It will be called per shard inside
`sharded<audit>::start` method.

Refs #26022
2025-11-06 03:05:43 +01:00
Dario Mirovic
28c1c0f78d audit: fix formatting in audit::start_audit
Refs #26022
2025-11-06 03:05:17 +01:00
Dario Mirovic
549e6307ec audit: unify create_audit and start_audit
There is no need to have `create_audit` separate from `start_audit`.
`create_audit` just stores the passed parameters, while `start_audit`
does the actual initialization and startup work.

Refs #26022
2025-11-06 03:05:06 +01:00
Calle Wilund
b0061e8c6a gcp_object_storage_test: Re-enable parallelism.
Re-enable parallel execution to get better logs.
Note, this is somewhat wasteful, as we won't re-use test fixture here,
but in the end, it is probably an improvement.
2025-11-05 15:07:26 +00:00
Wojciech Mitros
0a22ac3c9e mv: don't mark the view as built if the reader produced no partitions
When we build a materialized view, we read the entire base table from start to
end to generate all required view updates. If a view is created while another view
is being built on the same base table, this is optimized - we start generating
view updates for the new view from the base table rows that we're currently
reading, and we read the missed initial range again after the previous view
finishes building.
The view building progress is only updated after generating view updates for
some read partitions. However, there are scenarios where we'll generate no
view updates for the entire read range. If this was not handled we could
end up in an infinite view building loop like we did in https://github.com/scylladb/scylladb/issues/17293
To handle this, we mark the view as built if the reader generated no partitions.
However, this is not always the correct conclusion. Another scenario where
the reader won't encounter any partitions is when view building is interrupted,
and then we perform a reshard. In this scenario, we set the reader for all
shards to the last unbuilt token for an existing partition before the reshard.
However, this partition may not exist on a shard after reshard, and if there
are also no partitions with higher tokens, the reader will generate no partitions
even though it hasn't finished view building.
Additionally, we already have a check that prevents infinite view building loops
without taking the partitions generated by the reader into account. At the end
of stream, before looping back to the start, we advance current_key to the end
of the built range and check for built views in that range. This handles the case
where the entire range is empty - the conditions for a built view are:
1. the "next_token" is no greater than "first_token" (the view building process
looped back, so we've built all tokens above "first_token")
2. the "current_token" is no less than "first_token" (after looping back, we've
built all tokens below "first_token")

If the range is empty, we'll pass these conditions on an empty range after advancing
"current_key" to the end because:
1. after looping back, "next_token" will be set to `dht::minimum_token`
2. "current_key" will be set to `dht::ring_position::max()`

In this patch we remove the check for partitions generated by the reader. This fixes
the issue with resharding and it does not resurrect the issue with infinite view building
that the check was introduced for.

Fixes https://github.com/scylladb/scylladb/issues/26523

Closes scylladb/scylladb#26635
2025-11-05 17:02:32 +02:00
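The two built-view conditions listed above amount to a simple predicate; a schematic Python rendering (tokens modeled as plain integers, names illustrative):

```python
def view_is_built(first_token, next_token, current_token):
    # 1. next_token <= first_token: the build looped back past its start,
    #    so all tokens above first_token are built.
    # 2. current_token >= first_token: after looping back, all tokens
    #    below first_token are built too.
    return next_token <= first_token and current_token >= first_token

# Empty-range case after looping back: next_token becomes the minimum
# token and current_key is advanced to the end, so the predicate holds.
MIN_TOKEN, MAX_TOKEN = -2**63, 2**63 - 1
assert view_is_built(first_token=42, next_token=MIN_TOKEN, current_token=MAX_TOKEN)
```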
Petr Gusev
5bda226ff6 storage_proxy: use coroutine::maybe_yield();
This is a small "while at it" refactoring -- better to use
coroutine::maybe_yield with co_await-s.
2025-11-05 14:38:19 +01:00
Petr Gusev
4578304b76 storage_proxy: use gates to track write handlers destruction
In #26408 a write_handler_destroy_promise class was introduced to
wait for abstract_write_response_handler instances destruction. We
strived to minimize the memory footprint of
abstract_write_response_handler, with write_handler_destroy_promise-es
we required only a single additional int. It turned out that in some
cases a lot of write handlers can be scheduled for deletion
at the same time, in such cases the
vector<write_handler_destroy_promise> can become big and cause
'oversized allocation' seastar warnings.

Another concern with write_handler_destroy_promise-es was that they
were more complicated than it was worth.

In this commit we replace write_handler_destroy_promise with simple
gates. One or more gates can be attached to an
abstract_write_response_handler to wait for its destruction. We use
utils::small_vector<gate::holder, 2> to store the attached gates.
The limit 2 was chosen because we expect two gates at the same time
in most cases. One is storage_proxy::_write_handlers_gate,
which is used to wait for all handlers in
cancel_all_write_response_handlers. Another one can be attached by
a caller of cancel_write_handlers. Nothing stops several
cancel_write_handlers calls from happening at the same time, but it
should be rare.

sizeof(utils::small_vector<gate::holder, 2>) is 40; this is a
40.0 / 488 * 100 ~ 8% increase in
sizeof(abstract_write_response_handler), which seems acceptable.

Fixes scylladb/scylladb#26788
2025-11-05 14:37:52 +01:00
Nadav Har'El
8a07b41ae4 test/cqlpy: add test confirming page_size=0 disables paging
In pull request #26384 a discussion started whether page_size=0 really
disables paging, or maybe one needs page_size=-1 to truly disable paging.

The reason for that discussion was commit 08c81427b that started to
use page_size=-1 for internal unpaged queries, and commit 76b31a3 that
incorrectly claimed that page_size>=0 means paging is enabled.

This patch introduces a test that confirms that with page_size=0, paging
is truly disabled - including the size-based (1MB) paging.

The new test is Scylla-only, because Cassandra is anyway missing the
size-based page cutoff (see CASSANDRA-11745).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#26742
2025-11-05 15:52:16 +03:00
Tomasz Grabiec
f8879d797d tablet_allocator: Avoid load balancer failure when replacing the last node in a rack
Introduced in 9ebdeb2

The problem is specific to node replacing and rack-list RF. The
culprit is in the part of the load balancer which determines rack's
shard count. If we're replacing the last node, the rack will contain
no normal nodes, and shards_per_rack will have no entry for the rack,
on which the table still has replicas. This throws std::out_of_range
and fails the tablet draining stage, so the node replace fails.

No backport because the problem exists only on master.

Fixes #26768

Closes scylladb/scylladb#26783
2025-11-05 15:49:51 +03:00
Avi Kivity
8e480110c2 dist: housekeeping: set python.multiprocessing fork mode to "fork"
Python 3.14 changed the multiprocessing fork mode to "forkserver",
presumably for good reasons. However, it conflicts with our
relocatable Python system. "forkserver" forks and execs a Python
process at startup, but it does this without supplying our relocated
ld.so. The system ld.so detects a conflict and crashes.

Fix this by switching back to "fork", which is sufficient for
housekeeping's modest needs.

Closes scylladb/scylladb#26831
2025-11-05 15:47:38 +03:00
Pavel Emelyanov
05d711f221 database_test: Simplify snapshot_with_quarantine_works() test
The test collects Data files from table dir, then _all_ files from
snapshot dir, and then checks whether the former is a subset of the
latter. Using std::includes over two sets makes the code much shorter.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:35:28 +03:00
Pavel Emelyanov
c8492b3562 database_test: Improve snapshot_skip_flush_works test
It has two inaccuracies.

First, when checking the contents of table directory, it uses
pre-populated expected list with "manifest.json" in it. Weird.

Second, when checking the contents of the snapshot directory it doesn't
check that "schema.cql" is there. It's always there, but if something
breaks in the future it may go unnoticed.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:35:26 +03:00
Pavel Emelyanov
5a25d74b12 database_test: Simplify snapshot_works() tests
No functional changes here, just make use of the new lister to shorten
the code. A small side effect -- if the test fails because contents of
directories changes, it will print the exact difference in logs, not
just that N files are missing/present.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:34:25 +03:00
Pavel Emelyanov
365044cdbb database_test: Use collect_files() to remove files
Some test cases remove files from the table directory to perform some
checks over the taken snapshots. Using the collect_files() helper makes the code
easier to read.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:34:24 +03:00
Pavel Emelyanov
e1f326d133 database_test: Use collect_files() to count files in directory
Some test cases want to see that there is more than one file in a
directory, so they can just re-use the new helper. Much shorter this
way.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:32:58 +03:00
Pavel Emelyanov
60d1f78239 database_test: Introduce collect_files() helper
It returns a set of files in a given directory. It will be used by the
following patches.

Implemented using directory_lister, not lister::scan_dir, in order to
help remove the latter in the future.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-11-05 15:32:58 +03:00
Calle Wilund
6c6105e72e test::pylib: Add azure (mock) testing to EAR matrix
Fixes #26782

Adds a provider proxy for azure, using the existing mock server,
now as a fixture.
2025-11-05 10:22:23 +00:00
Calle Wilund
b8a6b6dba9 test::boost::encryption_at_rest: Remove redundant azure test indent 2025-11-05 10:22:23 +00:00
Calle Wilund
10e591bd6b test::boost::encryption_at_rest: Move azure tests to use fixture
Fixes #26781

Makes the test independent of wrapping scripts. Note: retains the
split into "real" and "mock" tests. For other tests, we either always
mock or allow the environment to select mock or real. Here we have
them combined. More expensive, but on the other hand more thorough.
2025-11-05 10:22:22 +00:00
Calle Wilund
1d37873cba test::lib: Add azure mock/real server fixture
Wraps the real/mock azure server for test in a fixture.
Note: retains the current test setup, which explicitly runs
some tests with "real" azure, if available, and some always mock.
2025-11-05 10:22:22 +00:00
Calle Wilund
10041419dc test::pylib::boost: Fix test gather to handle test suites
Fixes #26775
2025-11-05 10:22:22 +00:00
Calle Wilund
565c701226 utils::gcp::object_storage: Fix typo in semaphore init
Fixes #26776

Semaphore storage is ssize_t, not size_t.
2025-11-05 10:22:22 +00:00
Calle Wilund
2edf6cf325 test::boost::encryption_at_rest_test: Remove redundant indent
Removed empty scope and reindents kms test using fixtures.
2025-11-05 10:22:22 +00:00
Calle Wilund
286a655bc0 test::boost::test_encryption_at_rest: Move to AWS KMS fixture for kms test
Fixes #26780

Uses fake/real CI endpoint for AWS KMS tests, and moves these into a
suite for sharing the mock server.
2025-11-05 10:22:22 +00:00
Calle Wilund
a1cc866f35 test::boost::test_encryption_at_rest: Reorder tests and helpers
No code changes. Just reorders code to organize more by provider etc,
prepping for fixtures and test suites.
2025-11-05 10:22:22 +00:00
Calle Wilund
af85b7f61b ent::encryption: Make text helper routines take std::string
Moving away from custom string type. Pure cosmetics.
2025-11-05 10:22:22 +00:00
Calle Wilund
1b0394762e test::pylib::dockerized_service: Handle docker/podman bind error message
If we run non-dbuild, docker/podman can produce a different first bind
error message; we should check for these too.
2025-11-05 10:22:22 +00:00
Calle Wilund
0842b2ae55 test::lib::aws_kms_fixture: Add a fixture object to run mock AWS KMS
Runs local-kms mock AWS KMS server unless overridden by env var.
Allows tests to use real or fake AWS KMS endpoint and shared fixture
for quicker execution.
2025-11-05 10:22:21 +00:00
Calle Wilund
98c060232e test::lib::gcs_fixture: Only set port if running docker image + more retry
Our connect can spuriously fail. Just retry.
2025-11-05 10:22:21 +00:00
Wojciech Mitros
977fa91e3d view_building_coordinator: rollback tasks on the leaving tablet replica
When a tablet migration is started, we abort the corresponding view
building tasks (i.e. we change the state of those tasks to "ABORTED").
However, we don't change the host and shard of these tasks until the
migration successfully completes. When for some reason we have to
rollback the migration, that means the migration didn't finish and
the aborted task still has the host and shard of the migration
source. So when we recreate tasks that should no longer be aborted
due to a rolled-back migration, we should look at the aborted tasks
of the source (leaving) replica. But we don't do it and we look at
the aborted tasks of the target replica.
In this patch we adjust the rollback mechanism to recreate tasks
for the migration source instead of destination. We also fix the
test that should have detected this issue - the injection that
the test was using didn't make us rollback, but we simply retried
a stage of the tablet migration. By using one_shot=False and adding
a second injection, we can now guarantee that the migration will
eventually fail and we'll continue to the 'cleanup_target' and
'revert_migration' stages.

Fixes https://github.com/scylladb/scylladb/issues/26691

Closes scylladb/scylladb#26825
2025-11-05 10:44:06 +01:00
Pavel Emelyanov
2cb98fd612 Merge 'api: storage_service: tasks: unify sync and async compaction APIs' from Aleksandra Martyniuk
Currently, all apis that start a compaction have two versions:
synchronous and asynchronous. They share most of the implementation,
but some checks and params have diverged.

Unify the handlers of synchronous and asynchronous cleanup, major
compaction, and upgrade_sstables.

Fixes: https://github.com/scylladb/scylladb/issues/26715.

Requires backports to all live versions

Closes scylladb/scylladb#26746

* github.com:scylladb/scylladb:
  api: storage_service: tasks: unify upgrade_sstable
  api: storage_service: tasks: force_keyspace_cleanup
  api: storage_service: tasks: unify force_keyspace_compaction
2025-11-05 10:47:14 +03:00
Pavel Emelyanov
59019bc9a9 Merge 'Alternator: allow warning on auth errors before enabling enforcement' from Nadav Har'El
An Alternator user was recently "bit" when switching `alternator_enforce_authorization` from "false" to "true": After the configuration change, all application requests suddenly failed because, unbeknownst to the user, their application used incorrect secret keys.

This series introduces a solution for users who want to **safely** switch `alternator_enforce_authorization`  from "false" to "true": Before switching from "false" to "true", the user can temporarily switch a new option, `alternator_warn_authorization`, to true. In this "warn" mode, authentication and authorization errors are counted in metrics (`scylla_alternator_authentication_failures` and `scylla_alternator_authorization_failures`) and logged as WARNings, but the user's application continues to work. The user can use these metrics or log messages to learn of errors in their application's setup, fix them, and only do the switch of `alternator_enforce_authorization` when the metrics or log messages show there are no more errors.

The first patch is the implementation of the feature - the new configuration option, the metrics and the log messages; the second patch is a test for the new feature; and the third patch is documentation recommending how to use the warn mode and the associated metrics or log messages to safely switch `alternator_enforce_authorization` from false to true.

Fixes #25308

This is a feature that users need, so it should probably be backported to live branches.

Closes scylladb/scylladb#25457

* github.com:scylladb/scylladb:
  docs/alternator: explain alternator_warn_authorization
  test/alternator: tests for new auth failure metrics and log messages
  alternator: add alternator_warn_authorization config
2025-11-05 10:45:17 +03:00
Pavel Emelyanov
fc37518aff test: Check file existence directly
There's a test that checks if temporary-statistics file is gone at some
point. It does it by listing the directory it expects the file to be in
and then comparing the names found with the temp. stat. file name.

It looks like a single file_exists() call is enough for that purpose.

As a "sanity" check this patch adds a validation that the non-temporary
statistics file is there, especially since this file is removed after the
test.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#26743
2025-11-04 19:37:55 +01:00
Avi Kivity
95700c5f7f Merge 'Support counters with tablets' from Michael Litvak
Support the counters feature in tablets keyspaces.

The main change is to fix the counter update during tablets intranode migration.

Counter cell is c = map<host_id, value>. A counter update is applied by doing read-modify-write on a leader replica to retrieve the current host's counter value and transform the mutation to contain the updated value for the host, then apply the mutation and replicate it to other hosts. The read-modify-write is protected against concurrent updates by locking the counter cell.

When the counter is migrated between two shards, it's not enough to lock the counter on the read shard, because in the stage write_both_read_new the read shard is switched, and then we can have concurrent updates reach either the old or the new shard. In order to keep the counter update exclusive we lock both shards when in the stage write_both_read_new.

Also, when applying the transformed mutation we need to respect write_both stages and apply the mutation on both shards. We change it to use `apply_on_shards` similarly to other methods in storage proxy.

The change applies to both tablets and vnodes, they use the same implementation, but for vnodes the behavior should remain equivalent up to some small reordering of the code since it doesn't have intranode migration and reduces to single read shard = write shard.

Fixes https://github.com/scylladb/scylladb/issues/18180

no backport - new feature

Closes scylladb/scylladb#26636

* github.com:scylladb/scylladb:
  docs: counters now work with tablets
  pgo: enable counters with tablets
  test: enable counters tests with tablets
  test: add counters with tablets test
  cql3: remove warning when creating keyspace with tablets
  cql3: allow counters with tablets
  storage_proxy: lock all read shards for counter update
  storage_proxy: apply counter mutation on all write shards
  storage_proxy: move counter update coordination to storage proxy
  storage_proxy: refactor mutate_counter_on_leader
  replica/db: add counter update guard
  replica/db: split counter update helper functions
2025-11-03 22:28:10 +01:00
Raphael S. Carvalho
7f34366b9d sstables_loader: Don't bypass synchronization with busy topology
The patch c543059f86 fixed the synchronization issue between tablet
split and load-and-stream. The synchronization worked only with
raft topology, and therefore was disabled with gossip.
To do the check, storage_service::raft_topology_change_enabled() was
used, but the topology kind is only available/set on shard 0, so the
synchronization was bypassed when load-and-stream ran on any shard
other than 0.

The reason the reproducer didn't catch it is that it was restricted
to a single cpu. It will now run with multiple cpus and catch the
problem observed.

Fixes #22707

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#26730
2025-11-03 18:10:08 +01:00
Michael Litvak
8555fd42df docs: counters now work with tablets
Counters are now supported in tablet-enabled keyspaces, so remove
the documentation that listed counters as an unsupported feature
and the note warning users about the limitation.
2025-11-03 16:04:37 +01:00
Michael Litvak
1337f4213f pgo: enable counters with tablets
Now that counters are supported with tablets, update the keyspace
statement for counters to allow it to run with tablets.
2025-11-03 16:04:37 +01:00
Michael Litvak
1dbf53ca29 test: enable counters tests with tablets
Enable all counters-related tests that were disabled for tablets
because counters were not supported with tablets until now.

Some tests were parametrized to run with both vnodes and tablets, and
the tablets case was skipped, in order to not lose coverage. We change
them to run with the default configuration, since counters are now
supported with both vnodes and tablets and the implementation is the
same, so there is no benefit in running them with both configurations.
2025-11-03 16:04:37 +01:00
Michael Litvak
a6c12ed1ef test: add counters with tablets test
Add a new test for counters with tablets to test things that are
specific to tablets. Test counter updates that are concurrent with
tablet internode and intranode migrations and verify the counter remains
consistent and no updates are lost.
2025-11-03 16:04:37 +01:00
Michael Litvak
60ac13d75d cql3: remove warning when creating keyspace with tablets
When creating a keyspace with tablets, a warning is shown with all the
unsupported features for tablets, which is only counters currently.

Now that counters are also supported with tablets, we can remove this
warning entirely.
2025-11-03 16:04:37 +01:00
Michael Litvak
9208b2f317 cql3: allow counters with tablets
Now that counters work with tablets, allow to create a table with
counters in a tablets-enabled keyspace, and remove the warning about
counters not being supported when creating a keyspace with tablets.

We allow using counters with tablets only when all nodes are upgraded
and support counters with tablets. We add a new feature flag to
determine if this is the case.

Fixes scylladb/scylladb#18180
2025-11-03 16:04:37 +01:00
Michael Litvak
296b116ae2 storage_proxy: lock all read shards for counter update
Previously, a counter update locked only the read shard to protect the
counter's read-modify-write against concurrent updates.

This is not sufficient when the counter is migrated between different
shards, because there is a stage where the read shard switches from the
old shard to the new shard, and during that switch there can be
concurrent counter updates on both shards. If each shard takes only its
own lock, the operations will not be exclusive anymore, and this can
cause lost counter updates.

To fix this, we acquire the counter lock on both shards in the stage
write_both_read_new, when both shards can serve reads. This guarantees
that counter updates continue to be exclusive during intranode
migration.
2025-11-03 16:04:35 +01:00
Michael Litvak
de321218bc storage_proxy: apply counter mutation on all write shards
When applying a counter mutation, use apply_on_shards to apply the
mutation on all write shards, similarly to the way other mutations are
applied in the storage proxy. Previously the mutation was applied only
on the current shard which is the read shard.

This is needed to respect the write_both stages of intranode migration
where we need to apply the mutation on both the old and the new shards.
2025-11-03 16:03:29 +01:00
Michael Litvak
c7e7a9e120 storage_proxy: move counter update coordination to storage proxy
Refactor the counter update to split the functions and have them called
by the storage proxy to prepare for a later change.

Previously in mutate_counter the storage proxy calls the replica
function apply_counter_update that does a few things:
1. checks that the operation can be done: check timeout, disk utilization
2. acquire counter locks
3. do read-modify-write and transform the counter mutation
4. apply the mutation in the replica

In this commit we change it so that these functions are split and called
from the storage proxy, so that we have better control from the storage
proxy when we change it later to work across multiple shards. For
example, we will want to acquire locks on multiple shards, transform it
on one shard, and then apply the mutation on multiple shards.

After the change it works as follows in storage proxy:
1. acquire counter locks
2. call replica prepare to check the operation and transform the mutation
3. call replica apply to apply the transformed mutation
2025-11-03 15:59:46 +01:00
Tomasz Grabiec
e878042987 Revert "Revert "tests(lwt): new test for LWT testing during tablet resize""
This reverts commit 6cb14c7793.

The issue causing the previous revert was fixed in 88765f627a.
2025-11-03 10:38:00 +01:00
Michael Litvak
579031cfc8 storage_proxy: refactor mutate_counter_on_leader
Slightly reorganize the mutate counter function to prepare it for a
later change.

Move the code that finds the read shard and invokes the rest of the
function on the read shard to the caller function. This simplifies the
function mutate_counter_on_leader_and_replicate which now runs on the
read shard and will make it easier to extend.
2025-11-03 08:43:11 +01:00
Michael Litvak
7cc6b0d960 replica/db: add counter update guard
Add a RAII guard for counter update that holds the counter locks and the
table operation, and extract the creation of the guard to a separate
function.

This prepares it for a later change where we will want to obtain the
guard externally from the storage proxy.
2025-11-03 08:43:11 +01:00
Michael Litvak
88fd9a34c4 replica/db: split counter update helper functions
Split do_apply_counter_update to a few smaller and simpler functions to
help prepare for a later change.
2025-11-03 08:43:11 +01:00
Avi Kivity
9b6ce030d0 sstables: remove quadratic (and possibly exponential) compile time in parse()
parse() taking a list of elements is quadratic (during compile time) in
that it generates recursive calls to itself, each time with one fewer
parameter. The total size of the parameter lists in all these generated
functions is quadratic in the initial parameter list size.

It's also exponential if we ignore inlining limits, since each .then()
call expands to two branches - a ready future branch and a non-ready
future branch. If the compiler did not give up, we'd have 2^list_len
branches. For sure the compiler does not do so indefinitely, but the effort
getting there is wasted.

Simplify by using a fold expression over the comma operator. Instead
of passing the remaining parameter list in each step, we pass only
the parameter we are processing now, making processing linear, and not
generating unnecessary functions.

It would be better expressed using pack expansion statements, but these
are part of C++26.

The largest offender is probably stats_metadata, with 21 elements.

dev-mode sstables.o:

   text	   data	    bss	    dec	    hex	filename
1760059	   1312	   7673	1769044	 1afe54	sstables.o.before
1745533	   1312	   7673	1754518	 1ac596	sstables.o.after

We save about 15k of text with presumably a corresponding (small)
decrease in compile time.

Closes scylladb/scylladb#26735
2025-11-02 13:09:37 +01:00
Jenkins Promoter
cb30eb2e21 Update pgo profiles - aarch64 2025-11-01 05:23:52 +02:00
Jenkins Promoter
e3a0935482 Update pgo profiles - x86_64 2025-11-01 04:54:49 +02:00
Petr Gusev
88765f627a paxos_state: get_replica_lock: remove shard check
This check is incorrect: the current shard may be looking at
the old version of tablets map:
* an accept RPC comes to replica shard 0, which is already at write_both_read_new
* the new shard is shard 1, so paxos_state::accept is called on shard 1
* shard 1 is still at "streaming" -> shards_ready_for_reads() returns old
shard 0

Fixes scylladb/scylladb#26801

Closes scylladb/scylladb#26809
2025-10-31 21:37:39 +01:00
Avi Kivity
7a72155374 Merge 'Introduce nodetool excludenode' from Tomasz Grabiec
If a node is dead and cannot be brought back, tablet migrations are
stuck, until the node is explicitly marked as "permanently dead" /
"ignored node" / "excluded" (name differs in different contexts).

Currently, this is done during removenode and replace operations but
it should be possible to only mark the node as dead, for the purpose
of unblocking migrations or other topology operations, without doing
the actual removenode, because full removal might be currently
impossible, or not desirable due to lack of capacity or priorities.

This patch introduces this kind of API:

```
  nodetool excludenode <host-id> [ ... <host-id> ]
```

Having this kind of API is an improvement in user experience in
several cases. For example, when we lose a rack, the only viable
option for recovery is to run removenode with an extra
--ignore-dead-nodes option. This removenode will fail in the tablet
draining phase, as there is no live node in the rack to rebuild
replicas in. This is confusing to the operator, but necessary before
ALTER KEYSPACE can proceed in order to change replication options to
drop the rack from RF.

Having this API allows operators to have more unified procedures,
where "nodetool excludenode" is always the first step of recovery,
which unblocks further topology operations, both those which restore
capacity, but also auto-scaling, tablet split/merge, load balancing,
etc.

Fixes #21281

The PR also changes "nodetool status" to show excluded nodes,
they have 'X' in their status instead of 'D'.

Closes scylladb/scylladb#26659

* github.com:scylladb/scylladb:
  nodetool: status: Show excluded nodes as having status 'X'
  test: py: Test scenario involving excludenode API
  nodetool: Introduce excludenode command
2025-10-31 22:14:57 +02:00
Avi Kivity
d458dd41c6 Merge 'Avoid input_/output_stream-s default initialization and move-assignment' from Pavel Emelyanov
A recent seastar update deprecated the in/out stream usage pattern in which a stream is default-constructed early and then move-assigned with the proper one (see scylladb/seastar#3051). This PR fixes a few places in Scylla that still use it.

Adopting newer seastar API, no need to backport

Closes scylladb/scylladb#26747

* github.com:scylladb/scylladb:
  commitlog: Remove unused work::r stream variable
  ec2_snitch: Fix indentation after previous patch
  ec2_snitch: Coroutinize the aws_api_call_once()
  sstable: Construct output_stream for data instantly
  test: Don't reuse on-stack input stream
2025-10-31 21:22:41 +02:00
Avi Kivity
adf9c426c2 Merge 'db/config: Change default SSTable compressor to LZ4WithDictsCompressor' from Nikos Dragazis
`sstable_compression_user_table_options` allows configuring a node-global SSTable compression algorithm for user tables via scylla.yaml. The current default is LZ4Compressor (inherited from Cassandra).

Make LZ4WithDictsCompressor the new default. Metrics from real datasets in the field have shown significant improvements in compression ratios.

If the dictionary compression feature is not enabled in the cluster (e.g., during an upgrade), fall back to the `LZ4Compressor`. Once the feature is enabled, flip the default back to the dictionary compressor using a listener callback.

Fixes #26610.

Closes scylladb/scylladb#26697

* github.com:scylladb/scylladb:
  test/cluster: Add test for default SSTable compressor
  db/config: Change default SSTable compressor to LZ4WithDictsCompressor
  db/config: Deprecate sstable_compression_dictionaries_allow_in_ddl
  boost/cql_query_test: Get expected compressor from config
2025-10-31 21:15:18 +02:00
Lakshmi Narayanan Sreethar
3eb7193458 backlog_controller: compute backlog even when static shares are set
The compaction manager backlog is exposed via metrics, but if static
shares are set, the backlog is never calculated. As a result, there is
no way to determine the backlog or whether the static shares need
adjustment. Fix that by calculating the backlog even when static shares are
set.

Fixes #26287

Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>

Closes scylladb/scylladb#26778
2025-10-31 18:18:36 +02:00
Michał Hudobski
fd521cee6f test: fix typo in vector_index test
Unfortunately, in https://github.com/scylladb/scylladb/pull/26508
a typo that changes the behavior of a test was introduced.
This patch fixes the typo.

Closes scylladb/scylladb#26803
2025-10-31 18:35:02 +03:00
Tomasz Grabiec
284c73d466 scripts: pull_github_pr.sh: Fix auth problem detection
Before the patch, the script printed:

parse error: Invalid numeric literal at line 2, column 0

Closes scylladb/scylladb#26818
2025-10-31 18:32:58 +03:00
Michael Litvak
e7dbccd59e cdc: use chunked_vector instead of vector for stream ids
use utils::chunked_vector instead of std::vector to store cdc stream
sets for tablets.

a cdc stream set usually represents all streams for a specific table and
timestamp, and has a stream id per each tablet of the table. each stream
id is represented by 16 bytes. thus the vector could require quite large
contiguous allocations for a table that has many tablets. change it to
chunked_vector to avoid large contiguous allocations.

Fixes scylladb/scylladb#26791

Closes scylladb/scylladb#26792
2025-10-31 13:02:34 +01:00
Tomasz Grabiec
1c0d847281 Merge 'load_balancer: load_stats reconcile after tablet migration and table resize' from Ferenc Szili
This change adds the ability to move tablet sizes in load_stats after a tablet migration or table resize (split/merge). This is needed because the size-based load balancer needs tablet size data that is as accurate as possible, in order to work on a fresh tablet size distribution and issue correct tablet migrations.

This is the second part of the size based load balancing changes:

- First part for tablet size collection via load_stats: #26035
- Second part reconcile load_stats: #26152
- The third part for load_sketch changes: #26153
- The fourth part which performs tablet load balancing based on tablet size: #26254

This is a new feature and backport is not needed.

Closes scylladb/scylladb#26152

* github.com:scylladb/scylladb:
  load_balancer: load_stats reconcile after tablet migration and table resize
  load_stats: change data structure which contains tablet sizes
2025-10-31 09:58:25 +01:00
Tomasz Grabiec
2bd173da97 nodetool: status: Show excluded nodes as having status 'X'
Example:

$ build/dev/scylla nodetool status
Datacenter: dc1
===============
Status=Up/Down/eXcluded
|/ State=Normal/Leaving/Joining/Moving
-- Address   Load      Tokens Owns Host ID                              Rack
UN 127.0.0.1 783.42 KB 1      ?    753cb7b0-1b90-4614-ae17-2cfe470f5104 rack1
XN 127.0.0.2 785.10 KB 1      ?    92ccdd23-5526-4863-844a-5c8e8906fa55 rack2
UN 127.0.0.3 708.91 KB 1      ?    781646ad-c85b-4d77-b7e3-8d50c34f1f17 rack3
2025-10-31 09:03:20 +01:00
Tomasz Grabiec
87492d3073 test: py: Test scenario involving excludenode API 2025-10-31 09:03:20 +01:00
Tomasz Grabiec
55ecd92feb nodetool: Introduce excludenode command
If a node is dead and cannot be brought back, tablet migrations are
stuck, until the node is explicitly marked as "permanently dead" /
"ignored node" / "excluded" (name differs in different contexts).

Currently, this is done during removenode and replace operations but
it should be possible to only mark the node as dead, for the purpose
of unblocking migrations or other topology operations, without doing
the actual removenode, because full removal might be currently
impossible, or not desirable due to lack of capacity or priorities.

This patch introduces this kind of API:

  nodetool excludenode <host-id> [ ... <host-id> ]

Having this kind of API is an improvement in user experience in
several cases. For example, when we lose a rack, the only viable
option for recovery is to run removenode with an extra
--ignore-dead-nodes option. This removenode will fail in the tablet
draining phase, as there is no live node in the rack to rebuild
replicas in. This is confusing to the operator, but necessary before
ALTER KEYSPACE can proceed in order to change replication options to
drop the rack from RF.

Having this API allows operators to have more unified procedures,
where "nodetool excludenode" is always the first step of recovery,
which unblocks further topology operations, both those which restore
capacity, but also auto-scaling, tablet split/merge, load balancing,
etc.

Fixes #21281
2025-10-31 09:03:20 +01:00
Nikos Dragazis
a0bf932caa test/cluster: Add test for default SSTable compressor
The previous patch made the default compressor dependent on the
SSTABLE_COMPRESSION_DICTS feature:
* LZ4Compressor if the feature is disabled
* LZ4WithDictsCompressor if the feature is enabled

Add a test to verify that the cluster uses the right default in every
case.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-10-30 15:53:54 +02:00
Nikos Dragazis
2fc812a1b9 db/config: Change default SSTable compressor to LZ4WithDictsCompressor
`sstable_compression_user_table_options` allows configuring a
node-global SSTable compression algorithm for user tables via
scylla.yaml. The current default is `LZ4Compressor` (inherited from
Cassandra).

Make `LZ4WithDictsCompressor` the new default. Metrics from real datasets
in the field have shown significant improvements in compression ratios.

If the dictionary compression feature is not enabled in the cluster
(e.g., during an upgrade), fall back to the `LZ4Compressor`. Once the
feature is enabled, flip the default back to the dictionary compressor
using a listener callback.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-10-30 15:53:49 +02:00
Aleksandra Martyniuk
fdd623e6bc api: storage_service: tasks: unify upgrade_sstable
Currently, all apis that start a compaction have two versions:
synchronous and asynchronous. They share most of the implementation,
but some checks and params have diverged.

Unify the handlers of /storage_service/keyspace_upgrade_sstables/{keyspace}
and /tasks/compaction/keyspace_upgrade_sstables/{keyspace}.
2025-10-30 11:42:48 +01:00
Aleksandra Martyniuk
044b001bb4 api: storage_service: tasks: force_keyspace_cleanup
Currently, all apis that start a compaction have two versions:
synchronous and asynchronous. They share most of the implementation,
but some checks and params have diverged.

Unify the handlers of /storage_service/keyspace_cleanup/{keyspace}
and /tasks/compaction/keyspace_cleanup/{keyspace}.
2025-10-30 11:42:47 +01:00
Aleksandra Martyniuk
12dabdec66 api: storage_service: tasks: unify force_keyspace_compaction
Currently, all apis that start a compaction have two versions:
synchronous and asynchronous. They share most of the implementation,
but some checks and params have diverged.

Add consider_only_existing_data parameter to /tasks/compaction/keyspace_compaction/{keyspace},
to match the synchronous version of the api (/storage_service/keyspace_compaction/{keyspace}).

Unify the handlers of both apis.
2025-10-30 11:33:17 +01:00
Nikos Dragazis
96e727d7b9 db/config: Deprecate sstable_compression_dictionaries_allow_in_ddl
The option is a knob that allows rejecting dictionary-aware compressors
in the validation stage of CREATE/ALTER statements, and in the
validation of `sstable_compression_user_table_options`. It was
introduced in 7d26d3c7cb to allow the admins of Scylla Cloud to
selectively enable it in certain clusters. For more details, check:
https://github.com/scylladb/scylla-enterprise/issues/5435

As of this series, we want to start offering dictionary compression as
the default option in all clusters, i.e., treat it as a generally
available feature. This makes the knob redundant.

Additionally, making dictionary compression the default choice in
`sstable_compression_user_table_options` creates an awkward dependency
with the knob (disabling the knob should cause
`sstable_compression_user_table_options` to fall back to a non-dict
compressor as default). That may not be very clear to the end user.

For these reasons, mark the option as "Deprecated", remove all relevant
tests, and adjust the business logic as if dictionary compression is
always available.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-10-29 20:13:08 +02:00
Nikos Dragazis
d95ebe7058 boost/cql_query_test: Get expected compressor from config
Since 5b6570be52, the default SSTable compression algorithm for user
tables is no longer hardcoded; it can be configured via the
`sstable_compression_user_table_options.sstable_compression` option in
scylla.yaml.

Modify the `test_table_compression` test to get the expected value from
the configuration.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
2025-10-29 14:52:43 +02:00
Nadav Har'El
492c664fbb docs/alternator: explain alternator_warn_authorization
The previous patches added the ability to set
alternator_warn_authorization. In this patch we add to our
documentation a recommendation that this setting be used as an
intermediate step when wanting to change alternator_enforce_authorization
from "false" to "true". We explain why this is useful and important.

The new documentation is in docs/alternator/compatibility.md, where
we previously explained the alternator_enforce_authorization configuration.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-10-29 11:16:29 +02:00
Nadav Har'El
2dbd1a85a3 test/alternator: tests for new auth failure metrics and log messages
This patch adds to test_metrics.py tests that authentication and
authorization errors increment, respectively, the new metrics

    scylla_alternator_authentication_failures
    scylla_alternator_authorization_failures

This patch also adds in test_logs.py tests that verify that log
messages are generated on different types of authentication/authorization
failures.

The tests also check how configuring alternator_enforce_authorization
and alternator_warn_authorization changes these behaviors:
  * alternator_enforce_authorization determines whether an auth error
    will cause the request to fail, or the failure is counted but then
    ignored.
  * alternator_warn_authorization determines whether an auth error will
    cause a WARN-level log message to be generated (the failure is also
    counted).
  * If both configuration flags are false, Alternator doesn't even
    attempt to check authentication or authorization - so errors aren't
    even counted.

Because the new tests live-update the alternator_*_authorization
configuration options, they also serve as a test that live-updating
this option works correctly.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-10-29 11:16:29 +02:00
Nadav Har'El
51186b2f2c alternator: add alternator_warn_authorization config
Before this patch, the configuration alternator_enforce_authorization
is a boolean: true means enforce authentication checks (i.e., each
request is signed by a valid user) and authorization checks (the user
who signed the request is allowed by RBAC to perform this request).

This patch adds a second boolean configuration option,
alternator_warn_authorization. When alternator_enforce_authorization
is false but alternator_warn_authorization is true, authentication and
authorization checks are performed as in enforce mode, but failures
are ignored and counted in two new metrics:

    scylla_alternator_authentication_failures
    scylla_alternator_authorization_failures

Additionally, each authentication or authorization error is logged as
a WARN-level log message. Some users prefer those log messages over
metrics, as the log messages contain additional information about the
failure that can be useful - such as the address of the misconfigured
client, or the username attempted in the request.

All combinations of the two configuration options are allowed:
 * If just "enforce" is true, auth failures cause a request failure.
   The failures are counted, but not logged.
 * If both "enforce" and "warn" are true, auth failures cause a request
   failure. The failures are both counted and logged.
 * If just "warn" is true, auth failures are ignored (the request
   is allowed to complete) but are counted and logged.
 * If neither "enforce" nor "warn" is true, no authentication or
   authorization checks are done at all. So we don't know about failures,
   and naturally we don't count or log them.

This patch is fairly straightforward, doing mainly the following
things:

1. Add an alternator_warn_authorization config parameter.

2. Make sure alternator_enforce_authorization is live-updatable (we'll
   use this in a test in the next patch). It "almost" was, but a typo
   prevented the live update from working properly.

3. Add the two new metrics, and increment them in every type of
   authentication or authorization error.
   Some code that needs to increment these new metrics didn't have
   access to the "stats" object, so we had to pass it around more.

4. Add log messages when alternator_warn_authorization is true.

5. If alternator_enforce_authorization is false, make the auth check
   allow the request to proceed (after having counted and/or logged
   the auth error).

A separate patch will follow to add documentation suggesting to users
how to use the new "warn" option to safely switch from non-enforcing
to enforcing mode. Another patch will add tests for the new configuration
options, new metrics and new log messages.

Fixes #25308.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2025-10-29 11:16:26 +02:00
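The four flag combinations listed in the commit above can be modeled as a small truth table (an illustrative Python model, not Alternator source code):

```python
def handle_auth_failure(enforce: bool, warn: bool):
    """Model of what happens on an auth failure, as a tuple of
    (checked, counted, logged, request_fails)."""
    if not enforce and not warn:
        # Neither flag set: checks are skipped entirely, nothing to count.
        return (False, False, False, False)
    # A check was performed and failed: always counted in the metrics.
    return (True, True, warn, enforce)

# enforce only: fail the request, count, don't log
assert handle_auth_failure(True, False) == (True, True, False, True)
# enforce + warn: fail the request, count, and log
assert handle_auth_failure(True, True) == (True, True, True, True)
# warn only: let the request complete, but count and log
assert handle_auth_failure(False, True) == (True, True, True, False)
# neither: no checks performed at all
assert handle_auth_failure(False, False) == (False, False, False, False)
```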
Taras Veretilnyk
e62ebdb967 table: enable integrity checks for streaming reader
Previously, streaming readers only verified the checksum of compressed SSTables.
This patch extends checks to also include the digest and the uncompressed checksum (CRC).

These additional checks require reading the digest and CRC components from disk,
which may cause some I/O overhead. For uncompressed SSTables, this involves loading and computing checksums and digest from the data,
while for compressed SSTables - where checksums are already verified inline - the only extra cost is reading and verifying the digest.
If the reader range doesn't cover the full SSTable, the digest check is skipped.
2025-10-28 19:27:35 +01:00
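The cost/coverage trade-off described in the commit above can be paraphrased as a small decision function (illustrative only; the function and label names are not from the Scylla code):

```python
def streaming_reader_checks(compressed: bool, covers_full_sstable: bool):
    """Which integrity checks apply to a streaming read (a paraphrase of
    the commit message above, not the actual reader logic)."""
    checks = []
    if compressed:
        # Per-chunk checksums are already verified inline while reading.
        checks.append("inline chunk checksum")
    else:
        # Uncompressed data needs the CRC component loaded and computed.
        checks.append("uncompressed CRC")
    if covers_full_sstable:
        # The digest can only be verified over the whole data file, so it
        # is skipped for partial-range reads.
        checks.append("digest")
    return checks

assert streaming_reader_checks(True, True) == ["inline chunk checksum", "digest"]
assert streaming_reader_checks(False, False) == ["uncompressed CRC"]
```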
Taras Veretilnyk
06e1b47ec6 table: Add integrity option to table::make_sstable_reader() 2025-10-28 19:27:35 +01:00
Taras Veretilnyk
deb8e32e86 sstables: Add integrity option to create_single_key_sstable_reader
Added an sstables::integrity_check parameter to the create_single_key_sstable_reader methods across their implementations.
This allows callers to enable SSTable integrity checks during single-key reads.
2025-10-28 19:27:35 +01:00
Pavel Emelyanov
e99c8eee08 commitlog: Remove unused work::r stream variable
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-10-28 19:46:29 +03:00
Pavel Emelyanov
92462e502f ec2_snitch: Fix indentation after previous patch
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-10-28 19:31:08 +03:00
Pavel Emelyanov
7640ade04d ec2_snitch: Coroutinize the aws_api_call_once()
The method connects a socket, grabs in/out streams from it, then writes
the HTTP request and reads and parses the response. For that it uses class
variables for the socket and streams, but there's no real need for that --
all three actually exist only throughout the method's lifetime.

To fix it, coroutinize the method. The same could be achieved by moving
the connected socket and streams into a do_with() context, but a coroutine
is cleaner.

(indentation is left broken)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-10-28 19:29:25 +03:00
Pavel Emelyanov
5d89816fed sstable: Construct output_stream for data instantly
This change makes the local output_stream variable be constructed in the
declaration statement with the help of the ternary operator, thus avoiding
both default-initialization and a move-assignment dependent on a
standalone condition check.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-10-28 19:27:22 +03:00
Pavel Emelyanov
37b9cccc1c test: Don't reuse on-stack input stream
The test consists of several snippets, each creating an input_stream for
some short operation and checking the result. Each snippet overwrites
the local `input_stream in` variable with a new one.

This change wraps each of those snippets in its own code block so that
each has its own `input_stream in` variable.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2025-10-28 19:25:07 +03:00
Ferenc Szili
10f07fb95a load_balancer: load_stats reconcile after tablet migration and table resize
This change adds the ability to move tablet sizes in load_stats after a
tablet migration or table resize (split/merge). This is needed because
the size-based load balancer needs tablet size data that is as
accurate as possible in order to issue migrations which improve
load balance.
2025-10-28 12:12:09 +01:00
Ferenc Szili
b4ca12b39a load_stats: change data structure which contains tablet sizes
This patch changes the tablet size map in load_stats. Previously, this
data structure was:

std::unordered_map<range_based_tablet_id, uint64_t> tablet_sizes;

and is changed into:

std::unordered_map<table_id, std::unordered_map<dht::token_range, uint64_t>> tablet_sizes;

This allows for improved performance of tablet size reconciliation.
2025-10-24 14:37:00 +02:00
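The shape change can be illustrated with Python dicts standing in for the C++ maps (the table names and range values below are made up):

```python
# Before: one flat map keyed by (table, token_range); reconciling a single
# table's tablets required scanning every entry.
old_tablet_sizes = {
    ("tbl1", (0, 100)): 4096,
    ("tbl1", (100, 200)): 8192,
    ("tbl2", (0, 100)): 1024,
}

# After: a two-level map, table_id -> {token_range -> size}; all tablets
# of one table are reachable with a single lookup.
new_tablet_sizes = {
    "tbl1": {(0, 100): 4096, (100, 200): 8192},
    "tbl2": {(0, 100): 1024},
}

per_table = new_tablet_sizes["tbl1"]  # O(1) access to one table's sizes
assert sum(per_table.values()) == 12288
```

Grouping by table first is what makes per-table reconciliation after a migration or split/merge cheap: only the affected table's inner map needs to be touched.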
Michael Litvak
35711a4400 test: cdc: test cdc compatible schema
Add a simple test verifying our changes for the compatible CDC schema.
The test checks we can write to a table with CDC enabled after ALTER and
after node restart.
2025-10-21 14:14:34 +02:00
Michael Litvak
448e14a3b7 cdc: use compatible cdc schema
In the CDC log transformer, when augmenting a base mutation, use the CDC
log schema that is compatible with the base schema, if set.

Now that the base schema has a pointer to its CDC schema, we can use it
instead of getting the current schema from the db, which may not be
compatible with the base schema.

The compatible CDC schema may not be set if the cluster is not using
raft mode for schema. In this case, we maintain the previous behavior.
2025-10-21 14:14:33 +02:00
Michael Litvak
6e2513c4d2 db: schema_applier: create schema with pointer to CDC schema
When creating a schema for a non-CDC table in the schema_applier, find
its CDC schema that we created previously in the same operation, if any,
and create the schema with a pointer to the CDC schema.

We use the fact that for a base table with CDC enabled, its CDC schema
is created or altered together in the same group0 operation.

Similarly, in schema_tables, when creating table schemas from the
schema tables, first create all schemas that don't have CDC enabled,
then create schemas that have CDC enabled by extending them with the
pointer to the CDC schema that we created before.

There are a few additional cases where we create schemas, and we need to
consider how to handle them.

When loading a schema from schema tables in the schema_loader we decide
not to set the CDC schema, because this schema is mostly used for tools
and it's not used for generating CDC mutations.

When transporting a schema by RPC in the migration manager, we don't
transport its CDC schema, and we always set it to null. Because we use
raft we expect this shouldn't have any effect, because the schema is
synchronized through raft and not through the RPC.
2025-10-21 14:13:43 +02:00
Michael Litvak
4fe13c04a9 db: schema_applier: extract cdc tables
Previously in the schema applier we have two maps of schema_mutations,
for tables and for views. Now create another map for CDC tables by
extracting them from the non-views tables map.

We maintain the previous behavior by applying each operation that's done
on the tables map to the CDC map as well.

Later we will want to handle CDC and non-CDC tables differently. We want
to be able to create all CDC schemas first, so when we create the
non-CDC tables we can create them with a pointer to their CDC schemas.
2025-10-21 14:13:43 +02:00
Michael Litvak
ac96e40f13 schema: add pointer to CDC schema
Add to the schema object a member that points to the CDC schema object
that is compatible with this schema, if any.

The compatible CDC schema is created and altered with its base schema in
the same group0 operation.

When generating CDC log mutations for some base mutation we want them to
be created using a compatible schema that has a CDC column corresponding
to each base column. This change will allow us to find the right CDC
schema given a base mutation.

We also update the relevant structures in the schema registry that are
related to learning about schemas and transporting schemas across
shards or nodes.

When transporting a schema as frozen_schema, we need to transport the
frozen cdc schema as well, and set it again when unfreezing and
reconstructing the schema.

When adding a schema to the registry, we need to ensure its CDC schema
is added to the registry as well.

Currently we always set the CDC schema to nullptr and maintain the
previous behavior. We will change it in a later commit. Until then, we
mark all places where CDC schema is passed clearly so we don't forget
it.
2025-10-21 14:13:43 +02:00
Michael Litvak
60f5c93249 schema_registry: remove base_info from global_schema_ptr
Remove the _base_info member from global_schema_ptr, and use the
base_info stored in the schema registry entry instead.

Currently when constructing a global_schema_ptr from a schema_ptr it
extracts and stores the base_info from the schema_ptr. Later it uses it
to reconstruct the schema_ptr, together with the frozen schema from the
schema registry entry.

But we can use the base_info that is already stored in the
schema registry entry.
2025-10-21 14:13:43 +02:00
Michael Litvak
085abef05d schema_registry: use extended_frozen_schema in schema load
Change the schema loader type in the schema_registry to return an
extended_frozen_schema instead of view_schema_and_base_info, and
remove view_schema_and_base_info, which is no longer used.

The casting between them is trivial.
2025-10-21 14:13:43 +02:00
Michael Litvak
8c7c1db14b schema_registry: replace frozen_schema+base_info with extended_frozen_schema
The schema_registry_entry holds a frozen_schema and a base_info. The
base_info is extracted from the schema_ptr on load of a schema_ptr, and
it is used when unfreezing the schema.

But this is exactly what extended_frozen_schema is doing, so we can
just store an object of this type in the schema_registry_entry.

This makes the code simpler because the schema registry doesn't need to
be aware of the base_info.
2025-10-21 14:13:43 +02:00
Michael Litvak
278801b2a6 frozen_schema: extract info from schema_ptr in the constructor
Currently we construct a frozen schema with base info in a few places,
and the caller is responsible for constructing the frozen schema and
extracting the base info if it's a view table.

We change it to make it simpler and remove the burden from the caller.
The caller can simply pass the schema_ptr, and the constructor for
extended_frozen_schema will construct the frozen schema and extract
the additional info it needs. This will make it easier to add additional
fields, and reduces code duplication.

We also make temporary castings between extended_frozen_schema and
view_schema_and_base_info for the transition, which are trivial, until
they are combined to a single type.
2025-10-21 14:13:42 +02:00
Michael Litvak
154d5c40c8 frozen_schema: rename frozen_schema_with_base_info to extended_frozen_schema
This commit starts a series of refactoring commits of the frozen_schema
to reduce duplication and make it easier to extend.

Currently there are two essentially identical types,
frozen_schema_with_base_info and view_schema_and_base_info in the
schema_registry that hold a frozen_schema together with a base_info for
view schemas.

Their role is to pass around a frozen schema together with additional
info that is extracted from the schema and travels with it when it is
transported across shards or nodes. This info is needed for
reconstructing the schema and is not part of the schema mutations.

Our goal is to combine them to a single type that we will call
extended_frozen_schema.
2025-10-21 14:13:42 +02:00
242 changed files with 8741 additions and 3377 deletions


@@ -142,20 +142,31 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
def with_github_keyword_prefix(repo, pr):
pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
match = re.findall(pattern, pr.body, re.IGNORECASE)
if not match:
for commit in pr.get_commits():
match = re.findall(pattern, commit.commit.message, re.IGNORECASE)
if match:
print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
break
if not match:
print(f'No valid close reference for {pr.number}')
return False
else:
# GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
# JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
# Check PR body for GitHub issues
github_match = re.findall(github_pattern, pr.body, re.IGNORECASE)
# Check PR body for JIRA issues
jira_match = re.findall(jira_pattern, pr.body, re.IGNORECASE)
match = github_match or jira_match
if match:
return True
for commit in pr.get_commits():
github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
if github_match or jira_match:
print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
return True
print(f'No valid close reference for {pr.number}')
return False
def main():
args = parse_args()
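The two patterns in the diff above can be exercised directly; this snippet reproduces them with a concrete repo name substituted for `repo.full_name`:

```python
import re

repo = "scylladb/scylladb"  # stand-in for repo.full_name
# GitHub issue pattern: #123, scylladb/scylladb#123, or a full issues URL
github_pattern = (rf"(?:fix(?:|es|ed))\s*:?\s*"
                  rf"(?:(?:(?:{repo})?#)|https://github\.com/{repo}/issues/)(\d+)")
# JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
jira_pattern = (r"(?:fix(?:|es|ed))\s*:?\s*"
                r"(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))")

assert re.findall(github_pattern, "Fixes #27055", re.IGNORECASE) == ["27055"]
assert re.findall(github_pattern, "fix: scylladb/scylladb#123",
                  re.IGNORECASE) == ["123"]
assert re.findall(jira_pattern, "Fixed PKG-92", re.IGNORECASE) == ["PKG-92"]
assert re.findall(jira_pattern,
                  "Fixes https://scylladb.atlassian.net/browse/PKG-92",
                  re.IGNORECASE) == ["PKG-92"]
```

Note that `re.findall` returns only the capture group (the issue number or JIRA key), which is all the script needs to confirm a close reference exists.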

.github/workflows/trigger_ci.yaml (new file)

@@ -0,0 +1,242 @@
name: Trigger next gating
on:
pull_request_target:
types: [opened, reopened, synchronize]
issue_comment:
types: [created]
jobs:
trigger-ci:
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- name: Checkout PR code
uses: actions/checkout@v3
with:
fetch-depth: 0 # Needed to access full history
ref: ${{ github.event.pull_request.head.ref }}
- name: Fetch before commit if needed
run: |
if ! git cat-file -e ${{ github.event.before }} 2>/dev/null; then
echo "Fetching before commit ${{ github.event.before }}"
git fetch --depth=1 origin ${{ github.event.before }}
fi
- name: Compare commits for file changes
if: github.action == 'synchronize'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "Base: ${{ github.event.before }}"
echo "Head: ${{ github.event.after }}"
TREE_BEFORE=$(git show -s --format=%T ${{ github.event.before }})
TREE_AFTER=$(git show -s --format=%T ${{ github.event.after }})
echo "TREE_BEFORE=$TREE_BEFORE" >> $GITHUB_ENV
echo "TREE_AFTER=$TREE_AFTER" >> $GITHUB_ENV
- name: Check if last push has file changes
run: |
if [[ "${{ env.TREE_BEFORE }}" == "${{ env.TREE_AFTER }}" ]]; then
echo "No file changes detected in the last push, only commit message edit."
echo "has_file_changes=false" >> $GITHUB_ENV
else
echo "File changes detected in the last push."
echo "has_file_changes=true" >> $GITHUB_ENV
fi
- name: Rule 1 - Check PR draft or conflict status
run: |
# Check if PR is in draft mode
IS_DRAFT="${{ github.event.pull_request.draft }}"
# Check if PR has 'conflict' label
HAS_CONFLICT_LABEL="false"
LABELS='${{ toJson(github.event.pull_request.labels) }}'
if echo "$LABELS" | jq -r '.[].name' | grep -q "^conflict$"; then
HAS_CONFLICT_LABEL="true"
fi
# Set draft_or_conflict variable
if [[ "$IS_DRAFT" == "true" || "$HAS_CONFLICT_LABEL" == "true" ]]; then
echo "draft_or_conflict=true" >> $GITHUB_ENV
echo "✅ Rule 1: PR is in draft mode or has conflict label - setting draft_or_conflict=true"
else
echo "draft_or_conflict=false" >> $GITHUB_ENV
echo "✅ Rule 1: PR is ready and has no conflict label - setting draft_or_conflict=false"
fi
echo "Draft status: $IS_DRAFT"
echo "Has conflict label: $HAS_CONFLICT_LABEL"
echo "Result: draft_or_conflict = $draft_or_conflict"
- name: Rule 2 - Check labels
run: |
# Check if PR has P0 or P1 labels
HAS_P0_P1_LABEL="false"
LABELS='${{ toJson(github.event.pull_request.labels) }}'
if echo "$LABELS" | jq -r '.[].name' | grep -E "^(P0|P1)$" > /dev/null; then
HAS_P0_P1_LABEL="true"
fi
# Check if PR already has force_on_cloud label
echo "HAS_FORCE_ON_CLOUD_LABEL=false" >> $GITHUB_ENV
if echo "$LABELS" | jq -r '.[].name' | grep -q "^force_on_cloud$"; then
HAS_FORCE_ON_CLOUD_LABEL="true"
echo "HAS_FORCE_ON_CLOUD_LABEL=true" >> $GITHUB_ENV
fi
echo "Has P0/P1 label: $HAS_P0_P1_LABEL"
echo "Has force_on_cloud label: $HAS_FORCE_ON_CLOUD_LABEL"
# Add force_on_cloud label if PR has P0/P1 and doesn't already have force_on_cloud
if [[ "$HAS_P0_P1_LABEL" == "true" && "$HAS_FORCE_ON_CLOUD_LABEL" == "false" ]]; then
echo "✅ Rule 2: PR has P0 or P1 label - adding force_on_cloud label"
curl -X POST \
-H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels" \
-d '{"labels":["force_on_cloud"]}'
elif [[ "$HAS_P0_P1_LABEL" == "true" && "$HAS_FORCE_ON_CLOUD_LABEL" == "true" ]]; then
echo "✅ Rule 2: PR has P0 or P1 label and already has force_on_cloud label - no action needed"
else
echo "✅ Rule 2: PR does not have P0 or P1 label - no force_on_cloud label needed"
fi
SKIP_UNIT_TEST_CUSTOM="false"
if echo "$LABELS" | jq -r '.[].name' | grep -q "^ci/skip_unit-tests_custom$"; then
SKIP_UNIT_TEST_CUSTOM="true"
fi
echo "SKIP_UNIT_TEST_CUSTOM=$SKIP_UNIT_TEST_CUSTOM" >> $GITHUB_ENV
- name: Rule 3 - Analyze changed files and set build requirements
run: |
# Get list of changed files
CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }})
echo "Changed files:"
echo "$CHANGED_FILES"
echo ""
# Initialize all requirements to false
REQUIRE_BUILD="false"
REQUIRE_DTEST="false"
REQUIRE_UNITTEST="false"
REQUIRE_ARTIFACTS="false"
REQUIRE_SCYLLA_GDB="false"
# Check each file against patterns
while IFS= read -r file; do
if [[ -n "$file" ]]; then
echo "Checking file: $file"
# Build pattern: ^(?!scripts\/pull_github_pr.sh).*$
# Everything except scripts/pull_github_pr.sh
if [[ "$file" != "scripts/pull_github_pr.sh" ]]; then
REQUIRE_BUILD="true"
echo " ✓ Matches build pattern"
fi
# Dtest pattern: ^(?!test(.py|\/)|dist\/docker\/|dist\/common\/scripts\/).*$
# Everything except test files, dist/docker/, dist/common/scripts/
if [[ ! "$file" =~ ^test\.(py|/).*$ ]] && [[ ! "$file" =~ ^dist/docker/.*$ ]] && [[ ! "$file" =~ ^dist/common/scripts/.*$ ]]; then
REQUIRE_DTEST="true"
echo " ✓ Matches dtest pattern"
fi
# Unittest pattern: ^(?!dist\/docker\/|dist\/common\/scripts).*$
# Everything except dist/docker/, dist/common/scripts/
if [[ ! "$file" =~ ^dist/docker/.*$ ]] && [[ ! "$file" =~ ^dist/common/scripts.*$ ]]; then
REQUIRE_UNITTEST="true"
echo " ✓ Matches unittest pattern"
fi
# Artifacts pattern: ^(?:dist|tools\/toolchain).*$
# Files starting with dist or tools/toolchain
if [[ "$file" =~ ^dist.*$ ]] || [[ "$file" =~ ^tools/toolchain.*$ ]]; then
REQUIRE_ARTIFACTS="true"
echo " ✓ Matches artifacts pattern"
fi
# Scylla GDB pattern: ^(scylla-gdb.py).*$
# Files starting with scylla-gdb.py
if [[ "$file" =~ ^scylla-gdb\.py.*$ ]]; then
REQUIRE_SCYLLA_GDB="true"
echo " ✓ Matches scylla_gdb pattern"
fi
fi
done <<< "$CHANGED_FILES"
# Set environment variables
echo "requireBuild=$REQUIRE_BUILD" >> $GITHUB_ENV
echo "requireDtest=$REQUIRE_DTEST" >> $GITHUB_ENV
echo "requireUnittest=$REQUIRE_UNITTEST" >> $GITHUB_ENV
echo "requireArtifacts=$REQUIRE_ARTIFACTS" >> $GITHUB_ENV
echo "requireScyllaGdb=$REQUIRE_SCYLLA_GDB" >> $GITHUB_ENV
echo ""
echo "✅ Rule 3: File analysis complete"
echo "Build required: $REQUIRE_BUILD"
echo "Dtest required: $REQUIRE_DTEST"
echo "Unittest required: $REQUIRE_UNITTEST"
echo "Artifacts required: $REQUIRE_ARTIFACTS"
echo "Scylla GDB required: $REQUIRE_SCYLLA_GDB"
- name: Determine Jenkins Job Name
run: |
if [[ "${{ github.ref_name }}" == "next" ]]; then
FOLDER_NAME="scylla-master"
elif [[ "${{ github.ref_name }}" == "next-enterprise" ]]; then
FOLDER_NAME="scylla-enterprise"
else
VERSION=$(echo "${{ github.ref_name }}" | awk -F'-' '{print $2}')
if [[ "$VERSION" =~ ^202[0-4]\.[0-9]+$ ]]; then
FOLDER_NAME="enterprise-$VERSION"
elif [[ "$VERSION" =~ ^[0-9]+\.[0-9]+$ ]]; then
FOLDER_NAME="scylla-$VERSION"
fi
fi
echo "JOB_NAME=${FOLDER_NAME}/job/scylla-ci" >> $GITHUB_ENV
- name: Trigger Jenkins Job
if: env.draft_or_conflict == 'false' && env.has_file_changes == 'true' && (github.action == 'opened' || github.action == 'reopened')
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
echo "Triggering Jenkins Job: $JOB_NAME"
curl -X POST \
"$JENKINS_URL/job/$JOB_NAME/buildWithParameters? \
PR_NUMBER=$PR_NUMBER& \
RUN_DTEST=$REQUIRE_DTEST& \
RUN_ONLY_SCYLLA_GDB=$REQUIRE_SCYLLA_GDB& \
RUN_UNIT_TEST=$REQUIRE_UNITTEST& \
FORCE_ON_CLOUD=$HAS_FORCE_ON_CLOUD_LABEL& \
SKIP_UNIT_TEST_CUSTOM=$SKIP_UNIT_TEST_CUSTOM& \
RUN_ARTIFACT_TESTS=$REQUIRE_ARTIFACTS" \
--fail \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" \
-i -v
trigger-ci-via-comment:
if: github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')
runs-on: ubuntu-latest
steps:
- name: Trigger Scylla-CI Jenkins Job
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/$JOB_NAME/buildWithParameters?PR_NUMBER=$PR_NUMBER" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
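The branch-to-folder mapping in the "Determine Jenkins Job Name" step can be mirrored in Python for clarity (the branch names below are examples; the folder-name logic follows the shell step above):

```python
import re

def jenkins_folder(branch: str):
    """Mirror of the shell step's branch-name -> Jenkins-folder mapping."""
    if branch == "next":
        return "scylla-master"
    if branch == "next-enterprise":
        return "scylla-enterprise"
    # The shell uses awk -F'-' '{print $2}': take the second dash-field.
    version = branch.split("-")[1] if "-" in branch else ""
    if re.fullmatch(r"202[0-4]\.[0-9]+", version):
        return f"enterprise-{version}"
    if re.fullmatch(r"[0-9]+\.[0-9]+", version):
        return f"scylla-{version}"
    return None  # no match: JOB_NAME would end up malformed

assert jenkins_folder("next") == "scylla-master"
assert jenkins_folder("branch-2024.2") == "enterprise-2024.2"
assert jenkins_folder("branch-6.1") == "scylla-6.1"
```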


@@ -136,6 +136,7 @@ future<> controller::start_server() {
[this, addr, alternator_port, alternator_https_port, creds = std::move(creds)] (server& server) mutable {
return server.init(addr, alternator_port, alternator_https_port, creds,
_config.alternator_enforce_authorization,
_config.alternator_warn_authorization,
_config.alternator_max_users_query_size_in_trace_output,
&_memory_limiter.local().get_semaphore(),
_config.max_concurrent_requests_per_shard);


@@ -109,6 +109,20 @@ extern const sstring TTL_TAG_KEY("system:ttl_attribute");
// following ones are base table's keys added as needed or range key list will be empty.
static const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY("system:spurious_range_key_added_to_gsi_and_user_didnt_specify_range_key");
// The following tags also have the "system:" prefix but are NOT used
// by Alternator to store table properties - only the user ever writes to
// them, as a way to configure the table. As such, these tags are writable
// (and readable) by the user, and not hidden by tag_key_is_internal().
// The reason why both hidden (internal) and user-configurable tags share the
// same "system:" prefix is historic.
// Setting the tag with a numeric value will enable a specific initial number
// of tablets (setting the value to 0 means enabling tablets with
// an automatic selection of the best number of tablets).
// Setting this tag to any non-numeric value (e.g., an empty string or the
// word "none") will ask to disable tablets.
static constexpr auto INITIAL_TABLETS_TAG_KEY = "system:initial_tablets";
enum class table_status {
active = 0,
@@ -131,7 +145,8 @@ static std::string_view table_status_to_sstring(table_status tbl_status) {
return "UNKNOWN";
}
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type, const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat);
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type,
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode);
static map_type attrs_type() {
static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
@@ -245,7 +260,8 @@ executor::executor(gms::gossiper& gossiper,
_mm(mm),
_sdks(sdks),
_cdc_metadata(cdc_metadata),
_enforce_authorization(_proxy.data_dictionary().get_config().alternator_enforce_authorization()),
_enforce_authorization(_proxy.data_dictionary().get_config().alternator_enforce_authorization),
_warn_authorization(_proxy.data_dictionary().get_config().alternator_warn_authorization),
_ssg(ssg),
_parsed_expression_cache(std::make_unique<parsed::expression_cache>(
parsed::expression_cache::config{_proxy.data_dictionary().get_config().alternator_max_expression_cache_entries_per_shard},
@@ -881,15 +897,37 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
co_return rjson::print(std::move(response));
}
// This function increments the authorization_failures counter, and may also
// log a warn-level message and/or throw an access_denied exception, depending
// on what enforce_authorization and warn_authorization are set to.
// Note that if enforce_authorization is false, this function will return
// without throwing. So a caller that doesn't want to continue after an
// authorization_error must explicitly return after calling this function.
static void authorization_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, std::string msg) {
stats.authorization_failures++;
if (enforce_authorization) {
if (warn_authorization) {
elogger.warn("alternator_warn_authorization=true: {}", msg);
}
throw api_error::access_denied(std::move(msg));
} else {
if (warn_authorization) {
elogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {}", msg);
}
}
}
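The enforce/warn decision matrix above can be summarized with a minimal standalone sketch. The types here (`toy_stats`, `std::runtime_error`) are stand-ins, not the real `alternator::stats` or `api_error::access_denied`; the point is the control flow: the failure counter is always incremented, a warning may be logged whenever warn is set, and an exception is thrown only when enforce is set, so callers keep running otherwise.

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

// Toy stand-in for alternator::stats.
struct toy_stats {
    int authorization_failures = 0;
};

// Sketch of authorization_error(): always count the failure, optionally
// warn, and throw only when enforcement is enabled.
void toy_authorization_error(toy_stats& stats, bool enforce, bool warn,
                             std::string msg) {
    stats.authorization_failures++;
    if (warn) {
        // stand-in for elogger.warn(...) with the appropriate message
    }
    if (enforce) {
        // stand-in for throwing api_error::access_denied
        throw std::runtime_error(std::move(msg));
    }
    // enforce == false: return normally, caller must decide whether to stop
}
```

This is why the comment warns that a caller which must not continue after an authorization failure has to `return` explicitly: in warn-only mode the function comes back without throwing.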
// Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY,
// SELECT, DROP, etc.) on the given table. When permission is denied an
// appropriate user-readable api_error::access_denied is thrown.
future<> verify_permission(
bool enforce_authorization,
bool warn_authorization,
const service::client_state& client_state,
const schema_ptr& schema,
auth::permission permission_to_check) {
if (!enforce_authorization) {
auth::permission permission_to_check,
alternator::stats& stats) {
if (!enforce_authorization && !warn_authorization) {
co_return;
}
// Unfortunately, the fix for issue #23218 did not modify the function
@@ -904,31 +942,33 @@ future<> verify_permission(
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
throw api_error::access_denied(fmt::format(
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"Write access denied on internal table {}.{} to role {} because it is not a superuser",
schema->ks_name(), schema->cf_name(), username));
co_return;
}
}
auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
if (!co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) {
if (!client_state.user() || !client_state.user()->name ||
!co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) {
sstring username = "<anonymous>";
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
// Using exceptions for errors makes this function faster in the
// success path (when the operation is allowed).
throw api_error::access_denied(format(
"{} access on table {}.{} is denied to role {}",
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"{} access on table {}.{} is denied to role {}, client address {}",
auth::permissions::to_string(permission_to_check),
schema->ks_name(), schema->cf_name(), username));
schema->ks_name(), schema->cf_name(), username, client_state.get_client_address()));
}
}
// Similar to verify_permission() above, but just for CREATE operations.
// Those do not operate on any specific table, so require permissions on
// ALL KEYSPACES instead of any specific table.
future<> verify_create_permission(bool enforce_authorization, const service::client_state& client_state) {
if (!enforce_authorization) {
static future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state& client_state, alternator::stats& stats) {
if (!enforce_authorization && !warn_authorization) {
co_return;
}
auto resource = auth::resource(auth::resource_kind::data);
@@ -937,7 +977,7 @@ future<> verify_create_permission(bool enforce_authorization, const service::cli
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
throw api_error::access_denied(format(
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"CREATE access on ALL KEYSPACES is denied to role {}", username));
}
}
@@ -954,7 +994,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
schema_ptr schema = get_table(_proxy, request);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::DROP);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
size_t retries = mm.get_concurrent_ddl_retries();
for (;;) {
@@ -1206,12 +1246,13 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
// Alternator uses tags whose keys start with the "system:" prefix for
// internal purposes. Those should not be readable by ListTagsOfResource,
// nor writable with TagResource or UntagResource (see #24098).
// Only a few specific system tags, currently only system:write_isolation,
// are deliberately intended to be set and read by the user, so are not
// considered "internal".
// Only a few specific system tags, currently only "system:write_isolation"
// and "system:initial_tablets", are deliberately intended to be set and read
// by the user, so are not considered "internal".
static bool tag_key_is_internal(std::string_view tag_key) {
return tag_key.starts_with("system:") &&
tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY;
return tag_key.starts_with("system:")
&& tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY
&& tag_key != INITIAL_TABLETS_TAG_KEY;
}
enum class update_tags_action { add_tags, delete_tags };
@@ -1292,7 +1333,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
if (tags->Size() < 1) {
co_return api_error::validation("The number of tags must be at least 1") ;
}
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::ALTER);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
});
@@ -1313,7 +1354,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
get_stats_from_schema(_proxy, *schema)->api_operations.untag_resource++;
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::ALTER);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
});
@@ -1516,7 +1557,8 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
}
}
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization) {
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
SCYLLA_ASSERT(this_shard_id() == 0);
// We begin by parsing and validating the content of the CreateTable
@@ -1722,7 +1764,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
set_table_creation_time(tags_map, db_clock::now());
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
co_await verify_create_permission(enforce_authorization, client_state);
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);
schema_ptr schema = builder.build();
for (auto& view_builder : view_builders) {
@@ -1743,7 +1785,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features());
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
if (stream_specification && stream_specification->IsObject()) {
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
@@ -1753,7 +1795,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
if (rs->uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to use streams, create a table with vnodes by setting the tag 'experimental:initial_tablets' set to 'none'.");
"If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
}
}
}
@@ -1823,9 +1865,10 @@ future<executor::request_return_type> executor::create_table(client_state& clien
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization)]
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization);
const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, _stats, std::move(tablets_mode));
});
}
@@ -1878,7 +1921,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
verify_billing_mode(request);
}
co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state)), enforce_authorization = bool(_enforce_authorization), client_state_other_shard = client_state.move_to_other_shard(), empty_request]
co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state)), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization), client_state_other_shard = client_state.move_to_other_shard(), empty_request, &e = this->container()]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
schema_ptr schema;
size_t retries = mm.get_concurrent_ddl_retries();
@@ -1909,7 +1952,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
if (stream_enabled->GetBool()) {
if (p.local().local_db().find_keyspace(tab->ks_name()).get_replication_strategy().uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to enable streams, re-create this table with vnodes (with the tag 'experimental:initial_tablets' set to 'none').");
"If you want to enable streams, re-create this table with vnodes (with the tag 'system:initial_tablets' set to 'none').");
}
if (tab->cdc_options().enabled()) {
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
@@ -2049,7 +2092,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
co_return api_error::validation("UpdateTable requires one of GlobalSecondaryIndexUpdates, StreamSpecification or BillingMode to be specified");
}
co_await verify_permission(enforce_authorization, client_state_other_shard.get(), schema, auth::permission::ALTER);
co_await verify_permission(enforce_authorization, warn_authorization, client_state_other_shard.get(), schema, auth::permission::ALTER, e.local()._stats);
auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, std::vector<view_ptr>(), group0_guard.write_timestamp());
for (view_ptr view : new_views) {
auto m2 = co_await service::prepare_new_view_announcement(p.local(), view, group0_guard.write_timestamp());
@@ -2685,7 +2728,6 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
if (!cas_shard) {
on_internal_error(elogger, "cas_shard is not set");
}
// If we're still here, we need to do this write using LWT:
global_stats.write_using_lwt++;
per_table_stats.write_using_lwt++;
@@ -2817,7 +2859,7 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
const bool needs_read_before_write = op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
@@ -2921,7 +2963,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client
tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
@@ -3199,7 +3241,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
per_table_wcu.emplace_back(std::make_pair(per_table_stats, schema));
}
for (const auto& b : mutation_builders) {
co_await verify_permission(_enforce_authorization, client_state, b.first, auth::permission::MODIFY);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, b.first, auth::permission::MODIFY, _stats);
}
// If alternator_force_read_before_write is true we will first get the previous item size
// and only then send the mutation.
@@ -4425,7 +4467,7 @@ future<executor::request_return_type> executor::update_item(client_state& client
tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, client_state, op->schema(), auth::permission::MODIFY);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
@@ -4536,7 +4578,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM);
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::SELECT);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats);
service::storage_proxy::coordinator_query_result qr =
co_await _proxy.query(
schema, std::move(command), std::move(partition_ranges), cl,
@@ -4668,7 +4710,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
}
for (const table_requests& tr : requests) {
co_await verify_permission(_enforce_authorization, client_state, tr.schema, auth::permission::SELECT);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats);
}
_stats.api_operations.batch_get_item_batch_total += batch_size;
@@ -5128,10 +5170,11 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
filter filter,
query::partition_slice::option_set custom_opts,
service::client_state& client_state,
cql3::cql_stats& cql_stats,
alternator::stats& stats,
tracing::trace_state_ptr trace_state,
service_permit permit,
bool enforce_authorization) {
bool enforce_authorization,
bool warn_authorization) {
lw_shared_ptr<service::pager::paging_state> old_paging_state = nullptr;
tracing::trace(trace_state, "Performing a database query");
@@ -5158,7 +5201,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
old_paging_state = make_lw_shared<service::pager::paging_state>(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0);
}
co_await verify_permission(enforce_authorization, client_state, table_schema, auth::permission::SELECT);
co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats);
auto regular_columns =
table_schema->regular_columns() | std::views::transform(&column_definition::id)
@@ -5194,9 +5237,9 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
}
if (has_filter) {
cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
// update our "filtered_rows_matched_total" for all the rows matched, despite the filter
cql_stats.filtered_rows_matched_total += size;
stats.cql_stats.filtered_rows_matched_total += size;
}
if (opt_items) {
if (opt_items->size() >= max_items_for_rapidjson_array) {
@@ -5320,7 +5363,7 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit), _enforce_authorization);
std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization);
}
static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
@@ -5801,7 +5844,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
query::partition_slice::option_set opts;
opts.set_if<query::partition_slice::option::reversed>(!forward);
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filter), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit), _enforce_authorization);
std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization);
}
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
@@ -5929,22 +5972,20 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
// of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
// A smaller cluster (presumably, a test only), gets RF=1. The user may
// manually create the keyspace to override this predefined behavior.
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts, const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat) {
// Even if the "tablets" experimental feature is available, we currently
// do not enable tablets by default on Alternator tables because LWT is
// not yet fully supported with tablets.
// The user can override the choice of whether or not to use tablets at
// table-creation time by supplying the following tag with a numeric value
// (setting the value to 0 means enabling tablets with automatic selection
// of the best number of tablets).
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts,
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode) {
// Whether to use tablets for the table (actually for the keyspace of the
// table) is determined by tablets_mode (taken from the configuration
// option "tablets_mode_for_new_keyspaces"), as well as the presence and
// the value of a per-table tag system:initial_tablets
// (INITIAL_TABLETS_TAG_KEY).
// Setting the tag with a numeric value will enable a specific initial number
// of tablets (setting the value to 0 means enabling tablets with
// an automatic selection of the best number of tablets).
// Setting this tag to any non-numeric value (e.g., an empty string or the
// word "none") will ask to disable tablets.
// If we make this tag a permanent feature, it will get a "system:" prefix -
// until then we give it the "experimental:" prefix to not commit to it.
static constexpr auto INITIAL_TABLETS_TAG_KEY = "experimental:initial_tablets";
// initial_tablets currently defaults to unset, so tablets will not be
// used by default on new Alternator tables. Change this initialization
// to 0 to enable tablets by default, with an automatic number of tablets.
// When vnodes are asked for by the tag value, but tablets are enforced by config,
// throw an exception to the client.
std::optional<unsigned> initial_tablets;
if (feat.tablets) {
auto it = tags_map.find(INITIAL_TABLETS_TAG_KEY);
@@ -5955,7 +5996,20 @@ static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_vie
try {
initial_tablets = std::stol(tags_map.at(INITIAL_TABLETS_TAG_KEY));
} catch (...) {
if (tablets_mode == db::tablets_mode_t::mode::enforced) {
throw api_error::validation(format("Tag {} contains a non-numerical value requesting vnodes, but vnodes are forbidden by the configuration option `tablets_mode_for_new_keyspaces: enforced`", INITIAL_TABLETS_TAG_KEY));
}
initial_tablets = std::nullopt;
elogger.trace("Following the non-numerical value of the {} tag, Alternator will attempt to create a keyspace {} with vnodes.", INITIAL_TABLETS_TAG_KEY, keyspace_name);
}
} else {
// No per-table tag present, use the value from config
if (tablets_mode == db::tablets_mode_t::mode::enabled || tablets_mode == db::tablets_mode_t::mode::enforced) {
initial_tablets = 0;
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with tablets.", keyspace_name);
} else {
initial_tablets = std::nullopt;
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with vnodes.", keyspace_name);
}
}
}
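The precedence just described can be sketched as a standalone helper. The names below (`tablets_mode`, `resolve_initial_tablets`) are hypothetical; the real code also gates on `feat.tablets` and builds keyspace metadata rather than returning a bare count. The order is: a numeric `system:initial_tablets` tag wins; a non-numeric tag requests vnodes (rejected when the configured mode is `enforced`); with no tag, the configured mode decides.

```cpp
#include <cassert>
#include <map>
#include <optional>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for db::tablets_mode_t::mode.
enum class tablets_mode { disabled, enabled, enforced };

// nullopt => vnodes; a value => tablets (0 means automatic tablet count).
std::optional<unsigned> resolve_initial_tablets(
        const std::map<std::string, std::string>& tags, tablets_mode mode) {
    auto it = tags.find("system:initial_tablets");
    if (it != tags.end()) {
        try {
            return static_cast<unsigned>(std::stol(it->second));
        } catch (...) {
            // Non-numeric tag value asks for vnodes, which the "enforced"
            // mode forbids.
            if (mode == tablets_mode::enforced) {
                throw std::invalid_argument(
                    "vnodes requested by tag but forbidden by configuration");
            }
            return std::nullopt;
        }
    }
    // No per-table tag: follow the configured mode.
    if (mode == tablets_mode::enabled || mode == tablets_mode::enforced) {
        return 0; // tablets with an automatic number of tablets
    }
    return std::nullopt; // vnodes
}
```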


@@ -139,6 +139,7 @@ class executor : public peering_sharded_service<executor> {
db::system_distributed_keyspace& _sdks;
cdc::metadata& _cdc_metadata;
utils::updateable_value<bool> _enforce_authorization;
utils::updateable_value<bool> _warn_authorization;
// An smp_service_group to be used for limiting the concurrency when
// forwarding Alternator request between shards - if necessary for LWT.
smp_service_group _ssg;
@@ -264,7 +265,7 @@ bool is_big(const rjson::value& val, int big_size = 100'000);
// Check CQL's Role-Based Access Control (RBAC) permission (MODIFY,
// SELECT, DROP, etc.) on the given table. When permission is denied an
// appropriate user-readable api_error::access_denied is thrown.
future<> verify_permission(bool enforce_authorization, const service::client_state&, const schema_ptr&, auth::permission);
future<> verify_permission(bool enforce_authorization, bool warn_authorization, const service::client_state&, const schema_ptr&, auth::permission, alternator::stats& stats);
/**
* Make return type for serializing the object "streamed",


@@ -31,6 +31,7 @@
#include "utils/overloaded_functor.hh"
#include "utils/aws_sigv4.hh"
#include "client_data.hh"
#include "utils/updateable_value.hh"
static logging::logger slogger("alternator-server");
@@ -270,24 +271,57 @@ protected:
}
};
// This function increments the authentication_failures counter, and may also
// log a warn-level message and/or throw an exception, depending on what
// enforce_authorization and warn_authorization are set to.
// The username and client address are only used for logging purposes -
// they are not included in the error message returned to the client, since
// the client knows who it is.
// Note that if enforce_authorization is false, this function will return
// without throwing. So a caller that doesn't want to continue after an
// authentication_error must explicitly return after calling this function.
template<typename Exception>
static void authentication_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, Exception&& e, std::string_view user, gms::inet_address client_address) {
stats.authentication_failures++;
if (enforce_authorization) {
if (warn_authorization) {
slogger.warn("alternator_warn_authorization=true: {} for user {}, client address {}", e.what(), user, client_address);
}
throw std::move(e);
} else {
if (warn_authorization) {
slogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {} for user {}, client address {}", e.what(), user, client_address);
}
}
}
future<std::string> server::verify_signature(const request& req, const chunked_content& content) {
if (!_enforce_authorization) {
if (!_enforce_authorization.get() && !_warn_authorization.get()) {
slogger.debug("Skipping authorization");
return make_ready_future<std::string>();
}
auto host_it = req._headers.find("Host");
if (host_it == req._headers.end()) {
throw api_error::invalid_signature("Host header is mandatory for signature verification");
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::invalid_signature("Host header is mandatory for signature verification"),
"", req.get_client_address());
return make_ready_future<std::string>();
}
auto authorization_it = req._headers.find("Authorization");
if (authorization_it == req._headers.end()) {
throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::missing_authentication_token("Authorization header is mandatory for signature verification"),
"", req.get_client_address());
return make_ready_future<std::string>();
}
std::string host = host_it->second;
std::string_view authorization_header = authorization_it->second;
auto pos = authorization_header.find_first_of(' ');
if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
throw api_error::invalid_signature(fmt::format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::invalid_signature(fmt::format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header)),
"", req.get_client_address());
return make_ready_future<std::string>();
}
authorization_header.remove_prefix(pos+1);
std::string credential;
@@ -322,7 +356,9 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
std::vector<std::string_view> credential_split = split(credential, '/');
if (credential_split.size() != 5) {
throw api_error::validation(fmt::format("Incorrect credential information format: {}", credential));
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::validation(fmt::format("Incorrect credential information format: {}", credential)), "", req.get_client_address());
return make_ready_future<std::string>();
}
std::string user(credential_split[0]);
std::string datestamp(credential_split[1]);
@@ -346,7 +382,7 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
auto cache_getter = [&proxy = _proxy, &as = _auth_service] (std::string username) {
return get_key_from_roles(proxy, as, std::move(username));
};
return _key_cache.get_ptr(user, cache_getter).then([this, &req, &content,
return _key_cache.get_ptr(user, cache_getter).then_wrapped([this, &req, &content,
user = std::move(user),
host = std::move(host),
datestamp = std::move(datestamp),
@@ -354,18 +390,32 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
signed_headers_map = std::move(signed_headers_map),
region = std::move(region),
service = std::move(service),
user_signature = std::move(user_signature)] (key_cache::value_ptr key_ptr) {
user_signature = std::move(user_signature)] (future<key_cache::value_ptr> key_ptr_fut) {
key_cache::value_ptr key_ptr(nullptr);
try {
key_ptr = key_ptr_fut.get();
} catch (const api_error& e) {
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
e, user, req.get_client_address());
return std::string();
}
std::string signature;
try {
signature = utils::aws::get_signature(user, *key_ptr, std::string_view(host), "/", req._method,
datestamp, signed_headers_str, signed_headers_map, &content, region, service, "");
} catch (const std::exception& e) {
throw api_error::invalid_signature(e.what());
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::invalid_signature(fmt::format("invalid signature: {}", e.what())),
user, req.get_client_address());
return std::string();
}
if (signature != std::string_view(user_signature)) {
_key_cache.remove(user);
throw api_error::unrecognized_client("The security token included in the request is invalid.");
authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
api_error::unrecognized_client("wrong signature"),
user, req.get_client_address());
return std::string();
}
return user;
});
@@ -618,7 +668,6 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
, _auth_service(auth_service)
, _sl_controller(sl_controller)
, _key_cache(1024, 1min, slogger)
, _enforce_authorization(false)
, _max_users_query_size_in_trace_output(1024)
, _enabled_servers{}
, _pending_requests("alternator::server::pending_requests")
@@ -700,10 +749,11 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
}
future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
_memory_limiter = memory_limiter;
_enforce_authorization = std::move(enforce_authorization);
_warn_authorization = std::move(warn_authorization);
_max_concurrent_requests = std::move(max_concurrent_requests);
_max_users_query_size_in_trace_output = std::move(max_users_query_size_in_trace_output);
if (!port && !https_port) {


@@ -47,6 +47,7 @@ class server : public peering_sharded_service<server> {
key_cache _key_cache;
utils::updateable_value<bool> _enforce_authorization;
utils::updateable_value<bool> _warn_authorization;
utils::updateable_value<uint64_t> _max_users_query_size_in_trace_output;
utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
named_gate _pending_requests;
@@ -99,7 +100,7 @@ public:
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
future<> stop();
// get_client_data() is called (on each shard separately) when the virtual


@@ -188,6 +188,16 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
seastar::metrics::make_total_operations("expression_cache_misses", stats.expression_cache.requests[stats::expression_types::PROJECTION_EXPRESSION].misses,
seastar::metrics::description("Counts number of misses of cached expressions"), labels)(expression_label("ProjectionExpression")).aggregate(aggregate_labels).set_skip_when_empty()
});
// Only register the following metrics for the global metrics, not per-table
if (!has_table) {
metrics.add_group("alternator", {
seastar::metrics::make_counter("authentication_failures", stats.authentication_failures,
seastar::metrics::description("total number of authentication failures"), labels).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_counter("authorization_failures", stats.authorization_failures,
seastar::metrics::description("total number of authorization failures"), labels).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
});
}
}
void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats) {


@@ -105,6 +105,17 @@ public:
// The sizes are the written items' sizes, grouped per table.
utils::estimated_histogram batch_write_item_op_size_kb{30};
} operation_sizes;
// Count of authentication and authorization failures, counted if either
// alternator_enforce_authorization or alternator_warn_authorization is
// set to true. If both are false, no authentication or authorization
// checks are performed, so failures are not recognized or counted.
// An "authentication" failure means the request was not signed with a valid
// user and key combination. An "authorization" failure means the request was
// authenticated as a valid user - but this user did not have permission
// to perform the operation (considering RBAC settings and the user's
// superuser status).
uint64_t authentication_failures = 0;
uint64_t authorization_failures = 0;
// Miscellaneous event counters
uint64_t total_operations = 0;
uint64_t unsupported_operations = 0;


@@ -827,7 +827,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::SELECT);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats);
db::consistency_level cl = db::consistency_level::LOCAL_QUORUM;
partition_key pk = iter.shard.id.to_partition_key(*schema);
@@ -1073,9 +1073,7 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
}
if (stream_enabled->GetBool()) {
auto db = sp.data_dictionary();
if (!db.features().alternator_streams) {
if (!sp.features().alternator_streams) {
throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
}


@@ -68,7 +68,7 @@ extern const sstring TTL_TAG_KEY;
future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.update_time_to_live++;
if (!_proxy.data_dictionary().features().alternator_ttl) {
if (!_proxy.features().alternator_ttl) {
co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
}
@@ -95,7 +95,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
}
sstring attribute_name(v->GetString(), v->GetStringLength());
co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::ALTER);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
if (enabled) {
if (tags_map.contains(TTL_TAG_KEY)) {
@@ -753,7 +753,7 @@ static future<bool> scan_table(
auto my_host_id = erm->get_topology().my_host_id();
const auto &tablet_map = erm->get_token_metadata().tablets().get_tablet_map(s->id());
for (std::optional tablet = tablet_map.first_tablet(); tablet; tablet = tablet_map.next_tablet(*tablet)) {
auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet);
auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet, erm->get_topology());
// check if this is the primary replica for the current tablet
if (tablet_primary_replica.host == my_host_id && tablet_primary_replica.shard == this_shard_id()) {
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);


@@ -220,6 +220,25 @@
}
]
},
{
"path":"/storage_service/nodes/excluded",
"operations":[
{
"method":"GET",
"summary":"Retrieve host ids of nodes marked as excluded",
"type":"array",
"items":{
"type":"string"
},
"nickname":"get_excluded_nodes",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/storage_service/nodes/joining",
"operations":[
@@ -942,6 +961,14 @@
"type":"string",
"paramType":"query",
"enum": ["all", "dc", "rack", "node"]
},
{
"name":"primary_replica_only",
"description":"Load the sstables and stream to the primary replica node within the scope, if one is specified. If not, stream to the global primary replica.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
@@ -1028,7 +1055,7 @@
]
},
{
"path":"/storage_service/cleanup_all",
"path":"/storage_service/cleanup_all/",
"operations":[
{
"method":"POST",
@@ -1038,6 +1065,30 @@
"produces":[
"application/json"
],
"parameters":[
{
"name":"global",
"description":"Set to \"true\" (the default) to clean up the entire cluster, or \"false\" to clean up only this node",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/mark_node_as_clean",
"operations":[
{
"method":"POST",
"summary":"Mark the node as clean. After that, the node will not be considered as needing cleanup during the automatic cleanup triggered by some topology operations",
"type":"void",
"nickname":"reset_cleanup_needed",
"produces":[
"application/json"
],
"parameters":[]
}
]
@@ -1571,6 +1622,30 @@
}
]
},
{
"path":"/storage_service/exclude_node",
"operations":[
{
"method":"POST",
"summary":"Mark the given nodes as permanently down (excluded).",
"type":"void",
"nickname":"exclude_node",
"produces":[
"application/json"
],
"parameters":[
{
"name":"hosts",
"description":"Comma-separated list of host ids to exclude",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/removal_status",
"operations":[


@@ -42,6 +42,14 @@
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
},
{
"name":"consider_only_existing_data",
"description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}


@@ -20,6 +20,7 @@
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
#include <functional>
@@ -504,6 +505,7 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto bucket = req->get_query_param("bucket");
auto prefix = req->get_query_param("prefix");
auto scope = parse_stream_scope(req->get_query_param("scope"));
auto primary_replica_only = validate_bool_x(req->get_query_param("primary_replica_only"), false);
rjson::chunked_content content = co_await util::read_entire_stream(*req->content_stream);
rjson::value parsed = rjson::parse(std::move(content));
@@ -513,7 +515,7 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto sstables = parsed.GetArray() |
std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
std::ranges::to<std::vector>();
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope);
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
co_return json::json_return_type(fmt::to_string(task_id));
});
@@ -763,8 +765,14 @@ rest_cdc_streams_check_and_repair(sharded<service::storage_service>& ss, std::un
static
future<json::json_return_type>
rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("cleanup_all");
auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
bool global = true;
if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
global = validate_bool(global_param);
}
apilog.info("cleanup_all global={}", global);
auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
if (!ss.is_topology_coordinator_enabled()) {
co_return false;
}
@@ -774,14 +782,35 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
if (done) {
co_return json::json_return_type(0);
}
// fall back to the local global cleanup if topology coordinator is not enabled
// fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
auto& db = ctx.db;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
co_await task->done();
// Mark this node as clean
co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
if (ss.is_topology_coordinator_enabled()) {
co_await ss.reset_cleanup_needed();
}
});
co_return json::json_return_type(0);
}
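The new `global` query parameter above defaults to true and selects the local fallback path when set to "false". The parsing step can be sketched on its own, with validate_bool simplified from the real helper:

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

// Simplified stand-in for the real validate_bool helper.
bool validate_bool(const std::string& v) {
    if (v == "true") return true;
    if (v == "false") return false;
    throw std::invalid_argument("Invalid boolean parameter: " + v);
}

// Mirrors rest_cleanup_all(): an absent "global" parameter defaults to a
// cluster-wide cleanup; "false" requests a local-only cleanup.
bool parse_global_param(const std::string& global_param) {
    bool global = true;  // default: cluster-wide cleanup
    if (!global_param.empty()) {
        global = validate_bool(global_param);
    }
    return global;
}
```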
static
future<json::json_return_type>
rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("reset_cleanup_needed");
co_await ss.invoke_on(0, [] (service::storage_service& ss) {
if (!ss.is_topology_coordinator_enabled()) {
throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
}
return ss.reset_cleanup_needed();
});
co_return json_void();
}
static
future<json::json_return_type>
rest_force_flush(http_context& ctx, std::unique_ptr<http::request> req) {
@@ -844,6 +873,25 @@ rest_remove_node(sharded<service::storage_service>& ss, std::unique_ptr<http::re
});
}
static
future<json::json_return_type>
rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto hosts = utils::split_comma_separated_list(req->get_query_param("hosts"))
| std::views::transform([] (const sstring& s) { return locator::host_id(utils::UUID(s)); })
| std::ranges::to<std::vector<locator::host_id>>();
auto& topo = ss.local().get_token_metadata().get_topology();
for (auto host : hosts) {
if (!topo.has_node(host)) {
throw bad_param_exception(fmt::format("Host ID {} does not belong to this cluster", host));
}
}
apilog.info("exclude_node: hosts={}", hosts);
co_await ss.local().mark_excluded(hosts);
co_return json_void();
}
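rest_exclude_node() splits the comma-separated `hosts` parameter and rejects any id not known to the topology before marking anything excluded. A self-contained sketch of the same validation, with a plain string set standing in for the topology lookup (the real code parses host_id UUIDs):

```cpp
#include <cassert>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Split a comma-separated list into tokens, dropping empty entries.
std::vector<std::string> parse_hosts(const std::string& csv) {
    std::vector<std::string> hosts;
    std::istringstream in(csv);
    std::string item;
    while (std::getline(in, item, ',')) {
        if (!item.empty()) hosts.push_back(item);
    }
    return hosts;
}

// Reject any host id not present in the cluster, mirroring the
// topo.has_node() check in rest_exclude_node().
void check_hosts_known(const std::vector<std::string>& hosts, const std::set<std::string>& known) {
    for (const auto& h : hosts) {
        if (!known.count(h)) {
            throw std::invalid_argument("Host ID " + h + " does not belong to this cluster");
        }
    }
}
```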
static
future<json::json_return_type>
rest_get_removal_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1764,11 +1812,13 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss));
ss::move.set(r, rest_bind(rest_move, ss));
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
@@ -1841,11 +1891,13 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_natural_endpoints.unset(r);
ss::cdc_streams_check_and_repair.unset(r);
ss::cleanup_all.unset(r);
ss::reset_cleanup_needed.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
ss::exclude_node.unset(r);
ss::get_removal_status.unset(r);
ss::force_remove_completion.unset(r);
ss::set_logging_level.unset(r);


@@ -38,76 +38,78 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
};
}
static future<shared_ptr<compaction::major_keyspace_compaction_task_impl>> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = compaction::flush_mode::skip;
}
return compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
}
static future<shared_ptr<compaction::upgrade_sstables_compaction_task_impl>> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
return compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
}
static future<shared_ptr<compaction::cleanup_keyspace_compaction_task_impl>> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.is_local() || !rs.is_vnode_based()) {
auto reason = rs.is_local() ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return nullptr;
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_vnodes_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
co_return co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
}
void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush) {
fmopt = compaction::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
auto task = co_await force_keyspace_compaction(ctx, std::move(req));
co_return json::json_return_type(task->get_status().id.to_sstring());
});
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = compaction::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
auto task = co_await force_keyspace_compaction(ctx, std::move(req));
co_await task->done();
co_return json_void();
});
t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_vnodes_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
tasks::task_id id = tasks::task_id::create_null_id();
auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
if (task) {
id = task->get_status().id;
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
co_return json::json_return_type(task->get_status().id.to_sstring());
co_return json::json_return_type(id.to_sstring());
});
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.is_local() || !rs.is_vnode_based()) {
auto reason = rs.is_local() ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return json::json_return_type(0);
auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
if (task) {
co_await task->done();
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_vnodes_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
co_await task->done();
co_return json::json_return_type(0);
});
@@ -129,25 +131,12 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
}));
t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
co_return json::json_return_type(task->get_status().id.to_sstring());
}));
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
co_await task->done();
co_return json::json_return_type(0);
}));


@@ -62,6 +62,17 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
return addr | std::ranges::to<std::vector>();
});
ss::get_excluded_nodes.set(r, [&tm](const_req req) {
const auto& local_tm = *tm.local().get();
std::vector<sstring> eps;
local_tm.get_topology().for_each_node([&] (auto& node) {
if (node.is_excluded()) {
eps.push_back(node.host_id().to_sstring());
}
});
return eps;
});
ss::get_joining_nodes.set(r, [&tm, &g](const_req req) {
const auto& local_tm = *tm.local().get();
const auto& points = local_tm.get_bootstrap_tokens();
@@ -130,6 +141,7 @@ void unset_token_metadata(http_context& ctx, routes& r) {
ss::get_leaving_nodes.unset(r);
ss::get_moving_nodes.unset(r);
ss::get_joining_nodes.unset(r);
ss::get_excluded_nodes.unset(r);
ss::get_host_id_map.unset(r);
httpd::endpoint_snitch_info_json::get_datacenter.unset(r);
httpd::endpoint_snitch_info_json::get_rack.unset(r);


@@ -5,6 +5,7 @@ target_sources(scylla_audit
PRIVATE
audit.cc
audit_cf_storage_helper.cc
audit_composite_storage_helper.cc
audit_syslog_storage_helper.cc)
target_include_directories(scylla_audit
PUBLIC


@@ -13,9 +13,11 @@
#include "cql3/statements/batch_statement.hh"
#include "cql3/statements/modification_statement.hh"
#include "storage_helper.hh"
#include "audit_cf_storage_helper.hh"
#include "audit_syslog_storage_helper.hh"
#include "audit_composite_storage_helper.hh"
#include "audit.hh"
#include "../db/config.hh"
#include "utils/class_registrator.hh"
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
@@ -26,6 +28,47 @@ namespace audit {
logging::logger logger("audit");
static std::set<sstring> parse_audit_modes(const sstring& data) {
std::set<sstring> result;
if (!data.empty()) {
std::vector<sstring> audit_modes;
boost::split(audit_modes, data, boost::is_any_of(","));
if (audit_modes.empty()) {
return {};
}
for (sstring& audit_mode : audit_modes) {
boost::trim(audit_mode);
if (audit_mode == "none") {
return {};
}
if (audit_mode != "table" && audit_mode != "syslog") {
throw audit_exception(fmt::format("Bad configuration: invalid 'audit': {}", audit_mode));
}
result.insert(std::move(audit_mode));
}
}
return result;
}
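parse_audit_modes() above accepts a comma-separated list, treats "none" anywhere as a kill switch, and rejects anything other than "table" or "syslog". The same logic can be sketched without boost, as a rough equivalent (one simplification: a trailing empty token is silently dropped rather than rejected):

```cpp
#include <cassert>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>

// Standalone sketch of parse_audit_modes(), using std::getline in place of
// boost::split/boost::trim. "none" disables auditing entirely (empty set),
// unknown modes are rejected, duplicates collapse via the std::set.
std::set<std::string> parse_audit_modes_sketch(const std::string& data) {
    std::set<std::string> result;
    std::istringstream in(data);
    std::string mode;
    while (std::getline(in, mode, ',')) {
        // trim leading/trailing whitespace
        auto b = mode.find_first_not_of(" \t");
        auto e = mode.find_last_not_of(" \t");
        mode = (b == std::string::npos) ? "" : mode.substr(b, e - b + 1);
        if (mode == "none") return {};
        if (mode != "table" && mode != "syslog") {
            throw std::invalid_argument("Bad configuration: invalid 'audit': " + mode);
        }
        result.insert(mode);
    }
    return result;
}
```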
static std::unique_ptr<storage_helper> create_storage_helper(const std::set<sstring>& audit_modes, cql3::query_processor& qp, service::migration_manager& mm) {
SCYLLA_ASSERT(!audit_modes.empty() && !audit_modes.contains("none"));
std::vector<std::unique_ptr<storage_helper>> helpers;
for (const sstring& audit_mode : audit_modes) {
if (audit_mode == "table") {
helpers.emplace_back(std::make_unique<audit_cf_storage_helper>(qp, mm));
} else if (audit_mode == "syslog") {
helpers.emplace_back(std::make_unique<audit_syslog_storage_helper>(qp, mm));
}
}
SCYLLA_ASSERT(!helpers.empty());
if (helpers.size() == 1) {
return std::move(helpers.front());
}
return std::make_unique<audit_composite_storage_helper>(std::move(helpers));
}
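create_storage_helper() returns a single helper directly and only wraps multiple backends in the new audit_composite_storage_helper. The composite pattern it relies on can be sketched with a hypothetical `sink` interface in place of storage_helper:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the storage_helper interface.
struct sink {
    virtual ~sink() = default;
    virtual void write(const std::string& record) = 0;
};

struct counting_sink : sink {
    int records = 0;
    void write(const std::string&) override { ++records; }
};

// Composite backend: fans every write out to all children, like
// audit_composite_storage_helper does for its helpers.
struct composite_sink : sink {
    std::vector<std::unique_ptr<sink>> children;
    explicit composite_sink(std::vector<std::unique_ptr<sink>> c) : children(std::move(c)) {}
    void write(const std::string& record) override {
        for (auto& child : children) child->write(record);
    }
};

// Mirrors create_storage_helper(): a single backend needs no wrapper.
std::unique_ptr<sink> make_sink(std::vector<std::unique_ptr<sink>> helpers) {
    if (helpers.size() == 1) return std::move(helpers.front());
    return std::make_unique<composite_sink>(std::move(helpers));
}
```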
static sstring category_to_string(statement_category category)
{
switch (category) {
@@ -103,7 +146,9 @@ static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
}
audit::audit(locator::shared_token_metadata& token_metadata,
sstring&& storage_helper_name,
cql3::query_processor& qp,
service::migration_manager& mm,
std::set<sstring>&& audit_modes,
std::set<sstring>&& audited_keyspaces,
std::map<sstring, std::set<sstring>>&& audited_tables,
category_set&& audited_categories,
@@ -112,28 +157,21 @@ audit::audit(locator::shared_token_metadata& token_metadata,
, _audited_keyspaces(std::move(audited_keyspaces))
, _audited_tables(std::move(audited_tables))
, _audited_categories(std::move(audited_categories))
, _storage_helper_class_name(std::move(storage_helper_name))
, _cfg(cfg)
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
, _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
{ }
{
_storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
}
audit::~audit() = default;
future<> audit::create_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm) {
sstring storage_helper_name;
if (cfg.audit() == "table") {
storage_helper_name = "audit_cf_storage_helper";
} else if (cfg.audit() == "syslog") {
storage_helper_name = "audit_syslog_storage_helper";
} else if (cfg.audit() == "none") {
// Audit is off
future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm) {
std::set<sstring> audit_modes = parse_audit_modes(cfg.audit());
if (audit_modes.empty()) {
logger.info("Audit is disabled");
return make_ready_future<>();
} else {
throw audit_exception(fmt::format("Bad configuration: invalid 'audit': {}", cfg.audit()));
}
category_set audited_categories = parse_audit_categories(cfg.audit_categories());
std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
@@ -143,19 +181,20 @@ future<> audit::create_audit(const db::config& cfg, sharded<locator::shared_toke
cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());
return audit_instance().start(std::ref(stm),
std::move(storage_helper_name),
std::ref(qp),
std::ref(mm),
std::move(audit_modes),
std::move(audited_keyspaces),
std::move(audited_tables),
std::move(audited_categories),
std::cref(cfg));
}
future<> audit::start_audit(const db::config& cfg, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm) {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit_instance().invoke_on_all([&cfg, &qp, &mm] (audit& local_audit) {
return local_audit.start(cfg, qp.local(), mm.local());
std::cref(cfg))
.then([&cfg] {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
return local_audit.start(cfg);
});
});
}
@@ -181,15 +220,7 @@ audit_info_ptr audit::create_no_audit_info() {
return audit_info_ptr();
}
future<> audit::start(const db::config& cfg, cql3::query_processor& qp, service::migration_manager& mm) {
try {
_storage_helper_ptr = create_object<storage_helper>(_storage_helper_class_name, qp, mm);
} catch (no_such_class& e) {
logger.error("Can't create audit storage helper {}: not supported", _storage_helper_class_name);
throw;
} catch (...) {
throw;
}
future<> audit::start(const db::config& cfg) {
return _storage_helper_ptr->start(cfg);
}


@@ -102,7 +102,6 @@ class audit final : public seastar::async_sharded_service<audit> {
std::map<sstring, std::set<sstring>> _audited_tables;
category_set _audited_categories;
sstring _storage_helper_class_name;
std::unique_ptr<storage_helper> _storage_helper_ptr;
const db::config& _cfg;
@@ -125,18 +124,20 @@ public:
static audit& local_audit_instance() {
return audit_instance().local();
}
static future<> create_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm);
static future<> start_audit(const db::config& cfg, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
static future<> stop_audit();
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
static audit_info_ptr create_no_audit_info();
audit(locator::shared_token_metadata& stm, sstring&& storage_helper_name,
audit(locator::shared_token_metadata& stm,
cql3::query_processor& qp,
service::migration_manager& mm,
std::set<sstring>&& audit_modes,
std::set<sstring>&& audited_keyspaces,
std::map<sstring, std::set<sstring>>&& audited_tables,
category_set&& audited_categories,
const db::config& cfg);
~audit();
future<> start(const db::config& cfg, cql3::query_processor& qp, service::migration_manager& mm);
future<> start(const db::config& cfg);
future<> stop();
future<> shutdown();
bool should_log(const audit_info* audit_info) const;

View File

@@ -11,7 +11,6 @@
#include "cql3/query_processor.hh"
#include "data_dictionary/keyspace_metadata.hh"
#include "utils/UUID_gen.hh"
#include "utils/class_registrator.hh"
#include "cql3/query_options.hh"
#include "cql3/statements/ks_prop_defs.hh"
#include "service/migration_manager.hh"
@@ -198,7 +197,4 @@ cql3::query_options audit_cf_storage_helper::make_login_data(socket_address node
return cql3::query_options(cql3::default_cql_config, db::consistency_level::ONE, std::nullopt, std::move(values), false, cql3::query_options::specific_options::DEFAULT);
}
using registry = class_registrator<storage_helper, audit_cf_storage_helper, cql3::query_processor&, service::migration_manager&>;
static registry registrator1("audit_cf_storage_helper");
}

View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2025 ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/core/loop.hh>
#include <seastar/core/future-util.hh>
#include "audit/audit_composite_storage_helper.hh"
#include "utils/class_registrator.hh"
namespace audit {
audit_composite_storage_helper::audit_composite_storage_helper(std::vector<std::unique_ptr<storage_helper>>&& storage_helpers)
: _storage_helpers(std::move(storage_helpers))
{}
future<> audit_composite_storage_helper::start(const db::config& cfg) {
return seastar::parallel_for_each(
_storage_helpers,
[&cfg] (std::unique_ptr<storage_helper>& h) {
return h->start(cfg);
}
);
}
future<> audit_composite_storage_helper::stop() {
return seastar::parallel_for_each(
_storage_helpers,
[] (std::unique_ptr<storage_helper>& h) {
return h->stop();
}
);
}
future<> audit_composite_storage_helper::write(const audit_info* audit_info,
socket_address node_ip,
socket_address client_ip,
db::consistency_level cl,
const sstring& username,
bool error) {
return seastar::parallel_for_each(
_storage_helpers,
[audit_info, node_ip, client_ip, cl, &username, error](std::unique_ptr<storage_helper>& h) {
return h->write(audit_info, node_ip, client_ip, cl, username, error);
}
);
}
future<> audit_composite_storage_helper::write_login(const sstring& username,
socket_address node_ip,
socket_address client_ip,
bool error) {
return seastar::parallel_for_each(
_storage_helpers,
[&username, node_ip, client_ip, error](std::unique_ptr<storage_helper>& h) {
return h->write_login(username, node_ip, client_ip, error);
}
);
}
} // namespace audit

View File

@@ -0,0 +1,37 @@
/*
* Copyright (C) 2025 ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "audit/audit.hh"
#include <seastar/core/future.hh>
#include "storage_helper.hh"
namespace audit {
class audit_composite_storage_helper : public storage_helper {
std::vector<std::unique_ptr<storage_helper>> _storage_helpers;
public:
explicit audit_composite_storage_helper(std::vector<std::unique_ptr<storage_helper>>&&);
virtual ~audit_composite_storage_helper() = default;
virtual future<> start(const db::config& cfg) override;
virtual future<> stop() override;
virtual future<> write(const audit_info* audit_info,
socket_address node_ip,
socket_address client_ip,
db::consistency_level cl,
const sstring& username,
bool error) override;
virtual future<> write_login(const sstring& username,
socket_address node_ip,
socket_address client_ip,
bool error) override;
};
} // namespace audit

View File

@@ -21,7 +21,6 @@
#include <fmt/chrono.h>
#include "cql3/query_processor.hh"
#include "utils/class_registrator.hh"
namespace cql3 {
@@ -143,7 +142,4 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
co_await syslog_send_helper(msg.c_str());
}
using registry = class_registrator<storage_helper, audit_syslog_storage_helper, cql3::query_processor&, service::migration_manager&>;
static registry registrator1("audit_syslog_storage_helper");
}

View File

@@ -1209,7 +1209,7 @@ future<mutation> create_table_streams_mutation(table_id table, db_clock::time_po
co_return std::move(m);
}
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const std::vector<cdc::stream_id>& stream_ids, api::timestamp_type ts) {
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const utils::chunked_vector<cdc::stream_id>& stream_ids, api::timestamp_type ts) {
auto s = db::system_keyspace::cdc_streams_state();
mutation m(s, partition_key::from_single_value(*s,
@@ -1252,24 +1252,24 @@ future<> generation_service::load_cdc_tablet_streams(std::optional<std::unordere
tables_to_process = _cdc_metadata.get_tables_with_cdc_tablet_streams() | std::ranges::to<std::unordered_set<table_id>>();
}
auto read_streams_state = [this] (const std::optional<std::unordered_set<table_id>>& tables, noncopyable_function<future<>(table_id, db_clock::time_point, std::vector<cdc::stream_id>)> f) -> future<> {
auto read_streams_state = [this] (const std::optional<std::unordered_set<table_id>>& tables, noncopyable_function<future<>(table_id, db_clock::time_point, utils::chunked_vector<cdc::stream_id>)> f) -> future<> {
if (tables) {
for (auto table : *tables) {
co_await _sys_ks.local().read_cdc_streams_state(table, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
co_await _sys_ks.local().read_cdc_streams_state(table, [&] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
return f(table, base_ts, std::move(base_stream_set));
});
}
} else {
co_await _sys_ks.local().read_cdc_streams_state(std::nullopt, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
co_await _sys_ks.local().read_cdc_streams_state(std::nullopt, [&] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
return f(table, base_ts, std::move(base_stream_set));
});
}
};
co_await read_streams_state(changed_tables, [this, &tables_to_process] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
co_await read_streams_state(changed_tables, [this, &tables_to_process] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
table_streams new_table_map;
auto append_stream = [&new_table_map] (db_clock::time_point stream_tp, std::vector<cdc::stream_id> stream_set) {
auto append_stream = [&new_table_map] (db_clock::time_point stream_tp, utils::chunked_vector<cdc::stream_id> stream_set) {
auto ts = std::chrono::duration_cast<api::timestamp_clock::duration>(stream_tp.time_since_epoch()).count();
new_table_map[ts] = committed_stream_set {stream_tp, std::move(stream_set)};
};
@@ -1345,7 +1345,7 @@ future<> generation_service::query_cdc_timestamps(table_id table, bool ascending
}
}
future<> generation_service::query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f) {
future<> generation_service::query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const utils::chunked_vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f) {
const auto& all_tables = _cdc_metadata.get_all_tablet_streams();
auto table_it = all_tables.find(table);
if (table_it == all_tables.end()) {
@@ -1402,8 +1402,8 @@ future<> generation_service::generate_tablet_resize_update(utils::chunked_vector
co_return;
}
std::vector<cdc::stream_id> new_streams;
new_streams.reserve(new_tablet_map.tablet_count());
utils::chunked_vector<cdc::stream_id> new_streams;
co_await utils::reserve_gently(new_streams, new_tablet_map.tablet_count());
for (auto tid : new_tablet_map.tablet_ids()) {
new_streams.emplace_back(new_tablet_map.get_last_token(tid), 0);
co_await coroutine::maybe_yield();
@@ -1425,7 +1425,7 @@ future<> generation_service::generate_tablet_resize_update(utils::chunked_vector
muts.emplace_back(std::move(mut));
}
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const std::vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts) {
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const utils::chunked_vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts) {
utils::chunked_vector<mutation> muts;
muts.reserve(2);

View File

@@ -143,12 +143,12 @@ stream_state read_stream_state(int8_t val);
struct committed_stream_set {
db_clock::time_point ts;
std::vector<cdc::stream_id> streams;
utils::chunked_vector<cdc::stream_id> streams;
};
struct cdc_stream_diff {
std::vector<stream_id> closed_streams;
std::vector<stream_id> opened_streams;
utils::chunked_vector<stream_id> closed_streams;
utils::chunked_vector<stream_id> opened_streams;
};
using table_streams = std::map<api::timestamp_type, committed_stream_set>;
@@ -220,11 +220,11 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);
future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const locator::tablet_map&, api::timestamp_type);
future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const std::vector<cdc::stream_id>&, api::timestamp_type);
future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const utils::chunked_vector<cdc::stream_id>&, api::timestamp_type);
utils::chunked_vector<mutation> make_drop_table_streams_mutations(table_id, api::timestamp_type ts);
future<mutation> get_switch_streams_mutation(table_id table, db_clock::time_point stream_ts, cdc_stream_diff diff, api::timestamp_type ts);
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const std::vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts);
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const utils::chunked_vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts);
table_streams::const_iterator get_new_base_for_gc(const table_streams&, std::chrono::seconds ttl);
} // namespace cdc

View File

@@ -149,7 +149,7 @@ public:
future<> load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables);
future<> query_cdc_timestamps(table_id table, bool ascending, noncopyable_function<future<>(db_clock::time_point)> f);
future<> query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f);
future<> query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const utils::chunked_vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f);
future<> generate_tablet_resize_update(utils::chunked_vector<canonical_mutation>& muts, table_id table, const locator::tablet_map& new_tablet_map, api::timestamp_type ts);

View File

@@ -68,10 +68,15 @@ shared_ptr<locator::abstract_replication_strategy> generate_replication_strategy
return locator::abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, topo);
}
// When dropping a column from a CDC log table, we set the drop timestamp
// `column_drop_leeway` seconds into the future to ensure that for writes concurrent
// with column drop, the write timestamp is before the column drop timestamp.
constexpr auto column_drop_leeway = std::chrono::seconds(5);
} // anonymous namespace
namespace cdc {
static schema_ptr create_log_schema(const schema&, const replica::database&, const keyspace_metadata&,
static schema_ptr create_log_schema(const schema&, const replica::database&, const keyspace_metadata&, api::timestamp_type,
std::optional<table_id> = {}, schema_ptr = nullptr);
}
@@ -183,7 +188,7 @@ public:
muts.emplace_back(std::move(mut));
}
void on_pre_create_column_families(const keyspace_metadata& ksm, std::vector<schema_ptr>& cfms) override {
void on_pre_create_column_families(const keyspace_metadata& ksm, std::vector<schema_ptr>& cfms, api::timestamp_type ts) override {
std::vector<schema_ptr> new_cfms;
for (auto sp : cfms) {
@@ -202,7 +207,7 @@ public:
}
// in seastar thread
auto log_schema = create_log_schema(schema, db, ksm);
auto log_schema = create_log_schema(schema, db, ksm, ts);
new_cfms.push_back(std::move(log_schema));
}
@@ -249,7 +254,7 @@ public:
}
std::optional<table_id> maybe_id = log_schema ? std::make_optional(log_schema->id()) : std::nullopt;
auto new_log_schema = create_log_schema(new_schema, db, *keyspace.metadata(), std::move(maybe_id), log_schema);
auto new_log_schema = create_log_schema(new_schema, db, *keyspace.metadata(), timestamp, std::move(maybe_id), log_schema);
auto log_mut = log_schema
? db::schema_tables::make_update_table_mutations(_ctxt._proxy, keyspace.metadata(), log_schema, new_log_schema, timestamp)
@@ -582,7 +587,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
}
static schema_ptr create_log_schema(const schema& s, const replica::database& db,
const keyspace_metadata& ksm, std::optional<table_id> uuid, schema_ptr old)
const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
{
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner(cdc::cdc_partitioner::classname);
@@ -618,6 +623,28 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
b.with_column(log_meta_column_name_bytes("ttl"), long_type);
b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
b.set_caching_options(caching_options::get_disabled_caching_options());
auto validate_new_column = [&] (const sstring& name) {
// When dropping a column from a CDC log table, we set the drop timestamp to be
// `column_drop_leeway` seconds into the future (see `create_log_schema`).
// Therefore, when recreating a column with the same name, we need to validate
// that it's not recreated too soon and that the drop timestamp has passed.
if (old && old->dropped_columns().contains(name)) {
const auto& drop_info = old->dropped_columns().at(name);
auto create_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(timestamp));
auto drop_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(drop_info.timestamp));
if (drop_time > create_time) {
throw exceptions::invalid_request_exception(format("Cannot add column {} because a column with the same name was dropped too recently. Please retry after {} seconds",
name, std::chrono::duration_cast<std::chrono::seconds>(drop_time - create_time).count() + 1));
}
}
};
auto add_column = [&] (sstring name, data_type type) {
validate_new_column(name);
b.with_column(to_bytes(name), type);
};
auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
for (const auto& column : columns) {
auto type = column.type;
@@ -639,9 +666,9 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
}
));
}
b.with_column(log_data_column_name_bytes(column.name()), type);
add_column(log_data_column_name(column.name_as_text()), type);
if (is_data_col) {
b.with_column(log_data_column_deleted_name_bytes(column.name()), boolean_type);
add_column(log_data_column_deleted_name(column.name_as_text()), boolean_type);
}
if (column.type->is_multi_cell()) {
auto dtype = visit(*type, make_visitor(
@@ -657,7 +684,7 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
throw std::invalid_argument("Should not reach");
}
));
b.with_column(log_data_column_deleted_elements_name_bytes(column.name()), dtype);
add_column(log_data_column_deleted_elements_name(column.name_as_text()), dtype);
}
}
};
@@ -683,7 +710,8 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
// not super efficient, but we don't do this often.
for (auto& col : old->all_columns()) {
if (!b.has_column({col.name(), col.name_as_text() })) {
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
auto drop_ts = api::timestamp_clock::now() + column_drop_leeway;
b.without_column(col.name_as_text(), col.type, drop_ts.time_since_epoch().count());
}
}
}
@@ -1590,7 +1618,7 @@ public:
: _ctx(ctx)
, _schema(std::move(s))
, _dk(std::move(dk))
, _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
, _log_schema(_schema->cdc_schema() ? _schema->cdc_schema() : ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
, _options(options)
, _clustering_row_states(0, clustering_key::hashing(*_schema), clustering_key::equality(*_schema))
, _uses_tablets(ctx._proxy.get_db().local().find_keyspace(_schema->ks_name()).uses_tablets())

View File

@@ -54,7 +54,7 @@ cdc::stream_id get_stream(
}
static cdc::stream_id get_stream(
const std::vector<cdc::stream_id>& streams,
const utils::chunked_vector<cdc::stream_id>& streams,
dht::token tok) {
if (streams.empty()) {
on_internal_error(cdc_log, "get_stream: streams empty");
@@ -159,7 +159,7 @@ cdc::stream_id cdc::metadata::get_vnode_stream(api::timestamp_type ts, dht::toke
return ret;
}
const std::vector<cdc::stream_id>& cdc::metadata::get_tablet_stream_set(table_id tid, api::timestamp_type ts) const {
const utils::chunked_vector<cdc::stream_id>& cdc::metadata::get_tablet_stream_set(table_id tid, api::timestamp_type ts) const {
auto now = api::new_timestamp();
if (ts > now + get_generation_leeway().count()) {
throw exceptions::invalid_request_exception(seastar::format(
@@ -259,10 +259,10 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
return !it->second;
}
future<std::vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
const std::vector<cdc::stream_id>& prev_stream_set,
std::vector<cdc::stream_id> opened,
const std::vector<cdc::stream_id>& closed) {
future<utils::chunked_vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
const utils::chunked_vector<cdc::stream_id>& prev_stream_set,
utils::chunked_vector<cdc::stream_id> opened,
const utils::chunked_vector<cdc::stream_id>& closed) {
if (closed.size() == prev_stream_set.size()) {
// all previous streams are closed, so the next stream set is just the opened streams.
@@ -273,8 +273,8 @@ future<std::vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
// streams and removing the closed streams. we assume each stream set is
// sorted by token, and the result is sorted as well.
std::vector<cdc::stream_id> next_stream_set;
next_stream_set.reserve(prev_stream_set.size() + opened.size() - closed.size());
utils::chunked_vector<cdc::stream_id> next_stream_set;
co_await utils::reserve_gently(next_stream_set, prev_stream_set.size() + opened.size() - closed.size());
auto next_prev = prev_stream_set.begin();
auto next_closed = closed.begin();
@@ -318,8 +318,8 @@ std::vector<table_id> cdc::metadata::get_tables_with_cdc_tablet_streams() const
return _tablet_streams | std::views::keys | std::ranges::to<std::vector<table_id>>();
}
future<cdc::cdc_stream_diff> cdc::metadata::generate_stream_diff(const std::vector<stream_id>& before, const std::vector<stream_id>& after) {
std::vector<stream_id> closed, opened;
future<cdc::cdc_stream_diff> cdc::metadata::generate_stream_diff(const utils::chunked_vector<stream_id>& before, const utils::chunked_vector<stream_id>& after) {
utils::chunked_vector<stream_id> closed, opened;
auto before_it = before.begin();
auto after_it = after.begin();

View File

@@ -49,7 +49,7 @@ class metadata final {
container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
const std::vector<stream_id>& get_tablet_stream_set(table_id tid, api::timestamp_type ts) const;
const utils::chunked_vector<stream_id>& get_tablet_stream_set(table_id tid, api::timestamp_type ts) const;
public:
/* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
@@ -111,14 +111,14 @@ public:
std::vector<table_id> get_tables_with_cdc_tablet_streams() const;
static future<std::vector<stream_id>> construct_next_stream_set(
const std::vector<cdc::stream_id>& prev_stream_set,
std::vector<cdc::stream_id> opened,
const std::vector<cdc::stream_id>& closed);
static future<utils::chunked_vector<stream_id>> construct_next_stream_set(
const utils::chunked_vector<cdc::stream_id>& prev_stream_set,
utils::chunked_vector<cdc::stream_id> opened,
const utils::chunked_vector<cdc::stream_id>& closed);
static future<cdc_stream_diff> generate_stream_diff(
const std::vector<stream_id>& before,
const std::vector<stream_id>& after);
const utils::chunked_vector<stream_id>& before,
const utils::chunked_vector<stream_id>& after);
};

View File

@@ -855,7 +855,7 @@ maintenance_socket: ignore
# enable_create_table_with_compact_storage: false
# Control tablets for new keyspaces.
# Can be set to: disabled|enabled
# Can be set to: disabled|enabled|enforced
#
# When enabled, newly created keyspaces will have tablets enabled by default.
# That can be explicitly disabled in the CREATE KEYSPACE query

View File

@@ -642,7 +642,8 @@ raft_tests = set([
vector_search_tests = set([
'test/vector_search/vector_store_client_test',
'test/vector_search/load_balancer_test'
'test/vector_search/load_balancer_test',
'test/vector_search/client_test'
])
wasms = set([
@@ -1195,6 +1196,7 @@ scylla_core = (['message/messaging_service.cc',
'table_helper.cc',
'audit/audit.cc',
'audit/audit_cf_storage_helper.cc',
'audit/audit_composite_storage_helper.cc',
'audit/audit_syslog_storage_helper.cc',
'tombstone_gc_options.cc',
'tombstone_gc.cc',
@@ -1265,6 +1267,8 @@ scylla_core = (['message/messaging_service.cc',
'utils/disk_space_monitor.cc',
'vector_search/vector_store_client.cc',
'vector_search/dns.cc',
'vector_search/client.cc',
'vector_search/clients.cc'
] + [Antlr3Grammar('cql3/Cql.g')] \
+ scylla_raft_core
)
@@ -1408,6 +1412,8 @@ scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_gener
'test/lib/key_utils.cc',
'test/lib/proc_utils.cc',
'test/lib/gcs_fixture.cc',
'test/lib/aws_kms_fixture.cc',
'test/lib/azure_kms_fixture.cc',
]
scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc', 'utils/exceptions.cc']
@@ -1660,6 +1666,7 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vector_store_client_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
wasm_deps = {}

View File

@@ -1349,7 +1349,7 @@ static managed_bytes reserialize_value(View value_bytes,
if (type.is_map()) {
std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes);
const map_type_impl mapt = dynamic_cast<const map_type_impl&>(type);
const map_type_impl& mapt = dynamic_cast<const map_type_impl&>(type);
const abstract_type& key_type = mapt.get_keys_type()->without_reversed();
const abstract_type& value_type = mapt.get_values_type()->without_reversed();
@@ -1391,7 +1391,7 @@ static managed_bytes reserialize_value(View value_bytes,
const vector_type_impl& vtype = dynamic_cast<const vector_type_impl&>(type);
std::vector<managed_bytes> elements = vtype.split_fragmented(value_bytes);
auto elements_type = vtype.get_elements_type()->without_reversed();
const auto& elements_type = vtype.get_elements_type()->without_reversed();
if (elements_type.bound_value_needs_to_be_reserialized()) {
for (size_t i = 0; i < elements.size(); i++) {

View File

@@ -145,9 +145,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
throw exceptions::configuration_exception(sstring("Missing sub-option '") + compression_parameters::SSTABLE_COMPRESSION + "' for the '" + KW_COMPRESSION + "' option.");
}
compression_parameters cp(*compression_options);
cp.validate(
compression_parameters::dicts_feature_enabled(bool(db.features().sstable_compression_dicts)),
compression_parameters::dicts_usage_allowed(db.get_config().sstable_compression_dictionaries_allow_in_ddl()));
cp.validate(compression_parameters::dicts_feature_enabled(bool(db.features().sstable_compression_dicts)));
}
auto per_partition_rate_limit_options = get_per_partition_rate_limit_options(schema_extensions);

View File

@@ -112,14 +112,8 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chun
ksm->strategy_name(),
locator::replication_strategy_params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option()),
tmptr->get_topology());
if (rs->uses_tablets()) {
warnings.push_back(
"Tables in this keyspace will be replicated using Tablets "
"and will not support counters features. To use counters, drop this keyspace and re-create it "
"without tablets by adding AND TABLETS = {'enabled': false} to the CREATE KEYSPACE statement.");
if (ksm->initial_tablets().value()) {
warnings.push_back("Keyspace `initial` tablets option is deprecated. Use per-table tablet options instead.");
}
if (rs->uses_tablets() && ksm->initial_tablets().value()) {
warnings.push_back("Keyspace `initial` tablets option is deprecated. Use per-table tablet options instead.");
}
// If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.

View File

@@ -222,7 +222,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
throw exceptions::invalid_request_exception("Cannot set default_time_to_live on a table with counters");
}
if (ks_uses_tablets && pt.is_counter()) {
if (ks_uses_tablets && pt.is_counter() && !db.features().counters_with_tablets) {
throw exceptions::invalid_request_exception(format("Cannot use the 'counter' type for table {}.{}: Counters are not yet supported with tablets", keyspace(), cf_name));
}

View File

@@ -2016,7 +2016,9 @@ vector_indexed_table_select_statement::vector_indexed_table_select_statement(sch
future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::do_execute(
query_processor& qp, service::query_state& state, const query_options& options) const {
return measure_index_latency(*_schema, _index, [this, &qp, &state, &options](this auto) -> future<shared_ptr<cql_transport::messages::result_message>> {
auto limit = get_limit(options, _limit);
auto result = co_await measure_index_latency(*_schema, _index, [this, &qp, &state, &options, &limit](this auto) -> future<shared_ptr<cql_transport::messages::result_message>> {
tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());
validate_for_read(options.get_consistency());
@@ -2024,8 +2026,6 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table
update_stats();
auto limit = get_limit(options, _limit);
if (limit > max_ann_query_limit) {
co_await coroutine::return_exception(exceptions::invalid_request_exception(
fmt::format("Use of ANN OF in an ORDER BY clause requires a LIMIT that is not greater than {}. LIMIT was {}", max_ann_query_limit, limit)));
@@ -2040,6 +2040,12 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table
co_return co_await query_base_table(qp, state, options, pkeys.value());
});
auto page_size = options.get_page_size();
if (page_size > 0 && (uint64_t) page_size < limit) {
result->add_warning("Paging is not supported for Vector Search queries. The entire result set has been returned.");
}
co_return result;
}
void vector_indexed_table_select_statement::update_stats() const {

View File

@@ -77,9 +77,11 @@ future<db::all_batches_replayed> db::batchlog_manager::do_batch_log_replay(post_
});
});
}
co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
bm._last_replay = last_replay;
});
if (all_replayed == all_batches_replayed::yes) {
co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
bm._last_replay = last_replay;
});
}
blogger.debug("Batchlog replay on shard {}: done", dest);
co_return all_replayed;
});
@@ -188,6 +190,7 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
all_replayed = all_batches_replayed::no;
co_return stop_iteration::no;
}

View File

@@ -3329,7 +3329,6 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
commit_load_reader_func func;
input_stream<char> fin;
replay_state::impl& state;
input_stream<char> r;
uint64_t id = 0;
size_t pos = 0;
size_t next = 0;

View File

@@ -1315,15 +1315,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format. Deprecated, please use \"sstable_format\" instead.")
, enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format. Deprecated, please use \"sstable_format\" instead.")
, sstable_format(this, "sstable_format", liveness::LiveUpdate, value_status::Used, "me", "Default sstable file format", {"md", "me", "ms"})
, sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{},
, sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{compression_parameters::algorithm::lz4_with_dicts},
"Server-global user table compression options. If enabled, all user tables "
"will be compressed using the provided options, unless overridden "
"by compression options in the table schema. The available options are:\n"
"* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor (default), LZ4WithDictsCompressor, SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
"* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor, LZ4WithDictsCompressor (default), SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
"* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
"* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
"* compression_level: (Default: 3) Compression level for ZstdCompressor and ZstdWithDictsCompressor. Higher levels provide better compression ratios at the cost of speed. Allowed values are integers between 1 and 22.")
, sstable_compression_dictionaries_allow_in_ddl(this, "sstable_compression_dictionaries_allow_in_ddl", liveness::LiveUpdate, value_status::Used, true,
, sstable_compression_dictionaries_allow_in_ddl(this, "sstable_compression_dictionaries_allow_in_ddl", liveness::LiveUpdate, value_status::Deprecated, true,
"Allows for configuring tables to use SSTable compression with shared dictionaries. "
"If the option is disabled, Scylla will reject CREATE and ALTER statements which try to set dictionary-based sstable compressors.\n"
"This is only enforced when this node validates a new DDL statement; disabling the option won't disable dictionary-based compression "
@@ -1425,7 +1425,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, alternator_port(this, "alternator_port", value_status::Used, 0, "Alternator API port.")
, alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port.")
, alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address.")
, alternator_enforce_authorization(this, "alternator_enforce_authorization", value_status::Used, false, "Enforce checking the authorization header for every request in Alternator.")
, alternator_enforce_authorization(this, "alternator_enforce_authorization", liveness::LiveUpdate, value_status::Used, false, "Enforce checking the authorization header for every request in Alternator.")
, alternator_warn_authorization(this, "alternator_warn_authorization", liveness::LiveUpdate, value_status::Used, false, "Count and log warnings about failed authentication or authorization")
, alternator_write_isolation(this, "alternator_write_isolation", value_status::Used, "", "Default write isolation policy for Alternator.")
, alternator_streams_time_window_s(this, "alternator_streams_time_window_s", value_status::Used, 10, "CDC query confidence window for alternator streams.")
, alternator_timeout_in_ms(this, "alternator_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
@@ -1525,9 +1526,9 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, error_injections_at_startup(this, "error_injections_at_startup", error_injection_value_status, {}, "List of error injections that should be enabled on startup.")
, topology_barrier_stall_detector_threshold_seconds(this, "topology_barrier_stall_detector_threshold_seconds", value_status::Used, 2, "Report sites blocking topology barrier if it takes longer than this.")
, enable_tablets(this, "enable_tablets", value_status::Used, false, "Enable tablets for newly created keyspaces. (deprecated)")
, tablets_mode_for_new_keyspaces(this, "tablets_mode_for_new_keyspaces", value_status::Used, tablets_mode_t::mode::unset, "Control tablets for new keyspaces. Can be set to the following values:\n"
, tablets_mode_for_new_keyspaces(this, "tablets_mode_for_new_keyspaces", liveness::LiveUpdate, value_status::Used, tablets_mode_t::mode::unset, "Control tablets for new keyspaces. Can be set to the following values:\n"
"\tdisabled: New keyspaces use vnodes by default, unless enabled by the tablets={'enabled':true} option\n"
"\tenabled: New keyspaces use tablets by default, unless disabled by the tablets={'disabled':true} option\n"
"\tenabled: New keyspaces use tablets by default, unless disabled by the tablets={'enabled':false} option\n"
"\tenforced: New keyspaces must use tablets. Tablets cannot be disabled using the CREATE KEYSPACE option")
, view_flow_control_delay_limit_in_ms(this, "view_flow_control_delay_limit_in_ms", liveness::LiveUpdate, value_status::Used, 1000,
"The maximal amount of time that materialized-view update flow control may delay responses "


@@ -458,6 +458,7 @@ public:
named_value<uint16_t> alternator_https_port;
named_value<sstring> alternator_address;
named_value<bool> alternator_enforce_authorization;
named_value<bool> alternator_warn_authorization;
named_value<sstring> alternator_write_isolation;
named_value<uint32_t> alternator_streams_time_window_s;
named_value<uint32_t> alternator_timeout_in_ms;


@@ -30,6 +30,8 @@
#include "mutation/frozen_mutation.hh"
#include "schema/schema_fwd.hh"
#include "utils/assert.hh"
#include "cdc/log.hh"
#include "cdc/cdc_partitioner.hh"
#include "view_info.hh"
#include "replica/database.hh"
#include "lang/manager.hh"
@@ -592,9 +594,48 @@ future<> schema_applier::merge_tables_and_views()
// diffs bound to current shard
auto& local_views = _affected_tables_and_views.tables_and_views.local().views;
auto& local_tables = _affected_tables_and_views.tables_and_views.local().tables;
auto& local_cdc = _affected_tables_and_views.tables_and_views.local().cdc;
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&] (schema_mutations sm, schema_diff_side) {
return create_table_from_mutations(_proxy, std::move(sm), user_types);
// Create CDC tables before non-CDC base tables, because we want the base tables with CDC enabled
// to point to their CDC tables.
local_cdc = diff_table_or_view(_proxy, _before.cdc, _after.cdc, _reload, [&] (schema_mutations sm, schema_diff_side) {
return create_table_from_mutations(_proxy, std::move(sm), user_types, nullptr);
});
local_tables = diff_table_or_view(_proxy, _before.tables, _after.tables, _reload, [&] (schema_mutations sm, schema_diff_side side) {
// If the table has CDC enabled, find the CDC schema version and set it in the table schema.
// If the table is created or altered with CDC enabled, then the CDC
// table is also created or altered in the same operation, so we can
// find its schema version in the CDC schemas we created above in
// local_cdc.
query::result_set rs(sm.columnfamilies_mutation());
const query::result_set_row& table_row = rs.row(0);
auto ks_name = table_row.get_nonnull<sstring>("keyspace_name");
auto cf_name = table_row.get_nonnull<sstring>("table_name");
auto cdc_name = cdc::log_name(cf_name);
schema_ptr cdc_schema; // optional CDC schema of this table
// We only need to set the CDC schema for created schemas and new altered schemas;
// old altered schemas that we create here will not be used for generating CDC mutations.
if (side == schema_diff_side::right) {
for (const auto& cdc_created : local_cdc.created) {
const auto& new_cdc_schema = cdc_created;
if (new_cdc_schema->ks_name() == ks_name && new_cdc_schema->cf_name() == cdc_name) {
cdc_schema = new_cdc_schema;
break;
}
}
for (const auto& cdc_altered : local_cdc.altered) {
const auto& new_cdc_schema = cdc_altered.new_schema;
if (new_cdc_schema->ks_name() == ks_name && new_cdc_schema->cf_name() == cdc_name) {
cdc_schema = new_cdc_schema;
break;
}
}
}
return create_table_from_mutations(_proxy, std::move(sm), user_types, cdc_schema);
});
local_views = diff_table_or_view(_proxy, _before.views, _after.views, _reload, [&] (schema_mutations sm, schema_diff_side side) {
// The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
@@ -641,11 +682,14 @@ future<> schema_applier::merge_tables_and_views()
// create schema_ptrs for all shards
frozen_schema_diff tables_frozen = co_await local_tables.freeze();
frozen_schema_diff cdc_frozen = co_await local_cdc.freeze();
frozen_schema_diff views_frozen = co_await local_views.freeze();
co_await _affected_tables_and_views.tables_and_views.invoke_on_others([this, &tables_frozen, &views_frozen] (affected_tables_and_views_per_shard& tables_and_views) -> future<> {
co_await _affected_tables_and_views.tables_and_views.invoke_on_others([this, &tables_frozen, &cdc_frozen, &views_frozen] (affected_tables_and_views_per_shard& tables_and_views) -> future<> {
auto& db = _proxy.local().get_db().local();
tables_and_views.tables = co_await schema_diff_per_shard::copy_from(
db, _types_storage, tables_frozen);
tables_and_views.cdc = co_await schema_diff_per_shard::copy_from(
db, _types_storage, cdc_frozen);
tables_and_views.views = co_await schema_diff_per_shard::copy_from(
db, _types_storage, views_frozen);
});
@@ -661,23 +705,28 @@ future<> schema_applier::merge_tables_and_views()
_affected_tables_and_views.table_shards.insert({uuid,
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
});
co_await max_concurrent_for_each(local_cdc.dropped, max_concurrent, [&db, this] (schema_ptr& dt) -> future<> {
auto uuid = dt->id();
_affected_tables_and_views.table_shards.insert({uuid,
co_await replica::database::prepare_drop_table_on_all_shards(db, uuid)});
});
}
future<frozen_schema_diff> schema_diff_per_shard::freeze() const {
frozen_schema_diff result;
for (const auto& c : created) {
result.created.emplace_back(frozen_schema_with_base_info(c));
result.created.emplace_back(extended_frozen_schema(c));
co_await coroutine::maybe_yield();
}
for (const auto& a : altered) {
result.altered.push_back(frozen_schema_diff::altered_schema{
.old_schema = frozen_schema_with_base_info(a.old_schema),
.new_schema = frozen_schema_with_base_info(a.new_schema),
.old_schema = extended_frozen_schema(a.old_schema),
.new_schema = extended_frozen_schema(a.new_schema),
});
co_await coroutine::maybe_yield();
}
for (const auto& d : dropped) {
result.dropped.emplace_back(frozen_schema_with_base_info(d));
result.dropped.emplace_back(extended_frozen_schema(d));
co_await coroutine::maybe_yield();
}
co_return result;
@@ -715,16 +764,20 @@ static future<> notify_tables_and_views(service::migration_notifier& notifier, c
};
const auto& tables = diff.tables_and_views.local().tables;
const auto& cdc = diff.tables_and_views.local().cdc;
const auto& views = diff.tables_and_views.local().views;
// View drops are notified first, because a table can only be dropped if its views are already deleted
co_await notify(views.dropped, [&] (auto&& dt) { return notifier.drop_view(view_ptr(dt)); });
co_await notify(tables.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
co_await notify(cdc.dropped, [&] (auto&& dt) { return notifier.drop_column_family(dt); });
// Table creations are notified first, in case a view is created right after the table
co_await notify(tables.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
co_await notify(cdc.created, [&] (auto&& gs) { return notifier.create_column_family(gs); });
co_await notify(views.created, [&] (auto&& gs) { return notifier.create_view(view_ptr(gs)); });
// Table altering is notified first, in case new base columns appear
co_await notify(tables.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
co_await notify(cdc.altered, [&] (auto&& altered) { return notifier.update_column_family(altered.new_schema, *it++); });
co_await notify(views.altered, [&] (auto&& altered) { return notifier.update_view(view_ptr(altered.new_schema), *it++); });
}
@@ -782,13 +835,38 @@ future<> schema_applier::merge_aggregates() {
});
}
struct extracted_cdc {
std::map<table_id, schema_mutations> tables_without_cdc;
std::map<table_id, schema_mutations> cdc_tables;
};
static extracted_cdc extract_cdc(std::map<table_id, schema_mutations> tables) {
std::map<table_id, schema_mutations> cdc_tables;
auto it = tables.begin();
while (it != tables.end()) {
if (it->second.partitioner() == cdc::cdc_partitioner::classname) {
auto node = tables.extract(it++);
cdc_tables.insert(std::move(node));
} else {
++it;
}
}
return extracted_cdc{std::move(tables), std::move(cdc_tables)};
}
future<schema_persisted_state> schema_applier::get_schema_persisted_state() {
auto tables_and_cdc = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::table, _affected_tables);
auto [tables, cdc] = extract_cdc(std::move(tables_and_cdc));
schema_persisted_state v{
.keyspaces = co_await read_schema_for_keyspaces(_proxy, KEYSPACES, _keyspaces),
.scylla_keyspaces = co_await read_schema_for_keyspaces(_proxy, SCYLLA_KEYSPACES, _keyspaces),
.tables = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::table, _affected_tables),
.tables = std::move(tables),
.types = co_await read_schema_for_keyspaces(_proxy, TYPES, _keyspaces),
.views = co_await read_tables_for_keyspaces(_proxy, _keyspaces, table_kind::view, _affected_tables),
.cdc = std::move(cdc),
.functions = co_await read_schema_for_keyspaces(_proxy, FUNCTIONS, _keyspaces),
.aggregates = co_await read_schema_for_keyspaces(_proxy, AGGREGATES, _keyspaces),
.scylla_aggregates = co_await read_schema_for_keyspaces(_proxy, SCYLLA_AGGREGATES, _keyspaces),
@@ -897,6 +975,7 @@ public:
};
auto& tables_and_views = _sa._affected_tables_and_views.tables_and_views.local();
co_await include_pending_changes(tables_and_views.tables);
co_await include_pending_changes(tables_and_views.cdc);
co_await include_pending_changes(tables_and_views.views);
for (auto& [id, schema] : table_schemas) {
@@ -944,6 +1023,7 @@ void schema_applier::commit_tables_and_views() {
auto& db = sharded_db.local();
auto& diff = _affected_tables_and_views;
const auto& tables = diff.tables_and_views.local().tables;
const auto& cdc = diff.tables_and_views.local().cdc;
const auto& views = diff.tables_and_views.local().views;
for (auto& dropped_view : views.dropped) {
@@ -954,6 +1034,15 @@ void schema_applier::commit_tables_and_views() {
auto s = dropped_table.get();
replica::database::drop_table(sharded_db, s->ks_name(), s->cf_name(), true, diff.table_shards[s->id()]);
}
for (auto& dropped_cdc : cdc.dropped) {
auto s = dropped_cdc.get();
replica::database::drop_table(sharded_db, s->ks_name(), s->cf_name(), true, diff.table_shards[s->id()]);
}
for (auto& schema : cdc.created) {
auto& ks = db.find_keyspace(schema->ks_name());
db.add_column_family(ks, schema, ks.make_column_family_config(*schema, db), replica::database::is_new_cf::yes, _pending_token_metadata.local());
}
for (auto& schema : tables.created) {
auto& ks = db.find_keyspace(schema->ks_name());
@@ -965,7 +1054,11 @@ void schema_applier::commit_tables_and_views() {
db.add_column_family(ks, schema, ks.make_column_family_config(*schema, db), replica::database::is_new_cf::yes, _pending_token_metadata.local());
}
diff.tables_and_views.local().columns_changed.reserve(tables.altered.size() + views.altered.size());
diff.tables_and_views.local().columns_changed.reserve(tables.altered.size() + cdc.altered.size() + views.altered.size());
for (auto&& altered : cdc.altered) {
bool changed = db.update_column_family(altered.new_schema);
diff.tables_and_views.local().columns_changed.push_back(changed);
}
for (auto&& altered : boost::range::join(tables.altered, views.altered)) {
bool changed = db.update_column_family(altered.new_schema);
diff.tables_and_views.local().columns_changed.push_back(changed);
@@ -1052,6 +1145,10 @@ future<> schema_applier::finalize_tables_and_views() {
auto s = dropped_table.get();
co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
}
for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
auto s = dropped_cdc.get();
co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
}
if (_tablet_hint) {
auto& db = sharded_db.local();
@@ -1062,7 +1159,11 @@ future<> schema_applier::finalize_tables_and_views() {
co_await sharded_db.invoke_on_all([&diff] (replica::database& db) -> future<> {
const auto& tables = diff.tables_and_views.local().tables;
const auto& cdc = diff.tables_and_views.local().cdc;
const auto& views = diff.tables_and_views.local().views;
for (auto& created_cdc : cdc.created) {
co_await db.make_column_family_directory(created_cdc);
}
for (auto& created_table : tables.created) {
co_await db.make_column_family_directory(created_table);
}


@@ -48,6 +48,7 @@ struct schema_persisted_state {
std::map<table_id, schema_mutations> tables;
schema_tables::schema_result types;
std::map<table_id, schema_mutations> views;
std::map<table_id, schema_mutations> cdc;
schema_tables::schema_result functions;
schema_tables::schema_result aggregates;
schema_tables::schema_result scylla_aggregates;
@@ -105,12 +106,12 @@ public:
struct frozen_schema_diff {
struct altered_schema {
frozen_schema_with_base_info old_schema;
frozen_schema_with_base_info new_schema;
extended_frozen_schema old_schema;
extended_frozen_schema new_schema;
};
std::vector<frozen_schema_with_base_info> created;
std::vector<extended_frozen_schema> created;
std::vector<altered_schema> altered;
std::vector<frozen_schema_with_base_info> dropped;
std::vector<extended_frozen_schema> dropped;
};
// schema_diff represents what is happening with tables or views during schema merge
@@ -140,6 +141,7 @@ public:
struct affected_tables_and_views_per_shard {
schema_diff_per_shard tables;
schema_diff_per_shard cdc;
schema_diff_per_shard views;
std::vector<bool> columns_changed;
};


@@ -28,6 +28,7 @@
#include "utils/log.hh"
#include "schema/frozen_schema.hh"
#include "schema/schema_registry.hh"
#include "cdc/cdc_options.hh"
#include "mutation_query.hh"
#include "system_keyspace.hh"
#include "system_distributed_keyspace.hh"
@@ -2077,7 +2078,9 @@ future<schema_ptr> create_table_from_name(sharded<service::storage_proxy>& proxy
co_await coroutine::return_exception(std::runtime_error(format("{}:{} not found in the schema definitions keyspace.", qn.keyspace_name, qn.table_name)));
}
const schema_ctxt& ctxt = proxy;
co_return create_table_from_mutations(ctxt, std::move(sm), ctxt.user_types());
// The CDC schema is set to nullptr because we don't have it yet, but we will
// check and update it soon if needed in create_tables_from_tables_partition.
co_return create_table_from_mutations(ctxt, std::move(sm), ctxt.user_types(), nullptr);
}
// Limit concurrency of user tables to prevent stalls.
@@ -2095,10 +2098,28 @@ constexpr size_t max_concurrent = 8;
future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(sharded<service::storage_proxy>& proxy, const schema_result::mapped_type& result)
{
auto tables = std::map<sstring, schema_ptr>();
auto tables_with_cdc = std::map<sstring, schema_ptr>();
co_await max_concurrent_for_each(result->rows().begin(), result->rows().end(), max_concurrent, [&] (const query::result_set_row& row) -> future<> {
schema_ptr cfm = co_await create_table_from_table_row(proxy, row);
tables.emplace(cfm->cf_name(), std::move(cfm));
if (!cfm->cdc_options().enabled()) {
tables.emplace(cfm->cf_name(), std::move(cfm));
} else {
// Defer tables with CDC enabled: we want to construct all CDC tables first,
// so that we can then construct the schemas for these tables with pointers
// to their CDC schemas.
tables_with_cdc.emplace(cfm->cf_name(), std::move(cfm));
}
});
for (auto&& [name, cfm] : tables_with_cdc) {
schema_ptr cdc_schema;
if (auto it = tables.find(cdc::log_name(name)); it != tables.end()) {
cdc_schema = it->second;
} else {
slogger.warn("Did not find CDC log schema for table {}", name);
}
schema_ptr extended_cfm = cdc_schema ? cfm->make_with_cdc(cdc_schema) : cfm;
tables.emplace(std::move(name), std::move(extended_cfm));
}
co_return std::move(tables);
}
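The deferral logic above boils down to a two-pass link step: parse everything, hold back entries that depend on a sibling entry, then resolve the dependency by name once the full map is known. A minimal sketch under assumed illustrative types — the `_scylla_cdc_log` suffix mirrors the naming convention of `cdc::log_name()`, but the `table` struct and `link_cdc()` are hypothetical, not the real schema types:

```cpp
#include <map>
#include <memory>
#include <string>

// Illustrative stand-in for a table schema that may point at its CDC log.
struct table {
    std::string name;
    bool cdc_enabled = false;
    std::shared_ptr<table> cdc_log; // set in the second pass
};

// Naming convention assumed from cdc::log_name().
inline std::string log_name(const std::string& base) {
    return base + "_scylla_cdc_log";
}

// Pass 1: split parsed tables into plain tables and deferred (CDC-enabled) ones.
// Pass 2: with the full map available, link each deferred table to its log by name.
inline std::map<std::string, std::shared_ptr<table>>
link_cdc(std::map<std::string, std::shared_ptr<table>> parsed) {
    std::map<std::string, std::shared_ptr<table>> out;
    std::map<std::string, std::shared_ptr<table>> deferred;
    for (auto& [name, t] : parsed) {
        (t->cdc_enabled ? deferred : out).emplace(name, t);
    }
    for (auto& [name, t] : deferred) {
        if (auto it = out.find(log_name(name)); it != out.end()) {
            t->cdc_log = it->second; // link base table to its CDC log
        }
        out.emplace(name, t);
    }
    return out;
}
```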
@@ -2247,7 +2268,7 @@ static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, sche
}
}
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, std::optional<table_schema_version> version)
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version)
{
slogger.trace("create_table_from_mutations: version={}, {}", version, sm);
@@ -2331,6 +2352,10 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
}
if (cdc_schema) {
builder.with_cdc_schema(cdc_schema);
}
if (auto partitioner = sm.partitioner()) {
builder.with_partitioner(*partitioner);
builder.with_sharder(smp::count, ctxt.murmur3_partitioner_ignore_msb_bits());


@@ -286,7 +286,7 @@ future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(sharde
utils::chunked_vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
schema_ptr create_table_from_mutations(const schema_ctxt&, schema_mutations, const data_dictionary::user_types_storage& user_types, std::optional<table_schema_version> version = {});
schema_ptr create_table_from_mutations(const schema_ctxt&, schema_mutations, const data_dictionary::user_types_storage& user_types, schema_ptr cdc_schema, std::optional<table_schema_version> version = {});
view_ptr create_view_from_mutations(const schema_ctxt&, schema_mutations, const data_dictionary::user_types_storage&, schema_ptr, std::optional<table_schema_version> version = {});
view_ptr create_view_from_mutations(const schema_ctxt&, schema_mutations, const data_dictionary::user_types_storage&, std::optional<view::base_dependent_view_info> = {}, std::optional<table_schema_version> version = {});


@@ -2463,14 +2463,14 @@ future<bool> system_keyspace::cdc_is_rewritten() {
}
future<> system_keyspace::read_cdc_streams_state(std::optional<table_id> table,
noncopyable_function<future<>(table_id, db_clock::time_point, std::vector<cdc::stream_id>)> f) {
noncopyable_function<future<>(table_id, db_clock::time_point, utils::chunked_vector<cdc::stream_id>)> f) {
static const sstring all_tables_query = format("SELECT table_id, timestamp, stream_id FROM {}.{}", NAME, CDC_STREAMS_STATE);
static const sstring single_table_query = format("SELECT table_id, timestamp, stream_id FROM {}.{} WHERE table_id = ?", NAME, CDC_STREAMS_STATE);
struct cur_t {
table_id tid;
db_clock::time_point ts;
std::vector<cdc::stream_id> streams;
utils::chunked_vector<cdc::stream_id> streams;
};
std::optional<cur_t> cur;
@@ -2487,7 +2487,7 @@ future<> system_keyspace::read_cdc_streams_state(std::optional<table_id> table,
if (cur) {
co_await f(cur->tid, cur->ts, std::move(cur->streams));
}
cur = { tid, ts, std::vector<cdc::stream_id>() };
cur = { tid, ts, utils::chunked_vector<cdc::stream_id>() };
}
cur->streams.push_back(std::move(stream_id));
@@ -3681,6 +3681,11 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
if (some_row.has("ignore_nodes")) {
ret.ignored_nodes = decode_nodes_ids(deserialize_set_column(*topology(), some_row, "ignore_nodes"));
}
ret.excluded_tablet_nodes = ret.ignored_nodes;
for (const auto& [id, _]: ret.left_nodes_rs) {
ret.excluded_tablet_nodes.insert(id);
}
}
co_return ret;


@@ -601,7 +601,7 @@ public:
future<bool> cdc_is_rewritten();
future<> cdc_set_rewritten(std::optional<cdc::generation_id_v1>);
future<> read_cdc_streams_state(std::optional<table_id> table, noncopyable_function<future<>(table_id, db_clock::time_point, std::vector<cdc::stream_id>)> f);
future<> read_cdc_streams_state(std::optional<table_id> table, noncopyable_function<future<>(table_id, db_clock::time_point, utils::chunked_vector<cdc::stream_id>)> f);
future<> read_cdc_streams_history(table_id table, std::optional<db_clock::time_point> from, noncopyable_function<future<>(table_id, db_clock::time_point, cdc::cdc_stream_diff)> f);
// Load Raft Group 0 id from scylla.local


@@ -3311,15 +3311,6 @@ public:
_step.base->schema()->cf_name(), _step.current_token(), view_names);
}
if (_step.reader.is_end_of_stream() && _step.reader.is_buffer_empty()) {
if (_step.current_key.key().is_empty()) {
// consumer got end-of-stream without consuming a single partition
vlogger.debug("Reader didn't produce anything, marking views as built");
while (!_step.build_status.empty()) {
_built_views.views.push_back(std::move(_step.build_status.back()));
_step.build_status.pop_back();
}
}
// before going back to the minimum token, advance current_key to the end
// and check for built views in that range.
_step.current_key = { _step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
@@ -3338,6 +3329,7 @@ public:
// Called in the context of a seastar::thread.
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
inject_failure("dont_start_build_step");
gc_clock::time_point now = gc_clock::now();
auto compaction_state = make_lw_shared<compact_for_query_state>(
*step.reader.schema(),
@@ -3372,6 +3364,7 @@ void view_builder::execute(build_step& step, exponential_backoff_retry r) {
seastar::when_all_succeed(bookkeeping_ops.begin(), bookkeeping_ops.end()).handle_exception([] (std::exception_ptr ep) {
vlogger.warn("Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
}).get();
utils::get_local_injector().inject("delay_finishing_build_step", utils::wait_for_message(60s)).get();
}
future<> view_builder::mark_as_built(view_ptr view) {


@@ -29,6 +29,8 @@
#include "db/view/view_building_task_mutation_builder.hh"
#include "utils/assert.hh"
#include "idl/view.dist.hh"
#include "utils/error_injection.hh"
#include "utils/log.hh"
static logging::logger vbc_logger("view_building_coordinator");
@@ -334,10 +336,13 @@ future<bool> view_building_coordinator::work_on_view_building(service::group0_gu
_remote_work.erase(replica);
}
}
if (!_gossiper.is_alive(replica.host)) {
const bool ignore_gossiper = utils::get_local_injector().enter("view_building_coordinator_ignore_gossiper");
if (!_gossiper.is_alive(replica.host) && !ignore_gossiper) {
vbc_logger.debug("Replica {} is dead", replica);
continue;
}
if (skip_work_on_this_replica) {
continue;
}
@@ -439,11 +444,22 @@ void view_building_coordinator::attach_to_started_tasks(const locator::tablet_re
}
future<std::optional<view_building_coordinator::remote_work_results>> view_building_coordinator::work_on_tasks(locator::tablet_replica replica, std::vector<utils::UUID> tasks) {
constexpr auto backoff_duration = std::chrono::seconds(1);
static thread_local logger::rate_limit rate_limit{backoff_duration};
std::vector<view_task_result> remote_results;
bool rpc_failed = false;
try {
remote_results = co_await ser::view_rpc_verbs::send_work_on_view_building_tasks(&_messaging, replica.host, _as, tasks);
} catch (...) {
vbc_logger.warn("Work on tasks {} on replica {}, failed with error: {}", tasks, replica, std::current_exception());
vbc_logger.log(log_level::warn, rate_limit, "Work on tasks {} on replica {}, failed with error: {}",
tasks, replica, std::current_exception());
rpc_failed = true;
}
if (rpc_failed) {
co_await seastar::sleep(backoff_duration);
_vb_sm.event.broadcast();
co_return std::nullopt;
}


@@ -245,23 +245,21 @@ future<> view_building_worker::create_staging_sstable_tasks() {
// First, reorganize `_sstables_to_register` for easier movement.
// This is done in a separate loop after committing the group0 command, because we need to move values from `_sstables_to_register`
// (`staging_sstable_task_info` is non-copyable because of its `foreign_ptr` field).
std::unordered_map<shard_id, std::unordered_map<table_id, std::unordered_map<dht::token, std::vector<foreign_ptr<sstables::shared_sstable>>>>> new_sstables_per_shard;
std::unordered_map<shard_id, std::unordered_map<table_id, std::vector<foreign_ptr<sstables::shared_sstable>>>> new_sstables_per_shard;
for (auto& [table_id, sst_infos]: _sstables_to_register) {
for (auto& sst_info: sst_infos) {
new_sstables_per_shard[sst_info.shard][table_id][sst_info.last_token].push_back(std::move(sst_info.sst_foreign_ptr));
new_sstables_per_shard[sst_info.shard][table_id].push_back(std::move(sst_info.sst_foreign_ptr));
}
}
for (auto& [shard, sstables_per_table]: new_sstables_per_shard) {
co_await container().invoke_on(shard, [sstables_for_this_shard = std::move(sstables_per_table)] (view_building_worker& local_vbw) mutable {
for (auto& [tid, ssts_map]: sstables_for_this_shard) {
for (auto& [token, ssts]: ssts_map) {
auto unwrapped_ssts = ssts | std::views::as_rvalue | std::views::transform([] (auto&& fptr) {
return fptr.unwrap_on_owner_shard();
}) | std::ranges::to<std::vector>();
auto& tid_ssts = local_vbw._staging_sstables[tid][token];
tid_ssts.insert(tid_ssts.end(), std::make_move_iterator(unwrapped_ssts.begin()), std::make_move_iterator(unwrapped_ssts.end()));
}
for (auto& [tid, ssts]: sstables_for_this_shard) {
auto unwrapped_ssts = ssts | std::views::as_rvalue | std::views::transform([] (auto&& fptr) {
return fptr.unwrap_on_owner_shard();
}) | std::ranges::to<std::vector>();
auto& tid_ssts = local_vbw._staging_sstables[tid];
tid_ssts.insert(tid_ssts.end(), std::make_move_iterator(unwrapped_ssts.begin()), std::make_move_iterator(unwrapped_ssts.end()));
}
});
}
@@ -328,7 +326,7 @@ std::unordered_map<table_id, std::vector<view_building_worker::staging_sstable_t
// or maybe it can be registered to view_update_generator directly.
tasks_to_create[table_id].emplace_back(table_id, shard, last_token, make_foreign(std::move(sstable)));
} else {
_staging_sstables[table_id][last_token].push_back(std::move(sstable));
_staging_sstables[table_id].push_back(std::move(sstable));
}
}
});
@@ -848,13 +846,54 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
}
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
if (_staging_sstables[table_id][last_token].empty()) {
if (_staging_sstables[table_id].empty()) {
co_return;
}
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
auto sstables = std::exchange(_staging_sstables[table_id][last_token], {});
co_await _vug.process_staging_sstables(std::move(table), std::move(sstables));
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
std::vector<sstables::shared_sstable> sstables_to_process;
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
}
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
try {
// Remove processed sstables from `_staging_sstables` map
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
std::unordered_set<sstables::shared_sstable> sstables_to_remove(sstables_to_process.begin(), sstables_to_process.end());
auto [first, last] = std::ranges::remove_if(_staging_sstables[table_id], [&] (auto& sst) {
return sstables_to_remove.contains(sst);
});
_staging_sstables[table_id].erase(first, last);
} catch (semaphore_aborted&) {
vbw_logger.warn("Semaphore was aborted while waiting to remove processed sstables for table {}", table_id);
}
}
void view_building_worker::load_sstables(table_id table_id, std::vector<sstables::shared_sstable> ssts) {
std::ranges::copy_if(std::move(ssts), std::back_inserter(_staging_sstables[table_id]), [] (auto& sst) {
return sst->state() == sstables::sstable_state::staging;
});
}
void view_building_worker::cleanup_staging_sstables(locator::effective_replication_map_ptr erm, table_id table_id, locator::tablet_id tid) {
auto& tablet_map = erm->get_token_metadata().tablets().get_tablet_map(table_id);
auto tablet_range = tablet_map.get_token_range(tid);
auto [first, last] = std::ranges::remove_if(_staging_sstables[table_id], [&] (auto& sst) {
auto sst_last_token = sst->get_last_decorated_key().token();
return tablet_range.contains(sst_last_token, dht::token_comparator());
});
_staging_sstables[table_id].erase(first, last);
}
}


@@ -14,6 +14,7 @@
#include <seastar/core/shared_future.hh>
#include <unordered_map>
#include <unordered_set>
#include "locator/abstract_replication_strategy.hh"
#include "locator/tablets.hh"
#include "seastar/core/gate.hh"
#include "db/view/view_building_state.hh"
@@ -160,7 +161,7 @@ private:
condition_variable _sstables_to_register_event;
semaphore _staging_sstables_mutex = semaphore(1);
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> _sstables_to_register;
std::unordered_map<table_id, std::unordered_map<dht::token, std::vector<sstables::shared_sstable>>> _staging_sstables;
std::unordered_map<table_id, std::vector<sstables::shared_sstable>> _staging_sstables;
future<> _staging_sstables_registrator = make_ready_future<>();
public:
@@ -178,6 +179,11 @@ public:
virtual void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {};
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;
// Used ONLY to load staging sstables migrated during intra-node tablet migration.
void load_sstables(table_id table_id, std::vector<sstables::shared_sstable> ssts);
// Used in cleanup/cleanup-target tablet transition stage
void cleanup_staging_sstables(locator::effective_replication_map_ptr erm, table_id table_id, locator::tablet_id tid);
private:
future<> run_view_building_state_observer();
future<> update_built_views();


@@ -1278,7 +1278,7 @@ public:
static_assert(int(cdc::stream_state::current) < int(cdc::stream_state::closed));
static_assert(int(cdc::stream_state::closed) < int(cdc::stream_state::opened));
co_await _ss.query_cdc_streams(table, [&] (db_clock::time_point ts, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff diff) -> future<> {
co_await _ss.query_cdc_streams(table, [&] (db_clock::time_point ts, const utils::chunked_vector<cdc::stream_id>& current, cdc::cdc_stream_diff diff) -> future<> {
co_await emit_stream_set(ts, cdc::stream_state::current, current);
co_await emit_stream_set(ts, cdc::stream_state::closed, diff.closed_streams);
co_await emit_stream_set(ts, cdc::stream_state::opened, diff.opened_streams);


@@ -21,6 +21,11 @@ import urllib.request
from pkg_resources import parse_version
import multiprocessing as mp
# Python 3.14 changed the default to 'forkserver', which is not compatible
# with our relocatable python. It execs our Python binary, but without our
# ld.so. Change it back to 'fork' to avoid issues.
mp.set_start_method('fork')
VERSION = "1.0"
quiet = False
# Temporary url for the review


@@ -31,4 +31,5 @@ def parse():
parser.add_argument('--replace-address-first-boot', default=None, dest='replaceAddressFirstBoot', help="[[deprecated]] IP address of a dead node to replace.")
parser.add_argument('--dc', default=None, dest='dc', help="The datacenter name for this node, for use with the snitch GossipingPropertyFileSnitch.")
parser.add_argument('--rack', default=None, dest='rack', help="The rack name for this node, for use with the snitch GossipingPropertyFileSnitch.")
parser.add_argument('--blocked-reactor-notify-ms', default='25', dest='blocked_reactor_notify_ms', help="Set the blocked reactor notification timeout in milliseconds. Defaults to 25.")
return parser.parse_known_args()


@@ -46,6 +46,7 @@ class ScyllaSetup:
self._extra_args = extra_arguments
self._dc = arguments.dc
self._rack = arguments.rack
self._blocked_reactor_notify_ms = arguments.blocked_reactor_notify_ms
def _run(self, *args, **kwargs):
logging.info('running: {}'.format(args))
@@ -205,7 +206,7 @@ class ScyllaSetup:
elif self._replaceAddressFirstBoot is not None:
args += ["--replace-address-first-boot %s" % self._replaceAddressFirstBoot]
args += ["--blocked-reactor-notify-ms 999999999"]
args += ["--blocked-reactor-notify-ms %s" % self._blocked_reactor_notify_ms]
with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % (" ".join(args) + " " + " ".join(self._extra_args)))


@@ -95,11 +95,12 @@ class DBConfigParser:
for match in config_matches:
name = match[1].strip()
liveness_value = match[3].strip() if match[3] else ""
property_data = {
"name": name,
"value_status": match[4].strip(),
"default": match[5].strip(),
"liveness": "True" if match[3] else "False",
"liveness": "True" if liveness_value == "LiveUpdate" else "False",
"description": match[6].strip(),
}
properties.append(property_data)
@@ -135,7 +136,7 @@ class DBConfigParser:
end_pos = next_group_match.start() if next_group_match else len(config_content)
config_group_content = config_content[group_match.end():end_pos]
current_group = self._parse_group(group_match, config_group_content)
groups.append(current_group)


@@ -109,6 +109,32 @@ to do what, configure the following in ScyllaDB's configuration:
alternator_enforce_authorization: true
```
Note: switching `alternator_enforce_authorization` from `false` to `true`
before the client application has the proper secret keys and permission
tables set up will cause the application's requests to immediately fail.
Therefore, we recommend beginning by keeping `alternator_enforce_authorization`
set to `false` and setting `alternator_warn_authorization` to `true`.
This setting will continue to allow all requests without failing on
authentication or authorization errors - but will _count_ would-be
authentication and authorization failures in the two metrics:
* `scylla_alternator_authentication_failures`
* `scylla_alternator_authorization_failures`
`alternator_warn_authorization=true` also generates a WARN-level log message
on each authentication or authorization failure. Each of these log messages
includes the string `alternator_enforce_authorization=true`, and information
that can help pinpoint the source of the error - such as the username
involved in the attempt, and the address of the client sending the request.
When you see that both metrics are not increasing (or, alternatively, that no
more log messages appear), you can be sure that the application is properly
set up and can finally set `alternator_enforce_authorization` to `true`.
You can leave `alternator_warn_authorization` set or unset, depending on
whether or not you want to see log messages when requests fail on
authentication/authorization (in any case, the metric counts these failures,
and the client will also get the error).
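The recommended rollout can be checked mechanically: sample the two failure counters, wait, then sample again, and only flip `alternator_enforce_authorization` once neither counter has grown. A hedged Python sketch (`before`/`after` stand for metric samples you would scrape yourself, e.g. from the Prometheus endpoint; `safe_to_enforce` is an illustrative helper, not an existing tool):

```python
FAILURE_METRICS = (
    "scylla_alternator_authentication_failures",
    "scylla_alternator_authorization_failures",
)

def safe_to_enforce(before, after):
    """True when neither failure counter grew between the two samples,
    i.e. enforcement would not have rejected any observed request."""
    return all(after[m] <= before[m] for m in FAILURE_METRICS)

before = {m: 7 for m in FAILURE_METRICS}
still_quiet = dict(before)
regressed = dict(before, scylla_alternator_authorization_failures=9)
```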
Alternator implements the same [signature protocol](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html)
as DynamoDB and the rest of AWS. Clients use, as usual, an access key ID and
a secret access key to prove their identity and the authenticity of their
@@ -312,7 +338,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
Currently, *all* Alternator tables are created as global tables and can
be accessed from all the DCs existing at the time of the table's creation.
If a DC is added after a table is created, the table won't be visible from
the new DC and changing that requires a CQL "ALTER TABLE" statement to
the new DC and changing that requires a CQL "ALTER KEYSPACE" statement to
modify the table's replication strategy.
<https://github.com/scylladb/scylla/issues/5062>


@@ -181,17 +181,18 @@ entire data center, or other data centers, in that case.
## Tablets
"Tablets" are ScyllaDB's new approach to replicating data across a cluster.
It replaces the older approach which was named "vnodes". Compared to vnodes,
tablets are smaller pieces of tables that are easier to move between nodes,
and allow for faster growing or shrinking of the cluster when needed.
It replaces the older approach which was named "vnodes". See
[Data Distribution with Tablets](../architecture/tablets.rst) for details.
In this version, tablet support is incomplete and not all of the features
which Alternator needs are supported with tablets. So currently, new
Alternator tables default to using vnodes - not tablets.
In this version, tablet support is almost complete, so new
Alternator tables default to following what the global configuration flag
[tablets_mode_for_new_keyspaces](../reference/configuration-parameters.rst#confval-tablets_mode_for_new_keyspaces)
tells them to.
However, if you do want to create an Alternator table which uses tablets,
you can do this by specifying the `experimental:initial_tablets` tag in
the CreateTable operation. The value of this tag can be:
If you want to influence whether a specific Alternator table is created with tablets or vnodes,
you can do this by specifying the `system:initial_tablets` tag
(in earlier versions of Scylla the tag was `experimental:initial_tablets`)
in the CreateTable operation. The value of this tag can be:
* Any valid integer as the value of this tag enables tablets.
Typically the number "0" is used - which tells ScyllaDB to pick a reasonable
@@ -199,9 +200,11 @@ the CreateTable operation. The value of this tag can be:
number overrides the default choice of initial number of tablets.
* Any non-integer value - e.g., the string "none" - creates the table
without tablets - i.e., using vnodes.
without tablets - i.e., using vnodes. However, when vnodes are asked for by the tag value,
but tablets are `enforced` by the `tablets_mode_for_new_keyspaces` configuration flag,
an exception will be thrown.
The `experimental:initial_tablets` tag only has any effect while creating
The `system:initial_tablets` tag only has any effect while creating
a new table with CreateTable - changing it later has no effect.
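For illustration, a CreateTable request carrying the tag might be assembled like this Python sketch (the table name and key schema are placeholders; the request shape follows the DynamoDB API, which Alternator implements):

```python
def create_table_request(name, initial_tablets=None):
    """Build a DynamoDB-API CreateTable request body (placeholder schema).
    An integer value such as 0 enables tablets, the string "none" forces
    vnodes, and omitting the tag falls back to the cluster default."""
    req = {
        "TableName": name,
        "AttributeDefinitions": [{"AttributeName": "p", "AttributeType": "S"}],
        "KeySchema": [{"AttributeName": "p", "KeyType": "HASH"}],
        "BillingMode": "PAY_PER_REQUEST",
    }
    if initial_tablets is not None:
        req["Tags"] = [{"Key": "system:initial_tablets",
                        "Value": str(initial_tablets)}]
    return req

req = create_table_request("example_table", initial_tablets=0)
```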
Because the tablets support is incomplete, when tablets are enabled for an


@@ -197,12 +197,6 @@ Limitations and Unsupported Features
throughout its lifetime. Failing to keep that invariant satisfied may result in data inconsistencies,
performance problems, or other issues.
The following ScyllaDB features are not supported if a keyspace has tablets
enabled. If you plan to use any of the features listed below, CREATE your keyspace
:ref:`with tablets disabled <tablets-enable-tablets>`.
* Counters
To enable materialized views and secondary indexes for tablet keyspaces, use
the `--rf-rack-valid-keyspaces` option. See :ref:`Views with tablets <admin-views-with-tablets>` for details.


@@ -883,7 +883,8 @@ when replicas are slow or unresponsive. The following are legal values (case-in
``XPERCENTILE`` 90.5PERCENTILE Coordinators record average per-table response times for all replicas.
If a replica takes longer than ``X`` percent of this table's average
response time, the coordinator queries an additional replica.
``X`` must be between 0 and 100.
``X`` must be between 0 and 100, including those values.
The value is rounded to the nearest 0.1 (1 decimal place).
``XP`` 90.5P Synonym for ``XPERCENTILE``
``Yms`` 25ms If a replica takes more than ``Y`` milliseconds to respond,
the coordinator queries an additional replica.
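A sketch of parsing these values, assuming the grammar described above, with percentiles bounded inclusively and rounded to one decimal place (this is an illustrative parser, not ScyllaDB's implementation):

```python
import re

def parse_speculative_retry(value):
    """Parse a speculative_retry value into (kind, number). Values are
    case-insensitive; percentiles must lie in [0, 100] inclusive and are
    rounded to one decimal place."""
    v = value.strip().upper()
    m = re.fullmatch(r"(\d+(?:\.\d+)?)(?:PERCENTILE|P)", v)
    if m:
        x = round(float(m.group(1)), 1)
        if not 0 <= x <= 100:
            raise ValueError("percentile must be between 0 and 100 inclusive")
        return ("percentile", x)
    m = re.fullmatch(r"(\d+(?:\.\d+)?)MS", v)
    if m:
        return ("ms", float(m.group(1)))
    raise ValueError(f"unrecognized speculative retry value: {value!r}")
```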


@@ -274,8 +274,8 @@ in this case ``[0.1, 0.2, 0.3, 0.4]``.
.. warning::
Currently, vector queries do not support filtering with ``WHERE`` clause, grouping with ``GROUP BY`` and paging.
This will be added in the future releases.
Currently, vector queries do not support filtering with ``WHERE`` clause, returning similarity distances,
grouping with ``GROUP BY`` and paging. This will be added in the future releases.
.. _limit-clause:


@@ -189,3 +189,18 @@ The command displays a table with: option name, effective service level the valu
workload_type | sl2 | batch
timeout | sl1 | 2s
```
## Implementation
### Integration with auth
Service levels ultimately depend on the state of `auth`. Since `auth::service` is initialized long after
`service_level_controller`, we register it separately once it's started, and unregister it right before
it's stopped. For that, we wrap it in a struct called `auth_integration` that manages access to it.
That ensures that `service_level_controller` will not try to reference it beyond its lifetime.
It's important to note that there may still be attempts to fetch an effective service level for a role
or indirectly access `auth::service` in some other way when `auth_integration` is absent. One important
situation to have in mind is when the user connects to Scylla via the maintenance socket. This can happen
early on, well before Scylla is fully initialized. Since we don't have access to `auth` yet, we need to
ensure that the semantics of the operations performed on `service_level_controller` still make sense
in that context.
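The pattern can be sketched as follows (names such as `AuthIntegration.effective_service_level` are illustrative, not the actual C++ API):

```python
class AuthIntegration:
    """Sketch of the wrapper described above: the controller only touches
    the auth service through this handle, which may be empty early on
    (e.g. maintenance-socket connections before auth is initialized) or
    again after auth has been stopped."""
    def __init__(self):
        self._auth = None

    def register(self, auth_service):
        self._auth = auth_service

    def unregister(self):
        self._auth = None

    def effective_service_level(self, role, default="default-sl"):
        # With no auth service available, fall back to a sensible default
        # instead of dereferencing a service beyond its lifetime.
        if self._auth is None:
            return default
        return self._auth.service_level_of(role)

class FakeAuth:
    def service_level_of(self, role):
        return "sl1"

ai = AuthIntegration()
```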


@@ -110,14 +110,17 @@ stateDiagram-v2
A node state may have additional parameters associated with it. For instance,
the 'replacing' state has the host ID of the node being replaced as a parameter.
Additionally to specific node states, there entire topology can also be in a transitioning state:
Additionally to specific node states, the entire topology can also be in one of the transitioning states listed below.
Note that these are not all states, as there are other states specific to tablets described in the following sections.
- `join_group0` - a join request from a bootstrapping/replacing node has been accepted. The node joins group 0 and,
in the case of a bootstrapping node, receives bootstrap tokens.
- `commit_cdc_generation` - a new CDC generation data was written to internal tables earlier
and now we need to commit the generation - create a timestamp for it and tell every node
to start using it for CDC log table writes.
- `write_both_read_old` - one of the nodes is in a bootstrapping/decommissioning/removing/replacing state.
Writes are going to both new and old replicas (new replicas means calculated according to modified
token ring), reads are using old replicas.
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
to modified token ring), reads are using old replicas.
- `write_both_read_new` - as above, but reads are using new replicas.
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
@@ -128,8 +131,9 @@ Additionally to specific node states, there entire topology can also be in a tra
requests from starting. Intended to be used in tests which want to prevent internally-triggered topology
operations during the test.
When a node bootstraps, we create new tokens for it and a new CDC generation
and enter the `commit_cdc_generation` state. Once the generation is committed,
When a node bootstraps, we move the topology to `join_group0` state, where we add
the node to group 0, create new tokens for it, and create a new CDC generation.
Then, we enter the `commit_cdc_generation` state. Once the generation is committed,
we enter `write_both_read_old` state. After the entire cluster learns about it,
streaming starts. When streaming finishes, we move to `write_both_read_new`
state and again the whole cluster needs to learn about it and make sure that no
@@ -172,6 +176,13 @@ are the currently supported global topology operations:
contain replicas of the table being truncated. It uses [sessions](#Topology guards)
to make sure that no stale RPCs are executed outside of the scope of the request.
## Zero-token nodes
Zero-token nodes (the nodes started with `join_ring=false`) never own tokens or become
tablet replicas. Hence, the logic described above is significantly simplified for them.
For example, a bootstrapping zero-token node completes the transition on the
`join_group0` state, as the following tasks (like creating a new CDC generation,
streaming, and tablet migrations) are unneeded.
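The bootstrap sequence above, simplified to the ordered list of transition states (later stages such as `left_token_ring` are omitted; `bootstrap_states` is an illustrative helper, not real code):

```python
# Ordered topology transition states a bootstrapping node goes through,
# per the description above (simplified; later stages omitted).
BOOTSTRAP_TRANSITIONS = [
    "join_group0",
    "commit_cdc_generation",
    "write_both_read_old",
    "write_both_read_new",
]

def bootstrap_states(zero_token=False):
    """A zero-token node finishes at join_group0: it never owns tokens,
    so no CDC generation, streaming, or tablet migrations are needed."""
    if zero_token:
        return BOOTSTRAP_TRANSITIONS[:1]
    return list(BOOTSTRAP_TRANSITIONS)
```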
# Load balancing
@@ -678,15 +689,25 @@ CREATE TABLE system.topology (
rebuild_option text,
release_version text,
replaced_id uuid,
ignore_nodes set<uuid>,
ignore_nodes set<uuid> static,
shard_count int,
tokens set<text>,
tokens_string text,
topology_request text,
transition_state text static,
cleanup_status text,
supported_features set<uuid>,
request_id timeuuid,
version bigint static,
fence_version bigint static,
committed_cdc_generations set<tuple<timestamp, timeuuid>> static,
unpublished_cdc_generations set<tuple<timestamp, timeuuid>> static,
global_topology_request text static,
global_topology_request_id timeuuid static,
enabled_features set<text> static,
session uuid static,
tablet_balancing_enabled boolean static,
upgrade_state text static,
new_cdc_generation_data_uuid timeuuid static,
new_keyspace_rf_change_ks_name text static,
new_keyspace_rf_change_data frozen<map<text, text>> static,
@@ -709,12 +730,17 @@ Each node has a clustering row in the table where its `host_id` is the clusterin
- `topology_request` - if set contains one of the supported node-specific topology requests
- `tokens` - if set contains a list of tokens that belongs to the node
- `replaced_id` - if the node is replacing or has replaced another node, this will be the id of that node
- `ignore_nodes` - if set contains a list of ids of nodes ignored during the remove or replace operation
- `rebuild_option` - if the node is being rebuilt, contains the datacenter name that is used as a rebuild source
- `num_tokens` - the requested number of tokens when the node bootstraps
- `tokens_string` - if set contains the `initial_token` value of the bootstrapping node
- `cleanup_status` - contains the cleanup status of the node (clean, needed, or running)
- `supported_features` - if set contains the list of cluster features supported by the node
- `request_id` - the ID of the current request for the node or the last one if there is no current request
There are also a few static columns for cluster-global properties:
- `ignore_nodes` - if set, contains a list of node IDs to be ignored during remove or replace topology operations
and tablet-related operations such as migration, split, and merge.
- `transition_state` - the transitioning state of the cluster (as described earlier), may be null
- `committed_cdc_generations` - the IDs of the committed CDC generations
- `unpublished_cdc_generations` - the IDs of the committed yet unpublished CDC generations
@@ -725,7 +751,12 @@ There are also a few static columns for cluster-global properties:
- `new_keyspace_rf_change_ks_name` - the name of the KS that is being the target of the scheduled ALTER KS statement
- `new_keyspace_rf_change_data` - the KS options to be used when executing the scheduled ALTER KS statement
- `global_requests` - contains a list of ids of pending global requests, the information about requests (type and parameters)
can be obtained from topology_requests table by using request's id as a look up key.
can be obtained from topology_requests table by using request's id as a look up key
- `version` - the current topology version
- `fence_version` - the current fence version
- `enabled_features` - the list of cluster features enabled by the cluster
- `session` - if set contains the ID of the current session
- `tablet_balancing_enabled` - if false, the tablet balancing has been disabled
# Join procedure
@@ -847,20 +878,87 @@ topology coordinator fiber and coordinates the remaining steps:
If a disaster happens and a majority of nodes are lost, changes to the group 0
state are no longer possible and a manual recovery procedure needs to be performed.
Our current procedure starts by switching all nodes to a special "recovery" mode
in which nodes do not use raft at all. In this mode, dead nodes are supposed
to be removed from the cluster via `nodetool removenode`. After all dead nodes
are removed, state related to group 0 is deleted and nodes are restarted in
regular mode, allowing the cluster to re-form group 0.
Topology on raft fits into this procedure in the following way:
## The procedure
- When nodes are restarted in recovery mode, they revert to gossiper-based
operations. This allows to perform `nodetool removenode` without having
a majority of nodes. In this mode, `system.topology` is *not* updated, so
it becomes outdated at the end.
- Before disabling recovery mode on the nodes, the `system.topology` table
needs to be truncated on all nodes. This will cause nodes to revert to
legacy topology operations after exiting recovery mode.
- After re-forming group 0, the cluster needs to be upgraded again to raft
topology by the administrator.
Our current procedure starts by removing the persistent group 0 ID and the group 0
discovery state on all live nodes. This process ensures that live nodes will try to
join a new group 0 during the next restart.
The issue is that one of the live nodes has to create the new group 0, and not every
node is a safe choice. It turns out that we can choose only nodes with the latest
`commit_index` (see this [section](#choosing-the-recovery-leader) for a detailed
explanation). We call the chosen node the *recovery leader*.
Once the recovery leader is chosen, all live nodes can join the new group 0 during
a rolling restart. Nodes learn about the recovery leader through the
`recovery_leader` config option. Also, the recovery leader must be restarted first
to create the new group 0 before other nodes try to join it.
After successfully restarting all live nodes, dead nodes can be removed via
`nodetool removenode` or by replacing them.
Finally, the persisted internal state of the old group 0 can be cleaned up.
## Topology coordinator during recovery
After joining the new group 0 during the procedure, live nodes don't execute any
"special recovery code" related to topology.
In particular, the recovery leader normally starts the topology coordinator fiber.
This fiber is designed to ensure that a started topology operation never hangs
(it succeeds or is rolled back) regardless of the conditions. So, if the majority
has been lost in the middle of some work done by the topology coordinator, the new
topology coordinator (run on the recovery leader) will finish this work. It will
usually fail and be rolled back, e.g., due to `global_token_metadata_barrier`
failing after a global topology command sent to a dead normal node fails.
Note that this behavior is necessary to ensure that the new topology coordinator
will eventually be able to start handling the topology requests to remove/replace
dead nodes. Those requests will succeed thanks to the `ignore_dead_nodes` and
`ignore_dead_nodes_for_replace` options.
## Gossip during recovery
A node always includes its group 0 ID in `gossip_digest_syn`, and the receiving node
rejects the message if the ID is different from its local ID. However, nodes can
temporarily belong to two different group 0's during the recovery procedure. To keep
the gossip working, we've decided to additionally include the local `recovery_leader`
value in `gossip_digest_syn`. Nodes ignore group 0 ID mismatch if the sender or the
receiver has a non-empty `recovery_leader` (usually both have it).
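The acceptance rule can be sketched as a small predicate (illustrative only, not the actual gossip code):

```python
def accept_gossip_syn(local_group0_id, sender_group0_id,
                      local_recovery_leader=None, sender_recovery_leader=None):
    """Reject a gossip_digest_syn on group 0 ID mismatch, except while
    either side is mid-recovery, i.e. has a non-empty recovery_leader."""
    if local_group0_id == sender_group0_id:
        return True
    return bool(local_recovery_leader or sender_recovery_leader)
```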
## Choosing the recovery leader
The group 0 state persisted on the recovery leader becomes the initial state of
other nodes that join the new group 0 (which happens through the Raft snapshot
transfer). After all, the Raft state machine must be consistent at the beginning.
When a disaster happens, live nodes can have different commit indexes, and the nodes
that are behind have no way of catching up without majority. Imagine there are two
live nodes - node A and node B, node A has `commit_index`=10, and node B has
`commit_index`=11. Also, assume that the log entry with index 11 is a schema change
that adds a new column to a table. Node B could have already handled some replica
writes to the new column. If node A became the recovery leader and node B joined the
new group 0, node B would receive a snapshot that regresses its schema version. Node
B could end up in an inconsistent state with data in a column that doesn't exist
according to group 0. Hence, node B must be the recovery leader.
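In other words, the recovery leader is simply the live node with the maximal `commit_index` (a sketch; the node records are illustrative):

```python
def choose_recovery_leader(live_nodes):
    """The safe choice is the node with the highest commit_index: a node
    that is behind would hand out a snapshot regressing state (e.g. the
    schema) on nodes that are ahead of it."""
    return max(live_nodes, key=lambda n: n["commit_index"])["name"]

nodes = [{"name": "A", "commit_index": 10},
         {"name": "B", "commit_index": 11}]
# choose_recovery_leader(nodes) picks node B, matching the example above
```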
## Loss of committed entries
It can happen that a group 0 entry has been committed by a majority consisting of
only dead nodes. Then, no matter what recovery leader we choose, it won't have this
entry. This is fine, assuming that the following group 0 causality property holds on
all live nodes: any persisted effect on a node's state is written only after
the group 0 state it depends on has already been persisted.
For example, the above property holds for schema changes and writes because
a replica persists a write only after applying the group 0 entry with the latest
schema, which is ensured by a read barrier.
It's critical for recovery safety to ensure that no subsystem breaks group 0 causality.
Fortunately, this property is natural and not very limiting.
Losing a committed entry can be observed by external systems. For example, the latest
schema version in the cluster can go back in time from the driver's perspective. This
is outside the scope of the recovery procedure, though, and it shouldn't cause
problems in practice.


@@ -3,8 +3,6 @@
ScyllaDB Counters
==================
.. note:: Counters are not supported in keyspaces with :doc:`tablets</architecture/tablets>` enabled.
Counters are useful for any application where you need to increment a count, such as keeping a track of:
* The number of web page views on a website.


@@ -20,6 +20,8 @@ To clean up the data of a specific node and specific keyspace, use this command:
nodetool -h <host name> cleanup <keyspace>
To clean up the entire cluster, see :doc:`nodetool cluster cleanup </operating-scylla/nodetool-commands/cluster/cleanup/>`
.. warning::
Make sure there are no topology changes before running cleanup. To validate, run ``nodetool status``, all nodes should be in status Up Normal (``UN``).


@@ -0,0 +1,15 @@
Nodetool cluster cleanup
========================
**cluster cleanup** - A process that runs in the background and removes data no longer owned by nodes. Used for non tablet (vnode-based) tables only.
Running ``cluster cleanup`` on a **single node** cleans up all non tablet tables on all nodes in the cluster (tablet enabled tables are cleaned up automatically).
For example:
::
nodetool cluster cleanup
See also `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_.


@@ -5,6 +5,7 @@ Nodetool cluster
:hidden:
repair <repair>
cleanup <cleanup>
**cluster** - Nodetool supercommand for running cluster operations.
@@ -12,3 +13,4 @@ Supported cluster suboperations
-------------------------------
* :doc:`repair </operating-scylla/nodetool-commands/cluster/repair>` :code:`<keyspace>` :code:`<table>` - Repair one or more tablet tables.
* :doc:`cleanup </operating-scylla/nodetool-commands/cluster/cleanup>` - Clean up all non tablet (vnode-based) keyspaces in a cluster


@@ -0,0 +1,47 @@
Nodetool excludenode
====================
.. warning::
You must never use ``nodetool excludenode`` on a running node that can be reached by other nodes in the cluster.
Before using the command, make sure the node is permanently down and cannot be recovered.
Running ``excludenode`` will mark the given nodes as permanently down (excluded).
The cluster will no longer attempt to contact excluded nodes, which unblocks
tablet load balancing, replication changes, etc.
The nodes will be permanently banned from the cluster, meaning you won't be able to bring them back.
Data ownership is not changed, and the nodes are still cluster members,
so they eventually have to be removed or replaced.
After nodes are excluded, there is no need to pass them in the list of ignored
nodes to removenode, replace, or repair.
Prerequisites
------------------------
* Using ``excludenode`` requires at least a quorum of nodes in a cluster to be available.
If the quorum is lost, it must be restored before you change the cluster topology.
See :doc:`Handling Node Failures </troubleshooting/handling-node-failures>` for details.
Usage
--------
Provide the Host IDs of the nodes you want to mark as permanently down.
.. code-block:: console
nodetool excludenode <Host ID> [ ... <Host ID>]
Examples:
.. code-block:: console
nodetool excludenode 2d1e1b0a-4ecb-4128-ba45-36ba558f7aee
.. code-block:: console
nodetool excludenode 2d1e1b0a-4ecb-4128-ba45-36ba558f7aee 73adf19e-2912-4cf6-b9ab-bbc74297b8de
.. include:: nodetool-index.rst


@@ -29,16 +29,27 @@ Load and Stream
.. code::
nodetool refresh <my_keyspace> <my_table> [--load-and-stream | -las] [--scope <scope>]
nodetool refresh <my_keyspace> <my_table> [(--load-and-stream | -las) [[(--primary-replica-only | -pro)] | [--scope <scope>]]]
The Load and Stream feature extends nodetool refresh.
The ``--load-and-stream`` option loads arbitrary sstables into the cluster by reading the sstable data and streaming each partition to the replica(s) that owns it. In addition, the ``--scope`` and ``--primary-replica-only`` options are applied to filter the set of target replicas for each partition. For example, say the old cluster has 6 nodes and the new cluster has 3 nodes. One can copy the sstables from the old cluster to any of the new nodes and trigger refresh with load and stream.
The Load and Stream feature extends nodetool refresh. The new ``-las`` option loads arbitrary sstables that do not belong to a node into the cluster. It loads the sstables from the disk and calculates the data's owning nodes, and streams automatically.
For example, say the old cluster has 6 nodes and the new cluster has 3 nodes. We can copy the sstables from the old cluster to any of the new nodes and trigger the load and stream process.
Load and Stream make restores and migrations much easier:
* You can place sstables from any node on any node
* No need to run nodetool cleanup to remove unused data
With --primary-replica-only (or -pro) option, only the primary replica of each partition in an sstable will be used as the target.
--primary-replica-only must be applied together with --load-and-stream.
--primary-replica-only cannot be used with --scope, they are mutually exclusive.
--primary-replica-only requires repair to be run after the load and stream operation is completed.
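The option rules above amount to a small validation (an illustrative sketch, not the actual nodetool code):

```python
def validate_refresh_options(load_and_stream=False,
                             primary_replica_only=False, scope=None):
    """Raise ValueError on the illegal option combinations spelled out
    above; return None when the combination is legal."""
    if primary_replica_only and not load_and_stream:
        raise ValueError("--primary-replica-only requires --load-and-stream")
    if primary_replica_only and scope is not None:
        raise ValueError("--primary-replica-only and --scope "
                         "are mutually exclusive")
```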
Scope
-----


@@ -47,6 +47,12 @@ Example:
nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy
To only mark the node as permanently down without doing actual removal, use :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>`:
.. code-block:: console
nodetool excludenode <Host ID of the node>
.. _removenode-ignore-dead-nodes:


@@ -53,6 +53,7 @@ Options
* ``--nowait`` - Don't wait on the restore process
* ``--scope <scope>`` - Use specified load-and-stream scope
* ``--sstables-file-list <file>`` - Restore the sstables listed in the given <file>. The list should be newline-separated.
* ``--primary-replica-only`` - Load the sstables and stream to the primary replica node that owns the data. Repair is needed after the restore process.
* ``<sstables>`` - Remainder of keys of the TOC (Table of Contents) components of SSTables to restore, relative to the specified prefix
The `scope` parameter describes the subset of cluster nodes where you want to load data:


@@ -18,84 +18,93 @@ Example output:
Datacenter: datacenter1
=======================
Status=Up/Down
Status=Up/Down/eXcluded
|/ State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns (effective) Host ID Rack
UN 127.0.0.1 394.97 MB 256 33.4% 292a6c7f-2063-484c-b54d-9015216f1750 rack1
UN 127.0.0.2 151.07 MB 256 34.3% 102b6ecd-2081-4073-8172-bf818c35e27b rack1
UN 127.0.0.3 249.07 MB 256 32.3% 20db6ecd-2981-447s-l172-jf118c17o27y rack1
XN 127.0.0.4 149.07 MB 256 32.3% dd961642-c7c6-4962-9f5a-ea774dbaed77 rack1
+----------+---------------------------------------+
|Parameter |Description                            |
+==========+=======================================+
|Datacenter|The data center that holds             |
|          |the information.                       |
+----------+---------------------------------------+
|Status    |``U`` - The node is up.                |
|          |                                       |
|          |``D`` - The node is down.              |
+----------+---------------------------------------+
|State     |``N`` - Normal                         |
|          |                                       |
|          |``L`` - Leaving                        |
|          |                                       |
|          |``J`` - Joining                        |
|          |                                       |
|          |``M`` - Moving                         |
+----------+---------------------------------------+
|Address   |The IP address of the node.            |
+----------+---------------------------------------+
|Load      |The size on disk the ScyllaDB data     |
|          |takes up (updates every 60 seconds).   |
+----------+---------------------------------------+
|Tokens    |The number of tokens per node.         |
+----------+---------------------------------------+
|Owns      |The percentage of data owned by        |
|          |the node (per datacenter) multiplied by|
|          |the replication factor you are using.  |
|          |                                       |
|          |For example, if the node owns 25% of   |
|          |the data and the replication factor    |
|          |is 4, the value will equal 100%.       |
+----------+---------------------------------------+
|Host ID   |The unique identifier (UUID)           |
|          |automatically assigned to the node.    |
+----------+---------------------------------------+
|Rack      |The name of the rack.                  |
+----------+---------------------------------------+
+----------+---------------------------------------------------------------+
|Parameter |Description                                                    |
+==========+===============================================================+
|Datacenter|The data center that holds                                     |
|          |the information.                                               |
+----------+---------------------------------------------------------------+
|Status    |``U`` - The node is up.                                        |
|          |                                                               |
|          |``D`` - The node is down.                                      |
|          |                                                               |
|          |``X`` - The node is :ref:`excluded <status-excluded>`.         |
+----------+---------------------------------------------------------------+
|State     |``N`` - Normal                                                 |
|          |                                                               |
|          |``L`` - Leaving                                                |
|          |                                                               |
|          |``J`` - Joining                                                |
|          |                                                               |
|          |``M`` - Moving                                                 |
+----------+---------------------------------------------------------------+
|Address   |The IP address of the node.                                    |
+----------+---------------------------------------------------------------+
|Load      |The size on disk the ScyllaDB data                             |
|          |takes up (updates every 60 seconds).                           |
+----------+---------------------------------------------------------------+
|Tokens    |The number of tokens per node.                                 |
+----------+---------------------------------------------------------------+
|Owns      |The percentage of data owned by                                |
|          |the node (per datacenter) multiplied by                        |
|          |the replication factor you are using.                          |
|          |                                                               |
|          |For example, if the node owns 25% of                           |
|          |the data and the replication factor                            |
|          |is 4, the value will equal 100%.                               |
+----------+---------------------------------------------------------------+
|Host ID   |The unique identifier (UUID)                                   |
|          |automatically assigned to the node.                            |
+----------+---------------------------------------------------------------+
|Rack      |The name of the rack.                                          |
+----------+---------------------------------------------------------------+
.. _status-excluded:

Nodes in the excluded status (``X``) are down nodes that were marked as excluded
by ``removenode``, ``excludenode``, or node replace; they are considered permanently lost.
See :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>` for more information.
.. include:: nodetool-index.rst

View File

@@ -30,6 +30,7 @@ Nodetool
nodetool-commands/enablebackup
nodetool-commands/enablebinary
nodetool-commands/enablegossip
nodetool-commands/excludenode
nodetool-commands/flush
nodetool-commands/getcompactionthroughput
nodetool-commands/getendpoints
@@ -104,6 +105,7 @@ Operations that are not listed below are currently not available.
* :doc:`enablebackup </operating-scylla/nodetool-commands/enablebackup/>` - Enable incremental backup.
* :doc:`enablebinary </operating-scylla/nodetool-commands/enablebinary/>` - Re-enable native transport (binary protocol).
* :doc:`enablegossip </operating-scylla/nodetool-commands/enablegossip/>` - Re-enable gossip.
* :doc:`excludenode </operating-scylla/nodetool-commands/excludenode/>` - Mark nodes as permanently down.
* :doc:`flush </operating-scylla/nodetool-commands/flush/>` - Flush one or more column families.
* :doc:`getcompactionthroughput </operating-scylla/nodetool-commands/getcompactionthroughput>` - Print the throughput cap for compaction in the system
* :doc:`getendpoints <nodetool-commands/getendpoints/>` :code:`<keyspace>` :code:`<table>` :code:`<key>` - Print the endpoints that own the key.

View File

@@ -165,7 +165,7 @@ bytes hmac_sha256(bytes_view msg, bytes_view key) {
return res;
}
future<temporary_buffer<char>> read_text_file_fully(const sstring& filename) {
future<temporary_buffer<char>> read_text_file_fully(const std::string& filename) {
return open_file_dma(filename, open_flags::ro).then([](file f) {
return f.size().then([f](size_t s) {
return do_with(make_file_input_stream(f), [s](input_stream<char>& in) {
@@ -179,7 +179,7 @@ future<temporary_buffer<char>> read_text_file_fully(const sstring& filename) {
});
}
future<> write_text_file_fully(const sstring& filename, temporary_buffer<char> buf) {
future<> write_text_file_fully(const std::string& filename, temporary_buffer<char> buf) {
return open_file_dma(filename, open_flags::wo|open_flags::create).then([buf = std::move(buf)](file f) mutable {
return make_file_output_stream(f).then([buf = std::move(buf)] (output_stream<char> out) mutable {
return do_with(std::move(out), [buf = std::move(buf)](output_stream<char>& out) mutable {
@@ -193,7 +193,7 @@ future<> write_text_file_fully(const sstring& filename, temporary_buffer<char> b
});
}
future<> write_text_file_fully(const sstring& filename, const sstring& s) {
future<> write_text_file_fully(const std::string& filename, const std::string& s) {
return write_text_file_fully(filename, temporary_buffer<char>(s.data(), s.size()));
}

View File

@@ -63,9 +63,9 @@ bytes calculate_sha256(const bytes&, size_t off = 0, size_t n = bytes::npos);
bytes calculate_sha256(bytes_view);
bytes hmac_sha256(bytes_view msg, bytes_view key);
future<temporary_buffer<char>> read_text_file_fully(const sstring&);
future<> write_text_file_fully(const sstring&, temporary_buffer<char>);
future<> write_text_file_fully(const sstring&, const sstring&);
future<temporary_buffer<char>> read_text_file_fully(const std::string&);
future<> write_text_file_fully(const std::string&, temporary_buffer<char>);
future<> write_text_file_fully(const std::string&, const std::string&);
std::optional<std::chrono::milliseconds> parse_expiry(std::optional<std::string>);

View File

@@ -32,6 +32,7 @@
#include "encryption_exceptions.hh"
#include "symmetric_key.hh"
#include "utils.hh"
#include "utils/exponential_backoff_retry.hh"
#include "utils/hash.hh"
#include "utils/loading_cache.hh"
#include "utils/UUID.hh"
@@ -45,13 +46,43 @@ using namespace std::string_literals;
logger kms_log("kms");
using httpclient = rest::httpclient;
static std::string get_response_error(httpclient::reply_status res) {
switch (res) {
case httpclient::reply_status::unauthorized: case httpclient::reply_status::forbidden: return "AccessDenied";
case httpclient::reply_status::not_found: return "ResourceNotFound";
case httpclient::reply_status::too_many_requests: return "SlowDown";
case httpclient::reply_status::internal_server_error: return "InternalError";
case httpclient::reply_status::service_unavailable: return "ServiceUnavailable";
case httpclient::reply_status::request_timeout: case httpclient::reply_status::gateway_timeout:
case httpclient::reply_status::network_connect_timeout: case httpclient::reply_status::network_read_timeout:
return "RequestTimeout";
default:
return format("{}", res);
}
};
class kms_error : public std::exception {
httpclient::reply_status _res;
std::string _type, _msg;
public:
kms_error(std::string_view type, std::string_view msg)
: _type(type)
, _msg(fmt::format("{}: {}", type, msg))
kms_error(httpclient::reply_status res, std::string type, std::string_view msg)
: _res(res)
, _type(std::move(type))
, _msg(fmt::format("{}: {}", _type, msg))
{}
kms_error(httpclient::reply_status res, std::string_view msg)
: _res(res)
, _type(get_response_error(res))
, _msg(fmt::format("{}: {}", _type, msg))
{}
kms_error(const httpclient::result_type& res, std::string_view msg)
: kms_error(res.result(), msg)
{}
httpclient::reply_status result() const {
return _res;
}
const std::string& type() const {
return _type;
}
@@ -201,7 +232,9 @@ private:
using result_type = httpclient::result_type;
future<result_type> post(aws_query);
future<rjson::value> post(std::string_view target, std::string_view aws_assume_role_arn, const rjson::value& query);
future<rjson::value> do_post(std::string_view target, std::string_view aws_assume_role_arn, const rjson::value& query);
future<key_and_id_type> create_key(const attr_cache_key&);
future<bytes> find_key(const id_cache_key&);
@@ -338,21 +371,27 @@ struct encryption::kms_host::impl::aws_query {
};
future<rjson::value> encryption::kms_host::impl::post(std::string_view target, std::string_view aws_assume_role_arn, const rjson::value& query) {
static auto get_response_error = [](const result_type& res) -> std::string {
switch (res.result()) {
case httpclient::reply_status::unauthorized: case httpclient::reply_status::forbidden: return "AccessDenied";
case httpclient::reply_status::not_found: return "ResourceNotFound";
case httpclient::reply_status::too_many_requests: return "SlowDown";
case httpclient::reply_status::internal_server_error: return "InternalError";
case httpclient::reply_status::service_unavailable: return "ServiceUnavailable";
case httpclient::reply_status::request_timeout: case httpclient::reply_status::gateway_timeout:
case httpclient::reply_status::network_connect_timeout: case httpclient::reply_status::network_read_timeout:
return "RequestTimeout";
default:
return format("{}", res.result());
}
};
static constexpr auto max_retries = 10;
exponential_backoff_retry exr(10ms, 10000ms);
for (int retry = 0; ; ++retry) {
try {
co_return co_await do_post(target, aws_assume_role_arn, query);
} catch (kms_error& e) {
// Special case 503. This can be both actual service or ec2 metadata.
// In either case, do local backoff-retry here.
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#instance-metadata-returns
if (e.result() != httpclient::reply_status::service_unavailable || retry >= max_retries) {
throw;
}
}
co_await exr.retry();
}
}
future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target, std::string_view aws_assume_role_arn, const rjson::value& query) {
static auto query_ec2_meta = [](std::string_view target, std::string token = {}) -> future<std::tuple<httpclient::result_type, std::string>> {
static auto get_env_def = [](std::string_view var, std::string_view def) {
auto val = std::getenv(var.data());
@@ -382,7 +421,7 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
}
kms_log.trace("Result: status={}, response={}", res.result_int(), res);
if (res.result() != httpclient::reply_status::ok) {
throw kms_error(get_response_error(res), "EC2 metadata query");
throw kms_error(res, "EC2 metadata query");
}
co_return res;
};
@@ -394,13 +433,8 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
client.method(httpclient::method_type::PUT);
client.target("/latest/api/token");
auto res = co_await logged_send(client);
if (res.result() != httpclient::reply_status::ok) {
throw kms_error(get_response_error(res), "EC2 metadata token query");
}
token = res.body();
client.clear_headers();
}
@@ -541,7 +575,7 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
aws_secret_access_key = rjson::get<std::string>(body, "SecretAccessKey");
session = rjson::get<std::string>(body, "Token");
} catch (rjson::malformed_value&) {
std::throw_with_nested(kms_error("AccessDenied", fmt::format("Code={}, Message={}"
std::throw_with_nested(kms_error(httpclient::reply_status::forbidden, fmt::format("Code={}, Message={}"
, rjson::get_opt<std::string>(body, "Code")
, rjson::get_opt<std::string>(body, "Message")
)));
@@ -573,7 +607,7 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
});
if (res.result() != httpclient::reply_status::ok) {
throw kms_error(get_response_error(res), "AssumeRole");
throw kms_error(res, "AssumeRole");
}
rapidxml::xml_document<> doc;
@@ -586,7 +620,7 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
static auto get_xml_node = [](node_type* node, const char* what) {
auto res = node->first_node(what);
if (!res) {
throw kms_error("XML parse error", what);
throw malformed_response_error(fmt::format("XML parse error: {}", what));
}
return res;
};
@@ -603,7 +637,7 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
session = token->value();
} catch (const rapidxml::parse_error& e) {
std::throw_with_nested(kms_error("XML parse error", "AssumeRole"));
std::throw_with_nested(malformed_response_error("XML parse error: AssumeRole"));
}
}
@@ -650,9 +684,11 @@ future<rjson::value> encryption::kms_host::impl::post(std::string_view target, s
o = rjson::get_opt<std::string>(body, type_header);
}
// this should never happen with aws, but...
auto type = o ? *o : get_response_error(res);
if (!o) {
throw kms_error(res, msg);
}
throw kms_error(type, msg);
throw kms_error(res.result(), *o, msg);
}
co_return body;

View File

@@ -159,6 +159,7 @@ public:
gms::feature workload_prioritization { *this, "WORKLOAD_PRIORITIZATION"sv };
gms::feature colocated_tablets { *this, "COLOCATED_TABLETS"sv };
gms::feature cdc_with_tablets { *this, "CDC_WITH_TABLETS"sv };
gms::feature counters_with_tablets { *this, "COUNTERS_WITH_TABLETS"sv };
gms::feature file_stream { *this, "FILE_STREAM"sv };
gms::feature compression_dicts { *this, "COMPRESSION_DICTS"sv };
gms::feature tablet_options { *this, "TABLET_OPTIONS"sv };

View File

@@ -37,7 +37,9 @@ struct tablet_load_stats final {
// Sum of all tablet sizes on a node and available disk space.
uint64_t effective_capacity;
std::unordered_map<locator::range_based_tablet_id, uint64_t> tablet_sizes;
// Contains tablet sizes per table. The token ranges must be in the form
// (a, b] and only such ranges are allowed
std::unordered_map<::table_id, std::unordered_map<dht::token_range, uint64_t>> tablet_sizes;
};
struct load_stats {

View File

@@ -233,10 +233,10 @@ go_arch() {
echo ${GO_ARCH["$(arch)"]}
}
NODE_EXPORTER_VERSION=1.9.0
NODE_EXPORTER_VERSION=1.10.2
declare -A NODE_EXPORTER_CHECKSUM=(
["x86_64"]=e7b65ea30eec77180487d518081d3dcb121b975f6d95f1866dfb9156c5b24075
["aarch64"]=5314fae1efff19abf807cfc8bd7dadbd47a35565c1043c236ffb0689dc15ef4f
["x86_64"]=c46e5b6f53948477ff3a19d97c58307394a29fe64a01905646f026ddc32cb65b
["aarch64"]=de69ec8341c8068b7c8e4cfe3eb85065d24d984a3b33007f575d307d13eb89a6
)
NODE_EXPORTER_DIR=/opt/scylladb/dependencies

View File

@@ -557,7 +557,7 @@ static_effective_replication_map::~static_effective_replication_map() {
vnode_effective_replication_map::~vnode_effective_replication_map() {
if (is_registered()) {
try {
_factory->submit_background_work(clear_gently(std::move(_replication_map),
_factory->submit_background_work(dispose_gently(std::move(_replication_map),
std::move(*_pending_endpoints),
std::move(*_read_endpoints),
std::move(_tmptr)));

View File

@@ -90,60 +90,53 @@ future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cm
}
future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd, std::optional<sstring> token) {
return connect(socket_address(inet_address{addr}, port))
.then([this, addr, cmd, token] (connected_socket fd) {
_sd = std::move(fd);
_in = _sd.input();
_out = _sd.output();
connected_socket fd = co_await connect(socket_address(inet_address{addr}, port));
auto in = fd.input();
auto out = fd.output();
if (token) {
_req = sstring("GET ") + cmd +
sstring(" HTTP/1.1\r\nHost: ") +addr +
sstring("\r\nX-aws-ec2-metadata-token: ") + *token +
sstring("\r\n\r\n");
} else {
_req = sstring("PUT ") + cmd +
sstring(" HTTP/1.1\r\nHost: ") + addr +
sstring("\r\nX-aws-ec2-metadata-token-ttl-seconds: 60") +
sstring("\r\n\r\n");
}
if (token) {
_req = sstring("GET ") + cmd +
sstring(" HTTP/1.1\r\nHost: ") +addr +
sstring("\r\nX-aws-ec2-metadata-token: ") + *token +
sstring("\r\n\r\n");
} else {
_req = sstring("PUT ") + cmd +
sstring(" HTTP/1.1\r\nHost: ") + addr +
sstring("\r\nX-aws-ec2-metadata-token-ttl-seconds: 60") +
sstring("\r\n\r\n");
}
return _out.write(_req.c_str()).then([this] {
return _out.flush();
});
}).then([this] {
_parser.init();
return _in.consume(_parser).then([this] {
if (_parser.eof()) {
return make_exception_future<sstring>("Bad HTTP response");
}
co_await out.write(_req.c_str());
co_await out.flush();
// Read HTTP response header first
auto _rsp = _parser.get_parsed_response();
auto rc = _rsp->_status;
// Verify EC2 instance metadata access
if (rc == http::reply::status_type(403)) {
return make_exception_future<sstring>(std::runtime_error("Error: Unauthorized response received when trying to communicate with instance metadata service."));
}
if (_rsp->_status != http::reply::status_type::ok) {
return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status)));
}
_parser.init();
co_await in.consume(_parser);
if (_parser.eof()) {
co_await coroutine::return_exception(std::runtime_error("Bad HTTP response"));
}
auto it = _rsp->_headers.find("Content-Length");
if (it == _rsp->_headers.end()) {
return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");
}
// Read HTTP response header first
auto _rsp = _parser.get_parsed_response();
auto rc = _rsp->_status;
// Verify EC2 instance metadata access
if (rc == http::reply::status_type(403)) {
co_await coroutine::return_exception(std::runtime_error("Error: Unauthorized response received when trying to communicate with instance metadata service."));
}
if (_rsp->_status != http::reply::status_type::ok) {
co_await coroutine::return_exception(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status)));
}
auto content_len = std::stoi(it->second);
auto it = _rsp->_headers.find("Content-Length");
if (it == _rsp->_headers.end()) {
co_await coroutine::return_exception(std::runtime_error("Error: HTTP response does not contain: Content-Length\n"));
}
// Read HTTP response body
return _in.read_exactly(content_len).then([] (temporary_buffer<char> buf) {
sstring res(buf.get(), buf.size());
auto content_len = std::stoi(it->second);
return make_ready_future<sstring>(std::move(res));
});
});
});
// Read HTTP response body
temporary_buffer<char> buf = co_await in.read_exactly(content_len);
sstring res(buf.get(), buf.size());
co_return res;
}
future<sstring> ec2_snitch::read_property_file() {

View File

@@ -30,9 +30,6 @@ protected:
future<sstring> aws_api_call(sstring addr, uint16_t port, const sstring cmd, std::optional<sstring> token);
future<sstring> read_property_file();
private:
connected_socket _sd;
input_stream<char> _in;
output_stream<char> _out;
http_response_parser _parser;
sstring _req;
exponential_backoff_retry _ec2_api_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(2560));

View File

@@ -13,6 +13,7 @@
#include "locator/tablet_sharder.hh"
#include "locator/token_range_splitter.hh"
#include "db/system_keyspace.hh"
#include "locator/topology.hh"
#include "replica/database.hh"
#include "utils/stall_free.hh"
#include "utils/rjson.hh"
@@ -240,7 +241,7 @@ tablet_replica_set get_new_replicas(const tablet_info& tinfo, const tablet_migra
return replace_replica(tinfo.replicas, mig.src, mig.dst);
}
tablet_replica_set get_primary_replicas(const locator::tablet_map& tablet_map, tablet_id tid, std::function<bool(const tablet_replica&)> filter) {
tablet_replica_set get_primary_replicas(const locator::tablet_map& tablet_map, tablet_id tid, const locator::topology& topo, std::function<bool(const tablet_replica&)> filter) {
const auto& info = tablet_map.get_tablet_info(tid);
const auto* transition = tablet_map.get_tablet_transition_info(tid);
@@ -250,8 +251,8 @@ tablet_replica_set get_primary_replicas(const locator::tablet_map& tablet_map, t
}
return transition->writes;
};
auto primary = [tid, filter = std::move(filter)] (tablet_replica_set set) -> std::optional<tablet_replica> {
return maybe_get_primary_replica(tid, set, filter);
auto primary = [tid, filter = std::move(filter), &topo] (tablet_replica_set set) -> std::optional<tablet_replica> {
return maybe_get_primary_replica(tid, set, topo, filter);
};
auto add = [] (tablet_replica r1, tablet_replica r2) -> tablet_replica_set {
// if primary replica is not the one leaving, then only primary will be streamed to.
@@ -555,14 +556,30 @@ dht::token_range tablet_map::get_token_range_after_split(const token& t) const n
return get_token_range(id_after_split, log2_tablets_after_split);
}
std::optional<tablet_replica> maybe_get_primary_replica(tablet_id id, const tablet_replica_set& replica_set, std::function<bool(const tablet_replica&)> filter) {
const auto replicas = replica_set | std::views::filter(std::move(filter)) | std::ranges::to<tablet_replica_set>();
auto tablet_replica_comparator(const locator::topology& topo) {
return [&topo](const tablet_replica& a, const tablet_replica& b) {
const auto loc_a = topo.get_location(a.host);
const auto loc_b = topo.get_location(b.host);
if (loc_a.dc != loc_b.dc) {
return loc_a.dc < loc_b.dc;
}
if (loc_a.rack != loc_b.rack) {
return loc_a.rack < loc_b.rack;
}
return a.host < b.host;
};
}
std::optional<tablet_replica> maybe_get_primary_replica(tablet_id id, const tablet_replica_set& replica_set, const locator::topology& topo, std::function<bool(const tablet_replica&)> filter) {
tablet_replica_set replica_set_copy = replica_set;
std::ranges::sort(replica_set_copy, tablet_replica_comparator(topo));
const auto replicas = replica_set_copy | std::views::filter(std::move(filter)) | std::ranges::to<tablet_replica_set>();
return !replicas.empty() ? std::make_optional(replicas.at(size_t(id) % replicas.size())) : std::nullopt;
}
tablet_replica tablet_map::get_primary_replica(tablet_id id) const {
tablet_replica tablet_map::get_primary_replica(tablet_id id, const locator::topology& topo) const {
const auto& replicas = get_tablet_info(id).replicas;
return replicas.at(size_t(id) % replicas.size());
return maybe_get_primary_replica(id, replicas, topo, [&] (const auto& _) { return true; }).value();
}
tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
@@ -574,7 +591,7 @@ tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
}
std::optional<tablet_replica> tablet_map::maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const {
return maybe_get_primary_replica(id, get_tablet_info(id).replicas, [&] (const auto& tr) {
return maybe_get_primary_replica(id, get_tablet_info(id).replicas, topo, [&] (const auto& tr) {
return tablet_task_info.selected_by_filters(tr, topo);
});
}
@@ -865,9 +882,11 @@ table_load_stats& table_load_stats::operator+=(const table_load_stats& s) noexce
uint64_t tablet_load_stats::add_tablet_sizes(const tablet_load_stats& tls) {
uint64_t table_sizes_sum = 0;
for (auto& [rb_tid, tablet_size] : tls.tablet_sizes) {
tablet_sizes[rb_tid] = tablet_size;
table_sizes_sum += tablet_size;
for (auto& [table, sizes] : tls.tablet_sizes) {
for (auto& [range, tablet_size] : sizes) {
tablet_sizes[table][range] = tablet_size;
table_sizes_sum += tablet_size;
}
}
return table_sizes_sum;
}
@@ -894,16 +913,109 @@ load_stats& load_stats::operator+=(const load_stats& s) {
}
std::optional<uint64_t> load_stats::get_tablet_size(host_id host, const range_based_tablet_id& rb_tid) const {
if (auto node_i = tablet_stats.find(host); node_i != tablet_stats.end()) {
const tablet_load_stats& tls = node_i->second;
if (auto ts_i = tls.tablet_sizes.find(rb_tid); ts_i != tls.tablet_sizes.end()) {
return ts_i->second;
if (auto host_i = tablet_stats.find(host); host_i != tablet_stats.end()) {
auto& sizes_per_table = host_i->second.tablet_sizes;
if (auto table_i = sizes_per_table.find(rb_tid.table); table_i != sizes_per_table.end()) {
auto& tablet_sizes = table_i->second;
if (auto size_i = tablet_sizes.find(rb_tid.range); size_i != tablet_sizes.end()) {
return size_i->second;
}
}
}
tablet_logger.debug("Unable to find tablet size on host: {} for tablet: {}", host, rb_tid);
return std::nullopt;
}
lw_shared_ptr<load_stats> load_stats::reconcile_tablets_resize(const std::unordered_set<table_id>& tables, const token_metadata& old_tm, const token_metadata& new_tm) const {
lw_shared_ptr<load_stats> reconciled_stats { make_lw_shared<load_stats>(*this) };
load_stats& new_stats = *reconciled_stats;
for (table_id table : tables) {
if (!new_tm.tablets().has_tablet_map(table)) {
// Table has been dropped, remove it from stats
for (auto& [host, tls] : new_stats.tablet_stats) {
tls.tablet_sizes.erase(table);
}
continue;
}
const auto& old_tmap = old_tm.tablets().get_tablet_map(table);
const auto& new_tmap = new_tm.tablets().get_tablet_map(table);
size_t old_tablet_count = old_tmap.tablet_count();
size_t new_tablet_count = new_tmap.tablet_count();
if (old_tablet_count == new_tablet_count * 2) {
// Reconcile for merge
for (size_t i = 0; i < old_tablet_count; i += 2) {
range_based_tablet_id rb_tid1 { table, old_tmap.get_token_range(tablet_id(i)) };
range_based_tablet_id rb_tid2 { table, old_tmap.get_token_range(tablet_id(i + 1)) };
auto& tinfo = old_tmap.get_tablet_info(tablet_id(i));
for (auto& replica : tinfo.replicas) {
auto tablet_size_opt1 = new_stats.get_tablet_size(replica.host, rb_tid1);
auto tablet_size_opt2 = new_stats.get_tablet_size(replica.host, rb_tid2);
if (!tablet_size_opt1 || !tablet_size_opt2) {
if (!tablet_size_opt1) {
tablet_logger.debug("Unable to find tablet size in stats for table resize reconcile for tablet {} on host {}", rb_tid1, replica.host);
}
if (!tablet_size_opt2) {
tablet_logger.debug("Unable to find tablet size in stats for table resize reconcile for tablet {} on host {}", rb_tid2, replica.host);
}
return nullptr;
}
dht::token_range new_range { new_tmap.get_token_range(tablet_id(i / 2)) };
auto& sizes_for_table = new_stats.tablet_stats.at(replica.host).tablet_sizes.at(table);
uint64_t merged_tablet_size = *tablet_size_opt1 + *tablet_size_opt2;
sizes_for_table[new_range] = merged_tablet_size;
sizes_for_table.erase(rb_tid1.range);
sizes_for_table.erase(rb_tid2.range);
}
}
} else if (old_tablet_count == new_tablet_count / 2) {
// Reconcile for split
for (size_t i = 0; i < old_tablet_count; i++) {
range_based_tablet_id rb_tid { table, old_tmap.get_token_range(tablet_id(i)) };
auto& tinfo = old_tmap.get_tablet_info(tablet_id(i));
for (auto& replica : tinfo.replicas) {
auto tablet_size_opt = new_stats.get_tablet_size(replica.host, rb_tid);
if (!tablet_size_opt) {
tablet_logger.debug("Unable to find tablet size in stats for table resize reconcile for tablet {} on host {}", rb_tid, replica.host);
return nullptr;
}
dht::token_range new_range1 { new_tmap.get_token_range(tablet_id(i * 2)) };
dht::token_range new_range2 { new_tmap.get_token_range(tablet_id(i * 2 + 1)) };
auto& sizes_for_table = new_stats.tablet_stats.at(replica.host).tablet_sizes.at(table);
uint64_t split_tablet_size = *tablet_size_opt / 2;
sizes_for_table[new_range1] = split_tablet_size;
sizes_for_table[new_range2] = split_tablet_size;
sizes_for_table.erase(rb_tid.range);
}
}
}
}
return reconciled_stats;
}
lw_shared_ptr<load_stats> load_stats::migrate_tablet_size(locator::host_id leaving, locator::host_id pending, locator::global_tablet_id gid, const dht::token_range trange) const {
lw_shared_ptr<load_stats> result;
if (leaving != pending) {
range_based_tablet_id rb_tid {gid.table, trange};
if (get_tablet_size(leaving, rb_tid) && !get_tablet_size(pending, rb_tid) && tablet_stats.contains(pending)) {
tablet_logger.debug("Moving tablet size for tablet: {} from: {} to: {}", gid, leaving, pending);
result = make_lw_shared<locator::load_stats>(*this);
auto& new_leaving_ts = result->tablet_stats.at(leaving);
auto& new_pending_ts = result->tablet_stats.at(pending);
auto map_node = new_leaving_ts.tablet_sizes.at(gid.table).extract(trange);
new_pending_ts.tablet_sizes[gid.table].insert(std::move(map_node));
if (new_leaving_ts.tablet_sizes.at(gid.table).empty()) {
new_leaving_ts.tablet_sizes.erase(gid.table);
}
}
}
return result;
}
tablet_range_splitter::tablet_range_splitter(schema_ptr schema, const tablet_map& tablets, host_id host, const dht::partition_range_vector& ranges)
: _schema(std::move(schema))
, _ranges(ranges)

View File

@@ -66,6 +66,9 @@ struct global_tablet_id {
struct range_based_tablet_id {
table_id table;
// This represents the token range of the tablet in the form (a, b]
// and only such ranges are allowed
dht::token_range range;
bool operator==(const range_based_tablet_id&) const = default;
@@ -352,7 +355,7 @@ class tablet_map;
/// Returns the replica set which will become the replica set of the tablet after executing a given tablet transition.
tablet_replica_set get_new_replicas(const tablet_info&, const tablet_migration_info&);
// If filter returns true, the replica can be chosen as primary replica.
tablet_replica_set get_primary_replicas(const locator::tablet_map&, tablet_id, std::function<bool(const tablet_replica&)> filter);
tablet_replica_set get_primary_replicas(const locator::tablet_map&, tablet_id, const locator::topology&, std::function<bool(const tablet_replica&)> filter);
tablet_transition_info migration_to_transition_info(const tablet_info&, const tablet_migration_info&);
/// Describes streaming required for a given tablet transition.
@@ -445,7 +448,9 @@ struct tablet_load_stats {
// Sum of all tablet sizes on a node and available disk space.
uint64_t effective_capacity = 0;
std::unordered_map<range_based_tablet_id, uint64_t> tablet_sizes;
// Contains tablet sizes per table.
// The token ranges must be in the form (a, b] and only such ranges are allowed
std::unordered_map<table_id, std::unordered_map<dht::token_range, uint64_t>> tablet_sizes;
// returns the aggregated size of all the tablets added
uint64_t add_tablet_sizes(const tablet_load_stats& tls);
@@ -479,6 +484,21 @@ struct load_stats {
}
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid) const;
// Modifies the tablet sizes in load_stats for the given table after a split or merge. The old_tm argument has
// to contain the token_metadata pre-resize. The function returns load_stats with tablet token ranges
// corresponding to the post-resize tablet_map.
// In case any pre-resize tablet replica is not found, the function returns nullptr
lw_shared_ptr<load_stats> reconcile_tablets_resize(const std::unordered_set<table_id>& tables, const token_metadata& old_tm, const token_metadata& new_tm) const;
// Modifies the tablet sizes in load_stats by moving the size of a tablet from leaving to pending host.
// The function returns modified load_stats if the tablet size was successfully migrated.
// It returns nullptr if any of the following is true:
// - tablet was not found on the leaving host
// - tablet was found on the pending host
// - pending and leaving hosts are equal (in case of intranode migration)
// - pending host is not found in load_stats.tablet_stats
lw_shared_ptr<load_stats> migrate_tablet_size(locator::host_id leaving, locator::host_id pending, locator::global_tablet_id gid, const dht::token_range trange) const;
};
using load_stats_v2 = load_stats;
@@ -595,7 +615,7 @@ public:
dht::token_range get_token_range(tablet_id id) const;
/// Returns the primary replica for the tablet
tablet_replica get_primary_replica(tablet_id id) const;
tablet_replica get_primary_replica(tablet_id id, const locator::topology& topo) const;
/// Returns the secondary replica for the tablet, which is assumed to be directly following the primary replica in the replicas vector
/// \throws std::runtime_error if the tablet has less than 2 replicas.
@@ -783,7 +803,7 @@ public:
// Check that all tablets which have replicas on this host, have a valid replica shard (< smp::count).
future<bool> check_tablet_replica_shards(const tablet_metadata& tm, host_id this_host);
std::optional<tablet_replica> maybe_get_primary_replica(tablet_id id, const tablet_replica_set& replica_set, std::function<bool(const tablet_replica&)> filter);
std::optional<tablet_replica> maybe_get_primary_replica(tablet_id id, const tablet_replica_set& replica_set, const locator::topology& topo, std::function<bool(const tablet_replica&)> filter);
struct tablet_routing_info {
tablet_replica_set tablet_replicas;
@@ -859,6 +879,10 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr
/// Returns the list of racks that can be used for placing replicas in a given DC.
rack_list get_allowed_racks(const locator::token_metadata&, const sstring& dc);
/// Returns a comparator function that can be used to sort tablet_replicas
/// according to <dc, rack, host_id> order in the given topology.
auto tablet_replica_comparator(const locator::topology& topo);
}
template <>

main.cc
main.cc

@@ -1366,9 +1366,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
auto destroy_tracing = defer_verbose_shutdown("tracing instance", [&tracing] {
tracing.stop().get();
});
audit::audit::create_audit(*cfg, token_metadata).handle_exception([&] (auto&& e) {
startlog.error("audit creation failed: {}", e);
}).get();
stop_signal.check();
ctx.http_server.server().invoke_on_all([] (auto& server) { server.set_content_streaming(true); }).get();
@@ -1702,7 +1699,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
checkpoint(stop_signal, "starting migration manager");
debug::the_migration_manager = &mm;
mm.start(std::ref(mm_notifier), std::ref(feature_service), std::ref(messaging), std::ref(proxy), std::ref(ss), std::ref(gossiper), std::ref(group0_client), std::ref(sys_ks)).get();
mm.start(std::ref(mm_notifier), std::ref(feature_service), std::ref(messaging), std::ref(proxy), std::ref(gossiper), std::ref(group0_client), std::ref(sys_ks)).get();
auto stop_migration_manager = defer_verbose_shutdown("migration manager", [&mm] {
mm.stop().get();
});
@@ -2221,12 +2218,47 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// Semantic validation of sstable compression parameters from config.
// Adding here (i.e., after `join_cluster`) to ensure that the
// required SSTABLE_COMPRESSION_DICTS cluster feature has been negotiated.
//
// Also, if the dictionary compression feature is not enabled, use
// LZ4Compressor as the default algorithm instead of LZ4WithDictsCompressor.
const auto& dicts_feature_enabled = feature_service.local().sstable_compression_dicts;
auto& sstable_compression_options = cfg->sstable_compression_user_table_options;
gms::feature::listener_registration reg_listener;
if (!sstable_compression_options.is_set() && !dicts_feature_enabled) {
if (sstable_compression_options().get_algorithm() != compression_parameters::algorithm::lz4_with_dicts) {
on_internal_error(startlog, "expected LZ4WithDictsCompressor as default algorithm for sstable_compression_user_table_options.");
}
startlog.info("SSTABLE_COMPRESSION_DICTS feature is disabled. Overriding default SSTable compression to use LZ4Compressor instead of LZ4WithDictsCompressor.");
compression_parameters original_params{sstable_compression_options().get_options()};
auto params = sstable_compression_options().get_options();
params[compression_parameters::SSTABLE_COMPRESSION] = sstring(compression_parameters::algorithm_to_name(compression_parameters::algorithm::lz4));
smp::invoke_on_all([&sstable_compression_options, params = std::move(params)] {
if (!sstable_compression_options.is_set()) { // guard check; in case we ever make the option live updateable
sstable_compression_options(compression_parameters{params}, utils::config_file::config_source::None);
}
}).get();
// Register a callback to update the default compression algorithm when the feature is enabled.
// Precondition:
// The callback must run inside seastar::async context:
// - If the listener fires immediately, we are running inside seastar::async already.
// - If the listener is deferred, `feature_service::enable()` runs it inside seastar::async.
reg_listener = feature_service.local().sstable_compression_dicts.when_enabled([&sstable_compression_options, params = std::move(original_params)] {
startlog.info("SSTABLE_COMPRESSION_DICTS feature is now enabled. Overriding default SSTable compression to use LZ4WithDictsCompressor.");
smp::invoke_on_all([&sstable_compression_options, params = std::move(params)] {
if (!sstable_compression_options.is_set()) { // guard check; in case we ever make the option live updateable
sstable_compression_options(params, utils::config_file::config_source::None);
}
}).get();
});
}
try {
const auto& dicts_feature_enabled = feature_service.local().sstable_compression_dicts;
const auto& dicts_usage_allowed = cfg->sstable_compression_dictionaries_allow_in_ddl();
cfg->sstable_compression_user_table_options().validate(
compression_parameters::dicts_feature_enabled(bool(dicts_feature_enabled)),
compression_parameters::dicts_usage_allowed(dicts_usage_allowed));
compression_parameters::dicts_feature_enabled(bool(dicts_feature_enabled)));
} catch (const std::exception& e) {
startlog.error("Invalid sstable_compression_user_table_options: {}", e.what());
throw bad_configuration_error();
@@ -2472,7 +2504,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
seastar::set_abort_on_ebadf(cfg->abort_on_ebadf());
api::set_server_done(ctx).get();
audit::audit::start_audit(*cfg, qp, mm).get();
audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
startlog.error("audit start failed: {}", e);
}).get();
auto audit_stop = defer([] {
audit::audit::stop_audit().get();
});



@@ -364,19 +364,25 @@ sstring messaging_service::client_metrics_domain(unsigned idx, inet_address addr
return ret;
}
future<> messaging_service::ban_host(locator::host_id id) {
return container().invoke_on_all([id] (messaging_service& ms) {
if (ms._banned_hosts.contains(id) || ms.is_shutting_down()) {
future<> messaging_service::ban_hosts(const utils::chunked_vector<locator::host_id>& ids) {
if (ids.empty()) {
return make_ready_future<>();
}
return container().invoke_on_all([&ids] (messaging_service& ms) {
if (ms.is_shutting_down()) {
return;
}
ms._banned_hosts.insert(id);
auto [start, end] = ms._host_connections.equal_range(id);
for (auto it = start; it != end; ++it) {
auto& conn_ref = it->second;
conn_ref.server.abort_connection(conn_ref.conn_id);
for (const auto id: ids) {
if (const auto [it, inserted] = ms._banned_hosts.insert(id); !inserted) {
continue;
}
auto [start, end] = ms._host_connections.equal_range(id);
for (auto it = start; it != end; ++it) {
auto& conn_ref = it->second;
conn_ref.server.abort_connection(conn_ref.conn_id);
}
ms._host_connections.erase(start, end);
}
ms._host_connections.erase(start, end);
});
}



@@ -440,11 +440,11 @@ public:
void foreach_server_connection_stats(std::function<void(const rpc::client_info&, const rpc::stats&)>&& f) const;
// Drops all connections from the given host and prevents further communication from it to happen.
// Drops all connections from the given hosts and prevents any further communication from them.
//
// No further RPC handlers will be called for that node,
// but we don't prevent handlers that were started concurrently from finishing.
future<> ban_host(locator::host_id);
future<> ban_hosts(const utils::chunked_vector<locator::host_id>& ids);
msg_addr addr_for_host_id(locator::host_id hid);
private:


@@ -1,6 +1,5 @@
DROP KEYSPACE IF EXISTS counters;
-- FIXME: use tablets after https://github.com/scylladb/scylladb/issues/18180 is done.
CREATE KEYSPACE IF NOT EXISTS counters WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': '3'} AND TABLETS = {'enabled': false};
CREATE KEYSPACE IF NOT EXISTS counters WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': '3'};
CREATE TABLE IF NOT EXISTS counters.counter1 (key blob PRIMARY KEY, "C0" counter, "C1" counter, "C2" counter, "C3" counter, "C4" counter);


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a9742362bc16ca16e9f962af993d6df7d6c4301182528d2882d50ec01b27b043
size 6314928
oid sha256:875c2435bce1e93bab492bdad21b7efe586a4fa22149e9526d219df77f0c3dfd
size 6411264


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3c9d0a4c8289a7edf8ffb58d23cd71b686d98730ab1ac75921ac3a2d533eb66a
size 6325416
oid sha256:57294d7a476c1bfba10f038f01e3b236ac45d11e94c71918e1e5d0ec3d6a9212
size 6420604


@@ -196,6 +196,7 @@ struct row_level_repair_metrics {
uint64_t rx_hashes_nr{0};
uint64_t inc_sst_skipped_bytes{0};
uint64_t inc_sst_read_bytes{0};
uint64_t tablet_time_ms{0};
row_level_repair_metrics() {
namespace sm = seastar::metrics;
_metrics.add_group("repair", {
@@ -219,6 +220,8 @@ struct row_level_repair_metrics {
sm::description("Total number of bytes skipped from sstables for incremental repair on this shard.")),
sm::make_counter("inc_sst_read_bytes", inc_sst_read_bytes,
sm::description("Total number of bytes read from sstables for incremental repair on this shard.")),
sm::make_counter("tablet_time_ms", tablet_time_ms,
sm::description("Time spent on tablet repair on this shard in milliseconds.")),
});
}
};
@@ -3477,7 +3480,16 @@ future<> repair_cf_range_row_level(repair::shard_repair_task_impl& shard_task,
service::frozen_topology_guard topo_guard) {
auto start_time = flush_time;
auto repair = row_level_repair(shard_task, std::move(cf_name), std::move(table_id), std::move(range), all_peer_nodes, small_table_optimization, start_time, topo_guard);
co_return co_await repair.run();
bool is_tablet = shard_task.db.local().find_column_family(table_id).uses_tablets();
bool is_tablet_rebuild = shard_task.sched_info.for_tablet_rebuild;
auto t = std::chrono::steady_clock::now();
auto update_time = seastar::defer([&] {
if (is_tablet && !is_tablet_rebuild) {
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - t);
_metrics.tablet_time_ms += duration.count();
}
});
co_await repair.run();
}
class row_level_repair_gossip_helper : public gms::i_endpoint_state_change_subscriber {


@@ -483,13 +483,14 @@ locator::static_effective_replication_map_ptr keyspace::get_static_effective_rep
} // namespace replica
void backlog_controller::adjust() {
// Compute and update the backlog even when static shares are set to
// ensure that the backlog metrics reflect the current state.
auto backlog = _current_backlog();
if (controller_disabled()) {
update_controller(_static_shares);
return;
}
auto backlog = _current_backlog();
if (backlog >= _control_points.back().input) {
update_controller(_control_points.back().output);
return;
@@ -510,7 +511,7 @@ void backlog_controller::adjust() {
float backlog_controller::backlog_of_shares(float shares) const {
size_t idx = 1;
if (controller_disabled() || _control_points.size() == 0) {
if (_control_points.size() == 0) {
return 1.0f;
}
while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
@@ -1896,12 +1897,7 @@ std::ostream& operator<<(std::ostream& out, const database& db) {
return out;
}
future<mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema,
db::timeout_clock::time_point timeout,tracing::trace_state_ptr trace_state) {
auto m = fm.unfreeze(m_schema);
m.upgrade(cf.schema());
// prepare partition slice
static query::partition_slice partition_slice_for_counter_update(const mutation& m) {
query::column_id_vector static_columns;
static_columns.reserve(m.partition().static_row().size());
m.partition().static_row().for_each_cell([&] (auto id, auto&&) {
@@ -1924,20 +1920,18 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
regular_columns.erase(std::unique(regular_columns.begin(), regular_columns.end()),
regular_columns.end());
auto slice = query::partition_slice(std::move(cr_ranges), std::move(static_columns),
return query::partition_slice(std::move(cr_ranges), std::move(static_columns),
std::move(regular_columns), { }, { }, query::max_rows);
}
auto op = cf.write_in_progress();
tracing::trace(trace_state, "Acquiring counter locks");
auto locks = co_await cf.lock_counter_cells(m, timeout);
future<mutation> database::read_and_transform_counter_mutation_to_shards(mutation m, column_family& cf, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
// Before counter update is applied it needs to be transformed from
// deltas to counter shards. To do that, we need to read the current
// counter state for each modified cell...
tracing::trace(trace_state, "Reading counter values from the CF");
auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(cf.schema(), "counter-read-before-write", timeout, trace_state);
auto slice = partition_slice_for_counter_update(m);
auto mopt = co_await counter_write_query(cf.schema(), cf.as_mutation_source(), std::move(permit), m.decorated_key(), slice, trace_state);
if (utils::get_local_injector().enter("apply_counter_update_delay_100ms")) {
@@ -1948,14 +1942,8 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
// cells we can look for our shard in each of them, increment
// its clock and apply the delta.
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable(), get_token_metadata().get_my_id());
tracing::trace(trace_state, "Applying counter update");
co_await apply_with_commitlog(cf, m, timeout);
if (utils::get_local_injector().enter("apply_counter_update_delay_5s")) {
co_await seastar::sleep(std::chrono::seconds(5));
}
co_return m;
co_return std::move(m);
}
max_purgeable memtable_list::get_max_purgeable(const dht::decorated_key& dk, is_shadowable is, api::timestamp_type max_seen_timestamp) const noexcept {
@@ -2050,29 +2038,70 @@ future<> database::apply_in_memory(const mutation& m, column_family& cf, db::rp_
return cf.apply(m, std::move(h), timeout);
}
future<mutation> database::apply_counter_update(schema_ptr s, const frozen_mutation& m, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state) {
future<counter_update_guard> database::acquire_counter_locks(schema_ptr s, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state) {
auto& cf = find_column_family(fm.column_family_id());
auto m = fm.unfreeze(s);
m.upgrade(cf.schema());
auto op = cf.write_in_progress();
tracing::trace(trace_state, "Acquiring counter locks");
return do_with(std::move(m), [this, &cf, op = std::move(op), timeout] (mutation& m) mutable {
return update_write_metrics_if_failed([&m, &cf, op = std::move(op), timeout] mutable -> future<counter_update_guard> {
return cf.lock_counter_cells(m, timeout).then([op = std::move(op)] (std::vector<locked_cell> locks) mutable {
return counter_update_guard{std::move(op), std::move(locks)};
});
}());
});
}
future<mutation> database::prepare_counter_update(schema_ptr s, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state) {
if (timeout <= db::timeout_clock::now() || utils::get_local_injector().is_enabled("database_apply_counter_update_force_timeout")) {
update_write_metrics_for_timed_out_write();
return make_exception_future<mutation>(timed_out_error{});
}
auto& cf = find_column_family(m.column_family_id());
auto& cf = find_column_family(fm.column_family_id());
if (is_in_critical_disk_utilization_mode() && cf.is_eligible_to_write_rejection_on_critical_disk_utilization()) {
update_write_metrics_for_rejected_writes();
return make_exception_future<mutation>(replica::critical_disk_utilization_exception{"rejected counter update mutation"});
}
return update_write_metrics(seastar::futurize_invoke([&] {
if (!s->is_synced()) {
throw std::runtime_error(format("attempted to mutate using not synced schema of {}.{}, version={}",
s->ks_name(), s->cf_name(), s->version()));
auto m = fm.unfreeze(s);
m.upgrade(cf.schema());
return update_write_metrics_if_failed(
read_and_transform_counter_mutation_to_shards(std::move(m), cf, std::move(trace_state), timeout));
}
future<> database::apply_counter_update(schema_ptr s, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state) {
auto& cf = find_column_family(fm.column_family_id());
auto m = fm.unfreeze(s);
m.upgrade(cf.schema());
tracing::trace(trace_state, "Applying counter update");
auto f = co_await coroutine::as_future(update_write_metrics(seastar::futurize_invoke([&] {
if (!s->is_synced()) {
throw std::runtime_error(format("attempted to mutate using not synced schema of {}.{}, version={}",
s->ks_name(), s->cf_name(), s->version()));
}
try {
return apply_with_commitlog(cf, m, timeout);
} catch (no_such_column_family&) {
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
throw;
}
})));
if (f.failed()) {
co_await coroutine::return_exception_ptr(f.get_exception());
}
try {
return do_apply_counter_update(cf, m, s, timeout, std::move(trace_state));
} catch (no_such_column_family&) {
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
throw;
if (utils::get_local_injector().enter("apply_counter_update_delay_5s")) {
co_await seastar::sleep(std::chrono::seconds(5));
}
}));
}
// #9919 etc. The initiative to wrap exceptions here
@@ -2309,6 +2338,24 @@ Future database::update_write_metrics(Future&& f) {
});
}
template<typename Future>
Future database::update_write_metrics_if_failed(Future&& f) {
return f.then_wrapped([s = _stats] (auto f) {
if (f.failed()) {
++s->total_writes;
++s->total_writes_failed;
auto ep = f.get_exception();
if (is_timeout_exception(ep)) {
++s->total_writes_timedout;
} else if (try_catch<replica::rate_limit_exception>(ep)) {
++s->total_writes_rate_limited;
}
return futurize<Future>::make_exception_future(std::move(ep));
}
return f;
});
}
void database::update_write_metrics_for_timed_out_write() {
++_stats->total_writes;
++_stats->total_writes_failed;


@@ -19,6 +19,7 @@
#include "types/user.hh"
#include "utils/assert.hh"
#include "utils/hash.hh"
#include "cell_locking.hh"
#include "db_clock.hh"
#include "gc_clock.hh"
#include <chrono>
@@ -756,7 +757,8 @@ private:
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
const sstables::sstable_predicate& = sstables::default_sstable_predicate()) const;
const sstables::sstable_predicate& = sstables::default_sstable_predicate(),
sstables::integrity_check integrity = sstables::integrity_check::no) const;
lw_shared_ptr<const sstables::sstable_set> make_compound_sstable_set() const;
// Compound sstable set must be refreshed whenever any of its managed sets are changed
@@ -1500,6 +1502,11 @@ struct string_pair_eq {
bool operator()(spair lhs, spair rhs) const;
};
struct counter_update_guard {
utils::phased_barrier::operation op;
std::vector<locked_cell> locks;
};
class db_user_types_storage;
// Policy for sharded<database>:
@@ -1728,11 +1735,12 @@ private:
future<> do_apply_many(const utils::chunked_vector<frozen_mutation>&, db::timeout_clock::time_point timeout);
future<> apply_with_commitlog(column_family& cf, const mutation& m, db::timeout_clock::time_point timeout);
future<mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, db::timeout_clock::time_point timeout,
tracing::trace_state_ptr trace_state);
future<mutation> read_and_transform_counter_mutation_to_shards(mutation m, column_family& cf, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout);
template<typename Future>
Future update_write_metrics(Future&& f);
template<typename Future>
Future update_write_metrics_if_failed(Future&& f);
void update_write_metrics_for_timed_out_write();
void update_write_metrics_for_rejected_writes();
future<std::unique_ptr<keyspace>> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, locator::effective_replication_map_factory& erm_factory, const locator::token_metadata_ptr& token_metadata, system_keyspace system);
@@ -1912,7 +1920,11 @@ public:
// Mutations may be partially visible to reads until restart on exception (FIXME).
future<> apply(const utils::chunked_vector<frozen_mutation>&, db::timeout_clock::time_point timeout);
future<> apply_hint(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout);
future<mutation> apply_counter_update(schema_ptr, const frozen_mutation& m, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state);
future<counter_update_guard> acquire_counter_locks(schema_ptr s, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state);
future<mutation> prepare_counter_update(schema_ptr s, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state);
future<> apply_counter_update(schema_ptr, const frozen_mutation& fm, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state);
const sstring& get_snitch_name() const;
/*!
* \brief clear snapshot based on a tag


@@ -92,17 +92,18 @@ table::make_sstable_reader(schema_ptr s,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
const sstables::sstable_predicate& predicate) const {
const sstables::sstable_predicate& predicate,
sstables::integrity_check integrity) const {
// CAVEAT: if make_sstable_reader() is called on a single partition
// we want to optimize and read exactly this partition. As a
// consequence, fast_forward_to() will *NOT* work on the result,
// regardless of what the fwd_mr parameter says.
if (pr.is_singular() && pr.start()->value().has_key()) {
return sstables->create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(permit),
_stats.estimated_sstable_per_read, pr, slice, std::move(trace_state), fwd, fwd_mr, predicate);
_stats.estimated_sstable_per_read, pr, slice, std::move(trace_state), fwd, fwd_mr, predicate, integrity);
} else {
return sstables->make_local_shard_sstable_reader(std::move(s), std::move(permit), pr, slice,
std::move(trace_state), fwd, fwd_mr, sstables::default_read_monitor_generator(), predicate);
std::move(trace_state), fwd, fwd_mr, sstables::default_read_monitor_generator(), predicate, nullptr, integrity);
}
}
@@ -310,7 +311,8 @@ table::make_streaming_reader(schema_ptr s, reader_permit permit,
add_memtables_to_reader_list(readers, s, permit, range, slice, trace_state, fwd, fwd_mr, [&] (size_t memtable_count) {
readers.reserve(memtable_count + 1);
});
readers.emplace_back(make_sstable_reader(s, permit, _sstables, range, slice, std::move(trace_state), fwd, fwd_mr));
readers.emplace_back(make_sstable_reader(s, permit, _sstables, range, slice,
std::move(trace_state), fwd, fwd_mr, sstables::default_sstable_predicate(), sstables::integrity_check::yes));
return make_combined_reader(s, std::move(permit), std::move(readers), fwd, fwd_mr);
});
@@ -331,7 +333,8 @@ mutation_reader table::make_streaming_reader(schema_ptr schema, reader_permit pe
add_memtables_to_reader_list(readers, schema, permit, range, slice, trace_state, fwd, fwd_mr, [&] (size_t memtable_count) {
readers.reserve(memtable_count + 1);
});
readers.emplace_back(make_sstable_reader(schema, permit, _sstables, range, slice, std::move(trace_state), fwd, fwd_mr));
readers.emplace_back(make_sstable_reader(schema, permit, _sstables, range, slice,
std::move(trace_state), fwd, fwd_mr, sstables::default_sstable_predicate(), sstables::integrity_check::yes));
return maybe_compact_for_streaming(
make_combined_reader(std::move(schema), std::move(permit), std::move(readers), fwd, fwd_mr),
get_compaction_manager(),
@@ -348,7 +351,7 @@ mutation_reader table::make_streaming_reader(schema_ptr schema, reader_permit pe
const auto fwd_mr = mutation_reader::forwarding::no;
return maybe_compact_for_streaming(
sstables->make_range_sstable_reader(std::move(schema), std::move(permit), range, slice,
std::move(trace_state), fwd, fwd_mr),
std::move(trace_state), fwd, fwd_mr, sstables::default_read_monitor_generator(), sstables::integrity_check::yes),
get_compaction_manager(),
compaction_time,
_config.enable_compacting_data_for_streaming_and_repair(),
@@ -2811,8 +2814,10 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
if (tablet_filter(*_tablet_map, gid)) {
const uint64_t tablet_size = sg.live_disk_space_used();
table_stats.size_in_bytes += tablet_size;
const locator::range_based_tablet_id rb_tid {gid.table, _tablet_map->get_token_range(gid.tablet)};
tablet_stats.tablet_sizes[rb_tid] = tablet_size;
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
// Make sure the token range is in the form (a, b]
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
tablet_stats.tablet_sizes[gid.table][trange] = tablet_size;
}
});
return locator::combined_load_stats{


@@ -934,7 +934,8 @@ public:
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding,
const sstables::sstable_predicate&) const override;
const sstables::sstable_predicate&,
sstables::integrity_check integrity = sstables::integrity_check::no) const override;
// Will always return an engaged sstable set ptr.
const lw_shared_ptr<const sstables::sstable_set>& find_sstable_set(size_t i) const {
@@ -1171,7 +1172,8 @@ tablet_sstable_set::create_single_key_sstable_reader(
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
const sstables::sstable_predicate& predicate) const {
const sstables::sstable_predicate& predicate,
sstables::integrity_check integrity) const {
// The singular partition_range start bound must be engaged.
auto idx = group_of(pr.start()->value().token());
const auto& set = find_sstable_set(idx);


@@ -27,7 +27,7 @@ frozen_schema::frozen_schema(const schema_ptr& s)
}())
{ }
schema_ptr frozen_schema::unfreeze(const db::schema_ctxt& ctxt, std::optional<db::view::base_dependent_view_info> base_info) const {
schema_ptr frozen_schema::unfreeze(const db::schema_ctxt& ctxt, schema_ptr cdc_schema, std::optional<db::view::base_dependent_view_info> base_info) const {
auto in = ser::as_input_stream(_data);
auto sv = ser::deserialize(in, std::type_identity<ser::schema_view>());
auto sm = sv.mutations();
@@ -37,7 +37,7 @@ schema_ptr frozen_schema::unfreeze(const db::schema_ctxt& ctxt, std::optional<db
if (base_info) {
throw std::runtime_error("Trying to unfreeze regular table schema with base info");
}
return db::schema_tables::create_table_from_mutations(ctxt, std::move(sm), ctxt.user_types(), sv.version());
return db::schema_tables::create_table_from_mutations(ctxt, std::move(sm), ctxt.user_types(), std::move(cdc_schema), sv.version());
}
}
@@ -50,12 +50,24 @@ const bytes_ostream& frozen_schema::representation() const
return _data;
}
frozen_schema_with_base_info::frozen_schema_with_base_info(const schema_ptr& c) : frozen_schema(c) {
if (c->is_view()) {
base_info = c->view_info()->base_info();
}
extended_frozen_schema::extended_frozen_schema(const schema_ptr& c)
: fs(c),
base_info([&c] -> std::optional<db::view::base_dependent_view_info> {
if (c->is_view()) {
return c->view_info()->base_info();
}
return std::nullopt;
}()),
frozen_cdc_schema([&c] -> std::optional<frozen_schema> {
if (c->cdc_schema()) {
return frozen_schema(c->cdc_schema());
}
return std::nullopt;
}())
{
}
schema_ptr frozen_schema_with_base_info::unfreeze(const db::schema_ctxt& ctxt) const {
return frozen_schema::unfreeze(ctxt, base_info);
schema_ptr extended_frozen_schema::unfreeze(const db::schema_ctxt& ctxt) const {
auto cdc_schema = frozen_cdc_schema ? frozen_cdc_schema->unfreeze(ctxt, nullptr, {}) : nullptr;
return fs.unfreeze(ctxt, std::move(cdc_schema), base_info);
}


@@ -28,17 +28,17 @@ public:
frozen_schema(const frozen_schema&) = default;
frozen_schema& operator=(const frozen_schema&) = default;
frozen_schema& operator=(frozen_schema&&) = default;
schema_ptr unfreeze(const db::schema_ctxt&, std::optional<db::view::base_dependent_view_info> base_info = {}) const;
schema_ptr unfreeze(const db::schema_ctxt&, schema_ptr cdc_schema, std::optional<db::view::base_dependent_view_info> base_info = {}) const;
const bytes_ostream& representation() const;
};
// To unfreeze view without base table added to schema registry
// we need base_info.
class frozen_schema_with_base_info : public frozen_schema {
public:
frozen_schema_with_base_info(const schema_ptr& c);
// A frozen schema with additional information that is needed to be transported
// with it to be used for unfreezing it.
struct extended_frozen_schema {
extended_frozen_schema(const schema_ptr& c);
schema_ptr unfreeze(const db::schema_ctxt& ctxt) const;
private:
// Set only for views.
std::optional<db::view::base_dependent_view_info> base_info;
frozen_schema fs;
std::optional<db::view::base_dependent_view_info> base_info; // Set only for views.
std::optional<frozen_schema> frozen_cdc_schema; // Set only for tables with CDC enabled.
};


@@ -70,7 +70,7 @@ speculative_retry::from_sstring(sstring str) {
try {
return boost::lexical_cast<double>(str.substr(0, str.size() - t.size()));
} catch (boost::bad_lexical_cast& e) {
throw std::invalid_argument(format("cannot convert {} to speculative_retry\n", str));
throw exceptions::configuration_exception(format("cannot convert {} to speculative_retry\n", str));
}
};
@@ -86,12 +86,12 @@ speculative_retry::from_sstring(sstring str) {
} else if (str.compare(str.size() - percentile.size(), percentile.size(), percentile) == 0) {
t = type::PERCENTILE;
v = convert(percentile) / 100;
if (v <= 0.0 || v >= 1.0) {
if (v < 0.0 || v > 1.0) {
throw exceptions::configuration_exception(
format("Invalid value {} for PERCENTILE option 'speculative_retry': must be between (0.0 and 100.0)", str));
format("Invalid value {} for PERCENTILE option 'speculative_retry': must be between [0.0 and 100.0]", str));
}
} else {
throw std::invalid_argument(format("cannot convert {} to speculative_retry\n", str));
throw exceptions::configuration_exception(format("cannot convert {} to speculative_retry\n", str));
}
return speculative_retry(t, v);
}
@@ -413,9 +413,10 @@ schema::raw_schema::raw_schema(table_id id)
, _sharder(::get_sharder(smp::count, default_partitioner_ignore_msb))
{ }
schema::schema(private_tag, const raw_schema& raw, const schema_static_props& props, std::optional<std::variant<schema_ptr, db::view::base_dependent_view_info>> base)
schema::schema(private_tag, const raw_schema& raw, const schema_static_props& props, schema_ptr cdc_schema, std::optional<std::variant<schema_ptr, db::view::base_dependent_view_info>> base)
: _raw(raw)
, _static_props(props)
, _cdc_schema(cdc_schema)
, _offsets([this] {
if (_raw._columns.size() > std::numeric_limits<column_count_type>::max()) {
throw std::runtime_error(format("Column count limit ({:d}) overflowed: {:d}",
@@ -518,6 +519,7 @@ schema::schema(private_tag, const raw_schema& raw, const schema_static_props& pr
schema::schema(const schema& o, const std::function<void(schema&)>& transform)
: _raw(o._raw)
, _static_props(o._static_props)
, _cdc_schema(o._cdc_schema)
, _offsets(o._offsets)
{
// Do the transformation after all the raw fields are initialized, but
@@ -549,6 +551,13 @@ schema::schema(reversed_tag, const schema& o)
 {
 }
 
+schema::schema(with_cdc_schema_tag, const schema& o, schema_ptr cdc_schema)
+    : schema(o, [cdc_schema] (schema& s) {
+        s._cdc_schema = cdc_schema;
+    })
+{
+}
+
 schema::~schema() {
     if (_registry_entry) {
         _registry_entry->detach_schema();
@@ -1302,6 +1311,10 @@ schema_builder::schema_builder(const schema_ptr s)
         _base_info = s->view_info()->base_info();
         _view_info = s->view_info()->raw();
     }
+
+    if (s->cdc_schema()) {
+        _cdc_schema = s->cdc_schema();
+    }
 }
 
 schema_builder::schema_builder(const schema::raw_schema& raw)
@@ -1549,6 +1562,13 @@ schema_builder& schema_builder::with_view_info(table_id base_id, sstring base_na
     return *this;
 }
 
+schema_builder& schema_builder::with_cdc_schema(schema_ptr cdc_schema) {
+    if (cdc_schema) {
+        _cdc_schema = std::move(cdc_schema);
+    }
+    return *this;
+}
+
 schema_builder& schema_builder::with_index(const index_metadata& im) {
     _raw._indices_by_name.emplace(im.name(), im);
     return *this;
@@ -1651,11 +1671,11 @@ schema_ptr schema_builder::build(schema::raw_schema& new_raw) {
     ), _version);
 
     if (_base_info) {
-        return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props, _base_info);
+        return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props, nullptr, _base_info);
     } else if (_base_schema) {
-        return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props, _base_schema);
+        return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props, nullptr, _base_schema);
     }
-    return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props);
+    return make_lw_shared<schema>(schema::private_tag{}, new_raw, static_props, _cdc_schema ? *_cdc_schema : nullptr);
 }
 
 auto schema_builder::static_configurators() -> std::vector<static_configurator>& {
@@ -2083,16 +2103,16 @@ schema_ptr schema::make_reversed() const {
 }
 
 schema_ptr schema::get_reversed() const {
-    return local_schema_registry().get_or_load(reversed(_raw._version), [this] (table_schema_version) -> view_schema_and_base_info {
+    return local_schema_registry().get_or_load(reversed(_raw._version), [this] (table_schema_version) -> extended_frozen_schema {
         auto s = make_reversed();
-        if (s->is_view()) {
-            return {frozen_schema(s), s->view_info()->base_info()};
-        }
-        return {frozen_schema(s)};
+        return extended_frozen_schema(s);
     });
 }
 
+schema_ptr schema::make_with_cdc(schema_ptr cdc_schema) const {
+    return make_lw_shared<schema>(schema::with_cdc_schema_tag{}, *this, cdc_schema);
+}
+
 raw_view_info::raw_view_info(table_id base_id, sstring base_name, bool include_all_columns, sstring where_clause)
     : _base_id(std::move(base_id))
     , _base_name(std::move(base_name))
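The `with_cdc_schema_tag` constructor added above follows a tag-dispatch pattern: a private tag type selects an overload that delegates to the existing copy-then-transform constructor, so the clone-with-one-field-changed logic stays in one place. Here is a minimal standalone sketch of that pattern; the `widget` names are illustrative stand-ins, not the real ScyllaDB types.

```cpp
#include <cassert>
#include <functional>
#include <memory>
#include <string>

struct widget;
using widget_ptr = std::shared_ptr<widget>;

struct widget {
    std::string name;
    widget_ptr companion; // plays the role of schema::_cdc_schema

    explicit widget(std::string n) : name(std::move(n)) {}

    // Copy-then-transform constructor, mirroring
    // schema(const schema&, const std::function<void(schema&)>&):
    // copy every field first, then let the callback adjust the copy.
    widget(const widget& o, const std::function<void(widget&)>& transform)
        : name(o.name)
        , companion(o.companion) {
        transform(*this);
    }

    // Tag type, mirroring with_cdc_schema_tag.
    struct with_companion_tag { };

    // Tag-dispatched constructor that delegates to the transform constructor,
    // mirroring schema(with_cdc_schema_tag, const schema&, schema_ptr).
    widget(with_companion_tag, const widget& o, widget_ptr c)
        : widget(o, [&c] (widget& w) { w.companion = c; }) {}
};

// Mirrors schema::make_with_cdc(): clone an object with one field swapped.
widget_ptr make_with_companion(const widget& w, widget_ptr c) {
    return std::make_shared<widget>(widget::with_companion_tag{}, w, std::move(c));
}
```

The tag keeps the overload set unambiguous (both constructors take a `const widget&` first otherwise) while reusing the single transform-based copy path.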


@@ -579,6 +579,7 @@ private:
     v3_columns _v3_columns;
     mutable schema_registry_entry* _registry_entry = nullptr;
     std::unique_ptr<::view_info> _view_info;
+    schema_ptr _cdc_schema;
 
     const std::array<column_count_type, 3> _offsets;
@@ -635,6 +636,7 @@ public:
     };
 private:
     struct reversed_tag { };
+    struct with_cdc_schema_tag { };
 
     lw_shared_ptr<cql3::column_specification> make_column_specification(const column_definition& def) const;
     void rebuild();
@@ -642,10 +644,11 @@ private:
     schema(const schema&, const std::function<void(schema&)>&);
     class private_tag{};
 public:
-    schema(private_tag, const raw_schema&, const schema_static_props& props, std::optional<std::variant<schema_ptr, db::view::base_dependent_view_info>> base = std::nullopt);
+    schema(private_tag, const raw_schema&, const schema_static_props& props, schema_ptr cdc_schema, std::optional<std::variant<schema_ptr, db::view::base_dependent_view_info>> base = std::nullopt);
     schema(const schema&);
     // See \ref make_reversed().
     schema(reversed_tag, const schema&);
+    schema(with_cdc_schema_tag, const schema&, schema_ptr cdc_schema);
     ~schema();
     const schema_static_props& static_props() const {
         return _static_props;
@@ -888,6 +891,9 @@ public:
     bool is_view() const {
         return bool(_view_info);
     }
+    schema_ptr cdc_schema() const {
+        return _cdc_schema;
+    }
     const query::partition_slice& full_slice() const {
         return *_full_slice;
     }
@@ -963,6 +969,8 @@ public:
     // The schema's version is also reversed using UUID_gen::negate().
     schema_ptr make_reversed() const;
 
+    schema_ptr make_with_cdc(schema_ptr cdc_schema) const;
+
     // Get the reversed counterpart of this schema from the schema registry.
     //
     // If not present in the registry, create one (via \ref make_reversed()) and


@@ -32,6 +32,7 @@ private:
     std::optional<raw_view_info> _view_info;
     std::optional<schema_ptr> _base_schema;
    std::optional<db::view::base_dependent_view_info> _base_info;
+    std::optional<schema_ptr> _cdc_schema;
 
     schema_builder(const schema::raw_schema&);
     static std::vector<static_configurator>& static_configurators();
 public:
@@ -279,6 +280,8 @@ public:
     schema_builder& with_view_info(schema_ptr base_schema, bool include_all_columns, sstring where_clause);
     schema_builder& with_view_info(table_id base_id, sstring base_name, bool include_all_columns, sstring where_clause, db::view::base_dependent_view_info base);
+
+    schema_builder& with_cdc_schema(schema_ptr cdc_schema);
 
     schema_builder& with_index(const index_metadata& im);
     schema_builder& without_index(const sstring& name);
     schema_builder& without_indexes();


@@ -77,9 +77,13 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
     }
 }
 
-schema_ptr schema_registry::learn(const schema_ptr& s) {
+schema_ptr schema_registry::learn(schema_ptr s) {
+    auto learned_cdc_schema = s->cdc_schema() ? local_schema_registry().learn(s->cdc_schema()) : nullptr;
+    if (learned_cdc_schema != s->cdc_schema()) {
+        s = s->make_with_cdc(learned_cdc_schema);
+    }
     if (s->registry_entry()) {
-        return std::move(s);
+        return s;
     }
     auto i = _entries.find(s->version());
     if (i != _entries.end()) {
@@ -171,11 +175,8 @@ void schema_registry::clear() {
     _entries.clear();
 }
 
-schema_ptr schema_registry_entry::load(view_schema_and_base_info fs) {
-    _frozen_schema = std::move(fs.schema);
-    if (fs.base_info) {
-        _base_info = std::move(fs.base_info);
-    }
+schema_ptr schema_registry_entry::load(extended_frozen_schema fs) {
+    _extended_frozen_schema = std::move(fs);
     auto s = get_schema();
     if (_state == state::LOADING) {
         _schema_promise.set_value(s);
@@ -187,10 +188,7 @@ schema_ptr schema_registry_entry::load(view_schema_and_base_info fs) {
 }
 
 schema_ptr schema_registry_entry::load(schema_ptr s) {
-    _frozen_schema = frozen_schema(s);
-    if (s->is_view()) {
-        _base_info = s->view_info()->base_info();
-    }
+    _extended_frozen_schema = extended_frozen_schema(s);
     _schema = &*s;
     _schema->_registry_entry = this;
     _erase_timer.cancel();
@@ -210,7 +208,7 @@ future<schema_ptr> schema_registry_entry::start_loading(async_schema_loader load
     _state = state::LOADING;
     slogger.trace("Loading {}", _version);
     // Move to background.
-    (void)f.then_wrapped([self = shared_from_this(), this] (future<view_schema_and_base_info>&& f) {
+    (void)f.then_wrapped([self = shared_from_this(), this] (future<extended_frozen_schema>&& f) {
         _loader = {};
         if (_state != state::LOADING) {
             slogger.trace("Loading of {} aborted", _version);
@@ -236,11 +234,7 @@ schema_ptr schema_registry_entry::get_schema() {
     if (!_schema) {
         slogger.trace("Activating {}", _version);
         schema_ptr s;
-        if (_base_info) {
-            s = _frozen_schema->unfreeze(*_registry._ctxt, *_base_info);
-        } else {
-            s = _frozen_schema->unfreeze(*_registry._ctxt);
-        }
+        s = _extended_frozen_schema->unfreeze(*_registry._ctxt);
         if (s->version() != _version) {
             throw std::runtime_error(format("Unfrozen schema version doesn't match entry version ({}): {}", _version, *s));
         }
@@ -259,9 +253,14 @@ void schema_registry_entry::detach_schema() noexcept {
     _erase_timer.arm(_registry.grace_period());
 }
 
+extended_frozen_schema schema_registry_entry::extended_frozen() const {
+    SCYLLA_ASSERT(_state >= state::LOADED);
+    return *_extended_frozen_schema;
+}
+
 frozen_schema schema_registry_entry::frozen() const {
     SCYLLA_ASSERT(_state >= state::LOADED);
-    return *_frozen_schema;
+    return _extended_frozen_schema->fs;
 }
 
 future<> schema_registry_entry::maybe_sync(std::function<future<>()> syncer) {
@@ -330,18 +329,17 @@ global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
     SCYLLA_ASSERT(o._cpu_of_origin == current);
     _ptr = std::move(o._ptr);
     _cpu_of_origin = current;
-    _base_info = std::move(o._base_info);
 }
 
 schema_ptr global_schema_ptr::get() const {
     if (this_shard_id() == _cpu_of_origin) {
         return _ptr;
     } else {
-        auto registered_schema = [](const schema_registry_entry& e, std::optional<db::view::base_dependent_view_info> base_info = std::nullopt) -> schema_ptr {
+        auto registered_schema = [](const schema_registry_entry& e) -> schema_ptr {
             schema_ptr ret = local_schema_registry().get_or_null(e.version());
             if (!ret) {
-                ret = local_schema_registry().get_or_load(e.version(), [&e, &base_info](table_schema_version) -> view_schema_and_base_info {
-                    return {e.frozen(), base_info};
+                ret = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) -> extended_frozen_schema {
+                    return e.extended_frozen();
                 });
             }
             return ret;
@@ -352,7 +350,7 @@ schema_ptr global_schema_ptr::get() const {
         // that _ptr will have a registry on the foreign shard where this
         // object originated so as long as this object lives the registry entries lives too
         // and it is safe to reference them on foreign shards.
-        schema_ptr s = registered_schema(*_ptr->registry_entry(), _base_info);
+        schema_ptr s = registered_schema(*_ptr->registry_entry());
         if (_ptr->registry_entry()->is_synced()) {
             s->registry_entry()->mark_synced();
         }
@@ -369,18 +367,11 @@ global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
         if (e) {
            return s;
         } else {
-            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> view_schema_and_base_info {
-                if (s->is_view()) {
-                    return {frozen_schema(s), s->view_info()->base_info()};
-                } else {
-                    return {frozen_schema(s)};
-                }
+            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) -> extended_frozen_schema {
+                return extended_frozen_schema(s);
             });
         }
     };
 
     _ptr = ensure_registry_entry(ptr);
-    if (_ptr->is_view()) {
-        _base_info = _ptr->view_info()->base_info();
-    }
 }
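The reworked `schema_registry::learn()` above canonicalizes the nested CDC schema before registering the outer one: it learns the dependency first, and if deduplication hands back a different canonical instance, it rebuilds the parent to reference it. A toy registry can sketch that pattern in isolation; all names here (`node`, `registry`) are illustrative stand-ins, not the ScyllaDB API.

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

struct node;
using node_ptr = std::shared_ptr<node>;

struct node {
    std::string version;
    node_ptr dep; // plays the role of the attached CDC schema
};

struct registry {
    std::map<std::string, node_ptr> entries;

    node_ptr learn(node_ptr n) {
        // Canonicalize the dependency first (cf. learn(s->cdc_schema())).
        auto learned_dep = n->dep ? learn(n->dep) : nullptr;
        if (learned_dep != n->dep) {
            // Rebuild the parent around the canonical dependency
            // (cf. s = s->make_with_cdc(learned_cdc_schema)).
            n = std::make_shared<node>(node{n->version, learned_dep});
        }
        // First registration for a version wins; later duplicates
        // get the already-registered canonical pointer back.
        auto [it, inserted] = entries.try_emplace(n->version, n);
        return it->second;
    }
};
```

The point of the rebuild step is pointer identity: after `learn()`, every registered object that shares a dependency version also shares the same dependency instance, so equality checks and lifetime tracking can rely on the pointer alone.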
