# scylladb/test/cluster/test_compaction_backpressure.py

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#

from test.pylib.manager_client import ManagerClient
from test.pylib.tablets import get_tablet_replica
from test.pylib.rest_client import inject_error_one_shot, inject_error
from test.cluster.util import new_test_keyspace, new_test_table
from test.cqlpy import nodetool

import pytest
import asyncio
import logging

# Module-level logger; the test below uses it to announce each phase it enters.
logger = logging.getLogger(__name__)


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_intranode_migration_not_blocked_by_backpressure(manager: ManagerClient):
    """
    Reproducer for a bug where a flush gets stuck in
    maybe_wait_for_sstable_count_reduction() because the compaction
    manager's regular_compaction_task_executor::do_run() returns early
    (descriptor.sstables is empty) without signaling compaction_done.

    Background: the flush triggered by clone_locally_tablet_storage()
    during intra-node tablet migration calls
    maybe_wait_for_sstable_count_reduction(), which waits on
    compaction_done. If compaction returns early without signaling,
    the wait hangs forever and the migration is blocked.

    NOTE: this test does not trigger a tablet migration itself. It
    reaches the flush path directly through the keyspace_flush REST call.

    Scenario:
    1. Start a single node with one shard and create a one-tablet table
       that uses IncrementalCompactionStrategy with max_threshold 64.
    2. With autocompaction disabled, write 100 batches of 10 rows and
       flush after every batch, so that flushed sstables accumulate
       (presumably one per flush -- not verified by the test).
    3. Write 10 more rows and leave them unflushed in the memtable.
    4. Enable the "set_sstable_count_reduction_threshold" error injection
       (not one-shot, value=32), re-enable autocompaction and flush.
       NOTE(review): the injection presumably lowers the sstable-count
       threshold used by maybe_wait_for_sstable_count_reduction() to 32,
       so the accumulated sstables put the flush under backpressure --
       confirm against the injection's definition.

    Without the fix: do_run() returns early without signaling
    compaction_done, the wait hangs forever and the final flush never
    returns.

    With the fix: do_run() signals compaction_done on early return,
    waking up the waiter so the flush completes.

    There is no explicit assertion: the test passes when the final flush
    returns and otherwise fails by hanging (i.e. by timing out).
    """
    rows_per_batch = 10
    flushed_batches = 100
    payload = 'x' * 1020

    server = await manager.server_add(cmdline=['--smp', '1'])
    node_ip = server.ip_addr

    cql = manager.get_cql()
    extra_ks_param = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}"
    async with new_test_keyspace(manager, extra_ks_param) as ks:
        extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'max_threshold': 64} and compression = {}"
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
            table = cf.split('.')[-1]

            async def insert_rows(pks):
                # Issue one INSERT per key and wait for all of them together.
                await asyncio.gather(*[cql.run_async(f"INSERT INTO {cf} (pk, t) VALUES ({pk}, '{payload}')") for pk in pks])

            # Keep the flushed sstables from being compacted away while
            # they are being accumulated.
            await manager.api.disable_autocompaction(node_ip, ks)

            # Write data in batches and flush after each batch, so that
            # many sstables pile up (keys 0..999 over 100 flushes).
            logger.info("Writing initial data and flushing")
            for batch in range(flushed_batches):
                first_pk = rows_per_batch * batch
                await insert_rows(range(first_pk, first_pk + rows_per_batch))
                await manager.api.keyspace_flush(node_ip, ks, table)

            # Write more data WITHOUT flushing. These keys were already
            # written above, so the rows are overwrites; presumably the
            # point is only that the final flush has a non-empty memtable.
            logger.info("Writing dirty data to memtable (no flush)")
            await insert_rows(range(400, 410))

            logger.info("Enable autocompaction and trigger flush to simulate backpressure")
            await manager.api.enable_injection(node_ip, "set_sstable_count_reduction_threshold", one_shot=False, parameters={'value': 32})
            await manager.api.enable_autocompaction(node_ip, ks)
            # With the bug, the flush hangs forever in maybe_wait_for_sstable_count_reduction().
            # With the fix, it completes promptly.
            await manager.api.keyspace_flush(node_ip, ks, table)