mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-05 06:23:03 +00:00
Tests in test/cqlpy use a tiny nodetool-like library, where calls to nodetool.flush() are translated to the parallel REST API request on Scylla - but use an external "nodetool" command when running the test against Cassandra. Some test/cluster tests also began using test/cqlpy/nodetool.py, but it is NOT a good fit for test/cluster tests, because: 1. It falls back to using the external "nodetool" when it thinks the REST API is not available. In cluster tests, no such fallback is needed (these tests can't be run on Cassandra). If the REST API is down, the test should fail - not fall back to an irrelevant method. 2. The nodetool.flush() et al. functions are not async, and cluster tests are supposed (by design...) to only use async APIs. 3. test/cqlpy/nodetool.py was not written in the "style" defined for the test/cluster codebase - specifically, its functions don't have docstrings or strong typing. This patch introduces test/pylib/nodetool.py, based on test/cqlpy/nodetool.py but fixing all the above problems - there are no Cassandra fallbacks, there are docstrings and type hints, and all the functions are async. We also fix the test/cluster tests that used test/cqlpy/nodetool.py to switch to test/pylib/nodetool.py. Of course it means the newly async functions need to be "await"ed, not just called, so this patch changes that too. Signed-off-by: Nadav Har'El <nyh@scylladb.com> Closes scylladb/scylladb#30129
75 lines
3.6 KiB
Python
75 lines
3.6 KiB
Python
#
|
|
# Copyright (C) 2025-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
#
|
|
|
|
from test.pylib.manager_client import ManagerClient
|
|
from test.pylib.tablets import get_tablet_replica
|
|
from test.pylib.rest_client import inject_error_one_shot, inject_error
|
|
from test.cluster.util import new_test_keyspace, new_test_table
|
|
|
|
import pytest
|
|
import asyncio
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_intranode_migration_not_blocked_by_backpressure(manager: ManagerClient) -> None:
    """
    Reproducer for a bug where intra-node tablet migration gets stuck
    in maybe_wait_for_sstable_count_reduction() because the compaction
    manager's regular_compaction_task_executor::do_run() returns early
    (descriptor.sstables is empty) without signaling compaction_done.

    The flush triggered by clone_locally_tablet_storage() calls
    maybe_wait_for_sstable_count_reduction() which waits on
    compaction_done. If compaction returns early without signaling,
    the wait hangs forever.

    The test does not perform a migration itself. On a single node
    started with '--smp 1' it issues the flush directly:
      1. With autocompaction disabled, write rows in batches of 10
         and flush the table after each batch.
      2. Write a few more rows and leave them unflushed.
      3. Enable the set_sstable_count_reduction_threshold error
         injection (non-one-shot, value 32), re-enable
         autocompaction and flush the table once more.

    Without the fix: do_run() returns early without signaling
    compaction_done, the wait hangs forever, blocking the migration.

    With the fix: do_run() signals compaction_done on early return,
    waking up the waiter and allowing the migration to proceed.

    The test has no explicit assertion: it passes when the final
    flush returns.
    """
    cmdline = ['--smp', '1']
    server = await manager.server_add(cmdline=cmdline)
    node_ip = server.ip_addr

    cql = manager.get_cql()
    extra_ks_param = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}"
    async with new_test_keyspace(manager, extra_ks_param) as ks:
        extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'max_threshold': 64} and compression = {}"
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
            # new_test_table yields a qualified name; the REST calls below
            # take the bare table name, i.e. the part after the last dot.
            table = cf.split('.')[-1]
            payload = 'x' * 1020

            async def insert_rows(keys: range) -> None:
                """Insert one row per key in `keys`, running all statements concurrently."""
                await asyncio.gather(*[
                    cql.run_async(f"INSERT INTO {cf} (pk, t) VALUES ({pk}, '{payload}')")
                    for pk in keys
                ])

            await manager.api.disable_autocompaction(node_ip, ks)

            # Write data and flush to create an sstable.
            # NOTE(review): the flush is assumed to run once per batch (inside
            # the loop), presumably so the sstable count ends up above the
            # threshold injected below - confirm against the upstream file.
            logger.info("Writing initial data and flushing")
            for batch in range(100):
                await insert_rows(range(10 * batch, 10 * (batch + 1)))
                await manager.api.keyspace_flush(node_ip, ks, table)

            # Write more data WITHOUT flushing
            logger.info("Writing dirty data to memtable (no flush)")
            await insert_rows(range(400, 410))

            logger.info("Enable autocompaction and trigger flush to simulate backpressure")
            await manager.api.enable_injection(node_ip, "set_sstable_count_reduction_threshold", one_shot=False, parameters={'value': 32})
            await manager.api.enable_autocompaction(node_ip, ks)
            # With the bug, the flush hangs forever in maybe_wait_for_sstable_count_reduction().
            # With the fix, it completes promptly.
            await manager.api.keyspace_flush(node_ip, ks, table)
|