Merge 'test: test_topology_ops: fix flakiness and reenable bg writes' from Patryk Jędrzejczak

We decrease the server's request timeouts in topology tests so that
they are lower than the driver's timeout. Previously, the driver could
time out a request before the server had finished handling it
successfully. This problem caused scylladb/scylladb#15924.

Since scylladb/scylladb#15924 is the last issue mentioned in
scylladb/scylladb#15962, this PR also re-enables background
writes in `test_topology_ops`, but only when tablets are disabled.
With both tablets and background writes enabled, the test does not
pass because of scylladb/scylladb#17025. We will re-enable background
writes with tablets once that issue is fixed.

Fixes scylladb/scylladb#15924
Fixes scylladb/scylladb#15962

Closes scylladb/scylladb#17585

* github.com:scylladb/scylladb:
  test: test_topology_ops: reenable background writes without tablets
  test: test_topology_ops: run with and without tablets
  test: topology: decrease the server's request timeouts
This commit is contained in:
Kamil Braun
2024-03-04 20:57:24 +01:00
3 changed files with 36 additions and 25 deletions

View File

@@ -429,6 +429,7 @@ class PythonTestSuite(TestSuite):
create_cfg.config_from_test
server = ScyllaServer(
mode=self.mode,
exe=self.scylla_exe,
vardir=os.path.join(self.options.tmpdir, self.mode),
logger=create_cfg.logger,

View File

@@ -53,7 +53,16 @@ class ReplaceConfig(NamedTuple):
wait_replaced_dead: bool = True
def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str], cluster_name: str) -> dict[str, object]:
def make_scylla_conf(mode: str, workdir: pathlib.Path, host_addr: str, seed_addrs: List[str], cluster_name: str) -> dict[str, object]:
# We significantly increase default timeouts to allow running tests on a very slow
# setup (but without network losses). These timeouts can impact the running time of
# topology tests. For example, the barrier_and_drain topology command waits until
# background writes' handlers time out. We don't want to slow down tests for no
# reason, so we increase the timeouts according to each mode's needs. The client
# should avoid timing out its requests before the server times out - for this reason
# we increase the CQL driver's client-side timeout in conftest.py.
request_timeout_in_ms = 180000 if mode in {'debug', 'sanitize'} else 30000
return {
'cluster_name': cluster_name,
'workdir': str(workdir.resolve()),
@@ -86,19 +95,13 @@ def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str
'flush_schema_tables_after_modification': False,
'auto_snapshot': False,
# Significantly increase default timeouts to allow running tests
# on a very slow setup (but without network losses). Note that these
# are server-side timeouts: The client should also avoid timing out
# its own requests - for this reason we increase the CQL driver's
# client-side timeout in conftest.py.
'range_request_timeout_in_ms': 300000,
'read_request_timeout_in_ms': 300000,
'counter_write_request_timeout_in_ms': 300000,
'cas_contention_timeout_in_ms': 300000,
'truncate_request_timeout_in_ms': 300000,
'write_request_timeout_in_ms': 300000,
'request_timeout_in_ms': 300000,
'range_request_timeout_in_ms': request_timeout_in_ms,
'read_request_timeout_in_ms': request_timeout_in_ms,
'counter_write_request_timeout_in_ms': request_timeout_in_ms,
'cas_contention_timeout_in_ms': request_timeout_in_ms,
'truncate_request_timeout_in_ms': request_timeout_in_ms,
'write_request_timeout_in_ms': request_timeout_in_ms,
'request_timeout_in_ms': request_timeout_in_ms,
'user_defined_function_time_limit_ms': 1000,
'strict_allow_filtering': True,
@@ -209,7 +212,7 @@ class ScyllaServer:
host_id: HostID # Host id (UUID)
newid = itertools.count(start=1).__next__ # Sequential unique id
def __init__(self, exe: str, vardir: str,
def __init__(self, mode: str, exe: str, vardir: str,
logger: Union[logging.Logger, logging.LoggerAdapter],
cluster_name: str, ip_addr: str, seeds: List[str],
cmdline_options: List[str],
@@ -236,6 +239,7 @@ class ScyllaServer:
self.property_filename = self.workdir / "conf/cassandra-rackdc.properties"
# Sum of basic server configuration and the user-provided config options.
self.config = make_scylla_conf(
mode = mode,
workdir = self.workdir,
host_addr = self.ip_addr,
seed_addrs = self.seeds,

View File

@@ -21,23 +21,28 @@ logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_topology_ops(request, manager: ManagerClient):
@pytest.mark.parametrize("tablets_enabled", ["true", "false"])
async def test_topology_ops(request, manager: ManagerClient, tablets_enabled: bool):
"""Test basic topology operations using the topology coordinator."""
cfg = {'experimental_features': ['consistent-topology-changes']}
if tablets_enabled:
cfg['experimental_features'].append('tablets')
logger.info("Bootstrapping first node")
servers = [await manager.server_add()]
servers = [await manager.server_add(config=cfg)]
logger.info(f"Restarting node {servers[0]}")
await manager.server_stop_gracefully(servers[0].server_id)
await manager.server_start(servers[0].server_id)
await wait_for_cql_and_get_hosts(manager.cql, await manager.running_servers(), time.time() + 60)
cql = await reconnect_driver(manager)
# FIXME: disabled as a workaround for #15935, #15924
# We need to re-enable once these issues are fixed.
#finish_writes = await start_writes(cql)
logger.info("Bootstrapping other nodes")
servers += await manager.servers_add(3)
servers += await manager.servers_add(3, config=cfg)
await wait_for_cql_and_get_hosts(manager.cql, servers, time.time() + 60)
cql = await reconnect_driver(manager)
# FIXME: we disable background writes with tablets enabled because the test fails due to #17025.
# We need to re-enable them once this issue is fixed (by removing `if` here and above `await finish_writes()`).
finish_writes = await start_writes(cql) if not tablets_enabled else None
logger.info(f"Decommissioning node {servers[0]}")
await manager.decommission_node(servers[0].server_id)
@@ -67,7 +72,8 @@ async def test_topology_ops(request, manager: ManagerClient):
servers = servers[1:]
logger.info("Checking results of the background writes")
#await finish_writes()
if not tablets_enabled:
await finish_writes()
for server in servers:
await check_node_log_for_failed_mutations(manager, server)