Files
scylladb/test/pylib/suite/python.py
Andrzej Jackowski 7dc0c4cf4f test: close logfile/socket_dir for stopped servers in recycle_cluster
PythonTestSuite::recycle_cluster is a function that releases resources
of an old, dirty cluster to make it reusable. It closes log_file and
maintenance_socket_dir for running nodes in a dirty cluster, however it
doesn't do the same for stopped nodes. It leads to leakage of file
descriptors of stopped nodes, which in turn can lead to hitting ulimit
of open files (that is often 1024) if the leaking test is repeated with
`./test.py --repeat ...`. The problem was detected when tests from
`test/cluster/dtest/` directory were executed with high `repeat` value.

This commit extends `recycle_cluster` to close and cleanup logfile and
`socket_dir` for nodes that are stopped (because self.servers in
ScyllaCluster is ChainMap of self.running and self.stopped).

Closes scylladb/scylladb#24243
2025-05-27 08:37:43 +03:00

249 lines
11 KiB
Python

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
from __future__ import annotations
import collections
import logging
import os
import pathlib
import xml.etree.ElementTree as ET
from typing import TYPE_CHECKING
from scripts import coverage
from test import path_to
from test.pylib.pool import Pool
from test.pylib.scylla_cluster import ScyllaCluster, ScyllaServer, merge_cmdline_options
from test.pylib.suite.base import Test, TestSuite, read_log, run_test
from test.pylib.util import LogPrefixAdapter
if TYPE_CHECKING:
import argparse
from collections.abc import Callable, Awaitable
from typing import Optional, Union
class PythonTestSuite(TestSuite):
"""A collection of Python pytests against a single Scylla instance"""
def __init__(self, path, cfg: dict, options: argparse.Namespace, mode: str) -> None:
super().__init__(path, cfg, options, mode)
self.scylla_exe = path_to(self.mode, "scylla")
self.scylla_env = dict(self.base_env)
if self.mode == "coverage":
self.scylla_env.update(coverage.env(self.scylla_exe, distinct_id=self.name))
self.scylla_env['SCYLLA'] = self.scylla_exe
cluster_cfg = self.cfg.get("cluster", {"initial_size": 1})
cluster_size = cluster_cfg["initial_size"]
env_pool_size = os.getenv("CLUSTER_POOL_SIZE")
if options.cluster_pool_size is not None:
pool_size = options.cluster_pool_size
elif env_pool_size is not None:
pool_size = int(env_pool_size)
else:
pool_size = cfg.get("pool_size", 2)
self.dirties_cluster = set(cfg.get("dirties_cluster", []))
self.create_cluster = self.get_cluster_factory(cluster_size, options)
async def recycle_cluster(cluster: ScyllaCluster) -> None:
"""When a dirty cluster is returned to the cluster pool,
stop it and release the used IPs. We don't necessarily uninstall() it yet,
which would delete the log file and directory - we might want to preserve
these if it came from a failed test.
"""
for srv in cluster.servers.values():
srv.log_file.close()
srv.maintenance_socket_dir.cleanup()
await cluster.stop()
await cluster.release_ips()
self.clusters = Pool(pool_size, self.create_cluster, recycle_cluster)
def get_cluster_factory(self, cluster_size: int, options: argparse.Namespace) -> Callable[..., Awaitable]:
def create_server(create_cfg: ScyllaCluster.CreateServerParams):
cmdline_options = self.cfg.get("extra_scylla_cmdline_options", [])
if type(cmdline_options) == str:
cmdline_options = [cmdline_options]
cmdline_options = merge_cmdline_options(cmdline_options, create_cfg.cmdline_from_test)
cmdline_options = merge_cmdline_options(cmdline_options, options.extra_scylla_cmdline_options)
# There are multiple sources of config options, with increasing priority
# (if two sources provide the same config option, the higher priority one wins):
# 1. the defaults
# 2. suite-specific config options (in "extra_scylla_config_options")
# 3. config options from tests (when servers are added during a test)
default_config_options = \
{"authenticator": "PasswordAuthenticator",
"authorizer": "CassandraAuthorizer"}
default_config_options["tablets_initial_scale_factor"] = 4 if self.mode == "release" else 2
config_options = default_config_options | \
self.cfg.get("extra_scylla_config_options", {}) | \
create_cfg.config_from_test
server = ScyllaServer(
mode=self.mode,
exe=self.scylla_exe,
vardir=self.log_dir,
logger=create_cfg.logger,
cluster_name=create_cfg.cluster_name,
ip_addr=create_cfg.ip_addr,
seeds=create_cfg.seeds,
cmdline_options=cmdline_options,
config_options=config_options,
property_file=create_cfg.property_file,
append_env=self.base_env,
server_encryption=create_cfg.server_encryption)
return server
async def create_cluster(logger: Union[logging.Logger, logging.LoggerAdapter]) -> ScyllaCluster:
cluster = ScyllaCluster(logger, self.hosts, cluster_size, create_server)
async def stop() -> None:
await cluster.stop()
# Suite artifacts are removed when
# the entire suite ends successfully.
self.artifacts.add_suite_artifact(self, stop)
if not self.options.save_log_on_success:
# If a test fails, we might want to keep the data dirs.
async def uninstall() -> None:
await cluster.uninstall()
self.artifacts.add_suite_artifact(self, uninstall)
self.artifacts.add_exit_artifact(self, stop)
await cluster.install_and_start()
return cluster
return create_cluster
@property
def pattern(self) -> str | list[str]:
return ["*_test.py", "*_tests.py", "test_*.py"]
async def add_test(self, shortname, casename) -> None:
test = PythonTest(self.next_id((shortname, self.suite_key)), shortname, casename, self)
self.tests.append(test)
async def run(self, test: 'Test', options: argparse.Namespace):
if not os.access(self.scylla_exe, os.F_OK):
raise FileNotFoundError(f"{self.scylla_exe} does not exist.")
if not os.access(self.scylla_exe, os.X_OK):
raise PermissionError(f"{self.scylla_exe} is not executable.")
return await super().run(test, options)
class PythonTest(Test):
"""Run a pytest collection of cases against a standalone Scylla"""
def __init__(self, test_no: int, shortname: str, casename: str, suite) -> None:
super().__init__(test_no, shortname, suite)
self.path = "python"
self.core_args = ["-m", "pytest"]
self.casename = casename
self.xmlout = self.suite.log_dir / "xml" / f"{self.uname}.xunit.xml"
self.server_log: Optional[str] = None
self.server_log_filename: Optional[pathlib.Path] = None
self.is_before_test_ok = False
self.is_after_test_ok = False
def _prepare_pytest_params(self, options: argparse.Namespace):
self.args = [
"-s", # don't capture print() output inside pytest
"--log-level=DEBUG", # Capture logs
"-vv",
"-o",
"junit_family=xunit2",
"-o",
"junit_suite_name={}".format(self.suite.name),
"--junit-xml={}".format(self.xmlout),
"-rs",
"--run_id={}".format(self.id),
"--mode={}".format(self.mode),
]
if options.gather_metrics:
self.args.append("--gather-metrics")
self.args.append(f"--alluredir={self.allure_dir}")
if not options.save_log_on_success:
self.args.append("--allure-no-capture")
if options.markers:
self.args.append(f"-m={options.markers}")
# https://docs.pytest.org/en/7.1.x/reference/exit-codes.html
no_tests_selected_exit_code = 5
self.valid_exit_codes = [0, no_tests_selected_exit_code]
arg = str(self.suite.suite_path / (self.shortname + ".py"))
if self.casename is not None:
arg += '::' + self.casename
self.args.append(arg)
def reset(self) -> None:
"""Reset the test before a retry, if it is retried as flaky"""
super().reset()
self.server_log = None
self.server_log_filename = None
self.is_before_test_ok = False
self.is_after_test_ok = False
def print_summary(self) -> None:
print("Output of {} {}:".format(self.path, " ".join(self.args)))
print(read_log(self.log_filename))
if self.server_log is not None:
print("Server log of the first server:")
print(self.server_log)
async def run(self, options: argparse.Namespace) -> Test:
self._prepare_pytest_params(options)
loggerPrefix = self.mode + '/' + self.uname
logger = LogPrefixAdapter(logging.getLogger(loggerPrefix), {'prefix': loggerPrefix})
cluster = await self.suite.clusters.get(logger)
try:
cluster.before_test(self.uname)
prepare_cql = self.suite.cfg.get("prepare_cql", None)
if prepare_cql and not hasattr(cluster, 'prepare_cql_executed'):
cc = next(iter(cluster.running.values())).control_connection
if not isinstance(prepare_cql, collections.abc.Iterable):
prepare_cql = [prepare_cql]
for stmt in prepare_cql:
cc.execute(stmt)
cluster.prepare_cql_executed = True
logger.info("Leasing Scylla cluster %s for test %s", cluster, self.uname)
self.args.insert(0, f"--host={cluster.endpoint()}")
log_filename = next(server.log_filename for server in cluster.running.values())
self.args.insert(0, f"--scylla-log-filename={log_filename}")
self.is_before_test_ok = True
cluster.take_log_savepoint()
status = await run_test(self, options, env=self.suite.scylla_env)
if self.shortname in self.suite.dirties_cluster:
cluster.is_dirty = True
cluster.after_test(self.uname, status)
self.is_after_test_ok = True
self.success = status
except Exception as e:
self.server_log = cluster.read_server_log()
self.server_log_filename = cluster.server_log_filename()
if not self.is_before_test_ok:
print("Test {} pre-check failed: {}".format(self.name, str(e)))
print("Server log of the first server:\n{}".format(self.server_log))
logger.info(f"Discarding cluster after failed start for test %s...", self.name)
elif not self.is_after_test_ok:
print("Test {} post-check failed: {}".format(self.name, str(e)))
print("Server log of the first server:\n{}".format(self.server_log))
logger.info(f"Discarding cluster after failed test %s...", self.name)
await self.suite.clusters.put(cluster, is_dirty=cluster.is_dirty)
logger.info("Test %s %s", self.uname, "succeeded" if self.success else "failed ")
return self
def write_junit_failure_report(self, xml_res: ET.Element) -> None:
super().write_junit_failure_report(xml_res)
if self.server_log_filename is not None:
system_err = ET.SubElement(xml_res, 'system-err')
system_err.text = read_log(self.server_log_filename)