Files
scylladb/test/pylib/runner.py
Andrei Chekun a03c4fd754 test.py: Add host hardware info
Gather additional information about the running host for better metrics analysis
2026-05-18 12:23:40 +02:00

606 lines
26 KiB
Python

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#
from __future__ import annotations
import asyncio
import logging
import os
import pathlib
import platform
import random
import sys
import time
from argparse import BooleanOptionalAction
from collections import defaultdict
from itertools import chain, count
from functools import cache, cached_property
from pathlib import Path
from random import randint
from typing import TYPE_CHECKING, Callable
from types import SimpleNamespace
import pytest
import xdist
import yaml
from _pytest.junitxml import xml_key
from test import ALL_MODES, DEBUG_MODES, TEST_RUNNER, TOP_SRC_DIR, HOST_ID
from test.pylib.resource_gather import setup_cgroup, setup_worker_cgroup, get_resource_gather, SystemResourceMonitor, \
SCYLLA_TEST_CGROUP_BASE_ENV, gather_host_info
from test.pylib.db.writer import SQLiteWriter, DEFAULT_DB_NAME, HOST_INFO_TABLE
from test.pylib.scylla_cluster import merge_cmdline_options
from test.pylib.skip_reason_plugin import skip_marker
from test.pylib.suite.base import (
PYTEST_TESTS_LOGS_FOLDER,
TestSuite,
get_testpy_test,
prepare_environment,
init_testsuite_globals,
)
from test.pylib.util import get_modes_to_run, scale_timeout_by_mode
if TYPE_CHECKING:
from collections.abc import Generator
import _pytest.nodes
import _pytest.scope
from test.pylib.suite.base import Test
# File name of the per-suite YAML configuration looked up next to collected nodes
# (see TestSuiteConfig.from_pytest_node).
TEST_CONFIG_FILENAME = "test_config.yaml"
# Sub-folder of --tmpdir in which pytest_configure places the pytest process log files.
PYTEST_LOG_FOLDER = "pytest_log"
# Stash key: set of file paths pytest_collect_file is currently expanding; used there
# to detect its own nested pytest_collect_file calls for the same file.
REPEATING_FILES = pytest.StashKey[set[pathlib.Path]]()
# Stash key: build mode assigned to a file collector and copied to its test items.
BUILD_MODE = pytest.StashKey[str]()
# Stash key: run id assigned to a test item by modify_pytest_item.
RUN_ID = pytest.StashKey[int]()
# Stash key: path of the log file used by the current pytest process (set in pytest_configure).
PYTEST_LOG_FILE = pytest.StashKey[str]()
# Session exit status set by pytest_sessionfinish when the number of failed tests reached --maxfail.
EXIT_MAXFAIL_REACHED = 11
logger = logging.getLogger(__name__)
# Store pytest config globally so we can access it in hooks that only receive report
_pytest_config: pytest.Config | None = None
# Monitor instance created in pytest_sessionstart and stopped in pytest_sessionfinish.
_system_resource_monitor: SystemResourceMonitor | None = None
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the command-line options used by the test.py-compatible pytest runner.

    Args:
        parser: The pytest option parser the options are added to.
    """
    parser.addoption('--mode', choices=ALL_MODES, action="append", dest="modes",
                     help="Run only tests for given build mode(s)")
    parser.addoption('--tmpdir', action='store', default=str(TOP_SRC_DIR / 'testlog'),
                     help='Path to temporary test data and log files. The data is further segregated per build mode.')
    parser.addoption('--run_id', action='store', default=None, help='Run id for the test run')
    # The default is drawn once, when the options are registered.
    parser.addoption('--byte-limit', action="store", default=randint(0, 2000), type=int,
                     help="Specific byte limit for failure injection (random by default)")
    parser.addoption("--gather-metrics", action=BooleanOptionalAction, default=False,
                     help='Switch on gathering cgroup metrics')
    parser.addoption('--random-seed', action="store",
                     help="Random number generator seed to be used by boost tests")

    # The following option is meant to be used with the bare pytest command.
    #
    # For compatibility reasons, bare pytest has to be run with the --test-py-init option
    # to get a test.py-compatible pytest session.
    #
    # TODO: remove this when we'll completely switch to bare pytest runner.
    parser.addoption('--test-py-init', action='store_true', default=False, deprecated=True,
                     help='Run pytest session in test.py-compatible mode. I.e., start all required services, etc.')

    # Options for compatibility with test.py
    parser.addoption('--save-log-on-success', default=False,
                     dest="save_log_on_success", action="store_true",
                     help="Save test log output on success and skip cleanup before the run.")
    # Adjacent string literals are concatenated verbatim, so every fragment
    # must carry its own separator.
    parser.addoption('--coverage', action='store_true', default=False,
                     help="When running code instrumented with coverage support. "
                          "Will route the profiles to `tmpdir`/mode/coverage/`suite` and post process them in order to generate "
                          "lcov file per suite, lcov file per mode, and an lcov file for the entire run. "
                          "The lcov files can eventually be used for generating coverage reports")
    parser.addoption("--coverage-mode", action='append', type=str, dest="coverage_modes",
                     help="Collect and process coverage only for the modes specified. implies: --coverage, default: All built modes")
    parser.addoption("--cluster-pool-size", type=int,
                     help="Set the pool_size for PythonTest and its descendants. Alternatively environment variable "
                          "CLUSTER_POOL_SIZE can be used to achieve the same")
    parser.addoption("--extra-scylla-cmdline-options", default='',
                     help="Passing extra scylla cmdline options for all tests. Options should be space separated:"
                          " '--logger-log-level raft=trace --default-log-level error'")
    # The default is an int, matching type=int, instead of the string "0".
    parser.addoption('--x-log2-compaction-groups', action="store", default=0, type=int,
                     help="Controls number of compaction groups to be used by Scylla tests. Value of 3 implies 8 groups.")
    parser.addoption('--repeat', action="store", default=1, type=int,
                     help="number of times to repeat test execution")
    parser.addoption('--exe-path', default=False,
                     dest="exe_path", action="store",
                     help="Path to the executable to run. Not working with `mode`")
    parser.addoption('--exe-url', default=False,
                     dest="exe_url", action="store",
                     help="URL to download the relocatable executable. Not working with `mode`")
# Stores the per-phase test reports so that fixtures and hooks can inspect the
# outcome of each phase (setup / call / teardown) independently.
# The values are the reports produced by pytest_runtest_makereport, i.e. TestReport objects.
PHASE_REPORT_KEY = pytest.StashKey[dict[str, pytest.TestReport]]()
def _build_test_mock(item: pytest.Item) -> SimpleNamespace:
    """Describe a pytest item as a lightweight test object for resource gathering.

    Both Python test items and C++ ``CppTestCase`` items are supported; the
    returned namespace exposes the same attributes for either kind.
    """
    from test.pylib.cpp.base import CppTestCase

    file_stash = get_params_stash(node=item)
    if file_stash:
        mode = file_stash[BUILD_MODE]
    else:
        mode = item.config.build_modes[0]
    run_id = item.stash.get(RUN_ID, None) or item.config.getoption("--run_id")
    log_root = pathlib.Path(item.config.getoption("--tmpdir")).absolute()

    # Drop the ".mode.run_id" suffix appended by modify_pytest_item(), if present.
    bare_name = item.name.removesuffix(f".{mode}.{run_id}")

    source_path = item.path
    suite_dir = source_path.parent

    if isinstance(item, CppTestCase):
        source_name, short_name = f"{item.parent.test_name}.cc", item.test_case_name
    else:
        source_name, short_name = source_path.name, bare_name

    suite_info = SimpleNamespace(
        log_dir=log_root / mode,
        name=suite_dir.name,
        suite_path=suite_dir,
        test_file_name=source_name,
    )
    return SimpleNamespace(
        time_end=0,
        time_start=0,
        id=run_id,
        mode=mode,
        success=False,
        status=None,
        path=source_path,
        shortname=short_name,
        suite=suite_info,
    )
@pytest.hookimpl(wrapper=True)
def pytest_runtest_protocol(item, nextitem):
    """Wrap the run of a single test item with resource tracking.

    Tracking is set up before the item runs. Afterwards monitoring is stopped,
    a status is derived from the stored phase reports, the metrics are written
    to the database and the tracking is torn down.
    """
    tracked_test = _build_test_mock(item)
    tracked_test.time_start = time.time()
    gatherer = get_resource_gather(
        temp_dir=pathlib.Path(item.config.getoption("--tmpdir")),
        is_switched_on=item.config.getoption("--gather-metrics"),
        test=tracked_test,
        worker_id=os.environ.get("PYTEST_XDIST_WORKER"),
    )

    try:
        gatherer.setup_test_tracking()
        gatherer.cgroup_monitor()
    except Exception:
        # Undo whatever was set up before propagating the error.
        gatherer.stop_monitoring()
        gatherer.teardown_test_tracking()
        raise

    try:
        return (yield)
    finally:
        if gatherer is not None:
            tracked_test.time_end = time.time()
            gatherer.stop_monitoring()
            try:
                phase_reports = item.stash.get(PHASE_REPORT_KEY, {})
                # A skipped test has no call report, so fall back to the setup report.
                report = phase_reports.get("call")
                if report is None:
                    report = phase_reports.get("setup")
                succeeded = report is not None and not report.failed
                metrics = gatherer.get_test_metrics()

                if report is None:
                    status = "unknown"
                elif hasattr(report, "wasxfail"):
                    if report.skipped:
                        status = "xfailed"
                    elif report.passed:
                        status = "xpassed"
                    else:
                        status = report.outcome
                elif 'XPASS' in report.longreprtext:
                    # With xfail_strict = true, wasxfail is absent on an xpassed
                    # test, so the report text is checked instead.
                    status = "xpassed"
                elif report.skipped:
                    status = "skipped"
                else:
                    status = report.outcome

                metrics.status = status
                gatherer.write_metrics_to_db(metrics=metrics, success=succeeded)
            finally:
                gatherer.teardown_test_tracking()
@pytest.fixture(scope="module", autouse=True)
def build_mode(request: pytest.FixtureRequest) -> str:
    """Return the build mode of the requesting module.

    The mode is read from the stash of the enclosing file collector; when there
    is no such collector, the first configured build mode is used.
    """
    file_stash = get_params_stash(node=request.node)
    return request.config.build_modes[0] if file_stash is None else file_stash[BUILD_MODE]
@pytest.fixture(scope="module")
def scale_timeout(build_mode: str) -> Callable[[int | float], int | float]:
    """Return a callable that passes a timeout through scale_timeout_by_mode for the current build mode."""
    return lambda timeout: scale_timeout_by_mode(build_mode, timeout)
@pytest.fixture(scope="module")
async def testpy_test(request: pytest.FixtureRequest, build_mode: str) -> Test | None:
    """Build the test.py Test object for the requesting module, or None outside module scope."""
    if request.scope != "module":
        return None
    return await get_testpy_test(path=request.path, options=request.config.option, mode=build_mode)
@pytest.fixture(scope="function")
def scylla_binary(testpy_test) -> Path:
    """Return the ``scylla_exe`` attribute of the current test's suite."""
    suite = testpy_test.suite
    return suite.scylla_exe
def pytest_collection_modifyitems(items: list[pytest.Item], config: pytest.Config) -> None:
    """Decorate the collected items and order them suite by suite.

    Each item is passed through modify_pytest_item(). The items are then sorted
    so that suites keep their order of first appearance and, within a suite,
    files listed under ``run_first`` in the suite config come first.
    """
    def new_run_id_counter():
        # Counting starts at --run_id when given, otherwise at 1.
        return count(start=int(config.getoption("--run_id") or 1))

    run_ids = defaultdict(new_run_id_counter)
    for item in items:
        modify_pytest_item(item=item, run_ids=run_ids)

    # Maps each suite to the position at which it was first seen.
    suite_rank = {}

    def sort_key(item: pytest.Item) -> tuple[int, bool]:
        suite = item.stash[TEST_SUITE]
        rank = suite_rank.setdefault(suite, len(suite_rank))
        return rank, suite and item.path.stem not in suite.cfg.get("run_first", [])

    items.sort(key=sort_key)
def pytest_sessionstart(session: pytest.Session) -> None:
"""Prepare the test environment at session start when pytest itself is the test runner.

Nothing is prepared when TEST_RUNNER is not "pytest" or when only collection was requested.
"""
# test.py starts S3 mock and creates/cleans up testlog by itself. Also, if we run with the
# --collect-only option, we don't need this stuff.
global _system_resource_monitor
# Read every option needed below up-front.
gather_metrics = session.config.getoption("--gather-metrics")
temp_dir = pathlib.Path(session.config.getoption("--tmpdir")).absolute()
save_log_on_success = session.config.getoption("--save-log-on-success")
toxiproxy_byte_limit = session.config.getoption("--byte-limit")
collect_only = session.config.getoption("--collect-only")
if TEST_RUNNER != "pytest" or collect_only:
return
# Check if this is an xdist worker
is_xdist_worker = xdist.is_xdist_worker(request_or_session=session)
init_testsuite_globals()
# Register the hosts cleanup as an exit artifact.
TestSuite.artifacts.add_exit_artifact(None, TestSuite.hosts.cleanup)
# Run stuff just once for the main pytest process (not in xdist workers).
if not is_xdist_worker:
# NOTE(review): recomputes the same value temp_dir was already given above.
temp_dir = pathlib.Path(session.config.getoption("--tmpdir")).absolute()
prepare_environment(
tempdir_base=temp_dir,
modes=get_modes_to_run(session.config),
gather_metrics=gather_metrics,
save_log_on_success=save_log_on_success,
toxiproxy_byte_limit=toxiproxy_byte_limit,
)
if gather_metrics:
# In the master process, set up the cgroup hierarchy if test.py hasn't done it already.
# Workers inherit SCYLLA_TEST_CGROUP_BASE_ENV from the master via environment inheritance.
if not is_xdist_worker and SCYLLA_TEST_CGROUP_BASE_ENV not in os.environ:
setup_cgroup(is_required=True)
setup_worker_cgroup()
# The monitor is kept in a module global so that pytest_sessionfinish can stop it.
_system_resource_monitor = SystemResourceMonitor(temp_dir)
_system_resource_monitor.start()
@pytest.hookimpl(tryfirst=True)
def pytest_runtest_logreport(report):
    """Add custom XML attributes to JUnit testcase elements.

    This hook wraps the node_reporter's to_xml method to add custom attributes
    when the XML element is created. This approach works with pytest-xdist because
    it modifies the XML element directly when it's generated, rather than trying
    to modify attrs before finalize() is called.

    Attributes added:
    - function_path: The function path of the test case (excluding parameters).

    Uses tryfirst=True to run before LogXML's hook has created the node_reporter to avoid double recording.

    Does nothing when the pytest config has not been stored yet or when no
    JUnit XML reporter is registered in the config stash.
    """
    # Get the XML reporter
    config = _pytest_config
    if config is None:
        return
    xml = config.stash.get(xml_key, None)
    if xml is None:
        return

    node_reporter = xml.node_reporter(report)
    # Only wrap once to avoid multiple wrapping (check on the node_reporter object itself)
    if getattr(node_reporter, '__reporter_modified', False):
        return

    # Drop the trailing ".mode.run_id" suffix of the node id.
    base_id = report.nodeid.rsplit('.', 2)[0]
    # The parameter part starts at the first '[' after the file path. Cutting
    # there (rather than at the last '[') keeps parameter ids that contain '['
    # out of the result and leaves a '[' inside the file path untouched.
    file_part, separator, test_part = base_id.partition('::')
    if separator:
        function_path = f"test/{file_part}{separator}{test_part.partition('[')[0]}"
    else:
        function_path = f"test/{base_id.partition('[')[0]}"

    # Wrap the to_xml method to add custom attributes to the element
    original_to_xml = node_reporter.to_xml

    def custom_to_xml():
        """Wrapper that adds custom attributes to the testcase element."""
        element = original_to_xml()
        element.set("function_path", function_path)
        return element

    node_reporter.to_xml = custom_to_xml
    node_reporter.__reporter_modified = True
def pytest_sessionfinish(session: pytest.Session) -> None:
    """Stop monitoring, drop the log of a fully successful run and clean up the environment.

    The environment cleanup and the exit-status adjustment are skipped in xdist workers.
    """
    if _system_resource_monitor:
        _system_resource_monitor.stop()

    running_in_worker = xdist.is_xdist_worker(request_or_session=session)

    # When nothing failed, the log file is removed to save space and to avoid confusion with
    # logs of failed runs. This happens at the end of the session so that the complete log
    # is available for any failed test.
    if not (session.testsfailed or session.config.getoption("--save-log-on-success")):
        log_file = pathlib.Path(_pytest_config.stash[PYTEST_LOG_FILE])
        # The log file is only created on first write, so it may not exist at all.
        log_file.unlink(missing_ok=True)

    # Workers must not clean up; only the main process does.
    if running_in_worker:
        return

    # we only clean up when running with pure pytest
    artifacts = getattr(TestSuite, "artifacts", None)
    if artifacts is not None:
        asyncio.run(artifacts.cleanup_before_exit())

    # Use a dedicated exit status once the --maxfail limit has been reached, for easier detection in CI.
    max_failures = session.config.getoption("maxfail")
    if max_failures > 0 and session.testsfailed >= max_failures:
        session.exitstatus = EXIT_MAXFAIL_REACHED
def pytest_configure(config: pytest.Config) -> None:
    """Configure logging, validate option combinations and record host info.

    Raises:
        RuntimeError: on conflicting options (--exe-url with --exe-path, --mode
            with either of them, or --run_id together with --repeat).
    """
    global _pytest_config
    _pytest_config = config

    tmp_root = pathlib.Path(config.getoption("--tmpdir")).absolute()
    log_dir = tmp_root / PYTEST_LOG_FOLDER
    xdist_worker = os.environ.get("PYTEST_XDIST_WORKER")

    if xdist_worker is None:
        # Main process: make sure the log directory exists and drop logs of previous runs.
        log_dir.mkdir(parents=True, exist_ok=True)
        if not config.getoption("--save-log-on-success"):
            for stale_log in log_dir.glob("*"):
                # missing_ok guards against a race seen when framework tests are executed with
                # test.py (the wrong way to run them): test_no_bare_skip_markers_in_collection
                # runs a collection in a subprocess, especially with repeat.
                stale_log.unlink(missing_ok=True)
        log_name = f"pytest_main_{HOST_ID}.log"
    else:
        # Every xdist worker logs to its own file.
        log_name = f"pytest_{xdist_worker}_{HOST_ID}.log"
    config.stash[PYTEST_LOG_FILE] = f"{log_dir}/{log_name}"

    # Make the root logger write exclusively to the file.
    # logging.basicConfig() is a no-op when the root logger already has handlers
    # (e.g. added by pytest or any early import), which would leave a StreamHandler
    # in place and let all log records, including noisy third-party DEBUG messages
    # like urllib3.connectionpool or asyncio, appear on the terminal.
    root = logging.getLogger()
    for existing in list(root.handlers):
        root.removeHandler(existing)
        existing.close()
    log_handler = logging.FileHandler(config.stash[PYTEST_LOG_FILE])
    log_handler.setFormatter(logging.Formatter(config.getini("log_file_format")))
    root.addHandler(log_handler)
    root.setLevel(config.getini("log_file_level"))

    exe_url = config.getoption("--exe-url")
    exe_path = config.getoption("--exe-path")
    if exe_url and exe_path:
        raise RuntimeError("Can't use --exe-url and exe-path simultaneously.")
    if exe_path or exe_url:
        if config.getoption("--mode"):
            raise RuntimeError("Can't use --mode with --exe-path or --exe-url.")
        config.option.modes = ["custom_exe"]

    # Keep a seed that is already present in the environment; otherwise publish a random one.
    os.environ.setdefault("TOPOLOGY_RANDOM_FAILURES_TEST_SHUFFLE_SEED", str(random.randint(0, sys.maxsize)))

    config.build_modes = get_modes_to_run(config)

    if config.getoption("--run_id") and config.getoption("--repeat") != 1:
        raise RuntimeError("Can't use --run_id and --repeat simultaneously.")

    # Write host hardware info once, at the very start, before any test preparation.
    # Done unconditionally (not gated on --gather-metrics) so that the host_info FK
    # referenced by every other table is always populated, regardless of whether
    # full metrics collection is enabled.
    # Only in the main process: xdist workers share the same DB and the same host_id,
    # so there is nothing new to record.
    if xdist_worker is None and not config.getoption("--collect-only"):
        db_writer = SQLiteWriter(tmp_root / DEFAULT_DB_NAME)
        try:
            db_writer.write_row_if_not_exist(gather_host_info(), HOST_INFO_TABLE, id_column="host_id")
        finally:
            db_writer.close()
class DisabledFile(pytest.File):
    """File collector used in place of a file that has no enabled build mode."""

    def collect(self) -> list[pytest.Item]:
        """Skip instead of collecting anything from the file."""
        skip_reason = "All tests in this file are disabled in requested modes according to the suite config."
        pytest.skip(skip_reason)
@pytest.hookimpl(wrapper=True)
def pytest_collect_file(file_path: pathlib.Path,
                        parent: pytest.Collector) -> Generator[None, list[pytest.Collector], list[pytest.Collector]]:
    """Collect a test file once per enabled build mode and per --repeat iteration.

    When the wrapped hooks produced exactly one collector, the file is collected
    again through the same hook until there is one collector for every
    (mode, repeat) combination; each collector is tagged with its build mode and
    the suite config. A file with no enabled mode is replaced by a DisabledFile.
    """
    collectors = yield
    if len(collectors) != 1:
        return collectors

    in_progress = parent.stash.setdefault(REPEATING_FILES, set())
    if file_path in in_progress:
        # Nested call issued below for the same file: pass the result through.
        return collectors

    in_progress.add(file_path)

    modes = parent.config.build_modes
    suite_config = TestSuiteConfig.from_pytest_node(node=collectors[0])
    if suite_config:
        modes = [
            mode for mode in modes
            if not suite_config.is_test_disabled(build_mode=mode, path=file_path)
        ]

    repeat_count = parent.config.getoption("--repeat")
    mode_per_collector = [mode for mode in modes for _ in range(repeat_count)]

    if mode_per_collector:
        expanded = list(collectors)
        # One collector already exists, so only the missing ones are collected.
        for _ in range(len(mode_per_collector) - 1):
            expanded.extend(parent.ihook.pytest_collect_file(file_path=file_path, parent=parent))
        collectors = expanded
        for mode, collector in zip(mode_per_collector, collectors, strict=True):
            collector.stash[BUILD_MODE] = mode
            collector.stash[TEST_SUITE] = suite_config
    else:
        collectors = [DisabledFile.from_parent(parent=parent, path=file_path)]

    in_progress.remove(file_path)
    return collectors
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Record each phase report on the item and optionally dump it to a log file.

    The report of every phase is kept in item.stash[PHASE_REPORT_KEY][phase];
    `item.stash` is the same stash as `request.node.stash` in pytest fixtures.
    When the phase failed, or --save-log-on-success is set, the report text and
    its sections are appended to a per-test, per-phase log file.
    """
    outcome = yield
    report = outcome.get_result()

    # Keep the report of this phase for fixtures and hooks.
    phase_reports = item.stash.setdefault(PHASE_REPORT_KEY, {})
    phase_reports[report.when] = report

    if not _pytest_config:
        return
    if not (report.failed or _pytest_config.getoption("--save-log-on-success")):
        return

    logs_dir = pathlib.Path(_pytest_config.getoption("--tmpdir")).absolute() / PYTEST_TESTS_LOGS_FOLDER
    flat_node_id = item._nodeid.replace('::', '-').replace('/', '-')
    log_path = logs_dir / f"{flat_node_id}-{report.when}-{HOST_ID}.log"
    with open(log_path, 'a') as log_file:
        log_file.write(report.longreprtext + "\n")
        for section in report.sections:
            log_file.write(section[0] + "\n")
            log_file.write(section[1] + "\n")
class TestSuiteConfig:
    """Configuration of a test suite, loaded from the suite's YAML config file.

    Attributes:
        path: Directory that contains the config file.
        cfg: Parsed content of the config file; an empty dict for an empty file.
    """

    def __init__(self, config_file: pathlib.Path):
        """Load the suite configuration from *config_file*."""
        self.path = config_file.parent
        # yaml.safe_load() returns None for an empty document; fall back to an
        # empty mapping so that the ``self.cfg.get(...)`` lookups keep working.
        self.cfg = yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
        # Per-instance memo for disabled_tests(). Unlike functools.cache on the
        # method, it does not keep the instance alive in a global cache.
        self._disabled_tests_by_mode: dict[str, set[str]] = {}

    @cached_property
    def name(self) -> str:
        """Name of the suite directory."""
        return self.path.name

    @cached_property
    def _run_in_specific_mode(self) -> set[str]:
        """All tests listed under any ``run_in_<mode>`` key of the config."""
        return set(chain.from_iterable(self.cfg.get(f"run_in_{build_mode}", []) for build_mode in ALL_MODES))

    def disabled_tests(self, build_mode: str) -> set[str]:
        """Return the tests that must not run in *build_mode*.

        The result combines ``disable``, ``skip_in_<build_mode>``,
        ``skip_in_debug_modes`` (for debug modes) and every test pinned to
        other modes through ``run_in_<mode>``. It is computed once per mode.
        """
        try:
            return self._disabled_tests_by_mode[build_mode]
        except KeyError:
            pass

        result = set(self.cfg.get("disable", []))
        result.update(self.cfg.get(f"skip_in_{build_mode}", []))
        if build_mode in DEBUG_MODES:
            result.update(self.cfg.get("skip_in_debug_modes", []))
        # A test pinned to specific modes is disabled in every other mode.
        run_in_this_mode = set(self.cfg.get(f"run_in_{build_mode}", []))
        result.update(self._run_in_specific_mode - run_in_this_mode)

        self._disabled_tests_by_mode[build_mode] = result
        return result

    def is_test_disabled(self, build_mode: str, path: pathlib.Path) -> bool:
        """Tell whether the test file at *path* is disabled in *build_mode*.

        The file is identified by its path relative to the suite directory,
        without the file suffix.
        """
        return str(path.relative_to(self.path).with_suffix("")) in self.disabled_tests(build_mode=build_mode)

    @classmethod
    def from_pytest_node(cls, node: _pytest.nodes.Node) -> TestSuiteConfig | None:
        """Resolve the suite configuration that applies to *node*.

        A config file located at ``node.path`` wins. Otherwise the suite stashed
        on the parent node is reused, or resolved recursively through the
        parent. Options given with --extra-scylla-cmdline-options are merged
        into the suite's ``extra_scylla_cmdline_options``. The result (possibly
        None) is stored in ``node.stash`` and returned.

        Returns:
            The suite configuration, or None when no config file was found up
            to the root node.
        """
        config_file = node.path / TEST_CONFIG_FILENAME
        if config_file.is_file():
            suite = cls(config_file=config_file)
        else:
            if node.parent is None:
                return None
            suite = node.parent.stash.get(TEST_SUITE, None)
            if suite is None:
                suite = cls.from_pytest_node(node=node.parent)

        if suite:
            extra_opts = node.config.getoption("--extra-scylla-cmdline-options")
            if extra_opts:
                extra_cmd = suite.cfg.get('extra_scylla_cmdline_options', [])
                extra_cmd = merge_cmdline_options(extra_cmd, extra_opts.split())
                suite.cfg['extra_scylla_cmdline_options'] = extra_cmd

        node.stash[TEST_SUITE] = suite
        return suite
# Stash key under which the resolved suite configuration (or None) is stored on pytest nodes.
TEST_SUITE = pytest.StashKey[TestSuiteConfig | None]()
# Stash keys that modify_pytest_item copies from the file collector's stash onto each test item.
_STASH_KEYS_TO_COPY = BUILD_MODE, TEST_SUITE
def get_params_stash(node: _pytest.nodes.Node) -> pytest.Stash | None:
    """Return the stash of the closest ``pytest.File`` ancestor of *node*, or None if there is none."""
    file_node = node.getparent(cls=pytest.File)
    return file_node.stash if file_node is not None else None
def modify_pytest_item(item: pytest.Item, run_ids: defaultdict[tuple[str, str], count]) -> None:
    """Tag a collected item with its build mode and run id and apply mode-based markers.

    The build mode and suite are copied from the file collector's stash, a run
    id is drawn from *run_ids*, and ``.<mode>.<run_id>`` is appended to the
    item's node id and name. ``skip_mode`` marks matching the build mode (and
    platform, when given) are turned into skips. ``xfail`` tests get the
    ``nightly`` marker and ``perf``/``manual``/``unstable`` tests get the
    ``non_gating`` marker, unless already present.

    Args:
        item: The collected pytest item to modify in place.
        run_ids: Run-id counters keyed by (build mode, original node id).

    Raises:
        TypeError: when a ``skip_mode`` mark cannot be processed.
    """
    params_stash = get_params_stash(node=item)
    for key in _STASH_KEYS_TO_COPY:
        item.stash[key] = params_stash[key]
    build_mode = item.stash[BUILD_MODE]

    # The counter is keyed by the node id as it was before the suffix is appended.
    item.stash[RUN_ID] = next(run_ids[(build_mode, item._nodeid)])

    suffix = f".{build_mode}.{item.stash[RUN_ID]}"
    item._nodeid = f"{item._nodeid}{suffix}"
    item.name = f"{item.name}{suffix}"

    # The parameter names are the keyword arguments accepted by the skip_mode mark.
    def skip_in_mode(mode, reason, platform_key=None):
        modes = [mode] if isinstance(mode, str) else mode
        for candidate in modes:
            if candidate == build_mode and (platform_key is None or platform_key in platform.platform()):
                skip_marker(item, reason, skip_type="mode")

    # Materialize the marks first: processing them may add markers to the item.
    for mark in list(item.iter_markers("skip_mode")):
        try:
            skip_in_mode(*mark.args, **mark.kwargs)
        except TypeError as e:
            raise TypeError(f"Failed to process skip_mode mark, {mark} for test {item}, error {e}") from e

    if item.get_closest_marker("xfail") is not None and item.get_closest_marker("nightly") is None:
        item.add_marker(pytest.mark.nightly)

    if (any(mark.name in ("perf", "manual", "unstable") for mark in item.iter_markers())
            and item.get_closest_marker("non_gating") is None):
        item.add_marker(pytest.mark.non_gating)