scylladb/test/pylib/resource_gather.py

#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#

from __future__ import annotations

import getpass
import logging
import os
import platform
import shlex
import subprocess
import time
from abc import ABC
from concurrent.futures.thread import ThreadPoolExecutor
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from time import sleep
from types import SimpleNamespace
from typing import TYPE_CHECKING

import psutil

from threading import Event
from test import HOST_ID, TOP_SRC_DIR
from test.pylib.db.model import HostInfo, Metric, SystemResourceMetric, CgroupMetric, Test
from test.pylib.db.writer import (
    CGROUP_MEMORY_METRICS_TABLE,
    DEFAULT_DB_NAME,
    HOST_INFO_TABLE,
    METRICS_TABLE,
    SYSTEM_RESOURCE_METRICS_TABLE,
    TESTS_TABLE,
    SQLiteWriter,
)

if TYPE_CHECKING:
    from typing import IO, TextIO

    from test.pylib.suite.base import Test as TestPyTest

logger = logging.getLogger(__name__)


def get_current_cgroup(cgroup_file: str | Path = "/proc/self/cgroup") -> Path:
    """Return the cgroup v2 directory of this process under ``/sys/fs/cgroup``.

    Each line of the membership file has the form
    ``hierarchy-ID:controller-list:cgroup-path``. The unified (v2) hierarchy is
    the entry with hierarchy ID ``0`` and an empty controller list, i.e. the
    line starting with ``0::``. On hybrid hosts the file also lists v1
    hierarchies, so the v2 entry is looked up explicitly instead of trusting
    the line order. If there is no v2 entry, the first entry is used.

    Args:
        cgroup_file: Membership file to parse. Defaults to the file of the
            calling process.

    Returns:
        The cgroup path of the selected entry, rooted at ``/sys/fs/cgroup``.

    Raises:
        OSError: If ``cgroup_file`` cannot be read.
        IndexError: If ``cgroup_file`` contains no entries.
    """
    with open(cgroup_file, 'r') as f:
        entries = [line.strip() for line in f if line.strip()]
    unified = next((entry for entry in entries if entry.startswith("0::")), None)
    selected = unified if unified is not None else entries[0]
    # Split at most twice: the cgroup path itself may contain ':'.
    cgroup_path = selected.split(':', 2)[-1]
    return Path(f"/sys/fs/cgroup/{cgroup_path}")

# Name of the environment variable through which the base cgroup path is
# handed over to subprocesses (see get_cgroup()).
SCYLLA_TEST_CGROUP_BASE_ENV = 'SCYLLA_TEST_CGROUP_BASE'


@lru_cache(maxsize=None)
def get_cgroup() -> Path:
    """Return the path of the ``resource_gather`` cgroup for this test run.

    The base is taken from the ``SCYLLA_TEST_CGROUP_BASE`` environment variable
    when it is set and non-empty, otherwise from the cgroup the current process
    lives in. ``resource_gather`` is appended unless the base already ends with
    that component.

    The result is cached for the lifetime of the process, so later changes to
    the environment variable are not picked up.
    """
    # Use the env var when set so that xdist worker subprocesses (which are spawned
    # after the master has moved itself to tests/master/default) still compute the
    # correct top-level cgroup path rather than one nested inside the master's cgroup.
    env_val = os.environ.get(SCYLLA_TEST_CGROUP_BASE_ENV)
    base = Path(env_val) if env_val else get_current_cgroup()
    # Compare the whole final component: ``stem`` would drop a suffix and
    # mistake e.g. ``resource_gather.scope`` for the ``resource_gather`` group.
    if base.name != 'resource_gather':
        base = base / 'resource_gather'
    return base


# The ``resource_gather`` cgroup, resolved once when this module is imported.
CGROUP_INITIAL = get_cgroup()
# Sibling of CGROUP_INITIAL; the worker cgroups are created underneath it.
CGROUP_TESTS = CGROUP_INITIAL.parent.joinpath('tests')


class ResourceGather(ABC):
    """Common interface of the resource gatherers.

    Every hook below is a no-op that returns ``None``; subclasses override the
    ones they need.
    """

    def setup_test_tracking(self) -> None:
        """Hook called to start tracking a test; does nothing here."""

    def put_process_to_cgroup(self) -> None:
        """Hook for placing a process into a cgroup; does nothing here."""

    def get_test_metrics(self) -> Metric:
        """Hook for collecting the metrics of a test; returns ``None`` here."""

    def write_metrics_to_db(self, metrics: Metric, success: bool = False) -> None:
        """Hook for persisting ``metrics``; does nothing here."""

    def teardown_test_tracking(self) -> None:
        """Hook called to finish tracking a test; does nothing here."""

    def stop_monitoring(self) -> None:
        """Hook for stopping background monitoring; does nothing here."""

    def cgroup_monitor(self) -> None:
        """Hook for starting background cgroup monitoring; does nothing here."""


class ResourceGatherRecord(ResourceGather):
    """Gatherer that only stores the test record and its timing in the DB.

    It performs no cgroup operations. It is used when --gather-metrics is OFF,
    so that every test still shows up in the tests table.
    """

    def __init__(self, temp_dir: Path, test: TestPyTest | SimpleNamespace, worker_id: str | None = None):
        """Open the DB located in ``temp_dir`` and register ``test`` in the tests table.

        ``worker_id`` defaults to "master" when it is not given (or empty).
        """
        self.test = test
        self.worker_id = worker_id if worker_id else "master"
        self.db_path = temp_dir / DEFAULT_DB_NAME
        self.sqlite_writer = SQLiteWriter(self.db_path)
        self.logger = logging.getLogger(__name__)

        # The suite directory is stored relative to the top of the source tree.
        relative_suite_dir = str(test.suite.suite_path.relative_to(TOP_SRC_DIR))
        test_row = Test(
            host_id=HOST_ID,
            architecture=platform.machine(),
            path=relative_suite_dir,
            file=test.suite.test_file_name,
            mode=test.mode,
            run_id=test.id,
            test_name=test.shortname,
        )
        self.test_id: int = self.sqlite_writer.write_row_if_not_exist(test_row, TESTS_TABLE)

    def get_test_metrics(self) -> Metric:
        """Return a Metric filled with the timing and the result of the test."""
        record = Metric(test_id=self.test_id, host_id=HOST_ID, worker_id=self.worker_id)
        finished_at = self.test.time_end
        started_at = self.test.time_start
        record.time_taken = finished_at - started_at
        record.time_start = datetime.fromtimestamp(started_at)
        record.time_end = datetime.fromtimestamp(finished_at)
        record.success = self.test.success
        return record

    def write_metrics_to_db(self, metrics: Metric, success: bool = False) -> None:
        """Store ``metrics`` in the metrics table.

        ``metrics.success`` is overwritten with ``success`` before the row is written.
        """
        metrics.success = success
        self.sqlite_writer.write_row(metrics, METRICS_TABLE)

    def teardown_test_tracking(self) -> None:
        """Close the DB writer owned by this gatherer."""
        self.sqlite_writer.close()


class ResourceGatherOn(ResourceGatherRecord):
    """Resource gatherer that tracks worker-level cgroup memory and CPU metrics.

    Uses the worker's cgroup (CGROUP_TESTS/{worker_id}) which hierarchically includes
    all Scylla node processes running under that worker, giving accurate memory readings.

    Metric collection is best effort: a cgroup file that is missing or cannot
    be read leaves the corresponding metric unset instead of failing the test.
    """

    # Maps cpu.stat keys to Metric attribute names. Values in cpu.stat are in
    # microseconds; _read_cpu_stat() converts them to seconds.
    _CPU_STAT_FIELDS = {
        'user_usec': 'user_sec',
        'system_usec': 'system_sec',
        'usage_usec': 'usage_sec',
    }

    def __init__(self, temp_dir: Path, test: TestPyTest | SimpleNamespace, worker_id: str | None = None):
        """Register the test (see the base class) and prepare the monitoring state."""
        super().__init__(temp_dir, test, worker_id)
        # Single-thread pool that runs _monitor_cgroup() in the background.
        self.pool = ThreadPoolExecutor(max_workers=1)
        self.future = None
        self.stop_event = Event()
        self.cgroup_path = CGROUP_TESTS / self.worker_id
        # Handle on memory.peak that stays open for the duration of the test.
        self._memory_peak_fd: IO | None = None
        # cpu.stat counters (in seconds) taken when the test started.
        self._cpu_stat_start: dict[str, float] | None = None

    def stop_monitoring(self) -> None:
        """Stop the background memory monitor and wait for it to finish.

        An exception raised by the monitor is re-raised here; the thread pool
        is shut down in that case as well.
        """
        self.stop_event.set()
        if self.future is None:
            return
        try:
            self.future.result()
        finally:
            self.pool.shutdown(wait=True)

    def cgroup_monitor(self) -> None:
        """Start the background memory monitor."""
        self.future = self.pool.submit(self._monitor_cgroup)

    def _monitor_cgroup(self) -> None:
        """Continuously monitors cgroup memory utilization every second."""
        memory_current = self.cgroup_path / 'memory.current'
        # The writer is created here so that it is used only by the monitoring thread.
        sqlite_writer = SQLiteWriter(self.db_path)
        try:
            while not self.stop_event.is_set():
                try:
                    timeline_record = CgroupMetric(
                        test_id=self.test_id,
                        host_id=HOST_ID,
                        memory=int(memory_current.read_text().strip()),
                        timestamp=datetime.now()
                    )
                    sqlite_writer.write_row(timeline_record, CGROUP_MEMORY_METRICS_TABLE)
                except Exception as e:
                    self.logger.debug(f"Could not read cgroup memory for {self.cgroup_path}: {e}")
                # Sleep for a second, but wake up immediately when asked to stop.
                self.stop_event.wait(1)
        finally:
            sqlite_writer.close()

    def setup_test_tracking(self) -> None:
        """Start the per-test accounting of peak memory and CPU time."""
        # Release a handle left over from a previous call so it is not leaked.
        self._close_memory_peak_fd()
        self._memory_peak_fd = self._open_memory_peak()

        # Snapshot cpu.stat at the start of the test. Unlike memory.peak, cpu.stat
        # has no per-FD reset mechanism — values are cumulative for the cgroup's
        # lifetime. We subtract this snapshot from the end-of-test reading to get
        # per-test CPU usage.
        self._cpu_stat_start = self._snapshot_cpu_stat()

    def get_test_metrics(self) -> Metric:
        """Return the timing metrics extended with peak memory and CPU usage.

        ``memory_peak`` is set when memory.peak could be opened and read. The
        CPU attributes named in _CPU_STAT_FIELDS are set to the difference
        between the current cpu.stat counters and the snapshot taken by
        setup_test_tracking(), when both are available.
        """
        test_metrics = super().get_test_metrics()
        if self._memory_peak_fd is not None:
            try:
                self._memory_peak_fd.seek(0)
                # The handle may be binary or text; int() accepts both.
                test_metrics.memory_peak = int(self._memory_peak_fd.read().strip())
            except Exception as e:
                self.logger.warning(f"Could not read memory.peak for {self.cgroup_path}: {e}")

        if self._cpu_stat_start is not None:
            cpu_stat_end = self._snapshot_cpu_stat()
            if cpu_stat_end is not None:
                for stat, attr in self._CPU_STAT_FIELDS.items():
                    start_val = self._cpu_stat_start.get(stat, 0.0)
                    end_val = cpu_stat_end.get(stat, 0.0)
                    setattr(test_metrics, attr, end_val - start_val)

        return test_metrics

    def teardown_test_tracking(self) -> None:
        """Release the per-test state and close the DB writer."""
        self._close_memory_peak_fd()
        self._cpu_stat_start = None
        super().teardown_test_tracking()

    def _open_memory_peak(self) -> IO | None:
        """Open memory.peak and reset the peak tracked for the returned handle.

        Opening the file is not enough to get a per-test value: the kernel
        resets the peak only for a file descriptor that a non-empty string has
        been written to, and only reads through that same descriptor return the
        peak since the reset. When the file cannot be opened for writing or the
        write is rejected (memory.peak is read-only on kernels without the reset
        feature), a read-only handle is returned instead; it reports the peak
        since the cgroup was created.

        Returns None when memory.peak does not exist or cannot be opened.
        """
        memory_peak_path = self.cgroup_path / 'memory.peak'
        if not memory_peak_path.exists():
            return None

        try:
            # Unbuffered, so the reset is a single write() on this descriptor and
            # a rejected write leaves nothing behind to be flushed by close().
            fd = open(memory_peak_path, 'r+b', buffering=0)
        except OSError as e:
            self.logger.debug(f"Could not open memory.peak for writing for {self.cgroup_path}: {e}")
        else:
            try:
                fd.write(b'reset\n')
                return fd
            except OSError as e:
                fd.close()
                self.logger.debug(f"Could not reset memory.peak for {self.cgroup_path}: {e}")

        try:
            return open(memory_peak_path, 'r')
        except OSError as e:
            self.logger.warning(f"Could not open memory.peak for {self.cgroup_path}: {e}")
            return None

    def _close_memory_peak_fd(self) -> None:
        """Close the memory.peak handle, if one is open."""
        if self._memory_peak_fd is not None:
            self._memory_peak_fd.close()
            self._memory_peak_fd = None

    def _snapshot_cpu_stat(self) -> dict[str, float] | None:
        """Read the current cpu.stat counters, or return None when unavailable."""
        try:
            with open(self.cgroup_path / 'cpu.stat', 'r') as f:
                return self._read_cpu_stat(f)
        except FileNotFoundError:
            # No cpu.stat in this cgroup: there is nothing to report.
            return None
        except (OSError, ValueError) as e:
            self.logger.warning(f"Could not read cpu.stat for {self.cgroup_path}: {e}")
            return None

    @staticmethod
    def _read_cpu_stat(file: TextIO) -> dict[str, float]:
        """Read cpu.stat and return the relevant counters converted to seconds.

        Only the keys listed in _CPU_STAT_FIELDS are returned; lines that do not
        have the ``key value`` form are skipped.

        Raises:
            ValueError: If the value of a relevant counter is not a number.
        """
        result: dict[str, float] = {}
        for line in file:
            parts = line.split(' ', 1)
            if len(parts) == 2 and parts[0] in ResourceGatherOn._CPU_STAT_FIELDS:
                result[parts[0]] = float(parts[1]) / 1_000_000
        return result


def gather_host_info() -> HostInfo:
    """Build the HostInfo record with the static hardware data of this host.

    The CPU model is the first "model name" entry of /proc/cpuinfo, or
    "unknown" when the file has no such entry. When /proc/cpuinfo cannot be
    read, platform.processor() is used instead, again defaulting to "unknown".
    """
    try:
        with open("/proc/cpuinfo") as cpuinfo:
            model_names = (
                entry.split(":", 1)[1].strip()
                for entry in cpuinfo
                if entry.startswith("model name")
            )
            cpu_model = next(model_names, "unknown")
    except OSError:
        cpu_model = platform.processor() or "unknown"

    # Physical cores first, then whatever os.cpu_count() reports, then 0.
    physical_cores = psutil.cpu_count(logical=False)
    cpu_cores = physical_cores or os.cpu_count() or 0
    total_ram = psutil.virtual_memory().total
    return HostInfo(
        host_id=HOST_ID,
        cpu_model=cpu_model,
        cpu_cores=cpu_cores,
        ram_bytes=total_ram,
    )


def get_resource_gather(temp_dir: Path, is_switched_on: bool, test: TestPyTest | SimpleNamespace, worker_id: str | None = None) -> ResourceGather:
    """Create the resource gatherer for ``test``.

    ``is_switched_on`` selects ResourceGatherOn (cgroup metrics) over
    ResourceGatherRecord (test record and timing only). Both variants create
    the test record in the DB.
    """
    gatherer_cls = ResourceGatherOn if is_switched_on else ResourceGatherRecord
    return gatherer_cls(temp_dir, test, worker_id)


def _is_cgroup_rw() -> bool:
    with open('/proc/mounts', 'r') as f:
        for line in f.readlines():
            if 'cgroup2' in line:
                options = line.split(' ')[3].split(',')
                return 'rw' in options
    return False

def propagate_subtree_controls(group: Path):
    """Enable every controller available in ``group`` for its child cgroups.

    Reads the first line of ``cgroup.controllers`` and writes each listed
    controller, prefixed with ``+``, to ``cgroup.subtree_control``. Nothing is
    written when no controller is listed.
    """
    with (group / 'cgroup.controllers').open('r') as source:
        available = source.readline().strip()
    if available == "":
        return
    enable_request = " ".join(f"+{name}" for name in available.split(" "))
    with (group / 'cgroup.subtree_control').open('w') as target:
        target.write(enable_request)


def setup_cgroup(is_required: bool) -> None:
    """Prepare the cgroup hierarchy used for resource gathering.

    Does nothing unless ``is_required`` is true. Otherwise it:

    1. exports the base cgroup path through SCYLLA_TEST_CGROUP_BASE_ENV;
    2. inside Docker, makes /sys/fs/cgroup writable and owned by the current user;
    3. creates CGROUP_INITIAL and CGROUP_TESTS and, when neither of them
       existed before, moves the processes of the parent cgroup into
       CGROUP_INITIAL and enables the controllers for the parent's children;
    4. enables the controllers for the children of CGROUP_TESTS.

    Raises:
        subprocess.CalledProcessError: If a ``sudo`` command fails.
        OSError: If a cgroup file or directory cannot be accessed.
    """
    if not is_required:
        return

    # Export the cgroup base path as an env var so that xdist worker subprocesses
    # inherit it. Workers are spawned after the master has already moved itself into
    # tests/master/default, so without this env var they would compute CGROUP_INITIAL
    # relative to that nested cgroup instead of the original top-level scope.
    os.environ[SCYLLA_TEST_CGROUP_BASE_ENV] = str(CGROUP_INITIAL.parent)

    # /.dockerenv is the marker file used here to detect a Docker container.
    is_docker = os.access("/.dockerenv", os.F_OK)

    if is_docker:
        # The hierarchy has to be writable before its ownership can be changed,
        # so remount it only when it is NOT already mounted read-write.
        if not _is_cgroup_rw():
            subprocess.run(
                [
                    "sudo",
                    "mount",
                    "-o",
                    "remount,rw",
                    "/sys/fs/cgroup",
                ],
                check=True,
            )

        user = getpass.getuser()
        cmd = ["sudo", "chown", "-R", f"{user}:{user}", '/sys/fs/cgroup']
        subprocess.run(cmd, check=True)

    # If either directory is already there, an earlier run has set the
    # hierarchy up and the processes must not be moved again.
    configured = False
    for directory in [CGROUP_INITIAL, CGROUP_TESTS]:
        if not directory.exists():
            directory.mkdir()
        else:
            configured = True

    if not configured:
        with open(CGROUP_INITIAL.parent / 'cgroup.procs') as f:
            processes = [line.strip() for line in f if line.strip()]

        # The parent has to be emptied first: cgroup v2 does not allow enabling
        # controllers for the children of a cgroup that still holds processes.
        for process in processes:
            try:
                # cgroup.procs accepts a single PID per write, hence one open per process.
                with open(CGROUP_INITIAL / 'cgroup.procs', "w") as f:
                    f.write(process)
            except ProcessLookupError:
                # The process exited after the list was read; nothing left to move.
                logger.debug(f"Process {process} exited before it could be moved to {CGROUP_INITIAL}")

        propagate_subtree_controls(CGROUP_INITIAL.parent)

    # Always ensure CGROUP_TESTS has subtree controls enabled so that worker
    # sub-cgroups and per-test cgroups can use memory tracking.
    propagate_subtree_controls(CGROUP_TESTS)


def setup_worker_cgroup() -> None:
    """Create the cgroup of the current worker and move this process into it.

    The worker is identified by its xdist worker id, or "master" when there is
    none. Two cgroups are created if missing: CGROUP_TESTS/{worker_id} and its
    ``default`` child. Failing to move the process is logged as a warning and
    otherwise ignored.
    """
    from test.pylib.util import get_xdist_worker_id

    xdist_id = get_xdist_worker_id()
    worker_id = xdist_id if xdist_id else "master"

    # CGROUP_TESTS is created by the master, so give it some time to appear
    # (10 checks, half a second apart) instead of racing with the master.
    checks_left = 10
    while checks_left > 0 and not CGROUP_TESTS.exists():
        time.sleep(0.5)
        checks_left -= 1

    worker_cgroup = CGROUP_TESTS / worker_id
    leaf_cgroup = worker_cgroup / 'default'
    if not worker_cgroup.exists():
        worker_cgroup.mkdir()
    if not leaf_cgroup.exists():
        leaf_cgroup.mkdir()
    propagate_subtree_controls(worker_cgroup)

    # Move the current worker process into the worker's default leaf cgroup.
    # Scylla processes spawned by the test (via ScyllaClusterManager) will inherit
    # this cgroup. The worker-level cgroup (CGROUP_TESTS/{worker_id}) is used for
    # hierarchical memory monitoring and captures all descendant processes.
    try:
        (leaf_cgroup / 'cgroup.procs').write_text(str(os.getpid()))
    except Exception as error:
        logger.warning(f"Could not move worker process to cgroup {leaf_cgroup}: {error}")


class SystemResourceMonitor:
    """Continuously monitors CPU and memory utilization.

    A background thread samples the host through psutil and writes one
    SystemResourceMetric row per sample to the DB located in ``tmpdir``.
    """

    def __init__(self, tmpdir: Path):
        """Prepare the monitor; sampling begins only when start() is called."""
        self.tmpdir = tmpdir
        self.stop_event = Event()
        # Single-thread pool that runs _monitor_resources() in the background.
        self.thread = ThreadPoolExecutor(max_workers=1)

    def start(self) -> None:
        """Start sampling in the background."""
        self.thread.submit(self._monitor_resources, self.tmpdir)

    def stop(self) -> None:
        """Ask the sampling loop to stop and wait for it to finish."""
        self.stop_event.set()
        self.thread.shutdown(wait=True)

    def _monitor_resources(self, tmpdir: Path) -> None:
        """Sample CPU and memory usage until stop() is called."""
        # The writer is created here so that it is used only by the monitoring thread.
        sqlite_writer = SQLiteWriter(tmpdir / DEFAULT_DB_NAME)
        try:
            while not self.stop_event.is_set():
                vm = psutil.virtual_memory()
                timeline_record = SystemResourceMetric(
                    host_id=HOST_ID,
                    # Blocks for 0.1 s to measure the CPU usage over that interval.
                    cpu=psutil.cpu_percent(interval=0.1),
                    memory_free=vm.free,
                    memory_available=vm.available,
                    memory_used=vm.used,
                    memory_active=vm.active,
                    memory_inactive=vm.inactive,
                    memory_buffers=vm.buffers,
                    timestamp=datetime.now(),
                )
                sqlite_writer.write_row(timeline_record, SYSTEM_RESOURCE_METRICS_TABLE)

                # Control the frequency of updates: one sample every 2 seconds.
                # Waiting on the event instead of sleeping lets stop() interrupt
                # the pause, so shutdown does not have to sit out the full period.
                self.stop_event.wait(2)
        finally:
            sqlite_writer.close()