Merge 'test/cluster/dtest: fix ScyllaNode state not persisting across nodelist() calls' from Benny Halevy

`ScyllaCluster.nodelist()` creates new `ScyllaNode` objects on every call,
so per-node state set via `set_smp()`, `set_log_level()`, and
`_adjust_smp_and_memory()` was lost. This meant `set_smp()` had no effect
when `cluster.start()` was called after it, since `start_nodes()` calls
`nodelist()` internally which creates fresh nodes with default values.

- Add debug logging for smp/memory in ScyllaNode
- Store per-node settings (smp, memory, log levels) in a
  `ScyllaCluster._node_resources` dict keyed by server_id, so they survive
  `nodelist()` reconstruction. `ScyllaNode` restores its state from this dict
  on construction and saves it back whenever `set_smp()`, `set_log_level()`,
  or `_adjust_smp_and_memory()` modifies it.
- Add a reproducer test verifying `set_smp()` takes effect on restart

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1629

--

No backport needed: this only fixes dtest infrastructure, no production code
is affected.

Closes scylladb/scylladb#29549

* github.com:scylladb/scylladb:
  test/cluster/dtest: add test for node.set_smp() persistence
  test/cluster/dtest: cache ScyllaNode instances in ScyllaCluster
  test/cluster/dtest/ccmlib/scylla_node: add debug logging
This commit is contained in:
Botond Dénes
2026-04-29 06:25:36 +03:00
3 changed files with 74 additions and 14 deletions

View File

@@ -11,13 +11,11 @@ from typing import TYPE_CHECKING
from cassandra.auth import PlainTextAuthProvider
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.cluster.dtest.ccmlib.common import logger
from test.cluster.dtest.ccmlib.scylla_node import ScyllaNode
if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Any
@@ -29,6 +27,10 @@ class ScyllaCluster:
self.manager = manager
self.scylla_mode = scylla_mode
self._config_options = {}
# Cached ScyllaNode instances. Nodes are appended by _add_nodes()
# in the order they are created by servers_add().
self._nodes: list[ScyllaNode] = []
self._next_node_num: int = 1
if self.scylla_mode == "debug":
self.default_wait_other_notice_timeout = 600
@@ -39,19 +41,20 @@ class ScyllaCluster:
self.force_wait_for_cluster_start = force_wait_for_cluster_start
@staticmethod
def _sorted_nodes(servers: Iterable[ServerInfo]) -> list[ServerInfo]:
return sorted(servers, key=lambda s: s.server_id)
def _add_nodes(self, servers: list) -> None:
"""Create ScyllaNode instances for the given servers and cache them."""
for server in servers:
name = f"node{self._next_node_num}"
self._next_node_num += 1
self._nodes.append(ScyllaNode(
cluster=self, server=server, name=name))
@property
def nodes(self) -> dict[str, ScyllaNode]:
return {node.name: node for node in self.nodelist()}
def nodelist(self) -> list[ScyllaNode]:
return [
ScyllaNode(cluster=self, server=server, name=f"node{n}")
for n, server in enumerate(self._sorted_nodes(self.manager.all_servers()), start=1)
]
return list(self._nodes)
def get_node_ip(self, nodeid: int) -> str:
return self.nodelist()[nodeid-1].address()
@@ -61,16 +64,16 @@ class ScyllaCluster:
self.manager.auth_provider = PlainTextAuthProvider(username="cassandra", password="cassandra")
match nodes:
case int():
self.manager.servers_add(servers_num=nodes, config=self._config_options, start=False, auto_rack_dc="dc1")
self._add_nodes(self.manager.servers_add(servers_num=nodes, config=self._config_options, start=False, auto_rack_dc="dc1"))
case list():
for dc, n_nodes in enumerate(nodes, start=1):
dc_name = f"dc{dc}"
self.manager.servers_add(
self._add_nodes(self.manager.servers_add(
servers_num=n_nodes,
config=self._config_options,
start=False,
auto_rack_dc=dc_name
)
))
case dict():
# Supported spec: {"dc1": {"rack1": 3, "rack2": 2}, "dc2": {"rack1": 2}}
for dc, dc_nodes in nodes.items():
@@ -79,7 +82,7 @@ class ScyllaCluster:
for rack, rack_nodes in dc_nodes.items():
if not isinstance(rack_nodes, int):
raise RuntimeError(f"Unsupported topology specification: {nodes}")
self.manager.servers_add(
self._add_nodes(self.manager.servers_add(
servers_num=rack_nodes,
config=self._config_options,
property_file={
@@ -87,7 +90,7 @@ class ScyllaCluster:
"rack": rack,
},
start=False,
)
))
case _:
raise RuntimeError(f"Unsupported topology specification: {nodes}")

View File

@@ -17,6 +17,7 @@ from itertools import chain
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, Any
import logging
from test.cluster.dtest.ccmlib.common import ArgumentError, wait_for, BIN_DIR
from test.pylib.internal_types import ServerUpState
@@ -28,6 +29,9 @@ if TYPE_CHECKING:
from test.cluster.dtest.ccmlib.scylla_cluster import ScyllaCluster
logger = logging.getLogger("scylla_node")
NODETOOL_STDERR_IGNORED_PATTERNS = (
re.compile(r"WARNING: debug mode. Not for benchmarking or production"),
re.compile(
@@ -149,15 +153,20 @@ class ScyllaNode:
return self.cluster.scylla_mode
def set_smp(self, smp: int) -> None:
logger.debug(f"Setting smp: {self=} {smp=}")
self._smp_set_during_test = smp
def smp(self) -> int:
logger.debug(f"Getting smp: {self=} _smp_set_during_test={self._smp_set_during_test} _smp={self._smp} {DEFAULT_SMP=}")
return self._smp_set_during_test or self._smp or DEFAULT_SMP
def memory(self) -> int:
return self._memory or self.smp() * DEFAULT_MEMORY_PER_CPU
def _adjust_smp_and_memory(self, smp: int | None = None, memory: int | None = None) -> None:
if not memory and not smp:
return
logger.debug(f"Adjusting smp={smp} memory={memory} current_smp={self._smp} current_memory={self._memory}")
if memory:
self._memory = memory // (smp or self.smp()) * self.smp()
if smp:
@@ -446,6 +455,8 @@ class ScyllaNode:
self.mark = self.mark_log()
logger.debug(f"Starting server: server_id={self.server_id} {scylla_args=} {scylla_env=}")
self.cluster.manager.server_start(
server_id=self.server_id,
seeds=None if self.bootstrap else [self.address()],

View File

@@ -0,0 +1,46 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#
import logging
import pytest
from dtest_class import Tester
logger = logging.getLogger(__file__)
@pytest.mark.single_node
class TestSetSmp(Tester):
"""Test that node.set_smp() properly persists across restarts."""
def _get_smp_from_log(self, node, from_mark=None):
"""Extract smp value from the node's log by looking at the SHARD_COUNT gossip value."""
matches = node.grep_log(r"SHARD_COUNT : Value\((\d+),\d+\)", from_mark=from_mark)
assert matches, "Could not find SHARD_COUNT in node log"
# Return the last match (most recent start)
return int(matches[-1][1].group(1))
def test_set_smp(self):
"""Verify that set_smp() takes effect on the next start."""
cluster = self.cluster
cluster.populate(1).start(wait_for_binary_proto=True)
node1 = cluster.nodelist()[0]
default_smp = self._get_smp_from_log(node1)
cluster.stop()
# set_smp to a different value and restart without jvm_args
target_smp = 1 if default_smp != 1 else 2
node1.set_smp(target_smp)
mark = node1.mark_log()
cluster.start(wait_for_binary_proto=True)
node1 = cluster.nodelist()[0]
actual_smp = self._get_smp_from_log(node1, from_mark=mark)
assert actual_smp == target_smp, \
f"Expected smp={target_smp} after set_smp({target_smp}), got {actual_smp}"