mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-30 05:07:05 +00:00
Merge 'test/cluster/dtest: fix ScyllaNode state not persisting across nodelist() calls' from Benny Halevy
`ScyllaCluster.nodelist()` creates new `ScyllaNode` objects on every call, so per-node state set via `set_smp()`, `set_log_level()`, and `_adjust_smp_and_memory()` was lost. This meant `set_smp()` had no effect when `cluster.start()` was called after it, since `start_nodes()` calls `nodelist()` internally which creates fresh nodes with default values. - Add debug logging for smp/memory in ScyllaNode - Store per-node settings (smp, memory, log levels) in a `ScyllaCluster._node_resources` dict keyed by server_id, so they survive `nodelist()` reconstruction. `ScyllaNode` restores its state from this dict on construction and saves it back whenever `set_smp()`, `set_log_level()`, or `_adjust_smp_and_memory()` modifies it. - Add a reproducer test verifying `set_smp()` takes effect on restart Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1629 -- No backport needed: this only fixes dtest infrastructure, no production code is affected. Closes scylladb/scylladb#29549 * github.com:scylladb/scylladb: test/cluster/dtest: add test for node.set_smp() persistence test/cluster/dtest: cache ScyllaNode instances in ScyllaCluster test/cluster/dtest/ccmlib/scylla_node: add debug logging
This commit is contained in:
@@ -11,13 +11,11 @@ from typing import TYPE_CHECKING
|
||||
|
||||
from cassandra.auth import PlainTextAuthProvider
|
||||
|
||||
from test.pylib.internal_types import ServerInfo
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.dtest.ccmlib.common import logger
|
||||
from test.cluster.dtest.ccmlib.scylla_node import ScyllaNode
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
from typing import Any
|
||||
|
||||
|
||||
@@ -29,6 +27,10 @@ class ScyllaCluster:
|
||||
self.manager = manager
|
||||
self.scylla_mode = scylla_mode
|
||||
self._config_options = {}
|
||||
# Cached ScyllaNode instances. Nodes are appended by _add_nodes()
|
||||
# in the order they are created by servers_add().
|
||||
self._nodes: list[ScyllaNode] = []
|
||||
self._next_node_num: int = 1
|
||||
|
||||
if self.scylla_mode == "debug":
|
||||
self.default_wait_other_notice_timeout = 600
|
||||
@@ -39,19 +41,20 @@ class ScyllaCluster:
|
||||
|
||||
self.force_wait_for_cluster_start = force_wait_for_cluster_start
|
||||
|
||||
@staticmethod
|
||||
def _sorted_nodes(servers: Iterable[ServerInfo]) -> list[ServerInfo]:
|
||||
return sorted(servers, key=lambda s: s.server_id)
|
||||
def _add_nodes(self, servers: list) -> None:
|
||||
"""Create ScyllaNode instances for the given servers and cache them."""
|
||||
for server in servers:
|
||||
name = f"node{self._next_node_num}"
|
||||
self._next_node_num += 1
|
||||
self._nodes.append(ScyllaNode(
|
||||
cluster=self, server=server, name=name))
|
||||
|
||||
@property
|
||||
def nodes(self) -> dict[str, ScyllaNode]:
|
||||
return {node.name: node for node in self.nodelist()}
|
||||
|
||||
def nodelist(self) -> list[ScyllaNode]:
|
||||
return [
|
||||
ScyllaNode(cluster=self, server=server, name=f"node{n}")
|
||||
for n, server in enumerate(self._sorted_nodes(self.manager.all_servers()), start=1)
|
||||
]
|
||||
return list(self._nodes)
|
||||
|
||||
def get_node_ip(self, nodeid: int) -> str:
|
||||
return self.nodelist()[nodeid-1].address()
|
||||
@@ -61,16 +64,16 @@ class ScyllaCluster:
|
||||
self.manager.auth_provider = PlainTextAuthProvider(username="cassandra", password="cassandra")
|
||||
match nodes:
|
||||
case int():
|
||||
self.manager.servers_add(servers_num=nodes, config=self._config_options, start=False, auto_rack_dc="dc1")
|
||||
self._add_nodes(self.manager.servers_add(servers_num=nodes, config=self._config_options, start=False, auto_rack_dc="dc1"))
|
||||
case list():
|
||||
for dc, n_nodes in enumerate(nodes, start=1):
|
||||
dc_name = f"dc{dc}"
|
||||
self.manager.servers_add(
|
||||
self._add_nodes(self.manager.servers_add(
|
||||
servers_num=n_nodes,
|
||||
config=self._config_options,
|
||||
start=False,
|
||||
auto_rack_dc=dc_name
|
||||
)
|
||||
))
|
||||
case dict():
|
||||
# Supported spec: {"dc1": {"rack1": 3, "rack2": 2}, "dc2": {"rack1": 2}}
|
||||
for dc, dc_nodes in nodes.items():
|
||||
@@ -79,7 +82,7 @@ class ScyllaCluster:
|
||||
for rack, rack_nodes in dc_nodes.items():
|
||||
if not isinstance(rack_nodes, int):
|
||||
raise RuntimeError(f"Unsupported topology specification: {nodes}")
|
||||
self.manager.servers_add(
|
||||
self._add_nodes(self.manager.servers_add(
|
||||
servers_num=rack_nodes,
|
||||
config=self._config_options,
|
||||
property_file={
|
||||
@@ -87,7 +90,7 @@ class ScyllaCluster:
|
||||
"rack": rack,
|
||||
},
|
||||
start=False,
|
||||
)
|
||||
))
|
||||
case _:
|
||||
raise RuntimeError(f"Unsupported topology specification: {nodes}")
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ from itertools import chain
|
||||
from functools import cached_property
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import logging
|
||||
|
||||
from test.cluster.dtest.ccmlib.common import ArgumentError, wait_for, BIN_DIR
|
||||
from test.pylib.internal_types import ServerUpState
|
||||
@@ -28,6 +29,9 @@ if TYPE_CHECKING:
|
||||
from test.cluster.dtest.ccmlib.scylla_cluster import ScyllaCluster
|
||||
|
||||
|
||||
logger = logging.getLogger("scylla_node")
|
||||
|
||||
|
||||
NODETOOL_STDERR_IGNORED_PATTERNS = (
|
||||
re.compile(r"WARNING: debug mode. Not for benchmarking or production"),
|
||||
re.compile(
|
||||
@@ -149,15 +153,20 @@ class ScyllaNode:
|
||||
return self.cluster.scylla_mode
|
||||
|
||||
def set_smp(self, smp: int) -> None:
|
||||
logger.debug(f"Setting smp: {self=} {smp=}")
|
||||
self._smp_set_during_test = smp
|
||||
|
||||
def smp(self) -> int:
|
||||
logger.debug(f"Getting smp: {self=} _smp_set_during_test={self._smp_set_during_test} _smp={self._smp} {DEFAULT_SMP=}")
|
||||
return self._smp_set_during_test or self._smp or DEFAULT_SMP
|
||||
|
||||
def memory(self) -> int:
|
||||
return self._memory or self.smp() * DEFAULT_MEMORY_PER_CPU
|
||||
|
||||
def _adjust_smp_and_memory(self, smp: int | None = None, memory: int | None = None) -> None:
|
||||
if not memory and not smp:
|
||||
return
|
||||
logger.debug(f"Adjusting smp={smp} memory={memory} current_smp={self._smp} current_memory={self._memory}")
|
||||
if memory:
|
||||
self._memory = memory // (smp or self.smp()) * self.smp()
|
||||
if smp:
|
||||
@@ -446,6 +455,8 @@ class ScyllaNode:
|
||||
|
||||
self.mark = self.mark_log()
|
||||
|
||||
logger.debug(f"Starting server: server_id={self.server_id} {scylla_args=} {scylla_env=}")
|
||||
|
||||
self.cluster.manager.server_start(
|
||||
server_id=self.server_id,
|
||||
seeds=None if self.bootstrap else [self.address()],
|
||||
|
||||
46
test/cluster/dtest/set_smp_test.py
Normal file
46
test/cluster/dtest/set_smp_test.py
Normal file
@@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
#
|
||||
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
|
||||
from dtest_class import Tester
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
@pytest.mark.single_node
|
||||
class TestSetSmp(Tester):
|
||||
"""Test that node.set_smp() properly persists across restarts."""
|
||||
|
||||
def _get_smp_from_log(self, node, from_mark=None):
|
||||
"""Extract smp value from the node's log by looking at the SHARD_COUNT gossip value."""
|
||||
matches = node.grep_log(r"SHARD_COUNT : Value\((\d+),\d+\)", from_mark=from_mark)
|
||||
assert matches, "Could not find SHARD_COUNT in node log"
|
||||
# Return the last match (most recent start)
|
||||
return int(matches[-1][1].group(1))
|
||||
|
||||
def test_set_smp(self):
|
||||
"""Verify that set_smp() takes effect on the next start."""
|
||||
cluster = self.cluster
|
||||
cluster.populate(1).start(wait_for_binary_proto=True)
|
||||
node1 = cluster.nodelist()[0]
|
||||
|
||||
default_smp = self._get_smp_from_log(node1)
|
||||
|
||||
cluster.stop()
|
||||
|
||||
# set_smp to a different value and restart without jvm_args
|
||||
target_smp = 1 if default_smp != 1 else 2
|
||||
node1.set_smp(target_smp)
|
||||
mark = node1.mark_log()
|
||||
cluster.start(wait_for_binary_proto=True)
|
||||
|
||||
node1 = cluster.nodelist()[0]
|
||||
actual_smp = self._get_smp_from_log(node1, from_mark=mark)
|
||||
assert actual_smp == target_smp, \
|
||||
f"Expected smp={target_smp} after set_smp({target_smp}), got {actual_smp}"
|
||||
Reference in New Issue
Block a user