From 31d3e20cd610857e7ce49f3cdb0a47249c030c6d Mon Sep 17 00:00:00 2001 From: Michael Litvak Date: Thu, 21 May 2026 14:27:22 +0200 Subject: [PATCH 1/2] pgo: make training cluster RF-rack-valid The pgo training script creates a 3-node cluster, all in a single rack. However, some of the workloads create a keyspace with RF=3. This is not allowed in some cases; for example, materialized views with tablets require the cluster to be RF-rack-valid, so it must have at least 3 different racks. Change the cluster to be RF-rack-valid by configuring each node in a different rack using the rackdc properties file. Instead of using a shared config directory, we define a separate home directory for each node, copy the config files into it, and write a separate rackdc file for each node. --- pgo/conf/cassandra-rackdc.properties | 2 -- pgo/pgo.py | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) delete mode 100644 pgo/conf/cassandra-rackdc.properties diff --git a/pgo/conf/cassandra-rackdc.properties b/pgo/conf/cassandra-rackdc.properties deleted file mode 100644 index e39f2d2c50..0000000000 --- a/pgo/conf/cassandra-rackdc.properties +++ /dev/null @@ -1,2 +0,0 @@ -dc=dc1 -rack=rack1 diff --git a/pgo/pgo.py b/pgo/pgo.py index 37f86ed127..d23587eb50 100644 --- a/pgo/pgo.py +++ b/pgo/pgo.py @@ -354,6 +354,22 @@ async def validate_addrs_unused(addresses: list[str]) -> None: diagnostics = f"Command: {shlex.join(ss_command)}\nOutput (expected empty):\n{ss_output.decode()}" raise AddressAlreadyInUseException(addresses, diagnostics) +def write_rackdc_properties(cluster_workdir: PathLike, addr: str, dc: str, rack: str) -> None: + """Write a node-local rack/DC configuration file for the snitch.""" + conf_dir = os.path.realpath(f"{cluster_workdir}/{addr}/conf") + os.makedirs(conf_dir, exist_ok=True) + with open(f"{conf_dir}/cassandra-rackdc.properties", "w") as f: + f.write(f"dc={dc}\n") + f.write(f"rack={rack}\n") + +def prepare_node_conf(cluster_workdir: PathLike, 
addr: str, dc: str, rack: str) -> None: + """Populate a node-local conf directory and apply its rack/DC settings.""" + node_workdir = os.path.realpath(f"{cluster_workdir}/{addr}") + conf_dir = f"{node_workdir}/conf" + if not os.path.exists(conf_dir): + shutil.copytree(os.path.realpath("../conf"), conf_dir) + write_rackdc_properties(cluster_workdir=cluster_workdir, addr=addr, dc=dc, rack=rack) + async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str, seed: str, cluster_name: str, extra_opts: list[str]) -> Process: """Starts a Scylla node. Its --workdir will be $cluster_workdir/$addr/, its log file will be $cluster_workdir/$addr.log, @@ -364,12 +380,13 @@ async def start_node(executable: PathLike, cluster_workdir: PathLike, addr: str, # The directory change to it happens via the cwd=cluster_workdir in run() llvm_profile_file = f"{addr}-%m.profraw" scylla_workdir = f"{addr}" + scylla_home = os.path.realpath(f"{cluster_workdir}/{scylla_workdir}") logfile = f"{addr}.log" socket = maintenance_socket_path(cluster_workdir, addr) command = [ "env", f"LLVM_PROFILE_FILE={llvm_profile_file}", - f"SCYLLA_HOME={os.path.realpath(os.getcwd())}", # We assume that the script has Scylla's `conf/` as its filesystem neighbour. 
+ f"SCYLLA_HOME={scylla_home}", os.path.realpath(executable), f"--workdir={scylla_workdir}", f"--maintenance-socket={socket}", @@ -433,6 +450,7 @@ async def start_cluster(executable: PathLike, addrs: list[str], cpusets: Optiona seed = addrs[0] try: for i in range(0, len(addrs)): + prepare_node_conf(cluster_workdir=workdir, addr=addrs[i], dc="dc1", rack=f"rack{i + 1}") proc = await start_node(executable, addr=addrs[i], seed=seed, cluster_workdir=workdir, cluster_name=cluster_name, extra_opts=extra_opts+cpuset_args[i]) procs.append(proc) await wait_for_node(proc, addrs[i], timeout) From 47d90da86779da98338b8949ab41924a3e2ba61d Mon Sep 17 00:00:00 2001 From: Michael Litvak Date: Thu, 21 May 2026 14:07:17 +0200 Subject: [PATCH 2/2] pgo: enable train with tablets for SI and LWT The pgo training for secondary indexes and LWT was configured with tablets disabled because it wasn't supported at the time. This is no longer the case, so we should remove the restrictions and enable the training with the default mode. --- pgo/conf/lwt.yaml | 3 +-- pgo/conf/si.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pgo/conf/lwt.yaml b/pgo/conf/lwt.yaml index 7d193d5358..aa29558489 100644 --- a/pgo/conf/lwt.yaml +++ b/pgo/conf/lwt.yaml @@ -2,9 +2,8 @@ keyspace: ks # The CQL for creating a keyspace (optional if it already exists) -# FIXME: use tablets after https://github.com/scylladb/scylladb/issues/18068 is done. keyspace_definition: | - CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND TABLETS = {'enabled': false}; + CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}; # Table name table: targettable diff --git a/pgo/conf/si.yaml b/pgo/conf/si.yaml index 6c793523e4..529ba66185 100644 --- a/pgo/conf/si.yaml +++ b/pgo/conf/si.yaml @@ -1,9 +1,8 @@ keyspace: sec_index -# FIXME: use tablets after https://github.com/scylladb/scylladb/issues/22677 is done. 
keyspace_definition: | - CREATE KEYSPACE IF NOT EXISTS sec_index WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND TABLETS = {'enabled': false}; + CREATE KEYSPACE IF NOT EXISTS sec_index WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}; table: users