From 8e293cd5e88a2dec07d5112471a7703f10bebfa7 Mon Sep 17 00:00:00 2001 From: Andrzej Jackowski Date: Tue, 2 Jun 2026 10:04:00 +0200 Subject: [PATCH] pgo: bound write_rmw concurrency during alternator training The write_rmw alternator workload uses LWT/Paxos, which retains in-flight state in LSA. Running it at the same high concurrency as the other workloads let that state grow unbounded, exhausting the server mid-run and aborting the client connection (ECONNABORTED), which failed the PGO training pipeline. Make concurrency configurable per workload and cap write_rmw so its in-flight state stays bounded, while keeping the other workloads at their previous level. Although this lowers write_rmw concurrency by a factor of three, in a dev environment the workload takes only ~25% longer to complete. The `std::runtime_error (sl:default: wait queue overload)` errors also drop to zero once the fix is applied. Fixes: SCYLLADB-1071 Closes scylladb/scylladb#30198 --- pgo/pgo.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pgo/pgo.py b/pgo/pgo.py index d23587eb50..fca219f281 100644 --- a/pgo/pgo.py +++ b/pgo/pgo.py @@ -665,11 +665,12 @@ async def train_alternator(executable: PathLike, workdir: PathLike) -> None: await asyncio.sleep(5) # FIXME: artificial gossip sleep, get rid of it. workloads = [ - ["write", 250_000], - ["read", 250_000], - ["scan", 1_000], - ["write_gsi", 250_000], - ["write_rmw", 250_000], + # [workload, ops_per_shard, concurrency] + ["write", 250_000, 100], + ["read", 250_000, 100], + ["scan", 1_000, 100], + ["write_gsi", 250_000, 100], + ["write_rmw", 250_000, 32], # LWT/Paxos: keep in-flight LSA state bounded ] for workload in workloads: # the tool doesn't yet support load balancing so we @@ -682,6 +683,7 @@ async def train_alternator(executable: PathLike, workdir: PathLike) -> None: # we reuse cluster data so don't need to pre-populate "--prepopulate-partitions", "0", "--operations-per-shard", f"{workload[1]}", + "--concurrency", f"{workload[2]}", "--cpuset", f'{CS_CPUSET.get()}', "--remote-host", addr, ]) for addr in addrs])