mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-03 05:26:58 +00:00
pgo: bound write_rmw concurrency during alternator training
The write_rmw alternator workload uses LWT/Paxos, which retains in-flight state in LSA. Running it at the same high concurrency as the other workloads allowed that state to grow without bound, exhausting the server mid-run and aborting the client connection (ECONNABORTED), which in turn failed the PGO training pipeline. Make concurrency configurable per workload and cap write_rmw at 32 so its in-flight state stays bounded, while keeping the other workloads at their previous level of 100. Although this lowers write_rmw concurrency by roughly a factor of three (100 to 32), in a dev environment the workload takes only ~25% longer to complete. The `std::runtime_error (sl:default: wait queue overload)` errors also drop to zero once the fix is applied. Fixes: SCYLLADB-1071. Closes scylladb/scylladb#30198
This commit is contained in:
committed by
Marcin Maliszkiewicz
parent
46a712d6d4
commit
8e293cd5e8
12
pgo/pgo.py
12
pgo/pgo.py
@@ -665,11 +665,12 @@ async def train_alternator(executable: PathLike, workdir: PathLike) -> None:
|
||||
await asyncio.sleep(5) # FIXME: artificial gossip sleep, get rid of it.
|
||||
|
||||
workloads = [
|
||||
["write", 250_000],
|
||||
["read", 250_000],
|
||||
["scan", 1_000],
|
||||
["write_gsi", 250_000],
|
||||
["write_rmw", 250_000],
|
||||
# [workload, ops_per_shard, concurrency]
|
||||
["write", 250_000, 100],
|
||||
["read", 250_000, 100],
|
||||
["scan", 1_000, 100],
|
||||
["write_gsi", 250_000, 100],
|
||||
["write_rmw", 250_000, 32], # LWT/Paxos: keep in-flight LSA state bounded
|
||||
]
|
||||
for workload in workloads:
|
||||
# the tool doesn't yet support load balancing so we
|
||||
@@ -682,6 +683,7 @@ async def train_alternator(executable: PathLike, workdir: PathLike) -> None:
|
||||
# we reuse cluster data so don't need to pre-populate
|
||||
"--prepopulate-partitions", "0",
|
||||
"--operations-per-shard", f"{workload[1]}",
|
||||
"--concurrency", f"{workload[2]}",
|
||||
"--cpuset", f'{CS_CPUSET.get()}',
|
||||
"--remote-host", addr,
|
||||
]) for addr in addrs])
|
||||
|
||||
Reference in New Issue
Block a user